diff --git a/.cargo/config.toml b/.cargo/config.toml
new file mode 100644
index 0000000..a83a16b
--- /dev/null
+++ b/.cargo/config.toml
@@ -0,0 +1,5 @@
+[env]
+WHISPER_DONT_GENERATE_BINDINGS = "1"
+
+[patch.crates-io]
+whisper-rs-sys = { path = "vendor/whisper-rs-sys" }
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c3ea29d..66032c6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -58,13 +58,17 @@ jobs:
- name: Check osd feature only
run: cargo check --no-default-features --features osd
- - name: Verify package (default publish surface)
+ - name: Check local rewrite feature only
+ run: cargo check --no-default-features --features local-rewrite
+
+ - name: Package tarball
run: cargo package --locked
- name: Check cuda feature only (if toolkit available)
run: |
if command -v nvcc >/dev/null 2>&1; then
cargo check --no-default-features --features cuda
+ cargo check --no-default-features --features cuda,local-rewrite
else
echo "CUDA toolkit not available on this runner; skipping cuda feature check"
fi
diff --git a/Cargo.lock b/Cargo.lock
index 1d3e336..9867318 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11,6 +11,12 @@ dependencies = [
"memchr",
]
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
[[package]]
name = "alsa"
version = "0.9.1"
@@ -1016,6 +1022,28 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+[[package]]
+name = "foldhash"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+
+[[package]]
+name = "font8x8"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "875488b8711a968268c7cf5d139578713097ca4635a76044e8fe8eedf831d07e"
+
+[[package]]
+name = "fontdue"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e57e16b3fe8ff4364c0661fdaac543fb38b29ea9bc9c2f45612d90adf931d2b"
+dependencies = [
+ "hashbrown 0.15.5",
+ "ttf-parser",
+]
+
[[package]]
name = "form_urlencoded"
version = "1.2.2"
@@ -1175,6 +1203,17 @@ dependencies = [
"byteorder",
]
+[[package]]
+name = "hashbrown"
+version = "0.15.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash",
+]
+
[[package]]
name = "hashbrown"
version = "0.16.1"
@@ -1490,7 +1529,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017"
dependencies = [
"equivalent",
- "hashbrown",
+ "hashbrown 0.16.1",
]
[[package]]
@@ -3272,6 +3311,12 @@ version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+[[package]]
+name = "ttf-parser"
+version = "0.21.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c591d83f69777866b9126b24c6dd9a18351f177e49d625920d19f989fd31cf8"
+
[[package]]
name = "typenum"
version = "1.19.0"
@@ -3579,8 +3624,6 @@ dependencies = [
[[package]]
name = "whisper-rs-sys"
version = "0.14.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5e2a6e06e7ac7b8f53c53a5f50bb0bc823ba69b63ecd887339f807a5598bbd2"
dependencies = [
"bindgen 0.71.1",
"cfg-if",
@@ -3590,7 +3633,7 @@ dependencies = [
[[package]]
name = "whispers"
-version = "0.1.0"
+version = "0.1.1"
dependencies = [
"base64 0.22.1",
"clap",
@@ -3602,6 +3645,8 @@ dependencies = [
"encoding_rs",
"evdev",
"flacenc",
+ "font8x8",
+ "fontdue",
"futures-util",
"httpmock",
"indicatif",
diff --git a/Cargo.toml b/Cargo.toml
index 8e0dced..ae9b685 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "whispers"
-version = "0.1.0"
+version = "0.1.1"
edition = "2024"
rust-version = "1.85"
description = "Speech-to-text dictation tool for Wayland"
@@ -8,7 +8,18 @@ license = "MPL-2.0"
repository = "https://github.com/OneNoted/whispers"
keywords = ["wayland", "speech-to-text", "whisper", "dictation", "voice"]
categories = ["accessibility", "multimedia"]
-exclude = [".jj/", ".git/", "target/"]
+include = [
+ "Cargo.toml",
+ "Cargo.lock",
+ "README.md",
+ "LICENSE",
+ "NOTICE",
+ "config.example.toml",
+ ".cargo/config.toml",
+ "sounds/*.wav",
+ "src/**",
+ "vendor/whisper-rs-sys/**/*",
+]
[dependencies]
# Async runtime
@@ -19,7 +30,7 @@ cpal = "0.17"
# Whisper transcription
whisper-rs = "0.15"
-llama-cpp-2 = "0.1.138"
+llama-cpp-2 = { version = "0.1.138", optional = true }
# uinput virtual keyboard for paste keystroke
evdev = { version = "0.13" }
@@ -63,11 +74,20 @@ console = "0.16"
wayland-client = { version = "0.31", optional = true }
wayland-protocols = { version = "0.32", features = ["client"], optional = true }
wayland-protocols-wlr = { version = "0.3", features = ["client"], optional = true }
+font8x8 = { version = "0.3", optional = true }
+fontdue = { version = "0.9", optional = true }
[features]
default = ["osd"]
-cuda = ["whisper-rs/cuda", "llama-cpp-2/cuda"]
-osd = ["dep:wayland-client", "dep:wayland-protocols", "dep:wayland-protocols-wlr"]
+cuda = ["whisper-rs/cuda", "llama-cpp-2?/cuda"]
+local-rewrite = ["dep:llama-cpp-2"]
+osd = [
+ "dep:wayland-client",
+ "dep:wayland-protocols",
+ "dep:wayland-protocols-wlr",
+ "dep:font8x8",
+ "dep:fontdue",
+]
[[bin]]
name = "whispers"
@@ -80,7 +100,8 @@ required-features = ["osd"]
[[bin]]
name = "whispers-rewrite-worker"
-path = "src/bin/whispers-rewrite-worker.rs"
+path = "src/bin/whispers-rewrite-worker/main.rs"
+required-features = ["local-rewrite"]
[dev-dependencies]
httpmock = "0.7"
diff --git a/README.md b/README.md
index e360c57..57893e8 100644
--- a/README.md
+++ b/README.md
@@ -1,348 +1,123 @@
# whispers
-Fast speech-to-text dictation for Wayland with local-first ASR and optional cloud ASR/rewrite backends.
-Press a key to start recording, press it again to transcribe and paste.
+Fast speech-to-text dictation for Wayland.
-Local mode keeps all inference on your machine. Optional cloud modes can offload ASR, rewrite, or both when configured.
-
-Inspired by [hyprwhspr](https://github.com/goodroot/hyprwhspr) by goodroot.
-
-
-
-
-
-## How it works
-
-1. Bind `whispers` to a key in your compositor
-2. First press starts recording (OSD overlay shows audio visualization)
-3. Second press stops recording, transcribes, and pastes via `Ctrl+Shift+V`
-
-The two invocations communicate via PID file + `SIGUSR1` — no daemon, no IPC server.
-
-## Post-processing modes
-
-`whispers` now has two main dictation modes:
-
-- `raw` keeps output close to the direct transcription result and is the default
-- `advanced_local` enables the smart rewrite pipeline after transcription; `[rewrite].backend` chooses whether that rewrite runs locally or in the cloud
-
-The older heuristic cleanup path is still available as deprecated `legacy_basic` for existing configs that already use `[cleanup]`.
-The local rewrite path is managed by `whispers` itself through an internal helper binary installed alongside the main executable, so there is no separate tool or daemon to install manually.
-When `advanced_local` is enabled with `rewrite.backend = "local"`, `whispers` keeps a hidden rewrite worker warm for a short idle window so repeated dictation is much faster without becoming a permanent background daemon.
-Managed rewrite models are the default path. If you point `rewrite.model_path` at your own GGUF, it should be a chat-capable model with an embedded template that `llama.cpp` can apply at runtime.
-Deterministic personalization rules apply in all modes: dictionary replacements, spoken snippets, and optional append-only custom rewrite instructions for `advanced_local`.
-Cloud ASR and cloud rewrite are both optional. Local remains the default.
-
-For file transcription, `whispers transcribe --raw <file>` always prints the plain ASR transcript without any post-processing.
-
-## Requirements
-
-- Rust 1.85+ (edition 2024)
-- Linux with Wayland compositor
-- `wl-copy` (from `wl-clipboard`)
-- `uinput` access (for virtual keyboard paste)
-- NVIDIA GPU + CUDA toolkit (optional, for GPU acceleration)
-- `python3` on `PATH` if you want to use the optional `faster-whisper` backend
-- `python3.10`, `python3.11`, or `python3.12` on `PATH` if you want to use the experimental NeMo backends
-- If no compatible GPU is available, set `transcription.use_gpu = false` in config
+`whispers` is local-first by default, with optional cloud ASR and rewrite backends when you want them. The normal flow is simple: press a key to start recording, press it again to transcribe and paste.
## Install
-### From crates.io
-
```sh
-# Default install: CPU build with Wayland OSD
+# default install
cargo install whispers
-# Enable CUDA acceleration explicitly
+# CUDA
cargo install whispers --features cuda
-# Build without the OSD overlay
-cargo install whispers --no-default-features
-```
-
-### From git
+# local rewrite support
+cargo install whispers --features local-rewrite
-```sh
-# Default install: CPU build with Wayland OSD
-cargo install --git https://github.com/OneNoted/whispers
+# CUDA + local rewrite
+cargo install whispers --features cuda,local-rewrite
-# Enable CUDA acceleration explicitly
-cargo install --git https://github.com/OneNoted/whispers --features cuda
-
-# Build without the OSD overlay
-cargo install --git https://github.com/OneNoted/whispers --no-default-features
+# no OSD
+cargo install whispers --no-default-features
```
-### Setup
-
-Run the interactive setup wizard to download a local ASR model, generate config, and optionally enable local or cloud advanced dictation. Recommended local models are shown first, and experimental backends like Parakeet are called out explicitly before you opt into them:
+If you want the latest GitHub version instead of crates.io:
```sh
-whispers setup
+cargo install --git https://github.com/OneNoted/whispers --features cuda,local-rewrite
```
-Normal runs keep output concise. Add `-v` when you want detailed diagnostic logs during setup, downloads, or dictation.
+## Requirements
+
+- Linux with Wayland
+- `wl-copy`
+- access to `/dev/uinput`
+- Rust 1.85+
+- CUDA toolkit if you enable the `cuda` feature
-Use a custom config file for any command (including `setup` and `asr-model`):
+If `/dev/uinput` is blocked, add your user to the `input` group and log back in:
```sh
-whispers --config /path/to/config.toml setup
-whispers --config /path/to/config.toml asr-model select tiny
+sudo usermod -aG input $USER
```
-Or manage ASR models manually:
+## Quick Start
```sh
-whispers asr-model list
-whispers asr-model download large-v3-turbo
-whispers asr-model select large-v3-turbo
-whispers asr-model download distil-large-v3.5
-whispers asr-model select distil-large-v3.5
-# Experimental NeMo path:
-whispers asr-model download parakeet-tdt_ctc-1.1b
-whispers asr-model select parakeet-tdt_ctc-1.1b
-
-# Legacy whisper_cpp-only aliases still work for one release:
-whispers model list
-whispers model download large-v3-turbo
-whispers model select large-v3-turbo
+# generate config and download a model
+whispers setup
-whispers rewrite-model list
-whispers rewrite-model download qwen-3.5-4b-q4_k_m
-whispers rewrite-model select qwen-3.5-4b-q4_k_m
-whispers cloud check
+# one-shot dictation
+whispers
-whispers dictionary add "wisper flow" "Wispr Flow"
-whispers dictionary list
-whispers snippets add signature "Best regards,\nNotes"
-whispers snippets list
-whispers rewrite-instructions-path
+# live mode
+whispers voice
```
-That still remains a single install: `whispers` manages local ASR models, the optional local rewrite worker/model, and the optional cloud configuration from the same package. `faster-whisper` is bootstrapped into a hidden managed runtime when you download or prewarm that backend.
-
-## Shell completions
+Default config path:
-Print completion scripts to `stdout`:
-
-```sh
-# auto-detect from $SHELL (falls back to parent process name)
-whispers completions
-
-# or specify manually
-whispers completions zsh
+```text
+~/.config/whispers/config.toml
```
-Supported shells: `bash`, `zsh`, `fish`, `nushell`.
-
-Example install paths:
+Canonical example config:
-```sh
-# bash
-mkdir -p ~/.local/share/bash-completion/completions
-whispers completions bash > ~/.local/share/bash-completion/completions/whispers
-
-# zsh
-mkdir -p ~/.zfunc
-whispers completions zsh > ~/.zfunc/_whispers
-
-# fish
-mkdir -p ~/.config/fish/completions
-whispers completions fish > ~/.config/fish/completions/whispers.fish
+- [config.example.toml](config.example.toml)
-# nushell
-mkdir -p ~/.config/nushell/completions
-whispers completions nushell > ~/.config/nushell/completions/whispers.nu
-```
-
-## Compositor keybinding
+### Keybinding
-### Hyprland
+Hyprland:
```conf
bind = SUPER ALT, D, exec, whispers
```
-### Sway
+Sway:
```conf
bindsym $mod+Alt+d exec whispers
```
-## Configuration
-
-Config lives at `~/.config/whispers/config.toml` by default. Generated automatically by `whispers setup`, or copy from `config.example.toml`:
-
-```toml
-[audio]
-device = "" # empty = system default
-sample_rate = 16000
-
-[transcription]
-backend = "whisper_cpp" # or "faster_whisper" / "nemo" / "cloud"
-fallback = "configured_local" # or "none"
-local_backend = "whisper_cpp"
-selected_model = "large-v3-turbo"
-model_path = "~/.local/share/whispers/ggml-large-v3-turbo.bin"
-language = "auto" # or "en", "fr", "de", etc.
-use_gpu = true # set false to force CPU
-flash_attn = true # only used when use_gpu=true
-idle_timeout_ms = 120000
-
-[postprocess]
-mode = "raw" # or "advanced_local"; deprecated: "legacy_basic"
-
-[session]
-enabled = true
-max_entries = 3
-max_age_ms = 8000
-max_replace_graphemes = 400
-
-[personalization]
-dictionary_path = "~/.local/share/whispers/dictionary.toml"
-snippets_path = "~/.local/share/whispers/snippets.toml"
-snippet_trigger = "insert"
-
-[rewrite]
-backend = "local" # or "cloud"
-fallback = "local" # or "none"
-selected_model = "qwen-3.5-4b-q4_k_m"
-model_path = "" # optional manual GGUF path override
-instructions_path = "~/.local/share/whispers/rewrite-instructions.txt"
-profile = "auto" # or "qwen", "generic", "llama_compat"
-timeout_ms = 30000
-idle_timeout_ms = 120000
-max_output_chars = 1200
-max_tokens = 256
-
-[cloud]
-provider = "openai" # or "openai_compatible"
-base_url = "" # required for openai_compatible
-api_key = "" # optional direct API key; leave empty to use api_key_env
-api_key_env = "OPENAI_API_KEY"
-connect_timeout_ms = 3000
-request_timeout_ms = 15000
-
-[cloud.transcription]
-model = "gpt-4o-mini-transcribe"
-language_mode = "inherit_local" # or "force"
-language = ""
-
-[cloud.rewrite]
-model = "gpt-4.1-mini"
-temperature = 0.1
-max_output_tokens = 256
-
-[feedback]
-enabled = true
-start_sound = "" # empty = bundled sound
-stop_sound = ""
-```
-
-When `advanced_local` is enabled, `whispers` also keeps a short-lived local session ledger in the runtime directory so immediate follow-up corrections like `scratch that` can safely replace the most recent dictation entry when focus has not changed. That session behavior is local either way; only the semantic rewrite stage may be cloud-backed.
-
-## Cloud Modes
-
-- `transcription.backend = "cloud"` uploads recorded audio to the configured provider for ASR.
-- `rewrite.backend = "cloud"` uploads transcript/context JSON to the configured provider for semantic cleanup.
-- `transcription.fallback = "configured_local"` keeps a local ASR fallback path.
-- `rewrite.fallback = "local"` keeps a local rewrite fallback path.
-- Use either `cloud.api_key_env` or `cloud.api_key`. `setup` accepts either an env var name or a pasted key.
-
-Use `whispers cloud check` to validate cloud config, API key resolution, and basic provider connectivity.
-
-## Managed ASR models
-
-`whispers` currently ships managed local ASR entries across two backend families:
-
-| Model | Backend | Scope | Notes |
-|-------|---------|-------|-------|
-| large-v3-turbo | whisper_cpp | Multilingual | Default path |
-| large-v3 | whisper_cpp | Multilingual | Slower, higher accuracy |
-| medium / small / base / tiny | whisper_cpp | Multilingual | Smaller/faster tradeoffs |
-| *.en variants | whisper_cpp | English only | Smaller English Whisper options |
-| distil-large-v3.5 | faster_whisper | English only | Fast English option |
-| parakeet-tdt_ctc-1.1b | nemo | English only | Experimental NeMo ASR benchmark path |
-| canary-qwen-2.5b | nemo | English only | Experimental NeMo ASR/LLM hybrid (currently blocked) |
-
-`large-v3-turbo` remains the default multilingual local model. `distil-large-v3.5` is the speed-focused English option on the optional `faster-whisper` backend. `parakeet-tdt_ctc-1.1b` is kept as an experimental English-only NeMo backend for benchmarking against Whisper-family models, not as the default recommendation. Its first warm-up can be much slower than steady-state dictation, so judge it on warm use rather than the first cold start. `canary-qwen-2.5b` remains listed for evaluation, but the managed path is currently blocked by an upstream NeMo/PEFT initialization incompatibility. Cloud ASR models are configured under `[cloud.transcription]` instead of being downloaded locally.
-
-## Whisper Models
-
-| Model | Size | Speed | Notes |
-|-------|------|-------|-------|
-| large-v3-turbo | 1.6 GB | Fast | Best balance (recommended) |
-| large-v3-turbo-q5_0 | 574 MB | Fast | Quantized, slightly less accurate |
-| large-v3 | 3.1 GB | Slow | Most accurate |
-| small / small.en | 488 MB | Very fast | Good for English-only |
-| tiny / tiny.en | 78 MB | Instant | Least accurate |
-
-Whisper.cpp models are downloaded from [Hugging Face](https://huggingface.co/ggerganov/whisper.cpp) and stored in `~/.local/share/whispers/`. The managed `faster-whisper` backend stores models and its Python runtime under the same XDG data directory.
-
-## Managed rewrite models
-
-When `rewrite.backend = "local"`, `advanced_local` uses a second local model for post-processing. The managed local catalog currently includes:
-
-| Model | Size | Notes |
-|-------|------|-------|
-| qwen-3.5-2b-q4_k_m | ~1.3 GB | Fallback for weaker hardware |
-| qwen-3.5-4b-q4_k_m | ~2.9 GB | Recommended default |
-| qwen-3.5-9b-q4_k_m | ~5.9 GB | Higher quality, heavier |
-
-If you want to tinker, set `rewrite.model_path` to a custom GGUF file. When `rewrite.model_path` is set, it overrides the managed selection.
-`rewrite.profile = "auto"` keeps the prompt/runtime model-aware without requiring manual tuning for managed models, and still falls back safely for custom GGUFs.
-Custom rewrite models should include a chat template that `llama.cpp` can read from the GGUF metadata; otherwise rewrite prompting will fail fast instead of silently producing bad output.
-
-## Personalization
+## Commands
-Dictionary replacements apply deterministically in both `raw` and `advanced_local`, with normalization for case and punctuation but no fuzzy matching. In `advanced_local`, dictionary replacements are applied before the rewrite model and again on the final output so exact names and product terms stay stable.
-
-Spoken snippets also work in all modes. By default, saying `insert ` expands the configured snippet text verbatim after post-processing finishes, so the rewrite model cannot paraphrase it. Change the trigger phrase with `personalization.snippet_trigger`.
-
-Custom rewrite instructions live in a separate plain-text file referenced by `rewrite.instructions_path`. `whispers` appends that file to the built-in rewrite prompt for `advanced_local`, while still enforcing the same final-text-only output contract. The file is optional, and a missing file is ignored.
-
-## Faster Whisper
-
-`faster-whisper` is optional and intended for users who want the fastest English dictation path. The current managed model for it is `distil-large-v3.5`.
-
-Notes:
-- English dictation is the intended use case
-- if it fails at runtime and a local `large-v3-turbo` Whisper model is available, `whispers` falls back to `whisper_cpp`
-- `transcription.idle_timeout_ms = 0` keeps the hidden ASR worker warm indefinitely
-
-## Experimental NeMo backends
-
-`parakeet-tdt_ctc-1.1b` is available as an experimental English-only ASR option on a managed NeMo backend. `canary-qwen-2.5b` remains under evaluation, but the managed path is currently blocked by an upstream initialization issue.
+```sh
+# setup
+whispers setup
-Notes:
-- they are intended for benchmarking and experimentation, not as the default recommendation
-- first warm-up can be much slower than steady-state dictation because the hidden worker and model need to come up
-- the first use bootstraps a hidden managed Python runtime under the XDG data directory
-- the runtime currently requires Python 3.10, 3.11, or 3.12 on `PATH`
-- model downloads are stored as prepared NeMo model directories instead of ggml files
-- if a NeMo backend fails at runtime and a local `large-v3-turbo` Whisper model is available, `whispers` falls back to `whisper_cpp`
+# dictation
+whispers
+whispers voice
+whispers transcribe audio.wav
-## Privacy
+# ASR models
+whispers asr-model list
+whispers asr-model download large-v3-turbo
+whispers asr-model select large-v3-turbo
-- Local-only: no inference-time network traffic
-- Cloud ASR: audio leaves the machine for transcription
-- Cloud rewrite: transcript/context leaves the machine for rewrite
-- Cloud ASR + rewrite: both leave the machine
+# rewrite models
+whispers rewrite-model list
+whispers rewrite-model download qwen-3.5-4b-q4_k_m
+whispers rewrite-model select qwen-3.5-4b-q4_k_m
-## uinput permissions
+# personalization
+whispers dictionary add "wisper flow" "Wispr Flow"
+whispers snippets add signature "Best regards,\nNotes"
-whispers needs access to `/dev/uinput` for the virtual keyboard paste. Add your user to the `input` group:
+# cloud
+whispers cloud check
-```sh
-sudo usermod -aG input $USER
+# shell completions
+whispers completions zsh
```
-Then log out and back in.
-
-## Acknowledgements
+## Notes
-This project is inspired by [hyprwhspr](https://github.com/goodroot/hyprwhspr) by [goodroot](https://github.com/goodroot), which provides native speech-to-text for Linux with support for multiple backends. whispers is a from-scratch Rust reimplementation focused on local-first dictation with minimal dependencies.
+- Local ASR is the default.
+- Local rewrite support is enabled by installing with `--features local-rewrite`.
+- `whispers` installs the helper rewrite worker for you when that feature is enabled.
+- Shell completions are printed to `stdout`.
## License
diff --git a/config.example.toml b/config.example.toml
index 5a70488..ec37c8c 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -1,5 +1,4 @@
# whispers configuration
-# Copy to ~/.config/whispers/config.toml and customize
#
# Keybinding is handled by your compositor. Example for Hyprland:
# bind = SUPER ALT, D, exec, whispers
@@ -9,42 +8,36 @@
[audio]
# Input device name (empty = system default)
device = ""
-# Sample rate in Hz (ASR expects 16000)
+# Sample rate in Hz (ASR requires 16000)
sample_rate = 16000
[transcription]
# Active transcription backend ("whisper_cpp", "faster_whisper", "nemo", or "cloud")
-# "nemo" is experimental and intended for benchmarking rather than the default recommendation.
backend = "whisper_cpp"
-# Cloud failure policy ("configured_local" or "none")
+# Cloud fallback behavior ("configured_local" or "none")
fallback = "configured_local"
# Local backend used directly in local mode and as the cloud fallback backend
local_backend = "whisper_cpp"
-# Managed ASR model name for the local backend.
-# Recommended defaults: "large-v3-turbo" for multilingual local use,
-# "distil-large-v3.5" for faster English-only local use.
+# Managed ASR model name for the selected backend
selected_model = "large-v3-turbo"
-# Local backend model path. Leave empty to use the managed selection.
-# Experimental NeMo models are managed as prepared model directories, not ggml files.
-# Their first warm-up can be much slower than steady-state dictation.
+# Path to the local backend-specific model or empty to use the selected managed model
+# Manage models with: whispers asr-model list / download / select
model_path = "~/.local/share/whispers/ggml-large-v3-turbo.bin"
-# Language code ("en", "fr", "de", etc.)
+# Language code ("en", "fr", "de", etc.) or "auto" for auto-detect
language = "auto"
# Enable GPU acceleration (set false to force CPU)
use_gpu = true
# Enable flash attention when GPU is enabled
flash_attn = true
# How long the hidden ASR worker stays warm without requests (0 = never expire)
-# Experimental NeMo models usually feel best with a warm worker.
idle_timeout_ms = 120000
[postprocess]
-# "raw" keeps output close to Whisper, "advanced_local" enables the rewrite model
-# "legacy_basic" is deprecated and only kept for older cleanup-based configs
+# "raw" (default), "advanced_local", "agentic_rewrite", or "legacy_basic" (deprecated, kept for older cleanup-based configs)
mode = "raw"
[session]
-# Enable short-lived session backtracking in advanced_local mode
+# Enable short-lived session backtracking in rewrite modes
enabled = true
# How many recent dictation entries to keep in the runtime session ledger
max_entries = 3
@@ -64,7 +57,7 @@ snippet_trigger = "insert"
[rewrite]
# Rewrite backend ("local" or "cloud")
backend = "local"
-# Cloud rewrite failure policy ("local" or "none")
+# Cloud fallback behavior ("local" or "none")
fallback = "local"
# Managed rewrite model name for advanced_local mode
selected_model = "qwen-3.5-4b-q4_k_m"
@@ -72,24 +65,31 @@ selected_model = "qwen-3.5-4b-q4_k_m"
# Custom rewrite models should be chat-capable GGUFs with an embedded
# chat template that llama.cpp can apply at runtime.
model_path = ""
-# Optional plain-text file with extra rewrite instructions appended to the
-# built-in system prompt. Missing files are ignored.
+# Append-only custom rewrite instructions file (empty = disabled)
instructions_path = "~/.local/share/whispers/rewrite-instructions.txt"
# Rewrite profile selection ("auto", "qwen", "generic", or "llama_compat")
profile = "auto"
# Timeout for local rewrite inference in milliseconds
timeout_ms = 30000
-# How long the hidden rewrite worker stays warm without requests (0 = never expire)
+# How long the hidden rewrite worker stays warm without requests (0 = never expire)
idle_timeout_ms = 120000
# Maximum characters accepted from the rewrite model
max_output_chars = 1200
# Maximum tokens to generate for rewritten output
max_tokens = 256
+[agentic_rewrite]
+# App-aware rewrite policy rules used by postprocess.mode = "agentic_rewrite"
+policy_path = "~/.local/share/whispers/app-rewrite-policy.toml"
+# Technical glossary used by postprocess.mode = "agentic_rewrite"
+glossary_path = "~/.local/share/whispers/technical-glossary.toml"
+# Default correction policy ("conservative", "balanced", or "aggressive")
+default_correction_policy = "balanced"
+
[cloud]
# Cloud provider ("openai" or "openai_compatible")
provider = "openai"
-# Base URL for openai_compatible providers. Leave empty for OpenAI.
+# Custom base URL for openai_compatible providers (empty uses the OpenAI default)
base_url = ""
# Optional API key stored directly in the config (empty = use api_key_env instead)
api_key = ""
@@ -103,9 +103,9 @@ request_timeout_ms = 15000
[cloud.transcription]
# Cloud transcription model
model = "gpt-4o-mini-transcribe"
-# "inherit_local" uses [transcription].language when it is not "auto"
-# "force" uses the value below instead
+# "inherit_local" uses [transcription].language when it is not "auto"; "force" uses the value below
language_mode = "inherit_local"
+# Language code used when language_mode = "force"
language = ""
[cloud.rewrite]
@@ -113,7 +113,7 @@ language = ""
model = "gpt-4.1-mini"
# Sampling temperature for cloud rewrite
temperature = 0.1
-# Maximum tokens requested from the cloud rewrite backend
+# Maximum tokens requested from the cloud rewrite model
max_output_tokens = 256
[feedback]
diff --git a/docs/refactor-plan.md b/docs/refactor-plan.md
new file mode 100644
index 0000000..073b464
--- /dev/null
+++ b/docs/refactor-plan.md
@@ -0,0 +1,314 @@
+# Whispers Refactor Plan
+
+Status: complete
+Workspace: `refactor-plan` at `/home/notes/Projects/whispers-refactor-plan`
+Planning goal: reduce module sprawl and dependency tangles without mixing in feature work or behavior changes.
+
+## Working Rules
+
+- Keep refactor work in this workspace, not the shared feature workspace.
+- Prefer behavior-preserving extractions first. Delay semantic changes until the new boundaries are in place.
+- Keep each checkpoint to one logical concern and one Conventional Commit description.
+- Run targeted tests after each checkpoint, then broaden to `cargo test` when the phase is stable.
+- Do not start with OSD polish or naming cleanup. Fix structure first.
+
+## Current Diagnosis
+
+The main mess is not the top-level flow. The main mess is that a few large modules own too many responsibilities at once:
+
+- `src/main.rs` is the de facto crate root for almost everything.
+- `src/bin/whispers-rewrite-worker.rs` and `src/bin/whispers-osd.rs` pull shared code in via `#[path = ...]` instead of a shared library crate.
+- `src/postprocess.rs` mixes planning, backend routing, fallback, and finalization.
+- `src/agentic_rewrite.rs` mixes runtime policy logic with file-backed CLI admin.
+- `src/asr.rs` duplicates backend lifecycle logic across batch and live paths.
+- `src/app.rs` mixes orchestration, runtime state, injection policy, and session persistence.
+- `src/personalization.rs`, `src/session.rs`, `src/config.rs`, and `src/setup.rs` each bundle multiple separate concerns.
+
+## Recommended Order
+
+1. Establish crate boundaries.
+2. Fix dependency direction in the runtime path.
+3. Split the largest domain modules by responsibility.
+4. Split config/setup/model/completion orchestration.
+5. Finish with platform adapters and retire stale reporting cleanup if no real surface remains.
+
+## Phase 1: Crate Boundaries
+
+Goal: stop sharing code between binaries through `#[path = ...]` includes and give the project a real library surface.
+
+### Checkpoint 1.1
+
+- Commit: `refactor: add library crate and thin binary entrypoints`
+- Deliverables:
+ - Add `src/lib.rs`.
+ - Move module declarations out of `src/main.rs`.
+ - Make `src/main.rs` a thin CLI entrypoint.
+ - Make `src/bin/whispers-rewrite-worker.rs` and `src/bin/whispers-osd.rs` use library modules instead of `#[path = ...]`.
+- Validation:
+ - `cargo test`
+ - `cargo test --bin whispers`
+ - `cargo test --bin whispers-rewrite-worker`
+
+### Checkpoint 1.2
+
+- Commit: `refactor: isolate binary-only startup code`
+- Deliverables:
+ - Move PID lock and process signaling helpers into a small runtime support module.
+ - Keep binary-specific CLI/bootstrap logic out of domain modules.
+- Validation:
+ - `cargo test main::tests`
+ - `cargo test`
+
+## Phase 2: Runtime Path
+
+Goal: make the dictation path read as orchestration over smaller components instead of one large cross-module knot.
+
+### Checkpoint 2.1
+
+- Commit: `refactor: extract agentic rewrite runtime policy engine`
+- Deliverables:
+ - Split `src/agentic_rewrite.rs` into runtime policy code and file-backed admin/store code.
+ - Runtime modules should not print to stdout or mutate files.
+ - CLI-facing add/list/remove/path helpers should depend on the store layer, not the runtime layer.
+- Validation:
+ - `cargo test agentic_rewrite`
+ - `cargo test postprocess`
+
+### Checkpoint 2.2
+
+- Commit: `refactor: split postprocess planning and execution`
+- Deliverables:
+ - Extract a planning layer from `src/postprocess.rs` for transcript preparation and session intent.
+ - Extract an execution layer for local/cloud rewrite calls.
+ - Keep final acceptance and fallback rules in a smaller decision layer.
+- Validation:
+ - `cargo test postprocess`
+ - `cargo test session`
+ - `cargo test personalization`
+
+### Checkpoint 2.3
+
+- Commit: `refactor: unify asr backend lifecycle`
+- Deliverables:
+ - Remove duplicated backend switching across `prepare_transcriber`, `prepare_live_transcriber`, `transcribe_audio`, and `transcribe_live_audio`.
+ - Centralize fallback policy in one place.
+- Validation:
+ - `cargo test asr`
+ - `cargo test faster_whisper`
+ - `cargo test nemo_asr`
+
+### Checkpoint 2.4
+
+- Commit: `refactor: split rewrite routing from prompt rendering`
+- Status:
+ - completed sub-checkpoints: routing split, prompt rendering split, local rewrite engine extraction, output cleanup plus thin facade
+ - phase status: complete
+- Deliverables:
+ - Separate route selection from prompt/template rendering in `src/rewrite.rs`.
+ - Keep giant prompt contracts out of routing logic.
+- Validation:
+ - `cargo test rewrite`
+ - `cargo test rewrite_profile`
+
+### Checkpoint 2.5
+
+- Commit: `refactor: split app controller from dictation runtime state`
+- Status:
+ - completed sub-checkpoints: extracted runtime state transitions, isolated OSD helpers, kept `run()` as controller orchestration
+ - phase status: complete
+- Deliverables:
+ - Keep `src/app.rs` as orchestration.
+ - Extract dictation runtime state, preview pacing, session updates, and injection decisions into smaller modules.
+ - Minimize direct side effects inside the main dictation loop.
+- Validation:
+ - `cargo test app`
+ - `cargo test session`
+ - targeted manual smoke test for `whispers voice`
+
+## Phase 3: Domain Modules
+
+Goal: split large pure-ish logic files by domain instead of by size.
+
+### Checkpoint 3.1
+
+- Commit: `refactor: split personalization store and rewrite candidates`
+- Status:
+ - completed sub-checkpoints: extracted file-backed dictionary/snippet store, moved rewrite transcript and candidate generation out of the facade, kept `crate::personalization::*` call sites stable via re-exports
+ - phase status: complete
+- Deliverables:
+ - Split `src/personalization.rs` into:
+ - store and CLI mutation helpers
+ - text transformation rules
+ - rewrite candidate building and ranking
+- Validation:
+ - `cargo test personalization`
+
+### Checkpoint 3.2
+
+- Commit: `refactor: split session persistence from backtrack planning`
+- Status:
+ - completed sub-checkpoints: extracted runtime session persistence, isolated backtrack heuristics and typing-context mapping, kept `crate::session::*` paths stable via re-exports
+ - phase status: complete
+- Deliverables:
+ - Move JSON load/save/prune logic away from backtrack heuristics.
+ - Make backtrack planning operate on in-memory data structures.
+- Validation:
+ - `cargo test session`
+ - `cargo test postprocess`
+
+### Checkpoint 3.3
+
+- Commit: `refactor: split cleanup lexicon analysis and rendering`
+- Status:
+ - completed sub-checkpoints: extracted cue-family lexicon and hypothesis matching, isolated piece rendering, kept `crate::cleanup::*` public APIs stable at the root
+ - phase status: complete
+- Deliverables:
+ - Split `src/cleanup.rs` into lexical rules, analysis, and rendering pieces.
+ - Keep the public cleanup API stable until follow-up cleanup is done.
+- Validation:
+ - `cargo test cleanup`
+
+## Phase 4: Config and Command Surface
+
+Goal: remove duplicated sources of truth and reduce direct file mutation from high-level commands.
+
+### Checkpoint 4.1
+
+- Commit: `refactor: split config schema defaults and editing`
+- Status:
+ - completed sub-checkpoints: extracted schema/default types, split load and legacy migration logic from path helpers, isolated TOML mutation helpers behind the root `crate::config::*` facade
+ - phase status: complete
+- Deliverables:
+ - Split `src/config.rs` into schema, defaults/template, load/migrate, and edit/update modules.
+ - Put TOML mutation behind a small config editor API.
+- Validation:
+ - `cargo test config`
+ - `cargo test cli`
+
+### Checkpoint 4.2
+
+- Commit: `refactor: extract setup flow phases`
+- Status:
+ - completed sub-checkpoints: separated interactive selection from config application, isolated post-apply side effects, moved summary and completion rendering out of the flow orchestrator
+ - phase status: complete
+- Deliverables:
+ - Break `src/setup.rs` into prompt/selection, config apply, side effects, and summary/reporting phases.
+ - Keep interactive behavior unchanged.
+- Validation:
+ - `cargo test setup`
+
+### Checkpoint 4.3
+
+- Commit: `refactor: unify model management workflows`
+- Status:
+ - completed sub-checkpoints: extracted shared model config/bootstrap helpers, centralized common download/status logic, trimmed backend-specific model modules down to catalog and backend behavior
+ - phase status: complete
+- Deliverables:
+ - Reduce duplication across `src/model.rs`, `src/asr_model.rs`, and `src/rewrite_model.rs`.
+ - Share download/select/status plumbing where behavior is actually the same.
+- Validation:
+ - `cargo test model`
+ - `cargo test asr_model`
+ - `cargo test rewrite_model`
+
+### Checkpoint 4.4
+
+- Commit: `refactor: isolate shell completion installers`
+- Status:
+ - completed sub-checkpoints: split shell detection from completion rendering, kept `run_completions` as the thin entrypoint, noted that the current tree does not yet include install-path or shell-rc mutation logic
+ - phase status: complete
+- Deliverables:
+ - Separate shell detection, script generation, install-path policy, and shell rc mutation in `src/completions.rs`.
+- Validation:
+ - `cargo test completions`
+
+### Checkpoint 4.5
+
+- Commit: `docs: derive config docs from canonical source`
+- Status:
+ - completed sub-checkpoints: made the config writer template the canonical source, aligned `config.example.toml` with that template, removed the duplicated README config block in favor of referencing the canonical example
+ - phase status: complete
+- Deliverables:
+ - Stop maintaining defaults separately in code, `config.example.toml`, and the README snippet.
+ - Pick one canonical source and generate or reuse it everywhere else.
+- Validation:
+ - `cargo test config`
+ - manual check of `README.md` and `config.example.toml`
+
+## Phase 5: Platform Adapters and Reporting
+
+Goal: separate policy from OS effects in smaller but high-value modules.
+
+### Checkpoint 5.1
+
+- Commit: `refactor: extract injection adapter layer`
+- Status:
+ - completed sub-checkpoints: split clipboard process handling from virtual keyboard emission, kept `TextInjector` as the stable policy/orchestration facade for runtime callers
+ - phase status: complete
+- Deliverables:
+ - Separate injection policy from evdev and clipboard execution in `src/inject.rs`.
+- Validation:
+ - `cargo test inject`
+
+### Checkpoint 5.2
+
+- Commit: `refactor: split audio recorder and dsp helpers`
+- Status:
+ - completed sub-checkpoints: split recorder lifecycle and device/config negotiation from reusable DSP helpers, kept `AudioRecorder` and `preprocess_audio` stable for callers
+ - phase status: complete
+- Deliverables:
+ - Separate recorder lifecycle and device interaction from reusable audio transforms in `src/audio.rs`.
+- Validation:
+ - `cargo test audio`
+
+### Checkpoint 5.3
+
+- Commit: `docs: retire stale status reporting checkpoint`
+- Status:
+ - completed sub-checkpoints: verified that `src/status.rs` is absent, confirmed earlier checkpoints already split the remaining real reporting surfaces (`setup/report.rs` and model status rendering), retired the stale roadmap item instead of inventing a fake module
+ - phase status: complete
+- Deliverables:
+ - Confirm whether a standalone status/reporting module still exists in the current tree.
+ - Retire the stale checkpoint if the earlier refactors already covered the real reporting surfaces.
+- Validation:
+ - manual codebase search for reporting surfaces
+
+## Not Now
+
+- Rewriting the user-facing CLI.
+- Replacing `tokio` structure or async strategy.
+- Changing OSD visuals.
+- Large naming-only passes.
+- Folding unrelated feature work into refactor commits.
+
+## Per-Checkpoint Template
+
+Use this each time work starts on a new item:
+
+1. Confirm the checkpoint and write the exact Conventional Commit description with `jj desc -m`.
+2. Restate the non-goals for that checkpoint.
+3. Move code without changing behavior.
+4. Run targeted tests for touched modules.
+5. If the checkpoint is complete, create the next working-copy change with `jj new`.
+6. Update this file with status notes before moving on.
+
+## Progress Log
+
+- [x] Phase 1.1 complete
+- [x] Phase 1.2 complete
+- [x] Phase 2.1 complete
+- [x] Phase 2.2 complete
+- [x] Phase 2.3 complete
+- [x] Phase 2.4 complete
+- [x] Phase 2.5 complete
+- [x] Phase 3.1 complete
+- [x] Phase 3.2 complete
+- [x] Phase 3.3 complete
+- [x] Phase 4.1 complete
+- [x] Phase 4.2 complete
+- [x] Phase 4.3 complete
+- [x] Phase 4.4 complete
+- [x] Phase 4.5 complete
+- [x] Phase 5.1 complete
+- [x] Phase 5.2 complete
+- [x] Phase 5.3 complete
diff --git a/src/agentic_rewrite/admin.rs b/src/agentic_rewrite/admin.rs
new file mode 100644
index 0000000..5037c93
--- /dev/null
+++ b/src/agentic_rewrite/admin.rs
@@ -0,0 +1,170 @@
+use std::path::Path;
+
+use crate::config::Config;
+use crate::error::Result;
+use crate::rewrite_protocol::RewriteCorrectionPolicy;
+
+use super::{AppRule, ContextMatcher, GlossaryEntry, store};
+
+pub(super) fn print_app_rule_path(config_override: Option<&Path>) -> Result<()> {
+ let config = Config::load(config_override)?;
+ println!("{}", config.resolved_agentic_policy_path().display());
+ Ok(())
+}
+
+pub(super) fn print_glossary_path(config_override: Option<&Path>) -> Result<()> {
+ let config = Config::load(config_override)?;
+ println!("{}", config.resolved_agentic_glossary_path().display());
+ Ok(())
+}
+
+pub(super) fn list_app_rules(config_override: Option<&Path>) -> Result<()> {
+ let config = Config::load(config_override)?;
+ let rules = store::read_policy_file(&config.resolved_agentic_policy_path())?;
+ if rules.is_empty() {
+ println!("No app rules configured.");
+ return Ok(());
+ }
+
+ for rule in rules {
+ println!(
+ "{} | match: {} | correction_policy: {} | instructions: {}",
+ rule.name,
+ render_matcher(&rule.matcher),
+ rule.correction_policy
+ .map(|policy| policy.as_str())
+ .unwrap_or("inherit"),
+ single_line(&rule.instructions)
+ );
+ }
+
+ Ok(())
+}
+
+pub(super) fn add_app_rule(
+ config_override: Option<&Path>,
+ name: &str,
+ instructions: &str,
+ matcher: ContextMatcher,
+    correction_policy: Option<RewriteCorrectionPolicy>,
+) -> Result<()> {
+ let config = Config::load(config_override)?;
+ let path = config.resolved_agentic_policy_path();
+ let mut rules = store::read_policy_file(&path)?;
+ store::upsert_app_rule(
+ &mut rules,
+ AppRule {
+ name: name.to_string(),
+ matcher,
+ instructions: instructions.to_string(),
+ correction_policy,
+ },
+ );
+ store::write_policy_file(&path, &rules)?;
+ println!("Added app rule: {name}");
+ println!("App rules updated: {}", path.display());
+ Ok(())
+}
+
+pub(super) fn remove_app_rule(config_override: Option<&Path>, name: &str) -> Result<()> {
+ let config = Config::load(config_override)?;
+ let path = config.resolved_agentic_policy_path();
+ let mut rules = store::read_policy_file(&path)?;
+ let removed = store::remove_app_rule_entry(&mut rules, name);
+ store::write_policy_file(&path, &rules)?;
+ if removed {
+ println!("Removed app rule: {name}");
+ } else {
+ println!("No app rule matched: {name}");
+ }
+ println!("App rules updated: {}", path.display());
+ Ok(())
+}
+
+pub(super) fn list_glossary(config_override: Option<&Path>) -> Result<()> {
+ let config = Config::load(config_override)?;
+ let entries = store::read_glossary_file(&config.resolved_agentic_glossary_path())?;
+ if entries.is_empty() {
+ println!("No glossary entries configured.");
+ return Ok(());
+ }
+
+ for entry in entries {
+ let aliases = if entry.aliases.is_empty() {
+ "-".to_string()
+ } else {
+ entry.aliases.join(", ")
+ };
+ println!(
+ "{} | aliases: {} | match: {}",
+ entry.term,
+ aliases,
+ render_matcher(&entry.matcher)
+ );
+ }
+
+ Ok(())
+}
+
+pub(super) fn add_glossary_entry(
+ config_override: Option<&Path>,
+ term: &str,
+ aliases: &[String],
+ matcher: ContextMatcher,
+) -> Result<()> {
+ let config = Config::load(config_override)?;
+ let path = config.resolved_agentic_glossary_path();
+ let mut entries = store::read_glossary_file(&path)?;
+ store::upsert_glossary_entry(
+ &mut entries,
+ GlossaryEntry {
+ term: term.to_string(),
+ aliases: aliases.to_vec(),
+ matcher,
+ },
+ );
+ store::write_glossary_file(&path, &entries)?;
+ println!("Added glossary entry: {term}");
+ println!("Glossary updated: {}", path.display());
+ Ok(())
+}
+
+pub(super) fn remove_glossary_entry(config_override: Option<&Path>, term: &str) -> Result<()> {
+ let config = Config::load(config_override)?;
+ let path = config.resolved_agentic_glossary_path();
+ let mut entries = store::read_glossary_file(&path)?;
+ let removed = store::remove_glossary_entry_by_term(&mut entries, term);
+ store::write_glossary_file(&path, &entries)?;
+ if removed {
+ println!("Removed glossary entry: {term}");
+ } else {
+ println!("No glossary entry matched: {term}");
+ }
+ println!("Glossary updated: {}", path.display());
+ Ok(())
+}
+
+fn single_line(text: &str) -> String {
+ text.trim().replace('\n', "\\n")
+}
+
+fn render_matcher(matcher: &ContextMatcher) -> String {
+ let mut parts = Vec::new();
+ if let Some(surface_kind) = matcher.surface_kind {
+ parts.push(format!("surface_kind={}", surface_kind.as_str()));
+ }
+ if let Some(app_id) = matcher.app_id.as_deref() {
+ parts.push(format!("app_id={app_id}"));
+ }
+ if let Some(window_title) = matcher.window_title_contains.as_deref() {
+ parts.push(format!("window_title_contains={window_title}"));
+ }
+ if let Some(browser_domain) = matcher.browser_domain_contains.as_deref() {
+ parts.push(format!("browser_domain_contains={browser_domain}"));
+ }
+ if parts.is_empty() {
+ "global".to_string()
+ } else {
+ parts.join(", ")
+ }
+}
diff --git a/src/agentic_rewrite/mod.rs b/src/agentic_rewrite/mod.rs
new file mode 100644
index 0000000..b387c7b
--- /dev/null
+++ b/src/agentic_rewrite/mod.rs
@@ -0,0 +1,268 @@
+mod admin;
+mod runtime;
+mod store;
+
+use std::path::Path;
+
+use serde::{Deserialize, Serialize};
+
+use crate::config::Config;
+use crate::error::Result;
+use crate::rewrite_protocol::{RewriteCorrectionPolicy, RewriteSurfaceKind, RewriteTranscript};
+
+#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq)]
+#[serde(default)]
+pub struct ContextMatcher {
+    pub surface_kind: Option<RewriteSurfaceKind>,
+    pub app_id: Option<String>,
+    pub window_title_contains: Option<String>,
+    pub browser_domain_contains: Option<String>,
+}
+
+#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq)]
+#[serde(default)]
+struct AppRule {
+ name: String,
+ #[serde(flatten)]
+ matcher: ContextMatcher,
+ instructions: String,
+    correction_policy: Option<RewriteCorrectionPolicy>,
+}
+
+#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq)]
+#[serde(default)]
+struct GlossaryEntry {
+ term: String,
+    aliases: Vec<String>,
+ #[serde(flatten)]
+ matcher: ContextMatcher,
+}
+
+#[derive(Debug, Clone)]
+struct PreparedGlossaryEntry {
+ term: String,
+    aliases: Vec<String>,
+    matcher: ContextMatcher,
+    normalized_aliases: Vec<Vec<String>>,
+}
+
+pub use runtime::conservative_output_allowed;
+
+pub fn default_policy_path() -> &'static str {
+ store::default_policy_path()
+}
+
+pub fn default_glossary_path() -> &'static str {
+ store::default_glossary_path()
+}
+
+pub fn apply_runtime_policy(config: &Config, transcript: &mut RewriteTranscript) {
+ let policy_rules = store::load_policy_file_for_runtime(&config.resolved_agentic_policy_path());
+ let glossary_entries =
+ store::load_glossary_file_for_runtime(&config.resolved_agentic_glossary_path());
+
+ let policy_context = runtime::resolve_policy_context(
+ config.agentic_rewrite.default_correction_policy,
+ transcript.typing_context.as_ref(),
+ &transcript.rewrite_candidates,
+ &policy_rules,
+ &glossary_entries,
+ );
+
+ for candidate in &policy_context.glossary_candidates {
+ if transcript
+ .rewrite_candidates
+ .iter()
+ .any(|existing| existing.text == candidate.text)
+ {
+ continue;
+ }
+ transcript.rewrite_candidates.push(candidate.clone());
+ }
+
+ transcript.policy_context = policy_context;
+}
+
+pub fn ensure_starter_files(config: &Config) -> Result<Vec<std::path::PathBuf>> {
+ store::ensure_starter_files(config)
+}
+
+pub fn print_app_rule_path(config_override: Option<&Path>) -> Result<()> {
+ admin::print_app_rule_path(config_override)
+}
+
+pub fn print_glossary_path(config_override: Option<&Path>) -> Result<()> {
+ admin::print_glossary_path(config_override)
+}
+
+pub fn list_app_rules(config_override: Option<&Path>) -> Result<()> {
+ admin::list_app_rules(config_override)
+}
+
+pub fn add_app_rule(
+ config_override: Option<&Path>,
+ name: &str,
+ instructions: &str,
+ matcher: ContextMatcher,
+    correction_policy: Option<RewriteCorrectionPolicy>,
+) -> Result<()> {
+ admin::add_app_rule(
+ config_override,
+ name,
+ instructions,
+ matcher,
+ correction_policy,
+ )
+}
+
+pub fn remove_app_rule(config_override: Option<&Path>, name: &str) -> Result<()> {
+ admin::remove_app_rule(config_override, name)
+}
+
+pub fn list_glossary(config_override: Option<&Path>) -> Result<()> {
+ admin::list_glossary(config_override)
+}
+
+pub fn add_glossary_entry(
+ config_override: Option<&Path>,
+ term: &str,
+ aliases: &[String],
+ matcher: ContextMatcher,
+) -> Result<()> {
+ admin::add_glossary_entry(config_override, term, aliases, matcher)
+}
+
+pub fn remove_glossary_entry(config_override: Option<&Path>, term: &str) -> Result<()> {
+ admin::remove_glossary_entry(config_override, term)
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::config::Config;
+ use crate::rewrite_protocol::{
+ RewriteCandidate, RewriteCandidateKind, RewritePolicyContext, RewriteTranscript,
+ RewriteTypingContext,
+ };
+
+ fn typing_context(surface_kind: RewriteSurfaceKind) -> RewriteTypingContext {
+ RewriteTypingContext {
+ focus_fingerprint: "focus".into(),
+ app_id: Some("dev.zed.Zed".into()),
+ window_title: Some("docs.rs - serde_json".into()),
+ surface_kind,
+ browser_domain: Some("docs.rs".into()),
+ captured_at_ms: 42,
+ }
+ }
+
+ fn transcript_with_candidates(surface_kind: RewriteSurfaceKind) -> RewriteTranscript {
+ RewriteTranscript {
+ raw_text: "type script and sir dee json".into(),
+ correction_aware_text: "type script and sir dee json".into(),
+ aggressive_correction_text: None,
+ detected_language: Some("en".into()),
+ typing_context: Some(typing_context(surface_kind)),
+ recent_session_entries: Vec::new(),
+ session_backtrack_candidates: Vec::new(),
+ recommended_session_candidate: None,
+ segments: Vec::new(),
+ edit_intents: Vec::new(),
+ edit_signals: Vec::new(),
+ edit_hypotheses: Vec::new(),
+ rewrite_candidates: vec![RewriteCandidate {
+ kind: RewriteCandidateKind::ConservativeCorrection,
+ text: "type script and sir dee json".into(),
+ }],
+ recommended_candidate: None,
+ policy_context: RewritePolicyContext::default(),
+ }
+ }
+
+ #[test]
+ fn apply_runtime_policy_adds_glossary_candidates() {
+ let _env_lock = crate::test_support::env_lock();
+ let _guard = crate::test_support::EnvVarGuard::capture(&[
+ "HOME",
+ "XDG_CONFIG_HOME",
+ "XDG_DATA_HOME",
+ ]);
+ let home = crate::test_support::unique_temp_dir("agentic-runtime-home");
+ crate::test_support::set_env("HOME", &home.to_string_lossy());
+ crate::test_support::remove_env("XDG_CONFIG_HOME");
+ crate::test_support::remove_env("XDG_DATA_HOME");
+
+ let config = Config::default();
+ let glossary_path = config.resolved_agentic_glossary_path();
+ store::write_glossary_file(
+ &glossary_path,
+ &[GlossaryEntry {
+ term: "TypeScript".into(),
+ aliases: vec!["type script".into()],
+ matcher: ContextMatcher {
+ surface_kind: Some(RewriteSurfaceKind::Editor),
+ ..ContextMatcher::default()
+ },
+ }],
+ )
+ .expect("write glossary");
+
+ let mut transcript = transcript_with_candidates(RewriteSurfaceKind::Editor);
+ apply_runtime_policy(&config, &mut transcript);
+ assert!(
+ transcript
+ .rewrite_candidates
+ .iter()
+ .any(|candidate| candidate.text == "TypeScript and sir dee json")
+ );
+ }
+
+ #[test]
+ fn add_and_remove_roundtrip_for_policy_and_glossary() {
+ let _env_lock = crate::test_support::env_lock();
+ let _guard = crate::test_support::EnvVarGuard::capture(&[
+ "HOME",
+ "XDG_CONFIG_HOME",
+ "XDG_DATA_HOME",
+ ]);
+ let home = crate::test_support::unique_temp_dir("agentic-cli-home");
+ crate::test_support::set_env("HOME", &home.to_string_lossy());
+ crate::test_support::remove_env("XDG_CONFIG_HOME");
+ crate::test_support::remove_env("XDG_DATA_HOME");
+
+ add_app_rule(
+ None,
+ "zed",
+ "Preserve Rust identifiers.",
+ ContextMatcher {
+ app_id: Some("dev.zed.Zed".into()),
+ ..ContextMatcher::default()
+ },
+ Some(RewriteCorrectionPolicy::Balanced),
+ )
+ .expect("add app rule");
+ let config = Config::load(None).expect("config");
+ let rules = store::read_policy_file(&config.resolved_agentic_policy_path()).expect("rules");
+ assert_eq!(rules.len(), 1);
+
+ add_glossary_entry(
+ None,
+ "serde_json",
+ &[String::from("sir dee json")],
+ ContextMatcher::default(),
+ )
+ .expect("add glossary entry");
+ let entries =
+ store::read_glossary_file(&config.resolved_agentic_glossary_path()).expect("entries");
+ assert_eq!(entries.len(), 1);
+
+ remove_app_rule(None, "zed").expect("remove app rule");
+ remove_glossary_entry(None, "serde_json").expect("remove glossary entry");
+
+ let rules = store::read_policy_file(&config.resolved_agentic_policy_path()).expect("rules");
+ let entries =
+ store::read_glossary_file(&config.resolved_agentic_glossary_path()).expect("entries");
+ assert!(rules.is_empty());
+ assert!(entries.is_empty());
+ }
+}
diff --git a/src/agentic_rewrite/runtime.rs b/src/agentic_rewrite/runtime.rs
new file mode 100644
index 0000000..ea00a2f
--- /dev/null
+++ b/src/agentic_rewrite/runtime.rs
@@ -0,0 +1,935 @@
+use super::{AppRule, ContextMatcher, GlossaryEntry, PreparedGlossaryEntry};
+use crate::rewrite_protocol::{
+ RewriteCandidate, RewriteCandidateKind, RewriteCorrectionPolicy, RewritePolicyContext,
+ RewritePolicyGlossaryTerm, RewriteSurfaceKind, RewriteTranscript, RewriteTypingContext,
+};
+
+const MAX_GLOSSARY_CANDIDATES: usize = 4;
+
+pub(super) fn resolve_policy_context(
+ default_policy: RewriteCorrectionPolicy,
+ context: Option<&RewriteTypingContext>,
+ rewrite_candidates: &[RewriteCandidate],
+ policy_rules: &[AppRule],
+ glossary_entries: &[GlossaryEntry],
+) -> RewritePolicyContext {
+ let mut matched_rule_names = Vec::new();
+ let mut effective_rule_instructions = Vec::new();
+ let mut correction_policy = default_policy;
+
+ for rule in built_in_rules(default_policy)
+ .into_iter()
+ .filter(|rule| rule.matcher.matches(context))
+ .chain(matching_rules(policy_rules, context))
+ {
+ matched_rule_names.push(rule.name.clone());
+ if let Some(policy) = rule.correction_policy {
+ correction_policy = policy;
+ }
+
+ let instructions = rule.instructions.trim();
+ if !instructions.is_empty() {
+ effective_rule_instructions.push(instructions.to_string());
+ }
+ }
+
+ let mut active_glossary_entries = glossary_entries
+ .iter()
+ .enumerate()
+ .filter_map(|(index, entry)| {
+ PreparedGlossaryEntry::new(entry.clone()).map(|entry| (index, entry))
+ })
+ .filter(|(_, entry)| entry.matcher.matches(context))
+        .collect::<Vec<_>>();
+ active_glossary_entries
+ .sort_by_key(|(index, entry)| (entry.matcher.specificity_rank(), *index));
+ let active_glossary_entries = active_glossary_entries
+ .into_iter()
+ .map(|(_, entry)| entry)
+        .collect::<Vec<_>>();
+
+ RewritePolicyContext {
+ correction_policy,
+ matched_rule_names,
+ effective_rule_instructions,
+ active_glossary_terms: collapse_glossary_terms(&active_glossary_entries),
+ glossary_candidates: build_glossary_candidates(
+ rewrite_candidates,
+ &active_glossary_entries,
+ ),
+ }
+}
+
+pub fn conservative_output_allowed(transcript: &RewriteTranscript, text: &str) -> bool {
+ let text = text.trim();
+ if text.is_empty() {
+ return false;
+ }
+
+ transcript
+ .rewrite_candidates
+ .iter()
+ .any(|candidate| candidate_supports_output(&candidate.text, text))
+ || transcript
+ .policy_context
+ .glossary_candidates
+ .iter()
+ .any(|candidate| candidate_supports_output(&candidate.text, text))
+}
+
+impl ContextMatcher {
+ fn matches(&self, context: Option<&RewriteTypingContext>) -> bool {
+ if self.is_empty() {
+ return true;
+ }
+
+ let Some(context) = context else {
+ return false;
+ };
+
+ if let Some(surface_kind) = self.surface_kind
+ && context.surface_kind != surface_kind
+ {
+ return false;
+ }
+
+ if let Some(app_id) = self.app_id.as_deref()
+ && context.app_id.as_deref() != Some(app_id)
+ {
+ return false;
+ }
+
+ if let Some(needle) = self.window_title_contains.as_deref()
+ && !contains_ignore_ascii_case(context.window_title.as_deref(), needle)
+ {
+ return false;
+ }
+
+ if let Some(needle) = self.browser_domain_contains.as_deref()
+ && !contains_ignore_ascii_case(context.browser_domain.as_deref(), needle)
+ {
+ return false;
+ }
+
+ true
+ }
+
+ fn specificity_rank(&self) -> (u8, u8) {
+ let strongest_layer = if self.browser_domain_contains.is_some() {
+ 4
+ } else if self.window_title_contains.is_some() {
+ 3
+ } else if self.app_id.is_some() {
+ 2
+ } else if self.surface_kind.is_some() {
+ 1
+ } else {
+ 0
+ };
+ let matcher_count = [
+ self.surface_kind.is_some(),
+ self.app_id.is_some(),
+ self.window_title_contains.is_some(),
+ self.browser_domain_contains.is_some(),
+ ]
+ .into_iter()
+ .filter(|present| *present)
+ .count() as u8;
+ (strongest_layer, matcher_count)
+ }
+
+ fn is_empty(&self) -> bool {
+ self.surface_kind.is_none()
+ && self.app_id.is_none()
+ && self.window_title_contains.is_none()
+ && self.browser_domain_contains.is_none()
+ }
+}
+
+impl AppRule {
+ fn built_in(
+ name: &str,
+ matcher: ContextMatcher,
+ instructions: &str,
+        correction_policy: Option<RewriteCorrectionPolicy>,
+ ) -> Self {
+ Self {
+ name: name.to_string(),
+ matcher,
+ instructions: instructions.to_string(),
+ correction_policy,
+ }
+ }
+}
+
+impl PreparedGlossaryEntry {
+    fn new(entry: GlossaryEntry) -> Option<Self> {
+ let term = entry.term.trim().to_string();
+ if term.is_empty() {
+ return None;
+ }
+
+ let aliases = entry
+ .aliases
+ .into_iter()
+ .map(|alias| alias.trim().to_string())
+ .filter(|alias| !alias.is_empty())
+            .collect::<Vec<_>>();
+ let normalized_aliases = aliases
+ .iter()
+ .map(|alias| normalized_words(alias))
+ .filter(|words| !words.is_empty())
+            .collect::<Vec<_>>();
+
+ Some(Self {
+ term,
+ aliases,
+ matcher: entry.matcher,
+ normalized_aliases,
+ })
+ }
+}
+
+fn candidate_supports_output(candidate: &str, output: &str) -> bool {
+ if candidate.trim() == output.trim() {
+ return true;
+ }
+
+ let candidate_words = normalized_words(candidate);
+ let output_words = normalized_words(output);
+ if candidate_words.is_empty() || output_words.is_empty() {
+ return false;
+ }
+
+ if candidate_words == output_words {
+ return true;
+ }
+
+ if candidate_words.len() != output_words.len() || candidate_words.len() < 4 {
+ return false;
+ }
+
+ let differing_pairs = candidate_words
+ .iter()
+ .zip(&output_words)
+ .filter(|(candidate_word, output_word)| candidate_word != output_word)
+        .collect::<Vec<_>>();
+ if differing_pairs.is_empty() || differing_pairs.len() > 2 {
+ return false;
+ }
+
+ differing_pairs
+ .into_iter()
+ .all(|(candidate_word, output_word)| {
+ is_minor_term_normalization(candidate_word, output_word)
+ })
+}
+
+fn is_minor_term_normalization(candidate_word: &str, output_word: &str) -> bool {
+ let candidate_len = candidate_word.chars().count();
+ let output_len = output_word.chars().count();
+ let max_len = candidate_len.max(output_len);
+ if max_len < 3 {
+ return false;
+ }
+
+ let distance = levenshtein_distance(candidate_word, output_word);
+ if distance == 0 || distance > 3 {
+ return false;
+ }
+
+ if phonetic_skeleton(candidate_word) == phonetic_skeleton(output_word) {
+ return true;
+ }
+
+ distance * 2 <= max_len + 1
+}
+
+fn levenshtein_distance(left: &str, right: &str) -> usize {
+ if left == right {
+ return 0;
+ }
+
+ let right_chars = right.chars().collect::>();
+ let mut previous = (0..=right_chars.len()).collect::<Vec<usize>>();
+ let mut current = vec![0; right_chars.len() + 1];
+
+ for (left_index, left_char) in left.chars().enumerate() {
+ current[0] = left_index + 1;
+ for (right_index, right_char) in right_chars.iter().enumerate() {
+ let substitution_cost = usize::from(left_char != *right_char);
+ current[right_index + 1] = (previous[right_index + 1] + 1)
+ .min(current[right_index] + 1)
+ .min(previous[right_index] + substitution_cost);
+ }
+ std::mem::swap(&mut previous, &mut current);
+ }
+
+ previous[right_chars.len()]
+}
+
+fn phonetic_skeleton(word: &str) -> String {
+ let mut chars = word
+ .chars()
+ .filter(|ch| is_word_char(*ch))
+ .flat_map(|ch| ch.to_lowercase());
+ let Some(first) = chars.next() else {
+ return String::new();
+ };
+
+ let mut skeleton = String::from(first);
+ let mut previous = first;
+ for ch in chars {
+ if matches!(ch, 'a' | 'e' | 'i' | 'o' | 'u' | 'w' | 'y') {
+ continue;
+ }
+ if ch != previous {
+ skeleton.push(ch);
+ previous = ch;
+ }
+ }
+ skeleton
+}
+
+fn built_in_rules(default_policy: RewriteCorrectionPolicy) -> Vec<AppRule> {
+ vec![
+ AppRule::built_in(
+ "baseline/global-default",
+ ContextMatcher::default(),
+ "Use the active typing context, recent dictation context, glossary terms, and bounded candidates to resolve technical dictation cleanly while keeping the final-text-only contract. When the utterance clearly points to software, tools, APIs, Linux components, product names, or other technical concepts, prefer the most plausible intended technical term over a phonetically similar common word. Use category cues like window manager, editor, language, library, shell, or package manager to disambiguate nearby technical names. If it remains genuinely ambiguous, stay close to the transcript.",
+ Some(default_policy),
+ ),
+ AppRule::built_in(
+ "baseline/browser",
+ ContextMatcher {
+ surface_kind: Some(RewriteSurfaceKind::Browser),
+ ..ContextMatcher::default()
+ },
+ "Favor clean prose and natural punctuation for browser text fields, but stay grounded in the listed candidates, glossary evidence, and the utterance's technical topic when it clearly refers to software or documentation.",
+ Some(RewriteCorrectionPolicy::Balanced),
+ ),
+ AppRule::built_in(
+ "baseline/generic-text",
+ ContextMatcher {
+ surface_kind: Some(RewriteSurfaceKind::GenericText),
+ ..ContextMatcher::default()
+ },
+ "Favor clean prose and natural punctuation for general text entry while staying grounded in the listed candidates and glossary evidence. If the utterance clearly discusses technical tools or software, prefer the most plausible technical term over a phonetically similar common word.",
+ Some(RewriteCorrectionPolicy::Balanced),
+ ),
+ AppRule::built_in(
+ "baseline/editor",
+ ContextMatcher {
+ surface_kind: Some(RewriteSurfaceKind::Editor),
+ ..ContextMatcher::default()
+ },
+ "Preserve identifiers, filenames, API names, symbols, and technical casing. Avoid rewriting technical wording into generic prose. Infer likely technical terms and proper names from the utterance when the topic is clearly code, tooling, or software.",
+ Some(RewriteCorrectionPolicy::Balanced),
+ ),
+ AppRule::built_in(
+ "baseline/terminal",
+ ContextMatcher {
+ surface_kind: Some(RewriteSurfaceKind::Terminal),
+ ..ContextMatcher::default()
+ },
+ "Preserve commands, flags, paths, package names, environment variables, and punctuation that changes command meaning. Infer technical commands or package names only when the utterance strongly supports them. If uncertain, prefer the closest listed candidate.",
+ Some(RewriteCorrectionPolicy::Conservative),
+ ),
+ ]
+}
+
+fn matching_rules(rules: &[AppRule], context: Option<&RewriteTypingContext>) -> Vec<AppRule> {
+ let mut matches = rules
+ .iter()
+ .enumerate()
+ .filter(|(_, rule)| rule.matcher.matches(context))
+ .collect::<Vec<_>>();
+ matches.sort_by_key(|(index, rule)| (rule.matcher.specificity_rank(), *index));
+ matches.into_iter().map(|(_, rule)| rule.clone()).collect()
+}
+
+fn collapse_glossary_terms(entries: &[PreparedGlossaryEntry]) -> Vec<RewritePolicyGlossaryTerm> {
+ let mut collapsed = Vec::<RewritePolicyGlossaryTerm>::new();
+ for entry in entries {
+ if let Some(existing) = collapsed
+ .iter_mut()
+ .find(|candidate| candidate.term == entry.term)
+ {
+ for alias in &entry.aliases {
+ if !existing
+ .aliases
+ .iter()
+ .any(|existing_alias| existing_alias == alias)
+ {
+ existing.aliases.push(alias.clone());
+ }
+ }
+ continue;
+ }
+
+ collapsed.push(RewritePolicyGlossaryTerm {
+ term: entry.term.clone(),
+ aliases: entry.aliases.clone(),
+ });
+ }
+ collapsed
+}
+
+fn build_glossary_candidates(
+ rewrite_candidates: &[RewriteCandidate],
+ glossary_entries: &[PreparedGlossaryEntry],
+) -> Vec<RewriteCandidate> {
+ let mut generated = Vec::new();
+ for candidate in rewrite_candidates {
+ if generated.len() >= MAX_GLOSSARY_CANDIDATES {
+ break;
+ }
+
+ if let Some(text) = apply_glossary_entries(&candidate.text, glossary_entries)
+ && text != candidate.text
+ && !generated
+ .iter()
+ .any(|existing: &RewriteCandidate| existing.text == text)
+ && !rewrite_candidates
+ .iter()
+ .any(|existing| existing.text == text)
+ {
+ generated.push(RewriteCandidate {
+ kind: RewriteCandidateKind::GlossaryCorrection,
+ text,
+ });
+ }
+ }
+ generated
+}
+
+fn apply_glossary_entries(text: &str, entries: &[PreparedGlossaryEntry]) -> Option<String> {
+ if text.trim().is_empty() || entries.is_empty() {
+ return None;
+ }
+
+ let spans = collect_word_spans(text);
+ if spans.is_empty() {
+ return None;
+ }
+
+ let mut replacements = collect_glossary_replacements(text, &spans, entries);
+ if replacements.is_empty() {
+ return None;
+ }
+
+ replacements.sort_by_key(|replacement| replacement.start);
+
+ let mut output = String::new();
+ let mut cursor = 0usize;
+ for replacement in replacements {
+ output.push_str(&text[cursor..replacement.start]);
+ output.push_str(&replacement.term);
+ cursor = replacement.end;
+ }
+ output.push_str(&text[cursor..]);
+ Some(output.trim().to_string())
+}
+
+fn collect_glossary_replacements(
+ text: &str,
+ spans: &[WordSpan],
+ entries: &[PreparedGlossaryEntry],
+) -> Vec<GlossaryReplacement> {
+ let mut candidates = Vec::new();
+ for (priority, entry) in entries.iter().enumerate() {
+ if entry.normalized_aliases.is_empty() {
+ continue;
+ }
+
+ let mut index = 0usize;
+ while index < spans.len() {
+ let Some(alias_len) = best_alias_match(spans, index, &entry.normalized_aliases) else {
+ index += 1;
+ continue;
+ };
+
+ candidates.push(GlossaryReplacement {
+ start: spans[index].start,
+ end: spans[index + alias_len - 1].end,
+ start_span: index,
+ end_span: index + alias_len,
+ term: entry.term.clone(),
+ priority,
+ });
+ index += alias_len;
+ }
+ }
+
+ candidates.sort_by(|left, right| {
+ right
+ .priority
+ .cmp(&left.priority)
+ .then_with(|| {
+ (right.end_span - right.start_span).cmp(&(left.end_span - left.start_span))
+ })
+ .then_with(|| left.start_span.cmp(&right.start_span))
+ });
+
+ let mut selected = Vec::new();
+ for candidate in candidates {
+ if selected.iter().any(|existing: &GlossaryReplacement| {
+ candidate.start_span < existing.end_span && candidate.end_span > existing.start_span
+ }) {
+ continue;
+ }
+ selected.push(candidate);
+ }
+
+ if selected.is_empty() && !text.is_empty() {
+ return Vec::new();
+ }
+
+ selected
+}
+
+fn best_alias_match(spans: &[WordSpan], index: usize, aliases: &[Vec<String>]) -> Option<usize> {
+ aliases
+ .iter()
+ .filter(|alias| matches_words(spans, index, alias))
+ .map(Vec::len)
+ .max()
+}
+
+fn matches_words(spans: &[WordSpan], index: usize, words: &[String]) -> bool {
+ if words.is_empty() || index + words.len() > spans.len() {
+ return false;
+ }
+
+ spans[index..index + words.len()]
+ .iter()
+ .zip(words)
+ .all(|(span, word)| span.normalized == *word)
+}
+
+fn collect_word_spans(text: &str) -> Vec<WordSpan> {
+ let mut spans = Vec::new();
+ let mut current_start = None;
+
+ for (index, ch) in text.char_indices() {
+ if is_word_char(ch) {
+ current_start.get_or_insert(index);
+ continue;
+ }
+
+ if let Some(start) = current_start.take() {
+ spans.push(WordSpan {
+ start,
+ end: index,
+ normalized: normalize_word(&text[start..index]),
+ });
+ }
+ }
+
+ if let Some(start) = current_start {
+ spans.push(WordSpan {
+ start,
+ end: text.len(),
+ normalized: normalize_word(&text[start..]),
+ });
+ }
+
+ spans
+}
+
+fn normalized_words(text: &str) -> Vec<String> {
+ collect_word_spans(text)
+ .into_iter()
+ .map(|span| span.normalized)
+ .collect()
+}
+
+fn normalize_word(word: &str) -> String {
+ word.chars()
+ .filter(|ch| is_word_char(*ch))
+ .flat_map(|ch| ch.to_lowercase())
+ .collect()
+}
+
+fn is_word_char(ch: char) -> bool {
+ ch.is_alphanumeric() || matches!(ch, '\'' | '-' | '_' | '.')
+}
+
+fn contains_ignore_ascii_case(haystack: Option<&str>, needle: &str) -> bool {
+ let Some(haystack) = haystack else {
+ return false;
+ };
+ haystack
+ .to_ascii_lowercase()
+ .contains(&needle.to_ascii_lowercase())
+}
+
+#[derive(Debug, Clone)]
+struct WordSpan {
+ start: usize,
+ end: usize,
+ normalized: String,
+}
+
+#[derive(Debug, Clone)]
+struct GlossaryReplacement {
+ start: usize,
+ end: usize,
+ start_span: usize,
+ end_span: usize,
+ term: String,
+ priority: usize,
+}
+
+#[cfg(test)]
+mod tests {
+ use super::super::{AppRule, ContextMatcher, GlossaryEntry};
+ use super::*;
+ use crate::rewrite_protocol::{
+ RewriteCandidate, RewriteCandidateKind, RewritePolicyContext, RewriteSurfaceKind,
+ RewriteTranscript, RewriteTypingContext,
+ };
+
+ fn typing_context(surface_kind: RewriteSurfaceKind) -> RewriteTypingContext {
+ RewriteTypingContext {
+ focus_fingerprint: "focus".into(),
+ app_id: Some("dev.zed.Zed".into()),
+ window_title: Some("docs.rs - serde_json".into()),
+ surface_kind,
+ browser_domain: Some("docs.rs".into()),
+ captured_at_ms: 42,
+ }
+ }
+
+ fn transcript_with_candidates(surface_kind: RewriteSurfaceKind) -> RewriteTranscript {
+ RewriteTranscript {
+ raw_text: "type script and sir dee json".into(),
+ correction_aware_text: "type script and sir dee json".into(),
+ aggressive_correction_text: None,
+ detected_language: Some("en".into()),
+ typing_context: Some(typing_context(surface_kind)),
+ recent_session_entries: Vec::new(),
+ session_backtrack_candidates: Vec::new(),
+ recommended_session_candidate: None,
+ segments: Vec::new(),
+ edit_intents: Vec::new(),
+ edit_signals: Vec::new(),
+ edit_hypotheses: Vec::new(),
+ rewrite_candidates: vec![RewriteCandidate {
+ kind: RewriteCandidateKind::ConservativeCorrection,
+ text: "type script and sir dee json".into(),
+ }],
+ recommended_candidate: None,
+ policy_context: RewritePolicyContext::default(),
+ }
+ }
+
+ #[test]
+ fn built_in_terminal_policy_is_conservative() {
+ let context = typing_context(RewriteSurfaceKind::Terminal);
+ let policy = resolve_policy_context(
+ RewriteCorrectionPolicy::Balanced,
+ Some(&context),
+ &[],
+ &[],
+ &[],
+ );
+ assert_eq!(
+ policy.correction_policy,
+ RewriteCorrectionPolicy::Conservative
+ );
+ assert!(
+ policy
+ .matched_rule_names
+ .iter()
+ .any(|name| name == "baseline/terminal")
+ );
+ }
+
+ #[test]
+ fn built_in_policy_guides_technical_term_inference() {
+ let context = typing_context(RewriteSurfaceKind::GenericText);
+ let policy = resolve_policy_context(
+ RewriteCorrectionPolicy::Balanced,
+ Some(&context),
+ &[],
+ &[],
+ &[],
+ );
+ assert!(
+ policy
+ .effective_rule_instructions
+ .iter()
+ .any(|instruction| instruction.contains("phonetically similar common word"))
+ );
+ }
+
+ #[test]
+ fn more_specific_rules_override_less_specific_rules() {
+ let rules = vec![
+ AppRule {
+ name: "surface".into(),
+ matcher: ContextMatcher {
+ surface_kind: Some(RewriteSurfaceKind::Editor),
+ ..ContextMatcher::default()
+ },
+ instructions: "surface".into(),
+ correction_policy: Some(RewriteCorrectionPolicy::Balanced),
+ },
+ AppRule {
+ name: "app".into(),
+ matcher: ContextMatcher {
+ app_id: Some("dev.zed.Zed".into()),
+ ..ContextMatcher::default()
+ },
+ instructions: "app".into(),
+ correction_policy: Some(RewriteCorrectionPolicy::Aggressive),
+ },
+ ];
+ let context = typing_context(RewriteSurfaceKind::Editor);
+ let policy = resolve_policy_context(
+ RewriteCorrectionPolicy::Balanced,
+ Some(&context),
+ &[],
+ &rules,
+ &[],
+ );
+ assert_eq!(
+ policy.correction_policy,
+ RewriteCorrectionPolicy::Aggressive
+ );
+ assert_eq!(
+ policy
+ .effective_rule_instructions
+ .last()
+ .map(String::as_str),
+ Some("app")
+ );
+ }
+
+ #[test]
+ fn higher_precedence_matcher_layers_override_lower_layer_combinations() {
+ let rules = vec![
+ AppRule {
+ name: "surface-and-app".into(),
+ matcher: ContextMatcher {
+ surface_kind: Some(RewriteSurfaceKind::Editor),
+ app_id: Some("dev.zed.Zed".into()),
+ ..ContextMatcher::default()
+ },
+ instructions: "surface-and-app".into(),
+ correction_policy: Some(RewriteCorrectionPolicy::Aggressive),
+ },
+ AppRule {
+ name: "window-title".into(),
+ matcher: ContextMatcher {
+ window_title_contains: Some("serde_json".into()),
+ ..ContextMatcher::default()
+ },
+ instructions: "window-title".into(),
+ correction_policy: Some(RewriteCorrectionPolicy::Conservative),
+ },
+ ];
+ let context = typing_context(RewriteSurfaceKind::Editor);
+ let policy = resolve_policy_context(
+ RewriteCorrectionPolicy::Balanced,
+ Some(&context),
+ &[],
+ &rules,
+ &[],
+ );
+ assert_eq!(
+ policy.correction_policy,
+ RewriteCorrectionPolicy::Conservative
+ );
+ assert_eq!(
+ policy
+ .effective_rule_instructions
+ .last()
+ .map(String::as_str),
+ Some("window-title")
+ );
+ }
+
+ #[test]
+ fn glossary_candidates_follow_matching_scope() {
+ let glossary = vec![
+ GlossaryEntry {
+ term: "TypeScript".into(),
+ aliases: vec!["type script".into()],
+ matcher: ContextMatcher {
+ surface_kind: Some(RewriteSurfaceKind::Editor),
+ ..ContextMatcher::default()
+ },
+ },
+ GlossaryEntry {
+ term: "serde_json".into(),
+ aliases: vec!["sir dee json".into()],
+ matcher: ContextMatcher {
+ browser_domain_contains: Some("docs.rs".into()),
+ ..ContextMatcher::default()
+ },
+ },
+ ];
+ let policy = resolve_policy_context(
+ RewriteCorrectionPolicy::Balanced,
+ Some(&typing_context(RewriteSurfaceKind::Editor)),
+ &[RewriteCandidate {
+ kind: RewriteCandidateKind::Literal,
+ text: "type script and sir dee json".into(),
+ }],
+ &[],
+ &glossary,
+ );
+ assert_eq!(policy.active_glossary_terms.len(), 2);
+ assert_eq!(policy.glossary_candidates.len(), 1);
+ assert_eq!(
+ policy.glossary_candidates[0].text,
+ "TypeScript and serde_json"
+ );
+ }
+
+ #[test]
+ fn glossary_candidates_preserve_scoped_alias_overrides() {
+ let glossary = vec![
+ GlossaryEntry {
+ term: "serde".into(),
+ aliases: vec!["sir dee".into()],
+ matcher: ContextMatcher::default(),
+ },
+ GlossaryEntry {
+ term: "serde_json".into(),
+ aliases: vec!["sir dee".into()],
+ matcher: ContextMatcher {
+ browser_domain_contains: Some("docs.rs".into()),
+ ..ContextMatcher::default()
+ },
+ },
+ ];
+ let policy = resolve_policy_context(
+ RewriteCorrectionPolicy::Balanced,
+ Some(&typing_context(RewriteSurfaceKind::Editor)),
+ &[RewriteCandidate {
+ kind: RewriteCandidateKind::Literal,
+ text: "sir dee".into(),
+ }],
+ &[],
+ &glossary,
+ );
+ assert_eq!(policy.glossary_candidates.len(), 1);
+ assert_eq!(policy.glossary_candidates[0].text, "serde_json");
+ }
+
+ #[test]
+ fn conservative_acceptance_requires_explicit_candidate() {
+ let mut transcript = transcript_with_candidates(RewriteSurfaceKind::Terminal);
+ transcript.policy_context.correction_policy = RewriteCorrectionPolicy::Conservative;
+ transcript.policy_context.glossary_candidates = vec![RewriteCandidate {
+ kind: RewriteCandidateKind::GlossaryCorrection,
+ text: "TypeScript and serde_json".into(),
+ }];
+ assert!(conservative_output_allowed(
+ &transcript,
+ "type script and sir dee json"
+ ));
+ assert!(conservative_output_allowed(
+ &transcript,
+ "TypeScript and serde_json"
+ ));
+ assert!(!conservative_output_allowed(
+ &transcript,
+ "A different rewrite"
+ ));
+ }
+
+ #[test]
+ fn conservative_acceptance_allows_sentence_like_minor_term_normalization() {
+ let mut hyperland_transcript = RewriteTranscript {
+ raw_text: "I'm currently using the window manager hyperland.".into(),
+ correction_aware_text: "I'm currently using the window manager hyperland.".into(),
+ aggressive_correction_text: None,
+ detected_language: Some("en".into()),
+ typing_context: Some(typing_context(RewriteSurfaceKind::Terminal)),
+ recent_session_entries: Vec::new(),
+ session_backtrack_candidates: Vec::new(),
+ recommended_session_candidate: None,
+ segments: Vec::new(),
+ edit_intents: Vec::new(),
+ edit_signals: Vec::new(),
+ edit_hypotheses: Vec::new(),
+ rewrite_candidates: vec![RewriteCandidate {
+ kind: RewriteCandidateKind::ConservativeCorrection,
+ text: "I'm currently using the window manager hyperland.".into(),
+ }],
+ recommended_candidate: None,
+ policy_context: RewritePolicyContext::default(),
+ };
+ hyperland_transcript.policy_context.correction_policy =
+ RewriteCorrectionPolicy::Conservative;
+
+ assert!(conservative_output_allowed(
+ &hyperland_transcript,
+ "I'm currently using the window manager Hyprland."
+ ));
+
+ let mut switch_transcript = RewriteTranscript {
+ raw_text: "I'm switching from Sui to Hyperland.".into(),
+ correction_aware_text: "I'm switching from Sui to Hyperland.".into(),
+ aggressive_correction_text: None,
+ detected_language: Some("en".into()),
+ typing_context: Some(typing_context(RewriteSurfaceKind::Terminal)),
+ recent_session_entries: Vec::new(),
+ session_backtrack_candidates: Vec::new(),
+ recommended_session_candidate: None,
+ segments: Vec::new(),
+ edit_intents: Vec::new(),
+ edit_signals: Vec::new(),
+ edit_hypotheses: Vec::new(),
+ rewrite_candidates: vec![RewriteCandidate {
+ kind: RewriteCandidateKind::ConservativeCorrection,
+ text: "I'm switching from Sui to Hyperland.".into(),
+ }],
+ recommended_candidate: None,
+ policy_context: RewritePolicyContext::default(),
+ };
+ switch_transcript.policy_context.correction_policy = RewriteCorrectionPolicy::Conservative;
+
+ assert!(conservative_output_allowed(
+ &switch_transcript,
+ "I'm switching from Sway to Hyprland."
+ ));
+ }
+
+ #[test]
+ fn conservative_acceptance_keeps_short_command_fragments_strict() {
+ let mut transcript = RewriteTranscript {
+ raw_text: "cargo clipy".into(),
+ correction_aware_text: "cargo clipy".into(),
+ aggressive_correction_text: None,
+ detected_language: Some("en".into()),
+ typing_context: Some(typing_context(RewriteSurfaceKind::Terminal)),
+ recent_session_entries: Vec::new(),
+ session_backtrack_candidates: Vec::new(),
+ recommended_session_candidate: None,
+ segments: Vec::new(),
+ edit_intents: Vec::new(),
+ edit_signals: Vec::new(),
+ edit_hypotheses: Vec::new(),
+ rewrite_candidates: vec![RewriteCandidate {
+ kind: RewriteCandidateKind::ConservativeCorrection,
+ text: "cargo clipy".into(),
+ }],
+ recommended_candidate: None,
+ policy_context: RewritePolicyContext::default(),
+ };
+ transcript.policy_context.correction_policy = RewriteCorrectionPolicy::Conservative;
+
+ assert!(!conservative_output_allowed(&transcript, "cargo clippy"));
+ }
+
+ #[test]
+ fn minor_term_normalization_uses_phonetic_skeleton_without_allowing_unrelated_words() {
+ assert!(is_minor_term_normalization("sui", "sway"));
+ assert!(!is_minor_term_normalization("cat", "dog"));
+ }
+}
diff --git a/src/agentic_rewrite/store.rs b/src/agentic_rewrite/store.rs
new file mode 100644
index 0000000..dc099c9
--- /dev/null
+++ b/src/agentic_rewrite/store.rs
@@ -0,0 +1,232 @@
+use std::path::Path;
+
+use serde::{Deserialize, Serialize};
+
+use crate::config::{Config, PostprocessMode};
+use crate::error::{Result, WhsprError};
+
+use super::{AppRule, GlossaryEntry};
+
+const DEFAULT_POLICY_PATH: &str = "~/.local/share/whispers/app-rewrite-policy.toml";
+const DEFAULT_GLOSSARY_PATH: &str = "~/.local/share/whispers/technical-glossary.toml";
+
+const POLICY_STARTER: &str = r#"# App-aware rewrite policy for whispers agentic_rewrite mode.
+# Rules are layered, not first-match. Matching rules apply in this order:
+# global defaults, surface_kind, app_id, window_title_contains, browser_domain_contains.
+# Later, more specific rules override earlier fields.
+#
+# Uncomment and edit the examples below.
+#
+# [[rules]]
+# name = "terminal-shell"
+# surface_kind = "terminal"
+# correction_policy = "conservative"
+# instructions = "Preserve commands, flags, paths, package names, and environment variables."
+#
+# [[rules]]
+# name = "docs-rs-browser"
+# surface_kind = "browser"
+# browser_domain_contains = "docs.rs"
+# instructions = "Preserve Rust crate names, module paths, and type identifiers."
+#
+# [[rules]]
+# name = "zed-rust"
+# app_id = "dev.zed.Zed"
+# instructions = "Preserve identifiers, filenames, snake_case, camelCase, and Rust terminology."
+"#;
+
+const GLOSSARY_STARTER: &str = r#"# Technical glossary for whispers agentic_rewrite mode.
+# Each entry defines a canonical term plus likely spoken or mis-transcribed aliases.
+#
+# Uncomment and edit the examples below.
+#
+# [[entries]]
+# term = "TypeScript"
+# aliases = ["type script", "types script"]
+# surface_kind = "editor"
+#
+# [[entries]]
+# term = "pyproject.toml"
+# aliases = ["pie project dot toml", "pie project toml"]
+# surface_kind = "terminal"
+#
+# [[entries]]
+# term = "serde_json"
+# aliases = ["sir dee json", "serdy json"]
+# browser_domain_contains = "docs.rs"
+"#;
+
+#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq)]
+#[serde(default)]
+struct PolicyFile {
+ rules: Vec<AppRule>,
+}
+
+#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq)]
+#[serde(default)]
+struct GlossaryFile {
+ entries: Vec<GlossaryEntry>,
+}
+
+pub(super) fn default_policy_path() -> &'static str {
+ DEFAULT_POLICY_PATH
+}
+
+pub(super) fn default_glossary_path() -> &'static str {
+ DEFAULT_GLOSSARY_PATH
+}
+
+pub(super) fn ensure_starter_files(config: &Config) -> Result<Vec<String>> {
+ if config.postprocess.mode != PostprocessMode::AgenticRewrite {
+ return Ok(Vec::new());
+ }
+
+ let mut created = Vec::new();
+ let policy_path = config.resolved_agentic_policy_path();
+ if ensure_text_file(&policy_path, POLICY_STARTER)? {
+ created.push(policy_path.display().to_string());
+ }
+
+ let glossary_path = config.resolved_agentic_glossary_path();
+ if ensure_text_file(&glossary_path, GLOSSARY_STARTER)? {
+ created.push(glossary_path.display().to_string());
+ }
+
+ Ok(created)
+}
+
+fn ensure_text_file(path: &Path, contents: &str) -> Result<bool> {
+ if path.exists() {
+ return Ok(false);
+ }
+
+ write_parent(path)?;
+ std::fs::write(path, contents).map_err(|e| {
+ WhsprError::Config(format!(
+ "failed to write starter file {}: {e}",
+ path.display()
+ ))
+ })?;
+ Ok(true)
+}
+
+pub(super) fn read_policy_file(path: &Path) -> Result<Vec<AppRule>> {
+ if !path.exists() {
+ return Ok(Vec::new());
+ }
+
+ let contents = std::fs::read_to_string(path).map_err(|e| {
+ WhsprError::Config(format!("failed to read app rules {}: {e}", path.display()))
+ })?;
+ if contents.trim().is_empty() {
+ return Ok(Vec::new());
+ }
+ let file: PolicyFile = toml::from_str(&contents).map_err(|e| {
+ WhsprError::Config(format!("failed to parse app rules {}: {e}", path.display()))
+ })?;
+ Ok(file.rules)
+}
+
+pub(super) fn write_policy_file(path: &Path, rules: &[AppRule]) -> Result<()> {
+ write_parent(path)?;
+ let contents = toml::to_string_pretty(&PolicyFile {
+ rules: rules.to_vec(),
+ })
+ .map_err(|e| WhsprError::Config(format!("failed to encode app rules: {e}")))?;
+ std::fs::write(path, contents).map_err(|e| {
+ WhsprError::Config(format!("failed to write app rules {}: {e}", path.display()))
+ })?;
+ Ok(())
+}
+
+pub(super) fn read_glossary_file(path: &Path) -> Result<Vec<GlossaryEntry>> {
+ if !path.exists() {
+ return Ok(Vec::new());
+ }
+
+ let contents = std::fs::read_to_string(path).map_err(|e| {
+ WhsprError::Config(format!("failed to read glossary {}: {e}", path.display()))
+ })?;
+ if contents.trim().is_empty() {
+ return Ok(Vec::new());
+ }
+ let file: GlossaryFile = toml::from_str(&contents).map_err(|e| {
+ WhsprError::Config(format!("failed to parse glossary {}: {e}", path.display()))
+ })?;
+ Ok(file.entries)
+}
+
+pub(super) fn write_glossary_file(path: &Path, entries: &[GlossaryEntry]) -> Result<()> {
+ write_parent(path)?;
+ let contents = toml::to_string_pretty(&GlossaryFile {
+ entries: entries.to_vec(),
+ })
+ .map_err(|e| WhsprError::Config(format!("failed to encode glossary: {e}")))?;
+ std::fs::write(path, contents).map_err(|e| {
+ WhsprError::Config(format!("failed to write glossary {}: {e}", path.display()))
+ })?;
+ Ok(())
+}
+
+pub(super) fn load_policy_file_for_runtime(path: &Path) -> Vec<AppRule> {
+ match read_policy_file(path) {
+ Ok(rules) => rules,
+ Err(err) => {
+ tracing::warn!("{err}; using built-in app rewrite defaults");
+ Vec::new()
+ }
+ }
+}
+
+pub(super) fn load_glossary_file_for_runtime(path: &Path) -> Vec<GlossaryEntry> {
+ match read_glossary_file(path) {
+ Ok(entries) => entries,
+ Err(err) => {
+ tracing::warn!("{err}; ignoring runtime glossary");
+ Vec::new()
+ }
+ }
+}
+
+fn write_parent(path: &Path) -> Result<()> {
+ if let Some(parent) = path.parent() {
+ std::fs::create_dir_all(parent).map_err(|e| {
+ WhsprError::Config(format!(
+ "failed to create directory {}: {e}",
+ parent.display()
+ ))
+ })?;
+ }
+ Ok(())
+}
+
+pub(super) fn upsert_app_rule(rules: &mut Vec<AppRule>, rule: AppRule) {
+ if let Some(existing) = rules.iter_mut().find(|existing| existing.name == rule.name) {
+ *existing = rule;
+ return;
+ }
+ rules.push(rule);
+}
+
+pub(super) fn remove_app_rule_entry(rules: &mut Vec<AppRule>, name: &str) -> bool {
+ let before = rules.len();
+ rules.retain(|rule| rule.name != name);
+ before != rules.len()
+}
+
+pub(super) fn upsert_glossary_entry(entries: &mut Vec<GlossaryEntry>, entry: GlossaryEntry) {
+ if let Some(existing) = entries
+ .iter_mut()
+ .find(|existing| existing.term == entry.term)
+ {
+ *existing = entry;
+ return;
+ }
+ entries.push(entry);
+}
+
+pub(super) fn remove_glossary_entry_by_term(entries: &mut Vec<GlossaryEntry>, term: &str) -> bool {
+ let before = entries.len();
+ entries.retain(|entry| entry.term != term);
+ before != entries.len()
+}
diff --git a/src/app.rs b/src/app.rs
deleted file mode 100644
index 3a99191..0000000
--- a/src/app.rs
+++ /dev/null
@@ -1,234 +0,0 @@
-use std::process::Child;
-use std::time::Instant;
-
-#[cfg(feature = "osd")]
-use std::process::Command;
-
-use crate::asr;
-use crate::audio::AudioRecorder;
-use crate::config::{Config, PostprocessMode};
-use crate::context;
-use crate::error::Result;
-use crate::feedback::FeedbackPlayer;
-use crate::inject::TextInjector;
-use crate::postprocess;
-use crate::session;
-
-pub async fn run(config: Config) -> Result<()> {
- let activation_started = Instant::now();
- // Register signals before startup work to minimize early-signal races.
- let mut sigusr1 =
- tokio::signal::unix::signal(tokio::signal::unix::SignalKind::user_defined1())?;
- let mut sigterm = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?;
-
- let feedback = FeedbackPlayer::new(
- config.feedback.enabled,
- &config.feedback.start_sound,
- &config.feedback.stop_sound,
- );
-
- // Play start sound first (blocking), then start recording so the sound
- // doesn't leak into the mic.
- feedback.play_start();
- let recording_context = context::capture_typing_context();
- let session_enabled = config.postprocess.mode == PostprocessMode::AdvancedLocal;
- let recent_session = if session_enabled {
- session::load_recent_entry(&config.session, &recording_context)?
- } else {
- None
- };
- let mut recorder = AudioRecorder::new(&config.audio);
- recorder.start()?;
- let mut osd = spawn_osd();
- tracing::info!("recording... (run whispers again to stop)");
-
- let transcriber = asr::prepare_transcriber(&config)?;
- let rewrite_service = postprocess::prepare_rewrite_service(&config);
- asr::prewarm_transcriber(&transcriber, "recording");
- if let Some(service) = rewrite_service.as_ref() {
- postprocess::prewarm_rewrite_service(service, "recording");
- }
-
- tokio::select! {
- _ = sigusr1.recv() => {
- tracing::info!("toggle signal received, stopping recording");
- }
- _ = tokio::signal::ctrl_c() => {
- tracing::info!("interrupted, cancelling");
- kill_osd(&mut osd);
- recorder.stop()?;
- return Ok(());
- }
- _ = sigterm.recv() => {
- tracing::info!("terminated, cancelling");
- kill_osd(&mut osd);
- recorder.stop()?;
- return Ok(());
- }
- }
-
- // Stop recording before playing feedback so the stop sound doesn't
- // leak into the mic.
- kill_osd(&mut osd);
- let audio = recorder.stop()?;
- feedback.play_stop();
- let sample_rate = config.audio.sample_rate;
- let audio_duration_ms = ((audio.len() as f64 / sample_rate as f64) * 1000.0).round() as u64;
-
- tracing::info!(
- samples = audio.len(),
- sample_rate,
- audio_duration_ms,
- "transcribing captured audio"
- );
-
- let transcribe_started = Instant::now();
- let transcript = asr::transcribe_audio(&config, transcriber, audio, sample_rate).await?;
- tracing::info!(
- elapsed_ms = transcribe_started.elapsed().as_millis(),
- transcript_chars = transcript.raw_text.len(),
- "transcription stage finished"
- );
-
- if transcript.is_empty() {
- tracing::warn!("transcription returned empty text");
- postprocess::wait_for_feedback_drain().await;
- return Ok(());
- }
-
- let injection_context = context::capture_typing_context();
- let recent_session = recent_session.filter(|entry| {
- let same_focus = entry.entry.focus_fingerprint == injection_context.focus_fingerprint;
- if !same_focus {
- tracing::debug!(
- previous_focus = entry.entry.focus_fingerprint,
- current_focus = injection_context.focus_fingerprint,
- "session backtrack blocked because focus changed before injection"
- );
- }
- same_focus
- });
- let finalize_started = Instant::now();
- let finalized = postprocess::finalize_transcript(
- &config,
- transcript,
- rewrite_service.as_ref(),
- Some(&injection_context),
- recent_session.as_ref(),
- )
- .await;
- tracing::info!(
- elapsed_ms = finalize_started.elapsed().as_millis(),
- output_chars = finalized.text.len(),
- operation = match finalized.operation {
- postprocess::FinalizedOperation::Append => "append",
- postprocess::FinalizedOperation::ReplaceLastEntry { .. } => "replace_last_entry",
- },
- rewrite_used = finalized.rewrite_summary.rewrite_used,
- "post-processing stage finished"
- );
-
- if finalized.text.is_empty() {
- tracing::warn!("post-processing produced empty text");
- // When the RMS/duration gates skip transcription, the process would
- // exit almost immediately after play_stop(). PipeWire may still be
- // draining the stop sound's last buffer; exiting while it's "warm"
- // causes an audible click as the OS closes our audio file descriptors.
- // With speech, transcription takes seconds — providing natural drain time.
- postprocess::wait_for_feedback_drain().await;
- return Ok(());
- }
-
- // Inject text
- tracing::info!("injecting text: {:?}", finalized.text);
- let injector = TextInjector::new();
- match finalized.operation {
- postprocess::FinalizedOperation::Append => {
- injector.inject(&finalized.text).await?;
- if session_enabled {
- session::record_append(
- &config.session,
- &injection_context,
- &finalized.text,
- finalized.rewrite_summary,
- )?;
- }
- }
- postprocess::FinalizedOperation::ReplaceLastEntry {
- entry_id,
- delete_graphemes,
- } => {
- injector
- .replace_recent_text(delete_graphemes, &finalized.text)
- .await?;
- if session_enabled {
- session::record_replace(
- &config.session,
- &injection_context,
- entry_id,
- &finalized.text,
- finalized.rewrite_summary,
- )?;
- }
- }
- }
-
- tracing::info!("done");
- tracing::info!(
- total_elapsed_ms = activation_started.elapsed().as_millis(),
- "dictation pipeline finished"
- );
- Ok(())
-}
-
-#[cfg(feature = "osd")]
-fn spawn_osd() -> Option<Child> {
- // Look for whispers-osd next to our own binary first, then fall back to PATH
- let osd_path = std::env::current_exe()
- .ok()
- .and_then(|p| p.parent().map(|dir| dir.join("whispers-osd")))
- .filter(|p| p.exists())
- .unwrap_or_else(|| "whispers-osd".into());
-
- match Command::new(&osd_path).spawn() {
- Ok(child) => {
- tracing::debug!("spawned whispers-osd (pid {})", child.id());
- Some(child)
- }
- Err(e) => {
- tracing::warn!(
- "failed to spawn whispers-osd from {}: {e}",
- osd_path.display()
- );
- None
- }
- }
-}
-
-#[cfg(not(feature = "osd"))]
-fn spawn_osd() -> Option<Child> {
- None
-}
-
-fn kill_osd(child: &mut Option<Child>) {
- if let Some(mut c) = child.take() {
- let pid = c.id() as libc::pid_t;
- unsafe {
- libc::kill(pid, libc::SIGTERM);
- }
- let _ = c.wait();
- tracing::debug!("whispers-osd (pid {pid}) terminated");
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn kill_osd_none_is_noop() {
- let mut child: Option<Child> = None;
- kill_osd(&mut child);
- assert!(child.is_none());
- }
-}
diff --git a/src/app/mod.rs b/src/app/mod.rs
new file mode 100644
index 0000000..8a58ae6
--- /dev/null
+++ b/src/app/mod.rs
@@ -0,0 +1,67 @@
+use std::time::Instant;
+
+use crate::config::Config;
+use crate::error::Result;
+use crate::postprocess::finalize;
+
+mod osd;
+mod runtime;
+
+use runtime::DictationRuntime;
+
+pub async fn run(config: Config) -> Result<()> {
+ let activation_started = Instant::now();
+ // Register signals before startup work to minimize early-signal races.
+ let mut sigusr1 =
+ tokio::signal::unix::signal(tokio::signal::unix::SignalKind::user_defined1())?;
+ let mut sigterm = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?;
+ let mut runtime = DictationRuntime::new(config);
+ let recording = runtime.start_recording()?;
+ runtime.prepare_services()?;
+
+ tokio::select! {
+ _ = sigusr1.recv() => {
+ tracing::info!("toggle signal received, stopping recording");
+ }
+ _ = tokio::signal::ctrl_c() => {
+ tracing::info!("interrupted, cancelling");
+ runtime.cancel_recording(recording)?;
+ return Ok(());
+ }
+ _ = sigterm.recv() => {
+ tracing::info!("terminated, cancelling");
+ runtime.cancel_recording(recording)?;
+ return Ok(());
+ }
+ }
+
+ let captured = runtime.finish_recording(recording)?;
+ let transcribed = runtime.transcribe_recording(captured).await?;
+
+ if transcribed.is_empty() {
+ tracing::warn!("transcription returned empty text");
+ finalize::wait_for_feedback_drain().await;
+ return Ok(());
+ }
+
+ let finalized = runtime.finalize_recording(transcribed).await;
+ if finalized.is_empty() {
+ tracing::warn!("post-processing produced empty text");
+ // When the RMS/duration gates skip transcription, the process would
+ // exit almost immediately after play_stop(). PipeWire may still be
+ // draining the stop sound's last buffer; exiting while it's "warm"
+ // causes an audible click as the OS closes our audio file descriptors.
+ // With speech, transcription takes seconds — providing natural drain time.
+ finalize::wait_for_feedback_drain().await;
+ return Ok(());
+ }
+
+ runtime.inject_finalized(finalized).await?;
+
+ tracing::info!("done");
+ tracing::info!(
+ total_elapsed_ms = activation_started.elapsed().as_millis(),
+ "dictation pipeline finished"
+ );
+ Ok(())
+}
diff --git a/src/app/osd.rs b/src/app/osd.rs
new file mode 100644
index 0000000..d6a4af2
--- /dev/null
+++ b/src/app/osd.rs
@@ -0,0 +1,57 @@
+use std::process::Child;
+
+#[cfg(feature = "osd")]
+use std::process::Command;
+
+#[cfg(feature = "osd")]
+pub(super) fn spawn_osd() -> Option<Child> {
+ // Look for whispers-osd next to our own binary first, then fall back to PATH
+ let osd_path = std::env::current_exe()
+ .ok()
+ .and_then(|p| p.parent().map(|dir| dir.join("whispers-osd")))
+ .filter(|p| p.exists())
+ .unwrap_or_else(|| "whispers-osd".into());
+
+ match Command::new(&osd_path).spawn() {
+ Ok(child) => {
+ tracing::debug!("spawned whispers-osd (pid {})", child.id());
+ Some(child)
+ }
+ Err(e) => {
+ tracing::warn!(
+ "failed to spawn whispers-osd from {}: {e}",
+ osd_path.display()
+ );
+ None
+ }
+ }
+}
+
+#[cfg(not(feature = "osd"))]
+pub(super) fn spawn_osd() -> Option<Child> {
+ None
+}
+
+pub(super) fn kill_osd(child: &mut Option<Child>) {
+ if let Some(mut c) = child.take() {
+ let pid = c.id() as libc::pid_t;
+ unsafe {
+ libc::kill(pid, libc::SIGTERM);
+ }
+ let _ = c.wait();
+ tracing::debug!("whispers-osd (pid {pid}) terminated");
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::kill_osd;
+ use std::process::Child;
+
+ #[test]
+ fn kill_osd_none_is_noop() {
+ let mut child: Option<Child> = None;
+ kill_osd(&mut child);
+ assert!(child.is_none());
+ }
+}
diff --git a/src/app/runtime.rs b/src/app/runtime.rs
new file mode 100644
index 0000000..ba138b5
--- /dev/null
+++ b/src/app/runtime.rs
@@ -0,0 +1,263 @@
+use std::process::Child;
+use std::time::Instant;
+
+use crate::asr;
+use crate::audio::AudioRecorder;
+use crate::config::Config;
+use crate::context::{self, TypingContext};
+use crate::error::Result;
+use crate::feedback::FeedbackPlayer;
+use crate::inject::TextInjector;
+use crate::postprocess::{execution, finalize};
+use crate::rewrite_worker::RewriteService;
+use crate::session::{self, EligibleSessionEntry};
+use crate::transcribe::Transcript;
+
+pub(super) struct DictationRuntime {
+ config: Config,
+ feedback: FeedbackPlayer,
+ session_enabled: bool,
+ transcriber: Option<asr::prepare::PreparedTranscriber>,
+ rewrite_service: Option<RewriteService>,
+}
+
+pub(super) struct ActiveRecording {
+ recorder: AudioRecorder,
+ osd: Option<Child>,
+ recent_session: Option<EligibleSessionEntry>,
+}
+
+pub(super) struct CapturedRecording {
+ audio: Vec<f32>,
+ sample_rate: u32,
+ recent_session: Option<EligibleSessionEntry>,
+}
+
+pub(super) struct TranscribedRecording {
+ transcript: Transcript,
+ recent_session: Option<EligibleSessionEntry>,
+}
+
+pub(super) struct ReadyInjection {
+ finalized: finalize::FinalizedTranscript,
+ injection_context: TypingContext,
+}
+
+impl DictationRuntime {
+ pub(super) fn new(config: Config) -> Self {
+ let feedback = FeedbackPlayer::new(
+ config.feedback.enabled,
+ &config.feedback.start_sound,
+ &config.feedback.stop_sound,
+ );
+ let session_enabled = config.postprocess.mode.uses_rewrite();
+
+ Self {
+ config,
+ feedback,
+ session_enabled,
+ transcriber: None,
+ rewrite_service: None,
+ }
+ }
+
+ pub(super) fn start_recording(&self) -> Result<ActiveRecording> {
+ // Play start sound first (blocking), then start recording so the sound
+ // doesn't leak into the mic.
+ self.feedback.play_start();
+ let recording_context = context::capture_typing_context();
+ let recent_session = if self.session_enabled {
+ session::load_recent_entry(&self.config.session, &recording_context)?
+ } else {
+ None
+ };
+
+ let mut recorder = AudioRecorder::new(&self.config.audio);
+ recorder.start()?;
+ let osd = super::osd::spawn_osd();
+ tracing::info!("recording... (run whispers again to stop)");
+
+ Ok(ActiveRecording {
+ recorder,
+ osd,
+ recent_session,
+ })
+ }
+
+ pub(super) fn prepare_services(&mut self) -> Result<()> {
+ let transcriber = asr::prepare::prepare_transcriber(&self.config)?;
+ let rewrite_service = execution::prepare_rewrite_service(&self.config);
+ asr::prepare::prewarm_transcriber(&transcriber, "recording");
+ if let Some(service) = rewrite_service.as_ref() {
+ execution::prewarm_rewrite_service(service, "recording");
+ }
+
+ self.transcriber = Some(transcriber);
+ self.rewrite_service = rewrite_service;
+ Ok(())
+ }
+
+ pub(super) fn cancel_recording(&self, mut recording: ActiveRecording) -> Result<()> {
+ super::osd::kill_osd(&mut recording.osd);
+ recording.recorder.stop()?;
+ Ok(())
+ }
+
+ pub(super) fn finish_recording(
+ &self,
+ mut recording: ActiveRecording,
+ ) -> Result<CapturedRecording> {
+ // Stop recording before playing feedback so the stop sound doesn't
+ // leak into the mic.
+ super::osd::kill_osd(&mut recording.osd);
+ let audio = recording.recorder.stop()?;
+ self.feedback.play_stop();
+ let sample_rate = self.config.audio.sample_rate;
+ let audio_duration_ms = ((audio.len() as f64 / sample_rate as f64) * 1000.0).round() as u64;
+
+ tracing::info!(
+ samples = audio.len(),
+ sample_rate,
+ audio_duration_ms,
+ "transcribing captured audio"
+ );
+
+ Ok(CapturedRecording {
+ audio,
+ sample_rate,
+ recent_session: recording.recent_session,
+ })
+ }
+
+ pub(super) async fn transcribe_recording(
+ &mut self,
+ recording: CapturedRecording,
+ ) -> Result<TranscribedRecording> {
+ let transcriber = self
+ .transcriber
+ .take()
+ .expect("transcriber prepared before transcription");
+ let transcribe_started = Instant::now();
+ let transcript = asr::execute::transcribe_audio(
+ &self.config,
+ transcriber,
+ recording.audio,
+ recording.sample_rate,
+ )
+ .await?;
+
+ tracing::info!(
+ elapsed_ms = transcribe_started.elapsed().as_millis(),
+ transcript_chars = transcript.raw_text.len(),
+ "transcription stage finished"
+ );
+
+ Ok(TranscribedRecording {
+ transcript,
+ recent_session: recording.recent_session,
+ })
+ }
+
+ pub(super) async fn finalize_recording(
+ &self,
+ recording: TranscribedRecording,
+ ) -> ReadyInjection {
+ let injection_context = context::capture_typing_context();
+ let recent_session = recording.recent_session.filter(|entry| {
+ let same_focus = entry.entry.focus_fingerprint == injection_context.focus_fingerprint;
+ if !same_focus {
+ tracing::debug!(
+ previous_focus = entry.entry.focus_fingerprint,
+ current_focus = injection_context.focus_fingerprint,
+ "session backtrack blocked because focus changed before injection"
+ );
+ }
+ same_focus
+ });
+
+ let finalize_started = Instant::now();
+ let finalized = finalize::finalize_transcript(
+ &self.config,
+ recording.transcript,
+ self.rewrite_service.as_ref(),
+ Some(&injection_context),
+ recent_session.as_ref(),
+ )
+ .await;
+
+ tracing::info!(
+ elapsed_ms = finalize_started.elapsed().as_millis(),
+ output_chars = finalized.text.len(),
+ operation = match finalized.operation {
+ finalize::FinalizedOperation::Append => "append",
+ finalize::FinalizedOperation::ReplaceLastEntry { .. } => "replace_last_entry",
+ },
+ rewrite_used = finalized.rewrite_summary.rewrite_used,
+ "post-processing stage finished"
+ );
+
+ ReadyInjection {
+ finalized,
+ injection_context,
+ }
+ }
+
+ pub(super) async fn inject_finalized(&self, ready: ReadyInjection) -> Result<()> {
+ let ReadyInjection {
+ finalized,
+ injection_context,
+ } = ready;
+ let finalize::FinalizedTranscript {
+ text,
+ operation,
+ rewrite_summary,
+ } = finalized;
+
+ tracing::info!("injecting text: {:?}", text);
+ let injector = TextInjector::new();
+ match operation {
+ finalize::FinalizedOperation::Append => {
+ injector.inject(&text).await?;
+ if self.session_enabled {
+ session::record_append(
+ &self.config.session,
+ &injection_context,
+ &text,
+ rewrite_summary,
+ )?;
+ }
+ }
+ finalize::FinalizedOperation::ReplaceLastEntry {
+ entry_id,
+ delete_graphemes,
+ } => {
+ injector
+ .replace_recent_text(delete_graphemes, &text)
+ .await?;
+ if self.session_enabled {
+ session::record_replace(
+ &self.config.session,
+ &injection_context,
+ entry_id,
+ &text,
+ rewrite_summary,
+ )?;
+ }
+ }
+ }
+
+ Ok(())
+ }
+}
+
+impl TranscribedRecording {
+ pub(super) fn is_empty(&self) -> bool {
+ self.transcript.is_empty()
+ }
+}
+
+impl ReadyInjection {
+ pub(super) fn is_empty(&self) -> bool {
+ self.finalized.text.is_empty()
+ }
+}
diff --git a/src/asr.rs b/src/asr.rs
deleted file mode 100644
index 0014bf7..0000000
--- a/src/asr.rs
+++ /dev/null
@@ -1,417 +0,0 @@
-use crate::cloud::CloudService;
-use crate::config::{Config, TranscriptionBackend, TranscriptionConfig, TranscriptionFallback};
-use crate::error::{Result, WhsprError};
-use crate::faster_whisper::{self, FasterWhisperService};
-use crate::model;
-use crate::nemo_asr::{self, NemoAsrService};
-use crate::transcribe::{
- Transcript, TranscriptionBackend as SyncTranscriptionBackend, WhisperLocal,
-};
-use std::collections::HashSet;
-use std::path::{Path, PathBuf};
-
-pub enum PreparedTranscriber {
- Whisper(tokio::task::JoinHandle<Result<WhisperLocal>>),
- Faster(FasterWhisperService),
- Nemo(NemoAsrService),
- Cloud(CloudService),
-}
-
-pub fn prepare_transcriber(config: &Config) -> Result<PreparedTranscriber> {
- cleanup_stale_transcribers(config)?;
-
- match config.transcription.backend {
- TranscriptionBackend::WhisperCpp => {
- let whisper_config = config.transcription.clone();
- let model_path = config.resolved_model_path();
- Ok(PreparedTranscriber::Whisper(tokio::task::spawn_blocking(
- move || WhisperLocal::new(&whisper_config, &model_path),
- )))
- }
- TranscriptionBackend::FasterWhisper => {
- faster_whisper::prepare_service(&config.transcription)
- .map(PreparedTranscriber::Faster)
- .ok_or_else(|| {
- WhsprError::Transcription(
- "faster-whisper backend selected but no model path could be resolved"
- .into(),
- )
- })
- }
- TranscriptionBackend::Nemo => nemo_asr::prepare_service(&config.transcription)
- .map(PreparedTranscriber::Nemo)
- .ok_or_else(|| {
- WhsprError::Transcription(
- "nemo backend selected but no model reference could be resolved".into(),
- )
- }),
- TranscriptionBackend::Cloud => Ok(PreparedTranscriber::Cloud(CloudService::new(config)?)),
- }
-}
-
-pub fn cleanup_stale_transcribers(config: &Config) -> Result<()> {
- let retained = retained_socket_paths(config);
- let stale_workers = collect_stale_asr_workers(&retained)?;
- for worker in stale_workers {
- tracing::info!(
- pid = worker.pid,
- kind = worker.kind,
- socket = %worker.socket_path.display(),
- "terminating stale ASR worker"
- );
- let result = unsafe { libc::kill(worker.pid, libc::SIGTERM) };
- if result == 0 {
- continue;
- }
- let err = std::io::Error::last_os_error();
- if err.raw_os_error() == Some(libc::ESRCH) {
- continue;
- }
- return Err(WhsprError::Transcription(format!(
- "failed to terminate stale {} worker (pid {}): {err}",
- worker.kind, worker.pid
- )));
- }
- Ok(())
-}
-
-pub fn prewarm_transcriber(prepared: &PreparedTranscriber, phase: &str) {
- match prepared {
- PreparedTranscriber::Faster(service) => match service.prewarm() {
- Ok(()) => tracing::info!("prewarming faster-whisper worker via {}", phase),
- Err(err) => tracing::warn!("failed to prewarm faster-whisper worker: {err}"),
- },
- PreparedTranscriber::Nemo(service) => match service.prewarm() {
- Ok(()) => tracing::info!("prewarming NeMo ASR worker via {}", phase),
- Err(err) => tracing::warn!("failed to prewarm NeMo ASR worker: {err}"),
- },
- _ => {}
- }
-}
-
-pub async fn transcribe_audio(
- config: &Config,
- prepared: PreparedTranscriber,
- audio: Vec<f32>,
- sample_rate: u32,
-) -> Result<Transcript> {
- match prepared {
- PreparedTranscriber::Whisper(handle) => {
- let backend = handle.await.map_err(|e| {
- WhsprError::Transcription(format!("model loading task failed: {e}"))
- })??;
- tokio::task::spawn_blocking(move || backend.transcribe(&audio, sample_rate))
- .await
- .map_err(|e| WhsprError::Transcription(format!("transcription task failed: {e}")))?
- }
- PreparedTranscriber::Faster(service) => match service.transcribe(&audio, sample_rate).await
- {
- Ok(transcript) => Ok(transcript),
- Err(err) => {
- tracing::warn!("faster-whisper transcription failed: {err}");
- fallback_whisper_cpp_transcribe(config, audio, sample_rate).await
- }
- },
- PreparedTranscriber::Nemo(service) => match service.transcribe(&audio, sample_rate).await {
- Ok(transcript) => Ok(transcript),
- Err(err) => {
- tracing::warn!("NeMo ASR transcription failed: {err}");
- fallback_whisper_cpp_transcribe(config, audio, sample_rate).await
- }
- },
- PreparedTranscriber::Cloud(service) => {
- match service.transcribe_audio(config, &audio, sample_rate).await {
- Ok(transcript) => Ok(transcript),
- Err(err) => {
- tracing::warn!("cloud transcription failed: {err}");
- fallback_local_transcribe(config, audio, sample_rate).await
- }
- }
- }
- }
-}
-
-async fn fallback_local_transcribe(
- config: &Config,
- audio: Vec<f32>,
- sample_rate: u32,
-) -> Result<Transcript> {
- if config.transcription.backend == TranscriptionBackend::Cloud
- && config.transcription.fallback == TranscriptionFallback::None
- {
- return Err(WhsprError::Transcription(
- "cloud transcription failed and [transcription].fallback = \"none\"".into(),
- ));
- }
-
- let mut local_config = config.transcription.clone();
- local_config.backend = config.transcription.resolved_local_backend();
- let model_path = config.resolved_model_path();
- tracing::warn!(
- "falling back to local ASR backend '{}' using {}",
- local_config.backend.as_str(),
- model_path.display()
- );
- let prepared = match local_config.backend {
- TranscriptionBackend::WhisperCpp => {
- let whisper_config = local_config.clone();
- Ok(PreparedTranscriber::Whisper(tokio::task::spawn_blocking(
- move || WhisperLocal::new(&whisper_config, &model_path),
- )))
- }
- TranscriptionBackend::FasterWhisper => faster_whisper::prepare_service(&local_config)
- .map(PreparedTranscriber::Faster)
- .ok_or_else(|| {
- WhsprError::Transcription(
- "faster-whisper fallback selected but no model path could be resolved".into(),
- )
- }),
- TranscriptionBackend::Nemo => nemo_asr::prepare_service(&local_config)
- .map(PreparedTranscriber::Nemo)
- .ok_or_else(|| {
- WhsprError::Transcription(
- "nemo fallback selected but no model reference could be resolved".into(),
- )
- }),
- TranscriptionBackend::Cloud => Err(WhsprError::Transcription(
- "cloud backend cannot be prepared as a local transcriber".into(),
- )),
- }?;
- match prepared {
- PreparedTranscriber::Whisper(handle) => {
- let backend = handle.await.map_err(|e| {
- WhsprError::Transcription(format!("fallback model loading task failed: {e}"))
- })??;
- tokio::task::spawn_blocking(move || backend.transcribe(&audio, sample_rate))
- .await
- .map_err(|e| {
- WhsprError::Transcription(format!("fallback transcription task failed: {e}"))
- })?
- }
- PreparedTranscriber::Faster(service) => service.transcribe(&audio, sample_rate).await,
- PreparedTranscriber::Nemo(service) => service.transcribe(&audio, sample_rate).await,
- PreparedTranscriber::Cloud(_) => Err(WhsprError::Transcription(
- "cloud fallback resolved to cloud backend".into(),
- )),
- }
-}
-
-async fn fallback_whisper_cpp_transcribe(
- config: &Config,
- audio: Vec<f32>,
- sample_rate: u32,
-) -> Result<Transcript> {
- let Some(model_path) = fallback_whisper_model_path() else {
- return Err(WhsprError::Transcription(
- "faster-whisper failed and no local large-v3-turbo fallback model is available".into(),
- ));
- };
- tracing::warn!("falling back to whisper_cpp using {}", model_path.display());
- let whisper_config = whisper_fallback_config(&config.transcription);
- let backend =
- tokio::task::spawn_blocking(move || WhisperLocal::new(&whisper_config, &model_path))
- .await
- .map_err(|e| {
- WhsprError::Transcription(format!("fallback model loading task failed: {e}"))
- })??;
- tokio::task::spawn_blocking(move || backend.transcribe(&audio, sample_rate))
- .await
- .map_err(|e| {
- WhsprError::Transcription(format!("fallback transcription task failed: {e}"))
- })?
-}
-
-fn whisper_fallback_config(config: &TranscriptionConfig) -> TranscriptionConfig {
- let mut fallback = config.clone();
- fallback.backend = TranscriptionBackend::WhisperCpp;
- fallback.local_backend = TranscriptionBackend::WhisperCpp;
- fallback.selected_model = "large-v3-turbo".into();
- fallback.model_path = model::model_path_for_config("ggml-large-v3-turbo.bin");
- fallback
-}
-
-fn fallback_whisper_model_path() -> Option<PathBuf> {
- let path = model::selected_model_local_path("large-v3-turbo")?;
- path.exists().then_some(path)
-}
-
-fn retained_socket_paths(config: &Config) -> HashSet<PathBuf> {
- let mut retained = HashSet::new();
- match config.transcription.backend {
- TranscriptionBackend::FasterWhisper => {
- if let Some(service) = faster_whisper::prepare_service(&config.transcription) {
- retained.insert(service.socket_path().to_path_buf());
- }
- }
- TranscriptionBackend::Nemo => {
- if let Some(service) = nemo_asr::prepare_service(&config.transcription) {
- retained.insert(service.socket_path().to_path_buf());
- }
- }
- TranscriptionBackend::WhisperCpp | TranscriptionBackend::Cloud => {}
- }
- retained
-}
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-struct AsrWorkerProcess {
- pid: libc::pid_t,
- kind: &'static str,
- socket_path: PathBuf,
-}
-
-fn collect_stale_asr_workers(retained: &HashSet<PathBuf>) -> Result<Vec<AsrWorkerProcess>> {
- let proc_dir = std::fs::read_dir("/proc")
- .map_err(|e| WhsprError::Transcription(format!("failed to inspect /proc: {e}")))?;
- let mut stale = Vec::new();
- for entry in proc_dir {
- let entry = match entry {
- Ok(entry) => entry,
- Err(_) => continue,
- };
- let file_name = entry.file_name();
- let Some(pid) = file_name.to_string_lossy().parse::<libc::pid_t>().ok() else {
- continue;
- };
- let cmdline = match std::fs::read(entry.path().join("cmdline")) {
- Ok(cmdline) => cmdline,
- Err(_) => continue,
- };
- let Some((kind, socket_path)) = parse_asr_worker_cmdline(&cmdline) else {
- continue;
- };
- if retained.contains(&socket_path) {
- continue;
- }
- stale.push(AsrWorkerProcess {
- pid,
- kind,
- socket_path,
- });
- }
- Ok(stale)
-}
-
-fn parse_asr_worker_cmdline(cmdline: &[u8]) -> Option<(&'static str, PathBuf)> {
- let args: Vec<String> = cmdline
- .split(|byte| *byte == 0)
- .filter(|arg| !arg.is_empty())
- .map(|arg| String::from_utf8_lossy(arg).into_owned())
- .collect();
- if args.is_empty() || !args.iter().any(|arg| arg == "serve") {
- return None;
- }
-
- let kind = if args.iter().any(|arg| {
- Path::new(arg)
- .file_name()
- .is_some_and(|name| name == "faster_whisper_worker.py")
- }) {
- "faster_whisper"
- } else if args.iter().any(|arg| {
- Path::new(arg)
- .file_name()
- .is_some_and(|name| name == "nemo_asr_worker.py")
- }) {
- "nemo"
- } else {
- return None;
- };
-
- let socket_index = args.iter().position(|arg| arg == "--socket-path")?;
- let socket_path = PathBuf::from(args.get(socket_index + 1)?);
- let runtime_scope = asr_runtime_scope_dir();
- if !socket_path.starts_with(&runtime_scope) {
- return None;
- }
- let file_name = socket_path.file_name()?.to_string_lossy();
- if !file_name.starts_with("asr-") || !file_name.ends_with(".sock") {
- return None;
- }
-
- Some((kind, socket_path))
-}
-
-fn asr_runtime_scope_dir() -> PathBuf {
- let base = std::env::var("XDG_RUNTIME_DIR").unwrap_or_else(|_| "/tmp".into());
- PathBuf::from(base).join("whispers")
-}
-
-pub fn validate_transcription_config(config: &Config) -> Result<()> {
- if config.transcription.backend == TranscriptionBackend::Cloud {
- crate::cloud::validate_config(config)?;
- }
-
- if config.transcription.resolved_local_backend() == TranscriptionBackend::FasterWhisper
- && !config.transcription.language.eq_ignore_ascii_case("en")
- && !config.transcription.language.eq_ignore_ascii_case("auto")
- {
- return Err(WhsprError::Config(
- "faster-whisper managed models are currently English-focused; set [transcription].language = \"en\" or \"auto\"".into(),
- ));
- }
-
- if config.transcription.resolved_local_backend() == TranscriptionBackend::FasterWhisper
- && config.transcription.language.eq_ignore_ascii_case("auto")
- {
- tracing::warn!(
- "faster-whisper backend is configured with language = \"auto\"; English dictation is recommended"
- );
- }
-
- if config.transcription.resolved_local_backend() == TranscriptionBackend::Nemo
- && !config.transcription.language.eq_ignore_ascii_case("en")
- && !config.transcription.language.eq_ignore_ascii_case("auto")
- {
- return Err(WhsprError::Config(
- "NeMo experimental ASR models are currently English-only; set [transcription].language = \"en\" or \"auto\"".into(),
- ));
- }
-
- Ok(())
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn parse_faster_worker_cmdline_extracts_socket_path() {
- let socket = asr_runtime_scope_dir().join("asr-faster-123.sock");
- let cmdline = format!(
- "/home/user/.local/share/whispers/faster-whisper/venv/bin/python\0/home/user/.local/share/whispers/faster-whisper/faster_whisper_worker.py\0serve\0--socket-path\0{}\0--model-dir\0/tmp/model\0",
- socket.display()
- );
- let parsed = parse_asr_worker_cmdline(cmdline.as_bytes()).expect("parse worker");
- assert_eq!(parsed.0, "faster_whisper");
- assert_eq!(parsed.1, socket);
- }
-
- #[test]
- fn parse_nemo_worker_cmdline_extracts_socket_path() {
- let socket = asr_runtime_scope_dir().join("asr-nemo-456.sock");
- let cmdline = format!(
- "/home/user/.local/share/whispers/nemo/venv-asr/bin/python\0/home/user/.local/share/whispers/nemo/nemo_asr_worker.py\0serve\0--socket-path\0{}\0--model-ref\0/tmp/model.nemo\0",
- socket.display()
- );
- let parsed = parse_asr_worker_cmdline(cmdline.as_bytes()).expect("parse worker");
- assert_eq!(parsed.0, "nemo");
- assert_eq!(parsed.1, socket);
- }
-
- #[test]
- fn parse_asr_worker_cmdline_ignores_unrelated_processes() {
- let socket = asr_runtime_scope_dir().join("asr-other.sock");
- let cmdline = format!(
- "/usr/bin/python\0/home/user/script.py\0serve\0--socket-path\0{}\0",
- socket.display()
- );
- assert!(parse_asr_worker_cmdline(cmdline.as_bytes()).is_none());
- }
-
- #[test]
- fn parse_asr_worker_cmdline_ignores_socket_outside_runtime_scope() {
- let cmdline = b"/home/user/.local/share/whispers/nemo/venv-asr/bin/python\0/home/user/.local/share/whispers/nemo/nemo_asr_worker.py\0serve\0--socket-path\0/var/run/asr-nemo.sock\0";
- assert!(parse_asr_worker_cmdline(cmdline).is_none());
- }
-}
diff --git a/src/asr/cleanup.rs b/src/asr/cleanup.rs
new file mode 100644
index 0000000..92f6948
--- /dev/null
+++ b/src/asr/cleanup.rs
@@ -0,0 +1,177 @@
+use crate::config::{Config, TranscriptionBackend};
+use crate::error::{Result, WhsprError};
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+
+pub fn cleanup_stale_transcribers(config: &Config) -> Result<()> {
+ let retained = retained_socket_paths(config);
+ let stale_workers = collect_stale_asr_workers(&retained)?;
+ for worker in stale_workers {
+ tracing::info!(
+ pid = worker.pid,
+ kind = worker.kind,
+ socket = %worker.socket_path.display(),
+ "terminating stale ASR worker"
+ );
+ let result = unsafe { libc::kill(worker.pid, libc::SIGTERM) };
+ if result == 0 {
+ continue;
+ }
+ let err = std::io::Error::last_os_error();
+ if err.raw_os_error() == Some(libc::ESRCH) {
+ continue;
+ }
+ return Err(WhsprError::Transcription(format!(
+ "failed to terminate stale {} worker (pid {}): {err}",
+ worker.kind, worker.pid
+ )));
+ }
+ Ok(())
+}
+
+fn retained_socket_paths(config: &Config) -> HashSet<PathBuf> {
+ let mut retained = HashSet::new();
+ match config.transcription.backend {
+ TranscriptionBackend::FasterWhisper => {
+ if let Some(service) = crate::faster_whisper::prepare_service(&config.transcription) {
+ retained.insert(service.socket_path().to_path_buf());
+ }
+ }
+ TranscriptionBackend::Nemo => {
+ if let Some(service) = crate::nemo_asr::prepare_service(&config.transcription) {
+ retained.insert(service.socket_path().to_path_buf());
+ }
+ }
+ TranscriptionBackend::WhisperCpp | TranscriptionBackend::Cloud => {}
+ }
+ retained
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct AsrWorkerProcess {
+ pid: libc::pid_t,
+ kind: &'static str,
+ socket_path: PathBuf,
+}
+
+fn collect_stale_asr_workers(retained: &HashSet<PathBuf>) -> Result<Vec<AsrWorkerProcess>> {
+ let proc_dir = std::fs::read_dir("/proc")
+ .map_err(|e| WhsprError::Transcription(format!("failed to inspect /proc: {e}")))?;
+ let mut stale = Vec::new();
+ for entry in proc_dir {
+ let entry = match entry {
+ Ok(entry) => entry,
+ Err(_) => continue,
+ };
+ let file_name = entry.file_name();
+ let Some(pid) = file_name.to_string_lossy().parse::<libc::pid_t>().ok() else {
+ continue;
+ };
+ let cmdline = match std::fs::read(entry.path().join("cmdline")) {
+ Ok(cmdline) => cmdline,
+ Err(_) => continue,
+ };
+ let Some((kind, socket_path)) = parse_asr_worker_cmdline(&cmdline) else {
+ continue;
+ };
+ if retained.contains(&socket_path) {
+ continue;
+ }
+ stale.push(AsrWorkerProcess {
+ pid,
+ kind,
+ socket_path,
+ });
+ }
+ Ok(stale)
+}
+
+fn parse_asr_worker_cmdline(cmdline: &[u8]) -> Option<(&'static str, PathBuf)> {
+ let args: Vec<String> = cmdline
+ .split(|byte| *byte == 0)
+ .filter(|arg| !arg.is_empty())
+ .map(|arg| String::from_utf8_lossy(arg).into_owned())
+ .collect();
+ if args.is_empty() || !args.iter().any(|arg| arg == "serve") {
+ return None;
+ }
+
+ let kind = if args.iter().any(|arg| {
+ Path::new(arg)
+ .file_name()
+ .is_some_and(|name| name == "faster_whisper_worker.py")
+ }) {
+ "faster_whisper"
+ } else if args.iter().any(|arg| {
+ Path::new(arg)
+ .file_name()
+ .is_some_and(|name| name == "nemo_asr_worker.py")
+ }) {
+ "nemo"
+ } else {
+ return None;
+ };
+
+ let socket_index = args.iter().position(|arg| arg == "--socket-path")?;
+ let socket_path = PathBuf::from(args.get(socket_index + 1)?);
+ let runtime_scope = asr_runtime_scope_dir();
+ if !socket_path.starts_with(&runtime_scope) {
+ return None;
+ }
+ let file_name = socket_path.file_name()?.to_string_lossy();
+ if !file_name.starts_with("asr-") || !file_name.ends_with(".sock") {
+ return None;
+ }
+
+ Some((kind, socket_path))
+}
+
+fn asr_runtime_scope_dir() -> PathBuf {
+ let base = std::env::var("XDG_RUNTIME_DIR").unwrap_or_else(|_| "/tmp".into());
+ PathBuf::from(base).join("whispers")
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn parse_faster_worker_cmdline_extracts_socket_path() {
+ let socket = asr_runtime_scope_dir().join("asr-faster-123.sock");
+ let cmdline = format!(
+ "/home/user/.local/share/whispers/faster-whisper/venv/bin/python\0/home/user/.local/share/whispers/faster-whisper/faster_whisper_worker.py\0serve\0--socket-path\0{}\0--model-dir\0/tmp/model\0",
+ socket.display()
+ );
+ let parsed = parse_asr_worker_cmdline(cmdline.as_bytes()).expect("parse worker");
+ assert_eq!(parsed.0, "faster_whisper");
+ assert_eq!(parsed.1, socket);
+ }
+
+ #[test]
+ fn parse_nemo_worker_cmdline_extracts_socket_path() {
+ let socket = asr_runtime_scope_dir().join("asr-nemo-456.sock");
+ let cmdline = format!(
+ "/home/user/.local/share/whispers/nemo/venv-asr/bin/python\0/home/user/.local/share/whispers/nemo/nemo_asr_worker.py\0serve\0--socket-path\0{}\0--model-ref\0/tmp/model.nemo\0",
+ socket.display()
+ );
+ let parsed = parse_asr_worker_cmdline(cmdline.as_bytes()).expect("parse worker");
+ assert_eq!(parsed.0, "nemo");
+ assert_eq!(parsed.1, socket);
+ }
+
+ #[test]
+ fn parse_asr_worker_cmdline_ignores_unrelated_processes() {
+ let socket = asr_runtime_scope_dir().join("asr-other.sock");
+ let cmdline = format!(
+ "/usr/bin/python\0/home/user/script.py\0serve\0--socket-path\0{}\0",
+ socket.display()
+ );
+ assert!(parse_asr_worker_cmdline(cmdline.as_bytes()).is_none());
+ }
+
+ #[test]
+ fn parse_asr_worker_cmdline_ignores_socket_outside_runtime_scope() {
+ let cmdline = b"/home/user/.local/share/whispers/nemo/venv-asr/bin/python\0/home/user/.local/share/whispers/nemo/nemo_asr_worker.py\0serve\0--socket-path\0/var/run/asr-nemo.sock\0";
+ assert!(parse_asr_worker_cmdline(cmdline).is_none());
+ }
+}
diff --git a/src/asr/execute.rs b/src/asr/execute.rs
new file mode 100644
index 0000000..2069deb
--- /dev/null
+++ b/src/asr/execute.rs
@@ -0,0 +1,188 @@
+use super::prepare::{self, PreparedTranscriber};
+use crate::config::{Config, TranscriptionBackend, TranscriptionConfig, TranscriptionFallback};
+use crate::error::{Result, WhsprError};
+use crate::model;
+use crate::transcribe::{Transcript, TranscriptionBackend as _};
+
+pub async fn transcribe_audio(
+ config: &Config,
+ prepared: PreparedTranscriber,
+ audio: Vec<f32>,
+ sample_rate: u32,
+) -> Result<Transcript> {
+ match prepared {
+ prepared @ PreparedTranscriber::Whisper(_) => {
+ transcribe_with_prepared(prepared, &audio, sample_rate, "").await
+ }
+ prepared @ PreparedTranscriber::Faster(_) => {
+ match transcribe_with_prepared(prepared, &audio, sample_rate, "").await {
+ Ok(transcript) => Ok(transcript),
+ Err(err) => {
+ tracing::warn!("faster-whisper transcription failed: {err}");
+ fallback_whisper_cpp_transcribe(config, audio, sample_rate).await
+ }
+ }
+ }
+ prepared @ PreparedTranscriber::Nemo(_) => {
+ match transcribe_with_prepared(prepared, &audio, sample_rate, "").await {
+ Ok(transcript) => Ok(transcript),
+ Err(err) => {
+ tracing::warn!("NeMo ASR transcription failed: {err}");
+ fallback_whisper_cpp_transcribe(config, audio, sample_rate).await
+ }
+ }
+ }
+ PreparedTranscriber::Cloud(service) => {
+ match service.transcribe_audio(config, &audio, sample_rate).await {
+ Ok(transcript) => Ok(transcript),
+ Err(err) => {
+ tracing::warn!("cloud transcription failed: {err}");
+ fallback_local_transcribe(config, audio, sample_rate).await
+ }
+ }
+ }
+ }
+}
+
+async fn transcribe_with_prepared(
+ prepared: PreparedTranscriber,
+ audio: &[f32],
+ sample_rate: u32,
+ task_label: &str,
+) -> Result<Transcript> {
+ match prepared {
+ PreparedTranscriber::Whisper(handle) => {
+ let audio = audio.to_vec();
+ let backend = handle
+ .await
+ .map_err(|e| transcription_task_error(task_label, "model loading", &e))??;
+ tokio::task::spawn_blocking(move || backend.transcribe(&audio, sample_rate))
+ .await
+ .map_err(|e| transcription_task_error(task_label, "transcription", &e))?
+ }
+ PreparedTranscriber::Faster(service) => service.transcribe(audio, sample_rate).await,
+ PreparedTranscriber::Nemo(service) => service.transcribe(audio, sample_rate).await,
+ PreparedTranscriber::Cloud(_) => Err(WhsprError::Transcription(
+ "cloud transcriber cannot be executed without the caller-owned config".into(),
+ )),
+ }
+}
+
+async fn fallback_local_transcribe(
+ config: &Config,
+ audio: Vec,
+ sample_rate: u32,
+) -> Result<Transcript> {
+ let (local_config, model_path) = local_fallback_config(config)?;
+ tracing::warn!(
+ "falling back to local ASR backend '{}' using {}",
+ local_config.backend.as_str(),
+ model_path.display()
+ );
+ let prepared = prepare::prepare_local_transcriber(&local_config, &model_path)?;
+ transcribe_with_prepared(prepared, &audio, sample_rate, "fallback").await
+}
+
+async fn fallback_whisper_cpp_transcribe(
+ config: &Config,
+ audio: Vec<f32>,
+ sample_rate: u32,
+) -> Result<Transcript> {
+ let Some(model_path) = fallback_whisper_model_path() else {
+ return Err(WhsprError::Transcription(
+ "faster-whisper failed and no local large-v3-turbo fallback model is available".into(),
+ ));
+ };
+ tracing::warn!("falling back to whisper_cpp using {}", model_path.display());
+ let whisper_config = whisper_fallback_config(&config.transcription);
+ let prepared = prepare::prepare_local_transcriber(&whisper_config, &model_path)?;
+ transcribe_with_prepared(prepared, &audio, sample_rate, "fallback").await
+}
+
+fn local_fallback_config(config: &Config) -> Result<(TranscriptionConfig, std::path::PathBuf)> {
+ if config.transcription.backend == TranscriptionBackend::Cloud
+ && config.transcription.fallback == TranscriptionFallback::None
+ {
+ return Err(WhsprError::Transcription(
+ "cloud transcription failed and [transcription].fallback = \"none\"".into(),
+ ));
+ }
+
+ let mut local_config = config.transcription.clone();
+ local_config.backend = config.transcription.resolved_local_backend();
+ Ok((local_config, config.resolved_model_path()))
+}
+
+fn whisper_fallback_config(config: &TranscriptionConfig) -> TranscriptionConfig {
+ let mut fallback = config.clone();
+ fallback.backend = TranscriptionBackend::WhisperCpp;
+ fallback.local_backend = TranscriptionBackend::WhisperCpp;
+ fallback.selected_model = "large-v3-turbo".into();
+ fallback.model_path = model::model_path_for_config("ggml-large-v3-turbo.bin");
+ fallback
+}
+
+fn fallback_whisper_model_path() -> Option<std::path::PathBuf> {
+ let path = model::selected_model_local_path("large-v3-turbo")?;
+ path.exists().then_some(path)
+}
+
+fn transcription_task_error(
+ task_label: &str,
+ phase: &str,
+ error: &tokio::task::JoinError,
+) -> WhsprError {
+ let prefix = if task_label.is_empty() {
+ String::new()
+ } else {
+ format!("{task_label} ")
+ };
+ WhsprError::Transcription(format!("{prefix}{phase} task failed: {error}"))
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[tokio::test]
+ async fn cloud_fallback_none_returns_existing_error() {
+ let mut config = Config::default();
+ config.transcription.backend = TranscriptionBackend::Cloud;
+ config.transcription.fallback = TranscriptionFallback::None;
+
+ let err = fallback_local_transcribe(&config, vec![0.0; 16], 16_000)
+ .await
+ .expect_err("fallback should fail");
+ match err {
+ WhsprError::Transcription(message) => {
+ assert_eq!(
+ message,
+ "cloud transcription failed and [transcription].fallback = \"none\""
+ );
+ }
+ other => panic!("unexpected error: {other:?}"),
+ }
+ }
+
+ #[test]
+ fn local_fallback_config_resolves_configured_local_backend() {
+ let mut config = Config::default();
+ config.transcription.backend = TranscriptionBackend::Cloud;
+ config.transcription.local_backend = TranscriptionBackend::Nemo;
+
+ let (local_config, _) = local_fallback_config(&config).expect("fallback config");
+ assert_eq!(local_config.backend, TranscriptionBackend::Nemo);
+ }
+
+ #[test]
+ fn whisper_fallback_config_pins_whisper_cpp_large_v3_turbo() {
+ let fallback = whisper_fallback_config(&TranscriptionConfig::default());
+ assert_eq!(fallback.backend, TranscriptionBackend::WhisperCpp);
+ assert_eq!(fallback.local_backend, TranscriptionBackend::WhisperCpp);
+ assert_eq!(fallback.selected_model, "large-v3-turbo");
+ assert_eq!(
+ fallback.model_path,
+ model::model_path_for_config("ggml-large-v3-turbo.bin")
+ );
+ }
+}
diff --git a/src/asr/mod.rs b/src/asr/mod.rs
new file mode 100644
index 0000000..fbd90e2
--- /dev/null
+++ b/src/asr/mod.rs
@@ -0,0 +1,4 @@
+pub mod cleanup;
+pub mod execute;
+pub mod prepare;
+pub mod validation;
diff --git a/src/asr/prepare.rs b/src/asr/prepare.rs
new file mode 100644
index 0000000..1e24580
--- /dev/null
+++ b/src/asr/prepare.rs
@@ -0,0 +1,70 @@
+use crate::cloud::CloudService;
+use crate::config::{Config, TranscriptionBackend, TranscriptionConfig};
+use crate::error::{Result, WhsprError};
+use crate::faster_whisper::{self, FasterWhisperService};
+use crate::nemo_asr::{self, NemoAsrService};
+use crate::transcribe::WhisperLocal;
+use std::path::Path;
+
+pub enum PreparedTranscriber {
+ Whisper(tokio::task::JoinHandle<Result<WhisperLocal>>),
+ Faster(FasterWhisperService),
+ Nemo(NemoAsrService),
+ Cloud(CloudService),
+}
+
+pub fn prepare_transcriber(config: &Config) -> Result<PreparedTranscriber> {
+ super::cleanup::cleanup_stale_transcribers(config)?;
+
+ if config.transcription.backend == TranscriptionBackend::Cloud {
+ return Ok(PreparedTranscriber::Cloud(CloudService::new(config)?));
+ }
+
+ prepare_local_transcriber(&config.transcription, &config.resolved_model_path())
+}
+
+pub(crate) fn prepare_local_transcriber(
+ transcription: &TranscriptionConfig,
+ model_path: &Path,
+) -> Result<PreparedTranscriber> {
+ match transcription.backend {
+ TranscriptionBackend::WhisperCpp => {
+ let whisper_config = transcription.clone();
+ let model_path = model_path.to_path_buf();
+ Ok(PreparedTranscriber::Whisper(tokio::task::spawn_blocking(
+ move || WhisperLocal::new(&whisper_config, &model_path),
+ )))
+ }
+ TranscriptionBackend::FasterWhisper => faster_whisper::prepare_service(transcription)
+ .map(PreparedTranscriber::Faster)
+ .ok_or_else(|| {
+ WhsprError::Transcription(
+ "faster-whisper backend selected but no model path could be resolved".into(),
+ )
+ }),
+ TranscriptionBackend::Nemo => nemo_asr::prepare_service(transcription)
+ .map(PreparedTranscriber::Nemo)
+ .ok_or_else(|| {
+ WhsprError::Transcription(
+ "nemo backend selected but no model reference could be resolved".into(),
+ )
+ }),
+ TranscriptionBackend::Cloud => Err(WhsprError::Transcription(
+ "cloud backend cannot be prepared as a local transcriber".into(),
+ )),
+ }
+}
+
+pub fn prewarm_transcriber(prepared: &PreparedTranscriber, phase: &str) {
+ match prepared {
+ PreparedTranscriber::Faster(service) => match service.prewarm() {
+ Ok(()) => tracing::info!("prewarming faster-whisper worker via {}", phase),
+ Err(err) => tracing::warn!("failed to prewarm faster-whisper worker: {err}"),
+ },
+ PreparedTranscriber::Nemo(service) => match service.prewarm() {
+ Ok(()) => tracing::info!("prewarming NeMo ASR worker via {}", phase),
+ Err(err) => tracing::warn!("failed to prewarm NeMo ASR worker: {err}"),
+ },
+ _ => {}
+ }
+}
diff --git a/src/asr/validation.rs b/src/asr/validation.rs
new file mode 100644
index 0000000..21678de
--- /dev/null
+++ b/src/asr/validation.rs
@@ -0,0 +1,77 @@
+use crate::config::{Config, TranscriptionBackend};
+use crate::error::{Result, WhsprError};
+
+pub fn validate_transcription_config(config: &Config) -> Result<()> {
+ if config.transcription.backend == TranscriptionBackend::Cloud {
+ crate::cloud::validate_config(config)?;
+ }
+
+ if config.transcription.resolved_local_backend() == TranscriptionBackend::FasterWhisper
+ && !config.transcription.language.eq_ignore_ascii_case("en")
+ && !config.transcription.language.eq_ignore_ascii_case("auto")
+ {
+ return Err(WhsprError::Config(
+ "faster-whisper managed models are currently English-focused; set [transcription].language = \"en\" or \"auto\"".into(),
+ ));
+ }
+
+ if config.transcription.resolved_local_backend() == TranscriptionBackend::FasterWhisper
+ && config.transcription.language.eq_ignore_ascii_case("auto")
+ {
+ tracing::warn!(
+ "faster-whisper backend is configured with language = \"auto\"; English dictation is recommended"
+ );
+ }
+
+ if config.transcription.resolved_local_backend() == TranscriptionBackend::Nemo
+ && !config.transcription.language.eq_ignore_ascii_case("en")
+ && !config.transcription.language.eq_ignore_ascii_case("auto")
+ {
+ return Err(WhsprError::Config(
+ "NeMo experimental ASR models are currently English-only; set [transcription].language = \"en\" or \"auto\"".into(),
+ ));
+ }
+
+ Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn faster_whisper_rejects_non_english_explicit_language() {
+ let mut config = Config::default();
+ config.transcription.backend = TranscriptionBackend::FasterWhisper;
+ config.transcription.local_backend = TranscriptionBackend::FasterWhisper;
+ config.transcription.language = "sv".into();
+
+ let err = validate_transcription_config(&config).expect_err("config should fail");
+ match err {
+ WhsprError::Config(message) => {
+ assert!(
+ message.contains("faster-whisper managed models are currently English-focused")
+ );
+ }
+ other => panic!("unexpected error: {other:?}"),
+ }
+ }
+
+ #[test]
+ fn nemo_rejects_non_english_explicit_language() {
+ let mut config = Config::default();
+ config.transcription.backend = TranscriptionBackend::Nemo;
+ config.transcription.local_backend = TranscriptionBackend::Nemo;
+ config.transcription.language = "sv".into();
+
+ let err = validate_transcription_config(&config).expect_err("config should fail");
+ match err {
+ WhsprError::Config(message) => {
+ assert!(
+ message.contains("NeMo experimental ASR models are currently English-only")
+ );
+ }
+ other => panic!("unexpected error: {other:?}"),
+ }
+ }
+}
diff --git a/src/asr_model.rs b/src/asr_model.rs
index 3e0b9b4..d2a0ad5 100644
--- a/src/asr_model.rs
+++ b/src/asr_model.rs
@@ -1,9 +1,8 @@
use std::path::{Path, PathBuf};
-use crate::config::{
- self, TranscriptionBackend, resolve_config_path, update_config_transcription_selection,
-};
+use crate::config::{TranscriptionBackend, update_config_transcription_selection};
use crate::error::{Result, WhsprError};
+use crate::model_support;
use crate::{faster_whisper, model, nemo_asr};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -258,11 +257,7 @@ pub fn selected_model_path(name: &str) -> Option {
fn active_model_name(
config_path_override: Option<&Path>,
) -> Option<(TranscriptionBackend, String)> {
- let config_path = resolve_config_path(config_path_override);
- if !config_path.exists() {
- return None;
- }
- let config = config::Config::load(Some(&config_path)).ok()?;
+ let config = model_support::load_config_if_exists(config_path_override)?;
Some((
config.transcription.resolved_local_backend(),
config.transcription.selected_model,
@@ -285,12 +280,7 @@ fn model_status(info: &AsrModelInfo, active: Option<(TranscriptionBackend, &str)
})
.unwrap_or(false);
- match (is_active, is_local) {
- (true, true) => "active",
- (true, false) => "active (missing)",
- (_, true) => "local",
- _ => "remote",
- }
+ model_support::managed_download_status(is_active, is_local)
}
pub fn list_models(config_path_override: Option<&Path>) {
@@ -387,13 +377,10 @@ pub fn select_model(name: &str, config_path_override: Option<&Path>) -> Result<(
)));
}
- let config_path = resolve_config_path(config_path_override);
- if !config_path.exists() {
- config::write_default_config(
- &config_path,
- &model::model_path_for_config("ggml-large-v3-turbo.bin"),
- )?;
- }
+ let (config_path, _) = model_support::ensure_default_config(
+ config_path_override,
+ &model::model_path_for_config("ggml-large-v3-turbo.bin"),
+ )?;
let config_model_path = match info.backend {
TranscriptionBackend::WhisperCpp => model::model_path_for_config(
@@ -410,7 +397,7 @@ pub fn select_model(name: &str, config_path_override: Option<&Path>) -> Result<(
info.backend,
info.name,
&config_model_path,
- config::Config::load(Some(&config_path))
+ model_support::load_config_at_if_exists(&config_path)
.map(|config| config.transcription.backend != TranscriptionBackend::Cloud)
.unwrap_or(true),
)?;
diff --git a/src/audio/dsp.rs b/src/audio/dsp.rs
new file mode 100644
index 0000000..0a720ea
--- /dev/null
+++ b/src/audio/dsp.rs
@@ -0,0 +1,166 @@
+const HIGHPASS_CUTOFF_HZ: f32 = 80.0;
+const TRIM_FRAME_MS: usize = 10;
+const TRIM_PADDING_MS: usize = 40;
+const TRIM_MIN_RMS: f32 = 0.002;
+const TRIM_RELATIVE_RMS: f32 = 0.08;
+pub(super) const NORMALIZE_TARGET_PEAK: f32 = 0.85;
+const NORMALIZE_MAX_GAIN: f32 = 2.5;
+const NORMALIZE_MIN_PEAK: f32 = 0.005;
+
+pub fn preprocess_audio(samples: &mut Vec<f32>, sample_rate: u32) {
+ if samples.is_empty() || sample_rate == 0 {
+ return;
+ }
+
+ let before_len = samples.len();
+ let before = audio_stats(samples);
+
+ remove_dc_offset(samples);
+ apply_highpass(samples, sample_rate, HIGHPASS_CUTOFF_HZ);
+ trim_silence(samples, sample_rate);
+ let gain = normalize_peak(samples);
+
+ let after = audio_stats(samples);
+ tracing::debug!(
+ "audio preprocessing: len {} -> {}, rms {:.4} -> {:.4}, peak {:.4} -> {:.4}, gain {:.2}x",
+ before_len,
+ samples.len(),
+ before.rms,
+ after.rms,
+ before.peak,
+ after.peak,
+ gain
+ );
+}
+
+#[derive(Clone, Copy)]
+struct AudioStats {
+ rms: f32,
+ peak: f32,
+}
+
+fn audio_stats(samples: &[f32]) -> AudioStats {
+ if samples.is_empty() {
+ return AudioStats {
+ rms: 0.0,
+ peak: 0.0,
+ };
+ }
+
+ let mut peak = 0.0f32;
+ let mut energy = 0.0f32;
+ for sample in samples {
+ peak = peak.max(sample.abs());
+ energy += sample * sample;
+ }
+
+ AudioStats {
+ rms: (energy / samples.len() as f32).sqrt(),
+ peak,
+ }
+}
+
+pub(super) fn remove_dc_offset(samples: &mut [f32]) {
+ if samples.is_empty() {
+ return;
+ }
+
+ let mean = samples.iter().copied().sum::<f32>() / samples.len() as f32;
+ if mean.abs() < 1e-6 {
+ return;
+ }
+
+ for sample in samples {
+ *sample -= mean;
+ }
+}
+
+fn apply_highpass(samples: &mut [f32], sample_rate: u32, cutoff_hz: f32) {
+ if samples.len() < 2 || sample_rate == 0 || cutoff_hz <= 0.0 {
+ return;
+ }
+
+ let dt = 1.0 / sample_rate as f32;
+ let rc = 1.0 / (2.0 * std::f32::consts::PI * cutoff_hz);
+ let alpha = rc / (rc + dt);
+
+ let mut previous_input = samples[0];
+ let mut previous_output = 0.0f32;
+ samples[0] = 0.0;
+
+ for sample in samples.iter_mut().skip(1) {
+ let input = *sample;
+ let output = alpha * (previous_output + input - previous_input);
+ *sample = output;
+ previous_input = input;
+ previous_output = output;
+ }
+}
+
+pub(super) fn trim_silence(samples: &mut Vec<f32>, sample_rate: u32) {
+ if samples.is_empty() || sample_rate == 0 {
+ return;
+ }
+
+ let frame_len = ((sample_rate as usize * TRIM_FRAME_MS) / 1000).max(1);
+ if samples.len() <= frame_len * 2 {
+ return;
+ }
+
+ let frame_rms: Vec<f32> = samples.chunks(frame_len).map(frame_rms).collect();
+ let peak_rms = frame_rms.iter().copied().fold(0.0f32, f32::max);
+ if peak_rms <= 0.0 {
+ return;
+ }
+
+ let threshold = (peak_rms * TRIM_RELATIVE_RMS).max(TRIM_MIN_RMS);
+ let Some(first_active) = frame_rms.iter().position(|rms| *rms >= threshold) else {
+ return;
+ };
+ let Some(last_active) = frame_rms.iter().rposition(|rms| *rms >= threshold) else {
+ return;
+ };
+
+ let padding_samples = (sample_rate as usize * TRIM_PADDING_MS) / 1000;
+ let padding_frames = padding_samples.div_ceil(frame_len);
+ let start_frame = first_active.saturating_sub(padding_frames);
+ let end_frame = (last_active + 1 + padding_frames).min(frame_rms.len());
+
+ let start = start_frame.saturating_mul(frame_len);
+ let end = (end_frame.saturating_mul(frame_len)).min(samples.len());
+ if start == 0 && end == samples.len() {
+ return;
+ }
+ if start >= end {
+ return;
+ }
+
+ *samples = samples[start..end].to_vec();
+}
+
+fn frame_rms(frame: &[f32]) -> f32 {
+ if frame.is_empty() {
+ return 0.0;
+ }
+
+ let energy = frame.iter().map(|sample| sample * sample).sum::<f32>();
+ (energy / frame.len() as f32).sqrt()
+}
+
+pub(super) fn normalize_peak(samples: &mut [f32]) -> f32 {
+ let peak = samples.iter().copied().map(f32::abs).fold(0.0f32, f32::max);
+ if !(NORMALIZE_MIN_PEAK..NORMALIZE_TARGET_PEAK).contains(&peak) {
+ return 1.0;
+ }
+
+ let gain = (NORMALIZE_TARGET_PEAK / peak).min(NORMALIZE_MAX_GAIN);
+ if gain <= 1.0 {
+ return 1.0;
+ }
+
+ for sample in samples {
+ *sample = (*sample * gain).clamp(-1.0, 1.0);
+ }
+
+ gain
+}
diff --git a/src/audio/mod.rs b/src/audio/mod.rs
new file mode 100644
index 0000000..bf4207e
--- /dev/null
+++ b/src/audio/mod.rs
@@ -0,0 +1,8 @@
+mod dsp;
+mod recorder;
+
+#[cfg(test)]
+mod tests;
+
+pub use dsp::preprocess_audio;
+pub use recorder::AudioRecorder;
diff --git a/src/audio.rs b/src/audio/recorder.rs
similarity index 55%
rename from src/audio.rs
rename to src/audio/recorder.rs
index fd0f940..8e045bc 100644
--- a/src/audio.rs
+++ b/src/audio/recorder.rs
@@ -7,14 +7,6 @@ use crate::config::AudioConfig;
use crate::error::{Result, WhsprError};
const PREALLOC_SECONDS: usize = 120;
-const HIGHPASS_CUTOFF_HZ: f32 = 80.0;
-const TRIM_FRAME_MS: usize = 10;
-const TRIM_PADDING_MS: usize = 40;
-const TRIM_MIN_RMS: f32 = 0.002;
-const TRIM_RELATIVE_RMS: f32 = 0.08;
-const NORMALIZE_TARGET_PEAK: f32 = 0.85;
-const NORMALIZE_MAX_GAIN: f32 = 2.5;
-const NORMALIZE_MIN_PEAK: f32 = 0.005;
pub struct AudioRecorder {
config: AudioConfig,
@@ -154,7 +146,7 @@ impl AudioRecorder {
pub fn stop(&mut self) -> Result> {
// Take and leak the stream — cpal's ALSA backend calls snd_pcm_close()
// on drop without draining first, which causes an audible click on
- // PipeWire when the stream is still "warm". The OS reclaims file
+ // PipeWire when the stream is still "warm". The OS reclaims file
// descriptors on process exit.
if let Some(stream) = self.stream.take() {
let _ = stream.pause();
@@ -182,169 +174,11 @@ impl AudioRecorder {
buffer[start + i] *= gain;
}
- preprocess_audio(&mut buffer, self.config.sample_rate);
+ super::dsp::preprocess_audio(&mut buffer, self.config.sample_rate);
Ok(buffer)
}
}
-pub fn preprocess_audio(samples: &mut Vec<f32>, sample_rate: u32) {
- if samples.is_empty() || sample_rate == 0 {
- return;
- }
-
- let before_len = samples.len();
- let before = audio_stats(samples);
-
- remove_dc_offset(samples);
- apply_highpass(samples, sample_rate, HIGHPASS_CUTOFF_HZ);
- trim_silence(samples, sample_rate);
- let gain = normalize_peak(samples);
-
- let after = audio_stats(samples);
- tracing::debug!(
- "audio preprocessing: len {} -> {}, rms {:.4} -> {:.4}, peak {:.4} -> {:.4}, gain {:.2}x",
- before_len,
- samples.len(),
- before.rms,
- after.rms,
- before.peak,
- after.peak,
- gain
- );
-}
-
-#[derive(Clone, Copy)]
-struct AudioStats {
- rms: f32,
- peak: f32,
-}
-
-fn audio_stats(samples: &[f32]) -> AudioStats {
- if samples.is_empty() {
- return AudioStats {
- rms: 0.0,
- peak: 0.0,
- };
- }
-
- let mut peak = 0.0f32;
- let mut energy = 0.0f32;
- for sample in samples {
- peak = peak.max(sample.abs());
- energy += sample * sample;
- }
-
- AudioStats {
- rms: (energy / samples.len() as f32).sqrt(),
- peak,
- }
-}
-
-fn remove_dc_offset(samples: &mut [f32]) {
- if samples.is_empty() {
- return;
- }
-
- let mean = samples.iter().copied().sum::<f32>() / samples.len() as f32;
- if mean.abs() < 1e-6 {
- return;
- }
-
- for sample in samples {
- *sample -= mean;
- }
-}
-
-fn apply_highpass(samples: &mut [f32], sample_rate: u32, cutoff_hz: f32) {
- if samples.len() < 2 || sample_rate == 0 || cutoff_hz <= 0.0 {
- return;
- }
-
- let dt = 1.0 / sample_rate as f32;
- let rc = 1.0 / (2.0 * std::f32::consts::PI * cutoff_hz);
- let alpha = rc / (rc + dt);
-
- let mut previous_input = samples[0];
- let mut previous_output = 0.0f32;
- samples[0] = 0.0;
-
- for sample in samples.iter_mut().skip(1) {
- let input = *sample;
- let output = alpha * (previous_output + input - previous_input);
- *sample = output;
- previous_input = input;
- previous_output = output;
- }
-}
-
-fn trim_silence(samples: &mut Vec<f32>, sample_rate: u32) {
- if samples.is_empty() || sample_rate == 0 {
- return;
- }
-
- let frame_len = ((sample_rate as usize * TRIM_FRAME_MS) / 1000).max(1);
- if samples.len() <= frame_len * 2 {
- return;
- }
-
- let frame_rms: Vec<f32> = samples.chunks(frame_len).map(frame_rms).collect();
- let peak_rms = frame_rms.iter().copied().fold(0.0f32, f32::max);
- if peak_rms <= 0.0 {
- return;
- }
-
- let threshold = (peak_rms * TRIM_RELATIVE_RMS).max(TRIM_MIN_RMS);
- let Some(first_active) = frame_rms.iter().position(|rms| *rms >= threshold) else {
- return;
- };
- let Some(last_active) = frame_rms.iter().rposition(|rms| *rms >= threshold) else {
- return;
- };
-
- let padding_samples = (sample_rate as usize * TRIM_PADDING_MS) / 1000;
- let padding_frames = padding_samples.div_ceil(frame_len);
- let start_frame = first_active.saturating_sub(padding_frames);
- let end_frame = (last_active + 1 + padding_frames).min(frame_rms.len());
-
- let start = start_frame.saturating_mul(frame_len);
- let end = (end_frame.saturating_mul(frame_len)).min(samples.len());
- if start == 0 && end == samples.len() {
- return;
- }
- if start >= end {
- return;
- }
-
- *samples = samples[start..end].to_vec();
-}
-
-fn frame_rms(frame: &[f32]) -> f32 {
- if frame.is_empty() {
- return 0.0;
- }
-
- let energy = frame.iter().map(|sample| sample * sample).sum::<f32>();
- (energy / frame.len() as f32).sqrt()
-}
-
-fn normalize_peak(samples: &mut [f32]) -> f32 {
- let peak = samples.iter().copied().map(f32::abs).fold(0.0f32, f32::max);
- if !(NORMALIZE_MIN_PEAK..NORMALIZE_TARGET_PEAK).contains(&peak) {
- return 1.0;
- }
-
- let gain = (NORMALIZE_TARGET_PEAK / peak).min(NORMALIZE_MAX_GAIN);
- if gain <= 1.0 {
- return 1.0;
- }
-
- for sample in samples {
- *sample = (*sample * gain).clamp(-1.0, 1.0);
- }
-
- gain
-}
-
fn choose_input_config(
device: &cpal::Device,
sample_rate: u32,
@@ -368,7 +202,6 @@ fn choose_input_config(
if format_score == 0 {
continue;
}
- // Prefer mono (20), then fewer channels over more (penalty scales with count)
let channel_score: u8 = if cfg.channels() == 1 {
20
} else {
@@ -400,7 +233,7 @@ fn choose_input_config(
})
}
-fn append_mono_f32(data: &[f32], channels: usize, out: &mut Vec<f32>) {
+pub(super) fn append_mono_f32(data: &[f32], channels: usize, out: &mut Vec<f32>) {
if channels <= 1 {
out.extend_from_slice(data);
return;
@@ -412,7 +245,7 @@ fn append_mono_f32(data: &[f32], channels: usize, out: &mut Vec) {
}
}
-fn append_mono_i16(data: &[i16], channels: usize, out: &mut Vec<f32>) {
+pub(super) fn append_mono_i16(data: &[i16], channels: usize, out: &mut Vec<f32>) {
const I16_SCALE: f32 = 32768.0;
if channels <= 1 {
out.extend(data.iter().map(|s| *s as f32 / I16_SCALE));
@@ -425,7 +258,7 @@ fn append_mono_i16(data: &[i16], channels: usize, out: &mut Vec) {
}
}
-fn append_mono_u16(data: &[u16], channels: usize, out: &mut Vec<f32>) {
+pub(super) fn append_mono_u16(data: &[u16], channels: usize, out: &mut Vec<f32>) {
if channels <= 1 {
out.extend(
data.iter()
@@ -442,88 +275,3 @@ fn append_mono_u16(data: &[u16], channels: usize, out: &mut Vec) {
out.push(sum / frame.len() as f32);
}
}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- fn approx_eq(a: f32, b: f32, eps: f32) -> bool {
- (a - b).abs() <= eps
- }
-
- #[test]
- fn append_mono_f32_passthrough_for_single_channel() {
- let mut out = Vec::new();
- append_mono_f32(&[0.1, -0.2, 0.3], 1, &mut out);
- assert_eq!(out, vec![0.1, -0.2, 0.3]);
- }
-
- #[test]
- fn append_mono_f32_downmixes_stereo() {
- let mut out = Vec::new();
- append_mono_f32(&[1.0, -1.0, 0.5, 0.5], 2, &mut out);
- assert!(approx_eq(out[0], 0.0, 1e-6));
- assert!(approx_eq(out[1], 0.5, 1e-6));
- }
-
- #[test]
- fn append_mono_i16_converts_to_f32() {
- let mut out = Vec::new();
- append_mono_i16(&[i16::MAX, i16::MIN], 1, &mut out);
- assert!(approx_eq(out[0], 1.0, 1e-4));
- assert!(out[1] < -0.99);
- }
-
- #[test]
- fn append_mono_u16_downmixes_and_converts() {
- let mut out = Vec::new();
- append_mono_u16(&[0, u16::MAX], 2, &mut out);
- assert!(approx_eq(out[0], 0.0, 0.01));
- }
-
- #[test]
- fn remove_dc_offset_centers_signal() {
- let mut samples = vec![0.3, 0.5, 0.7];
- remove_dc_offset(&mut samples);
- let mean = samples.iter().copied().sum::<f32>() / samples.len() as f32;
- assert!(mean.abs() < 1e-6);
- }
-
- #[test]
- fn trim_silence_removes_outer_quiet_sections() {
- let sample_rate = 1000;
- let mut samples = vec![0.0; 120];
- samples.extend(std::iter::repeat_n(0.2, 200));
- samples.extend(vec![0.0; 120]);
-
- trim_silence(&mut samples, sample_rate);
-
- assert!(samples.len() < 440);
- assert!(samples.len() >= 200);
- assert!(samples.iter().any(|sample| sample.abs() >= 0.19));
- }
-
- #[test]
- fn normalize_peak_boosts_quiet_audio_without_clipping() {
- let mut samples = vec![0.2, -0.3, 0.4];
- let gain = normalize_peak(&mut samples);
- let peak = samples.iter().copied().map(f32::abs).fold(0.0f32, f32::max);
-
- assert!(gain > 1.0);
- assert!(approx_eq(peak, NORMALIZE_TARGET_PEAK, 1e-4));
- assert!(samples.iter().all(|sample| sample.abs() <= 1.0));
- }
-
- #[test]
- fn preprocess_audio_reduces_leading_and_trailing_silence() {
- let sample_rate = 16000;
- let mut samples = vec![0.0; 1600];
- samples.extend((0..3200).map(|idx| if idx % 2 == 0 { 0.08 } else { -0.08 }));
- samples.extend(vec![0.0; 1600]);
-
- preprocess_audio(&mut samples, sample_rate);
-
- assert!(samples.len() < 6400);
- assert!(samples.iter().any(|sample| sample.abs() > 0.1));
- }
-}
diff --git a/src/audio/tests.rs b/src/audio/tests.rs
new file mode 100644
index 0000000..58b296d
--- /dev/null
+++ b/src/audio/tests.rs
@@ -0,0 +1,81 @@
+use super::{dsp, preprocess_audio, recorder};
+
+fn approx_eq(a: f32, b: f32, eps: f32) -> bool {
+ (a - b).abs() <= eps
+}
+
+#[test]
+fn append_mono_f32_passthrough_for_single_channel() {
+ let mut out = Vec::new();
+ recorder::append_mono_f32(&[0.1, -0.2, 0.3], 1, &mut out);
+ assert_eq!(out, vec![0.1, -0.2, 0.3]);
+}
+
+#[test]
+fn append_mono_f32_downmixes_stereo() {
+ let mut out = Vec::new();
+ recorder::append_mono_f32(&[1.0, -1.0, 0.5, 0.5], 2, &mut out);
+ assert!(approx_eq(out[0], 0.0, 1e-6));
+ assert!(approx_eq(out[1], 0.5, 1e-6));
+}
+
+#[test]
+fn append_mono_i16_converts_to_f32() {
+ let mut out = Vec::new();
+ recorder::append_mono_i16(&[i16::MAX, i16::MIN], 1, &mut out);
+ assert!(approx_eq(out[0], 1.0, 1e-4));
+ assert!(out[1] < -0.99);
+}
+
+#[test]
+fn append_mono_u16_downmixes_and_converts() {
+ let mut out = Vec::new();
+ recorder::append_mono_u16(&[0, u16::MAX], 2, &mut out);
+ assert!(approx_eq(out[0], 0.0, 0.01));
+}
+
+#[test]
+fn remove_dc_offset_centers_signal() {
+ let mut samples = vec![0.3, 0.5, 0.7];
+ dsp::remove_dc_offset(&mut samples);
+ let mean = samples.iter().copied().sum::<f32>() / samples.len() as f32;
+ assert!(mean.abs() < 1e-6);
+}
+
+#[test]
+fn trim_silence_removes_outer_quiet_sections() {
+ let sample_rate = 1000;
+ let mut samples = vec![0.0; 120];
+ samples.extend(std::iter::repeat_n(0.2, 200));
+ samples.extend(vec![0.0; 120]);
+
+ dsp::trim_silence(&mut samples, sample_rate);
+
+ assert!(samples.len() < 440);
+ assert!(samples.len() >= 200);
+ assert!(samples.iter().any(|sample| sample.abs() >= 0.19));
+}
+
+#[test]
+fn normalize_peak_boosts_quiet_audio_without_clipping() {
+ let mut samples = vec![0.2, -0.3, 0.4];
+ let gain = dsp::normalize_peak(&mut samples);
+ let peak = samples.iter().copied().map(f32::abs).fold(0.0f32, f32::max);
+
+ assert!(gain > 1.0);
+ assert!(approx_eq(peak, dsp::NORMALIZE_TARGET_PEAK, 1e-4));
+ assert!(samples.iter().all(|sample| sample.abs() <= 1.0));
+}
+
+#[test]
+fn preprocess_audio_reduces_leading_and_trailing_silence() {
+ let sample_rate = 16000;
+ let mut samples = vec![0.0; 1600];
+ samples.extend((0..3200).map(|idx| if idx % 2 == 0 { 0.08 } else { -0.08 }));
+ samples.extend(vec![0.0; 1600]);
+
+ preprocess_audio(&mut samples, sample_rate);
+
+ assert!(samples.len() < 6400);
+ assert!(samples.iter().any(|sample| sample.abs() > 0.1));
+}
diff --git a/src/bin/whispers-osd.rs b/src/bin/whispers-osd.rs
index ed47233..9fe2d92 100644
--- a/src/bin/whispers-osd.rs
+++ b/src/bin/whispers-osd.rs
@@ -1,63 +1,93 @@
#[path = "../branding.rs"]
mod branding;
+#[path = "../osd_protocol.rs"]
+mod osd_protocol;
use std::ffi::CString;
+use std::io::{BufRead, BufReader};
use std::os::fd::AsRawFd;
use std::os::unix::io::{AsFd, FromRawFd};
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
use std::sync::Arc;
+use std::sync::OnceLock;
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
use std::time::Instant;
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
+use font8x8::{BASIC_FONTS, UnicodeFonts};
+use fontdue::Font;
+use osd_protocol::{OsdEvent, VoiceOsdStatus, VoiceOsdUpdate};
use wayland_client::protocol::{
wl_buffer, wl_compositor, wl_registry, wl_shm, wl_shm_pool, wl_surface,
};
use wayland_client::{Connection, Dispatch, QueueHandle, delegate_noop};
use wayland_protocols_wlr::layer_shell::v1::client::{zwlr_layer_shell_v1, zwlr_layer_surface_v1};
-// --- Layout ---
-const NUM_BARS: usize = 28;
+const NUM_BARS: usize = 12;
const BAR_WIDTH: u32 = 3;
const BAR_GAP: u32 = 2;
-const PAD_X: u32 = 10;
-const PAD_Y: u32 = 8;
const BAR_MIN_HEIGHT: f32 = 2.0;
-const BAR_MAX_HEIGHT: f32 = 30.0;
-const OSD_WIDTH: u32 = PAD_X * 2 + NUM_BARS as u32 * BAR_WIDTH + (NUM_BARS as u32 - 1) * BAR_GAP;
-const OSD_HEIGHT: u32 = BAR_MAX_HEIGHT as u32 + PAD_Y * 2;
+const BAR_MAX_HEIGHT: f32 = 20.0;
+const METER_WIDTH: u32 = 128;
+const METER_HEIGHT: u32 = 72;
+const VOICE_WIDTH: u32 = 760;
+const VOICE_HEIGHT: u32 = 248;
const MARGIN_BOTTOM: i32 = 40;
-const CORNER_RADIUS: u32 = 12;
const BORDER_WIDTH: u32 = 1;
-const RISE_RATE: f32 = 0.55;
-const DECAY_RATE: f32 = 0.88;
-
-// --- Animation ---
+const RISE_RATE: f32 = 0.40;
+const DECAY_RATE: f32 = 0.92;
const FPS: i32 = 30;
const FRAME_MS: i32 = 1000 / FPS;
-// --- Colors ---
-const BG_R: u8 = 18;
-const BG_G: u8 = 18;
-const BG_B: u8 = 30;
-const BG_A: u8 = 185;
-
-const BORDER_R: u8 = 140;
-const BORDER_G: u8 = 180;
+const BG_R: u8 = 15;
+const BG_G: u8 = 16;
+const BG_B: u8 = 24;
+const BG_A: u8 = 212;
+const BORDER_R: u8 = 200;
+const BORDER_G: u8 = 215;
const BORDER_B: u8 = 255;
-const BORDER_A: u8 = 40;
-
-// Bar gradient: teal → violet
-const BAR_LEFT_R: f32 = 0.0;
-const BAR_LEFT_G: f32 = 0.82;
-const BAR_LEFT_B: f32 = 0.75;
-const BAR_RIGHT_R: f32 = 0.65;
-const BAR_RIGHT_G: f32 = 0.35;
-const BAR_RIGHT_B: f32 = 1.0;
+const BORDER_A: u8 = 28;
+
+const PANEL_R: u8 = 22;
+const PANEL_G: u8 = 28;
+const PANEL_B: u8 = 39;
+const PANEL_A: u8 = 192;
+const PANEL_BORDER_A: u8 = 18;
+const TRACK_R: u8 = 28;
+const TRACK_G: u8 = 36;
+const TRACK_B: u8 = 50;
+const TRACK_A: u8 = 218;
+const HIGHLIGHT_A: u8 = 14;
+const SHADOW_R: u8 = 3;
+const SHADOW_G: u8 = 6;
+const SHADOW_B: u8 = 10;
+
+const BAR_REST_R: f32 = 0.863;
+const BAR_REST_G: f32 = 0.882;
+const BAR_REST_B: f32 = 0.922;
+const BAR_REST_A: f32 = 0.706;
+const BAR_PEAK_R: f32 = 0.392;
+const BAR_PEAK_G: f32 = 0.608;
+const BAR_PEAK_B: f32 = 1.0;
+const BAR_PEAK_A: f32 = 0.941;
+
+const TEXT_PRIMARY: (u8, u8, u8, u8) = (242, 246, 255, 230);
+const TEXT_MUTED: (u8, u8, u8, u8) = (185, 196, 220, 200);
+const TEXT_UNSTABLE: (u8, u8, u8, u8) = (115, 235, 226, 240);
+const TEXT_REWRITE: (u8, u8, u8, u8) = (255, 208, 126, 220);
+const TEXT_REWRITE_PRIMARY: (u8, u8, u8, u8) = (255, 236, 205, 240);
+const TEXT_WARNING: (u8, u8, u8, u8) = (255, 153, 134, 235);
+const ACCENT_STATUS: (u8, u8, u8) = (109, 236, 196);
static SHOULD_EXIT: AtomicBool = AtomicBool::new(false);
+static OSD_FONT: OnceLock