From f75475a6d7ba5d289bba5e89e097613538546c40 Mon Sep 17 00:00:00 2001 From: Roel Van Gils Date: Thu, 7 May 2026 22:24:06 +0200 Subject: [PATCH] On-disk Whisper transcription cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Repeat runs of `dpub convert --transcribe` against the same audio + model + language combination skip Whisper entirely. The cache lives in `~/.cache/dpub/transcripts/` (Unix) / `%LOCALAPPDATA%\dpub\transcripts\` (Windows); one JSON file per (audio, model, language) tuple keyed by SHA-256 of the inputs. Modifying any input invalidates the entry naturally — no manual cache management. Failures are non-fatal: corrupt cache files, IO errors, disk-full all log a warning and degrade silently to a fresh transcription. Set `DPUB_NO_TRANSCRIPT_CACHE=1` to bypass entirely (debugging). End-to-end measured on the 4h22m cavia book: - cold run: 722 s (Whisper on 109 audio files) - warm run: 21 s (109/109 cache hits) - 34× speedup Most of the warm-run time is Opus re-encoding + ZIP write; the cache lookup is dominated by audio file hashing (~ms per MB). Implementation: - `Segment` and `Word` in dpub-whisper now derive `serde::Deserialize` alongside the existing `Serialize`. Round-trip prerequisite. - New `transcript_cache` module in dpub-convert (~280 lines, 8 unit tests). `CachedTranscriber` wraps `dpub_whisper::Transcriber`, hashes the model once at construction, hashes audio per call, and stores a JSON envelope with diagnostic metadata + the segment payload. - `inject_transcripts` swaps in `CachedTranscriber`; the existing in-memory `HashMap` cache stays so we don't re-hash audio across sections that share a file.
Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + Cargo.lock | 3 + crates/dpub-convert/Cargo.toml | 3 + crates/dpub-convert/src/lib.rs | 6 +- crates/dpub-convert/src/transcript_cache.rs | 345 ++++++++++++++++++++ crates/dpub-whisper/src/lib.rs | 4 +- 6 files changed, 359 insertions(+), 3 deletions(-) create mode 100644 crates/dpub-convert/src/transcript_cache.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index df33bee..157a282 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ All notable changes to this project will be documented in this file. The format ### Added +- **On-disk Whisper transcription cache.** Repeat runs of `dpub convert --transcribe` against the same audio + model + language combination now skip Whisper entirely. Cache lives in `~/.cache/dpub/transcripts/` (Unix) / `%LOCALAPPDATA%\dpub\transcripts\` (Windows); one JSON file per (audio, model, language) tuple keyed by SHA-256 of the inputs. Modifying any input invalidates the entry naturally. Failures are non-fatal — corrupt cache files, IO errors, or disk-full all degrade silently to a fresh transcription. Set `DPUB_NO_TRANSCRIPT_CACHE=1` to bypass for debugging. - **Ground truth text alignment** (`--ground-truth `). Pass a plain text or markdown file containing the real book text and dpub will align it word-by-word against Whisper's transcription, replacing Whisper's approximate text with the real prose while keeping the word-level audio sync. Section headings are matched against the DAISY NCC headings via Jaro-Winkler fuzzy matching, so a single file with the whole book works as long as the chapters are in the right order. Markdown vs plain text is auto-detected. Requires `--transcribe` (Whisper still runs to produce timestamps). - **`--ground-truth-strategy `** controls how book content the narrator skipped (colophon, index, acknowledgements) is handled. 
`no-sync` (default) includes the text in the EPUB without a Media Overlay entry — visible, no karaoke highlight on those passages. `drop` excludes it entirely. `bracket` spans the available time gap proportionally for continuous (if imperfect) sync. - **Audiobook-specific boundary trimming.** Audiobook copyright preambles and outros (Whisper-only material) are detected automatically and discarded — they never leak into the first or last real word's timestamp. The detector requires a run of at least 5 consecutive matching words before it commits to the alignment, so a single coincidental match (e.g. the book title appearing in the preamble) can't trigger early alignment. diff --git a/Cargo.lock b/Cargo.lock index 45c614c..c8c3c9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -423,6 +423,9 @@ dependencies = [ "dpub-whisper", "epub3-writer", "rayon", + "serde", + "serde_json", + "sha2", "tempfile", "thiserror 1.0.69", "tracing", diff --git a/crates/dpub-convert/Cargo.toml b/crates/dpub-convert/Cargo.toml index 04b25ae..a835fbe 100644 --- a/crates/dpub-convert/Cargo.toml +++ b/crates/dpub-convert/Cargo.toml @@ -30,6 +30,9 @@ thiserror = { workspace = true } uuid = { workspace = true } chrono = { workspace = true } rayon = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +sha2 = { workspace = true } tempfile = "3" [dev-dependencies] diff --git a/crates/dpub-convert/src/lib.rs b/crates/dpub-convert/src/lib.rs index f31350d..8ed4842 100644 --- a/crates/dpub-convert/src/lib.rs +++ b/crates/dpub-convert/src/lib.rs @@ -20,6 +20,7 @@ use rayon::prelude::*; mod error; mod text_cleanup; +mod transcript_cache; pub use dpub_align::BoundaryStrategy; pub use error::{Error, Result}; @@ -776,7 +777,10 @@ fn inject_transcripts( // Load the GGML model exactly once for the whole book. Calling // `dpub_whisper::transcribe` per file would re-load 1.5 GB+ of // weights into Metal/CUDA buffers for every audio file (#10). 
- let transcriber = dpub_whisper::Transcriber::new(&whisper_opts)?; + // The wrapper layers an on-disk cache on top so repeat + // conversions of the same audio + model + language complete in + // seconds. Set `DPUB_NO_TRANSCRIPT_CACHE=1` to bypass. + let transcriber = transcript_cache::CachedTranscriber::new(&whisper_opts)?; // Read and split the ground truth file once, mapping section // index → owned section text. None when no ground truth is in use. diff --git a/crates/dpub-convert/src/transcript_cache.rs b/crates/dpub-convert/src/transcript_cache.rs new file mode 100644 index 0000000..59bd669 --- /dev/null +++ b/crates/dpub-convert/src/transcript_cache.rs @@ -0,0 +1,345 @@ +//! On-disk cache for Whisper transcription output. +//! +//! Whisper is the slowest stage of `dpub convert --transcribe`. The +//! output is deterministic given the audio bytes, the model bytes, +//! the language code, and our serialisation schema — so we hash those, +//! key a JSON file by the result, and skip re-running Whisper when +//! the same combination has been seen before. +//! +//! Cache layout: +//! - Directory: `~/.cache/dpub/transcripts/` (Unix), `%LOCALAPPDATA%\dpub\transcripts\` (Windows). +//! - Filename: `<combined_hash>.json` where `combined_hash` derives from +//! `(audio_sha256, model_sha256, language, schema_version)`. +//! - Format: JSON envelope with diagnostic metadata + the +//! `Vec<Segment>` payload. +//! +//! The cache is purely an optimisation: read failures fall back to a +//! fresh transcription, write failures are logged and ignored. +//! `DPUB_NO_TRANSCRIPT_CACHE=1` disables both reads and writes. + +use std::fs; +use std::io::{Read, Write}; +use std::path::{Path, PathBuf}; + +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; + +use dpub_whisper::{Segment, TranscribeOptions, Transcriber}; + +/// Bumped whenever the on-disk JSON shape changes.
Old cache files +/// hash to a different key after a bump and will simply be ignored +/// (orphaned on disk, never overwritten). No deletion needed. +const SCHEMA_VERSION: u32 = 1; + +/// Disk cache wrapper around `dpub_whisper::Transcriber`. Keeps the +/// model loaded and its hash memoised across all calls in one run. +pub(crate) struct CachedTranscriber { + inner: Transcriber, + model_sha: String, + language: String, + cache_dir: PathBuf, + cache_enabled: bool, +} + +impl CachedTranscriber { + pub(crate) fn new(opts: &TranscribeOptions) -> crate::Result<Self> { + let inner = Transcriber::new(opts)?; + let model_sha = hash_file(&opts.model_path).unwrap_or_else(|e| { + // Hashing failure isn't fatal — it just disables the + // cache for this run. Log it so the user knows why they + // didn't get a speedup. + tracing::warn!( + "transcript cache: model hash failed ({e}); cache disabled this run" + ); + String::new() + }); + let cache_enabled = !model_sha.is_empty() + && std::env::var_os("DPUB_NO_TRANSCRIPT_CACHE").is_none(); + let cache_dir = transcripts_cache_dir(); + if cache_enabled { + // Create the dir up front; ignore failures (we'll log on first write).
+ let _ = fs::create_dir_all(&cache_dir); + } + Ok(Self { + inner, + model_sha, + language: opts.language.clone(), + cache_dir, + cache_enabled, + }) + } + + pub(crate) fn transcribe(&self, audio_path: &Path) -> crate::Result<Vec<Segment>> { + if !self.cache_enabled { + return Ok(self.inner.transcribe(audio_path)?); + } + let audio_sha = match hash_file(audio_path) { + Ok(s) => s, + Err(e) => { + tracing::warn!( + "transcript cache: audio hash failed for {} ({e}); transcribing without cache", + audio_path.display() + ); + return Ok(self.inner.transcribe(audio_path)?); + } + }; + let key = combined_key(&audio_sha, &self.model_sha, &self.language); + let cache_path = self.cache_dir.join(format!("{key}.json")); + + if let Some(segments) = read_cached(&cache_path) { + tracing::info!( + "transcript cache: hit for {} ({} segments)", + audio_path.display(), + segments.len() + ); + return Ok(segments); + } + + let segments = self.inner.transcribe(audio_path)?; + let envelope = Envelope { + schema_version: SCHEMA_VERSION, + audio_sha256: audio_sha, + model_sha256: self.model_sha.clone(), + language: self.language.clone(), + dpub_whisper_version: env!("CARGO_PKG_VERSION").to_owned(), + segments: segments.clone(), + }; + if let Err(e) = write_cached(&cache_path, &envelope) { + tracing::warn!( + "transcript cache: write failed for {} ({e}); transcript will be re-computed next time", + cache_path.display() + ); + } else { + tracing::debug!( + "transcript cache: stored {} ({} segments)", + cache_path.display(), + envelope.segments.len() + ); + } + Ok(segments) + } +} + +/// JSON envelope written to disk. The metadata fields duplicate the +/// inputs that already feed into the cache key — they're for `jq` +/// debugging, not lookup. +#[derive(Debug, Serialize, Deserialize)] +struct Envelope { + schema_version: u32, + audio_sha256: String, + model_sha256: String, + language: String, + dpub_whisper_version: String, + segments: Vec<Segment>, +} + +/// Look up the cache file.
Returns `Some(segments)` on a clean hit. +/// Any error (missing file, corrupt JSON, schema mismatch) yields +/// `None`; missing files are silent, real errors log a warning. +fn read_cached(path: &Path) -> Option<Vec<Segment>> { + let bytes = match fs::read(path) { + Ok(b) => b, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return None, + Err(e) => { + tracing::warn!("transcript cache: read failed for {}: {e}", path.display()); + return None; + } + }; + let env: Envelope = match serde_json::from_slice(&bytes) { + Ok(e) => e, + Err(e) => { + tracing::warn!( + "transcript cache: ignoring malformed entry {}: {e}", + path.display() + ); + return None; + } + }; + if env.schema_version != SCHEMA_VERSION { + return None; + } + Some(env.segments) +} + +/// Atomically write the cache entry (`.partial` then rename). Same +/// pattern as the model downloader in `dpub-cli/src/setup.rs`. +fn write_cached(path: &Path, envelope: &Envelope) -> std::io::Result<()> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + let partial = path.with_extension("json.partial"); + let json = serde_json::to_vec(envelope).map_err(std::io::Error::other)?; + { + let mut f = fs::File::create(&partial)?; + f.write_all(&json)?; + f.sync_data()?; + } + fs::rename(&partial, path)?; + Ok(()) +} + +/// Stream-hash a file's bytes with SHA-256. Mirrors the helper used +/// for `dpub setup --whisper-model …` model verification but lives +/// here to avoid a cross-crate dependency for ~15 lines.
+fn hash_file(path: &Path) -> std::io::Result<String> { + let mut file = fs::File::open(path)?; + let mut hasher = Sha256::new(); + let mut buf = vec![0u8; 64 * 1024]; + loop { + let n = file.read(&mut buf)?; + if n == 0 { + break; + } + hasher.update(&buf[..n]); + } + Ok(hex(hasher.finalize().as_slice())) +} + +fn hex(bytes: &[u8]) -> String { + use std::fmt::Write; + let mut s = String::with_capacity(bytes.len() * 2); + for b in bytes { + let _ = write!(&mut s, "{b:02x}"); + } + s +} + +/// Combined cache key: `sha256(audio_sha || model_sha || lang || schema_version)`, +/// truncated to 32 hex chars. Truncation is fine: SHA-256 has no +/// adversary here, only the normal birthday-bound risk, which at 128 +/// bits of entropy is ~2^64 inputs before a collision is even +/// plausible. Real-world cache will have a few thousand entries max. +fn combined_key(audio_sha: &str, model_sha: &str, language: &str) -> String { + let mut hasher = Sha256::new(); + hasher.update(audio_sha.as_bytes()); + hasher.update(b"\0"); + hasher.update(model_sha.as_bytes()); + hasher.update(b"\0"); + hasher.update(language.as_bytes()); + hasher.update(b"\0"); + hasher.update(SCHEMA_VERSION.to_le_bytes()); + let hex = hex(hasher.finalize().as_slice()); + hex[..32].to_owned() +} + +/// Return the platform-appropriate transcripts cache directory. +/// Mirrors the layout of `~/.cache/dpub/models/` in `dpub-cli/setup.rs`.
+fn transcripts_cache_dir() -> PathBuf { + if cfg!(target_os = "windows") { + let base = std::env::var_os("LOCALAPPDATA") + .map_or_else(|| PathBuf::from("."), PathBuf::from); + base.join("dpub").join("transcripts") + } else { + let home = std::env::var_os("HOME") + .map_or_else(|| PathBuf::from("."), PathBuf::from); + home.join(".cache").join("dpub").join("transcripts") + } +} + +#[cfg(test)] +mod tests { + use super::*; + use dpub_whisper::Word; + + fn sample_segments() -> Vec<Segment> { + vec![Segment { + start_seconds: 0.0, + end_seconds: 1.5, + text: "Hello world.".into(), + words: vec![ + Word { + start_seconds: 0.0, + end_seconds: 0.5, + text: "Hello".into(), + }, + Word { + start_seconds: 0.5, + end_seconds: 1.5, + text: "world.".into(), + }, + ], + }] + } + + #[test] + fn round_trip_envelope() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("entry.json"); + let env = Envelope { + schema_version: SCHEMA_VERSION, + audio_sha256: "aaaa".into(), + model_sha256: "bbbb".into(), + language: "nl".into(), + dpub_whisper_version: "0.6.0".into(), + segments: sample_segments(), + }; + write_cached(&path, &env).unwrap(); + let got = read_cached(&path).expect("hit"); + assert_eq!(got, env.segments); + } + + #[test] + fn missing_file_is_silent_miss() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("nope.json"); + assert!(read_cached(&path).is_none()); + } + + #[test] + fn corrupt_file_is_warning_miss() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("bad.json"); + fs::write(&path, b"not json").unwrap(); + assert!(read_cached(&path).is_none()); + } + + #[test] + fn schema_mismatch_treated_as_miss() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("v0.json"); + let json = serde_json::json!({ + "schema_version": SCHEMA_VERSION + 99, + "audio_sha256": "a", + "model_sha256": "b", + "language": "nl", + "dpub_whisper_version": "0.6.0", + "segments": [], + }); + fs::write(&path,
serde_json::to_vec(&json).unwrap()).unwrap(); + assert!(read_cached(&path).is_none()); + } + + #[test] + fn hash_file_is_deterministic() { + let dir = tempfile::tempdir().unwrap(); + let p = dir.path().join("a.bin"); + fs::write(&p, b"hello world").unwrap(); + assert_eq!(hash_file(&p).unwrap(), hash_file(&p).unwrap()); + } + + #[test] + fn hash_file_distinguishes_inputs() { + let dir = tempfile::tempdir().unwrap(); + let a = dir.path().join("a.bin"); + let b = dir.path().join("b.bin"); + fs::write(&a, b"hello").unwrap(); + fs::write(&b, b"world").unwrap(); + assert_ne!(hash_file(&a).unwrap(), hash_file(&b).unwrap()); + } + + #[test] + fn combined_key_changes_when_any_input_changes() { + let base = combined_key("aaaa", "bbbb", "nl"); + assert_ne!(base, combined_key("zzzz", "bbbb", "nl")); + assert_ne!(base, combined_key("aaaa", "zzzz", "nl")); + assert_ne!(base, combined_key("aaaa", "bbbb", "en")); + } + + #[test] + fn cache_dir_ends_in_transcripts() { + let dir = transcripts_cache_dir(); + assert_eq!(dir.file_name().unwrap(), "transcripts"); + let parent_name = dir.parent().unwrap().file_name().unwrap(); + assert_eq!(parent_name, "dpub"); + } +} diff --git a/crates/dpub-whisper/src/lib.rs b/crates/dpub-whisper/src/lib.rs index 8fd6f14..850f69a 100644 --- a/crates/dpub-whisper/src/lib.rs +++ b/crates/dpub-whisper/src/lib.rs @@ -47,7 +47,7 @@ use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextPar /// One transcribed time-range with the text Whisper produced for it. /// /// Times are in seconds (whisper.cpp returns centiseconds; we convert). -#[derive(Debug, Clone, PartialEq, serde::Serialize)] +#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)] pub struct Segment { pub start_seconds: f64, pub end_seconds: f64, @@ -65,7 +65,7 @@ pub struct Segment { /// the produced EPUB). 
Times are in seconds; whisper.cpp's token /// timestamps are notoriously approximate (~100–300 ms tolerance), so /// callers should not rely on word boundaries being lip-sync-accurate. -#[derive(Debug, Clone, PartialEq, serde::Serialize)] +#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)] pub struct Word { pub start_seconds: f64, pub end_seconds: f64,