From 936c8663b2d5ca5d1e887b2cc2aafa50e553a097 Mon Sep 17 00:00:00 2001 From: Roel Van Gils Date: Thu, 7 May 2026 21:54:40 +0200 Subject: [PATCH] Ground truth text alignment (--ground-truth) with JSON support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace Whisper's approximate transcription with the real book text while preserving word-level audio sync via Myers diff + Jaro-Winkler fuzzy matching. Auto-detects markdown / plain text / structured-JSON / bulk-JSON ground-truth inputs. New crate `dpub-align` (~1000 lines, 44 unit + 1 integration test): - normalize.rs — Unicode-aware lowercase + punctuation stripping - diff.rs — Myers diff via `similar` 3.x with Jaro-Winkler ≥ 0.85 fuzzy promotion (catches "Antwerpe" → "Antwerpen") - boundary.rs — Anchor detection (≥5 consecutive matches) classifies ops as Leading/Core/Trailing so audiobook preambles and outros never smear into the first/last real word - transfer.rs — Timestamp transfer with three boundary strategies (Drop / NoSync / Bracket), monotonicity enforcement, character-proportional interpolation for inserts - section_split.rs — Markdown vs plain-text auto-detect; fuzzy heading matching against DAISY NCC titles - json_format.rs — Structured (per-chapter) and bulk (whole-book blob) JSON formats; pass-through with NBSP normalisation CLI: - --ground-truth (requires --transcribe) - --ground-truth-strategy (default: no-sync) - Config file fields `ground_truth` and `ground_truth_strategy` Bug fixes surfaced by end-to-end testing on a digit-prefixed DAISY book: - OPF manifest IDs now prefixed with `s-` when stems start with a digit (XML Names cannot start with digits — broke any DAISY book with `001_*.smil` filenames; the reference book happened to use letter prefixes so nobody noticed). - Empty `<seq>` elements no longer leak into Media Overlay SMIL files (EPUBCheck RSC-005). 
Empty paragraph wrappers are skipped at the builder layer; the SMIL writer also defensively drops recursively empty seq subtrees; the heading-level overlay shell is preserved when alignment would have produced an entirely empty word tree. - Words that round to the same millisecond as their neighbour no longer ship in SMIL (EPUBCheck MED-009). The builder mirrors the writer's millisecond rounding when filtering zero-duration words. End-to-end verified on "De verwarde Cavia" (109 sections, 4h22m audio): 103/109 sections matched, 209 boundary trim events recorded, final EPUB passes EPUBCheck clean (0 fatals / 0 errors / 0 warnings). Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 7 + Cargo.lock | 34 + Cargo.toml | 1 + crates/dpub-align/Cargo.toml | 22 + crates/dpub-align/examples/match_sections.rs | 66 ++ crates/dpub-align/src/boundary.rs | 212 +++++++ crates/dpub-align/src/diff.rs | 229 +++++++ crates/dpub-align/src/error.rs | 11 + crates/dpub-align/src/json_format.rs | 353 +++++++++++ crates/dpub-align/src/lib.rs | 340 ++++++++++ crates/dpub-align/src/normalize.rs | 89 +++ crates/dpub-align/src/section_split.rs | 256 ++++++++ crates/dpub-align/src/transfer.rs | 621 +++++++++++++++++++ crates/dpub-cli/src/config.rs | 8 +- crates/dpub-cli/src/main.rs | 85 ++- crates/dpub-convert/Cargo.toml | 2 + crates/dpub-convert/src/error.rs | 10 + crates/dpub-convert/src/lib.rs | 323 +++++++++- crates/dpub-convert/tests/real_conversion.rs | 2 + crates/epub3-writer/src/writers.rs | 15 + 20 files changed, 2678 insertions(+), 8 deletions(-) create mode 100644 crates/dpub-align/Cargo.toml create mode 100644 crates/dpub-align/examples/match_sections.rs create mode 100644 crates/dpub-align/src/boundary.rs create mode 100644 crates/dpub-align/src/diff.rs create mode 100644 crates/dpub-align/src/error.rs create mode 100644 crates/dpub-align/src/json_format.rs create mode 100644 crates/dpub-align/src/lib.rs create mode 100644 crates/dpub-align/src/normalize.rs create mode 
100644 crates/dpub-align/src/section_split.rs create mode 100644 crates/dpub-align/src/transfer.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index f5c069e..df33bee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ All notable changes to this project will be documented in this file. The format - **`--auto-cover` for Dutch (and other) books no longer silently misses.** Open Library tags docs with ISO 639-2/B (e.g. `"dut"` for Dutch), while DAISY 2.02 metadata uses ISO 639-1 (`"nl"`); the previous literal `eq_ignore_ascii_case` dropped every plausible match. `dpub-meta` now treats 639-1, 639-2/B and 639-2/T as equivalent (`nl`/`dut`/`nld`, `fr`/`fre`/`fra`, `de`/`ger`/`deu`, etc.). Real-world miss this surfaced: "Het smelt" by Lize Spit. Regression test added. - **ISBN search hits are now trusted unconditionally.** When DAISY's `dc:identifier` is ISBN-shaped, the search-by-ISBN already disambiguates the edition, so the language and author filters on the result are noise — and would (incorrectly) reject the cover when Open Library lists a translator under `author_name`. Title+author search remains filtered. - **Open Library HTTP timeout raised from 8 s to 30 s.** `covers.openlibrary.org` redirects through archive.org and can take ~20 s on first hit for less-popular editions; 8 s caused spurious "lookup failed" misses. +- **OPF manifest IDs no longer fail XML Name validation when DAISY filenames start with digits.** DAISY books frequently use `001_*.smil`, `002_*.smil` filenames; the previous code copied those stems into manifest `id` and `idref` attributes, which XML Names reject (must start with a letter or underscore). Stems beginning with a digit are now prefixed with `s-`. EPUBCheck no longer flags `RSC-005` for these books. The reference book ("Ontmoetingen in het donker") was unaffected because its filenames begin with letters. +- **Empty `<seq>` elements no longer leak into Media Overlay SMIL files** (EPUBCheck `RSC-005` "element seq incomplete"). 
Empty paragraph wrappers are dropped at the writer level, and the heading-level overlay shell is preserved when ground-truth alignment would have produced an entirely empty word tree. +- **Words with `clipBegin == clipEnd` no longer ship in SMIL** (EPUBCheck `MED-009`). Zero-duration words from interpolation are filtered out alongside the explicit Unsynced sentinel; their XHTML span is still emitted so the text remains readable. - **Whisper model download no longer times out on slow connections.** The HTTP agent used a 60-second total-request timeout, which was insufficient for the 1.5 GB `ggml-medium.bin` download. Now uses per-read timeouts (60 s idle) so downloads can take as long as needed as long as data keeps flowing. Additionally, downloads now retry up to 3 times on transient failures (CDN stalls, connection resets). ### Changed @@ -17,6 +20,10 @@ All notable changes to this project will be documented in this file. The format ### Added +- **Ground truth text alignment** (`--ground-truth <file>`). Pass a plain text or markdown file containing the real book text and dpub will align it word-by-word against Whisper's transcription, replacing Whisper's approximate text with the real prose while keeping the word-level audio sync. Section headings are matched against the DAISY NCC headings via Jaro-Winkler fuzzy matching, so a single file with the whole book works as long as the chapters are in the right order. Markdown vs plain text is auto-detected. Requires `--transcribe` (Whisper still runs to produce timestamps). +- **`--ground-truth-strategy <strategy>`** controls how book content the narrator skipped (colophon, index, acknowledgements) is handled. `no-sync` (default) includes the text in the EPUB without a Media Overlay entry — visible, no karaoke highlight on those passages. `drop` excludes it entirely. `bracket` spans the available time gap proportionally for continuous (if imperfect) sync. 
+- **Audiobook-specific boundary trimming.** Audiobook copyright preambles and outros (Whisper-only material) are detected automatically and discarded — they never leak into the first or last real word's timestamp. The detector requires a run of at least 5 consecutive matching words before it commits to the alignment, so a single coincidental match (e.g. the book title appearing in the preamble) can't trigger early alignment. +- **New crate `dpub-align`** containing the alignment algorithm: word normalisation, Myers diff (via `similar`), Jaro-Winkler fuzzy promotion (≥ 0.85 → Equal), boundary anchor detection, and timestamp transfer with monotonicity enforcement. 33 unit tests. - **`--transcribe` auto-detects language from book metadata.** Passing `--transcribe` without a language code now reads `dc:language` from the DAISY NCC metadata and normalises it to ISO 639-1 for Whisper. Explicit `--transcribe nl` still works. Config file supports `"transcribe": true` for auto-detect or `"transcribe": "nl"` for a fixed default. - **Shared ISO 639 normaliser** (`dpub-util/lang`). Maps ISO 639-1, 639-2/B and 639-2/T codes to their canonical two-letter form. Used by both `dpub-meta` (cover lookup language filter) and `dpub-cli` (transcription auto-detect). - **Persistent config file** (`~/.config/dpub/config.json` on Unix, `%APPDATA%\dpub\config.json` on Windows). Lets users set defaults for `audio`, `bitrate`, `auto_cover`, `no_word_sync`, `rights`, `whisper_model`, `transcribe`, `validate`, `a11y`, `jobs`, and `log_level`. CLI flags always override config values. 
diff --git a/Cargo.lock b/Cargo.lock index b85b78e..45c614c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -150,6 +150,16 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "bumpalo" version = "3.20.2" @@ -355,6 +365,19 @@ dependencies = [ "syn", ] +[[package]] +name = "dpub-align" +version = "0.6.0" +dependencies = [ + "dpub-core", + "serde", + "serde_json", + "similar", + "strsim", + "thiserror 1.0.69", + "tracing", +] + [[package]] name = "dpub-audio" version = "0.6.0" @@ -392,6 +415,7 @@ name = "dpub-convert" version = "0.6.0" dependencies = [ "chrono", + "dpub-align", "dpub-audio", "dpub-core", "dpub-meta", @@ -401,6 +425,7 @@ dependencies = [ "rayon", "tempfile", "thiserror 1.0.69", + "tracing", "uuid", "zip", ] @@ -1265,6 +1290,15 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" +[[package]] +name = "similar" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d93e861ede2e497b47833469b8ec9d5c07fa4c78ce7a00f6eb7dd8168b4b3f" +dependencies = [ + "bstr", +] + [[package]] name = "slab" version = "0.4.12" diff --git a/Cargo.toml b/Cargo.toml index 536c4d1..9d2235d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] resolver = "3" members = [ + "crates/dpub-align", "crates/dpub-audio", "crates/dpub-core", "crates/dpub-cli", diff --git a/crates/dpub-align/Cargo.toml b/crates/dpub-align/Cargo.toml new file mode 100644 index 0000000..c0003c4 --- /dev/null +++ b/crates/dpub-align/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "dpub-align" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true 
+repository.workspace = true +description = "Align Whisper word-level timestamps to ground truth book text via Myers diff + fuzzy matching." + +[lints] +workspace = true + +[dependencies] +serde = { workspace = true } +serde_json = { workspace = true } +similar = "3" +strsim = "0.11" +thiserror = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +dpub-core = { path = "../dpub-core" } diff --git a/crates/dpub-align/examples/match_sections.rs b/crates/dpub-align/examples/match_sections.rs new file mode 100644 index 0000000..fc43c9b --- /dev/null +++ b/crates/dpub-align/examples/match_sections.rs @@ -0,0 +1,66 @@ +//! Dry-run helper: parse a DAISY 2.02 publication and a ground-truth +//! file and report how many sections the heading matcher resolves — +//! without running Whisper. Useful when validating a new ground-truth +//! file against a book. +//! +//! Usage: +//! ```text +//! cargo run --release -p dpub-align --example match_sections -- \ +//! /path/to/book/ncc.html /path/to/groundtruth.{txt,md,json} +//! 
``` + +use std::path::Path; + +fn main() { + let mut args = std::env::args().skip(1); + let ncc = args.next().expect("usage: match_sections "); + let gt = args.next().expect("usage: match_sections "); + + let book = dpub_core::Book::from_ncc(Path::new(&ncc)).expect("parse DAISY"); + let raw = std::fs::read_to_string(>).expect("read ground truth"); + + let headings: Vec<(&str, usize)> = book + .master + .references + .iter() + .enumerate() + .map(|(i, r)| (r.title.as_str(), i)) + .collect(); + + let sections = dpub_align::split_into_sections(&raw, &headings); + println!( + "Matched {} of {} DAISY sections", + sections.len(), + headings.len() + ); + println!(); + + let matched: std::collections::HashSet = sections.iter().map(|s| s.ncc_index).collect(); + println!("First 10 matches:"); + for s in sections.iter().take(10) { + let title = headings[s.ncc_index].0; + let preview: String = s.text.chars().take(50).collect::().replace('\n', " "); + println!( + " [{:3}] {:30} → {:5} chars {:?}", + s.ncc_index, + title, + s.text.len(), + preview + ); + } + println!(); + + let unmatched: Vec<&str> = headings + .iter() + .enumerate() + .filter(|(i, _)| !matched.contains(i)) + .map(|(_, (t, _))| *t) + .collect(); + println!("Unmatched headings ({} total):", unmatched.len()); + for t in unmatched.iter().take(20) { + println!(" {t}"); + } + if unmatched.len() > 20 { + println!(" ... and {} more", unmatched.len() - 20); + } +} diff --git a/crates/dpub-align/src/boundary.rs b/crates/dpub-align/src/boundary.rs new file mode 100644 index 0000000..0d5727c --- /dev/null +++ b/crates/dpub-align/src/boundary.rs @@ -0,0 +1,212 @@ +//! Anchor detection and region classification for the edit script. +//! +//! The two streams won't always cover the same scope. Audiobook +//! preambles ("This is a Luisterpunt production…") are Whisper-only; +//! colophons and indices are ground-truth-only. These mismatches +//! cluster at the boundaries of a section. +//! +//! 
This module finds the **anchor region** — the longest middle +//! section bracketed by runs of ≥5 consecutive Equal/Fuzzy operations +//! — and tags every op as Leading / Core / Trailing so the timestamp +//! transfer phase can apply different policies per region. + +use crate::diff::Op; + +/// Minimum run of consecutive Equal/Fuzzy ops required to count as +/// an anchor. Tuned to ignore single coincidental matches inside a +/// preamble (e.g. the book title) while still catching the start of +/// the actual content. +pub(crate) const ANCHOR_MIN_RUN: usize = 5; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum Region { + /// Before the leading anchor (or whole script if no anchor). + Leading, + /// Inside the anchor region — normal timestamp transfer applies. + Core, + /// After the trailing anchor. + Trailing, +} + +#[derive(Debug, Clone)] +pub(crate) struct ClassifiedOp { + pub op: Op, + pub region: Region, +} + +/// Classify each op by region. +/// +/// If no anchor is found (the streams are wildly different), every op +/// is flagged as `Core` so the caller falls back to plain transfer +/// rather than dropping content. +pub(crate) fn classify(script: &[Op]) -> Vec { + let leading = first_anchor_start(script); + let trailing = last_anchor_end(script); + + let (lead_end, trail_start) = match (leading, trailing) { + (Some(l), Some(t)) if l < t => (l, t), + // No usable anchor pair: treat everything as Core. This + // matches the "best effort" contract — when the diff is + // chaotic, naive transfer is still better than dropping. 
+ _ => return script.iter().map(|&op| ClassifiedOp { + op, + region: Region::Core, + }).collect(), + }; + + script + .iter() + .enumerate() + .map(|(i, &op)| { + let region = if i < lead_end { + Region::Leading + } else if i >= trail_start { + Region::Trailing + } else { + Region::Core + }; + ClassifiedOp { op, region } + }) + .collect() +} + +/// Return the index *of the first op* in the leading anchor run +/// (the first match of a ≥ANCHOR_MIN_RUN streak), or `None` if no +/// such run exists. +fn first_anchor_start(script: &[Op]) -> Option { + let mut run_len = 0usize; + let mut run_start = 0usize; + for (i, op) in script.iter().enumerate() { + if is_match(op) { + if run_len == 0 { + run_start = i; + } + run_len += 1; + if run_len >= ANCHOR_MIN_RUN { + return Some(run_start); + } + } else { + run_len = 0; + } + } + None +} + +/// Return the index *one past the last op* in the trailing anchor +/// run, or `None` if no such run exists. +fn last_anchor_end(script: &[Op]) -> Option { + let mut run_len = 0usize; + let mut run_end = 0usize; // exclusive + for (i, op) in script.iter().enumerate().rev() { + if is_match(op) { + if run_len == 0 { + run_end = i + 1; + } + run_len += 1; + if run_len >= ANCHOR_MIN_RUN { + return Some(run_end); + } + } else { + run_len = 0; + } + } + None +} + +fn is_match(op: &Op) -> bool { + matches!(op, Op::Equal { .. } | Op::Fuzzy { .. 
}) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn eq(w: usize, g: usize) -> Op { + Op::Equal { + whisper_idx: w, + gt_idx: g, + } + } + fn del(w: usize) -> Op { + Op::Delete { whisper_idx: w } + } + fn ins(g: usize) -> Op { + Op::Insert { gt_idx: g } + } + + #[test] + fn no_anchors_means_all_core() { + // Only 4 matches — below threshold of 5 + let script = vec![eq(0, 0), eq(1, 1), eq(2, 2), eq(3, 3), del(4)]; + let classified = classify(&script); + assert!(classified.iter().all(|c| c.region == Region::Core)); + } + + #[test] + fn preamble_is_leading() { + // 10 deletes (preamble), then 5 matches (anchor). + let mut script: Vec = (0..10).map(del).collect(); + script.extend((0..5).map(|i| eq(10 + i, i))); + let classified = classify(&script); + assert_eq!(classified.len(), 15); + // The 10 deletes should be Leading; the 5 matches Core. + for c in &classified[0..10] { + assert_eq!(c.region, Region::Leading); + } + for c in &classified[10..15] { + assert_eq!(c.region, Region::Core); + } + } + + #[test] + fn colophon_is_trailing() { + // 5 matches (anchor), then 10 inserts (colophon). 
+ let mut script: Vec = (0..5).map(|i| eq(i, i)).collect(); + script.extend((5..15).map(ins)); + let classified = classify(&script); + for c in &classified[0..5] { + assert_eq!(c.region, Region::Core); + } + for c in &classified[5..15] { + assert_eq!(c.region, Region::Trailing); + } + } + + #[test] + fn full_book_pattern() { + // preamble (8 deletes) → core (5 matches, 1 delete, 5 matches) + // → outro (8 deletes) + let mut script: Vec = (0..8).map(del).collect(); + script.extend((0..5).map(|i| eq(8 + i, i))); + script.push(del(13)); + script.extend((5..10).map(|i| eq(14 + i - 5, i))); + script.extend((19..27).map(del)); + + let classified = classify(&script); + // First 8 should be Leading + assert!(classified[0..8].iter().all(|c| c.region == Region::Leading)); + // Last 8 should be Trailing + let n = classified.len(); + assert!(classified[n - 8..n].iter().all(|c| c.region == Region::Trailing)); + // Middle should be Core + assert!(classified[8..n - 8].iter().all(|c| c.region == Region::Core)); + } + + #[test] + fn single_coincidental_match_in_preamble_doesnt_anchor() { + // 3 deletes, 1 match (book title in preamble?), 4 deletes, + // then 5 real matches. + let mut script: Vec = vec![del(0), del(1), del(2)]; + script.push(eq(3, 0)); // coincidental + script.extend((4..8).map(del)); + script.extend((0..5).map(|i| eq(8 + i, i + 1))); + let classified = classify(&script); + // The coincidental match should be classified as Leading, + // because it's before the real 5-run anchor. + assert_eq!(classified[3].region, Region::Leading); + // Real anchor begins at index 8. + for c in &classified[8..13] { + assert_eq!(c.region, Region::Core); + } + } +} diff --git a/crates/dpub-align/src/diff.rs b/crates/dpub-align/src/diff.rs new file mode 100644 index 0000000..53be63a --- /dev/null +++ b/crates/dpub-align/src/diff.rs @@ -0,0 +1,229 @@ +//! Word-level Myers diff with Jaro-Winkler fuzzy promotion. +//! +//! Produces an [`EditScript`] — a sequence of [`Op`] entries that +//! 
describe how to turn the Whisper word stream into the ground truth +//! word stream (Equal/Fuzzy/Insert/Delete/Replace). + +use similar::{capture_diff_slices, Algorithm}; + +use crate::{GroundTruthWord, WordTiming}; + +/// One operation in the edit script. Indices refer to the input +/// slices: `whisper_idx` into the Whisper word stream, `gt_idx` into +/// the ground truth word stream. +#[derive(Debug, Clone, Copy, PartialEq)] +pub(crate) enum Op { + /// Whisper word and ground truth word match exactly (after + /// normalisation). + Equal { whisper_idx: usize, gt_idx: usize }, + /// Whisper word and ground truth word are similar enough + /// (Jaro-Winkler ≥ threshold) — treat as a match. + Fuzzy { + whisper_idx: usize, + gt_idx: usize, + score: f64, + }, + /// Whisper word with no ground truth counterpart (hallucinated / + /// repeated / preamble). + Delete { whisper_idx: usize }, + /// Ground truth word with no Whisper counterpart (skipped / + /// colophon / outro). + Insert { gt_idx: usize }, +} + +/// Threshold above which a Replace operation is promoted to Fuzzy. +/// Set lower than the typical 0.9 to catch trailing-letter truncations +/// like "Antwerpe" → "Antwerpen" (≈ 0.97) and minor letter swaps. +const FUZZY_THRESHOLD: f64 = 0.85; + +/// Run Myers diff over the normalised keys of both streams and emit +/// the post-processed edit script (Equal/Fuzzy/Insert/Delete). +pub(crate) fn diff_words( + whisper: &[WordTiming], + ground_truth: &[GroundTruthWord], +) -> Vec { + use crate::normalize; + + // Build key slices. We could compute keys lazily but caching + // avoids re-running normalise() inside Myers' inner loop. 
+ let whisper_keys: Vec = whisper.iter().map(|w| normalize::normalise(&w.text)).collect(); + let gt_keys: Vec<&str> = ground_truth.iter().map(|w| w.key.as_str()).collect(); + let whisper_key_refs: Vec<&str> = whisper_keys.iter().map(String::as_str).collect(); + + let diff_ops = capture_diff_slices(Algorithm::Myers, &whisper_key_refs, >_keys); + + let mut script: Vec = Vec::with_capacity(whisper.len() + ground_truth.len()); + for op in diff_ops { + match op { + similar::DiffOp::Equal { + old_index, + new_index, + len, + } => { + for i in 0..len { + script.push(Op::Equal { + whisper_idx: old_index + i, + gt_idx: new_index + i, + }); + } + } + similar::DiffOp::Delete { + old_index, old_len, .. + } => { + for i in 0..old_len { + script.push(Op::Delete { + whisper_idx: old_index + i, + }); + } + } + similar::DiffOp::Insert { + new_index, new_len, .. + } => { + for i in 0..new_len { + script.push(Op::Insert { + gt_idx: new_index + i, + }); + } + } + similar::DiffOp::Replace { + old_index, + old_len, + new_index, + new_len, + } => { + // Pair up the Replace block one-to-one (longest common + // length); leftover Whisper words become Deletes, + // leftover ground truth words become Inserts. Within + // each pair, promote to Fuzzy if Jaro-Winkler is high + // enough — handles "Antwerpe"/"Antwerpen", trailing-s + // confusions, etc. + let pair_len = old_len.min(new_len); + for i in 0..pair_len { + let w_i = old_index + i; + let g_i = new_index + i; + let score = + strsim::jaro_winkler(whisper_key_refs[w_i], gt_keys[g_i]); + if score >= FUZZY_THRESHOLD { + script.push(Op::Fuzzy { + whisper_idx: w_i, + gt_idx: g_i, + score, + }); + } else { + // Genuinely different word: emit as + // Delete + Insert. The transfer phase uses + // Whisper's time span for the inserted word + // anyway when these are adjacent. 
+ script.push(Op::Delete { whisper_idx: w_i }); + script.push(Op::Insert { gt_idx: g_i }); + } + } + for i in pair_len..old_len { + script.push(Op::Delete { + whisper_idx: old_index + i, + }); + } + for i in pair_len..new_len { + script.push(Op::Insert { + gt_idx: new_index + i, + }); + } + } + } + } + script +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::normalize; + + fn ww(text: &str, start: f64, end: f64) -> WordTiming { + WordTiming { + start_seconds: start, + end_seconds: end, + text: text.to_owned(), + } + } + + fn gt(words: &[&str]) -> Vec { + words + .iter() + .map(|s| GroundTruthWord { + text: (*s).to_owned(), + key: normalize::normalise(s), + }) + .collect() + } + + #[test] + fn perfect_match_all_equal() { + let w = vec![ + ww("hello", 0.0, 0.5), + ww("world", 0.5, 1.0), + ]; + let g = gt(&["hello", "world"]); + let ops = diff_words(&w, &g); + assert!(matches!(ops[0], Op::Equal { .. })); + assert!(matches!(ops[1], Op::Equal { .. })); + assert_eq!(ops.len(), 2); + } + + #[test] + fn whisper_hallucination_is_delete() { + let w = vec![ + ww("hello", 0.0, 0.5), + ww("um", 0.5, 0.7), + ww("world", 0.7, 1.2), + ]; + let g = gt(&["hello", "world"]); + let ops = diff_words(&w, &g); + // Should have an Equal, a Delete, an Equal. + let deletes: Vec<_> = ops + .iter() + .filter(|op| matches!(op, Op::Delete { .. })) + .collect(); + assert_eq!(deletes.len(), 1); + } + + #[test] + fn whisper_omission_is_insert() { + let w = vec![ + ww("hello", 0.0, 0.5), + ww("world", 0.5, 1.0), + ]; + let g = gt(&["hello", "the", "world"]); + let ops = diff_words(&w, &g); + let inserts: Vec<_> = ops + .iter() + .filter(|op| matches!(op, Op::Insert { .. 
})) + .collect(); + assert_eq!(inserts.len(), 1); + } + + #[test] + fn truncated_word_promoted_to_fuzzy() { + // "Antwerpe" vs "Antwerpen" — Jaro-Winkler ~0.97 + let w = vec![ww("antwerpe", 0.0, 1.0)]; + let g = gt(&["antwerpen"]); + let ops = diff_words(&w, &g); + assert_eq!(ops.len(), 1); + match ops[0] { + Op::Fuzzy { score, .. } => assert!(score >= 0.85), + other => panic!("expected Fuzzy, got {other:?}"), + } + } + + #[test] + fn unrelated_words_stay_replace_split() { + // "table" vs "elephant" — Jaro-Winkler well below threshold + let w = vec![ww("table", 0.0, 1.0)]; + let g = gt(&["elephant"]); + let ops = diff_words(&w, &g); + // Should be Delete + Insert, not Fuzzy + assert!(ops.iter().any(|op| matches!(op, Op::Delete { .. }))); + assert!(ops.iter().any(|op| matches!(op, Op::Insert { .. }))); + assert!(!ops.iter().any(|op| matches!(op, Op::Fuzzy { .. }))); + } +} diff --git a/crates/dpub-align/src/error.rs b/crates/dpub-align/src/error.rs new file mode 100644 index 0000000..469585a --- /dev/null +++ b/crates/dpub-align/src/error.rs @@ -0,0 +1,11 @@ +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum Error { + #[error("ground truth is empty for section")] + EmptyGroundTruth, + #[error("no whisper words for section")] + NoWhisperWords, +} + +pub type Result = std::result::Result; diff --git a/crates/dpub-align/src/json_format.rs b/crates/dpub-align/src/json_format.rs new file mode 100644 index 0000000..0ef4d60 --- /dev/null +++ b/crates/dpub-align/src/json_format.rs @@ -0,0 +1,353 @@ +//! Adapter for the structured-JSON ground truth format. +//! +//! Schema (permissive — unknown fields are ignored): +//! +//! ```json +//! { +//! "content": [ +//! { "chapter-title": "...", "chapter-content": "..." }, +//! ... +//! ] +//! } +//! ``` +//! +//! Each entry becomes one section. The title is matched against the +//! DAISY NCC heading via the existing fuzzy matcher; the content +//! becomes the section body. Single newlines in the content are +//! 
treated as paragraph breaks (DAISY-friendly default — most book +//! exporters split paragraphs that way). + +use serde::Deserialize; + +/// Returns `true` when `raw` looks like our JSON format (first +/// non-whitespace char is `{`). Used to dispatch between the JSON +/// parser and the plain-text/markdown path. +pub fn looks_like_json(raw: &str) -> bool { + raw.trim_start().starts_with('{') +} + +/// Convert a JSON document conforming to the chapter-array schema +/// into the markdown-style ground-truth text the rest of `dpub-align` +/// already consumes. On any parse error returns the input unchanged +/// so the caller can fall through to the plain-text path. +pub fn convert_to_markdown(raw: &str) -> String { + match serde_json::from_str::(raw) { + Ok(doc) => render(&doc), + Err(e) => { + tracing::warn!("ground truth: JSON parse failed ({e}); falling back to plain-text path"); + raw.to_owned() + } + } +} + +#[derive(Debug, Deserialize)] +struct Document { + #[serde(default)] + content: Vec, +} + +#[derive(Debug, Deserialize)] +struct Chapter { + #[serde(rename = "chapter-title", default)] + title: Option, + #[serde(rename = "chapter-content", default)] + content: Option, +} + +fn render(doc: &Document) -> String { + // Bulk format: no chapter object carries a title — the whole book + // is in one (or more) `chapter-content` blobs with section titles + // encoded inline (typically as ALL-CAPS short lines). Concatenate + // the bodies and pass through as plain text so the existing + // line-by-line heading detector picks up the inline titles via + // fuzzy matching. 
+ let any_titled = doc.content.iter().any(|c| { + c.title + .as_deref() + .is_some_and(|t| !t.trim().is_empty()) + }); + if !any_titled { + let mut body = String::new(); + for ch in &doc.content { + if let Some(content) = &ch.content { + if !body.is_empty() { + body.push_str("\n\n"); + } + body.push_str(content.trim()); + } + } + return normalize_body_preserving_lines(&body); + } + + let mut out = String::with_capacity(doc.content.iter().map(|c| { + c.title.as_deref().map_or(0, str::len) + c.content.as_deref().map_or(0, str::len) + 8 + }).sum()); + + for chapter in &doc.content { + let body_raw = chapter.content.as_deref().unwrap_or("").trim(); + let title = chapter + .title + .as_deref() + .map(normalize_title) + .filter(|t| !t.is_empty()); + + // Skip entries with neither title nor content — nothing useful + // to align against. + if title.is_none() && body_raw.is_empty() { + continue; + } + + // Emit a markdown H1 so the existing splitter picks it up. + // Untitled entries (typically the first cover/title-page item) + // get a synthetic placeholder so they still count as a section + // boundary; matchers will simply fail to find them in the NCC, + // which is the correct outcome. + let heading = title.unwrap_or_else(|| body_raw + .lines() + .next() + .unwrap_or("untitled") + .chars() + .take(80) + .collect::()); + + out.push_str("# "); + out.push_str(heading.trim()); + out.push_str("\n\n"); + + let body = normalize_body(body_raw); + out.push_str(&body); + out.push_str("\n\n"); + } + + out +} + +/// Normalise a chapter title that may contain literal newlines or +/// non-breaking spaces (`\u{a0}`). Reading systems display titles on +/// one line; the NCC heading we match against is also a single line. 
+fn normalize_title(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + let mut last_was_space = true; + for c in s.chars() { + let is_space = c == '\n' || c == '\r' || c == '\t' || c == '\u{a0}' || c == ' '; + if is_space { + if !last_was_space { + out.push(' '); + } + last_was_space = true; + } else { + out.push(c); + last_was_space = false; + } + } + out.trim().to_owned() +} + +/// Convert the JSON's `chapter-content` into the paragraph-aware +/// format the rest of the pipeline expects (paragraphs separated by +/// blank lines). Heuristic: +/// - Treat single `\n` as a paragraph break (the format uses single +/// newlines between paragraphs). +/// - Collapse runs of newlines into a single paragraph break. +/// - Replace non-breaking spaces with regular spaces (better word +/// matching: `nieuwe\u{a0}avonturen` matches Whisper's `nieuwe +/// avonturen`). +/// Normalise a bulk-format body without merging lines: inline +/// chapter titles (ALL-CAPS short lines preceded by blank lines) must +/// stay on their own lines so [`section_split`] can detect them. +/// Only character-level normalisations are applied (NBSP → space). +fn normalize_body_preserving_lines(s: &str) -> String { + s.chars() + .map(|c| if c == '\u{a0}' || c == '\u{2009}' || c == '\u{200a}' { ' ' } else { c }) + .collect() +} + +fn normalize_body(s: &str) -> String { + let mut paragraphs: Vec = Vec::new(); + for raw_para in s.split('\n') { + let trimmed = raw_para.trim(); + if trimmed.is_empty() { + continue; + } + // Replace NBSPs and other Unicode spaces with a regular space. + let normalised: String = trimmed + .chars() + .map(|c| if c == '\u{a0}' || c == '\u{2009}' || c == '\u{200a}' { ' ' } else { c }) + .collect(); + paragraphs.push(normalised); + } + paragraphs.join("\n\n") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn detects_json_input() { + assert!(looks_like_json("{\"content\": []}")); + assert!(looks_like_json(" \n { ... 
}")); + assert!(!looks_like_json("# Heading\nbody")); + assert!(!looks_like_json("plain text")); + assert!(!looks_like_json("")); + } + + #[test] + fn converts_basic_document() { + let json = r#"{ + "content": [ + {"chapter-title": "Chapter 1", "chapter-content": "First paragraph.\nSecond paragraph."}, + {"chapter-title": "Chapter 2", "chapter-content": "Body of two."} + ] + }"#; + let md = convert_to_markdown(json); + assert!(md.contains("# Chapter 1")); + assert!(md.contains("# Chapter 2")); + // Paragraphs separated by blank lines (i.e. \n\n). + assert!(md.contains("First paragraph.\n\nSecond paragraph.")); + } + + #[test] + fn collapses_multiline_title() { + let json = r#"{"content":[ + {"chapter-title": "Hoera!\nNieuwe avonturen", "chapter-content": "Body."} + ]}"#; + let md = convert_to_markdown(json); + assert!(md.contains("# Hoera! Nieuwe avonturen")); + } + + #[test] + fn replaces_nbsp_in_title_and_body() { + let json = "{\"content\":[{\"chapter-title\":\"de\u{a0}cavia\",\"chapter-content\":\"woord\u{a0}met nbsp.\"}]}"; + let md = convert_to_markdown(json); + assert!(md.contains("# de cavia")); + assert!(md.contains("woord met nbsp.")); + } + + #[test] + fn skips_entry_with_no_useful_content() { + let json = r#"{"content":[ + {"chapter-content": ""}, + {"chapter-title": "Real chapter", "chapter-content": "Body."} + ]}"#; + let md = convert_to_markdown(json); + // Only one heading. 
+ assert_eq!(md.matches("# ").count(), 1); + assert!(md.contains("# Real chapter")); + } + + #[test] + fn ignores_extra_top_level_fields() { + let json = r#"{ + "title": "Book Title", + "language": "nl", + "extraction_time_ms": 12345, + "total_chars_count": 999, + "content": [ + {"chapter-title": "Only", "chapter-content": "Body."} + ] + }"#; + let md = convert_to_markdown(json); + assert!(md.contains("# Only")); + assert!(md.contains("Body.")); + } + + #[test] + fn ignores_extra_chapter_fields() { + let json = r#"{"content":[ + {"chapter-title": "T", "chapter-content": "B", "chars_count": 1, "word_count": 1, "anything-else": null} + ]}"#; + let md = convert_to_markdown(json); + assert!(md.contains("# T")); + assert!(md.contains("B")); + } + + #[test] + fn malformed_json_falls_through_unchanged() { + let raw = "{not json"; + assert_eq!(convert_to_markdown(raw), raw); + } + + /// Smoke test against a real fullbook.json (when present on disk). + /// Gated on the env var so CI without the file passes. The fixture + /// may be either format (structured or bulk); we just assert that + /// parsing produces a non-empty result. + #[test] + fn parses_real_fullbook_json() { + let Ok(path) = std::env::var("DPUB_TEST_GROUND_TRUTH_JSON") else { + return; + }; + let raw = std::fs::read_to_string(&path).expect("read fixture"); + assert!(looks_like_json(&raw)); + let md = convert_to_markdown(&raw); + assert!(!md.is_empty()); + } + + /// Integration: parse the bulk-format fullbook.json and verify the + /// inline ALL-CAPS chapter titles can be matched against a sample + /// of expected DAISY headings via the public splitter API. + #[test] + fn bulk_format_finds_inline_chapter_titles() { + let Ok(path) = std::env::var("DPUB_TEST_GROUND_TRUTH_JSON") else { + return; + }; + let raw = std::fs::read_to_string(&path).expect("read fixture"); + // Skip if it's not the bulk format we want to exercise. 
+ let doc: Document = serde_json::from_str(&raw).expect("valid json"); + let any_titled = doc.content.iter().any(|c| { + c.title.as_deref().is_some_and(|t| !t.trim().is_empty()) + }); + if any_titled { + return; // structured format — different test. + } + let md = convert_to_markdown(&raw); + // Sample of titles known to live inline in this fixture + // (copied from the DAISY filenames of the matching book). + let ncc: &[(&str, usize)] = &[ + ("Hh", 0), + ("Opletten", 1), + ("Nierstenen", 2), + ("Ping", 3), + ("Opvangbakje", 4), + ]; + let sections = crate::split_into_sections(&md, ncc); + // The fuzzy matcher should find at least 3 of the 5 — this is + // a loose threshold so the test isn't brittle if NCC formatting + // changes the matcher's preferences. + assert!( + sections.len() >= 3, + "expected ≥3 of 5 sample headings to match, got {}", + sections.len() + ); + } + + #[test] + fn bulk_format_passes_body_through_as_plain_text() { + // Single-chapter document with the whole book in one blob and + // ALL-CAPS chapter titles inline. We must NOT prepend a `# ` + // wrapper — the inline titles need to remain plain lines so + // section_split's plain-text path can fuzzy-match them. + let json = r#"{"content":[{"chapter-content": +"Cover blurb.\n\nHÈHÈ\nFirst chapter body.\n\nOPLETTEN\nSecond chapter body."}]}"#; + let md = convert_to_markdown(json); + // No markdown headings emitted. + assert!(!md.contains("# ")); + // Inline titles preserved as their own lines. + assert!(md.contains("\nHÈHÈ\n")); + assert!(md.contains("\nOPLETTEN\n")); + } + + #[test] + fn untitled_first_entry_gets_placeholder() { + // Real-world case: the first entry has no `chapter-title`. + let json = r#"{"content":[ + {"chapter-content": "De verwarde cavia"}, + {"chapter-title": "Hoofdstuk 1", "chapter-content": "Body."} + ]}"#; + let md = convert_to_markdown(json); + // Two headings: one synthetic from body, one explicit. 
+ assert_eq!(md.matches("# ").count(), 2); + assert!(md.contains("# De verwarde cavia")); + assert!(md.contains("# Hoofdstuk 1")); + } +} diff --git a/crates/dpub-align/src/lib.rs b/crates/dpub-align/src/lib.rs new file mode 100644 index 0000000..50f07f9 --- /dev/null +++ b/crates/dpub-align/src/lib.rs @@ -0,0 +1,340 @@ +//! Align Whisper's approximate word-level transcription against the +//! real book text (ground truth), transferring timestamps so the EPUB +//! ships with accurate prose AND word-level Media Overlay sync. +//! +//! Pipeline: +//! +//! 1. **Section split** — match ground truth headings to NCC headings +//! so each section's text is identified. +//! 2. **Word diff** — Myers diff with Jaro-Winkler fuzzy promotion +//! over normalised word keys. +//! 3. **Boundary trim** — discard audiobook preamble / outro and +//! handle book-only material (colophon etc.) per +//! [`BoundaryStrategy`]. +//! 4. **Timestamp transfer** — copy/redistribute/interpolate Whisper +//! timings onto the ground truth word stream. +//! 5. **Paragraph reconstruction** — group aligned words by ground +//! truth paragraph breaks (blank lines) and emit +//! [`AlignedParagraph`]. + +mod boundary; +mod diff; +mod error; +mod json_format; +mod normalize; +mod section_split; +mod transfer; + +pub use error::{Error, Result}; +pub use section_split::SectionText; + +/// Split a ground truth file into per-section chunks matching the +/// supplied NCC headings. +/// +/// Auto-detects format: +/// - **JSON** (first non-whitespace char is `{`): parsed as the +/// structured chapter array (see [`json_format`]) and converted to +/// markdown internally. +/// - **Markdown / plain text**: passed through to the heading-line +/// splitter as-is. 
+pub fn split_into_sections(text: &str, ncc_headings: &[(&str, usize)]) -> Vec { + if json_format::looks_like_json(text) { + let markdown = json_format::convert_to_markdown(text); + section_split::split_into_sections(&markdown, ncc_headings) + } else { + section_split::split_into_sections(text, ncc_headings) + } +} + +/// Lightweight word-with-timestamp type. Mirrors `dpub_whisper::Word` +/// but keeps `dpub-align` free of whisper.cpp build dependencies. +#[derive(Debug, Clone, PartialEq)] +pub struct WordTiming { + pub start_seconds: f64, + pub end_seconds: f64, + pub text: String, +} + +/// One ground truth word with the timestamp transferred from Whisper +/// (or interpolated when Whisper had no match). +#[derive(Debug, Clone, PartialEq)] +pub struct AlignedWord { + pub text: String, + pub start_seconds: f64, + pub end_seconds: f64, + pub confidence: Confidence, +} + +/// Provenance of an aligned word's timestamp. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Confidence { + /// Direct match — Whisper's normalised key equals the ground truth's. + Exact, + /// Near-match (Jaro-Winkler ≥ 0.85). + Fuzzy, + /// Inserted; timestamp interpolated proportionally from neighbours. + Interpolated, + /// Outside the anchor region under `bracket` strategy: timestamp + /// spans a slice of the leading/trailing gap. + Bracketed, + /// Outside the anchor region under `no-sync` strategy: word has + /// text but no usable timestamp (caller should omit from SMIL). + Unsynced, +} + +/// One paragraph's worth of aligned words, ready to feed into the +/// existing XHTML/SMIL pipeline. +#[derive(Debug, Clone)] +pub struct AlignedParagraph { + pub text: String, + pub words: Vec, + pub audio_src: String, + pub start_seconds: f64, + pub end_seconds: f64, +} + +/// How to handle ground-truth-only words (book content the narrator +/// skipped — colophon, index, acknowledgements). 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum BoundaryStrategy { + /// Drop the words from the EPUB entirely. + Drop, + /// Include the text but emit no Media Overlay entry — visible in + /// the XHTML, no karaoke highlight on those passages. Default + /// because for accessibility readable text matters more than + /// perfect highlight tracking. + #[default] + NoSync, + /// Span the available time gap proportionally — highlight bar + /// moves through the words at average speed. Produces continuous + /// sync at the cost of timestamp accuracy. + Bracket, +} + +/// Diagnostic for one trimmed/dropped/bracketed region. +#[derive(Debug, Clone)] +pub struct TrimEvent { + pub kind: TrimKind, + pub word_count: usize, + /// First ~80 chars of the trimmed text, for log lines. + pub preview: String, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TrimKind { + /// Whisper-only words before the leading anchor (audiobook preamble). + LeadingWhisper, + /// Whisper-only words after the trailing anchor (audiobook outro). + TrailingWhisper, + /// Ground-truth-only words before the leading anchor. + LeadingGroundTruth, + /// Ground-truth-only words after the trailing anchor. + TrailingGroundTruth, +} + +/// Result of aligning one section: paragraphs to embed in the EPUB, +/// plus a log of every trim/drop event for the user to inspect. +#[derive(Debug, Clone)] +pub struct AlignmentResult { + pub paragraphs: Vec, + pub trim_log: Vec, +} + +/// Align ground truth text against a Whisper word stream for one section. +/// +/// `whisper_words` is the flat list of Whisper words for this section's +/// audio (in chronological order). `ground_truth` is the section's text +/// with paragraph boundaries marked by blank lines. `audio_src` is the +/// audio filename (basename) that all produced paragraphs will inherit. 
+pub fn align_section( + whisper_words: &[WordTiming], + ground_truth: &str, + audio_src: &str, + boundary_strategy: BoundaryStrategy, +) -> Result { + if whisper_words.is_empty() { + return Err(Error::NoWhisperWords); + } + if ground_truth.trim().is_empty() { + return Err(Error::EmptyGroundTruth); + } + + // Tokenise ground truth into paragraphs of words. Each word + // remembers the paragraph index it belongs to, so we can rebuild + // paragraph structure after timestamp transfer. + let (gt_words, paragraph_breaks) = tokenise_ground_truth(ground_truth); + + // Run word-level diff on normalised keys. + let edit_script = diff::diff_words(whisper_words, >_words); + + // Detect the alignment anchor region and classify ops as + // leading/core/trailing. + let trimmed = boundary::classify(&edit_script); + + // Walk the classified edit script and produce one AlignedWord per + // ground truth word. + let (aligned, trim_log) = transfer::transfer_timestamps( + whisper_words, + >_words, + &trimmed, + boundary_strategy, + ); + + // Group aligned words by paragraph (using the breaks we recorded). + let paragraphs = build_paragraphs(&aligned, ¶graph_breaks, audio_src); + + Ok(AlignmentResult { + paragraphs, + trim_log, + }) +} + +/// Internal: one ground truth word with original surface text and a +/// match key (normalised form for diffing). 
+#[derive(Debug, Clone)] +pub(crate) struct GroundTruthWord { + pub text: String, + pub key: String, +} + +fn tokenise_ground_truth(text: &str) -> (Vec, Vec) { + let mut words: Vec = Vec::new(); + let mut paragraph_breaks: Vec = Vec::new(); + + for (para_idx, para) in text.split("\n\n").enumerate() { + let para = para.trim(); + if para.is_empty() { + continue; + } + if para_idx > 0 && !words.is_empty() { + paragraph_breaks.push(words.len()); + } + for tok in para.split_whitespace() { + let key = normalize::normalise(tok); + if key.is_empty() { + // Pure punctuation token — attach to the previous word + // by appending raw text, leaving its key untouched. + if let Some(last) = words.last_mut() { + last.text.push_str(tok); + continue; + } + } + words.push(GroundTruthWord { + text: tok.to_owned(), + key, + }); + } + } + + (words, paragraph_breaks) +} + +fn build_paragraphs( + aligned: &[AlignedWord], + paragraph_breaks: &[usize], + audio_src: &str, +) -> Vec { + if aligned.is_empty() { + return Vec::new(); + } + let mut paragraphs = Vec::new(); + let mut start_idx = 0; + let mut breaks: Vec = paragraph_breaks.to_vec(); + breaks.push(aligned.len()); // sentinel + + for end_idx in breaks { + if end_idx <= start_idx { + continue; + } + let slice = &aligned[start_idx..end_idx]; + if slice.is_empty() { + start_idx = end_idx; + continue; + } + let text = slice + .iter() + .map(|w| w.text.as_str()) + .collect::>() + .join(" "); + // Time bounds: pick the first/last word with a real timestamp + // (Unsynced words may have zeros). 
+ let first_real = slice.iter().find(|w| w.confidence != Confidence::Unsynced); + let last_real = slice + .iter() + .rev() + .find(|w| w.confidence != Confidence::Unsynced); + let (start_seconds, end_seconds) = match (first_real, last_real) { + (Some(a), Some(b)) => (a.start_seconds, b.end_seconds), + _ => (0.0, 0.0), + }; + paragraphs.push(AlignedParagraph { + text, + words: slice.to_vec(), + audio_src: audio_src.to_owned(), + start_seconds, + end_seconds, + }); + start_idx = end_idx; + } + paragraphs +} + +#[cfg(test)] +mod tests { + use super::*; + + fn ww(text: &str, start: f64, end: f64) -> WordTiming { + WordTiming { + start_seconds: start, + end_seconds: end, + text: text.to_owned(), + } + } + + #[test] + fn rejects_empty_inputs() { + assert!(matches!( + align_section(&[], "hello", "a.mp3", BoundaryStrategy::default()), + Err(Error::NoWhisperWords), + )); + assert!(matches!( + align_section(&[ww("a", 0.0, 1.0)], "", "a.mp3", BoundaryStrategy::default()), + Err(Error::EmptyGroundTruth), + )); + } + + #[test] + fn perfect_match_passes_through() { + // Whisper says exactly what the ground truth says. 
+ let whisper = vec![ + ww("Hello", 0.0, 0.5), + ww("world.", 0.5, 1.5), + ]; + let gt = "Hello world."; + let res = align_section(&whisper, gt, "a.mp3", BoundaryStrategy::default()).unwrap(); + assert_eq!(res.paragraphs.len(), 1); + let para = &res.paragraphs[0]; + assert_eq!(para.words.len(), 2); + assert_eq!(para.words[0].text, "Hello"); + assert_eq!(para.words[1].text, "world."); + assert_eq!(para.words[0].confidence, Confidence::Exact); + assert_eq!(para.start_seconds, 0.0); + assert_eq!(para.end_seconds, 1.5); + assert!(res.trim_log.is_empty()); + } + + #[test] + fn paragraph_breaks_split_output() { + let whisper = vec![ + ww("Hello", 0.0, 0.5), + ww("world.", 0.5, 1.5), + ww("Foo", 2.0, 2.4), + ww("bar.", 2.4, 3.0), + ]; + let gt = "Hello world.\n\nFoo bar."; + let res = align_section(&whisper, gt, "a.mp3", BoundaryStrategy::default()).unwrap(); + assert_eq!(res.paragraphs.len(), 2); + assert_eq!(res.paragraphs[0].text, "Hello world."); + assert_eq!(res.paragraphs[1].text, "Foo bar."); + } +} diff --git a/crates/dpub-align/src/normalize.rs b/crates/dpub-align/src/normalize.rs new file mode 100644 index 0000000..ec435fc --- /dev/null +++ b/crates/dpub-align/src/normalize.rs @@ -0,0 +1,89 @@ +//! Word normalisation for matching. +//! +//! Produces a "match key" by lowercasing and stripping punctuation. +//! The original surface form (with punctuation, capitalisation) is +//! preserved separately by callers and flows through to the output — +//! normalisation is *only* used as the diff-equality key. + +/// Return the normalised match key for a word: Unicode-lowercased, +/// with leading/trailing punctuation stripped. Internal apostrophes +/// (`don't`, `c'est`) and hyphens (`well-known`) are preserved so +/// English/French/Dutch contractions and compounds stay intact. +pub fn normalise(word: &str) -> String { + // Strip surrounding punctuation/quotes/brackets/whitespace. 
+ let trimmed = word.trim_matches(|c: char| { + c.is_whitespace() || is_strippable_punct(c) + }); + if trimmed.is_empty() { + return String::new(); + } + trimmed.to_lowercase() +} + +fn is_strippable_punct(c: char) -> bool { + matches!( + c, + '.' | ',' | ';' | ':' | '!' | '?' | '…' + | '"' | '\u{201C}' | '\u{201D}' // " " " + | '\'' | '\u{2018}' | '\u{2019}' // ' ' ' + | '(' | ')' | '[' | ']' | '{' | '}' + | '«' | '»' | '‹' | '›' + | '—' | '–' + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn lowercases() { + assert_eq!(normalise("Hello"), "hello"); + assert_eq!(normalise("WORLD"), "world"); + } + + #[test] + fn strips_trailing_punctuation() { + assert_eq!(normalise("wereld."), "wereld"); + assert_eq!(normalise("wereld,"), "wereld"); + assert_eq!(normalise("wereld!"), "wereld"); + assert_eq!(normalise("wereld?"), "wereld"); + assert_eq!(normalise("wereld..."), "wereld"); + assert_eq!(normalise("wereld…"), "wereld"); + } + + #[test] + fn strips_brackets_and_quotes() { + assert_eq!(normalise("(hello"), "hello"); + assert_eq!(normalise("hello)"), "hello"); + assert_eq!(normalise("\"quoted\""), "quoted"); + assert_eq!(normalise("'word'"), "word"); + assert_eq!(normalise("\u{201C}smart\u{201D}"), "smart"); + } + + #[test] + fn preserves_internal_apostrophes() { + assert_eq!(normalise("don't"), "don't"); + assert_eq!(normalise("c'est"), "c'est"); + } + + #[test] + fn preserves_internal_hyphens() { + assert_eq!(normalise("well-known"), "well-known"); + assert_eq!(normalise("co-op."), "co-op"); + } + + #[test] + fn pure_punctuation_returns_empty() { + assert_eq!(normalise("."), ""); + assert_eq!(normalise("..."), ""); + assert_eq!(normalise("\""), ""); + assert_eq!(normalise(""), ""); + } + + #[test] + fn unicode_passes_through() { + assert_eq!(normalise("café"), "café"); + assert_eq!(normalise("Antwerpen,"), "antwerpen"); + } +} diff --git a/crates/dpub-align/src/section_split.rs b/crates/dpub-align/src/section_split.rs new file mode 100644 index 
0000000..b10f228 --- /dev/null +++ b/crates/dpub-align/src/section_split.rs @@ -0,0 +1,256 @@ +//! Split a single ground truth file into per-section text by matching +//! its headings against the DAISY NCC headings. +//! +//! Auto-detects markdown vs plain text: +//! - if any line starts with `#` followed by space, treat as markdown +//! and use those as candidate headings +//! - otherwise scan every short line and fuzzy-match against the +//! provided NCC heading texts +//! +//! Match score: Jaro-Winkler over normalised heading text. + +use crate::normalize; + +/// One section's slice of the ground truth text. +#[derive(Debug, Clone)] +pub struct SectionText { + /// Index into the NCC headings list (0-based). + pub ncc_index: usize, + /// Raw text between this heading and the next matched heading. + /// Paragraph boundaries (blank lines) are preserved. + pub text: String, +} + +/// Threshold for matching a candidate heading line to an NCC heading. +const HEADING_MATCH_THRESHOLD: f64 = 0.85; + +/// Maximum character length for a "candidate heading" line in plain +/// text mode. Real chapter titles are short; long lines are body text. +const PLAIN_HEADING_MAX_LEN: usize = 120; + +/// Split the ground truth `text` into per-section chunks by matching +/// headings against the supplied NCC heading texts (in order). +/// +/// `ncc_headings` is a slice of (heading_text, ncc_index) tuples. The +/// `ncc_index` lets the caller map results back to its own data +/// structures. +pub fn split_into_sections( + text: &str, + ncc_headings: &[(&str, usize)], +) -> Vec { + let is_markdown = text + .lines() + .any(|l| l.trim_start().starts_with('#') && l.trim_start().chars().nth(1) == Some(' ')); + + let candidate_lines = if is_markdown { + markdown_headings(text) + } else { + plain_text_headings(text) + }; + + // For each NCC heading (in order), find the *first* candidate line + // (after the previous match) that fuzzy-matches it. 
Locking matches + // in document order prevents a later-section heading from stealing + // an earlier section's match. + let mut matches: Vec<(usize, usize, usize)> = Vec::new(); // (ncc_idx, line_byte_offset_start, line_byte_offset_end) + let mut search_from: usize = 0; + for (heading_text, ncc_idx) in ncc_headings { + let target = normalize_heading(heading_text); + let mut best: Option<(f64, usize, usize)> = None; + for cand in &candidate_lines { + if cand.line_start < search_from { + continue; + } + let normalised = normalize_heading(&cand.heading_text); + if normalised.is_empty() { + continue; + } + let score = strsim::jaro_winkler(&target, &normalised); + if score >= HEADING_MATCH_THRESHOLD + && best.map_or(true, |(prev, _, _)| score > prev) + { + best = Some((score, cand.line_start, cand.line_end)); + } + // Don't break on first hit — we want the best score + // before search_from advances. + // But cap search distance so a typo doesn't cause a match + // 30 chapters later: stop once we've scanned enough lines. + if cand.line_start > search_from + 200_000 { + break; + } + } + if let Some((_, start, end)) = best { + matches.push((*ncc_idx, start, end)); + search_from = end; + } + } + + // Now slice the text between matched heading line ends. 
+ let bytes = text.as_bytes(); + let mut sections = Vec::with_capacity(matches.len()); + for i in 0..matches.len() { + let (ncc_idx, _heading_start, heading_end) = matches[i]; + let body_start = heading_end; + let body_end = matches.get(i + 1).map_or(bytes.len(), |&(_, next_start, _)| next_start); + if body_end <= body_start { + continue; + } + let slice = &text[body_start..body_end]; + sections.push(SectionText { + ncc_index: ncc_idx, + text: slice.trim().to_owned(), + }); + } + sections +} + +#[derive(Debug)] +struct CandidateHeading { + heading_text: String, + line_start: usize, + line_end: usize, +} + +fn markdown_headings(text: &str) -> Vec { + let mut out = Vec::new(); + let mut offset: usize = 0; + for line in text.split_inclusive('\n') { + let trimmed = line.trim_start(); + if trimmed.starts_with('#') { + if let Some(rest) = trimmed.strip_prefix('#') { + let heading_body = rest.trim_start_matches('#').trim(); + if !heading_body.is_empty() { + out.push(CandidateHeading { + heading_text: heading_body.to_owned(), + line_start: offset, + line_end: offset + line.len(), + }); + } + } + } + offset += line.len(); + } + out +} + +fn plain_text_headings(text: &str) -> Vec { + // Heuristic: any non-empty line ≤ PLAIN_HEADING_MAX_LEN is a + // candidate. We rely on the fuzzy-match threshold to filter out + // body-text lines that happen to be short. + let mut out = Vec::new(); + let mut offset: usize = 0; + for line in text.split_inclusive('\n') { + let trimmed = line.trim(); + if !trimmed.is_empty() && trimmed.len() <= PLAIN_HEADING_MAX_LEN { + out.push(CandidateHeading { + heading_text: trimmed.to_owned(), + line_start: offset, + line_end: offset + line.len(), + }); + } + offset += line.len(); + } + out +} + +/// Heading-specific normalisation: lowercase, strip leading numbering +/// ("Chapter 1", "1.", "I."), collapse whitespace. 
+fn normalize_heading(text: &str) -> String { + let trimmed = text.trim(); + let stripped = strip_leading_numbering(trimmed); + // Strip surrounding punctuation per word, then rejoin. + let cleaned: Vec = stripped + .split_whitespace() + .map(normalize::normalise) + .filter(|s| !s.is_empty()) + .collect(); + cleaned.join(" ") +} + +fn strip_leading_numbering(s: &str) -> &str { + // Strip patterns like "1. ", "1) ", "Chapter 1: ", "I. ". + let s = s.trim_start_matches(|c: char| c.is_ascii_digit() || c == '.' || c == ')'); + s.trim() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn detects_markdown_headings() { + let text = "\ +# Chapter 1 +First paragraph. + +## Section A +Second paragraph. + +# Chapter 2 +Third paragraph. +"; + let ncc = [("Chapter 1", 0), ("Chapter 2", 1)]; + let sections = split_into_sections(text, &ncc); + assert_eq!(sections.len(), 2); + assert_eq!(sections[0].ncc_index, 0); + assert!(sections[0].text.contains("First paragraph")); + assert!(sections[0].text.contains("Section A")); + assert!(sections[0].text.contains("Second paragraph")); + assert_eq!(sections[1].ncc_index, 1); + assert!(sections[1].text.contains("Third paragraph")); + } + + #[test] + fn detects_plain_text_headings() { + let text = "\ +Chapter 1 + +This is the first paragraph of chapter one. + +Chapter 2 + +This is the first paragraph of chapter two. +"; + let ncc = [("Chapter 1", 0), ("Chapter 2", 1)]; + let sections = split_into_sections(text, &ncc); + assert_eq!(sections.len(), 2); + assert!(sections[0].text.contains("first paragraph of chapter one")); + assert!(sections[1].text.contains("first paragraph of chapter two")); + } + + #[test] + fn fuzzy_matches_typo() { + // "Hofdstuk 1" (typo for "Hoofdstuk 1") should still match + // via Jaro-Winkler. + let text = "\ +# Hofdstuk 1 +Body text. + +# Hoofdstuk 2 +More body text. 
+"; + let ncc = [("Hoofdstuk 1", 0), ("Hoofdstuk 2", 1)]; + let sections = split_into_sections(text, &ncc); + assert_eq!(sections.len(), 2); + } + + #[test] + fn unmatched_heading_skipped() { + let text = "\ +# Chapter 1 +Body text. +"; + let ncc = [("Chapter 1", 0), ("Chapter 2 — Not in text", 1)]; + let sections = split_into_sections(text, &ncc); + assert_eq!(sections.len(), 1); + assert_eq!(sections[0].ncc_index, 0); + } + + #[test] + fn handles_empty_input() { + let text = ""; + let ncc = [("Chapter 1", 0)]; + let sections = split_into_sections(text, &ncc); + assert!(sections.is_empty()); + } +} diff --git a/crates/dpub-align/src/transfer.rs b/crates/dpub-align/src/transfer.rs new file mode 100644 index 0000000..ac0dc44 --- /dev/null +++ b/crates/dpub-align/src/transfer.rs @@ -0,0 +1,621 @@ +//! Walk the classified edit script and produce one [`AlignedWord`] +//! per ground truth word, with timestamps transferred from Whisper +//! (or interpolated / bracketed / unsynced as appropriate). +//! +//! The walk is index-based on the script. Two cursors track which +//! input slice each op refers to: +//! - `whisper_words[op.whisper_idx]` for the Whisper time data +//! - `gt_words[op.gt_idx]` for the ground truth surface text + +use crate::boundary::{ClassifiedOp, Region}; +use crate::diff::Op; +use crate::{ + AlignedWord, BoundaryStrategy, Confidence, GroundTruthWord, TrimEvent, TrimKind, WordTiming, +}; + +/// Produce the aligned word stream and trim diagnostics. +pub(crate) fn transfer_timestamps( + whisper: &[WordTiming], + ground_truth: &[GroundTruthWord], + script: &[ClassifiedOp], + strategy: BoundaryStrategy, +) -> (Vec, Vec) { + // Pass 1: assemble per-region operation buckets. 
+ let mut leading: Vec<&ClassifiedOp> = Vec::new(); + let mut core: Vec<&ClassifiedOp> = Vec::new(); + let mut trailing: Vec<&ClassifiedOp> = Vec::new(); + for c in script { + match c.region { + Region::Leading => leading.push(c), + Region::Core => core.push(c), + Region::Trailing => trailing.push(c), + } + } + + let mut aligned: Vec = Vec::with_capacity(ground_truth.len()); + let mut trim_log: Vec = Vec::new(); + + // Time-bracket helpers: the leading region spans audio time from 0 + // to the first core anchor's start; the trailing from last core + // anchor's end to whisper.last().end. + let core_start_seconds = first_core_match_time(&core, whisper).unwrap_or(0.0); + let core_end_seconds = last_core_match_time(&core, whisper) + .unwrap_or_else(|| whisper.last().map_or(0.0, |w| w.end_seconds)); + let total_audio_end = whisper.last().map_or(core_end_seconds, |w| w.end_seconds); + + handle_boundary_region( + &leading, + whisper, + ground_truth, + strategy, + BoundaryEnd::Leading, + 0.0, + core_start_seconds, + &mut aligned, + &mut trim_log, + ); + + transfer_core(&core, whisper, ground_truth, &mut aligned); + + handle_boundary_region( + &trailing, + whisper, + ground_truth, + strategy, + BoundaryEnd::Trailing, + core_end_seconds, + total_audio_end, + &mut aligned, + &mut trim_log, + ); + + enforce_monotonicity(&mut aligned); + (aligned, trim_log) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum BoundaryEnd { + Leading, + Trailing, +} + +#[allow(clippy::too_many_arguments)] +fn handle_boundary_region( + region: &[&ClassifiedOp], + whisper: &[WordTiming], + ground_truth: &[GroundTruthWord], + strategy: BoundaryStrategy, + end: BoundaryEnd, + gap_start: f64, + gap_end: f64, + aligned: &mut Vec, + trim_log: &mut Vec, +) { + if region.is_empty() { + return; + } + + // Whisper-only words in a boundary region are always discarded + // (audiobook preamble / outro). Their time is *not* redistributed. 
+ let whisper_only: Vec = region + .iter() + .filter_map(|c| match c.op { + Op::Delete { whisper_idx } => Some(whisper_idx), + _ => None, + }) + .collect(); + if !whisper_only.is_empty() { + let preview = preview_from_whisper(whisper, &whisper_only); + trim_log.push(TrimEvent { + kind: match end { + BoundaryEnd::Leading => TrimKind::LeadingWhisper, + BoundaryEnd::Trailing => TrimKind::TrailingWhisper, + }, + word_count: whisper_only.len(), + preview, + }); + } + + // Ground-truth-only words: collect, then apply strategy. + let gt_only: Vec = region + .iter() + .filter_map(|c| match c.op { + Op::Insert { gt_idx } => Some(gt_idx), + _ => None, + }) + .collect(); + + // Equal/Fuzzy ops *inside* a boundary region (rare — most matches + // get pulled into Core) are still real matches; transfer the + // timestamps directly so we don't lose them. + for c in region { + match c.op { + Op::Equal { whisper_idx, gt_idx } | Op::Fuzzy { whisper_idx, gt_idx, .. } => { + let w = &whisper[whisper_idx]; + let g = &ground_truth[gt_idx]; + aligned.push(AlignedWord { + text: g.text.clone(), + start_seconds: w.start_seconds, + end_seconds: w.end_seconds, + confidence: if matches!(c.op, Op::Equal { .. }) { + Confidence::Exact + } else { + Confidence::Fuzzy + }, + }); + } + _ => {} + } + } + + if gt_only.is_empty() { + return; + } + + let gt_preview = preview_from_ground_truth(ground_truth, >_only); + trim_log.push(TrimEvent { + kind: match end { + BoundaryEnd::Leading => TrimKind::LeadingGroundTruth, + BoundaryEnd::Trailing => TrimKind::TrailingGroundTruth, + }, + word_count: gt_only.len(), + preview: gt_preview, + }); + + match strategy { + BoundaryStrategy::Drop => { + // Words excluded entirely; nothing to push. 
+ } + BoundaryStrategy::NoSync => { + for &g_idx in >_only { + aligned.push(AlignedWord { + text: ground_truth[g_idx].text.clone(), + start_seconds: 0.0, + end_seconds: 0.0, + confidence: Confidence::Unsynced, + }); + } + } + BoundaryStrategy::Bracket => { + // Distribute the gap [gap_start, gap_end] proportionally + // by character count. If the gap is non-positive, fall + // back to Unsynced. + let total_chars: usize = gt_only + .iter() + .map(|&g| ground_truth[g].text.chars().count().max(1)) + .sum(); + let gap_duration = (gap_end - gap_start).max(0.0); + if gap_duration <= 0.0 || total_chars == 0 { + for &g_idx in >_only { + aligned.push(AlignedWord { + text: ground_truth[g_idx].text.clone(), + start_seconds: 0.0, + end_seconds: 0.0, + confidence: Confidence::Unsynced, + }); + } + return; + } + let mut cursor = gap_start; + #[allow(clippy::cast_precision_loss)] + let per_char = gap_duration / total_chars as f64; + for &g_idx in >_only { + let chars = ground_truth[g_idx].text.chars().count().max(1); + #[allow(clippy::cast_precision_loss)] + let dur = chars as f64 * per_char; + aligned.push(AlignedWord { + text: ground_truth[g_idx].text.clone(), + start_seconds: cursor, + end_seconds: cursor + dur, + confidence: Confidence::Bracketed, + }); + cursor += dur; + } + } + } +} + +/// Walk the core region: Equal/Fuzzy copy timestamps; Delete is +/// discarded (audio time is reclaimed by neighbours via interpolation +/// of any adjacent Inserts); Insert interpolates from neighbours. 
+fn transfer_core( + region: &[&ClassifiedOp], + whisper: &[WordTiming], + ground_truth: &[GroundTruthWord], + aligned: &mut Vec, +) { + let mut i = 0; + while i < region.len() { + match region[i].op { + Op::Equal { whisper_idx, gt_idx } => { + let w = &whisper[whisper_idx]; + aligned.push(AlignedWord { + text: ground_truth[gt_idx].text.clone(), + start_seconds: w.start_seconds, + end_seconds: w.end_seconds, + confidence: Confidence::Exact, + }); + i += 1; + } + Op::Fuzzy { whisper_idx, gt_idx, .. } => { + let w = &whisper[whisper_idx]; + aligned.push(AlignedWord { + text: ground_truth[gt_idx].text.clone(), + start_seconds: w.start_seconds, + end_seconds: w.end_seconds, + confidence: Confidence::Fuzzy, + }); + i += 1; + } + Op::Insert { .. } => { + // Collect a run of consecutive Inserts (and any + // preceding/following Deletes) to interpolate as a + // group. + let group_start = i; + while i < region.len() + && matches!(region[i].op, Op::Insert { .. } | Op::Delete { .. }) + { + i += 1; + } + let group_end = i; + interpolate_insert_run( + ®ion[group_start..group_end], + whisper, + ground_truth, + aligned, + ); + } + Op::Delete { .. } => { + // Skip lone Delete (Whisper hallucination). Time is + // implicitly reclaimed: the next Equal will start + // wherever Whisper had it, leaving a small audible + // pause that's consistent with what was actually said. + i += 1; + } + } + } +} + +/// Interpolate timestamps for a run of `Insert` operations (with +/// optional adjacent Deletes that contribute their time span). Uses +/// the previous aligned word's end and the next core match's start +/// as bounds, distributing time proportionally to character length. +fn interpolate_insert_run( + run: &[&ClassifiedOp], + whisper: &[WordTiming], + ground_truth: &[GroundTruthWord], + aligned: &mut Vec, +) { + // Determine the time bounds. 
+ let prev_end = aligned.last().map_or(0.0, |w| w.end_seconds); + + // The "next" anchor time is whatever Whisper word the deletes + // span, or — if none — the previous timestamp + 0 (which collapses + // to zero-duration words; better than fabricated time). + let delete_indices: Vec = run + .iter() + .filter_map(|c| match c.op { + Op::Delete { whisper_idx } => Some(whisper_idx), + _ => None, + }) + .collect(); + let bound_end = if let Some(last_d) = delete_indices.last() { + whisper[*last_d].end_seconds + } else { + prev_end // No delete pool — pure insert with no following anchor here. + }; + + let inserts: Vec = run + .iter() + .filter_map(|c| match c.op { + Op::Insert { gt_idx } => Some(gt_idx), + _ => None, + }) + .collect(); + + if inserts.is_empty() { + return; + } + + let total_chars: usize = inserts + .iter() + .map(|&g| ground_truth[g].text.chars().count().max(1)) + .sum(); + let span = (bound_end - prev_end).max(0.0); + + if span <= 0.0 || total_chars == 0 { + // Zero-duration: emit at prev_end. Reading systems will skip + // these instantly but the text remains. + for &g_idx in &inserts { + aligned.push(AlignedWord { + text: ground_truth[g_idx].text.clone(), + start_seconds: prev_end, + end_seconds: prev_end, + confidence: Confidence::Interpolated, + }); + } + return; + } + + let mut cursor = prev_end; + #[allow(clippy::cast_precision_loss)] + let per_char = span / total_chars as f64; + for &g_idx in &inserts { + let chars = ground_truth[g_idx].text.chars().count().max(1); + #[allow(clippy::cast_precision_loss)] + let dur = chars as f64 * per_char; + aligned.push(AlignedWord { + text: ground_truth[g_idx].text.clone(), + start_seconds: cursor, + end_seconds: cursor + dur, + confidence: Confidence::Interpolated, + }); + cursor += dur; + } +} + +fn first_core_match_time(core: &[&ClassifiedOp], whisper: &[WordTiming]) -> Option { + core.iter().find_map(|c| match c.op { + Op::Equal { whisper_idx, .. } | Op::Fuzzy { whisper_idx, .. 
} => { + Some(whisper[whisper_idx].start_seconds) + } + _ => None, + }) +} + +fn last_core_match_time(core: &[&ClassifiedOp], whisper: &[WordTiming]) -> Option { + core.iter().rev().find_map(|c| match c.op { + Op::Equal { whisper_idx, .. } | Op::Fuzzy { whisper_idx, .. } => { + Some(whisper[whisper_idx].end_seconds) + } + _ => None, + }) +} + +fn preview_from_whisper(whisper: &[WordTiming], indices: &[usize]) -> String { + let mut out = String::new(); + for &i in indices.iter().take(15) { + if !out.is_empty() { + out.push(' '); + } + out.push_str(&whisper[i].text); + if out.len() > 80 { + break; + } + } + if indices.len() > 15 { + out.push_str(" …"); + } + out +} + +fn preview_from_ground_truth(gt: &[GroundTruthWord], indices: &[usize]) -> String { + let mut out = String::new(); + for &i in indices.iter().take(15) { + if !out.is_empty() { + out.push(' '); + } + out.push_str(>[i].text); + if out.len() > 80 { + break; + } + } + if indices.len() > 15 { + out.push_str(" …"); + } + out +} + +/// Clamp any timestamp regressions so SMIL emits monotonic clip times. +/// Whisper itself can occasionally produce overlapping word ranges +/// (BPE artefacts) and our interpolation could in theory exceed the +/// next anchor's start when characters dominate. A simple sweep fixes +/// both. +fn enforce_monotonicity(aligned: &mut [AlignedWord]) { + for i in 1..aligned.len() { + // Skip Unsynced entries — they hold zeros by design. 
+ if aligned[i].confidence == Confidence::Unsynced + || aligned[i - 1].confidence == Confidence::Unsynced + { + continue; + } + let prev_end = aligned[i - 1].end_seconds; + if aligned[i].start_seconds < prev_end { + aligned[i].start_seconds = prev_end; + } + if aligned[i].end_seconds < aligned[i].start_seconds { + aligned[i].end_seconds = aligned[i].start_seconds; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::boundary::classify; + use crate::diff::diff_words; + use crate::normalize; + + fn ww(text: &str, start: f64, end: f64) -> WordTiming { + WordTiming { + start_seconds: start, + end_seconds: end, + text: text.to_owned(), + } + } + + fn gt(words: &[&str]) -> Vec { + words + .iter() + .map(|s| GroundTruthWord { + text: (*s).to_owned(), + key: normalize::normalise(s), + }) + .collect() + } + + fn run( + whisper: &[WordTiming], + ground_truth: &[GroundTruthWord], + strategy: BoundaryStrategy, + ) -> (Vec, Vec) { + let script = diff_words(whisper, ground_truth); + let classified = classify(&script); + transfer_timestamps(whisper, ground_truth, &classified, strategy) + } + + #[test] + fn perfect_match_passes_timestamps_through() { + let w = vec![ww("hello", 0.0, 0.5), ww("world", 0.5, 1.5)]; + let g = gt(&["hello", "world"]); + let (aligned, _) = run(&w, &g, BoundaryStrategy::default()); + assert_eq!(aligned.len(), 2); + assert_eq!(aligned[0].start_seconds, 0.0); + assert_eq!(aligned[0].end_seconds, 0.5); + assert_eq!(aligned[1].start_seconds, 0.5); + assert_eq!(aligned[1].end_seconds, 1.5); + } + + #[test] + fn whisper_preamble_is_trimmed_no_time_smear() { + // 8 fake preamble words + 5 real matches. 
/// Eight Whisper-only preamble words (one second each, 0 s – 8 s) precede
/// the five words that exist in the ground truth. The preamble must be
/// trimmed and logged, and the first real word must keep its own start.
#[test]
fn whisper_preamble_is_trimmed_no_time_smear() {
    let preamble = (0..8u32).map(|n| {
        let t = f64::from(n);
        ww(&format!("p{n}"), t, t + 1.0)
    });
    let body = (0..5u32).map(|n| {
        let t = 8.0 + f64::from(n);
        ww(&format!("real{n}"), t, t + 1.0)
    });
    let whisper: Vec<WordTiming> = preamble.chain(body).collect();
    let truth = gt(&["real0", "real1", "real2", "real3", "real4"]);

    let (aligned, trim_log) = run(&whisper, &truth, BoundaryStrategy::default());

    // No preamble time bleeds into the first real word.
    assert_eq!(aligned.len(), 5);
    assert_eq!(aligned[0].start_seconds, 8.0);

    // The trimmed preamble shows up in the log with its word count.
    let preamble_logged = trim_log
        .iter()
        .any(|event| event.kind == TrimKind::LeadingWhisper && event.word_count == 8);
    assert!(preamble_logged);
}
/// With `BoundaryStrategy::Drop`, trailing ground-truth-only words are left
/// out of the aligned output altogether.
#[test]
fn colophon_drop_excludes_words() {
    // Five narrated words, one second each.
    let whisper: Vec<WordTiming> = (0..5u32)
        .map(|n| {
            let t = f64::from(n);
            ww(&format!("real{n}"), t, t + 1.0)
        })
        .collect();
    // Same five words followed by two colophon tokens nobody read aloud.
    let truth = gt(&[
        "real0",
        "real1",
        "real2",
        "real3",
        "real4",
        "isbn",
        "9780000000000",
    ]);

    let (aligned, _trim_log) = run(&whisper, &truth, BoundaryStrategy::Drop);

    assert_eq!(aligned.len(), 5);
}
/// A near-miss transcription ("antwerpe") is promoted to a fuzzy match:
/// the output carries the ground-truth spelling with Whisper's timing.
#[test]
fn fuzzy_match_transfers_timestamp() {
    let whisper = [ww("antwerpe", 1.0, 2.0)];
    let truth = gt(&["antwerpen"]);

    let (aligned, _trim_log) = run(&whisper, &truth, BoundaryStrategy::default());

    assert_eq!(aligned.len(), 1);
    let word = &aligned[0];
    assert_eq!(word.text, "antwerpen");
    assert_eq!(word.confidence, Confidence::Fuzzy);
    assert_eq!(word.start_seconds, 1.0);
}
/// After alignment, every pair of neighbouring synced words must be in
/// non-decreasing time order (pairs involving an Unsynced word are exempt).
#[test]
fn timestamps_remain_monotonic() {
    let whisper = vec![ww("a", 0.0, 1.0), ww("b", 2.0, 3.0), ww("c", 4.0, 5.0)];
    let truth = gt(&["a", "missing", "b", "c"]);

    let (aligned, _trim_log) = run(&whisper, &truth, BoundaryStrategy::default());

    for (offset, pair) in aligned.windows(2).enumerate() {
        let (prev, next) = (&pair[0], &pair[1]);
        let exempt = prev.confidence == Confidence::Unsynced
            || next.confidence == Confidence::Unsynced;
        if exempt {
            continue;
        }
        // Report the index of the later word, as a plain index loop would.
        let i = offset + 1;
        assert!(
            next.start_seconds >= prev.end_seconds - 1e-9,
            "non-monotonic at {i}: {:?} → {:?}",
            prev,
            next,
        );
    }
}
+ #[arg(long, value_name = "PATH", requires = "transcribe")] + ground_truth: Option, + /// How to handle book content the narrator skipped (colophon, + /// index, etc.) when using `--ground-truth`. + #[arg(long, value_enum, default_value_t = GroundTruthStrategyOpt::NoSync, value_name = "STRATEGY")] + ground_truth_strategy: GroundTruthStrategyOpt, }, /// Validate an existing EPUB 3 publication with EPUBCheck. Validate { @@ -179,6 +188,38 @@ impl AudioOpt { } } +#[derive(Clone, Copy, ValueEnum, Default)] +enum GroundTruthStrategyOpt { + /// Drop ground-truth-only words (colophon etc.) entirely. + Drop, + /// Include the text but emit no Media Overlay entry — visible in + /// the XHTML, no karaoke highlight on those passages. Default. + #[default] + NoSync, + /// Span the available time gap proportionally — highlight bar + /// moves through the words at average speed. + Bracket, +} + +impl GroundTruthStrategyOpt { + fn into_strategy(self) -> dpub_convert::BoundaryStrategy { + match self { + Self::Drop => dpub_convert::BoundaryStrategy::Drop, + Self::NoSync => dpub_convert::BoundaryStrategy::NoSync, + Self::Bracket => dpub_convert::BoundaryStrategy::Bracket, + } + } + + fn parse_str(s: &str) -> Option { + match s { + "drop" => Some(Self::Drop), + "no-sync" => Some(Self::NoSync), + "bracket" => Some(Self::Bracket), + _ => None, + } + } +} + fn main() -> Result<()> { // Load config early so we can use log_level before tracing init. let cfg = config::load(); @@ -214,6 +255,8 @@ fn main() -> Result<()> { cover, no_auto_cover, rights, + ground_truth, + ground_truth_strategy, } => { let audio = audio.unwrap_or_else(|| parse_audio_opt(&cfg)); let bitrate = bitrate.unwrap_or_else(|| { @@ -236,10 +279,26 @@ fn main() -> Result<()> { Some(config::TranscribeSetting::Language(lang)) => Some(lang.clone()), _ => None, }); + // Merge ground truth: CLI > config > none. 
+ let ground_truth = ground_truth.or_else(|| cfg.ground_truth.clone()); + // Merge boundary strategy: CLI > config > NoSync default. + // CLI's default is NoSync; we treat it as "not explicitly + // set" only when the user passed nothing AND the config + // has a value. + let boundary_strategy = match ( + ground_truth_strategy, + cfg.ground_truth_strategy + .as_deref() + .and_then(GroundTruthStrategyOpt::parse_str), + ) { + // CLI was explicitly NoSync (default) and config has a setting → use config. + (GroundTruthStrategyOpt::NoSync, Some(cfg_val)) => cfg_val, + (cli_val, _) => cli_val, + }; cmd_convert( &ncc, &output, validate, a11y, audio, bitrate, transcribe, whisper_model, no_text_cleanup, no_word_sync, cover, - auto_cover, rights, + auto_cover, rights, ground_truth, boundary_strategy, ) } Command::Validate { epub, json } => cmd_validate(&epub, json), @@ -287,6 +346,8 @@ fn cmd_convert( cover: Option, auto_cover: bool, rights: Option, + ground_truth: Option, + ground_truth_strategy: GroundTruthStrategyOpt, ) -> Result<()> { let ncc = resolve_ncc_path(ncc)?; let book = Book::from_ncc(&ncc).with_context(|| format!("loading {}", ncc.display()))?; @@ -361,6 +422,24 @@ fn cmd_convert( println!(" Cover: best-effort lookup via Open Library"); } + if let Some(path) = &ground_truth { + if !path.is_file() { + anyhow::bail!( + "ground truth file not found at {}", + path.display() + ); + } + println!( + " Ground truth: {} (strategy: {})", + path.display(), + match ground_truth_strategy { + GroundTruthStrategyOpt::Drop => "drop", + GroundTruthStrategyOpt::NoSync => "no-sync", + GroundTruthStrategyOpt::Bracket => "bracket", + }, + ); + } + let opts = dpub_convert::ConvertOptions { audio: audio.into_format(bitrate_kbps), transcribe: transcribe_opts, @@ -369,6 +448,8 @@ fn cmd_convert( auto_cover, rights, no_word_sync, + ground_truth, + boundary_strategy: ground_truth_strategy.into_strategy(), }; let start = std::time::Instant::now(); dpub_convert::convert_to_file(&book, 
output, &opts) @@ -808,6 +889,8 @@ fn cmd_batch( auto_cover: true, rights: None, no_word_sync: false, + ground_truth: None, + boundary_strategy: dpub_convert::BoundaryStrategy::default(), }; let start = std::time::Instant::now(); let entries: Vec = books diff --git a/crates/dpub-convert/Cargo.toml b/crates/dpub-convert/Cargo.toml index 63eef37..04b25ae 100644 --- a/crates/dpub-convert/Cargo.toml +++ b/crates/dpub-convert/Cargo.toml @@ -18,11 +18,13 @@ metal = ["dpub-whisper/metal"] cuda = ["dpub-whisper/cuda"] [dependencies] +dpub-align = { path = "../dpub-align", version = "0.6.0" } dpub-audio = { path = "../dpub-audio", version = "0.6.0" } dpub-core = { path = "../dpub-core", version = "0.6.0" } dpub-meta = { path = "../dpub-meta", version = "0.6.0" } dpub-util = { path = "../dpub-util", version = "0.6.0" } dpub-whisper = { path = "../dpub-whisper", version = "0.6.0" } +tracing = { workspace = true } epub3-writer = { path = "../epub3-writer", version = "0.6.0" } thiserror = { workspace = true } uuid = { workspace = true } diff --git a/crates/dpub-convert/src/error.rs b/crates/dpub-convert/src/error.rs index 5045d50..3f7e5b7 100644 --- a/crates/dpub-convert/src/error.rs +++ b/crates/dpub-convert/src/error.rs @@ -23,6 +23,16 @@ pub enum Error { #[error("unsupported cover image at {path}: only JPEG and PNG are accepted")] UnsupportedCoverImage { path: PathBuf }, + + #[error("--ground-truth requires --transcribe (Whisper provides timestamps)")] + GroundTruthWithoutTranscribe, + + #[error("ground truth file at {path} could not be read: {source}")] + GroundTruthIo { + path: PathBuf, + #[source] + source: std::io::Error, + }, } pub type Result = std::result::Result; diff --git a/crates/dpub-convert/src/lib.rs b/crates/dpub-convert/src/lib.rs index a2751c4..f31350d 100644 --- a/crates/dpub-convert/src/lib.rs +++ b/crates/dpub-convert/src/lib.rs @@ -20,6 +20,7 @@ use rayon::prelude::*; mod error; mod text_cleanup; +pub use dpub_align::BoundaryStrategy; pub use 
error::{Error, Result}; /// Convert a parsed DAISY 2.02 [`Book`] into an EPUB 3 [`Publication`]. @@ -205,8 +206,17 @@ fn build_sections( .strip_suffix(".smil") .unwrap_or(§ion_ref.src) .to_owned(); + // XML Names (and OPF manifest IDs are XML Names) cannot + // start with a digit. DAISY filenames often do + // (`001_…`, `002_…`), so prefix those stems with `s-`. let id = if stem.is_empty() { format!("section-{:03}", idx + 1) + } else if stem + .chars() + .next() + .is_some_and(|c| c.is_ascii_digit()) + { + format!("s-{stem}") } else { stem.clone() }; @@ -571,6 +581,15 @@ pub struct ConvertOptions { /// highlight-along-with-audio. Set this to keep SMIL files /// small at the cost of a coarser reading experience. pub no_word_sync: bool, + /// Optional path to a ground truth text file (plain text or + /// markdown). When set together with `transcribe`, Whisper still + /// runs to produce timestamps but the EPUB ships with the real + /// book text aligned word-by-word against Whisper's word stream. + pub ground_truth: Option, + /// How to handle ground-truth-only words (book content the + /// narrator skipped — colophon, index, etc.). Default + /// `BoundaryStrategy::NoSync`. + pub boundary_strategy: dpub_align::BoundaryStrategy, } /// Convert and write a DAISY 2.02 publication to an EPUB 3 file in one call. @@ -615,7 +634,11 @@ pub fn convert_to_file(book: &Book, output: &Path, opts: &ConvertOptions) -> Res transcribe, opts.raw_transcript_segments, opts.no_word_sync, + opts.ground_truth.as_deref(), + opts.boundary_strategy, )?; + } else if opts.ground_truth.is_some() { + return Err(Error::GroundTruthWithoutTranscribe); } // Recompression has to happen *before* the ZIP write because the writer @@ -736,12 +759,15 @@ fn sniff_image_format(bytes: &[u8]) -> Option<(&'static str, &'static str)> { /// When `raw_segments` is `true`, the per-segment Whisper output is emitted /// directly (one `

` per ~10–30 s segment); the default `false` runs /// `text_cleanup::merge_into_paragraphs` to produce prose-shaped output. +#[allow(clippy::too_many_arguments)] fn inject_transcripts( book: &Book, publication: &mut Publication, opts: &TranscribeOptions, raw_segments: bool, no_word_sync: bool, + ground_truth_path: Option<&std::path::Path>, + boundary_strategy: dpub_align::BoundaryStrategy, ) -> Result<()> { let whisper_opts = dpub_whisper::TranscribeOptions { model_path: opts.model_path.clone(), @@ -752,6 +778,40 @@ fn inject_transcripts( // weights into Metal/CUDA buffers for every audio file (#10). let transcriber = dpub_whisper::Transcriber::new(&whisper_opts)?; + // Read and split the ground truth file once, mapping section + // index → owned section text. None when no ground truth is in use. + let ground_truth_by_section: Option> = + if let Some(path) = ground_truth_path { + let text = std::fs::read_to_string(path).map_err(|source| Error::GroundTruthIo { + path: path.to_path_buf(), + source, + })?; + // Use the master.smil section titles (one per section, + // 1:1 with publication.sections) so the alignment maps + // directly onto section indices without an extra lookup. + let headings: Vec<(&str, usize)> = book + .master + .references + .iter() + .enumerate() + .map(|(i, r)| (r.title.as_str(), i)) + .collect(); + let sections = dpub_align::split_into_sections(&text, &headings); + tracing::info!( + "ground truth: matched {}/{} sections", + sections.len(), + headings.len() + ); + Some( + sections + .into_iter() + .map(|s| (s.ncc_index, s.text)) + .collect(), + ) + } else { + None + }; + // Cache: file basename → segments. Reused across sections that share an // audio file. 
let mut cache: std::collections::HashMap> = @@ -796,11 +856,33 @@ fn inject_transcripts( let new_paragraphs = if raw_segments { render_raw_paragraphs(idx, §ion_segments) } else { - let cleaned = text_cleanup::merge_into_paragraphs( - §ion_segments, - §ion_audio_srcs, - &text_cleanup::CleanupOpts::default(), - ); + // Choose the cleanup path: ground truth alignment when + // available for this section, else heuristic merging of + // raw Whisper output. + let cleaned = match ground_truth_by_section + .as_ref() + .and_then(|m| m.get(&idx)) + { + Some(gt_text) => align_with_ground_truth( + idx, + §ion_segments, + §ion_audio_srcs, + gt_text, + boundary_strategy, + ) + .unwrap_or_else(|| { + text_cleanup::merge_into_paragraphs( + §ion_segments, + §ion_audio_srcs, + &text_cleanup::CleanupOpts::default(), + ) + }), + None => text_cleanup::merge_into_paragraphs( + §ion_segments, + §ion_audio_srcs, + &text_cleanup::CleanupOpts::default(), + ), + }; let html = render_cleaned_paragraphs(idx, &cleaned); // Word-level Media Overlay sync: rebuild this section's // overlay from the cleaned paragraphs, replacing the @@ -816,7 +898,15 @@ fn inject_transcripts( idx, &cleaned, ); - overlay.root = new_root; + // Only swap in the rebuilt tree if it actually has + // synced words. If every word in this section ended + // up filtered out (all-Unsynced ground truth, or all + // zero-duration interpolations), keep the existing + // heading-level overlay shell so we don't ship an + // empty SMIL body. + if has_par_descendant(&new_root) { + overlay.root = new_root; + } } html }; @@ -828,6 +918,101 @@ fn inject_transcripts( Ok(()) } +/// Run ground truth alignment for one section. Builds a flat +/// chronological Whisper word stream from `segments` (paired with +/// per-segment audio basenames), splits the result into +/// `text_cleanup::Paragraph` values that drop into the existing +/// pipeline. 
Returns `None` if alignment was not possible (no audio, +/// no words) so the caller falls back to heuristic cleanup. +fn align_with_ground_truth( + section_idx: usize, + segments: &[dpub_whisper::Segment], + audio_srcs: &[String], + ground_truth: &str, + boundary_strategy: dpub_align::BoundaryStrategy, +) -> Option> { + if segments.is_empty() || ground_truth.trim().is_empty() { + return None; + } + // Flatten all Whisper words from all segments into one stream. + let mut whisper_words: Vec = Vec::new(); + for seg in segments { + for w in &seg.words { + whisper_words.push(dpub_align::WordTiming { + start_seconds: w.start_seconds, + end_seconds: w.end_seconds, + text: w.text.clone(), + }); + } + } + if whisper_words.is_empty() { + return None; + } + // The aligner doesn't know about per-word audio sources — when a + // section spans multiple audio files we use the first file's name + // for all paragraphs. Splitting paragraphs at audio boundaries is + // possible but rare for this input shape (one section ≈ one + // audio file in DAISY 2.02), so we punt for v1. 
+ let primary_audio = audio_srcs.first().cloned().unwrap_or_default(); + + let result = dpub_align::align_section( + &whisper_words, + ground_truth, + &primary_audio, + boundary_strategy, + ) + .ok()?; + + for event in &result.trim_log { + let label = match event.kind { + dpub_align::TrimKind::LeadingWhisper => "leading whisper-only", + dpub_align::TrimKind::TrailingWhisper => "trailing whisper-only", + dpub_align::TrimKind::LeadingGroundTruth => "leading ground-truth-only", + dpub_align::TrimKind::TrailingGroundTruth => "trailing ground-truth-only", + }; + tracing::info!( + "align: section {section_idx} trimmed {} {label} words: \"{}\"", + event.word_count, + event.preview, + ); + } + + let paragraphs: Vec = result + .paragraphs + .into_iter() + .map(|ap| text_cleanup::Paragraph { + start_seconds: ap.start_seconds, + end_seconds: ap.end_seconds, + text: ap.text, + audio_src: ap.audio_src, + words: ap + .words + .into_iter() + // Unsynced words keep their text in the XHTML span + // (so the text is readable) but carry start==end==0 + // so build_word_overlay_seq omits them from SMIL — + // the colophon is visible without a fake audio sync. + .map(|w| dpub_whisper::Word { + start_seconds: w.start_seconds, + end_seconds: w.end_seconds, + text: w.text, + }) + .collect(), + }) + .collect(); + Some(paragraphs) +} + +/// Returns true if the seq contains at least one `` somewhere +/// in its subtree. Mirrors the same check in the SMIL writer; used +/// here to decide whether to replace the heading-level overlay. 
+fn has_par_descendant(seq: &OverlaySeq) -> bool { + seq.children.iter().any(|c| match c { + OverlayItem::Par(_) => true, + OverlayItem::Seq(inner) => has_par_descendant(inner), + }) +} + fn render_raw_paragraphs(section_idx: usize, segments: &[dpub_whisper::Segment]) -> String { let mut out = String::new(); for (para_idx, seg) in segments.iter().enumerate() { @@ -870,6 +1055,23 @@ fn build_word_overlay_seq( let mut word_children: Vec = Vec::with_capacity(para.words.len()); for (word_idx, word) in para.words.iter().enumerate() { + // Skip "unsynced" words: explicit sentinel + // (start==end==0) for ground-truth-only material under + // NoSync strategy, and also any zero-duration word that + // slipped through interpolation (clipBegin == clipEnd + // would fail EPUBCheck MED-009). The XHTML span is still + // emitted so the text remains readable. + // + // Use the same millisecond rounding the SMIL writer + // applies — two distinct f64 timestamps can round to the + // same `H:MM:SS.fff` string and trip MED-009. + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let start_ms = (word.start_seconds * 1000.0).round() as i64; + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let end_ms = (word.end_seconds * 1000.0).round() as i64; + if end_ms <= start_ms { + continue; + } let word_id = format!("w-{section_idx:03}-{para_idx:03}-{word_idx:03}"); word_children.push(OverlayItem::Par(OverlayPar { id: Some(word_id.clone()), @@ -879,6 +1081,12 @@ fn build_word_overlay_seq( clip_end_seconds: word.end_seconds, })); } + // Empty paragraph-level elements fail EPUBCheck RSC-005 + // ("element seq incomplete"). Skip the entire paragraph + // wrapper when no synced words remain. 
/// Two distinct f64 timestamps that round to the same millisecond must not
/// produce an overlay entry: the builder compares the rounded values, so
/// the first word below (36.5302 → 36.5304, both 36.530) is filtered and
/// only the second word remains.
#[test]
fn build_word_overlay_seq_drops_words_collapsing_to_same_millisecond() {
    let para = text_cleanup::Paragraph {
        start_seconds: 0.0,
        end_seconds: 1.0,
        // Kept in step with `words` below.
        text: "bad ok".into(),
        words: vec![
            dpub_whisper::Word {
                start_seconds: 36.5302,
                end_seconds: 36.5304, // both round to 36.530
                text: "bad".into(),
            },
            dpub_whisper::Word {
                start_seconds: 36.6,
                end_seconds: 36.8,
                text: "ok".into(),
            },
        ],
        audio_src: "a.mp3".into(),
    };
    let root = build_word_overlay_seq("content/x.xhtml", 0, &[para]);
    let OverlayItem::Seq(inner) = &root.children[0] else {
        panic!("expected paragraph seq");
    };
    // Exactly one word survives ...
    assert_eq!(inner.children.len(), 1);
    // ... and it is the second one, identified by its clip end.
    let OverlayItem::Par(survivor) = &inner.children[0] else {
        panic!("expected word par");
    };
    assert_eq!(survivor.clip_end_seconds, 36.8);
}
/// A paragraph whose every word is zero-duration would leave an empty
/// paragraph-level seq behind (EPUBCheck RSC-005); the builder must drop
/// the wrapper and keep only paragraphs that still have synced words.
#[test]
fn build_word_overlay_seq_drops_paragraph_with_only_unsynced_words() {
    // One-word paragraph whose word spans the whole paragraph.
    let one_word_paragraph = |text: &str, start: f64, end: f64| text_cleanup::Paragraph {
        start_seconds: start,
        end_seconds: end,
        text: text.into(),
        words: vec![dpub_whisper::Word {
            start_seconds: start,
            end_seconds: end,
            text: text.into(),
        }],
        audio_src: "a.mp3".into(),
    };
    let all_unsynced = one_word_paragraph("colophon", 0.0, 0.0);
    let real = one_word_paragraph("Real", 1.0, 2.0);

    let root = build_word_overlay_seq("content/x.xhtml", 0, &[all_unsynced, real]);

    // Only the paragraph with a synced word keeps its wrapper.
    assert_eq!(root.children.len(), 1);
}
+ assert_eq!(root.children.len(), 1); + } + #[test] fn render_cleaned_paragraphs_escapes_word_text() { let para = text_cleanup::Paragraph { diff --git a/crates/dpub-convert/tests/real_conversion.rs b/crates/dpub-convert/tests/real_conversion.rs index f007f7e..ee8c55c 100644 --- a/crates/dpub-convert/tests/real_conversion.rs +++ b/crates/dpub-convert/tests/real_conversion.rs @@ -123,6 +123,8 @@ fn opus_recompression_shrinks_real_book() { auto_cover: false, rights: None, no_word_sync: false, + ground_truth: None, + boundary_strategy: dpub_convert::BoundaryStrategy::default(), }, ) .expect("write opus"); diff --git a/crates/epub3-writer/src/writers.rs b/crates/epub3-writer/src/writers.rs index eaa76c9..56980c8 100644 --- a/crates/epub3-writer/src/writers.rs +++ b/crates/epub3-writer/src/writers.rs @@ -314,6 +314,11 @@ pub fn write_overlay_smil(overlay: &MediaOverlay) -> String { } fn write_overlay_seq(s: &mut String, seq: &OverlaySeq, indent: usize) { + // EPUBCheck rejects empty elements (RSC-005 "element seq + // incomplete"). Recursively empty branches are also dropped. + if !seq_has_par_descendant(seq) { + return; + } let pad = " ".repeat(indent); let textref = seq .textref @@ -330,6 +335,16 @@ fn write_overlay_seq(s: &mut String, seq: &OverlaySeq, indent: usize) { let _ = write!(s, "{pad}\n"); } +/// Recursively check whether a `` contains at least one `` +/// somewhere in its tree. Used to skip empty branches that would fail +/// EPUBCheck. +fn seq_has_par_descendant(seq: &OverlaySeq) -> bool { + seq.children.iter().any(|c| match c { + OverlayItem::Par(_) => true, + OverlayItem::Seq(inner) => seq_has_par_descendant(inner), + }) +} + fn write_overlay_par(s: &mut String, par: &OverlayPar, indent: usize) { let pad = " ".repeat(indent); let id_attr = par