From 936c8663b2d5ca5d1e887b2cc2aafa50e553a097 Mon Sep 17 00:00:00 2001
From: Roel Van Gils <roel@elevenways.be>
Date: Thu, 7 May 2026 21:54:40 +0200
Subject: [PATCH] Ground truth text alignment (--ground-truth) with JSON
 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace Whisper's approximate transcription with the real book text
while preserving word-level audio sync via Myers diff + Jaro-Winkler
fuzzy matching. Auto-detects markdown / plain text / structured-JSON /
bulk-JSON ground-truth inputs.

New crate `dpub-align` (~1000 lines, 44 unit + 1 integration test):

- normalize.rs   — Unicode-aware lowercase + punctuation stripping
- diff.rs        — Myers diff via `similar` 3.x with Jaro-Winkler ≥ 0.85
                   fuzzy promotion (catches "Antwerpe" → "Antwerpen")
- boundary.rs    — Anchor detection (≥5 consecutive matches) classifies
                   ops as Leading/Core/Trailing so audiobook preambles
                   and outros never smear into the first/last real word
- transfer.rs    — Timestamp transfer with three boundary strategies
                   (Drop / NoSync / Bracket), monotonicity enforcement,
                   character-proportional interpolation for inserts
- section_split.rs — Markdown vs plain-text auto-detect; fuzzy heading
                   matching against DAISY NCC titles
- json_format.rs — Structured (per-chapter) and bulk (whole-book blob)
                   JSON formats; pass-through with NBSP normalisation

CLI:
- --ground-truth <PATH> (requires --transcribe)
- --ground-truth-strategy <drop|no-sync|bracket> (default: no-sync)
- Config file fields `ground_truth` and `ground_truth_strategy`

Bug fixes surfaced by end-to-end testing on a digit-prefixed DAISY book:
- OPF manifest IDs now prefixed with `s-` when stems start with a digit
  (XML Names cannot start with digits — broke any DAISY book with
  `001_*.smil` filenames; the reference book happened to use letter
  prefixes so nobody noticed).
- Empty `<seq>` elements no longer leak into Media Overlay SMIL files
  (EPUBCheck RSC-005). Empty paragraph wrappers are skipped at the
  builder layer; the SMIL writer also defensively drops recursively
  empty seq subtrees; the heading-level overlay shell is preserved
  when alignment would have produced an entirely empty word tree.
- Words that round to the same millisecond as their neighbour no
  longer ship in SMIL (EPUBCheck MED-009). The builder mirrors the
  writer's millisecond rounding when filtering zero-duration words.

End-to-end verified on "De verwarde Cavia" (109 sections, 4h22m
audio): 103/109 sections matched, 209 boundary trim events recorded,
final EPUB passes EPUBCheck clean (0 fatals / 0 errors / 0 warnings).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                                 |   7 +
 Cargo.lock                                   |  34 +
 Cargo.toml                                   |   1 +
 crates/dpub-align/Cargo.toml                 |  22 +
 crates/dpub-align/examples/match_sections.rs |  66 ++
 crates/dpub-align/src/boundary.rs            | 212 +++++++
 crates/dpub-align/src/diff.rs                | 229 +++++++
 crates/dpub-align/src/error.rs               |  11 +
 crates/dpub-align/src/json_format.rs         | 353 +++++++++++
 crates/dpub-align/src/lib.rs                 | 340 ++++++++++
 crates/dpub-align/src/normalize.rs           |  89 +++
 crates/dpub-align/src/section_split.rs       | 256 ++++++++
 crates/dpub-align/src/transfer.rs            | 621 +++++++++++++++++++
 crates/dpub-cli/src/config.rs                |   8 +-
 crates/dpub-cli/src/main.rs                  |  85 ++-
 crates/dpub-convert/Cargo.toml               |   2 +
 crates/dpub-convert/src/error.rs             |  10 +
 crates/dpub-convert/src/lib.rs               | 323 +++++++++-
 crates/dpub-convert/tests/real_conversion.rs |   2 +
 crates/epub3-writer/src/writers.rs           |  15 +
 20 files changed, 2678 insertions(+), 8 deletions(-)
 create mode 100644 crates/dpub-align/Cargo.toml
 create mode 100644 crates/dpub-align/examples/match_sections.rs
 create mode 100644 crates/dpub-align/src/boundary.rs
 create mode 100644 crates/dpub-align/src/diff.rs
 create mode 100644 crates/dpub-align/src/error.rs
 create mode 100644 crates/dpub-align/src/json_format.rs
 create mode 100644 crates/dpub-align/src/lib.rs
 create mode 100644 crates/dpub-align/src/normalize.rs
 create mode 100644 crates/dpub-align/src/section_split.rs
 create mode 100644 crates/dpub-align/src/transfer.rs
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f5c069e..df33bee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ All notable changes to this project will be documented in this file. The format
 - **`--auto-cover` for Dutch (and other) books no longer silently misses.** Open Library tags docs with ISO 639-2/B (e.g. `"dut"` for Dutch), while DAISY 2.02 metadata uses ISO 639-1 (`"nl"`); the previous literal `eq_ignore_ascii_case` dropped every plausible match. `dpub-meta` now treats 639-1, 639-2/B and 639-2/T as equivalent (`nl`/`dut`/`nld`, `fr`/`fre`/`fra`, `de`/`ger`/`deu`, etc.). Real-world miss this surfaced: "Het smelt" by Lize Spit. Regression test added.
 - **ISBN search hits are now trusted unconditionally.** When DAISY's `dc:identifier` is ISBN-shaped, the search-by-ISBN already disambiguates the edition, so the language and author filters on the result are noise — and would (incorrectly) reject the cover when Open Library lists a translator under `author_name`. Title+author search remains filtered.
 - **Open Library HTTP timeout raised from 8 s to 30 s.** `covers.openlibrary.org` redirects through archive.org and can take ~20 s on first hit for less-popular editions; 8 s caused spurious "lookup failed" misses.
+- **OPF manifest IDs no longer fail XML Name validation when DAISY filenames start with digits.** DAISY books frequently use `001_*.smil`, `002_*.smil` filenames; the previous code copied those stems into manifest `id` and `idref` attributes, which XML Names reject (must start with a letter or underscore). Stems beginning with a digit are now prefixed with `s-`. EPUBCheck no longer flags `RSC-005` for these books. The reference book ("Ontmoetingen in het donker") was unaffected because its filenames begin with letters.
+- **Empty `<seq>` elements no longer leak into Media Overlay SMIL files** (EPUBCheck `RSC-005` "element seq incomplete"). Empty paragraph wrappers are dropped at the writer level, and the heading-level overlay shell is preserved when ground-truth alignment would have produced an entirely empty word tree.
+- **Words with `clipBegin == clipEnd` no longer ship in SMIL** (EPUBCheck `MED-009`). Zero-duration words from interpolation are filtered out alongside the explicit Unsynced sentinel; their XHTML span is still emitted so the text remains readable.
 - **Whisper model download no longer times out on slow connections.** The HTTP agent used a 60-second total-request timeout, which was insufficient for the 1.5 GB `ggml-medium.bin` download. Now uses per-read timeouts (60 s idle) so downloads can take as long as needed as long as data keeps flowing. Additionally, downloads now retry up to 3 times on transient failures (CDN stalls, connection resets).
 
 ### Changed
@@ -17,6 +20,10 @@ All notable changes to this project will be documented in this file. The format
 
 ### Added
 
+- **Ground truth text alignment** (`--ground-truth <PATH>`). Pass a plain text or markdown file containing the real book text and dpub will align it word-by-word against Whisper's transcription, replacing Whisper's approximate text with the real prose while keeping the word-level audio sync. Section headings are matched against the DAISY NCC headings via Jaro-Winkler fuzzy matching, so a single file with the whole book works as long as the chapters are in the right order. Markdown vs plain text is auto-detected. Requires `--transcribe` (Whisper still runs to produce timestamps).
+- **`--ground-truth-strategy <drop|no-sync|bracket>`** controls how book content the narrator skipped (colophon, index, acknowledgements) is handled. `no-sync` (default) includes the text in the EPUB without a Media Overlay entry — visible, no karaoke highlight on those passages. `drop` excludes it entirely. `bracket` spans the available time gap proportionally for continuous (if imperfect) sync.
+- **Audiobook-specific boundary trimming.** Audiobook copyright preambles and outros (Whisper-only material) are detected automatically and discarded — they never leak into the first or last real word's timestamp. The detector requires a run of at least 5 consecutive matching words before it commits to the alignment, so a single coincidental match (e.g. the book title appearing in the preamble) can't trigger early alignment.
+- **New crate `dpub-align`** containing the alignment algorithm: word normalisation, Myers diff (via `similar`), Jaro-Winkler fuzzy promotion (≥ 0.85 → Equal), boundary anchor detection, and timestamp transfer with monotonicity enforcement. 33 unit tests.
 - **`--transcribe` auto-detects language from book metadata.** Passing `--transcribe` without a language code now reads `dc:language` from the DAISY NCC metadata and normalises it to ISO 639-1 for Whisper. Explicit `--transcribe nl` still works. Config file supports `"transcribe": true` for auto-detect or `"transcribe": "nl"` for a fixed default.
 - **Shared ISO 639 normaliser** (`dpub-util/lang`). Maps ISO 639-1, 639-2/B and 639-2/T codes to their canonical two-letter form. Used by both `dpub-meta` (cover lookup language filter) and `dpub-cli` (transcription auto-detect).
 - **Persistent config file** (`~/.config/dpub/config.json` on Unix, `%APPDATA%\dpub\config.json` on Windows). Lets users set defaults for `audio`, `bitrate`, `auto_cover`, `no_word_sync`, `rights`, `whisper_model`, `transcribe`, `validate`, `a11y`, `jobs`, and `log_level`. CLI flags always override config values.
diff --git a/Cargo.lock b/Cargo.lock
index b85b78e..45c614c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -150,6 +150,16 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "bstr"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab"
+dependencies = [
+ "memchr",
+ "serde",
+]
+
 [[package]]
 name = "bumpalo"
 version = "3.20.2"
@@ -355,6 +365,19 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "dpub-align"
+version = "0.6.0"
+dependencies = [
+ "dpub-core",
+ "serde",
+ "serde_json",
+ "similar",
+ "strsim",
+ "thiserror 1.0.69",
+ "tracing",
+]
+
 [[package]]
 name = "dpub-audio"
 version = "0.6.0"
@@ -392,6 +415,7 @@ name = "dpub-convert"
 version = "0.6.0"
 dependencies = [
  "chrono",
+ "dpub-align",
  "dpub-audio",
  "dpub-core",
  "dpub-meta",
@@ -401,6 +425,7 @@ dependencies = [
  "rayon",
  "tempfile",
  "thiserror 1.0.69",
+ "tracing",
  "uuid",
  "zip",
 ]
@@ -1265,6 +1290,15 @@ version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214"
 
+[[package]]
+name = "similar"
+version = "3.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04d93e861ede2e497b47833469b8ec9d5c07fa4c78ce7a00f6eb7dd8168b4b3f"
+dependencies = [
+ "bstr",
+]
+
 [[package]]
 name = "slab"
 version = "0.4.12"
diff --git a/Cargo.toml b/Cargo.toml
index 536c4d1..9d2235d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,7 @@
 [workspace]
 resolver = "3"
 members = [
+    "crates/dpub-align",
     "crates/dpub-audio",
     "crates/dpub-core",
     "crates/dpub-cli",
diff --git a/crates/dpub-align/Cargo.toml b/crates/dpub-align/Cargo.toml
new file mode 100644
index 0000000..c0003c4
--- /dev/null
+++ b/crates/dpub-align/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "dpub-align"
+version.workspace = true
+edition.workspace = true
+rust-version.workspace = true
+license.workspace = true
+repository.workspace = true
+description = "Align Whisper word-level timestamps to ground truth book text via Myers diff + fuzzy matching."
+
+[lints]
+workspace = true
+
+[dependencies]
+serde = { workspace = true }
+serde_json = { workspace = true }
+similar = "3"
+strsim = "0.11"
+thiserror = { workspace = true }
+tracing = { workspace = true }
+
+[dev-dependencies]
+dpub-core = { path = "../dpub-core" }
diff --git a/crates/dpub-align/examples/match_sections.rs b/crates/dpub-align/examples/match_sections.rs
new file mode 100644
index 0000000..fc43c9b
--- /dev/null
+++ b/crates/dpub-align/examples/match_sections.rs
@@ -0,0 +1,66 @@
+//! Dry-run helper: parse a DAISY 2.02 publication and a ground-truth
+//! file and report how many sections the heading matcher resolves —
+//! without running Whisper. Useful when validating a new ground-truth
+//! file against a book.
+//!
+//! Usage:
+//! ```text
+//! cargo run --release -p dpub-align --example match_sections -- \
+//!   /path/to/book/ncc.html /path/to/groundtruth.{txt,md,json}
+//! ```
+
+use std::path::Path;
+
+fn main() {
+    let mut args = std::env::args().skip(1);
+    let ncc = args.next().expect("usage: match_sections <ncc.html> <ground-truth>");
+    let gt = args.next().expect("usage: match_sections <ncc.html> <ground-truth>");
+
+    let book = dpub_core::Book::from_ncc(Path::new(&ncc)).expect("parse DAISY");
+    let raw = std::fs::read_to_string(&gt).expect("read ground truth");
+
+    let headings: Vec<(&str, usize)> = book
+        .master
+        .references
+        .iter()
+        .enumerate()
+        .map(|(i, r)| (r.title.as_str(), i))
+        .collect();
+
+    let sections = dpub_align::split_into_sections(&raw, &headings);
+    println!(
+        "Matched {} of {} DAISY sections",
+        sections.len(),
+        headings.len()
+    );
+    println!();
+
+    let matched: std::collections::HashSet<usize> = sections.iter().map(|s| s.ncc_index).collect();
+    println!("First 10 matches:");
+    for s in sections.iter().take(10) {
+        let title = headings[s.ncc_index].0;
+        let preview: String = s.text.chars().take(50).collect::<String>().replace('\n', " ");
+        println!(
+            "  [{:3}] {:30}  → {:5} chars  {:?}",
+            s.ncc_index,
+            title,
+            s.text.len(),
+            preview
+        );
+    }
+    println!();
+
+    let unmatched: Vec<&str> = headings
+        .iter()
+        .enumerate()
+        .filter(|(i, _)| !matched.contains(i))
+        .map(|(_, (t, _))| *t)
+        .collect();
+    println!("Unmatched headings ({} total):", unmatched.len());
+    for t in unmatched.iter().take(20) {
+        println!("  {t}");
+    }
+    if unmatched.len() > 20 {
+        println!("  ... and {} more", unmatched.len() - 20);
+    }
+}
diff --git a/crates/dpub-align/src/boundary.rs b/crates/dpub-align/src/boundary.rs
new file mode 100644
index 0000000..0d5727c
--- /dev/null
+++ b/crates/dpub-align/src/boundary.rs
@@ -0,0 +1,212 @@
+//! Anchor detection and region classification for the edit script.
+//!
+//! The two streams won't always cover the same scope. Audiobook
+//! preambles ("This is a Luisterpunt production…") are Whisper-only;
+//! colophons and indices are ground-truth-only. These mismatches
+//! cluster at the boundaries of a section.
+//!
+//! This module finds the **anchor region** — the longest middle
+//! section bracketed by runs of ≥5 consecutive Equal/Fuzzy operations
+//! — and tags every op as Leading / Core / Trailing so the timestamp
+//! transfer phase can apply different policies per region.
+
+use crate::diff::Op;
+
+/// Minimum run of consecutive Equal/Fuzzy ops required to count as
+/// an anchor. Tuned to ignore single coincidental matches inside a
+/// preamble (e.g. the book title) while still catching the start of
+/// the actual content.
+pub(crate) const ANCHOR_MIN_RUN: usize = 5;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum Region {
+    /// Before the leading anchor (or whole script if no anchor).
+    Leading,
+    /// Inside the anchor region — normal timestamp transfer applies.
+    Core,
+    /// After the trailing anchor.
+    Trailing,
+}
+
+#[derive(Debug, Clone)]
+pub(crate) struct ClassifiedOp {
+    pub op: Op,
+    pub region: Region,
+}
+
+/// Classify each op by region.
+///
+/// If no anchor is found (the streams are wildly different), every op
+/// is flagged as `Core` so the caller falls back to plain transfer
+/// rather than dropping content.
+pub(crate) fn classify(script: &[Op]) -> Vec<ClassifiedOp> {
+    let leading = first_anchor_start(script);
+    let trailing = last_anchor_end(script);
+
+    let (lead_end, trail_start) = match (leading, trailing) {
+        (Some(l), Some(t)) if l < t => (l, t),
+        // No usable anchor pair: treat everything as Core. This
+        // matches the "best effort" contract — when the diff is
+        // chaotic, naive transfer is still better than dropping.
+        _ => return script.iter().map(|&op| ClassifiedOp {
+            op,
+            region: Region::Core,
+        }).collect(),
+    };
+
+    script
+        .iter()
+        .enumerate()
+        .map(|(i, &op)| {
+            let region = if i < lead_end {
+                Region::Leading
+            } else if i >= trail_start {
+                Region::Trailing
+            } else {
+                Region::Core
+            };
+            ClassifiedOp { op, region }
+        })
+        .collect()
+}
+
+/// Return the index *of the first op* in the leading anchor run
+/// (the first match of a ≥ANCHOR_MIN_RUN streak), or `None` if no
+/// such run exists.
+fn first_anchor_start(script: &[Op]) -> Option<usize> {
+    let mut run_len = 0usize;
+    let mut run_start = 0usize;
+    for (i, op) in script.iter().enumerate() {
+        if is_match(op) {
+            if run_len == 0 {
+                run_start = i;
+            }
+            run_len += 1;
+            if run_len >= ANCHOR_MIN_RUN {
+                return Some(run_start);
+            }
+        } else {
+            run_len = 0;
+        }
+    }
+    None
+}
+
+/// Return the index *one past the last op* in the trailing anchor
+/// run, or `None` if no such run exists.
+fn last_anchor_end(script: &[Op]) -> Option<usize> {
+    let mut run_len = 0usize;
+    let mut run_end = 0usize; // exclusive
+    for (i, op) in script.iter().enumerate().rev() {
+        if is_match(op) {
+            if run_len == 0 {
+                run_end = i + 1;
+            }
+            run_len += 1;
+            if run_len >= ANCHOR_MIN_RUN {
+                return Some(run_end);
+            }
+        } else {
+            run_len = 0;
+        }
+    }
+    None
+}
+
+fn is_match(op: &Op) -> bool {
+    matches!(op, Op::Equal { .. } | Op::Fuzzy { .. })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn eq(w: usize, g: usize) -> Op {
+        Op::Equal {
+            whisper_idx: w,
+            gt_idx: g,
+        }
+    }
+    fn del(w: usize) -> Op {
+        Op::Delete { whisper_idx: w }
+    }
+    fn ins(g: usize) -> Op {
+        Op::Insert { gt_idx: g }
+    }
+
+    #[test]
+    fn no_anchors_means_all_core() {
+        // Only 4 matches — below threshold of 5
+        let script = vec![eq(0, 0), eq(1, 1), eq(2, 2), eq(3, 3), del(4)];
+        let classified = classify(&script);
+        assert!(classified.iter().all(|c| c.region == Region::Core));
+    }
+
+    #[test]
+    fn preamble_is_leading() {
+        // 10 deletes (preamble), then 5 matches (anchor).
+        let mut script: Vec<Op> = (0..10).map(del).collect();
+        script.extend((0..5).map(|i| eq(10 + i, i)));
+        let classified = classify(&script);
+        assert_eq!(classified.len(), 15);
+        // The 10 deletes should be Leading; the 5 matches Core.
+        for c in &classified[0..10] {
+            assert_eq!(c.region, Region::Leading);
+        }
+        for c in &classified[10..15] {
+            assert_eq!(c.region, Region::Core);
+        }
+    }
+
+    #[test]
+    fn colophon_is_trailing() {
+        // 5 matches (anchor), then 10 inserts (colophon).
+        let mut script: Vec<Op> = (0..5).map(|i| eq(i, i)).collect();
+        script.extend((5..15).map(ins));
+        let classified = classify(&script);
+        for c in &classified[0..5] {
+            assert_eq!(c.region, Region::Core);
+        }
+        for c in &classified[5..15] {
+            assert_eq!(c.region, Region::Trailing);
+        }
+    }
+
+    #[test]
+    fn full_book_pattern() {
+        // preamble (8 deletes) → core (5 matches, 1 delete, 5 matches)
+        // → outro (8 deletes)
+        let mut script: Vec<Op> = (0..8).map(del).collect();
+        script.extend((0..5).map(|i| eq(8 + i, i)));
+        script.push(del(13));
+        script.extend((5..10).map(|i| eq(14 + i - 5, i)));
+        script.extend((19..27).map(del));
+
+        let classified = classify(&script);
+        // First 8 should be Leading
+        assert!(classified[0..8].iter().all(|c| c.region == Region::Leading));
+        // Last 8 should be Trailing
+        let n = classified.len();
+        assert!(classified[n - 8..n].iter().all(|c| c.region == Region::Trailing));
+        // Middle should be Core
+        assert!(classified[8..n - 8].iter().all(|c| c.region == Region::Core));
+    }
+
+    #[test]
+    fn single_coincidental_match_in_preamble_doesnt_anchor() {
+        // 3 deletes, 1 match (book title in preamble?), 4 deletes,
+        // then 5 real matches.
+        let mut script: Vec<Op> = vec![del(0), del(1), del(2)];
+        script.push(eq(3, 0)); // coincidental
+        script.extend((4..8).map(del));
+        script.extend((0..5).map(|i| eq(8 + i, i + 1)));
+        let classified = classify(&script);
+        // The coincidental match should be classified as Leading,
+        // because it's before the real 5-run anchor.
+        assert_eq!(classified[3].region, Region::Leading);
+        // Real anchor begins at index 8.
+        for c in &classified[8..13] {
+            assert_eq!(c.region, Region::Core);
+        }
+    }
+}
diff --git a/crates/dpub-align/src/diff.rs b/crates/dpub-align/src/diff.rs
new file mode 100644
index 0000000..53be63a
--- /dev/null
+++ b/crates/dpub-align/src/diff.rs
@@ -0,0 +1,229 @@
+//! Word-level Myers diff with Jaro-Winkler fuzzy promotion.
+//!
+//! Produces an [`EditScript`] — a sequence of [`Op`] entries that
+//! describe how to turn the Whisper word stream into the ground truth
+//! word stream (Equal/Fuzzy/Insert/Delete/Replace).
+
+use similar::{capture_diff_slices, Algorithm};
+
+use crate::{GroundTruthWord, WordTiming};
+
+/// One operation in the edit script. Indices refer to the input
+/// slices: `whisper_idx` into the Whisper word stream, `gt_idx` into
+/// the ground truth word stream.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub(crate) enum Op {
+    /// Whisper word and ground truth word match exactly (after
+    /// normalisation).
+    Equal { whisper_idx: usize, gt_idx: usize },
+    /// Whisper word and ground truth word are similar enough
+    /// (Jaro-Winkler ≥ threshold) — treat as a match.
+    Fuzzy {
+        whisper_idx: usize,
+        gt_idx: usize,
+        score: f64,
+    },
+    /// Whisper word with no ground truth counterpart (hallucinated /
+    /// repeated / preamble).
+    Delete { whisper_idx: usize },
+    /// Ground truth word with no Whisper counterpart (skipped /
+    /// colophon / outro).
+    Insert { gt_idx: usize },
+}
+
+/// Threshold above which a Replace operation is promoted to Fuzzy.
+/// Set lower than the typical 0.9 to catch trailing-letter truncations
+/// like "Antwerpe" → "Antwerpen" (≈ 0.97) and minor letter swaps.
+const FUZZY_THRESHOLD: f64 = 0.85;
+
+/// Run Myers diff over the normalised keys of both streams and emit
+/// the post-processed edit script (Equal/Fuzzy/Insert/Delete).
+pub(crate) fn diff_words(
+    whisper: &[WordTiming],
+    ground_truth: &[GroundTruthWord],
+) -> Vec<Op> {
+    use crate::normalize;
+
+    // Build key slices. We could compute keys lazily but caching
+    // avoids re-running normalise() inside Myers' inner loop.
+    let whisper_keys: Vec<String> = whisper.iter().map(|w| normalize::normalise(&w.text)).collect();
+    let gt_keys: Vec<&str> = ground_truth.iter().map(|w| w.key.as_str()).collect();
+    let whisper_key_refs: Vec<&str> = whisper_keys.iter().map(String::as_str).collect();
+
+    let diff_ops = capture_diff_slices(Algorithm::Myers, &whisper_key_refs, &gt_keys);
+
+    let mut script: Vec<Op> = Vec::with_capacity(whisper.len() + ground_truth.len());
+    for op in diff_ops {
+        match op {
+            similar::DiffOp::Equal {
+                old_index,
+                new_index,
+                len,
+            } => {
+                for i in 0..len {
+                    script.push(Op::Equal {
+                        whisper_idx: old_index + i,
+                        gt_idx: new_index + i,
+                    });
+                }
+            }
+            similar::DiffOp::Delete {
+                old_index, old_len, ..
+            } => {
+                for i in 0..old_len {
+                    script.push(Op::Delete {
+                        whisper_idx: old_index + i,
+                    });
+                }
+            }
+            similar::DiffOp::Insert {
+                new_index, new_len, ..
+            } => {
+                for i in 0..new_len {
+                    script.push(Op::Insert {
+                        gt_idx: new_index + i,
+                    });
+                }
+            }
+            similar::DiffOp::Replace {
+                old_index,
+                old_len,
+                new_index,
+                new_len,
+            } => {
+                // Pair up the Replace block one-to-one (longest common
+                // length); leftover Whisper words become Deletes,
+                // leftover ground truth words become Inserts. Within
+                // each pair, promote to Fuzzy if Jaro-Winkler is high
+                // enough — handles "Antwerpe"/"Antwerpen", trailing-s
+                // confusions, etc.
+                let pair_len = old_len.min(new_len);
+                for i in 0..pair_len {
+                    let w_i = old_index + i;
+                    let g_i = new_index + i;
+                    let score =
+                        strsim::jaro_winkler(whisper_key_refs[w_i], gt_keys[g_i]);
+                    if score >= FUZZY_THRESHOLD {
+                        script.push(Op::Fuzzy {
+                            whisper_idx: w_i,
+                            gt_idx: g_i,
+                            score,
+                        });
+                    } else {
+                        // Genuinely different word: emit as
+                        // Delete + Insert. The transfer phase uses
+                        // Whisper's time span for the inserted word
+                        // anyway when these are adjacent.
+                        script.push(Op::Delete { whisper_idx: w_i });
+                        script.push(Op::Insert { gt_idx: g_i });
+                    }
+                }
+                for i in pair_len..old_len {
+                    script.push(Op::Delete {
+                        whisper_idx: old_index + i,
+                    });
+                }
+                for i in pair_len..new_len {
+                    script.push(Op::Insert {
+                        gt_idx: new_index + i,
+                    });
+                }
+            }
+        }
+    }
+    script
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::normalize;
+
+    fn ww(text: &str, start: f64, end: f64) -> WordTiming {
+        WordTiming {
+            start_seconds: start,
+            end_seconds: end,
+            text: text.to_owned(),
+        }
+    }
+
+    fn gt(words: &[&str]) -> Vec<GroundTruthWord> {
+        words
+            .iter()
+            .map(|s| GroundTruthWord {
+                text: (*s).to_owned(),
+                key: normalize::normalise(s),
+            })
+            .collect()
+    }
+
+    #[test]
+    fn perfect_match_all_equal() {
+        let w = vec![
+            ww("hello", 0.0, 0.5),
+            ww("world", 0.5, 1.0),
+        ];
+        let g = gt(&["hello", "world"]);
+        let ops = diff_words(&w, &g);
+        assert!(matches!(ops[0], Op::Equal { .. }));
+        assert!(matches!(ops[1], Op::Equal { .. }));
+        assert_eq!(ops.len(), 2);
+    }
+
+    #[test]
+    fn whisper_hallucination_is_delete() {
+        let w = vec![
+            ww("hello", 0.0, 0.5),
+            ww("um", 0.5, 0.7),
+            ww("world", 0.7, 1.2),
+        ];
+        let g = gt(&["hello", "world"]);
+        let ops = diff_words(&w, &g);
+        // Should have an Equal, a Delete, an Equal.
+        let deletes: Vec<_> = ops
+            .iter()
+            .filter(|op| matches!(op, Op::Delete { .. }))
+            .collect();
+        assert_eq!(deletes.len(), 1);
+    }
+
+    #[test]
+    fn whisper_omission_is_insert() {
+        let w = vec![
+            ww("hello", 0.0, 0.5),
+            ww("world", 0.5, 1.0),
+        ];
+        let g = gt(&["hello", "the", "world"]);
+        let ops = diff_words(&w, &g);
+        let inserts: Vec<_> = ops
+            .iter()
+            .filter(|op| matches!(op, Op::Insert { .. }))
+            .collect();
+        assert_eq!(inserts.len(), 1);
+    }
+
+    #[test]
+    fn truncated_word_promoted_to_fuzzy() {
+        // "Antwerpe" vs "Antwerpen" — Jaro-Winkler ~0.97
+        let w = vec![ww("antwerpe", 0.0, 1.0)];
+        let g = gt(&["antwerpen"]);
+        let ops = diff_words(&w, &g);
+        assert_eq!(ops.len(), 1);
+        match ops[0] {
+            Op::Fuzzy { score, .. } => assert!(score >= 0.85),
+            other => panic!("expected Fuzzy, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn unrelated_words_stay_replace_split() {
+        // "table" vs "elephant" — Jaro-Winkler well below threshold
+        let w = vec![ww("table", 0.0, 1.0)];
+        let g = gt(&["elephant"]);
+        let ops = diff_words(&w, &g);
+        // Should be Delete + Insert, not Fuzzy
+        assert!(ops.iter().any(|op| matches!(op, Op::Delete { .. })));
+        assert!(ops.iter().any(|op| matches!(op, Op::Insert { .. })));
+        assert!(!ops.iter().any(|op| matches!(op, Op::Fuzzy { .. })));
+    }
+}
diff --git a/crates/dpub-align/src/error.rs b/crates/dpub-align/src/error.rs
new file mode 100644
index 0000000..469585a
--- /dev/null
+++ b/crates/dpub-align/src/error.rs
@@ -0,0 +1,11 @@
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum Error {
+    #[error("ground truth is empty for section")]
+    EmptyGroundTruth,
+    #[error("no whisper words for section")]
+    NoWhisperWords,
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
diff --git a/crates/dpub-align/src/json_format.rs b/crates/dpub-align/src/json_format.rs
new file mode 100644
index 0000000..0ef4d60
--- /dev/null
+++ b/crates/dpub-align/src/json_format.rs
@@ -0,0 +1,353 @@
+//! Adapter for the structured-JSON ground truth format.
+//!
+//! Schema (permissive — unknown fields are ignored):
+//!
+//! ```json
+//! {
+//!   "content": [
+//!     { "chapter-title": "...", "chapter-content": "..." },
+//!     ...
+//!   ]
+//! }
+//! ```
+//!
+//! Each entry becomes one section. The title is matched against the
+//! DAISY NCC heading via the existing fuzzy matcher; the content
+//! becomes the section body. Single newlines in the content are
+//! treated as paragraph breaks (DAISY-friendly default — most book
+//! exporters split paragraphs that way).
+
+use serde::Deserialize;
+
+/// Returns `true` when `raw` looks like our JSON format (first
+/// non-whitespace char is `{`). Used to dispatch between the JSON
+/// parser and the plain-text/markdown path.
+pub fn looks_like_json(raw: &str) -> bool {
+    raw.trim_start().starts_with('{')
+}
+
+/// Convert a JSON document conforming to the chapter-array schema
+/// into the markdown-style ground-truth text the rest of `dpub-align`
+/// already consumes. On any parse error returns the input unchanged
+/// so the caller can fall through to the plain-text path.
+pub fn convert_to_markdown(raw: &str) -> String {
+    match serde_json::from_str::<Document>(raw) {
+        Ok(doc) => render(&doc),
+        Err(e) => {
+            tracing::warn!("ground truth: JSON parse failed ({e}); falling back to plain-text path");
+            raw.to_owned()
+        }
+    }
+}
+
+#[derive(Debug, Deserialize)]
+struct Document {
+    #[serde(default)]
+    content: Vec<Chapter>,
+}
+
+#[derive(Debug, Deserialize)]
+struct Chapter {
+    #[serde(rename = "chapter-title", default)]
+    title: Option<String>,
+    #[serde(rename = "chapter-content", default)]
+    content: Option<String>,
+}
+
+fn render(doc: &Document) -> String {
+    // Bulk format: no chapter object carries a title — the whole book
+    // is in one (or more) `chapter-content` blobs with section titles
+    // encoded inline (typically as ALL-CAPS short lines). Concatenate
+    // the bodies and pass through as plain text so the existing
+    // line-by-line heading detector picks up the inline titles via
+    // fuzzy matching.
+    let any_titled = doc.content.iter().any(|c| {
+        c.title
+            .as_deref()
+            .is_some_and(|t| !t.trim().is_empty())
+    });
+    if !any_titled {
+        let mut body = String::new();
+        for ch in &doc.content {
+            if let Some(content) = &ch.content {
+                if !body.is_empty() {
+                    body.push_str("\n\n");
+                }
+                body.push_str(content.trim());
+            }
+        }
+        return normalize_body_preserving_lines(&body);
+    }
+
+    let mut out = String::with_capacity(doc.content.iter().map(|c| {
+        c.title.as_deref().map_or(0, str::len) + c.content.as_deref().map_or(0, str::len) + 8
+    }).sum());
+
+    for chapter in &doc.content {
+        let body_raw = chapter.content.as_deref().unwrap_or("").trim();
+        let title = chapter
+            .title
+            .as_deref()
+            .map(normalize_title)
+            .filter(|t| !t.is_empty());
+
+        // Skip entries with neither title nor content — nothing useful
+        // to align against.
+        if title.is_none() && body_raw.is_empty() {
+            continue;
+        }
+
+        // Emit a markdown H1 so the existing splitter picks it up.
+        // Untitled entries (typically the first cover/title-page item)
+        // get a synthetic placeholder so they still count as a section
+        // boundary; matchers will simply fail to find them in the NCC,
+        // which is the correct outcome.
+        let heading = title.unwrap_or_else(|| body_raw
+            .lines()
+            .next()
+            .unwrap_or("untitled")
+            .chars()
+            .take(80)
+            .collect::<String>());
+
+        out.push_str("# ");
+        out.push_str(heading.trim());
+        out.push_str("\n\n");
+
+        let body = normalize_body(body_raw);
+        out.push_str(&body);
+        out.push_str("\n\n");
+    }
+
+    out
+}
+
+/// Normalise a chapter title that may contain literal newlines or
+/// non-breaking spaces (`\u{a0}`). Reading systems display titles on
+/// one line; the NCC heading we match against is also a single line.
+fn normalize_title(s: &str) -> String {
+    let mut out = String::with_capacity(s.len());
+    let mut last_was_space = true;
+    for c in s.chars() {
+        let is_space = c == '\n' || c == '\r' || c == '\t' || c == '\u{a0}' || c == ' ';
+        if is_space {
+            if !last_was_space {
+                out.push(' ');
+            }
+            last_was_space = true;
+        } else {
+            out.push(c);
+            last_was_space = false;
+        }
+    }
+    out.trim().to_owned()
+}
+
+/// Convert the JSON's `chapter-content` into the paragraph-aware
+/// format the rest of the pipeline expects (paragraphs separated by
+/// blank lines). Heuristic:
+/// - Treat single `\n` as a paragraph break (the format uses single
+///   newlines between paragraphs).
+/// - Collapse runs of newlines into a single paragraph break.
+/// - Replace non-breaking spaces with regular spaces (better word
+///   matching: `nieuwe\u{a0}avonturen` matches Whisper's `nieuwe
+///   avonturen`).
+/// Normalise a bulk-format body without merging lines: inline
+/// chapter titles (ALL-CAPS short lines preceded by blank lines) must
+/// stay on their own lines so [`section_split`] can detect them.
+/// Only character-level normalisations are applied (NBSP → space).
+fn normalize_body_preserving_lines(s: &str) -> String {
+    s.chars()
+        .map(|c| if c == '\u{a0}' || c == '\u{2009}' || c == '\u{200a}' { ' ' } else { c })
+        .collect()
+}
+
+fn normalize_body(s: &str) -> String {
+    let mut paragraphs: Vec<String> = Vec::new();
+    for raw_para in s.split('\n') {
+        let trimmed = raw_para.trim();
+        if trimmed.is_empty() {
+            continue;
+        }
+        // Replace NBSPs and other Unicode spaces with a regular space.
+        let normalised: String = trimmed
+            .chars()
+            .map(|c| if c == '\u{a0}' || c == '\u{2009}' || c == '\u{200a}' { ' ' } else { c })
+            .collect();
+        paragraphs.push(normalised);
+    }
+    paragraphs.join("\n\n")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn detects_json_input() {
+        assert!(looks_like_json("{\"content\": []}"));
+        assert!(looks_like_json("  \n  { ... }"));
+        assert!(!looks_like_json("# Heading\nbody"));
+        assert!(!looks_like_json("plain text"));
+        assert!(!looks_like_json(""));
+    }
+
+    #[test]
+    fn converts_basic_document() {
+        let json = r#"{
+            "content": [
+                {"chapter-title": "Chapter 1", "chapter-content": "First paragraph.\nSecond paragraph."},
+                {"chapter-title": "Chapter 2", "chapter-content": "Body of two."}
+            ]
+        }"#;
+        let md = convert_to_markdown(json);
+        assert!(md.contains("# Chapter 1"));
+        assert!(md.contains("# Chapter 2"));
+        // Paragraphs separated by blank lines (i.e. \n\n).
+        assert!(md.contains("First paragraph.\n\nSecond paragraph."));
+    }
+
+    #[test]
+    fn collapses_multiline_title() {
+        let json = r#"{"content":[
+            {"chapter-title": "Hoera!\nNieuwe avonturen", "chapter-content": "Body."}
+        ]}"#;
+        let md = convert_to_markdown(json);
+        assert!(md.contains("# Hoera! Nieuwe avonturen"));
+    }
+
+    #[test]
+    fn replaces_nbsp_in_title_and_body() {
+        let json = "{\"content\":[{\"chapter-title\":\"de\u{a0}cavia\",\"chapter-content\":\"woord\u{a0}met nbsp.\"}]}";
+        let md = convert_to_markdown(json);
+        assert!(md.contains("# de cavia"));
+        assert!(md.contains("woord met nbsp."));
+    }
+
+    #[test]
+    fn skips_entry_with_no_useful_content() {
+        let json = r#"{"content":[
+            {"chapter-content": ""},
+            {"chapter-title": "Real chapter", "chapter-content": "Body."}
+        ]}"#;
+        let md = convert_to_markdown(json);
+        // Only one heading.
+        assert_eq!(md.matches("# ").count(), 1);
+        assert!(md.contains("# Real chapter"));
+    }
+
+    #[test]
+    fn ignores_extra_top_level_fields() {
+        let json = r#"{
+            "title": "Book Title",
+            "language": "nl",
+            "extraction_time_ms": 12345,
+            "total_chars_count": 999,
+            "content": [
+                {"chapter-title": "Only", "chapter-content": "Body."}
+            ]
+        }"#;
+        let md = convert_to_markdown(json);
+        assert!(md.contains("# Only"));
+        assert!(md.contains("Body."));
+    }
+
+    #[test]
+    fn ignores_extra_chapter_fields() {
+        let json = r#"{"content":[
+            {"chapter-title": "T", "chapter-content": "B", "chars_count": 1, "word_count": 1, "anything-else": null}
+        ]}"#;
+        let md = convert_to_markdown(json);
+        assert!(md.contains("# T"));
+        assert!(md.contains("B"));
+    }
+
+    #[test]
+    fn malformed_json_falls_through_unchanged() {
+        let raw = "{not json";
+        assert_eq!(convert_to_markdown(raw), raw);
+    }
+
+    /// Smoke test against a real fullbook.json (when present on disk).
+    /// Gated on the env var so CI without the file passes. The fixture
+    /// may be either format (structured or bulk); we just assert that
+    /// parsing produces a non-empty result.
+    #[test]
+    fn parses_real_fullbook_json() {
+        let Ok(path) = std::env::var("DPUB_TEST_GROUND_TRUTH_JSON") else {
+            return;
+        };
+        let raw = std::fs::read_to_string(&path).expect("read fixture");
+        assert!(looks_like_json(&raw));
+        let md = convert_to_markdown(&raw);
+        assert!(!md.is_empty());
+    }
+
+    /// Integration: parse the bulk-format fullbook.json and verify the
+    /// inline ALL-CAPS chapter titles can be matched against a sample
+    /// of expected DAISY headings via the public splitter API.
+    #[test]
+    fn bulk_format_finds_inline_chapter_titles() {
+        let Ok(path) = std::env::var("DPUB_TEST_GROUND_TRUTH_JSON") else {
+            return;
+        };
+        let raw = std::fs::read_to_string(&path).expect("read fixture");
+        // Skip if it's not the bulk format we want to exercise.
+        let doc: Document = serde_json::from_str(&raw).expect("valid json");
+        let any_titled = doc.content.iter().any(|c| {
+            c.title.as_deref().is_some_and(|t| !t.trim().is_empty())
+        });
+        if any_titled {
+            return; // structured format — different test.
+        }
+        let md = convert_to_markdown(&raw);
+        // Sample of titles known to live inline in this fixture
+        // (copied from the DAISY filenames of the matching book).
+        let ncc: &[(&str, usize)] = &[
+            ("Hh", 0),
+            ("Opletten", 1),
+            ("Nierstenen", 2),
+            ("Ping", 3),
+            ("Opvangbakje", 4),
+        ];
+        let sections = crate::split_into_sections(&md, ncc);
+        // The fuzzy matcher should find at least 3 of the 5 — this is
+        // a loose threshold so the test isn't brittle if NCC formatting
+        // changes the matcher's preferences.
+        assert!(
+            sections.len() >= 3,
+            "expected ≥3 of 5 sample headings to match, got {}",
+            sections.len()
+        );
+    }
+
+    #[test]
+    fn bulk_format_passes_body_through_as_plain_text() {
+        // Single-chapter document with the whole book in one blob and
+        // ALL-CAPS chapter titles inline. We must NOT prepend a `# `
+        // wrapper — the inline titles need to remain plain lines so
+        // section_split's plain-text path can fuzzy-match them.
+        let json = r#"{"content":[{"chapter-content":
+"Cover blurb.\n\nHÈHÈ\nFirst chapter body.\n\nOPLETTEN\nSecond chapter body."}]}"#;
+        let md = convert_to_markdown(json);
+        // No markdown headings emitted.
+        assert!(!md.contains("# "));
+        // Inline titles preserved as their own lines.
+        assert!(md.contains("\nHÈHÈ\n"));
+        assert!(md.contains("\nOPLETTEN\n"));
+    }
+
+    #[test]
+    fn untitled_first_entry_gets_placeholder() {
+        // Real-world case: the first entry has no `chapter-title`.
+        let json = r#"{"content":[
+            {"chapter-content": "De verwarde cavia"},
+            {"chapter-title": "Hoofdstuk 1", "chapter-content": "Body."}
+        ]}"#;
+        let md = convert_to_markdown(json);
+        // Two headings: one synthetic from body, one explicit.
+        assert_eq!(md.matches("# ").count(), 2);
+        assert!(md.contains("# De verwarde cavia"));
+        assert!(md.contains("# Hoofdstuk 1"));
+    }
+}
diff --git a/crates/dpub-align/src/lib.rs b/crates/dpub-align/src/lib.rs
new file mode 100644
index 0000000..50f07f9
--- /dev/null
+++ b/crates/dpub-align/src/lib.rs
@@ -0,0 +1,340 @@
+//! Align Whisper's approximate word-level transcription against the
+//! real book text (ground truth), transferring timestamps so the EPUB
+//! ships with accurate prose AND word-level Media Overlay sync.
+//!
+//! Pipeline:
+//!
+//! 1. **Section split** — match ground truth headings to NCC headings
+//!    so each section's text is identified.
+//! 2. **Word diff** — Myers diff with Jaro-Winkler fuzzy promotion
+//!    over normalised word keys.
+//! 3. **Boundary trim** — discard audiobook preamble / outro and
+//!    handle book-only material (colophon etc.) per
+//!    [`BoundaryStrategy`].
+//! 4. **Timestamp transfer** — copy/redistribute/interpolate Whisper
+//!    timings onto the ground truth word stream.
+//! 5. **Paragraph reconstruction** — group aligned words by ground
+//!    truth paragraph breaks (blank lines) and emit
+//!    [`AlignedParagraph`].
+
+mod boundary;
+mod diff;
+mod error;
+mod json_format;
+mod normalize;
+mod section_split;
+mod transfer;
+
+pub use error::{Error, Result};
+pub use section_split::SectionText;
+
+/// Split a ground truth file into per-section chunks matching the
+/// supplied NCC headings.
+///
+/// Auto-detects format:
+/// - **JSON** (first non-whitespace char is `{`): parsed as the
+///   structured chapter array (see [`json_format`]) and converted to
+///   markdown internally.
+/// - **Markdown / plain text**: passed through to the heading-line
+///   splitter as-is.
+pub fn split_into_sections(text: &str, ncc_headings: &[(&str, usize)]) -> Vec<SectionText> {
+    if json_format::looks_like_json(text) {
+        let markdown = json_format::convert_to_markdown(text);
+        section_split::split_into_sections(&markdown, ncc_headings)
+    } else {
+        section_split::split_into_sections(text, ncc_headings)
+    }
+}
+
+/// Lightweight word-with-timestamp type. Mirrors `dpub_whisper::Word`
+/// but keeps `dpub-align` free of whisper.cpp build dependencies.
+#[derive(Debug, Clone, PartialEq)]
+pub struct WordTiming {
+    pub start_seconds: f64,
+    pub end_seconds: f64,
+    pub text: String,
+}
+
+/// One ground truth word with the timestamp transferred from Whisper
+/// (or interpolated when Whisper had no match).
+#[derive(Debug, Clone, PartialEq)]
+pub struct AlignedWord {
+    pub text: String,
+    pub start_seconds: f64,
+    pub end_seconds: f64,
+    pub confidence: Confidence,
+}
+
+/// Provenance of an aligned word's timestamp.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Confidence {
+    /// Direct match — Whisper's normalised key equals the ground truth's.
+    Exact,
+    /// Near-match (Jaro-Winkler ≥ 0.85).
+    Fuzzy,
+    /// Inserted; timestamp interpolated proportionally from neighbours.
+    Interpolated,
+    /// Outside the anchor region under `bracket` strategy: timestamp
+    /// spans a slice of the leading/trailing gap.
+    Bracketed,
+    /// Outside the anchor region under `no-sync` strategy: word has
+    /// text but no usable timestamp (caller should omit from SMIL).
+    Unsynced,
+}
+
+/// One paragraph's worth of aligned words, ready to feed into the
+/// existing XHTML/SMIL pipeline.
+#[derive(Debug, Clone)]
+pub struct AlignedParagraph {
+    pub text: String,
+    pub words: Vec<AlignedWord>,
+    pub audio_src: String,
+    pub start_seconds: f64,
+    pub end_seconds: f64,
+}
+
+/// How to handle ground-truth-only words (book content the narrator
+/// skipped — colophon, index, acknowledgements).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum BoundaryStrategy {
+    /// Drop the words from the EPUB entirely.
+    Drop,
+    /// Include the text but emit no Media Overlay entry — visible in
+    /// the XHTML, no karaoke highlight on those passages. Default
+    /// because for accessibility readable text matters more than
+    /// perfect highlight tracking.
+    #[default]
+    NoSync,
+    /// Span the available time gap proportionally — highlight bar
+    /// moves through the words at average speed. Produces continuous
+    /// sync at the cost of timestamp accuracy.
+    Bracket,
+}
+
+/// Diagnostic for one trimmed/dropped/bracketed region.
+#[derive(Debug, Clone)]
+pub struct TrimEvent {
+    pub kind: TrimKind,
+    pub word_count: usize,
+    /// First ~80 chars of the trimmed text, for log lines.
+    pub preview: String,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TrimKind {
+    /// Whisper-only words before the leading anchor (audiobook preamble).
+    LeadingWhisper,
+    /// Whisper-only words after the trailing anchor (audiobook outro).
+    TrailingWhisper,
+    /// Ground-truth-only words before the leading anchor.
+    LeadingGroundTruth,
+    /// Ground-truth-only words after the trailing anchor.
+    TrailingGroundTruth,
+}
+
+/// Result of aligning one section: paragraphs to embed in the EPUB,
+/// plus a log of every trim/drop event for the user to inspect.
+#[derive(Debug, Clone)]
+pub struct AlignmentResult {
+    pub paragraphs: Vec<AlignedParagraph>,
+    pub trim_log: Vec<TrimEvent>,
+}
+
+/// Align ground truth text against a Whisper word stream for one section.
+///
+/// `whisper_words` is the flat list of Whisper words for this section's
+/// audio (in chronological order). `ground_truth` is the section's text
+/// with paragraph boundaries marked by blank lines. `audio_src` is the
+/// audio filename (basename) that all produced paragraphs will inherit.
+pub fn align_section(
+    whisper_words: &[WordTiming],
+    ground_truth: &str,
+    audio_src: &str,
+    boundary_strategy: BoundaryStrategy,
+) -> Result<AlignmentResult> {
+    if whisper_words.is_empty() {
+        return Err(Error::NoWhisperWords);
+    }
+    if ground_truth.trim().is_empty() {
+        return Err(Error::EmptyGroundTruth);
+    }
+
+    // Tokenise ground truth into paragraphs of words. Each word
+    // remembers the paragraph index it belongs to, so we can rebuild
+    // paragraph structure after timestamp transfer.
+    let (gt_words, paragraph_breaks) = tokenise_ground_truth(ground_truth);
+
+    // Run word-level diff on normalised keys.
+    let edit_script = diff::diff_words(whisper_words, &gt_words);
+
+    // Detect the alignment anchor region and classify ops as
+    // leading/core/trailing.
+    let trimmed = boundary::classify(&edit_script);
+
+    // Walk the classified edit script and produce one AlignedWord per
+    // ground truth word.
+    let (aligned, trim_log) = transfer::transfer_timestamps(
+        whisper_words,
+        &gt_words,
+        &trimmed,
+        boundary_strategy,
+    );
+
+    // Group aligned words by paragraph (using the breaks we recorded).
+    let paragraphs = build_paragraphs(&aligned, &paragraph_breaks, audio_src);
+
+    Ok(AlignmentResult {
+        paragraphs,
+        trim_log,
+    })
+}
+
+/// Internal: one ground truth word with original surface text and a
+/// match key (normalised form for diffing).
+#[derive(Debug, Clone)]
+pub(crate) struct GroundTruthWord {
+    pub text: String,
+    pub key: String,
+}
+
+fn tokenise_ground_truth(text: &str) -> (Vec<GroundTruthWord>, Vec<usize>) {
+    let mut words: Vec<GroundTruthWord> = Vec::new();
+    let mut paragraph_breaks: Vec<usize> = Vec::new();
+
+    for (para_idx, para) in text.split("\n\n").enumerate() {
+        let para = para.trim();
+        if para.is_empty() {
+            continue;
+        }
+        if para_idx > 0 && !words.is_empty() {
+            paragraph_breaks.push(words.len());
+        }
+        for tok in para.split_whitespace() {
+            let key = normalize::normalise(tok);
+            if key.is_empty() {
+                // Pure punctuation token — attach to the previous word
+                // by appending raw text, leaving its key untouched.
+                if let Some(last) = words.last_mut() {
+                    last.text.push_str(tok);
+                    continue;
+                }
+            }
+            words.push(GroundTruthWord {
+                text: tok.to_owned(),
+                key,
+            });
+        }
+    }
+
+    (words, paragraph_breaks)
+}
+
+fn build_paragraphs(
+    aligned: &[AlignedWord],
+    paragraph_breaks: &[usize],
+    audio_src: &str,
+) -> Vec<AlignedParagraph> {
+    if aligned.is_empty() {
+        return Vec::new();
+    }
+    let mut paragraphs = Vec::new();
+    let mut start_idx = 0;
+    let mut breaks: Vec<usize> = paragraph_breaks.to_vec();
+    breaks.push(aligned.len()); // sentinel
+
+    for end_idx in breaks {
+        if end_idx <= start_idx {
+            continue;
+        }
+        let slice = &aligned[start_idx..end_idx];
+        if slice.is_empty() {
+            start_idx = end_idx;
+            continue;
+        }
+        let text = slice
+            .iter()
+            .map(|w| w.text.as_str())
+            .collect::<Vec<_>>()
+            .join(" ");
+        // Time bounds: pick the first/last word with a real timestamp
+        // (Unsynced words may have zeros).
+        let first_real = slice.iter().find(|w| w.confidence != Confidence::Unsynced);
+        let last_real = slice
+            .iter()
+            .rev()
+            .find(|w| w.confidence != Confidence::Unsynced);
+        let (start_seconds, end_seconds) = match (first_real, last_real) {
+            (Some(a), Some(b)) => (a.start_seconds, b.end_seconds),
+            _ => (0.0, 0.0),
+        };
+        paragraphs.push(AlignedParagraph {
+            text,
+            words: slice.to_vec(),
+            audio_src: audio_src.to_owned(),
+            start_seconds,
+            end_seconds,
+        });
+        start_idx = end_idx;
+    }
+    paragraphs
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn ww(text: &str, start: f64, end: f64) -> WordTiming {
+        WordTiming {
+            start_seconds: start,
+            end_seconds: end,
+            text: text.to_owned(),
+        }
+    }
+
+    #[test]
+    fn rejects_empty_inputs() {
+        assert!(matches!(
+            align_section(&[], "hello", "a.mp3", BoundaryStrategy::default()),
+            Err(Error::NoWhisperWords),
+        ));
+        assert!(matches!(
+            align_section(&[ww("a", 0.0, 1.0)], "", "a.mp3", BoundaryStrategy::default()),
+            Err(Error::EmptyGroundTruth),
+        ));
+    }
+
+    #[test]
+    fn perfect_match_passes_through() {
+        // Whisper says exactly what the ground truth says.
+        let whisper = vec![
+            ww("Hello", 0.0, 0.5),
+            ww("world.", 0.5, 1.5),
+        ];
+        let gt = "Hello world.";
+        let res = align_section(&whisper, gt, "a.mp3", BoundaryStrategy::default()).unwrap();
+        assert_eq!(res.paragraphs.len(), 1);
+        let para = &res.paragraphs[0];
+        assert_eq!(para.words.len(), 2);
+        assert_eq!(para.words[0].text, "Hello");
+        assert_eq!(para.words[1].text, "world.");
+        assert_eq!(para.words[0].confidence, Confidence::Exact);
+        assert_eq!(para.start_seconds, 0.0);
+        assert_eq!(para.end_seconds, 1.5);
+        assert!(res.trim_log.is_empty());
+    }
+
+    #[test]
+    fn paragraph_breaks_split_output() {
+        let whisper = vec![
+            ww("Hello", 0.0, 0.5),
+            ww("world.", 0.5, 1.5),
+            ww("Foo", 2.0, 2.4),
+            ww("bar.", 2.4, 3.0),
+        ];
+        let gt = "Hello world.\n\nFoo bar.";
+        let res = align_section(&whisper, gt, "a.mp3", BoundaryStrategy::default()).unwrap();
+        assert_eq!(res.paragraphs.len(), 2);
+        assert_eq!(res.paragraphs[0].text, "Hello world.");
+        assert_eq!(res.paragraphs[1].text, "Foo bar.");
+    }
+}
diff --git a/crates/dpub-align/src/normalize.rs b/crates/dpub-align/src/normalize.rs
new file mode 100644
index 0000000..ec435fc
--- /dev/null
+++ b/crates/dpub-align/src/normalize.rs
@@ -0,0 +1,89 @@
+//! Word normalisation for matching.
+//!
+//! Produces a "match key" by lowercasing and stripping punctuation.
+//! The original surface form (with punctuation, capitalisation) is
+//! preserved separately by callers and flows through to the output —
+//! normalisation is *only* used as the diff-equality key.
+
+/// Return the normalised match key for a word: Unicode-lowercased,
+/// with leading/trailing punctuation stripped. Internal apostrophes
+/// (`don't`, `c'est`) and hyphens (`well-known`) are preserved so
+/// English/French/Dutch contractions and compounds stay intact.
+pub fn normalise(word: &str) -> String {
+    // Strip surrounding punctuation/quotes/brackets/whitespace.
+    let trimmed = word.trim_matches(|c: char| {
+        c.is_whitespace() || is_strippable_punct(c)
+    });
+    if trimmed.is_empty() {
+        return String::new();
+    }
+    trimmed.to_lowercase()
+}
+
+fn is_strippable_punct(c: char) -> bool {
+    matches!(
+        c,
+        '.' | ',' | ';' | ':' | '!' | '?' | '…'
+            | '"' | '\u{201C}' | '\u{201D}'      // " " "
+            | '\'' | '\u{2018}' | '\u{2019}'    // ' ' '
+            | '(' | ')' | '[' | ']' | '{' | '}'
+            | '«' | '»' | '‹' | '›'
+            | '—' | '–'
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn lowercases() {
+        assert_eq!(normalise("Hello"), "hello");
+        assert_eq!(normalise("WORLD"), "world");
+    }
+
+    #[test]
+    fn strips_trailing_punctuation() {
+        assert_eq!(normalise("wereld."), "wereld");
+        assert_eq!(normalise("wereld,"), "wereld");
+        assert_eq!(normalise("wereld!"), "wereld");
+        assert_eq!(normalise("wereld?"), "wereld");
+        assert_eq!(normalise("wereld..."), "wereld");
+        assert_eq!(normalise("wereld…"), "wereld");
+    }
+
+    #[test]
+    fn strips_brackets_and_quotes() {
+        assert_eq!(normalise("(hello"), "hello");
+        assert_eq!(normalise("hello)"), "hello");
+        assert_eq!(normalise("\"quoted\""), "quoted");
+        assert_eq!(normalise("'word'"), "word");
+        assert_eq!(normalise("\u{201C}smart\u{201D}"), "smart");
+    }
+
+    #[test]
+    fn preserves_internal_apostrophes() {
+        assert_eq!(normalise("don't"), "don't");
+        assert_eq!(normalise("c'est"), "c'est");
+    }
+
+    #[test]
+    fn preserves_internal_hyphens() {
+        assert_eq!(normalise("well-known"), "well-known");
+        assert_eq!(normalise("co-op."), "co-op");
+    }
+
+    #[test]
+    fn pure_punctuation_returns_empty() {
+        assert_eq!(normalise("."), "");
+        assert_eq!(normalise("..."), "");
+        assert_eq!(normalise("\""), "");
+        assert_eq!(normalise(""), "");
+    }
+
+    #[test]
+    fn unicode_passes_through() {
+        assert_eq!(normalise("café"), "café");
+        assert_eq!(normalise("Antwerpen,"), "antwerpen");
+    }
+}
diff --git a/crates/dpub-align/src/section_split.rs b/crates/dpub-align/src/section_split.rs
new file mode 100644
index 0000000..b10f228
--- /dev/null
+++ b/crates/dpub-align/src/section_split.rs
@@ -0,0 +1,256 @@
+//! Split a single ground truth file into per-section text by matching
+//! its headings against the DAISY NCC headings.
+//!
+//! Auto-detects markdown vs plain text:
+//! - if any line starts with `#` followed by space, treat as markdown
+//!   and use those as candidate headings
+//! - otherwise scan every short line and fuzzy-match against the
+//!   provided NCC heading texts
+//!
+//! Match score: Jaro-Winkler over normalised heading text.
+
+use crate::normalize;
+
+/// One section's slice of the ground truth text.
+#[derive(Debug, Clone)]
+pub struct SectionText {
+    /// Index into the NCC headings list (0-based).
+    pub ncc_index: usize,
+    /// Raw text between this heading and the next matched heading.
+    /// Paragraph boundaries (blank lines) are preserved.
+    pub text: String,
+}
+
+/// Threshold for matching a candidate heading line to an NCC heading.
+const HEADING_MATCH_THRESHOLD: f64 = 0.85;
+
+/// Maximum character length for a "candidate heading" line in plain
+/// text mode. Real chapter titles are short; long lines are body text.
+const PLAIN_HEADING_MAX_LEN: usize = 120;
+
+/// Split the ground truth `text` into per-section chunks by matching
+/// headings against the supplied NCC heading texts (in order).
+///
+/// `ncc_headings` is a slice of (heading_text, ncc_index) tuples. The
+/// `ncc_index` lets the caller map results back to its own data
+/// structures.
+pub fn split_into_sections(
+    text: &str,
+    ncc_headings: &[(&str, usize)],
+) -> Vec<SectionText> {
+    let is_markdown = text
+        .lines()
+        .any(|l| l.trim_start().starts_with('#') && l.trim_start().chars().nth(1) == Some(' '));
+
+    let candidate_lines = if is_markdown {
+        markdown_headings(text)
+    } else {
+        plain_text_headings(text)
+    };
+
+    // For each NCC heading (in order), find the *first* candidate line
+    // (after the previous match) that fuzzy-matches it. Locking matches
+    // in document order prevents a later-section heading from stealing
+    // an earlier section's match.
+    let mut matches: Vec<(usize, usize, usize)> = Vec::new(); // (ncc_idx, line_byte_offset_start, line_byte_offset_end)
+    let mut search_from: usize = 0;
+    for (heading_text, ncc_idx) in ncc_headings {
+        let target = normalize_heading(heading_text);
+        let mut best: Option<(f64, usize, usize)> = None;
+        for cand in &candidate_lines {
+            if cand.line_start < search_from {
+                continue;
+            }
+            let normalised = normalize_heading(&cand.heading_text);
+            if normalised.is_empty() {
+                continue;
+            }
+            let score = strsim::jaro_winkler(&target, &normalised);
+            if score >= HEADING_MATCH_THRESHOLD
+                && best.map_or(true, |(prev, _, _)| score > prev)
+            {
+                best = Some((score, cand.line_start, cand.line_end));
+            }
+            // Don't break on first hit — we want the best score
+            // before search_from advances.
+            // But cap search distance so a typo doesn't cause a match
+            // 30 chapters later: stop once we've scanned enough lines.
+            if cand.line_start > search_from + 200_000 {
+                break;
+            }
+        }
+        if let Some((_, start, end)) = best {
+            matches.push((*ncc_idx, start, end));
+            search_from = end;
+        }
+    }
+
+    // Now slice the text between matched heading line ends.
+    let bytes = text.as_bytes();
+    let mut sections = Vec::with_capacity(matches.len());
+    for i in 0..matches.len() {
+        let (ncc_idx, _heading_start, heading_end) = matches[i];
+        let body_start = heading_end;
+        let body_end = matches.get(i + 1).map_or(bytes.len(), |&(_, next_start, _)| next_start);
+        if body_end <= body_start {
+            continue;
+        }
+        let slice = &text[body_start..body_end];
+        sections.push(SectionText {
+            ncc_index: ncc_idx,
+            text: slice.trim().to_owned(),
+        });
+    }
+    sections
+}
+
+#[derive(Debug)]
+struct CandidateHeading {
+    heading_text: String,
+    line_start: usize,
+    line_end: usize,
+}
+
+fn markdown_headings(text: &str) -> Vec<CandidateHeading> {
+    let mut out = Vec::new();
+    let mut offset: usize = 0;
+    for line in text.split_inclusive('\n') {
+        let trimmed = line.trim_start();
+        if trimmed.starts_with('#') {
+            if let Some(rest) = trimmed.strip_prefix('#') {
+                let heading_body = rest.trim_start_matches('#').trim();
+                if !heading_body.is_empty() {
+                    out.push(CandidateHeading {
+                        heading_text: heading_body.to_owned(),
+                        line_start: offset,
+                        line_end: offset + line.len(),
+                    });
+                }
+            }
+        }
+        offset += line.len();
+    }
+    out
+}
+
+fn plain_text_headings(text: &str) -> Vec<CandidateHeading> {
+    // Heuristic: any non-empty line ≤ PLAIN_HEADING_MAX_LEN is a
+    // candidate. We rely on the fuzzy-match threshold to filter out
+    // body-text lines that happen to be short.
+    let mut out = Vec::new();
+    let mut offset: usize = 0;
+    for line in text.split_inclusive('\n') {
+        let trimmed = line.trim();
+        if !trimmed.is_empty() && trimmed.len() <= PLAIN_HEADING_MAX_LEN {
+            out.push(CandidateHeading {
+                heading_text: trimmed.to_owned(),
+                line_start: offset,
+                line_end: offset + line.len(),
+            });
+        }
+        offset += line.len();
+    }
+    out
+}
+
+/// Heading-specific normalisation: lowercase, strip leading numbering
+/// ("Chapter 1", "1.", "I."), collapse whitespace.
+fn normalize_heading(text: &str) -> String {
+    let trimmed = text.trim();
+    let stripped = strip_leading_numbering(trimmed);
+    // Strip surrounding punctuation per word, then rejoin.
+    let cleaned: Vec<String> = stripped
+        .split_whitespace()
+        .map(normalize::normalise)
+        .filter(|s| !s.is_empty())
+        .collect();
+    cleaned.join(" ")
+}
+
+fn strip_leading_numbering(s: &str) -> &str {
+    // Strip patterns like "1. ", "1) ", "Chapter 1: ", "I. ".
+    let s = s.trim_start_matches(|c: char| c.is_ascii_digit() || c == '.' || c == ')');
+    s.trim()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn detects_markdown_headings() {
+        let text = "\
+# Chapter 1
+First paragraph.
+
+## Section A
+Second paragraph.
+
+# Chapter 2
+Third paragraph.
+";
+        let ncc = [("Chapter 1", 0), ("Chapter 2", 1)];
+        let sections = split_into_sections(text, &ncc);
+        assert_eq!(sections.len(), 2);
+        assert_eq!(sections[0].ncc_index, 0);
+        assert!(sections[0].text.contains("First paragraph"));
+        assert!(sections[0].text.contains("Section A"));
+        assert!(sections[0].text.contains("Second paragraph"));
+        assert_eq!(sections[1].ncc_index, 1);
+        assert!(sections[1].text.contains("Third paragraph"));
+    }
+
+    #[test]
+    fn detects_plain_text_headings() {
+        let text = "\
+Chapter 1
+
+This is the first paragraph of chapter one.
+
+Chapter 2
+
+This is the first paragraph of chapter two.
+";
+        let ncc = [("Chapter 1", 0), ("Chapter 2", 1)];
+        let sections = split_into_sections(text, &ncc);
+        assert_eq!(sections.len(), 2);
+        assert!(sections[0].text.contains("first paragraph of chapter one"));
+        assert!(sections[1].text.contains("first paragraph of chapter two"));
+    }
+
+    #[test]
+    fn fuzzy_matches_typo() {
+        // "Hofdstuk 1" (typo for "Hoofdstuk 1") should still match
+        // via Jaro-Winkler.
+        let text = "\
+# Hofdstuk 1
+Body text.
+
+# Hoofdstuk 2
+More body text.
+";
+        let ncc = [("Hoofdstuk 1", 0), ("Hoofdstuk 2", 1)];
+        let sections = split_into_sections(text, &ncc);
+        assert_eq!(sections.len(), 2);
+    }
+
+    #[test]
+    fn unmatched_heading_skipped() {
+        let text = "\
+# Chapter 1
+Body text.
+";
+        let ncc = [("Chapter 1", 0), ("Chapter 2 — Not in text", 1)];
+        let sections = split_into_sections(text, &ncc);
+        assert_eq!(sections.len(), 1);
+        assert_eq!(sections[0].ncc_index, 0);
+    }
+
+    #[test]
+    fn handles_empty_input() {
+        let text = "";
+        let ncc = [("Chapter 1", 0)];
+        let sections = split_into_sections(text, &ncc);
+        assert!(sections.is_empty());
+    }
+}
diff --git a/crates/dpub-align/src/transfer.rs b/crates/dpub-align/src/transfer.rs
new file mode 100644
index 0000000..ac0dc44
--- /dev/null
+++ b/crates/dpub-align/src/transfer.rs
@@ -0,0 +1,621 @@
+//! Walk the classified edit script and produce one [`AlignedWord`]
+//! per ground truth word, with timestamps transferred from Whisper
+//! (or interpolated / bracketed / unsynced as appropriate).
+//!
+//! The walk is index-based on the script. Two cursors track which
+//! input slice each op refers to:
+//! - `whisper_words[op.whisper_idx]` for the Whisper time data
+//! - `gt_words[op.gt_idx]` for the ground truth surface text
+
+use crate::boundary::{ClassifiedOp, Region};
+use crate::diff::Op;
+use crate::{
+    AlignedWord, BoundaryStrategy, Confidence, GroundTruthWord, TrimEvent, TrimKind, WordTiming,
+};
+
+/// Produce the aligned word stream and trim diagnostics.
+pub(crate) fn transfer_timestamps(
+    whisper: &[WordTiming],
+    ground_truth: &[GroundTruthWord],
+    script: &[ClassifiedOp],
+    strategy: BoundaryStrategy,
+) -> (Vec<AlignedWord>, Vec<TrimEvent>) {
+    // Pass 1: assemble per-region operation buckets.
+    let mut leading: Vec<&ClassifiedOp> = Vec::new();
+    let mut core: Vec<&ClassifiedOp> = Vec::new();
+    let mut trailing: Vec<&ClassifiedOp> = Vec::new();
+    for c in script {
+        match c.region {
+            Region::Leading => leading.push(c),
+            Region::Core => core.push(c),
+            Region::Trailing => trailing.push(c),
+        }
+    }
+
+    let mut aligned: Vec<AlignedWord> = Vec::with_capacity(ground_truth.len());
+    let mut trim_log: Vec<TrimEvent> = Vec::new();
+
+    // Time-bracket helpers: the leading region spans audio time from 0
+    // to the first core anchor's start; the trailing from last core
+    // anchor's end to whisper.last().end.
+    let core_start_seconds = first_core_match_time(&core, whisper).unwrap_or(0.0);
+    let core_end_seconds = last_core_match_time(&core, whisper)
+        .unwrap_or_else(|| whisper.last().map_or(0.0, |w| w.end_seconds));
+    let total_audio_end = whisper.last().map_or(core_end_seconds, |w| w.end_seconds);
+
+    handle_boundary_region(
+        &leading,
+        whisper,
+        ground_truth,
+        strategy,
+        BoundaryEnd::Leading,
+        0.0,
+        core_start_seconds,
+        &mut aligned,
+        &mut trim_log,
+    );
+
+    transfer_core(&core, whisper, ground_truth, &mut aligned);
+
+    handle_boundary_region(
+        &trailing,
+        whisper,
+        ground_truth,
+        strategy,
+        BoundaryEnd::Trailing,
+        core_end_seconds,
+        total_audio_end,
+        &mut aligned,
+        &mut trim_log,
+    );
+
+    enforce_monotonicity(&mut aligned);
+    (aligned, trim_log)
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum BoundaryEnd {
+    Leading,
+    Trailing,
+}
+
+#[allow(clippy::too_many_arguments)]
+fn handle_boundary_region(
+    region: &[&ClassifiedOp],
+    whisper: &[WordTiming],
+    ground_truth: &[GroundTruthWord],
+    strategy: BoundaryStrategy,
+    end: BoundaryEnd,
+    gap_start: f64,
+    gap_end: f64,
+    aligned: &mut Vec<AlignedWord>,
+    trim_log: &mut Vec<TrimEvent>,
+) {
+    if region.is_empty() {
+        return;
+    }
+
+    // Whisper-only words in a boundary region are always discarded
+    // (audiobook preamble / outro). Their time is *not* redistributed.
+    let whisper_only: Vec<usize> = region
+        .iter()
+        .filter_map(|c| match c.op {
+            Op::Delete { whisper_idx } => Some(whisper_idx),
+            _ => None,
+        })
+        .collect();
+    if !whisper_only.is_empty() {
+        let preview = preview_from_whisper(whisper, &whisper_only);
+        trim_log.push(TrimEvent {
+            kind: match end {
+                BoundaryEnd::Leading => TrimKind::LeadingWhisper,
+                BoundaryEnd::Trailing => TrimKind::TrailingWhisper,
+            },
+            word_count: whisper_only.len(),
+            preview,
+        });
+    }
+
+    // Ground-truth-only words: collect, then apply strategy.
+    let gt_only: Vec<usize> = region
+        .iter()
+        .filter_map(|c| match c.op {
+            Op::Insert { gt_idx } => Some(gt_idx),
+            _ => None,
+        })
+        .collect();
+
+    // Equal/Fuzzy ops *inside* a boundary region (rare — most matches
+    // get pulled into Core) are still real matches; transfer the
+    // timestamps directly so we don't lose them.
+    for c in region {
+        match c.op {
+            Op::Equal { whisper_idx, gt_idx } | Op::Fuzzy { whisper_idx, gt_idx, .. } => {
+                let w = &whisper[whisper_idx];
+                let g = &ground_truth[gt_idx];
+                aligned.push(AlignedWord {
+                    text: g.text.clone(),
+                    start_seconds: w.start_seconds,
+                    end_seconds: w.end_seconds,
+                    confidence: if matches!(c.op, Op::Equal { .. }) {
+                        Confidence::Exact
+                    } else {
+                        Confidence::Fuzzy
+                    },
+                });
+            }
+            _ => {}
+        }
+    }
+
+    if gt_only.is_empty() {
+        return;
+    }
+
+    let gt_preview = preview_from_ground_truth(ground_truth, &gt_only);
+    trim_log.push(TrimEvent {
+        kind: match end {
+            BoundaryEnd::Leading => TrimKind::LeadingGroundTruth,
+            BoundaryEnd::Trailing => TrimKind::TrailingGroundTruth,
+        },
+        word_count: gt_only.len(),
+        preview: gt_preview,
+    });
+
+    match strategy {
+        BoundaryStrategy::Drop => {
+            // Words excluded entirely; nothing to push.
+        }
+        BoundaryStrategy::NoSync => {
+            for &g_idx in &gt_only {
+                aligned.push(AlignedWord {
+                    text: ground_truth[g_idx].text.clone(),
+                    start_seconds: 0.0,
+                    end_seconds: 0.0,
+                    confidence: Confidence::Unsynced,
+                });
+            }
+        }
+        BoundaryStrategy::Bracket => {
+            // Distribute the gap [gap_start, gap_end] proportionally
+            // by character count. If the gap is non-positive, fall
+            // back to Unsynced.
+            let total_chars: usize = gt_only
+                .iter()
+                .map(|&g| ground_truth[g].text.chars().count().max(1))
+                .sum();
+            let gap_duration = (gap_end - gap_start).max(0.0);
+            if gap_duration <= 0.0 || total_chars == 0 {
+                for &g_idx in &gt_only {
+                    aligned.push(AlignedWord {
+                        text: ground_truth[g_idx].text.clone(),
+                        start_seconds: 0.0,
+                        end_seconds: 0.0,
+                        confidence: Confidence::Unsynced,
+                    });
+                }
+                return;
+            }
+            let mut cursor = gap_start;
+            #[allow(clippy::cast_precision_loss)]
+            let per_char = gap_duration / total_chars as f64;
+            for &g_idx in &gt_only {
+                let chars = ground_truth[g_idx].text.chars().count().max(1);
+                #[allow(clippy::cast_precision_loss)]
+                let dur = chars as f64 * per_char;
+                aligned.push(AlignedWord {
+                    text: ground_truth[g_idx].text.clone(),
+                    start_seconds: cursor,
+                    end_seconds: cursor + dur,
+                    confidence: Confidence::Bracketed,
+                });
+                cursor += dur;
+            }
+        }
+    }
+}
+
+/// Walk the core region: Equal/Fuzzy copy timestamps; Delete is
+/// discarded (audio time is reclaimed by neighbours via interpolation
+/// of any adjacent Inserts); Insert interpolates from neighbours.
+fn transfer_core(
+    region: &[&ClassifiedOp],
+    whisper: &[WordTiming],
+    ground_truth: &[GroundTruthWord],
+    aligned: &mut Vec<AlignedWord>,
+) {
+    let mut i = 0;
+    while i < region.len() {
+        match region[i].op {
+            Op::Equal { whisper_idx, gt_idx } => {
+                let w = &whisper[whisper_idx];
+                aligned.push(AlignedWord {
+                    text: ground_truth[gt_idx].text.clone(),
+                    start_seconds: w.start_seconds,
+                    end_seconds: w.end_seconds,
+                    confidence: Confidence::Exact,
+                });
+                i += 1;
+            }
+            Op::Fuzzy { whisper_idx, gt_idx, .. } => {
+                let w = &whisper[whisper_idx];
+                aligned.push(AlignedWord {
+                    text: ground_truth[gt_idx].text.clone(),
+                    start_seconds: w.start_seconds,
+                    end_seconds: w.end_seconds,
+                    confidence: Confidence::Fuzzy,
+                });
+                i += 1;
+            }
+            Op::Insert { .. } => {
+                // Collect a run of consecutive Inserts (and any
+                // preceding/following Deletes) to interpolate as a
+                // group.
+                let group_start = i;
+                while i < region.len()
+                    && matches!(region[i].op, Op::Insert { .. } | Op::Delete { .. })
+                {
+                    i += 1;
+                }
+                let group_end = i;
+                interpolate_insert_run(
+                    &region[group_start..group_end],
+                    whisper,
+                    ground_truth,
+                    aligned,
+                );
+            }
+            Op::Delete { .. } => {
+                // Skip lone Delete (Whisper hallucination). Time is
+                // implicitly reclaimed: the next Equal will start
+                // wherever Whisper had it, leaving a small audible
+                // pause that's consistent with what was actually said.
+                i += 1;
+            }
+        }
+    }
+}
+
+/// Interpolate timestamps for a run of `Insert` operations (with
+/// optional adjacent Deletes that contribute their time span). Uses
+/// the previous aligned word's end and the next core match's start
+/// as bounds, distributing time proportionally to character length.
+fn interpolate_insert_run(
+    run: &[&ClassifiedOp],
+    whisper: &[WordTiming],
+    ground_truth: &[GroundTruthWord],
+    aligned: &mut Vec<AlignedWord>,
+) {
+    // Determine the time bounds.
+    let prev_end = aligned.last().map_or(0.0, |w| w.end_seconds);
+
+    // The "next" anchor time is whatever Whisper word the deletes
+    // span, or — if none — the previous timestamp + 0 (which collapses
+    // to zero-duration words; better than fabricated time).
+    let delete_indices: Vec<usize> = run
+        .iter()
+        .filter_map(|c| match c.op {
+            Op::Delete { whisper_idx } => Some(whisper_idx),
+            _ => None,
+        })
+        .collect();
+    let bound_end = if let Some(last_d) = delete_indices.last() {
+        whisper[*last_d].end_seconds
+    } else {
+        prev_end // No delete pool — pure insert with no following anchor here.
+    };
+
+    let inserts: Vec<usize> = run
+        .iter()
+        .filter_map(|c| match c.op {
+            Op::Insert { gt_idx } => Some(gt_idx),
+            _ => None,
+        })
+        .collect();
+
+    if inserts.is_empty() {
+        return;
+    }
+
+    let total_chars: usize = inserts
+        .iter()
+        .map(|&g| ground_truth[g].text.chars().count().max(1))
+        .sum();
+    let span = (bound_end - prev_end).max(0.0);
+
+    if span <= 0.0 || total_chars == 0 {
+        // Zero-duration: emit at prev_end. Reading systems will skip
+        // these instantly but the text remains.
+        for &g_idx in &inserts {
+            aligned.push(AlignedWord {
+                text: ground_truth[g_idx].text.clone(),
+                start_seconds: prev_end,
+                end_seconds: prev_end,
+                confidence: Confidence::Interpolated,
+            });
+        }
+        return;
+    }
+
+    let mut cursor = prev_end;
+    #[allow(clippy::cast_precision_loss)]
+    let per_char = span / total_chars as f64;
+    for &g_idx in &inserts {
+        let chars = ground_truth[g_idx].text.chars().count().max(1);
+        #[allow(clippy::cast_precision_loss)]
+        let dur = chars as f64 * per_char;
+        aligned.push(AlignedWord {
+            text: ground_truth[g_idx].text.clone(),
+            start_seconds: cursor,
+            end_seconds: cursor + dur,
+            confidence: Confidence::Interpolated,
+        });
+        cursor += dur;
+    }
+}
+
+fn first_core_match_time(core: &[&ClassifiedOp], whisper: &[WordTiming]) -> Option<f64> {
+    core.iter().find_map(|c| match c.op {
+        Op::Equal { whisper_idx, .. } | Op::Fuzzy { whisper_idx, .. } => {
+            Some(whisper[whisper_idx].start_seconds)
+        }
+        _ => None,
+    })
+}
+
+fn last_core_match_time(core: &[&ClassifiedOp], whisper: &[WordTiming]) -> Option<f64> {
+    core.iter().rev().find_map(|c| match c.op {
+        Op::Equal { whisper_idx, .. } | Op::Fuzzy { whisper_idx, .. } => {
+            Some(whisper[whisper_idx].end_seconds)
+        }
+        _ => None,
+    })
+}
+
+fn preview_from_whisper(whisper: &[WordTiming], indices: &[usize]) -> String {
+    let mut out = String::new();
+    for &i in indices.iter().take(15) {
+        if !out.is_empty() {
+            out.push(' ');
+        }
+        out.push_str(&whisper[i].text);
+        if out.len() > 80 {
+            break;
+        }
+    }
+    if indices.len() > 15 {
+        out.push_str(" …");
+    }
+    out
+}
+
+fn preview_from_ground_truth(gt: &[GroundTruthWord], indices: &[usize]) -> String {
+    let mut out = String::new();
+    for &i in indices.iter().take(15) {
+        if !out.is_empty() {
+            out.push(' ');
+        }
+        out.push_str(&gt[i].text);
+        if out.len() > 80 {
+            break;
+        }
+    }
+    if indices.len() > 15 {
+        out.push_str(" …");
+    }
+    out
+}
+
+/// Clamp any timestamp regressions so SMIL emits monotonic clip times.
+/// Whisper itself can occasionally produce overlapping word ranges
+/// (BPE artefacts) and our interpolation could in theory exceed the
+/// next anchor's start when characters dominate. A simple sweep fixes
+/// both.
+fn enforce_monotonicity(aligned: &mut [AlignedWord]) {
+    for i in 1..aligned.len() {
+        // Skip Unsynced entries — they hold zeros by design.
+        if aligned[i].confidence == Confidence::Unsynced
+            || aligned[i - 1].confidence == Confidence::Unsynced
+        {
+            continue;
+        }
+        let prev_end = aligned[i - 1].end_seconds;
+        if aligned[i].start_seconds < prev_end {
+            aligned[i].start_seconds = prev_end;
+        }
+        if aligned[i].end_seconds < aligned[i].start_seconds {
+            aligned[i].end_seconds = aligned[i].start_seconds;
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::boundary::classify;
+    use crate::diff::diff_words;
+    use crate::normalize;
+
+    fn ww(text: &str, start: f64, end: f64) -> WordTiming {
+        WordTiming {
+            start_seconds: start,
+            end_seconds: end,
+            text: text.to_owned(),
+        }
+    }
+
+    fn gt(words: &[&str]) -> Vec<GroundTruthWord> {
+        words
+            .iter()
+            .map(|s| GroundTruthWord {
+                text: (*s).to_owned(),
+                key: normalize::normalise(s),
+            })
+            .collect()
+    }
+
+    fn run(
+        whisper: &[WordTiming],
+        ground_truth: &[GroundTruthWord],
+        strategy: BoundaryStrategy,
+    ) -> (Vec<AlignedWord>, Vec<TrimEvent>) {
+        let script = diff_words(whisper, ground_truth);
+        let classified = classify(&script);
+        transfer_timestamps(whisper, ground_truth, &classified, strategy)
+    }
+
+    #[test]
+    fn perfect_match_passes_timestamps_through() {
+        let w = vec![ww("hello", 0.0, 0.5), ww("world", 0.5, 1.5)];
+        let g = gt(&["hello", "world"]);
+        let (aligned, _) = run(&w, &g, BoundaryStrategy::default());
+        assert_eq!(aligned.len(), 2);
+        assert_eq!(aligned[0].start_seconds, 0.0);
+        assert_eq!(aligned[0].end_seconds, 0.5);
+        assert_eq!(aligned[1].start_seconds, 0.5);
+        assert_eq!(aligned[1].end_seconds, 1.5);
+    }
+
+    #[test]
+    fn whisper_preamble_is_trimmed_no_time_smear() {
+        // 8 fake preamble words + 5 real matches.
+        let mut w: Vec<WordTiming> = (0..8)
+            .map(|i| ww(&format!("p{i}"), i as f64, i as f64 + 1.0))
+            .collect();
+        for i in 0..5 {
+            let t = 8.0 + i as f64;
+            w.push(ww(&format!("real{i}"), t, t + 1.0));
+        }
+        let g = gt(&["real0", "real1", "real2", "real3", "real4"]);
+        let (aligned, trim_log) = run(&w, &g, BoundaryStrategy::default());
+        // The 8 preamble words must NOT bleed into the first real word.
+        assert_eq!(aligned.len(), 5);
+        assert_eq!(aligned[0].start_seconds, 8.0);
+        // Trim log records the preamble.
+        assert!(trim_log.iter().any(|e| e.kind == TrimKind::LeadingWhisper && e.word_count == 8));
+    }
+
+    #[test]
+    fn colophon_no_sync_default() {
+        // 5 real matches + 5 colophon words only in ground truth.
+        let mut w: Vec<WordTiming> = Vec::new();
+        for i in 0..5 {
+            let t = i as f64;
+            w.push(ww(&format!("real{i}"), t, t + 1.0));
+        }
+        let mut g_words: Vec<&str> = vec!["real0", "real1", "real2", "real3", "real4"];
+        let colophon = ["isbn", "9780000000000", "copyright", "2024", "publisher"];
+        g_words.extend(colophon.iter());
+        let g = gt(&g_words);
+        let (aligned, trim_log) = run(&w, &g, BoundaryStrategy::NoSync);
+        assert_eq!(aligned.len(), 10);
+        // Last 5 should be Unsynced.
+        for w in &aligned[5..] {
+            assert_eq!(w.confidence, Confidence::Unsynced);
+            assert_eq!(w.start_seconds, 0.0);
+            assert_eq!(w.end_seconds, 0.0);
+        }
+        assert!(trim_log
+            .iter()
+            .any(|e| e.kind == TrimKind::TrailingGroundTruth && e.word_count == 5));
+    }
+
+    #[test]
+    fn colophon_drop_excludes_words() {
+        let mut w: Vec<WordTiming> = Vec::new();
+        for i in 0..5 {
+            let t = i as f64;
+            w.push(ww(&format!("real{i}"), t, t + 1.0));
+        }
+        let mut g_words: Vec<&str> = vec!["real0", "real1", "real2", "real3", "real4"];
+        g_words.extend(["isbn", "9780000000000"].iter());
+        let g = gt(&g_words);
+        let (aligned, _) = run(&w, &g, BoundaryStrategy::Drop);
+        assert_eq!(aligned.len(), 5);
+    }
+
+    #[test]
+    fn colophon_bracket_spans_gap() {
+        // 5 matches end at t=5.0, total audio extends to t=10.0,
+        // bracket strategy should distribute 5s across the colophon.
+        let mut w: Vec<WordTiming> = Vec::new();
+        for i in 0..5 {
+            let t = i as f64;
+            w.push(ww(&format!("real{i}"), t, t + 1.0));
+        }
+        // Add one more whisper word to extend the audio range.
+        w.push(ww("trailing-noise", 5.0, 10.0));
+        let mut g_words: Vec<&str> = vec!["real0", "real1", "real2", "real3", "real4"];
+        g_words.extend(["isbn", "page", "number"].iter());
+        let g = gt(&g_words);
+        let (aligned, _) = run(&w, &g, BoundaryStrategy::Bracket);
+        // Bracketed entries must have non-zero durations and be in
+        // [5.0, 10.0].
+        let bracketed: Vec<_> = aligned
+            .iter()
+            .filter(|w| w.confidence == Confidence::Bracketed)
+            .collect();
+        assert!(!bracketed.is_empty());
+        for w in &bracketed {
+            assert!(w.start_seconds >= 5.0);
+            assert!(w.end_seconds <= 10.0 + 0.01);
+            assert!(w.end_seconds > w.start_seconds);
+        }
+    }
+
+    #[test]
+    fn fuzzy_match_transfers_timestamp() {
+        let w = vec![ww("antwerpe", 1.0, 2.0)];
+        let g = gt(&["antwerpen"]);
+        let (aligned, _) = run(&w, &g, BoundaryStrategy::default());
+        assert_eq!(aligned.len(), 1);
+        assert_eq!(aligned[0].text, "antwerpen");
+        assert_eq!(aligned[0].confidence, Confidence::Fuzzy);
+        assert_eq!(aligned[0].start_seconds, 1.0);
+    }
+
+    #[test]
+    fn missing_word_interpolates_between_neighbours() {
+        // Whisper missed "the": "hello world" vs "hello the world".
+        let w = vec![
+            ww("hello", 0.0, 1.0),
+            ww("world", 2.0, 3.0),
+        ];
+        let g = gt(&["hello", "the", "world"]);
+        // No anchor here (only 2 matches). The classify() falls back
+        // to Core for everything, so interpolation runs on the full
+        // script.
+        let (aligned, _) = run(&w, &g, BoundaryStrategy::default());
+        assert_eq!(aligned.len(), 3);
+        assert_eq!(aligned[1].text, "the");
+        assert_eq!(aligned[1].confidence, Confidence::Interpolated);
+        // "the" should be sandwiched between hello.end (1.0) and
+        // world.start (2.0).
+        assert!(aligned[1].start_seconds >= 1.0);
+        assert!(aligned[1].end_seconds <= 2.0 + 0.01);
+    }
+
+    #[test]
+    fn timestamps_remain_monotonic() {
+        let w = vec![
+            ww("a", 0.0, 1.0),
+            ww("b", 2.0, 3.0),
+            ww("c", 4.0, 5.0),
+        ];
+        let g = gt(&["a", "missing", "b", "c"]);
+        let (aligned, _) = run(&w, &g, BoundaryStrategy::default());
+        for i in 1..aligned.len() {
+            if aligned[i].confidence == Confidence::Unsynced
+                || aligned[i - 1].confidence == Confidence::Unsynced
+            {
+                continue;
+            }
+            assert!(
+                aligned[i].start_seconds >= aligned[i - 1].end_seconds - 1e-9,
+                "non-monotonic at {i}: {:?} → {:?}",
+                aligned[i - 1],
+                aligned[i],
+            );
+        }
+    }
+}
diff --git a/crates/dpub-cli/src/config.rs b/crates/dpub-cli/src/config.rs
index 6778277..d57e90e 100644
--- a/crates/dpub-cli/src/config.rs
+++ b/crates/dpub-cli/src/config.rs
@@ -45,6 +45,10 @@ pub struct DpubConfig {
     pub jobs: Option<usize>,
     /// Default log level (`"error"`, `"warn"`, `"info"`, `"debug"`, `"trace"`).
     pub log_level: Option<String>,
+    /// Default ground truth file path.
+    pub ground_truth: Option<PathBuf>,
+    /// Default ground truth strategy: `"drop"`, `"no-sync"`, or `"bracket"`.
+    pub ground_truth_strategy: Option<String>,
 }
 
 /// Return the platform-appropriate config directory for dpub.
@@ -106,7 +110,9 @@ pub fn example_json() -> &'static str {
   "validate": false,
   "a11y": false,
   "jobs": 0,
-  "log_level": "info"
+  "log_level": "info",
+  "ground_truth": null,
+  "ground_truth_strategy": "no-sync"
 }"#
 }
 
diff --git a/crates/dpub-cli/src/main.rs b/crates/dpub-cli/src/main.rs
index 65d4d03..4a59910 100644
--- a/crates/dpub-cli/src/main.rs
+++ b/crates/dpub-cli/src/main.rs
@@ -80,6 +80,15 @@ enum Command {
         /// source DAISY metadata.
         #[arg(long, value_name = "TEXT")]
         rights: Option<String>,
+        /// Path to the book's text (plain text or markdown). Section
+        /// headings are matched against the DAISY NCC; word-level
+        /// timestamps come from Whisper. Requires `--transcribe`.
+        #[arg(long, value_name = "PATH", requires = "transcribe")]
+        ground_truth: Option<PathBuf>,
+        /// How to handle book content the narrator skipped (colophon,
+        /// index, etc.) when using `--ground-truth`.
+        #[arg(long, value_enum, default_value_t = GroundTruthStrategyOpt::NoSync, value_name = "STRATEGY")]
+        ground_truth_strategy: GroundTruthStrategyOpt,
     },
     /// Validate an existing EPUB 3 publication with EPUBCheck.
     Validate {
@@ -179,6 +188,38 @@ impl AudioOpt {
     }
 }
 
+#[derive(Clone, Copy, ValueEnum, Default)]
+enum GroundTruthStrategyOpt {
+    /// Drop ground-truth-only words (colophon etc.) entirely.
+    Drop,
+    /// Include the text but emit no Media Overlay entry — visible in
+    /// the XHTML, no karaoke highlight on those passages. Default.
+    #[default]
+    NoSync,
+    /// Span the available time gap proportionally — highlight bar
+    /// moves through the words at average speed.
+    Bracket,
+}
+
+impl GroundTruthStrategyOpt {
+    fn into_strategy(self) -> dpub_convert::BoundaryStrategy {
+        match self {
+            Self::Drop => dpub_convert::BoundaryStrategy::Drop,
+            Self::NoSync => dpub_convert::BoundaryStrategy::NoSync,
+            Self::Bracket => dpub_convert::BoundaryStrategy::Bracket,
+        }
+    }
+
+    fn parse_str(s: &str) -> Option<Self> {
+        match s {
+            "drop" => Some(Self::Drop),
+            "no-sync" => Some(Self::NoSync),
+            "bracket" => Some(Self::Bracket),
+            _ => None,
+        }
+    }
+}
+
 fn main() -> Result<()> {
     // Load config early so we can use log_level before tracing init.
     let cfg = config::load();
@@ -214,6 +255,8 @@ fn main() -> Result<()> {
             cover,
             no_auto_cover,
             rights,
+            ground_truth,
+            ground_truth_strategy,
         } => {
             let audio = audio.unwrap_or_else(|| parse_audio_opt(&cfg));
             let bitrate = bitrate.unwrap_or_else(|| {
@@ -236,10 +279,26 @@ fn main() -> Result<()> {
                 Some(config::TranscribeSetting::Language(lang)) => Some(lang.clone()),
                 _ => None,
             });
+            // Merge ground truth: CLI > config > none.
+            let ground_truth = ground_truth.or_else(|| cfg.ground_truth.clone());
+            // Merge boundary strategy: CLI > config > NoSync default.
+            // CLI's default is NoSync; we treat it as "not explicitly
+            // set" only when the user passed nothing AND the config
+            // has a value.
+            let boundary_strategy = match (
+                ground_truth_strategy,
+                cfg.ground_truth_strategy
+                    .as_deref()
+                    .and_then(GroundTruthStrategyOpt::parse_str),
+            ) {
+                // CLI was explicitly NoSync (default) and config has a setting → use config.
+                (GroundTruthStrategyOpt::NoSync, Some(cfg_val)) => cfg_val,
+                (cli_val, _) => cli_val,
+            };
             cmd_convert(
                 &ncc, &output, validate, a11y, audio, bitrate, transcribe,
                 whisper_model, no_text_cleanup, no_word_sync, cover,
-                auto_cover, rights,
+                auto_cover, rights, ground_truth, boundary_strategy,
             )
         }
         Command::Validate { epub, json } => cmd_validate(&epub, json),
@@ -287,6 +346,8 @@ fn cmd_convert(
     cover: Option<PathBuf>,
     auto_cover: bool,
     rights: Option<String>,
+    ground_truth: Option<PathBuf>,
+    ground_truth_strategy: GroundTruthStrategyOpt,
 ) -> Result<()> {
     let ncc = resolve_ncc_path(ncc)?;
     let book = Book::from_ncc(&ncc).with_context(|| format!("loading {}", ncc.display()))?;
@@ -361,6 +422,24 @@ fn cmd_convert(
         println!("  Cover: best-effort lookup via Open Library");
     }
 
+    if let Some(path) = &ground_truth {
+        if !path.is_file() {
+            anyhow::bail!(
+                "ground truth file not found at {}",
+                path.display()
+            );
+        }
+        println!(
+            "  Ground truth: {} (strategy: {})",
+            path.display(),
+            match ground_truth_strategy {
+                GroundTruthStrategyOpt::Drop => "drop",
+                GroundTruthStrategyOpt::NoSync => "no-sync",
+                GroundTruthStrategyOpt::Bracket => "bracket",
+            },
+        );
+    }
+
     let opts = dpub_convert::ConvertOptions {
         audio: audio.into_format(bitrate_kbps),
         transcribe: transcribe_opts,
@@ -369,6 +448,8 @@ fn cmd_convert(
         auto_cover,
         rights,
         no_word_sync,
+        ground_truth,
+        boundary_strategy: ground_truth_strategy.into_strategy(),
     };
     let start = std::time::Instant::now();
     dpub_convert::convert_to_file(&book, output, &opts)
@@ -808,6 +889,8 @@ fn cmd_batch(
         auto_cover: true,
         rights: None,
         no_word_sync: false,
+        ground_truth: None,
+        boundary_strategy: dpub_convert::BoundaryStrategy::default(),
     };
     let start = std::time::Instant::now();
     let entries: Vec<BatchEntry> = books
diff --git a/crates/dpub-convert/Cargo.toml b/crates/dpub-convert/Cargo.toml
index 63eef37..04b25ae 100644
--- a/crates/dpub-convert/Cargo.toml
+++ b/crates/dpub-convert/Cargo.toml
@@ -18,11 +18,13 @@ metal = ["dpub-whisper/metal"]
 cuda = ["dpub-whisper/cuda"]
 
 [dependencies]
+dpub-align = { path = "../dpub-align", version = "0.6.0" }
 dpub-audio = { path = "../dpub-audio", version = "0.6.0" }
 dpub-core = { path = "../dpub-core", version = "0.6.0" }
 dpub-meta = { path = "../dpub-meta", version = "0.6.0" }
 dpub-util = { path = "../dpub-util", version = "0.6.0" }
 dpub-whisper = { path = "../dpub-whisper", version = "0.6.0" }
+tracing = { workspace = true }
 epub3-writer = { path = "../epub3-writer", version = "0.6.0" }
 thiserror = { workspace = true }
 uuid = { workspace = true }
diff --git a/crates/dpub-convert/src/error.rs b/crates/dpub-convert/src/error.rs
index 5045d50..3f7e5b7 100644
--- a/crates/dpub-convert/src/error.rs
+++ b/crates/dpub-convert/src/error.rs
@@ -23,6 +23,16 @@ pub enum Error {
 
     #[error("unsupported cover image at {path}: only JPEG and PNG are accepted")]
     UnsupportedCoverImage { path: PathBuf },
+
+    #[error("--ground-truth requires --transcribe (Whisper provides timestamps)")]
+    GroundTruthWithoutTranscribe,
+
+    #[error("ground truth file at {path} could not be read: {source}")]
+    GroundTruthIo {
+        path: PathBuf,
+        #[source]
+        source: std::io::Error,
+    },
 }
 
 pub type Result<T> = std::result::Result<T, Error>;
diff --git a/crates/dpub-convert/src/lib.rs b/crates/dpub-convert/src/lib.rs
index a2751c4..f31350d 100644
--- a/crates/dpub-convert/src/lib.rs
+++ b/crates/dpub-convert/src/lib.rs
@@ -20,6 +20,7 @@ use rayon::prelude::*;
 
 mod error;
 mod text_cleanup;
+pub use dpub_align::BoundaryStrategy;
 pub use error::{Error, Result};
 
 /// Convert a parsed DAISY 2.02 [`Book`] into an EPUB 3 [`Publication`].
@@ -205,8 +206,17 @@ fn build_sections(
                 .strip_suffix(".smil")
                 .unwrap_or(&section_ref.src)
                 .to_owned();
+            // XML Names (and OPF manifest IDs are XML Names) cannot
+            // start with a digit. DAISY filenames often do
+            // (`001_…`, `002_…`), so prefix those stems with `s-`.
             let id = if stem.is_empty() {
                 format!("section-{:03}", idx + 1)
+            } else if stem
+                .chars()
+                .next()
+                .is_some_and(|c| c.is_ascii_digit())
+            {
+                format!("s-{stem}")
             } else {
                 stem.clone()
             };
@@ -571,6 +581,15 @@ pub struct ConvertOptions {
     /// highlight-along-with-audio. Set this to keep SMIL files
     /// small at the cost of a coarser reading experience.
     pub no_word_sync: bool,
+    /// Optional path to a ground truth text file (plain text or
+    /// markdown). When set together with `transcribe`, Whisper still
+    /// runs to produce timestamps but the EPUB ships with the real
+    /// book text aligned word-by-word against Whisper's word stream.
+    pub ground_truth: Option<std::path::PathBuf>,
+    /// How to handle ground-truth-only words (book content the
+    /// narrator skipped — colophon, index, etc.). Default
+    /// `BoundaryStrategy::NoSync`.
+    pub boundary_strategy: dpub_align::BoundaryStrategy,
 }
 
 /// Convert and write a DAISY 2.02 publication to an EPUB 3 file in one call.
@@ -615,7 +634,11 @@ pub fn convert_to_file(book: &Book, output: &Path, opts: &ConvertOptions) -> Res
             transcribe,
             opts.raw_transcript_segments,
             opts.no_word_sync,
+            opts.ground_truth.as_deref(),
+            opts.boundary_strategy,
         )?;
+    } else if opts.ground_truth.is_some() {
+        return Err(Error::GroundTruthWithoutTranscribe);
     }
 
     // Recompression has to happen *before* the ZIP write because the writer
@@ -736,12 +759,15 @@ fn sniff_image_format(bytes: &[u8]) -> Option<(&'static str, &'static str)> {
 /// When `raw_segments` is `true`, the per-segment Whisper output is emitted
 /// directly (one `<p>` per ~10–30 s segment); the default `false` runs
 /// `text_cleanup::merge_into_paragraphs` to produce prose-shaped output.
+#[allow(clippy::too_many_arguments)]
 fn inject_transcripts(
     book: &Book,
     publication: &mut Publication,
     opts: &TranscribeOptions,
     raw_segments: bool,
     no_word_sync: bool,
+    ground_truth_path: Option<&std::path::Path>,
+    boundary_strategy: dpub_align::BoundaryStrategy,
 ) -> Result<()> {
     let whisper_opts = dpub_whisper::TranscribeOptions {
         model_path: opts.model_path.clone(),
@@ -752,6 +778,40 @@ fn inject_transcripts(
     // weights into Metal/CUDA buffers for every audio file (#10).
     let transcriber = dpub_whisper::Transcriber::new(&whisper_opts)?;
 
+    // Read and split the ground truth file once, mapping section
+    // index → owned section text. None when no ground truth is in use.
+    let ground_truth_by_section: Option<std::collections::HashMap<usize, String>> =
+        if let Some(path) = ground_truth_path {
+            let text = std::fs::read_to_string(path).map_err(|source| Error::GroundTruthIo {
+                path: path.to_path_buf(),
+                source,
+            })?;
+            // Use the master.smil section titles (one per section,
+            // 1:1 with publication.sections) so the alignment maps
+            // directly onto section indices without an extra lookup.
+            let headings: Vec<(&str, usize)> = book
+                .master
+                .references
+                .iter()
+                .enumerate()
+                .map(|(i, r)| (r.title.as_str(), i))
+                .collect();
+            let sections = dpub_align::split_into_sections(&text, &headings);
+            tracing::info!(
+                "ground truth: matched {}/{} sections",
+                sections.len(),
+                headings.len()
+            );
+            Some(
+                sections
+                    .into_iter()
+                    .map(|s| (s.ncc_index, s.text))
+                    .collect(),
+            )
+        } else {
+            None
+        };
+
     // Cache: file basename → segments. Reused across sections that share an
     // audio file.
     let mut cache: std::collections::HashMap<String, Vec<dpub_whisper::Segment>> =
@@ -796,11 +856,33 @@ fn inject_transcripts(
         let new_paragraphs = if raw_segments {
             render_raw_paragraphs(idx, &section_segments)
         } else {
-            let cleaned = text_cleanup::merge_into_paragraphs(
-                &section_segments,
-                &section_audio_srcs,
-                &text_cleanup::CleanupOpts::default(),
-            );
+            // Choose the cleanup path: ground truth alignment when
+            // available for this section, else heuristic merging of
+            // raw Whisper output.
+            let cleaned = match ground_truth_by_section
+                .as_ref()
+                .and_then(|m| m.get(&idx))
+            {
+                Some(gt_text) => align_with_ground_truth(
+                    idx,
+                    &section_segments,
+                    &section_audio_srcs,
+                    gt_text,
+                    boundary_strategy,
+                )
+                .unwrap_or_else(|| {
+                    text_cleanup::merge_into_paragraphs(
+                        &section_segments,
+                        &section_audio_srcs,
+                        &text_cleanup::CleanupOpts::default(),
+                    )
+                }),
+                None => text_cleanup::merge_into_paragraphs(
+                    &section_segments,
+                    &section_audio_srcs,
+                    &text_cleanup::CleanupOpts::default(),
+                ),
+            };
             let html = render_cleaned_paragraphs(idx, &cleaned);
             // Word-level Media Overlay sync: rebuild this section's
             // overlay from the cleaned paragraphs, replacing the
@@ -816,7 +898,15 @@ fn inject_transcripts(
                     idx,
                     &cleaned,
                 );
-                overlay.root = new_root;
+                // Only swap in the rebuilt tree if it actually has
+                // synced words. If every word in this section ended
+                // up filtered out (all-Unsynced ground truth, or all
+                // zero-duration interpolations), keep the existing
+                // heading-level overlay shell so we don't ship an
+                // empty SMIL body.
+                if has_par_descendant(&new_root) {
+                    overlay.root = new_root;
+                }
             }
             html
         };
@@ -828,6 +918,101 @@ fn inject_transcripts(
     Ok(())
 }
 
+/// Run ground truth alignment for one section. Builds a flat
+/// chronological Whisper word stream from `segments` (paired with
+/// per-segment audio basenames), splits the result into
+/// `text_cleanup::Paragraph` values that drop into the existing
+/// pipeline. Returns `None` if alignment was not possible (no audio,
+/// no words) so the caller falls back to heuristic cleanup.
+fn align_with_ground_truth(
+    section_idx: usize,
+    segments: &[dpub_whisper::Segment],
+    audio_srcs: &[String],
+    ground_truth: &str,
+    boundary_strategy: dpub_align::BoundaryStrategy,
+) -> Option<Vec<text_cleanup::Paragraph>> {
+    if segments.is_empty() || ground_truth.trim().is_empty() {
+        return None;
+    }
+    // Flatten all Whisper words from all segments into one stream.
+    let mut whisper_words: Vec<dpub_align::WordTiming> = Vec::new();
+    for seg in segments {
+        for w in &seg.words {
+            whisper_words.push(dpub_align::WordTiming {
+                start_seconds: w.start_seconds,
+                end_seconds: w.end_seconds,
+                text: w.text.clone(),
+            });
+        }
+    }
+    if whisper_words.is_empty() {
+        return None;
+    }
+    // The aligner doesn't know about per-word audio sources — when a
+    // section spans multiple audio files we use the first file's name
+    // for all paragraphs. Splitting paragraphs at audio boundaries is
+    // possible but rare for this input shape (one section ≈ one
+    // audio file in DAISY 2.02), so we punt for v1.
+    let primary_audio = audio_srcs.first().cloned().unwrap_or_default();
+
+    let result = dpub_align::align_section(
+        &whisper_words,
+        ground_truth,
+        &primary_audio,
+        boundary_strategy,
+    )
+    .ok()?;
+
+    for event in &result.trim_log {
+        let label = match event.kind {
+            dpub_align::TrimKind::LeadingWhisper => "leading whisper-only",
+            dpub_align::TrimKind::TrailingWhisper => "trailing whisper-only",
+            dpub_align::TrimKind::LeadingGroundTruth => "leading ground-truth-only",
+            dpub_align::TrimKind::TrailingGroundTruth => "trailing ground-truth-only",
+        };
+        tracing::info!(
+            "align: section {section_idx} trimmed {} {label} words: \"{}\"",
+            event.word_count,
+            event.preview,
+        );
+    }
+
+    let paragraphs: Vec<text_cleanup::Paragraph> = result
+        .paragraphs
+        .into_iter()
+        .map(|ap| text_cleanup::Paragraph {
+            start_seconds: ap.start_seconds,
+            end_seconds: ap.end_seconds,
+            text: ap.text,
+            audio_src: ap.audio_src,
+            words: ap
+                .words
+                .into_iter()
+                // Unsynced words keep their text in the XHTML span
+                // (so the text is readable) but carry start==end==0
+                // so build_word_overlay_seq omits them from SMIL —
+                // the colophon is visible without a fake audio sync.
+                .map(|w| dpub_whisper::Word {
+                    start_seconds: w.start_seconds,
+                    end_seconds: w.end_seconds,
+                    text: w.text,
+                })
+                .collect(),
+        })
+        .collect();
+    Some(paragraphs)
+}
+
+/// Returns true if the seq contains at least one `<par>` somewhere
+/// in its subtree. Mirrors the same check in the SMIL writer; used
+/// here to decide whether to replace the heading-level overlay.
+fn has_par_descendant(seq: &OverlaySeq) -> bool {
+    seq.children.iter().any(|c| match c {
+        OverlayItem::Par(_) => true,
+        OverlayItem::Seq(inner) => has_par_descendant(inner),
+    })
+}
+
 fn render_raw_paragraphs(section_idx: usize, segments: &[dpub_whisper::Segment]) -> String {
     let mut out = String::new();
     for (para_idx, seg) in segments.iter().enumerate() {
@@ -870,6 +1055,23 @@ fn build_word_overlay_seq(
 
         let mut word_children: Vec<OverlayItem> = Vec::with_capacity(para.words.len());
         for (word_idx, word) in para.words.iter().enumerate() {
+            // Skip "unsynced" words: explicit sentinel
+            // (start==end==0) for ground-truth-only material under
+            // NoSync strategy, and also any zero-duration word that
+            // slipped through interpolation (clipBegin == clipEnd
+            // would fail EPUBCheck MED-009). The XHTML span is still
+            // emitted so the text remains readable.
+            //
+            // Use the same millisecond rounding the SMIL writer
+            // applies — two distinct f64 timestamps can round to the
+            // same `H:MM:SS.fff` string and trip MED-009.
+            #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
+            let start_ms = (word.start_seconds * 1000.0).round() as i64;
+            #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
+            let end_ms = (word.end_seconds * 1000.0).round() as i64;
+            if end_ms <= start_ms {
+                continue;
+            }
             let word_id = format!("w-{section_idx:03}-{para_idx:03}-{word_idx:03}");
             word_children.push(OverlayItem::Par(OverlayPar {
                 id: Some(word_id.clone()),
@@ -879,6 +1081,12 @@ fn build_word_overlay_seq(
                 clip_end_seconds: word.end_seconds,
             }));
         }
+        // Empty paragraph-level <seq> elements fail EPUBCheck RSC-005
+        // ("element seq incomplete"). Skip the entire paragraph
+        // wrapper when no synced words remain.
+        if word_children.is_empty() {
+            continue;
+        }
         top_children.push(OverlayItem::Seq(OverlaySeq {
             textref: Some(para_textref),
             children: word_children,
@@ -1214,6 +1422,109 @@ mod tests {
         );
     }
 
+    #[test]
+    fn build_word_overlay_seq_drops_words_collapsing_to_same_millisecond() {
+        // f64 timestamps that round to the same millisecond when the
+        // SMIL writer formats them as H:MM:SS.fff would emit
+        // clipBegin == clipEnd, tripping EPUBCheck MED-009. The
+        // builder must mirror that rounding to filter such words.
+        let para = text_cleanup::Paragraph {
+            start_seconds: 0.0,
+            end_seconds: 1.0,
+            text: "ok bad ok".into(),
+            words: vec![
+                dpub_whisper::Word {
+                    start_seconds: 36.5302,
+                    end_seconds: 36.5304, // both round to 36.530
+                    text: "bad".into(),
+                },
+                dpub_whisper::Word {
+                    start_seconds: 36.6,
+                    end_seconds: 36.8,
+                    text: "ok".into(),
+                },
+            ],
+            audio_src: "a.mp3".into(),
+        };
+        let root = build_word_overlay_seq("content/x.xhtml", 0, &[para]);
+        let OverlayItem::Seq(inner) = &root.children[0] else {
+            panic!("expected paragraph seq");
+        };
+        // Only the second word survives.
+        assert_eq!(inner.children.len(), 1);
+    }
+
+    #[test]
+    fn build_word_overlay_seq_drops_zero_duration_words() {
+        // Words with start == end (interpolation collapsed to a
+        // zero-width slot) would produce SMIL `clipBegin == clipEnd`,
+        // which EPUBCheck rejects (MED-009).
+        let para = text_cleanup::Paragraph {
+            start_seconds: 0.0,
+            end_seconds: 1.0,
+            text: "ok bad ok".into(),
+            words: vec![
+                dpub_whisper::Word {
+                    start_seconds: 0.0,
+                    end_seconds: 0.5,
+                    text: "ok".into(),
+                },
+                dpub_whisper::Word {
+                    // zero-duration: must be filtered.
+                    start_seconds: 0.5,
+                    end_seconds: 0.5,
+                    text: "bad".into(),
+                },
+                dpub_whisper::Word {
+                    start_seconds: 0.5,
+                    end_seconds: 1.0,
+                    text: "ok".into(),
+                },
+            ],
+            audio_src: "a.mp3".into(),
+        };
+        let root = build_word_overlay_seq("content/x.xhtml", 0, &[para]);
+        let OverlayItem::Seq(inner) = &root.children[0] else {
+            panic!("expected paragraph seq");
+        };
+        // Two pars survive (the middle one was zero-duration).
+        assert_eq!(inner.children.len(), 2);
+    }
+
+    #[test]
+    fn build_word_overlay_seq_drops_paragraph_with_only_unsynced_words() {
+        // Every word zero-duration → empty paragraph seq → must be
+        // skipped entirely so the SMIL writer doesn't emit an empty
+        // <seq> (EPUBCheck RSC-005).
+        let p_all_unsynced = text_cleanup::Paragraph {
+            start_seconds: 0.0,
+            end_seconds: 0.0,
+            text: "colophon".into(),
+            words: vec![
+                dpub_whisper::Word {
+                    start_seconds: 0.0,
+                    end_seconds: 0.0,
+                    text: "colophon".into(),
+                },
+            ],
+            audio_src: "a.mp3".into(),
+        };
+        let p_real = text_cleanup::Paragraph {
+            start_seconds: 1.0,
+            end_seconds: 2.0,
+            text: "Real".into(),
+            words: vec![dpub_whisper::Word {
+                start_seconds: 1.0,
+                end_seconds: 2.0,
+                text: "Real".into(),
+            }],
+            audio_src: "a.mp3".into(),
+        };
+        let root = build_word_overlay_seq("content/x.xhtml", 0, &[p_all_unsynced, p_real]);
+        // The unsynced paragraph wrapper was dropped.
+        assert_eq!(root.children.len(), 1);
+    }
+
     #[test]
     fn render_cleaned_paragraphs_escapes_word_text() {
         let para = text_cleanup::Paragraph {
diff --git a/crates/dpub-convert/tests/real_conversion.rs b/crates/dpub-convert/tests/real_conversion.rs
index f007f7e..ee8c55c 100644
--- a/crates/dpub-convert/tests/real_conversion.rs
+++ b/crates/dpub-convert/tests/real_conversion.rs
@@ -123,6 +123,8 @@ fn opus_recompression_shrinks_real_book() {
             auto_cover: false,
             rights: None,
             no_word_sync: false,
+            ground_truth: None,
+            boundary_strategy: dpub_convert::BoundaryStrategy::default(),
         },
     )
     .expect("write opus");
diff --git a/crates/epub3-writer/src/writers.rs b/crates/epub3-writer/src/writers.rs
index eaa76c9..56980c8 100644
--- a/crates/epub3-writer/src/writers.rs
+++ b/crates/epub3-writer/src/writers.rs
@@ -314,6 +314,11 @@ pub fn write_overlay_smil(overlay: &MediaOverlay) -> String {
 }
 
 fn write_overlay_seq(s: &mut String, seq: &OverlaySeq, indent: usize) {
+    // EPUBCheck rejects empty <seq> elements (RSC-005 "element seq
+    // incomplete"). Recursively empty branches are also dropped.
+    if !seq_has_par_descendant(seq) {
+        return;
+    }
     let pad = " ".repeat(indent);
     let textref = seq
         .textref
@@ -330,6 +335,16 @@ fn write_overlay_seq(s: &mut String, seq: &OverlaySeq, indent: usize) {
     let _ = write!(s, "{pad}</seq>\n");
 }
 
+/// Recursively check whether a `<seq>` contains at least one `<par>`
+/// somewhere in its tree. Used to skip empty branches that would fail
+/// EPUBCheck.
+fn seq_has_par_descendant(seq: &OverlaySeq) -> bool {
+    seq.children.iter().any(|c| match c {
+        OverlayItem::Par(_) => true,
+        OverlayItem::Seq(inner) => seq_has_par_descendant(inner),
+    })
+}
+
 fn write_overlay_par(s: &mut String, par: &OverlayPar, indent: usize) {
     let pad = " ".repeat(indent);
     let id_attr = par