From ade05b6c4ee4c7228be2569ec274c15a53f8109b Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 01:43:58 +0800 Subject: [PATCH 01/20] bench: add benchmarks and results --- Cargo.toml | 5 +++ README.md | 28 +++++++++++++ benches/srt.rs | 85 ++++++++++++++++++++++++++++++++++++- benches/vtt.rs | 111 +++++++++++++++++++++++++++++++++++++++++++++++++ src/srt.rs | 78 ++++++++++++++++++++-------------- src/vtt.rs | 109 ++++++++++++++++++++++++++++-------------------- 6 files changed, 340 insertions(+), 76 deletions(-) create mode 100644 benches/vtt.rs diff --git a/Cargo.toml b/Cargo.toml index 6f6464b..3189008 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,11 @@ path = "benches/srt.rs" name = "srt" harness = false +[[bench]] +path = "benches/vtt.rs" +name = "vtt" +harness = false + [features] default = ["std"] alloc = ["memchr"] diff --git a/README.md b/README.md index 45e9e6a..50300a5 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,34 @@ fasrt = "0.2" | `alloc` | No | Enables `CueText` DOM tree and entity decoding without `std` | | `memchr` | Yes (via `alloc`/`std`) | SIMD-accelerated fast path for entity decoding | +## Benchmarks + +Measured on Apple Silicon with `cargo bench` (Criterion). + +### SRT + +| Benchmark | Input | Time | Throughput | +|-----------|-------|------|------------| +| Parse (strict) | 2 cues, 89 B | ~316 ns | 281 MiB/s | +| Parse (strict) | 26 KB file | ~39 µs | 645 MiB/s | +| Parse (lossy) | 332 files, ~8 MB | ~12.5 ms | 624 MiB/s | +| Collect into `Vec` | 26 KB file | ~41 µs | 606 MiB/s | + +### WebVTT + +| Benchmark | Input | Time | Throughput | +|-----------|-------|------|------------| +| Parse | 2 cues, 96 B | ~299 ns | 309 MiB/s | +| Parse | Settings + region + style, 354 B | ~829 ns | 427 MiB/s | +| Parse | All WPT fixtures, ~34 KB | ~103 µs | 342 MiB/s | +| Collect into `Vec` | Settings + region + style, 354 B | ~1.08 µs | 329 MiB/s | + +Run benchmarks yourself: + +```sh +cargo bench +``` + #### License `fasrt` is under the terms of both the MIT license and the diff --git a/benches/srt.rs b/benches/srt.rs index f328e4d..977ecc8 100644 --- a/benches/srt.rs +++ b/benches/srt.rs @@ -1 +1,84 @@ -fn main() {} +use std::hint::black_box; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use fasrt::srt::Parser; + +const SMALL_SRT: &str = "\ +1 +00:00:01,000 --> 00:00:04,000 +Hello world! + +2 +00:00:05,000 --> 00:00:08,000 +Goodbye world! +"; + +const MEDIUM_SRT: &str = include_str!("../fixtures/srt/DeathNote_01.eng.srt"); + +fn load_all_fixtures() -> String { + let mut buf = String::new(); + for entry in std::fs::read_dir("fixtures/srt").unwrap() { + let entry = entry.unwrap(); + if entry.path().extension().is_some_and(|e| e == "srt") { + if let Ok(content) = std::fs::read_to_string(entry.path()) { + buf.push_str(&content); + } + } + } + buf +} + +fn bench_srt_parse(c: &mut Criterion) { + let all_fixtures = load_all_fixtures(); + + let mut group = c.benchmark_group("srt/parse"); + + // Small inline SRT + group.throughput(Throughput::Bytes(SMALL_SRT.len() as u64)); + group.bench_function(BenchmarkId::new("strict", "small_2_cues"), |b| { + b.iter(|| { + let count = Parser::strict(black_box(SMALL_SRT)).count(); + black_box(count); + }); + }); + + // Medium real file (~26 KB) + group.throughput(Throughput::Bytes(MEDIUM_SRT.len() as u64)); + group.bench_function(BenchmarkId::new("strict", "medium_26kb"), |b| { + b.iter(|| { + let count = Parser::strict(black_box(MEDIUM_SRT)).count(); + black_box(count); + }); + }); + + // All fixtures (~8 MB) + group.throughput(Throughput::Bytes(all_fixtures.len() as u64)); + group.bench_function(BenchmarkId::new("lossy", "all_fixtures_8mb"), |b| { + b.iter(|| { + let count = Parser::lossy(black_box(&all_fixtures)).count(); + black_box(count); + }); + }); + + group.finish(); +} + +fn bench_srt_collect(c: &mut Criterion) { + let mut group = c.benchmark_group("srt/collect"); + + // Collect into Vec to measure allocation overhead + group.throughput(Throughput::Bytes(MEDIUM_SRT.len() as u64)); + group.bench_function("medium_26kb", |b| { + b.iter(|| { + let entries: Vec<_> = Parser::strict(black_box(MEDIUM_SRT)) + .collect::>() + .unwrap(); + black_box(entries.len()); + }); + }); + + group.finish(); +} + +criterion_group!(benches, bench_srt_parse, bench_srt_collect); +criterion_main!(benches); diff --git a/benches/vtt.rs b/benches/vtt.rs new file mode 100644 index 0000000..229a32b --- /dev/null +++ b/benches/vtt.rs @@ -0,0 +1,111 @@ +use std::hint::black_box; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use fasrt::vtt::Parser; + +const SMALL_VTT: &str = "\ +WEBVTT + +00:00:00.000 --> 00:00:01.000 +Hello world! + +00:00:01.000 --> 00:00:02.000 +Goodbye world! +"; + +const SETTINGS_VTT: &str = "\ +WEBVTT + +STYLE +::cue { color: white; } + +REGION +id:region1 +width:40% +lines:3 +regionanchor:0%,100% +viewportanchor:10%,90% +scroll:up + +cue-1 +00:00:00.000 --> 00:00:01.000 align:start position:10% size:80% line:0 vertical:rl region:region1 +Bold and italic text + +NOTE This is a comment + +00:00:01.000 --> 00:00:05.000 +Second cue with voice tag +"; + +fn load_all_vtt_fixtures() -> String { + let mut buf = String::new(); + for dir in &[ + "fixtures/webvtt/wpt-file-parsing", + "fixtures/webvtt/wpt-cue-parsing", + ] { + if let Ok(entries) = std::fs::read_dir(dir) { + for entry in entries { + let entry = entry.unwrap(); + if entry.path().extension().is_some_and(|e| e == "vtt") { + buf.push_str(&std::fs::read_to_string(entry.path()).unwrap()); + buf.push_str("\n\n"); + } + } + } + } + buf +} + +fn bench_vtt_parse(c: &mut Criterion) { + let all_fixtures = load_all_vtt_fixtures(); + + let mut group = c.benchmark_group("vtt/parse"); + + // Small inline VTT + group.throughput(Throughput::Bytes(SMALL_VTT.len() as u64)); + group.bench_function(BenchmarkId::new("parse", "small_2_cues"), |b| { + b.iter(|| { + let count = Parser::new(black_box(SMALL_VTT)).count(); + black_box(count); + }); + }); + + // VTT with settings, regions, styles, cue options + group.throughput(Throughput::Bytes(SETTINGS_VTT.len() as u64)); + group.bench_function(BenchmarkId::new("parse", "with_settings"), |b| { + b.iter(|| { + let count = Parser::new(black_box(SETTINGS_VTT)).count(); + black_box(count); + }); + }); + + // All VTT fixtures + group.throughput(Throughput::Bytes(all_fixtures.len() as u64)); + group.bench_function(BenchmarkId::new("parse", "all_fixtures"), |b| { + b.iter(|| { + let count = Parser::new(black_box(&all_fixtures)).count(); + black_box(count); + }); + }); + + group.finish(); +} + +fn bench_vtt_collect(c: &mut Criterion) { + let mut group = c.benchmark_group("vtt/collect"); + + group.throughput(Throughput::Bytes(SETTINGS_VTT.len() as u64)); + group.bench_function("with_settings", |b| { + b.iter(|| { + let blocks: Vec<_> = Parser::new(black_box(SETTINGS_VTT)) + .collect::>() + .unwrap(); + black_box(blocks.len()); + }); + }); + + group.finish(); +} + +criterion_group!(benches, bench_vtt_parse, bench_vtt_collect); +criterion_main!(benches); diff --git a/src/srt.rs b/src/srt.rs index 29291a2..841e726 100644 --- a/src/srt.rs +++ b/src/srt.rs @@ -3,6 +3,7 @@ use core::num::NonZeroU64; use logos::{Lexer, Logos}; use crate::error::*; +use crate::types::{Millisecond, Minute, Second}; pub use types::{Entry, Header, Hour, Timestamp}; @@ -322,35 +323,45 @@ fn parse_number(s: &mut Lexer<'_, Token>) -> Result { #[inline] fn parse_header(s: &mut Lexer<'_, Token>) -> Result { - let s = s.slice().trim(); - let mut parts = s.split(" --> "); - match (parts.next(), parts.next()) { - (Some(start), Some(end)) => { - let start = parse_timestamp(start)?; - let end = parse_timestamp(end)?; - Ok(Header::new(start, end)) - } - _ => panic!("logos regex should guarantee this never happens"), - } + let slice = s.slice(); + // The regex guarantees `H+:MM:SS,mmm --> H+:MM:SS,mmm`. + // Find the arrow separator and extract both timestamps via byte arithmetic. + let arrow = slice.find("-->").unwrap(); + let start_str = slice[..arrow].trim(); + let end_str = slice[arrow + 3..].trim(); + let start = parse_timestamp_bytes(start_str.as_bytes())?; + let end = parse_timestamp_bytes(end_str.as_bytes())?; + Ok(Header::new(start, end)) } +/// Parse an SRT timestamp from raw bytes using direct digit extraction. +/// +/// Format: `H+:MM:SS,mmm` where H is 2-3 digits (regex-validated), +/// MM and SS are exactly 2 digits, mmm is exactly 3 digits. +/// Layout from the end: `mmm`(3) `,`(1) `SS`(2) `:`(1) `MM`(2) `:`(1) = 10 fixed bytes. #[inline] -fn parse_timestamp(s: &str) -> Result { - let mut parts = s.split(","); - - match (parts.next(), parts.next()) { - (Some(hms), Some(millis)) => { - let mut hms_parts = hms.split(':'); - - let (h, m, s) = match (hms_parts.next(), hms_parts.next(), hms_parts.next()) { - (Some(h), Some(m), Some(s)) => (h.parse()?, m.parse()?, s.parse()?), - _ => panic!("logos regex should guarantee this never happens"), - }; - let millis = millis.parse()?; - Ok(Timestamp::from_hmsm(h, m, s, millis)) - } - _ => panic!("logos regex should guarantee this never happens"), - } +fn parse_timestamp_bytes(b: &[u8]) -> Result { + let len = b.len(); + let millis = Millisecond(digit3(&b[len - 3..])); + let seconds = Second(digit2(&b[len - 6..len - 4])); + let minutes = Minute(digit2(&b[len - 9..len - 7])); + let hour_len = len - 10; + let hours = match hour_len { + 2 => Hour(digit2(&b[..2]) as u16), + 3 => Hour(digit3(&b[..3])), + _ => return Err(ParseHourError::NotPadded.into()), + }; + Ok(Timestamp::from_hmsm(hours, minutes, seconds, millis)) +} + +#[cfg_attr(not(tarpaulin), inline(always))] +const fn digit2(b: &[u8]) -> u8 { + (b[0] - b'0') * 10 + (b[1] - b'0') +} + +#[cfg_attr(not(tarpaulin), inline(always))] +const fn digit3(b: &[u8]) -> u16 { + (b[0] - b'0') as u16 * 100 + (b[1] - b'0') as u16 * 10 + (b[2] - b'0') as u16 } struct StateBody { @@ -736,18 +747,23 @@ impl<'a> Iterator for Lines<'a> { return None; } - let remaining = &self.input[self.pos..]; - let line_end = remaining - .find('\n') + let bytes = &self.input.as_bytes()[self.pos..]; + + #[cfg(feature = "memchr")] + let found = memchr::memchr(b'\n', bytes); + #[cfg(not(feature = "memchr"))] + let found = bytes.iter().position(|&b| b == b'\n'); + + let line_end = found .map(|i| { - let end = if i > 0 && remaining.as_bytes()[i - 1] == b'\r' { + let end = if i > 0 && bytes[i - 1] == b'\r' { i - 1 } else { i }; (end, i + 1) }) - .unwrap_or((remaining.len(), remaining.len())); + .unwrap_or((bytes.len(), bytes.len())); let line = &self.input[self.pos..self.pos + line_end.0]; self.pos += line_end.1; diff --git a/src/vtt.rs b/src/vtt.rs index 46104dd..ec8ab48 100644 --- a/src/vtt.rs +++ b/src/vtt.rs @@ -1,6 +1,9 @@ use logos::{Lexer, Logos}; -use crate::{error::*, types::Millisecond}; +use crate::{ + error::*, + types::{Millisecond, Minute, Second}, +}; pub use types::{ Align, Anchor, Block, Cue, CueId, CueOptions, Header, Hour, Line, LineAlign, LineValue, @@ -104,36 +107,55 @@ fn split_arrow(s: &str) -> Result<(&str, &str), ParseVttError> { } /// Parse a timestamp in either long form `HH:MM:SS.mmm` or short form -/// `MM:SS.mmm`. Auto-detects by counting `:` separators. +/// `MM:SS.mmm` using direct byte extraction. +/// +/// The logos regex guarantees the format, so we can extract fields by +/// fixed offsets from the end: `mmm`(3) `.`(1) `SS`(2) `:`(1) `MM`(2) +/// = 9 fixed bytes. If there's a `:`(1) before that, everything before +/// it is the hours component. #[inline] pub(crate) fn parse_timestamp(s: &str) -> Result { - let (time, ms) = s - .split_once('.') - .ok_or(ParseVttError::InvalidTimestamp("missing '.'"))?; - let millis: Millisecond = ms.parse()?; - let mut parts = time.splitn(3, ':'); - let first = parts - .next() - .ok_or(ParseVttError::InvalidTimestamp("empty timestamp"))?; - let second = parts.next().ok_or(ParseVttError::InvalidTimestamp( - "missing minutes or seconds", - ))?; - match parts.next() { - // Long form: HH:MM:SS - Some(sec_str) => Ok(Timestamp::from_hmsm( - first.parse()?, - second.parse()?, - sec_str.parse()?, - millis, - )), - // Short form: MM:SS - None => Ok(Timestamp::from_hmsm( - Hour::new(), - first.parse()?, - second.parse()?, - millis, - )), + let b = s.as_bytes(); + let len = b.len(); + if len < 9 { + return Err(ParseVttError::InvalidTimestamp("too short")); } + let millis = Millisecond(vtt_digit3(&b[len - 3..])); + // b[len-4] == b'.' + let seconds = Second(vtt_digit2(&b[len - 6..len - 4])); + // b[len-7] == b':' + let minutes = Minute(vtt_digit2(&b[len - 9..len - 7])); + let hours = if len > 9 { + // Long form: has hours before the second ':' + // b[len-10] == b':' + let hour_str = &s[..len - 10]; + // VTT hours are unbounded u64; parse the variable-length prefix + parse_vtt_hour_bytes(hour_str.as_bytes()) + .map_err(|_| ParseVttError::InvalidTimestamp("invalid hours"))? + } else { + Hour::new() + }; + Ok(Timestamp::from_hmsm(hours, minutes, seconds, millis)) +} + +/// Parse VTT hour bytes (variable-length, unbounded u64). +#[cfg_attr(not(tarpaulin), inline(always))] +fn parse_vtt_hour_bytes(b: &[u8]) -> Result { + let mut val: u64 = 0; + for &byte in b { + val = val * 10 + (byte - b'0') as u64; + } + Ok(Hour(val)) +} + +#[cfg_attr(not(tarpaulin), inline(always))] +const fn vtt_digit2(b: &[u8]) -> u8 { + (b[0] - b'0') * 10 + (b[1] - b'0') +} + +#[cfg_attr(not(tarpaulin), inline(always))] +const fn vtt_digit3(b: &[u8]) -> u16 { + (b[0] - b'0') as u16 * 100 + (b[1] - b'0') as u16 * 10 + (b[2] - b'0') as u16 } /// Lex the next token from a line. Returns `Ok(None)` when the line @@ -1253,28 +1275,27 @@ impl<'a> Iterator for Lines<'a> { } let bytes = &self.input.as_bytes()[self.pos..]; - let mut i = 0; - while i < bytes.len() { - if bytes[i] == b'\n' { - let line = &self.input[self.pos..self.pos + i]; - self.pos += i + 1; - return Some(line); - } else if bytes[i] == b'\r' { + + #[cfg(feature = "memchr")] + let found = memchr::memchr2(b'\n', b'\r', bytes); + #[cfg(not(feature = "memchr"))] + let found = bytes.iter().position(|&b| b == b'\n' || b == b'\r'); + + match found { + Some(i) => { let line = &self.input[self.pos..self.pos + i]; - // Check for CRLF - if i + 1 < bytes.len() && bytes[i + 1] == b'\n' { + if bytes[i] == b'\r' && i + 1 < bytes.len() && bytes[i + 1] == b'\n' { self.pos += i + 2; } else { self.pos += i + 1; } - return Some(line); + Some(line) + } + None => { + let line = &self.input[self.pos..]; + self.pos = self.input.len(); + Some(line) } - i += 1; } - - // No line terminator found - rest of input - let line = &self.input[self.pos..]; - self.pos = self.input.len(); - Some(line) } } From 1f89be3ae95041a02c7b8e484b977b84348f7d5f Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 02:11:11 +0800 Subject: [PATCH 02/20] fix comments --- benches/srt.rs | 3 +++ src/srt.rs | 1 + src/vtt.rs | 24 +++++++++++++++++++----- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/benches/srt.rs b/benches/srt.rs index 977ecc8..19ef303 100644 --- a/benches/srt.rs +++ b/benches/srt.rs @@ -21,6 +21,9 @@ fn load_all_fixtures() -> String { let entry = entry.unwrap(); if entry.path().extension().is_some_and(|e| e == "srt") { if let Ok(content) = std::fs::read_to_string(entry.path()) { + if !buf.is_empty() { + buf.push_str("\n\n"); + } buf.push_str(&content); } } diff --git a/src/srt.rs b/src/srt.rs index 841e726..e55bd61 100644 --- a/src/srt.rs +++ b/src/srt.rs @@ -347,6 +347,7 @@ fn parse_timestamp_bytes(b: &[u8]) -> Result { let minutes = Minute(digit2(&b[len - 9..len - 7])); let hour_len = len - 10; let hours = match hour_len { + 1 => Hour((b[0] - b'0') as u16), 2 => Hour(digit2(&b[..2]) as u16), 3 => Hour(digit3(&b[..3])), _ => return Err(ParseHourError::NotPadded.into()), diff --git a/src/vtt.rs b/src/vtt.rs index ec8ab48..292a63d 100644 --- a/src/vtt.rs +++ b/src/vtt.rs @@ -120,14 +120,19 @@ pub(crate) fn parse_timestamp(s: &str) -> Result { if len < 9 { return Err(ParseVttError::InvalidTimestamp("too short")); } + // Validate expected separators before doing byte arithmetic. + // This guards against malformed input from cue-text parsing where + // the regex only loosely validates the format. + if b[len - 4] != b'.' || b[len - 7] != b':' { + return Err(ParseVttError::InvalidTimestamp("invalid format")); + } let millis = Millisecond(vtt_digit3(&b[len - 3..])); - // b[len-4] == b'.' let seconds = Second(vtt_digit2(&b[len - 6..len - 4])); - // b[len-7] == b':' let minutes = Minute(vtt_digit2(&b[len - 9..len - 7])); let hours = if len > 9 { - // Long form: has hours before the second ':' - // b[len-10] == b':' + if b[len - 10] != b':' { + return Err(ParseVttError::InvalidTimestamp("invalid format")); + } let hour_str = &s[..len - 10]; // VTT hours are unbounded u64; parse the variable-length prefix parse_vtt_hour_bytes(hour_str.as_bytes()) @@ -139,11 +144,20 @@ pub(crate) fn parse_timestamp(s: &str) -> Result { } /// Parse VTT hour bytes (variable-length, unbounded u64). +/// +/// Uses checked arithmetic to prevent overflow on extremely large +/// hour values from untrusted input. #[cfg_attr(not(tarpaulin), inline(always))] fn parse_vtt_hour_bytes(b: &[u8]) -> Result { let mut val: u64 = 0; for &byte in b { - val = val * 10 + (byte - b'0') as u64; + if !byte.is_ascii_digit() { + return Err(ParseHourError::NotPadded); + } + val = val + .checked_mul(10) + .and_then(|v| v.checked_add((byte - b'0') as u64)) + .ok_or(ParseHourError::NotPadded)?; } Ok(Hour(val)) } From 13d0edade6c809beb26e22e99f53aafaa1915df7 Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 02:33:48 +0800 Subject: [PATCH 03/20] improve performance --- src/srt.rs | 3 +-- src/vtt.rs | 13 ++++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/srt.rs b/src/srt.rs index e55bd61..9d133c3 100644 --- a/src/srt.rs +++ b/src/srt.rs @@ -294,7 +294,7 @@ impl Default for Options { enum Token { /// Header "HH:MM:SS,mmm --> HH:MM:SS,mmm" #[regex( - r"[0-9]{1,3}:[0-5][0-9]:[0-5][0-9],[0-9]{3}[ \t\x0C]+-->[ \t\x0C]+[0-9]{1,3}:[0-5][0-9]:[0-5][0-9],[0-9]{3}", + r"[0-9]{2,3}:[0-5][0-9]:[0-5][0-9],[0-9]{3}[ \t\x0C]+-->[ \t\x0C]+[0-9]{2,3}:[0-5][0-9]:[0-5][0-9],[0-9]{3}", parse_header )] Header(Header), @@ -347,7 +347,6 @@ fn parse_timestamp_bytes(b: &[u8]) -> Result { let minutes = Minute(digit2(&b[len - 9..len - 7])); let hour_len = len - 10; let hours = match hour_len { - 1 => Hour((b[0] - b'0') as u16), 2 => Hour(digit2(&b[..2]) as u16), 3 => Hour(digit3(&b[..3])), _ => return Err(ParseHourError::NotPadded.into()), diff --git a/src/vtt.rs b/src/vtt.rs index 292a63d..bae0461 100644 --- a/src/vtt.rs +++ b/src/vtt.rs @@ -126,9 +126,16 @@ pub(crate) fn parse_timestamp(s: &str) -> Result { if b[len - 4] != b'.' || b[len - 7] != b':' { return Err(ParseVttError::InvalidTimestamp("invalid format")); } - let millis = Millisecond(vtt_digit3(&b[len - 3..])); - let seconds = Second(vtt_digit2(&b[len - 6..len - 4])); - let minutes = Minute(vtt_digit2(&b[len - 9..len - 7])); + let millis_val = vtt_digit3(&b[len - 3..]); + let seconds_val = vtt_digit2(&b[len - 6..len - 4]); + let minutes_val = vtt_digit2(&b[len - 9..len - 7]); + // Validate ranges — needed for cue-text path where input is not regex-validated. + if seconds_val >= 60 || minutes_val >= 60 { + return Err(ParseVttError::InvalidTimestamp("out of range")); + } + let millis = Millisecond(millis_val); + let seconds = Second(seconds_val); + let minutes = Minute(minutes_val); let hours = if len > 9 { if b[len - 10] != b':' { return Err(ParseVttError::InvalidTimestamp("invalid format")); From a2a379e53f82da967ceacbfd51ebda48fdbedbd7 Mon Sep 17 00:00:00 2001 From: Al Liu Date: Thu, 12 Mar 2026 02:42:14 +0800 Subject: [PATCH 04/20] Update benches/vtt.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- benches/vtt.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/benches/vtt.rs b/benches/vtt.rs index 229a32b..1c02cc2 100644 --- a/benches/vtt.rs +++ b/benches/vtt.rs @@ -43,16 +43,20 @@ fn load_all_vtt_fixtures() -> String { "fixtures/webvtt/wpt-file-parsing", "fixtures/webvtt/wpt-cue-parsing", ] { - if let Ok(entries) = std::fs::read_dir(dir) { - for entry in entries { - let entry = entry.unwrap(); - if entry.path().extension().is_some_and(|e| e == "vtt") { - buf.push_str(&std::fs::read_to_string(entry.path()).unwrap()); - buf.push_str("\n\n"); - } + let entries = std::fs::read_dir(dir) + .expect("failed to read VTT fixture directory"); + for entry in entries { + let entry = entry.unwrap(); + if entry.path().extension().is_some_and(|e| e == "vtt") { + buf.push_str(&std::fs::read_to_string(entry.path()).unwrap()); + buf.push_str("\n\n"); } } } + assert!( + !buf.is_empty(), + "No VTT fixtures were loaded from fixtures/webvtt" + ); buf } From 30b5c4a19d1c33b255711b51811574044d689988 Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 02:51:49 +0800 Subject: [PATCH 05/20] improve performance --- src/error.rs | 5 +++++ src/vtt.rs | 21 +++++++++++---------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/error.rs b/src/error.rs index 9dee2bb..f0a5437 100644 --- a/src/error.rs +++ b/src/error.rs @@ -50,6 +50,11 @@ pub enum ParseHourError { /// The hour component is out of range (not between 0-999). #[error("hour component must be between 0-999, but was {0}")] Overflow(u16), + /// The hour component overflowed u64 (VTT unbounded hours). + #[error("hour component overflowed")] + #[unwrap(ignore)] + #[try_unwrap(ignore)] + HourOverflow, /// Not a valid number. #[error(transparent)] ParseInt(#[from] ParseIntError), diff --git a/src/vtt.rs b/src/vtt.rs index bae0461..314984e 100644 --- a/src/vtt.rs +++ b/src/vtt.rs @@ -129,13 +129,14 @@ pub(crate) fn parse_timestamp(s: &str) -> Result { let millis_val = vtt_digit3(&b[len - 3..]); let seconds_val = vtt_digit2(&b[len - 6..len - 4]); let minutes_val = vtt_digit2(&b[len - 9..len - 7]); - // Validate ranges — needed for cue-text path where input is not regex-validated. - if seconds_val >= 60 || minutes_val >= 60 { - return Err(ParseVttError::InvalidTimestamp("out of range")); - } - let millis = Millisecond(millis_val); - let seconds = Second(seconds_val); - let minutes = Minute(minutes_val); + // Use try_with to validate ranges — needed for cue-text path where input + // is not regex-validated. + let millis = Millisecond::try_with(millis_val) + .ok_or(ParseVttError::InvalidTimestamp("milliseconds out of range"))?; + let seconds = + Second::try_with(seconds_val).ok_or(ParseVttError::InvalidTimestamp("seconds out of range"))?; + let minutes = + Minute::try_with(minutes_val).ok_or(ParseVttError::InvalidTimestamp("minutes out of range"))?; let hours = if len > 9 { if b[len - 10] != b':' { return Err(ParseVttError::InvalidTimestamp("invalid format")); @@ -164,7 +165,7 @@ fn parse_vtt_hour_bytes(b: &[u8]) -> Result { val = val .checked_mul(10) .and_then(|v| v.checked_add((byte - b'0') as u64)) - .ok_or(ParseHourError::NotPadded)?; + .ok_or(ParseHourError::HourOverflow)?; } Ok(Hour(val)) } @@ -1297,9 +1298,9 @@ impl<'a> Iterator for Lines<'a> { let bytes = &self.input.as_bytes()[self.pos..]; - #[cfg(feature = "memchr")] + #[cfg(all(feature = "memchr", miri))] let found = memchr::memchr2(b'\n', b'\r', bytes); - #[cfg(not(feature = "memchr"))] + #[cfg(not(all(feature = "memchr", miri)))] let found = bytes.iter().position(|&b| b == b'\n' || b == b'\r'); match found { From cb9bee5f4a5b88685465457cfd778574823606c0 Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 02:57:27 +0800 Subject: [PATCH 06/20] improve performance --- benches/vtt.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benches/vtt.rs b/benches/vtt.rs index 1c02cc2..a7eb32e 100644 --- a/benches/vtt.rs +++ b/benches/vtt.rs @@ -43,8 +43,7 @@ fn load_all_vtt_fixtures() -> String { "fixtures/webvtt/wpt-file-parsing", "fixtures/webvtt/wpt-cue-parsing", ] { - let entries = std::fs::read_dir(dir) - .expect("failed to read VTT fixture directory"); + let entries = std::fs::read_dir(dir).expect("failed to read VTT fixture directory"); for entry in entries { let entry = entry.unwrap(); if entry.path().extension().is_some_and(|e| e == "vtt") { From da9be044cfc78f6f067f1215d245b967f93429e4 Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 03:10:52 +0800 Subject: [PATCH 07/20] improve performance --- src/vtt.rs | 78 +++++++++++++++++++++++++++++++++++++++++++------- src/vtt/cue.rs | 8 ++++-- 2 files changed, 72 insertions(+), 14 deletions(-) diff --git a/src/vtt.rs b/src/vtt.rs index 314984e..b499ce6 100644 --- a/src/vtt.rs +++ b/src/vtt.rs @@ -113,6 +113,13 @@ fn split_arrow(s: &str) -> Result<(&str, &str), ParseVttError> { /// fixed offsets from the end: `mmm`(3) `.`(1) `SS`(2) `:`(1) `MM`(2) /// = 9 fixed bytes. If there's a `:`(1) before that, everything before /// it is the hours component. +/// +/// # Safety assumption +/// +/// Callers **must** ensure that `s` has already been validated by a +/// logos regex (which guarantees digit/separator placement). +/// For loosely-validated input (e.g. unterminated cue-text tags), use +/// [`parse_timestamp_cue`] instead. #[inline] pub(crate) fn parse_timestamp(s: &str) -> Result { let b = s.as_bytes(); @@ -120,17 +127,42 @@ pub(crate) fn parse_timestamp(s: &str) -> Result { if len < 9 { return Err(ParseVttError::InvalidTimestamp("too short")); } - // Validate expected separators before doing byte arithmetic. - // This guards against malformed input from cue-text parsing where - // the regex only loosely validates the format. + let millis = Millisecond(vtt_digit3(&b[len - 3..])); + let seconds = Second(vtt_digit2(&b[len - 6..len - 4])); + let minutes = Minute(vtt_digit2(&b[len - 9..len - 7])); + let hours = if len > 9 { + let hour_str = &s[..len - 10]; + parse_vtt_hour_bytes(hour_str.as_bytes()) + .map_err(|_| ParseVttError::InvalidTimestamp("invalid hours"))? + } else { + Hour::new() + }; + Ok(Timestamp::from_hmsm(hours, minutes, seconds, millis)) +} + +/// Parse a timestamp from cue-text, where input is only loosely validated +/// by the regex `<[0-9:]+\.[0-9]{3}>`. +/// +/// Unlike [`parse_timestamp`], this validates separators and digit bytes +/// before doing any arithmetic, so it is safe for untrusted input. +#[inline] +pub(crate) fn parse_timestamp_cue(s: &str) -> Result { + let b = s.as_bytes(); + let len = b.len(); + if len < 9 { + return Err(ParseVttError::InvalidTimestamp("too short")); + } + // Validate separators. if b[len - 4] != b'.' || b[len - 7] != b':' { return Err(ParseVttError::InvalidTimestamp("invalid format")); } - let millis_val = vtt_digit3(&b[len - 3..]); - let seconds_val = vtt_digit2(&b[len - 6..len - 4]); - let minutes_val = vtt_digit2(&b[len - 9..len - 7]); - // Use try_with to validate ranges — needed for cue-text path where input - // is not regex-validated. + // Validate digit bytes before arithmetic. + let millis_val = + vtt_digit3_checked(&b[len - 3..]).ok_or(ParseVttError::InvalidTimestamp("invalid digits"))?; + let seconds_val = vtt_digit2_checked(&b[len - 6..len - 4]) + .ok_or(ParseVttError::InvalidTimestamp("invalid digits"))?; + let minutes_val = vtt_digit2_checked(&b[len - 9..len - 7]) + .ok_or(ParseVttError::InvalidTimestamp("invalid digits"))?; let millis = Millisecond::try_with(millis_val) .ok_or(ParseVttError::InvalidTimestamp("milliseconds out of range"))?; let seconds = @@ -142,7 +174,6 @@ pub(crate) fn parse_timestamp(s: &str) -> Result { return Err(ParseVttError::InvalidTimestamp("invalid format")); } let hour_str = &s[..len - 10]; - // VTT hours are unbounded u64; parse the variable-length prefix parse_vtt_hour_bytes(hour_str.as_bytes()) .map_err(|_| ParseVttError::InvalidTimestamp("invalid hours"))? } else { @@ -170,16 +201,41 @@ fn parse_vtt_hour_bytes(b: &[u8]) -> Result { Ok(Hour(val)) } +/// Unchecked 2-digit extraction. Caller must guarantee ASCII digits. #[cfg_attr(not(tarpaulin), inline(always))] const fn vtt_digit2(b: &[u8]) -> u8 { (b[0] - b'0') * 10 + (b[1] - b'0') } +/// Unchecked 3-digit extraction. Caller must guarantee ASCII digits. #[cfg_attr(not(tarpaulin), inline(always))] const fn vtt_digit3(b: &[u8]) -> u16 { (b[0] - b'0') as u16 * 100 + (b[1] - b'0') as u16 * 10 + (b[2] - b'0') as u16 } +/// Checked 2-digit extraction. Returns `None` if any byte is not an ASCII digit. +#[cfg_attr(not(tarpaulin), inline(always))] +fn vtt_digit2_checked(b: &[u8]) -> Option { + let d0 = b[0].wrapping_sub(b'0'); + let d1 = b[1].wrapping_sub(b'0'); + if d0 > 9 || d1 > 9 { + return None; + } + Some(d0 * 10 + d1) +} + +/// Checked 3-digit extraction. Returns `None` if any byte is not an ASCII digit. +#[cfg_attr(not(tarpaulin), inline(always))] +fn vtt_digit3_checked(b: &[u8]) -> Option { + let d0 = b[0].wrapping_sub(b'0'); + let d1 = b[1].wrapping_sub(b'0'); + let d2 = b[2].wrapping_sub(b'0'); + if d0 > 9 || d1 > 9 || d2 > 9 { + return None; + } + Some(d0 as u16 * 100 + d1 as u16 * 10 + d2 as u16) +} + /// Lex the next token from a line. Returns `Ok(None)` when the line /// produces no tokens. #[cfg_attr(not(tarpaulin), inline(always))] @@ -1298,9 +1354,9 @@ impl<'a> Iterator for Lines<'a> { let bytes = &self.input.as_bytes()[self.pos..]; - #[cfg(all(feature = "memchr", miri))] + #[cfg(all(feature = "memchr", not(miri)))] let found = memchr::memchr2(b'\n', b'\r', bytes); - #[cfg(not(all(feature = "memchr", miri)))] + #[cfg(any(not(feature = "memchr"), miri))] let found = bytes.iter().position(|&b| b == b'\n' || b == b'\r'); match found { diff --git a/src/vtt/cue.rs b/src/vtt/cue.rs index 64483d9..a810b86 100644 --- a/src/vtt/cue.rs +++ b/src/vtt/cue.rs @@ -95,8 +95,9 @@ enum RawCueToken<'a> { StartLang(&'a str), // ── timestamp tag ───────────────────────────────────────────────────── - /// `<[digits/colons].[3 digits]>` — validated by `parse_timestamp` later. - #[regex(r"<[0-9:]+\.[0-9]{3}>")] + /// `` or `` — fully validated by the DFA so + /// the fast unchecked `parse_timestamp` can be used directly. + #[regex(r"<(?:[0-9]+:)?[0-5][0-9]:[0-5][0-9]\.[0-9]{3}>")] Timestamp(&'a str), // ── fallbacks (skipped by the iterator) ─────────────────────────────── @@ -574,6 +575,7 @@ impl<'a> Iterator for CueParser<'a> { // ── timestamp ── Ok(RawCueToken::Timestamp(s)) => { let content = &s[1..s.len() - 1]; // strip `<` and `>` + // Regex already validates format; use unchecked fast path. if let Ok(ts) = super::parse_timestamp(content) { return Some(CueToken::Timestamp(ts)); } @@ -608,7 +610,7 @@ fn try_parse_unterminated<'a>(slice: &'a str) -> Option> { // Try timestamp: digits/colons + "." + 3 digits if inner.as_bytes()[0].is_ascii_digit() { - if let Ok(ts) = super::parse_timestamp(inner) { + if let Ok(ts) = super::parse_timestamp_cue(inner) { return Some(CueToken::Timestamp(ts)); } return None; From 3f6146bddc718f9f20d402ffd9757b68463c66f6 Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 03:19:32 +0800 Subject: [PATCH 08/20] improve performance --- benches/vtt.rs | 58 +++++++++++++++++++++++++++++++++++++++++++++++++- src/srt.rs | 4 ++-- 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/benches/vtt.rs b/benches/vtt.rs index a7eb32e..b3e6a61 100644 --- a/benches/vtt.rs +++ b/benches/vtt.rs @@ -2,6 +2,7 @@ use std::hint::black_box; use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; use fasrt::vtt::Parser; +use fasrt::vtt::cue::CueParser; const SMALL_VTT: &str = "\ WEBVTT @@ -110,5 +111,60 @@ fn bench_vtt_collect(c: &mut Criterion) { group.finish(); } -criterion_group!(benches, bench_vtt_parse, bench_vtt_collect); +/// Cue text with many timestamp tags to benchmark the cue-text parsing path. +fn build_cue_text_with_timestamps() -> String { + let mut s = String::new(); + for i in 0..500 { + let h = i / 3600; + let m = (i % 3600) / 60; + let sec = i % 60; + s.push_str(&format!( + "word <{h:02}:{m:02}:{sec:02}.{ms:03}>text", + h = h, + m = m, + sec = sec, + ms = (i * 7) % 1000 + )); + } + s +} + +/// Cue text with tags but no timestamps. +const CUE_TEXT_TAGS: &str = "\ +bold bold-italic plain underline \ +voice english baseruby \ +classed & < >   end"; + +fn bench_vtt_cue_text(c: &mut Criterion) { + let ts_input = build_cue_text_with_timestamps(); + + let mut group = c.benchmark_group("vtt/cue_text"); + + // Tags only (no timestamps) + group.throughput(Throughput::Bytes(CUE_TEXT_TAGS.len() as u64)); + group.bench_function("tags_only", |b| { + b.iter(|| { + let count = CueParser::new(black_box(CUE_TEXT_TAGS)).count(); + black_box(count); + }); + }); + + // Timestamp-heavy cue text + group.throughput(Throughput::Bytes(ts_input.len() as u64)); + group.bench_function("500_timestamps", |b| { + b.iter(|| { + let count = CueParser::new(black_box(&ts_input)).count(); + black_box(count); + }); + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_vtt_parse, + bench_vtt_collect, + bench_vtt_cue_text +); criterion_main!(benches); diff --git a/src/srt.rs b/src/srt.rs index 9d133c3..1f279eb 100644 --- a/src/srt.rs +++ b/src/srt.rs @@ -749,9 +749,9 @@ impl<'a> Iterator for Lines<'a> { let bytes = &self.input.as_bytes()[self.pos..]; - #[cfg(feature = "memchr")] + #[cfg(all(feature = "memchr", not(miri)))] let found = memchr::memchr(b'\n', bytes); - #[cfg(not(feature = "memchr"))] + #[cfg(not(all(feature = "memchr", not(miri))))] let found = bytes.iter().position(|&b| b == b'\n'); let line_end = found From f5e7549fe13629cfc43265484503dcdc17a0d22f Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 10:41:13 +0800 Subject: [PATCH 09/20] improve performance --- README.md | 23 +++++++---- src/error.rs | 14 +++++-- src/vtt.rs | 12 +++++- tests/vtt_cue_text.rs | 90 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 126 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 50300a5..8177d26 100644 --- a/README.md +++ b/README.md @@ -78,19 +78,26 @@ Measured on Apple Silicon with `cargo bench` (Criterion). | Benchmark | Input | Time | Throughput | |-----------|-------|------|------------| -| Parse (strict) | 2 cues, 89 B | ~316 ns | 281 MiB/s | -| Parse (strict) | 26 KB file | ~39 µs | 645 MiB/s | -| Parse (lossy) | 332 files, ~8 MB | ~12.5 ms | 624 MiB/s | -| Collect into `Vec` | 26 KB file | ~41 µs | 606 MiB/s | +| Parse (strict) | 2 cues, 89 B | ~170 ns | 520 MiB/s | +| Parse (strict) | 26 KB file | ~38 µs | 661 MiB/s | +| Parse (lossy) | 332 files, ~8 MB | ~12.1 ms | 646 MiB/s | +| Collect into `Vec` | 26 KB file | ~40 µs | 616 MiB/s | ### WebVTT | Benchmark | Input | Time | Throughput | |-----------|-------|------|------------| -| Parse | 2 cues, 96 B | ~299 ns | 309 MiB/s | -| Parse | Settings + region + style, 354 B | ~829 ns | 427 MiB/s | -| Parse | All WPT fixtures, ~34 KB | ~103 µs | 342 MiB/s | -| Collect into `Vec` | Settings + region + style, 354 B | ~1.08 µs | 329 MiB/s | +| Parse | 2 cues, 96 B | ~318 ns | 291 MiB/s | +| Parse | Settings + region + style, 354 B | ~915 ns | 387 MiB/s | +| Parse | All WPT fixtures, ~34 KB | ~113 µs | 314 MiB/s | +| Collect into `Vec` | Settings + region + style, 354 B | ~973 ns | 364 MiB/s | + +### Cue Text + +| Benchmark | Input | Time | Throughput | +|-----------|-------|------|------------| +| Parse | Tags only, 166 B | ~316 ns | 552 MiB/s | +| Parse | 500 timestamps, ~11 KB | ~14.1 µs | 776 MiB/s | Run benchmarks yourself: diff --git a/src/error.rs b/src/error.rs index f0a5437..c4546fb 100644 --- a/src/error.rs +++ b/src/error.rs @@ -38,19 +38,27 @@ pub enum ParseSecondError { } /// The error type for parsing hour components of timestamps. +/// +/// This enum is shared by both SRT and WebVTT parsers: +/// - **SRT** hours are 2–3 digits (0–999): uses [`NotPadded`](Self::NotPadded) +/// and [`Overflow(u16)`](Self::Overflow). +/// - **WebVTT** hours are unbounded (`u64`): uses [`NotPadded`](Self::NotPadded) +/// for non-digit input and [`HourOverflow`](Self::HourOverflow) when the +/// value exceeds `u64::MAX`. #[derive(Debug, Clone, PartialEq, Eq, IsVariant, Unwrap, TryUnwrap, thiserror::Error)] #[unwrap(ref, ref_mut)] #[try_unwrap(ref, ref_mut)] pub enum ParseHourError { - /// The hour component is not zero-padded to 2 digits. + /// The hour component is not zero-padded to 2 digits (SRT), + /// or contains non-digit characters (VTT). #[error("hour component is not zero-padded to 2 digits")] #[unwrap(ignore)] #[try_unwrap(ignore)] NotPadded, - /// The hour component is out of range (not between 0-999). + /// The hour component is out of the SRT range (0–999). #[error("hour component must be between 0-999, but was {0}")] Overflow(u16), - /// The hour component overflowed u64 (VTT unbounded hours). + /// The hour component overflowed `u64` (VTT unbounded hours). #[error("hour component overflowed")] #[unwrap(ignore)] #[try_unwrap(ignore)] diff --git a/src/vtt.rs b/src/vtt.rs index b499ce6..0939698 100644 --- a/src/vtt.rs +++ b/src/vtt.rs @@ -202,14 +202,22 @@ fn parse_vtt_hour_bytes(b: &[u8]) -> Result { } /// Unchecked 2-digit extraction. Caller must guarantee ASCII digits. +/// +/// Only called from [`parse_timestamp`], which requires logos-regex-validated input. +/// Debug assertions catch misuse during development. #[cfg_attr(not(tarpaulin), inline(always))] -const fn vtt_digit2(b: &[u8]) -> u8 { +fn vtt_digit2(b: &[u8]) -> u8 { + debug_assert!(b[0].is_ascii_digit() && b[1].is_ascii_digit()); (b[0] - b'0') * 10 + (b[1] - b'0') } /// Unchecked 3-digit extraction. Caller must guarantee ASCII digits. +/// +/// Only called from [`parse_timestamp`], which requires logos-regex-validated input. +/// Debug assertions catch misuse during development. #[cfg_attr(not(tarpaulin), inline(always))] -const fn vtt_digit3(b: &[u8]) -> u16 { +fn vtt_digit3(b: &[u8]) -> u16 { + debug_assert!(b[0].is_ascii_digit() && b[1].is_ascii_digit() && b[2].is_ascii_digit()); (b[0] - b'0') as u16 * 100 + (b[1] - b'0') as u16 * 10 + (b[2] - b'0') as u16 } diff --git a/tests/vtt_cue_text.rs b/tests/vtt_cue_text.rs index 446d53e..5a61641 100644 --- a/tests/vtt_cue_text.rs +++ b/tests/vtt_cue_text.rs @@ -374,3 +374,93 @@ impl CueTokenExt for CueToken<'_> { } } } + +// ── Malformed timestamp rejection tests ────────────────────────────────────── +// +// These verify that malformed cue-text timestamp tags are safely rejected +// (treated as unknown tags or skipped) without panicking, even in debug builds. + +/// Colons where digits are expected: `<:::.000>` matches the old loose regex +/// but is rejected by the tightened DFA. +#[test] +fn cue_text_rejects_colons_as_digits() { + let tokens: Vec<_> = CueParser::new("<:::.000>").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "colons-only should not parse as timestamp" + ); +} + +/// Minutes out of range: `<99:99.000>` — rejected by the DFA (`[0-5][0-9]`). +#[test] +fn cue_text_rejects_out_of_range_minutes() { + let tokens: Vec<_> = CueParser::new("<99:99.000>").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "99:99 should not parse as timestamp" + ); +} + +/// Seconds out of range: `<00:60.000>` — rejected by the DFA. +#[test] +fn cue_text_rejects_out_of_range_seconds() { + let tokens: Vec<_> = CueParser::new("<00:60.000>").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "60 seconds should not parse as timestamp" + ); +} + +/// Non-digit bytes in hour position: ``. +#[test] +fn cue_text_rejects_non_digit_hours() { + let tokens: Vec<_> = CueParser::new("").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "non-digit hours should not parse as timestamp" + ); +} + +/// Empty hour prefix: `<:00:00.000>` — colon where a digit is expected. +#[test] +fn cue_text_rejects_empty_hour_prefix() { + let tokens: Vec<_> = CueParser::new("<:00:00.000>").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "empty hour prefix should not parse as timestamp" + ); +} + +/// Very large hours that would overflow u64: 30-digit hour value. +#[test] +fn cue_text_rejects_overflowing_hours() { + // 30 digits exceeds u64::MAX (20 digits) + let tag = format!("<{}:00:00.000>", "9".repeat(30)); + let tokens: Vec<_> = CueParser::new(&tag).collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "overflowing hours should not parse as timestamp" + ); +} + +/// Unterminated timestamp tag with valid format goes through `parse_timestamp_cue`. +#[test] +fn cue_text_unterminated_valid_timestamp() { + // `<00:05.000` without closing `>` — handled by try_parse_unterminated + let tokens: Vec<_> = CueParser::new("<00:05.000").collect(); + assert!( + tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "unterminated but valid timestamp should parse" + ); +} + +/// Unterminated timestamp with invalid digits goes through `parse_timestamp_cue` +/// and is safely rejected. +#[test] +fn cue_text_unterminated_invalid_timestamp() { + let tokens: Vec<_> = CueParser::new("<99:99.000").collect(); + assert!( + !tokens.iter().any(|t| matches!(t, CueToken::Timestamp(_))), + "unterminated invalid timestamp should be rejected" + ); +} From ca93584b936a5cec3941e8482f6ad00d0ff2eb58 Mon Sep 17 00:00:00 2001 From: Al Liu Date: Thu, 12 Mar 2026 10:52:16 +0800 Subject: [PATCH 10/20] Update benches/vtt.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- benches/vtt.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/benches/vtt.rs b/benches/vtt.rs index b3e6a61..5414b43 100644 --- a/benches/vtt.rs +++ b/benches/vtt.rs @@ -44,9 +44,12 @@ fn load_all_vtt_fixtures() -> String { "fixtures/webvtt/wpt-file-parsing", "fixtures/webvtt/wpt-cue-parsing", ] { - let entries = std::fs::read_dir(dir).expect("failed to read VTT fixture directory"); + let mut entries: Vec<_> = std::fs::read_dir(dir) + .expect("failed to read VTT fixture directory") + .map(|e| e.unwrap()) + .collect(); + entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); for entry in entries { - let entry = entry.unwrap(); if entry.path().extension().is_some_and(|e| e == "vtt") { buf.push_str(&std::fs::read_to_string(entry.path()).unwrap()); buf.push_str("\n\n"); From 71494a9de745346999bdbb6638e434ceb3ca2291 Mon Sep 17 00:00:00 2001 From: Al Liu Date: Thu, 12 Mar 2026 10:52:38 +0800 Subject: [PATCH 11/20] Update benches/srt.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- benches/srt.rs | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/benches/srt.rs b/benches/srt.rs index 19ef303..fdf541e 100644 --- a/benches/srt.rs +++ b/benches/srt.rs @@ -16,16 +16,28 @@ Goodbye world! const MEDIUM_SRT: &str = include_str!("../fixtures/srt/DeathNote_01.eng.srt"); fn load_all_fixtures() -> String { + let mut paths: Vec<_> = std::fs::read_dir("fixtures/srt") + .unwrap() + .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.extension().is_some_and(|e| e == "srt") { + Some(path) + } else { + None + } + }) + .collect(); + + paths.sort(); + let mut buf = String::new(); - for entry in std::fs::read_dir("fixtures/srt").unwrap() { - let entry = entry.unwrap(); - if entry.path().extension().is_some_and(|e| e == "srt") { - if let Ok(content) = std::fs::read_to_string(entry.path()) { - if !buf.is_empty() { - buf.push_str("\n\n"); - } - buf.push_str(&content); + for path in paths { + if let Ok(content) = std::fs::read_to_string(path) { + if !buf.is_empty() { + buf.push_str("\n\n"); } + buf.push_str(&content); } } buf From 30ac3eefb6d9ca6a2bfefd0b518cb3a067882bc1 Mon Sep 17 00:00:00 2001 From: Al Liu Date: Thu, 12 Mar 2026 10:53:28 +0800 Subject: [PATCH 12/20] Update src/vtt.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/vtt.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/vtt.rs b/src/vtt.rs index 0939698..8820de3 100644 --- a/src/vtt.rs +++ b/src/vtt.rs @@ -174,6 +174,9 @@ pub(crate) fn parse_timestamp_cue(s: &str) -> Result { return Err(ParseVttError::InvalidTimestamp("invalid format")); } let hour_str = &s[..len - 10]; + if hour_str.is_empty() { + return Err(ParseVttError::InvalidTimestamp("invalid hours")); + } parse_vtt_hour_bytes(hour_str.as_bytes()) .map_err(|_| ParseVttError::InvalidTimestamp("invalid hours"))? } else { From d633aeb98932a8a628b14a18507456f0d62409a0 Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 10:56:06 +0800 Subject: [PATCH 13/20] update stale docs --- src/srt.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/srt.rs b/src/srt.rs index 1f279eb..7f01ddf 100644 --- a/src/srt.rs +++ b/src/srt.rs @@ -324,7 +324,7 @@ fn parse_number(s: &mut Lexer<'_, Token>) -> Result { #[inline] fn parse_header(s: &mut Lexer<'_, Token>) -> Result { let slice = s.slice(); - // The regex guarantees `H+:MM:SS,mmm --> H+:MM:SS,mmm`. + // The regex guarantees `HH+:MM:SS,mmm --> HH+:MM:SS,mmm`. // Find the arrow separator and extract both timestamps via byte arithmetic. let arrow = slice.find("-->").unwrap(); let start_str = slice[..arrow].trim(); From 6bb45953d83670ef0ea99e747af50bd1d84ed517 Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 11:04:06 +0800 Subject: [PATCH 14/20] update ci sh --- ci/miri_sb.sh | 2 +- ci/miri_tb.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/miri_sb.sh b/ci/miri_sb.sh index cc3c6e0..568741b 100755 --- a/ci/miri_sb.sh +++ b/ci/miri_sb.sh @@ -35,4 +35,4 @@ cargo miri setup export MIRIFLAGS="-Zmiri-strict-provenance -Zmiri-disable-isolation -Zmiri-symbolic-alignment-check" -cargo miri test --all-targets --target "$TARGET" +cargo miri test --lib --tests --doc --target "$TARGET" diff --git a/ci/miri_tb.sh b/ci/miri_tb.sh index 5d374c7..a05e475 100755 --- a/ci/miri_tb.sh +++ b/ci/miri_tb.sh @@ -35,4 +35,4 @@ cargo miri setup export MIRIFLAGS="-Zmiri-strict-provenance -Zmiri-disable-isolation -Zmiri-symbolic-alignment-check -Zmiri-tree-borrows" -cargo miri test --all-targets --target "$TARGET" +cargo miri test --lib --tests --doc --target "$TARGET" From 29fa662d698ce3fbc598bf9083d176e6dbb03569 Mon Sep 17 00:00:00 2001 From: Al Liu Date: Thu, 12 Mar 2026 11:06:34 +0800 Subject: [PATCH 15/20] Update src/error.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/error.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/error.rs b/src/error.rs index c4546fb..d1e5d8f 100644 --- a/src/error.rs +++ b/src/error.rs @@ -51,7 +51,7 @@ pub enum ParseSecondError { pub enum ParseHourError { /// The hour component is not zero-padded to 2 digits (SRT), /// or contains non-digit characters (VTT). - #[error("hour component is not zero-padded to 2 digits")] + #[error("hour component is not zero-padded to 2 digits or contains invalid characters")] #[unwrap(ignore)] #[try_unwrap(ignore)] NotPadded, From 87a967e0ea37dce8b20c3fa926fa4f22acad2d32 Mon Sep 17 00:00:00 2001 From: Al Liu Date: Thu, 12 Mar 2026 11:07:14 +0800 Subject: [PATCH 16/20] Update benches/vtt.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- benches/vtt.rs | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/benches/vtt.rs b/benches/vtt.rs index 5414b43..77bdb05 100644 --- a/benches/vtt.rs +++ b/benches/vtt.rs @@ -44,22 +44,27 @@ fn load_all_vtt_fixtures() -> String { "fixtures/webvtt/wpt-file-parsing", "fixtures/webvtt/wpt-cue-parsing", ] { - let mut entries: Vec<_> = std::fs::read_dir(dir) - .expect("failed to read VTT fixture directory") - .map(|e| e.unwrap()) - .collect(); - entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); - for entry in entries { - if entry.path().extension().is_some_and(|e| e == "vtt") { - buf.push_str(&std::fs::read_to_string(entry.path()).unwrap()); - buf.push_str("\n\n"); + if let Ok(read_dir) = std::fs::read_dir(dir) { + let mut entries: Vec<_> = read_dir + .filter_map(Result::ok) + .collect(); + entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); + for entry in entries { + if entry.path().extension().is_some_and(|e| e == "vtt") { + if let Ok(contents) = std::fs::read_to_string(entry.path()) { + buf.push_str(&contents); + buf.push_str("\n\n"); + } + } } } } - assert!( - !buf.is_empty(), - "No VTT fixtures were loaded from fixtures/webvtt" - ); + if buf.is_empty() { + // Fallback to embedded samples so benches still run without fixtures. + buf.push_str(SMALL_VTT); + buf.push_str("\n\n"); + buf.push_str(SETTINGS_VTT); + } buf } From a8ed2dc10b252b716a8c6e4b2ac1fe9bf04592b2 Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 11:37:11 +0800 Subject: [PATCH 17/20] cleanup --- ci/miri_sb.sh | 4 +++- ci/miri_tb.sh | 4 +++- src/error.rs | 20 ++++++++++++++++ src/vtt.rs | 66 ++++++++++++++++++++++++++++++--------------------- 4 files changed, 65 insertions(+), 29 deletions(-) diff --git a/ci/miri_sb.sh b/ci/miri_sb.sh index 568741b..de9aa95 100755 --- a/ci/miri_sb.sh +++ b/ci/miri_sb.sh @@ -35,4 +35,6 @@ cargo miri setup export MIRIFLAGS="-Zmiri-strict-provenance -Zmiri-disable-isolation -Zmiri-symbolic-alignment-check" -cargo miri test --lib --tests --doc --target "$TARGET" +cargo miri test --lib --tests --target "$TARGET" + +cargo miri test --doc --target "$TARGET" diff --git a/ci/miri_tb.sh b/ci/miri_tb.sh index a05e475..a9c8beb 100755 --- a/ci/miri_tb.sh +++ b/ci/miri_tb.sh @@ -35,4 +35,6 @@ cargo miri setup export MIRIFLAGS="-Zmiri-strict-provenance -Zmiri-disable-isolation -Zmiri-symbolic-alignment-check -Zmiri-tree-borrows" -cargo miri test --lib --tests --doc --target "$TARGET" +cargo miri test --lib --tests --target "$TARGET" + +cargo miri test --doc --target "$TARGET" diff --git a/src/error.rs b/src/error.rs index d1e5d8f..ad6775a 100644 --- a/src/error.rs +++ b/src/error.rs @@ -86,6 +86,26 @@ pub enum ParseMillisecondError { ParseInt(#[from] ParseIntError), } +/// Specific reason why a VTT timestamp has invalid structure. +/// +/// This covers structural validation errors only (length, separators, digits). +/// Component range errors (hours, minutes, seconds, milliseconds) are +/// represented by their dedicated error types ([`ParseHourError`], +/// [`ParseMinuteError`], [`ParseSecondError`], [`ParseMillisecondError`]) +/// which are separate variants of [`crate::vtt::ParseVttError`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, thiserror::Error)] +pub enum TimestampError { + /// The input is too short or has an invalid length for a VTT timestamp. + #[error("invalid length")] + InvalidLength, + /// A separator (`.` or `:`) is not in the expected position. + #[error("invalid format")] + InvalidFormat, + /// One or more digit positions contain non-digit bytes. + #[error("invalid digits")] + InvalidDigits, +} + /// The error type for parsing index numbers of subtitles. #[derive(Debug, Clone, PartialEq, Eq, IsVariant, Unwrap, TryUnwrap, thiserror::Error)] #[unwrap(ref, ref_mut)] diff --git a/src/vtt.rs b/src/vtt.rs index 8820de3..8bc9aa3 100644 --- a/src/vtt.rs +++ b/src/vtt.rs @@ -45,7 +45,7 @@ pub enum ParseVttError { /// A timestamp could not be parsed. #[error("invalid timestamp: {0}")] - InvalidTimestamp(&'static str), + InvalidTimestamp(TimestampError), /// The timing line is malformed (missing `-->`). #[error("invalid timing line: missing '-->' separator")] @@ -125,15 +125,16 @@ pub(crate) fn parse_timestamp(s: &str) -> Result { let b = s.as_bytes(); let len = b.len(); if len < 9 { - return Err(ParseVttError::InvalidTimestamp("too short")); + return Err(ParseVttError::InvalidTimestamp( + TimestampError::InvalidLength, + )); } let millis = Millisecond(vtt_digit3(&b[len - 3..])); let seconds = Second(vtt_digit2(&b[len - 6..len - 4])); let minutes = Minute(vtt_digit2(&b[len - 9..len - 7])); let hours = if len > 9 { let hour_str = &s[..len - 10]; - parse_vtt_hour_bytes(hour_str.as_bytes()) - .map_err(|_| ParseVttError::InvalidTimestamp("invalid hours"))? + parse_vtt_hour_bytes(hour_str.as_bytes())? } else { Hour::new() }; @@ -149,36 +150,47 @@ pub(crate) fn parse_timestamp(s: &str) -> Result { pub(crate) fn parse_timestamp_cue(s: &str) -> Result { let b = s.as_bytes(); let len = b.len(); - if len < 9 { - return Err(ParseVttError::InvalidTimestamp("too short")); + // Short form `MM:SS.mmm` = 9 bytes (0 hour digits). + // Long form `H+:MM:SS.mmm` = 12..=29 bytes (1..=20 hour digits). + // Reject lengths outside [9, 29] and the gap [10, 11]. + if len < 9 || len > 29 || (len > 9 && len < 12) { + return Err(ParseVttError::InvalidTimestamp( + TimestampError::InvalidLength, + )); } // Validate separators. if b[len - 4] != b'.' || b[len - 7] != b':' { - return Err(ParseVttError::InvalidTimestamp("invalid format")); + return Err(ParseVttError::InvalidTimestamp( + TimestampError::InvalidFormat, + )); } // Validate digit bytes before arithmetic. - let millis_val = - vtt_digit3_checked(&b[len - 3..]).ok_or(ParseVttError::InvalidTimestamp("invalid digits"))?; - let seconds_val = vtt_digit2_checked(&b[len - 6..len - 4]) - .ok_or(ParseVttError::InvalidTimestamp("invalid digits"))?; - let minutes_val = vtt_digit2_checked(&b[len - 9..len - 7]) - .ok_or(ParseVttError::InvalidTimestamp("invalid digits"))?; - let millis = Millisecond::try_with(millis_val) - .ok_or(ParseVttError::InvalidTimestamp("milliseconds out of range"))?; - let seconds = - Second::try_with(seconds_val).ok_or(ParseVttError::InvalidTimestamp("seconds out of range"))?; - let minutes = - Minute::try_with(minutes_val).ok_or(ParseVttError::InvalidTimestamp("minutes out of range"))?; - let hours = if len > 9 { + let millis_val = vtt_digit3_checked(&b[len - 3..]).ok_or(ParseVttError::InvalidTimestamp( + TimestampError::InvalidDigits, + ))?; + let seconds_val = vtt_digit2_checked(&b[len - 6..len - 4]).ok_or( + ParseVttError::InvalidTimestamp(TimestampError::InvalidDigits), + )?; + let minutes_val = vtt_digit2_checked(&b[len - 9..len - 7]).ok_or( + ParseVttError::InvalidTimestamp(TimestampError::InvalidDigits), + )?; + let millis = Millisecond::try_with(millis_val).ok_or(ParseVttError::ParseMillisecond( + ParseMillisecondError::Overflow(millis_val), + ))?; + let seconds = Second::try_with(seconds_val).ok_or(ParseVttError::ParseSecond( + ParseSecondError::Overflow(seconds_val), + ))?; + let minutes = Minute::try_with(minutes_val).ok_or(ParseVttError::ParseMinute( + ParseMinuteError::Overflow(minutes_val), + ))?; + let hours = if len >= 12 { if b[len - 10] != b':' { - return Err(ParseVttError::InvalidTimestamp("invalid format")); - } - let hour_str = &s[..len - 10]; - if hour_str.is_empty() { - return Err(ParseVttError::InvalidTimestamp("invalid hours")); + return Err(ParseVttError::InvalidTimestamp( + TimestampError::InvalidFormat, + )); } - parse_vtt_hour_bytes(hour_str.as_bytes()) - .map_err(|_| ParseVttError::InvalidTimestamp("invalid hours"))? + let hour_bytes = &b[..len - 10]; + parse_vtt_hour_bytes(hour_bytes)? } else { Hour::new() }; From c36a4cb6278be38646fb0cff16d920ca55cb42f3 Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 11:44:51 +0800 Subject: [PATCH 18/20] cleanup --- benches/vtt.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/benches/vtt.rs b/benches/vtt.rs index 77bdb05..6b6d2d8 100644 --- a/benches/vtt.rs +++ b/benches/vtt.rs @@ -45,9 +45,7 @@ fn load_all_vtt_fixtures() -> String { "fixtures/webvtt/wpt-cue-parsing", ] { if let Ok(read_dir) = std::fs::read_dir(dir) { - let mut entries: Vec<_> = read_dir - .filter_map(Result::ok) - .collect(); + let mut entries: Vec<_> = read_dir.filter_map(Result::ok).collect(); entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); for entry in entries { if entry.path().extension().is_some_and(|e| e == "vtt") { From 59ee779e540d797f25972ed91b02d7a695d6f23b Mon Sep 17 00:00:00 2001 From: Al Liu Date: Thu, 12 Mar 2026 11:54:53 +0800 Subject: [PATCH 19/20] Update benches/srt.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- benches/srt.rs | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/benches/srt.rs b/benches/srt.rs index fdf541e..55bb31e 100644 --- a/benches/srt.rs +++ b/benches/srt.rs @@ -16,31 +16,35 @@ Goodbye world! const MEDIUM_SRT: &str = include_str!("../fixtures/srt/DeathNote_01.eng.srt"); fn load_all_fixtures() -> String { - let mut paths: Vec<_> = std::fs::read_dir("fixtures/srt") - .unwrap() - .filter_map(|entry| { - let entry = entry.ok()?; - let path = entry.path(); - if path.extension().is_some_and(|e| e == "srt") { - Some(path) - } else { - None + if let Ok(read_dir) = std::fs::read_dir("fixtures/srt") { + let mut paths: Vec<_> = read_dir + .filter_map(|entry| { + let entry = entry.ok()?; + let path = entry.path(); + if path.extension().is_some_and(|e| e == "srt") { + Some(path) + } else { + None + } + }) + .collect(); + + paths.sort(); + + let mut buf = String::new(); + for path in paths { + if let Ok(content) = std::fs::read_to_string(path) { + if !buf.is_empty() { + buf.push_str("\n\n"); + } + buf.push_str(&content); } - }) - .collect(); - - paths.sort(); - - let mut buf = String::new(); - for path in paths { - if let Ok(content) = std::fs::read_to_string(path) { - if !buf.is_empty() { - buf.push_str("\n\n"); - } - buf.push_str(&content); } + buf + } else { + // Fallback: use a small embedded sample when fixtures are unavailable. + SMALL_SRT.to_string() } - buf } fn bench_srt_parse(c: &mut Criterion) { From cef59220d79e3251228377ae609283de2ea8fe18 Mon Sep 17 00:00:00 2001 From: al8n Date: Thu, 12 Mar 2026 11:56:19 +0800 Subject: [PATCH 20/20] fix clippy --- src/vtt.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vtt.rs b/src/vtt.rs index 8bc9aa3..52e3cae 100644 --- a/src/vtt.rs +++ b/src/vtt.rs @@ -153,7 +153,7 @@ pub(crate) fn parse_timestamp_cue(s: &str) -> Result { // Short form `MM:SS.mmm` = 9 bytes (0 hour digits). // Long form `H+:MM:SS.mmm` = 12..=29 bytes (1..=20 hour digits). // Reject lengths outside [9, 29] and the gap [10, 11]. - if len < 9 || len > 29 || (len > 9 && len < 12) { + if !(9..=29).contains(&len) || (len > 9 && len < 12) { return Err(ParseVttError::InvalidTimestamp( TimestampError::InvalidLength, ));