diff --git a/CHANGELOG.md b/CHANGELOG.md index 96dd7bd..ef6381f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120)) - SIMD prefilter for CompositeSequenceDFA (#83) +## [0.12.7] - 2026-03-10 + +### Performance +- **PikeVM sparse-dispatch for `.` patterns** (Issue [#132](https://github.com/coregx/coregex/issues/132)) — + The NFA compiler generated ~9 split states chaining UTF-8 byte-range alternation + branches for each `.` (AnyCharNotNL). PikeVM had to DFS-traverse the entire split + chain at every byte position, resulting in O(branches) work per byte. For `.*?` + patterns on large inputs (e.g., `\{\{(.*?)\}\}` on 10MB template), this caused + ~5 billion branch evaluations. + Fix: new `compileUTF8AnySparse()` compiles `.` as a single sparse state that maps + each leading byte range directly to its continuation chain — O(1) dispatch instead + of O(branches) split-chain traversal. Same approach as Rust regex's `State::Sparse`. + PikeVM speedup: **2.8-4.8x** on dot-heavy patterns. DFA unaffected (uses byte-level NFA). + Reported by [@kostya](https://github.com/kostya) via LangArena benchmarks. + ## [0.12.6] - 2026-03-08 ### Fixed diff --git a/ROADMAP.md b/ROADMAP.md index 74fc4cf..81249e7 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -2,7 +2,7 @@ > **Strategic Focus**: Production-grade regex engine with RE2/rust-regex level optimizations -**Last Updated**: 2026-03-08 | **Current Version**: v0.12.6 | **Target**: v1.0.0 stable +**Last Updated**: 2026-03-10 | **Current Version**: v0.12.7 | **Target**: v1.0.0 stable --- @@ -12,7 +12,7 @@ Build a **production-ready, high-performance regex engine** for Go that matches ### Current State vs Target -| Metric | Current (v0.12.6) | Target (v1.0.0) | +| Metric | Current (v0.12.7) | Target (v1.0.0) | |--------|-------------------|-----------------| | Inner literal speedup | **280-3154x** | ✅ Achieved | | Case-insensitive speedup | **263x** | ✅ Achieved | @@ -68,7 +68,9 @@ v0.12.4 ✅ → Test coverage 80%+, CI improvements, awesome-go readiness ↓ v0.12.5 ✅ → Non-greedy quantifier fix, ReverseSuffix correctness (#124) ↓ -v0.12.6 (Current) ✅ → BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127) +v0.12.6 ✅ → BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127) + ↓ +v0.12.7 (Current) ✅ → PikeVM sparse-dispatch for dot patterns, 2.8-4.8x speedup (#132) ↓ v1.0.0-rc → Feature freeze, API locked ↓ @@ -103,6 +105,7 @@ v1.0.0 STABLE → Production release with API stability guarantee - ✅ **v0.12.4**: Test coverage 80%+, CI improvements, awesome-go readiness (#123) - ✅ **v0.12.5**: Non-greedy quantifier fix, ReverseSuffix forward verification (#124) - ✅ **v0.12.6**: BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127) +- ✅ **v0.12.7**: PikeVM sparse-dispatch for `.` patterns, 2.8-4.8x speedup (#132) --- @@ -194,7 +197,7 @@ v1.0.0 STABLE → Production release with API stability guarantee ## Feature Comparison Matrix -| Feature | RE2 | rust-regex | coregex v0.12.6 | coregex v1.0 | +| Feature | RE2 | rust-regex | coregex v0.12.7 | coregex v1.0 | |---------|-----|------------|-----------------|--------------| | Lazy DFA | ✅ | ✅ | ✅ | ✅ | | Thompson NFA | ✅ | ✅ | ✅ | ✅ | @@ -352,7 +355,8 @@ Reference implementations available locally: | Version | Date | Type | Key Changes | |---------|------|------|-------------| -| **v0.12.6** | 2026-03-08 | Fix | **BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127)** | +| **v0.12.7** | 2026-03-10 | Performance | **PikeVM sparse-dispatch for `.` patterns, 2.8-4.8x speedup (#132)** | +| v0.12.6 | 2026-03-08 | Fix | BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127) | | v0.12.5 | 2026-03-08 | Fix | Non-greedy quantifier fix, ReverseSuffix correctness (#124) | | v0.12.4 | 2026-03-01 | Test | Test coverage 80%+, CI improvements, awesome-go readiness | | **v0.12.3** | 2026-02-16 | Performance | **Cross-product literal expansion, 110x regexdna speedup (#119)** | @@ -392,4 +396,4 @@ Reference implementations available locally: --- -*Current: v0.12.6 | Next: v0.13.0 | Target: v1.0.0* +*Current: v0.12.7 | Next: v0.13.0 | Target: v1.0.0* diff --git a/meta/compile.go b/meta/compile.go index 359648c..29e9bff 100644 --- a/meta/compile.go +++ b/meta/compile.go @@ -279,11 +279,12 @@ func buildCharClassSearchers( strategy Strategy, re *syntax.Regexp, nfaEngine *nfa.NFA, + btNFA *nfa.NFA, // NFA for BoundedBacktracker (runeNFA when available, else nfaEngine) ) charClassSearcherResult { result := charClassSearcherResult{finalStrategy: strategy} if strategy == UseBoundedBacktracker { - result.boundedBT = nfa.NewBoundedBacktracker(nfaEngine) + result.boundedBT = nfa.NewBoundedBacktracker(btNFA) } if strategy == UseCharClassSearcher { @@ -298,7 +299,7 @@ func buildCharClassSearchers( } else { // Fallback to BoundedBacktracker if extraction fails result.finalStrategy = UseBoundedBacktracker - result.boundedBT = nfa.NewBoundedBacktracker(nfaEngine) + result.boundedBT = nfa.NewBoundedBacktracker(btNFA) } } @@ -309,7 +310,7 @@ func buildCharClassSearchers( if result.compositeSrch == nil { // Fallback to BoundedBacktracker if extraction fails result.finalStrategy = UseBoundedBacktracker - result.boundedBT = nfa.NewBoundedBacktracker(nfaEngine) + result.boundedBT = nfa.NewBoundedBacktracker(btNFA) } else { // Try to build faster DFA (uses subset construction for overlapping patterns) result.compositeSeqDFA = nfa.NewCompositeSequenceDFA(re) @@ -334,7 +335,7 @@ func buildCharClassSearchers( if result.branchDispatcher == nil { // Fallback to BoundedBacktracker if dispatch not possible result.finalStrategy = UseBoundedBacktracker - result.boundedBT = nfa.NewBoundedBacktracker(nfaEngine) + result.boundedBT = nfa.NewBoundedBacktracker(btNFA) } } @@ -343,12 +344,63 @@ func buildCharClassSearchers( // generation-based visited tracking (O(1) reset) vs PikeVM's thread queues. // This is similar to how stdlib uses backtracking for simple patterns. if result.finalStrategy == UseNFA && result.boundedBT == nil && nfaEngine.States() < 50 { - result.boundedBT = nfa.NewBoundedBacktracker(nfaEngine) + result.boundedBT = nfa.NewBoundedBacktracker(btNFA) } return result } +// buildDotOptimizedNFAs compiles optimized NFA variants for patterns with '.'. +// Returns: +// - asciiNFA: NFA with '.' compiled as single ASCII byte range (for ASCII-only input) +// - asciiBT: BoundedBacktracker for asciiNFA +// - runeNFA: NFA with '.' compiled as sparse dispatch (fewer split states for PikeVM) +func buildDotOptimizedNFAs( + re *syntax.Regexp, config Config, +) (*nfa.NFA, *nfa.BoundedBacktracker, *nfa.NFA) { + if !nfa.ContainsDot(re) { + return nil, nil, nil + } + + // ASCII-only NFA (V11-002 optimization): + // compile '.' as single byte range [0x00-0x7F] for ASCII-only inputs. + var asciiNFAEngine *nfa.NFA + var asciiBT *nfa.BoundedBacktracker + if config.EnableASCIIOptimization { + asciiCompiler := nfa.NewCompiler(nfa.CompilerConfig{ + UTF8: true, + Anchored: false, + DotNewline: false, + ASCIIOnly: true, + MaxRecursionDepth: config.MaxRecursionDepth, + }) + var err error + asciiNFAEngine, err = asciiCompiler.CompileRegexp(re) + if err == nil { + asciiBT = nfa.NewBoundedBacktracker(asciiNFAEngine) + } + } + + // Sparse-dispatch NFA: compile '.' as a single sparse state mapping each + // leading byte range to the correct continuation chain. This eliminates + // ~9 split states per dot, giving PikeVM O(1) dispatch instead of + // O(branches) split-chain DFS. Measured 2.8-4.8x PikeVM speedup. + var runeNFAEngine *nfa.NFA + runeCompiler := nfa.NewCompiler(nfa.CompilerConfig{ + UTF8: true, + Anchored: false, + DotNewline: false, + UseRuneStates: true, + MaxRecursionDepth: config.MaxRecursionDepth, + }) + runeNFAEngine, err := runeCompiler.CompileRegexp(re) + if err != nil { + runeNFAEngine = nil + } + + return asciiNFAEngine, asciiBT, runeNFAEngine +} + // CompileRegexp compiles a parsed syntax.Regexp with default configuration. // // This is useful when you already have a parsed regexp from another source. @@ -373,25 +425,8 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { } } - // Compile ASCII-only NFA for patterns with '.' (V11-002 optimization). - // This enables runtime ASCII detection: if input is all ASCII, use the faster - // ASCII NFA which has ~2.8x fewer states for '.'-heavy patterns. - var asciiNFAEngine *nfa.NFA - var asciiBT *nfa.BoundedBacktracker - if nfa.ContainsDot(re) && config.EnableASCIIOptimization { - asciiCompiler := nfa.NewCompiler(nfa.CompilerConfig{ - UTF8: true, - Anchored: false, - DotNewline: false, - ASCIIOnly: true, // Key: compile '.' as single byte range - MaxRecursionDepth: config.MaxRecursionDepth, - }) - asciiNFAEngine, err = asciiCompiler.CompileRegexp(re) - if err == nil { - asciiBT = nfa.NewBoundedBacktracker(asciiNFAEngine) - } - // If ASCII NFA compilation fails, we fall back to UTF-8 NFA (asciiNFAEngine stays nil) - } + // Compile optimized NFA variants for patterns with '.' + asciiNFAEngine, asciiBT, runeNFAEngine := buildDotOptimizedNFAs(re, config) // Extract literals for prefiltering // NOTE: Don't build prefilter for start-anchored patterns (^...). @@ -418,8 +453,14 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { // Select strategy (pass re for anchor detection) strategy := SelectStrategy(nfaEngine, re, literals, config) - // Build PikeVM (always needed for fallback) - pikevm := nfa.NewPikeVM(nfaEngine) + // Build PikeVM (always needed for fallback). + // Use runeNFA when available — sparse dispatch replaces ~9 split states + // with a single sparse state, giving PikeVM O(1) byte dispatch per '.'. + pikevmNFA := nfaEngine + if runeNFAEngine != nil { + pikevmNFA = runeNFAEngine + } + pikevm := nfa.NewPikeVM(pikevmNFA) // Build OnePass DFA for anchored patterns with captures (optional optimization) onePassRes := buildOnePassDFA(re, nfaEngine, config) @@ -428,8 +469,9 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { engines := buildStrategyEngines(strategy, re, nfaEngine, literals, pf, config) strategy = engines.finalStrategy - // Build specialized searchers for character class patterns - charClassResult := buildCharClassSearchers(strategy, re, nfaEngine) + // Build specialized searchers for character class patterns. + // Pass pikevmNFA so BoundedBacktrackers benefit from rune states. + charClassResult := buildCharClassSearchers(strategy, re, nfaEngine, pikevmNFA) strategy = charClassResult.finalStrategy // Check if pattern can match empty string. @@ -497,7 +539,7 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { // Fallback if detection fails (shouldn't happen since SelectStrategy checked) if anchoredLiteralInfo == nil { strategy = UseBoundedBacktracker - charClassResult.boundedBT = nfa.NewBoundedBacktracker(nfaEngine) + charClassResult.boundedBT = nfa.NewBoundedBacktracker(pikevmNFA) } } @@ -506,6 +548,7 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { return &Engine{ nfa: nfaEngine, + runeNFA: runeNFAEngine, asciiNFA: asciiNFAEngine, asciiBoundedBacktracker: asciiBT, dfa: engines.dfa, @@ -534,7 +577,7 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) { canMatchEmpty: canMatchEmpty, isStartAnchored: isStartAnchored, fatTeddyFallback: fatTeddyFallback, - statePool: newSearchStatePool(nfaEngine, numCaptures), + statePool: newSearchStatePool(pikevmNFA, numCaptures), stats: Stats{}, }, nil } diff --git a/meta/engine.go b/meta/engine.go index d1eeb59..df113bc 100644 --- a/meta/engine.go +++ b/meta/engine.go @@ -50,6 +50,17 @@ type Engine struct { nfa *nfa.NFA + // runeNFA is an NFA compiled with UseRuneStates=true (sparse dispatch). + // Instead of ~9 split states chaining UTF-8 byte-range alternation branches + // for each '.', it uses a single sparse state that maps each leading byte + // range directly to its continuation chain. This gives PikeVM O(1) dispatch + // instead of O(branches) split-chain DFS per byte position. + // Measured 2.8-4.8x PikeVM speedup on dot-heavy patterns. + // + // This field is nil if the pattern doesn't contain '.' (no benefit). + // The byte-level NFA (nfa field) remains unchanged for DFA/strategy use. + runeNFA *nfa.NFA + // asciiNFA is an NFA compiled in ASCII-only mode (V11-002 optimization). // When the pattern contains '.' and input is ASCII-only (all bytes < 0x80), // this NFA is used instead of the main NFA. ASCII mode compiles '.' to diff --git a/nfa/compile.go b/nfa/compile.go index f06d476..88c28c0 100644 --- a/nfa/compile.go +++ b/nfa/compile.go @@ -30,6 +30,16 @@ type CompilerConfig struct { // compile both ASCII and UTF-8 NFAs, select at runtime based on input. ASCIIOnly bool + // UseRuneStates when true, compiles '.' as a single sparse dispatch state + // instead of ~9 split states chaining UTF-8 byte-range alternation branches. + // The sparse state maps each leading byte range directly to its continuation + // chain, giving PikeVM O(1) dispatch instead of O(branches) DFS traversal. + // + // Measured 2.8-4.8x PikeVM speedup on dot-heavy patterns. + // Safe for all engines (DFA, PikeVM, BoundedBacktracker) since it uses + // standard byte-range states, just organized as sparse instead of splits. + UseRuneStates bool + // MaxRecursionDepth limits recursion during compilation to prevent stack overflow // Default: 100 MaxRecursionDepth int @@ -966,6 +976,13 @@ func (c *Compiler) utf8Cont2HiFull(leadVal, cont1Val, hiLead, hiCont1, hiCont2 b // This is used for OpAnyChar which the parser generates when DotNL flag is set // (either globally via syntax.DotNL or locally via inline flag (?s:...)). func (c *Compiler) compileAnyChar() (start, end StateID, err error) { + // UseRuneStates mode: compile '.' as a single sparse state mapping + // each leading byte range to the correct continuation chain. + // This eliminates ~9 split states, giving PikeVM O(1) dispatch + // instead of O(branches) split-chain traversal per byte position. + if c.config.UseRuneStates { + return c.compileUTF8AnySparse(true) + } // ASCII-only mode: match any single ASCII byte (0x00-0x7F) // This reduces ~28 UTF-8 states to just 1 state. if c.config.ASCIIOnly { @@ -977,6 +994,13 @@ func (c *Compiler) compileAnyChar() (start, end StateID, err error) { // compileAnyCharNotNL compiles '.' matching any character except \n func (c *Compiler) compileAnyCharNotNL() (start, end StateID, err error) { + // UseRuneStates mode: compile '.' as a single sparse state mapping + // each leading byte range to the correct continuation chain. + // This eliminates ~9 split states, giving PikeVM O(1) dispatch + // instead of O(branches) split-chain traversal per byte position. + if c.config.UseRuneStates { + return c.compileUTF8AnySparse(false) + } // ASCII-only mode: match any single ASCII byte except newline // This reduces ~28 UTF-8 states to just 1-2 states. if c.config.ASCIIOnly { @@ -1017,6 +1041,86 @@ func (c *Compiler) compileASCIIAny(includeNL bool) (start, end StateID, err erro return ascii, endState, nil } +// compileUTF8AnySparse compiles '.' as a single sparse state that maps each +// leading byte range directly to the correct continuation chain. +// +// This eliminates the ~9 split states used by compileUTF8Any, giving PikeVM +// O(1) sparse-transition dispatch instead of O(branches) split-chain DFS. +// For patterns like .*?, this reduces per-byte-position work from ~10 split +// evaluations + epsilon closures to a single sparse table lookup. +// +// State count comparison for '.': +// - compileUTF8Any: ~15 states (with suffix sharing) + ~9 split states = ~24 +// - compileUTF8AnySparse: ~15 states (with suffix sharing) + 1 sparse state = ~16 +// +// The performance gain comes not from fewer states but from eliminating split +// evaluations. PikeVM's split processing requires DFS epsilon closure with a +// stack, while sparse state processing is a simple linear scan of transitions. +func (c *Compiler) compileUTF8AnySparse(includeNL bool) (start, end StateID, err error) { + endState := c.builder.AddEpsilon(InvalidState) + + // Suffix cache for sharing common continuation byte states + cache := newUtf8SuffixCache() + + // Build continuation chains for each leading byte range. + // We process bytes in REVERSE order for suffix sharing (same as compileUTF8Any). + type byteRange struct{ lo, hi byte } + sequences := [][]byteRange{ + // 2-byte: 0xC2-0xDF, 0x80-0xBF + {{0xC2, 0xDF}, {0x80, 0xBF}}, + // 3-byte sequences + {{0xE0, 0xE0}, {0xA0, 0xBF}, {0x80, 0xBF}}, + {{0xE1, 0xEC}, {0x80, 0xBF}, {0x80, 0xBF}}, + {{0xED, 0xED}, {0x80, 0x9F}, {0x80, 0xBF}}, + {{0xEE, 0xEF}, {0x80, 0xBF}, {0x80, 0xBF}}, + // 4-byte sequences + {{0xF0, 0xF0}, {0x90, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF}}, + {{0xF1, 0xF3}, {0x80, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF}}, + {{0xF4, 0xF4}, {0x80, 0x8F}, {0x80, 0xBF}, {0x80, 0xBF}}, + } + + // Build all transitions for the single sparse dispatch state. + // Each transition maps a leading byte range to the first continuation state. + var transitions []Transition + + // 1-byte ASCII (0x00-0x7F), excluding newline if needed + if includeNL { + transitions = append(transitions, Transition{Lo: 0x00, Hi: 0x7F, Next: endState}) + } else { + transitions = append(transitions, + Transition{Lo: 0x00, Hi: 0x09, Next: endState}, + Transition{Lo: 0x0B, Hi: 0x7F, Next: endState}, + ) + } + + // Invalid standalone bytes → endState (match as single byte for stdlib compat) + transitions = append(transitions, + Transition{Lo: 0x80, Hi: 0xBF, Next: endState}, // standalone continuation + Transition{Lo: 0xC0, Hi: 0xC1, Next: endState}, // overlong 2-byte + ) + + // Multi-byte sequences: leading byte → first continuation state + for _, seq := range sequences { + // Build continuation chain in REVERSE for suffix sharing + target := endState + for i := len(seq) - 1; i >= 1; i-- { + br := seq[i] + target = cache.getOrCreate(c.builder, target, br.lo, br.hi) + } + // seq[0] is the leading byte range, target is the first continuation state + transitions = append(transitions, Transition{Lo: seq[0].lo, Hi: seq[0].hi, Next: target}) + } + + // Invalid high bytes → endState + transitions = append(transitions, + Transition{Lo: 0xF5, Hi: 0xFF, Next: endState}, + ) + + // Single sparse state replaces the entire split chain + startState := c.builder.AddSparse(transitions) + return startState, endState, nil +} + // compileUTF8Any compiles an NFA that matches any single UTF-8 codepoint. // If includeNL is false, newline (0x0A) is excluded. //