coregx · kolkov · Mar 10, 2026 · Mar 10, 2026
@@ -12,6 +12,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120))
 - SIMD prefilter for CompositeSequenceDFA (#83)
 
+## [0.12.7] - 2026-03-10
+
+### Performance
+- **PikeVM sparse-dispatch for `.` patterns** (Issue [#132](https://github.com/coregx/coregex/issues/132)) —
+  The NFA compiler generated ~9 split states chaining UTF-8 byte-range alternation
+  branches for each `.` (AnyCharNotNL). PikeVM had to DFS-traverse the entire split
+  chain at every byte position, resulting in O(branches) work per byte. For `.*?`
+  patterns on large inputs (e.g., `\{\{(.*?)\}\}` on a 10MB template), this caused
+  ~5 billion branch evaluations.
+  Fix: new `compileUTF8AnySparse()` compiles `.` as a single sparse state that maps
+  each leading byte range directly to its continuation chain — O(1) dispatch instead
+  of O(branches) split-chain traversal. Same approach as Rust regex's `State::Sparse`.
+  PikeVM speedup: **2.8-4.8x** on dot-heavy patterns. DFA unaffected (uses byte-level NFA).
+  Reported by [@kostya](https://github.com/kostya) via LangArena benchmarks.
+
 ## [0.12.6] - 2026-03-08
 
 ### Fixed

@@ -2,7 +2,7 @@
 
 > **Strategic Focus**: Production-grade regex engine with RE2/rust-regex level optimizations
 
-**Last Updated**: 2026-03-08 | **Current Version**: v0.12.6 | **Target**: v1.0.0 stable
+**Last Updated**: 2026-03-10 | **Current Version**: v0.12.7 | **Target**: v1.0.0 stable
 
 ---
 
@@ -12,7 +12,7 @@ Build a **production-ready, high-performance regex engine** for Go that matches
 
 ### Current State vs Target
 
-| Metric | Current (v0.12.6) | Target (v1.0.0) |
+| Metric | Current (v0.12.7) | Target (v1.0.0) |
 |--------|-------------------|-----------------|
 | Inner literal speedup | **280-3154x** | ✅ Achieved |
 | Case-insensitive speedup | **263x** | ✅ Achieved |
@@ -68,7 +68,9 @@ v0.12.4 ✅ → Test coverage 80%+, CI improvements, awesome-go readiness
          ↓
 v0.12.5 ✅ → Non-greedy quantifier fix, ReverseSuffix correctness (#124)
          ↓
-v0.12.6 (Current) ✅ → BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127)
+v0.12.6 ✅ → BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127)
+         ↓
+v0.12.7 (Current) ✅ → PikeVM sparse-dispatch for dot patterns, 2.8-4.8x speedup (#132)
          ↓
 v1.0.0-rc → Feature freeze, API locked
          ↓
@@ -103,6 +105,7 @@ v1.0.0 STABLE → Production release with API stability guarantee
 - ✅ **v0.12.4**: Test coverage 80%+, CI improvements, awesome-go readiness (#123)
 - ✅ **v0.12.5**: Non-greedy quantifier fix, ReverseSuffix forward verification (#124)
 - ✅ **v0.12.6**: BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127)
+- ✅ **v0.12.7**: PikeVM sparse-dispatch for `.` patterns, 2.8-4.8x speedup (#132)
 
 ---
 
@@ -194,7 +197,7 @@ v1.0.0 STABLE → Production release with API stability guarantee
 
 ## Feature Comparison Matrix
 
-| Feature | RE2 | rust-regex | coregex v0.12.6 | coregex v1.0 |
+| Feature | RE2 | rust-regex | coregex v0.12.7 | coregex v1.0 |
 |---------|-----|------------|-----------------|--------------|
 | Lazy DFA | ✅ | ✅ | ✅ | ✅ |
 | Thompson NFA | ✅ | ✅ | ✅ | ✅ |
@@ -352,7 +355,8 @@ Reference implementations available locally:
 
 | Version | Date | Type | Key Changes |
 |---------|------|------|-------------|
-| **v0.12.6** | 2026-03-08 | Fix | **BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127)** |
+| **v0.12.7** | 2026-03-10 | Performance | **PikeVM sparse-dispatch for `.` patterns, 2.8-4.8x speedup (#132)** |
+| v0.12.6 | 2026-03-08 | Fix | BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127) |
 | v0.12.5 | 2026-03-08 | Fix | Non-greedy quantifier fix, ReverseSuffix correctness (#124) |
 | v0.12.4 | 2026-03-01 | Test | Test coverage 80%+, CI improvements, awesome-go readiness |
 | **v0.12.3** | 2026-02-16 | Performance | **Cross-product literal expansion, 110x regexdna speedup (#119)** |
@@ -392,4 +396,4 @@ Reference implementations available locally:
 
 ---
 
-*Current: v0.12.6 | Next: v0.13.0 | Target: v1.0.0*
+*Current: v0.12.7 | Next: v0.13.0 | Target: v1.0.0*
@@ -279,11 +279,12 @@ func buildCharClassSearchers(
 	strategy Strategy,
 	re *syntax.Regexp,
 	nfaEngine *nfa.NFA,
+	btNFA *nfa.NFA, // NFA for BoundedBacktracker (runeNFA when available, else nfaEngine)
 ) charClassSearcherResult {
 	result := charClassSearcherResult{finalStrategy: strategy}
 
 	if strategy == UseBoundedBacktracker {
-		result.boundedBT = nfa.NewBoundedBacktracker(nfaEngine)
+		result.boundedBT = nfa.NewBoundedBacktracker(btNFA)
 	}
 
 	if strategy == UseCharClassSearcher {
@@ -298,7 +299,7 @@ func buildCharClassSearchers(
 		} else {
 			// Fallback to BoundedBacktracker if extraction fails
 			result.finalStrategy = UseBoundedBacktracker
-			result.boundedBT = nfa.NewBoundedBacktracker(nfaEngine)
+			result.boundedBT = nfa.NewBoundedBacktracker(btNFA)
 		}
 	}
 
@@ -309,7 +310,7 @@ func buildCharClassSearchers(
 		if result.compositeSrch == nil {
 			// Fallback to BoundedBacktracker if extraction fails
 			result.finalStrategy = UseBoundedBacktracker
-			result.boundedBT = nfa.NewBoundedBacktracker(nfaEngine)
+			result.boundedBT = nfa.NewBoundedBacktracker(btNFA)
 		} else {
 			// Try to build faster DFA (uses subset construction for overlapping patterns)
 			result.compositeSeqDFA = nfa.NewCompositeSequenceDFA(re)
@@ -334,7 +335,7 @@ func buildCharClassSearchers(
 		if result.branchDispatcher == nil {
 			// Fallback to BoundedBacktracker if dispatch not possible
 			result.finalStrategy = UseBoundedBacktracker
-			result.boundedBT = nfa.NewBoundedBacktracker(nfaEngine)
+			result.boundedBT = nfa.NewBoundedBacktracker(btNFA)
 		}
 	}
 
@@ -343,12 +344,63 @@ func buildCharClassSearchers(
 	// generation-based visited tracking (O(1) reset) vs PikeVM's thread queues.
 	// This is similar to how stdlib uses backtracking for simple patterns.
 	if result.finalStrategy == UseNFA && result.boundedBT == nil && nfaEngine.States() < 50 {
-		result.boundedBT = nfa.NewBoundedBacktracker(nfaEngine)
+		result.boundedBT = nfa.NewBoundedBacktracker(btNFA)
 	}
 
 	return result
 }
 
+// buildDotOptimizedNFAs compiles optimized NFA variants for patterns with '.';
+// it returns (nil, nil, nil) when nfa.ContainsDot(re) is false. Results, in order:
+//   - asciiNFA: NFA with '.' compiled as single ASCII byte range (for ASCII-only input); only built when config.EnableASCIIOptimization is set
+//   - asciiBT: BoundedBacktracker for asciiNFA; nil when asciiNFA was not built or failed to compile
+//   - runeNFA: NFA with '.' compiled as sparse dispatch (fewer split states for PikeVM); nil if compilation fails
+func buildDotOptimizedNFAs(
+	re *syntax.Regexp, config Config,
+) (*nfa.NFA, *nfa.BoundedBacktracker, *nfa.NFA) {
+	if !nfa.ContainsDot(re) {
+		return nil, nil, nil
+	}
+
+	// ASCII-only NFA (V11-002 optimization): compile '.' as single byte range
+	// [0x00-0x7F] for ASCII-only inputs. Best-effort: a compile error leaves asciiBT nil.
+	var asciiNFAEngine *nfa.NFA
+	var asciiBT *nfa.BoundedBacktracker
+	if config.EnableASCIIOptimization {
+		asciiCompiler := nfa.NewCompiler(nfa.CompilerConfig{
+			UTF8:              true,
+			Anchored:          false,
+			DotNewline:        false,
+			ASCIIOnly:         true,
+			MaxRecursionDepth: config.MaxRecursionDepth,
+		})
+		var err error
+		asciiNFAEngine, err = asciiCompiler.CompileRegexp(re)
+		if err == nil {
+			asciiBT = nfa.NewBoundedBacktracker(asciiNFAEngine)
+		}
+	}
+
+	// Sparse-dispatch NFA: compile '.' as a single sparse state mapping each
+	// leading byte range to the correct continuation chain, eliminating ~9 split
+	// states per dot (O(1) PikeVM dispatch instead of O(branches) split-chain DFS;
+	// measured 2.8-4.8x PikeVM speedup). Best-effort: a compile error yields nil.
+	var runeNFAEngine *nfa.NFA
+	runeCompiler := nfa.NewCompiler(nfa.CompilerConfig{
+		UTF8:              true,
+		Anchored:          false,
+		DotNewline:        false,
+		UseRuneStates:     true,
+		MaxRecursionDepth: config.MaxRecursionDepth,
+	})
+	runeNFAEngine, err := runeCompiler.CompileRegexp(re)
+	if err != nil {
+		runeNFAEngine = nil
+	}
+
+	return asciiNFAEngine, asciiBT, runeNFAEngine
+}
+
 // CompileRegexp compiles a parsed syntax.Regexp with default configuration.
 //
 // This is useful when you already have a parsed regexp from another source.
@@ -373,25 +425,8 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) {
 		}
 	}
 
-	// Compile ASCII-only NFA for patterns with '.' (V11-002 optimization).
-	// This enables runtime ASCII detection: if input is all ASCII, use the faster
-	// ASCII NFA which has ~2.8x fewer states for '.'-heavy patterns.
-	var asciiNFAEngine *nfa.NFA
-	var asciiBT *nfa.BoundedBacktracker
-	if nfa.ContainsDot(re) && config.EnableASCIIOptimization {
-		asciiCompiler := nfa.NewCompiler(nfa.CompilerConfig{
-			UTF8:              true,
-			Anchored:          false,
-			DotNewline:        false,
-			ASCIIOnly:         true, // Key: compile '.' as single byte range
-			MaxRecursionDepth: config.MaxRecursionDepth,
-		})
-		asciiNFAEngine, err = asciiCompiler.CompileRegexp(re)
-		if err == nil {
-			asciiBT = nfa.NewBoundedBacktracker(asciiNFAEngine)
-		}
-		// If ASCII NFA compilation fails, we fall back to UTF-8 NFA (asciiNFAEngine stays nil)
-	}
+	// Compile optimized NFA variants for patterns with '.'
+	asciiNFAEngine, asciiBT, runeNFAEngine := buildDotOptimizedNFAs(re, config)
 
 	// Extract literals for prefiltering
 	// NOTE: Don't build prefilter for start-anchored patterns (^...).
@@ -418,8 +453,14 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) {
 	// Select strategy (pass re for anchor detection)
 	strategy := SelectStrategy(nfaEngine, re, literals, config)
 
-	// Build PikeVM (always needed for fallback)
-	pikevm := nfa.NewPikeVM(nfaEngine)
+	// Build PikeVM (always needed for fallback).
+	// Use runeNFA when available — sparse dispatch replaces ~9 split states
+	// with a single sparse state, giving PikeVM O(1) byte dispatch per '.'.
+	pikevmNFA := nfaEngine
+	if runeNFAEngine != nil {
+		pikevmNFA = runeNFAEngine
+	}
+	pikevm := nfa.NewPikeVM(pikevmNFA)
 
 	// Build OnePass DFA for anchored patterns with captures (optional optimization)
 	onePassRes := buildOnePassDFA(re, nfaEngine, config)
@@ -428,8 +469,9 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) {
 	engines := buildStrategyEngines(strategy, re, nfaEngine, literals, pf, config)
 	strategy = engines.finalStrategy
 
-	// Build specialized searchers for character class patterns
-	charClassResult := buildCharClassSearchers(strategy, re, nfaEngine)
+	// Build specialized searchers for character class patterns.
+	// Pass pikevmNFA so BoundedBacktrackers benefit from rune states.
+	charClassResult := buildCharClassSearchers(strategy, re, nfaEngine, pikevmNFA)
 	strategy = charClassResult.finalStrategy
 
 	// Check if pattern can match empty string.
@@ -497,7 +539,7 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) {
 		// Fallback if detection fails (shouldn't happen since SelectStrategy checked)
 		if anchoredLiteralInfo == nil {
 			strategy = UseBoundedBacktracker
-			charClassResult.boundedBT = nfa.NewBoundedBacktracker(nfaEngine)
+			charClassResult.boundedBT = nfa.NewBoundedBacktracker(pikevmNFA)
 		}
 	}
 
@@ -506,6 +548,7 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) {
 
 	return &Engine{
 		nfa:                            nfaEngine,
+		runeNFA:                        runeNFAEngine,
 		asciiNFA:                       asciiNFAEngine,
 		asciiBoundedBacktracker:        asciiBT,
 		dfa:                            engines.dfa,
@@ -534,7 +577,7 @@ func CompileRegexp(re *syntax.Regexp, config Config) (*Engine, error) {
 		canMatchEmpty:                  canMatchEmpty,
 		isStartAnchored:                isStartAnchored,
 		fatTeddyFallback:               fatTeddyFallback,
-		statePool:                      newSearchStatePool(nfaEngine, numCaptures),
+		statePool:                      newSearchStatePool(pikevmNFA, numCaptures),
 		stats:                          Stats{},
 	}, nil
 }

@@ -50,6 +50,17 @@ type Engine struct {
 
 	nfa *nfa.NFA
 
+	// runeNFA is an NFA compiled with UseRuneStates=true (sparse dispatch).
+	// Instead of ~9 split states chaining UTF-8 byte-range alternation branches
+	// for each '.', it uses a single sparse state that maps each leading byte
+	// range directly to its continuation chain. This gives PikeVM O(1) dispatch
+	// instead of O(branches) split-chain DFS per byte position.
+	// Measured 2.8-4.8x PikeVM speedup on dot-heavy patterns.
+	//
+	// This field is nil if the pattern doesn't contain '.' (no benefit) or if sparse-dispatch compilation fails.
+	// The byte-level NFA (nfa field) remains unchanged for DFA/strategy use.
+	runeNFA *nfa.NFA
+
 	// asciiNFA is an NFA compiled in ASCII-only mode (V11-002 optimization).
 	// When the pattern contains '.' and input is ASCII-only (all bytes < 0x80),
 	// this NFA is used instead of the main NFA. ASCII mode compiles '.' to