diff --git a/CHANGELOG.md b/CHANGELOG.md
index 783c226..96dd7bd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -32,6 +32,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   allocation for inputs with many matches (e.g., 150K replacements on 6MB
   string took 2m19s). Fix: replaced with `strings.Builder` for O(n) performance
   (now completes in ~1.3s).
+- **DFA FindAll O(n²) scanning for dense-match inputs** —
+  `findIndicesDFAAt` used `DFA.FindAt` (longest-match scan, O(n)) as a prefilter,
+  then PikeVM re-scanned for exact bounds. For 2000 matches over 50KB, total DFA
+  work was ~50MB. Fix: added `DFA.IsMatchAt` with early termination (O(k), where
+  k = distance to first match) and a prefilter skip that jumps PikeVM directly to
+  the candidate position. Template pattern `\{\{(.*?)\}\}` FindAll improved ~37%.
 
 ## [0.12.5] - 2026-03-08
 
diff --git a/ROADMAP.md b/ROADMAP.md
index ccc81ab..37a880c 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -2,7 +2,7 @@
 
 > **Strategic Focus**: Production-grade regex engine with RE2/rust-regex level optimizations
 
-**Last Updated**: 2026-03-08 | **Current Version**: v0.12.5 | **Target**: v1.0.0 stable
+**Last Updated**: 2026-03-08 | **Current Version**: v0.12.6 | **Target**: v1.0.0 stable
 
 ---
 
@@ -12,7 +12,7 @@ Build a **production-ready, high-performance regex engine** for Go that matches
 
 ### Current State vs Target
 
-| Metric | Current (v0.12.5) | Target (v1.0.0) |
+| Metric | Current (v0.12.6) | Target (v1.0.0) |
 |--------|-------------------|-----------------|
 | Inner literal speedup | **280-3154x** | ✅ Achieved |
 | Case-insensitive speedup | **263x** | ✅ Achieved |
@@ -66,7 +66,9 @@ v0.12.3 ✅ → Cross-product literal expansion, 110x speedup on regexdna (#119)
          ↓
 v0.12.4 ✅ → Test coverage 80%+, CI improvements, awesome-go readiness
          ↓
-v0.12.5 (Current) ✅ → Non-greedy quantifier fix, ReverseSuffix correctness (#124)
+v0.12.5 ✅ → Non-greedy quantifier fix, ReverseSuffix correctness (#124)
+         ↓
+v0.12.6 (Current) ✅ → BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127)
          ↓
 v1.0.0-rc → Feature freeze, API locked
          ↓
@@ -100,6 +102,7 @@ v1.0.0 STABLE → Production release with API stability guarantee
 - ✅ **v0.12.3**: Cross-product literal expansion for regexdna patterns, 110x speedup (#119)
 - ✅ **v0.12.4**: Test coverage 80%+, CI improvements, awesome-go readiness (#123)
 - ✅ **v0.12.5**: Non-greedy quantifier fix, ReverseSuffix forward verification (#124)
+- ✅ **v0.12.6**: BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127)
 
 ---
 
@@ -191,7 +194,7 @@ v1.0.0 STABLE → Production release with API stability guarantee
 
 ## Feature Comparison Matrix
 
-| Feature | RE2 | rust-regex | coregex v0.12.5 | coregex v1.0 |
+| Feature | RE2 | rust-regex | coregex v0.12.6 | coregex v1.0 |
 |---------|-----|------------|-----------------|--------------|
 | Lazy DFA | ✅ | ✅ | ✅ | ✅ |
 | Thompson NFA | ✅ | ✅ | ✅ | ✅ |
@@ -338,6 +341,9 @@ Reference implementations available locally:
 
 | Version | Date | Type | Key Changes |
 |---------|------|------|-------------|
+| **v0.12.6** | 2026-03-08 | Fix | **BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127)** |
+| v0.12.5 | 2026-03-08 | Fix | Non-greedy quantifier fix, ReverseSuffix correctness (#124) |
+| v0.12.4 | 2026-03-01 | Test | Test coverage 80%+, CI improvements, awesome-go readiness (#123) |
 | **v0.12.3** | 2026-02-16 | Performance | **Cross-product literal expansion, 110x regexdna speedup (#119)** |
 | v0.12.2 | 2026-02-16 | Fix | ReverseSuffixSet safety guard, matchStartZero fix (#116) |
 | v0.12.1 | 2026-02-15 | Performance | DFA bidirectional fallback, digit-run skip, bounded repetitions (#115) |
@@ -375,4 +381,4 @@ Reference implementations available locally:
 
 ---
 
-*Current: v0.12.5 | Next: v0.13.0 | Target: v1.0.0*
+*Current: v0.12.6 | Next: v0.13.0 | Target: v1.0.0*
diff --git a/dfa/lazy/lazy.go b/dfa/lazy/lazy.go
index f452021..50c5f4d 100644
--- a/dfa/lazy/lazy.go
+++ b/dfa/lazy/lazy.go
@@ -321,6 +321,21 @@ func (d *DFA) IsMatch(haystack []byte) bool {
 	return d.searchEarliestMatch(haystack, 0)
 }
 
+// IsMatchAt returns true if the pattern matches anywhere in haystack[at:].
+// Uses early termination: returns as soon as any match state is reached.
+// This is O(k) where k is the distance to the first match, vs FindAt's O(n)
+// which always scans for the longest match.
+func (d *DFA) IsMatchAt(haystack []byte, at int) bool {
+	if at >= len(haystack) {
+		if at == len(haystack) {
+			return d.matchesEmpty()
+		}
+		return false
+	}
+
+	return d.searchEarliestMatch(haystack, at)
+}
+
 // isMatchWithPrefilter uses prefilter for fast boolean match.
 // Returns as soon as any match is found.
 func (d *DFA) isMatchWithPrefilter(haystack []byte) bool {
diff --git a/meta/find_indices.go b/meta/find_indices.go
index bad2e83..00ca25e 100644
--- a/meta/find_indices.go
+++ b/meta/find_indices.go
@@ -228,16 +228,22 @@ func (e *Engine) findIndicesDFA(haystack []byte) (int, int, bool) {
 		return e.pikevm.Search(haystack)
 	}
 
-	// Use DFA search to check if there's a match
-	pos := e.dfa.Find(haystack)
-	if pos == -1 {
+	// Prefilter skip: jump to candidate position, then PikeVM from there.
+	if e.prefilter != nil {
+		pos := e.prefilter.Find(haystack, 0)
+		if pos == -1 {
+			return -1, -1, false
+		}
+		atomic.AddUint64(&e.stats.PrefilterHits, 1)
+		return e.pikevm.SearchAt(haystack, pos)
+	}
+
+	// No prefilter: DFA with early termination + PikeVM.
+	if !e.dfa.IsMatch(haystack) {
 		return -1, -1, false
 	}
 
-	// DFA found a match - use PikeVM for exact bounds (leftmost-first semantics)
-	// NOTE: Bidirectional search (reverse DFA) doesn't work correctly here because
-	// DFA.Find returns the END of LONGEST match, not FIRST match. For patterns like
-	// (?m)abc$ on "abc\nabc", DFA returns 7 but correct first match ends at 3.
+	// DFA confirmed a match exists - use PikeVM for exact bounds
 	return e.pikevm.Search(haystack)
 }
 
@@ -259,13 +265,27 @@ func (e *Engine) findIndicesDFAAt(haystack []byte, at int) (int, int, bool) {
 		return e.pikevm.SearchAt(haystack, at)
 	}
 
-	// Use DFA search to check if there's a match
-	pos := e.dfa.FindAt(haystack, at)
-	if pos == -1 {
+	// Prefilter skip: jump to the prefilter's candidate position, then run PikeVM
+	// from there so it does not scan the non-matching region before the candidate.
+	// NOTE(review): presumes e.prefilter yields match-start (prefix) candidates.
+	// For \{\{(.*?)\}\} with prefix "{{", PikeVM sees ~8 bytes per match, not ~25.
+	if e.prefilter != nil {
+		pos := e.prefilter.Find(haystack, at)
+		if pos == -1 {
+			return -1, -1, false
+		}
+		atomic.AddUint64(&e.stats.PrefilterHits, 1)
+		return e.pikevm.SearchAt(haystack, pos)
+	}
+
+	// No prefilter: DFA with early termination + PikeVM.
+	// IsMatchAt is O(k) where k = distance to first match, vs FindAt's O(n)
+	// which scans for the longest match. Avoids O(n²) in FindAll.
+	if !e.dfa.IsMatchAt(haystack, at) {
 		return -1, -1, false
 	}
 
-	// DFA found a match - use PikeVM for exact bounds (leftmost-first semantics)
+	// DFA confirmed a match exists - use PikeVM for exact bounds
 	return e.pikevm.SearchAt(haystack, at)
 }