From c7e16ee7e1f7905f3155f9e13e150a43c6db7eb6 Mon Sep 17 00:00:00 2001 From: Dmitriy Kovalenko Date: Sun, 29 Mar 2026 13:10:50 -0700 Subject: [PATCH 1/2] perf: Reduce index memory footprint --- crates/fff-core/benches/bigram_bench.rs | 4 +-- crates/fff-core/src/file_picker.rs | 13 +++++---- crates/fff-core/src/types.rs | 32 ++++++++++++++------- crates/fff-nvim/src/bin/bench_grep_query.rs | 4 +-- crates/fff-nvim/src/bin/grep_profiler.rs | 2 +- 5 files changed, 35 insertions(+), 20 deletions(-) diff --git a/crates/fff-core/benches/bigram_bench.rs b/crates/fff-core/benches/bigram_bench.rs index c1a972f..bad7284 100644 --- a/crates/fff-core/benches/bigram_bench.rs +++ b/crates/fff-core/benches/bigram_bench.rs @@ -14,7 +14,7 @@ fn build_test_index(file_count: usize) -> BigramFilter { builder.add_file_content(i, content.as_bytes()); } - builder.compress() + builder.compress(None) } fn bench_bigram_query(c: &mut Criterion) { @@ -97,7 +97,7 @@ fn bench_bigram_build(c: &mut Criterion) { for (i, content) in contents.iter().enumerate() { builder.add_file_content(i, content.as_bytes()); } - let index = builder.compress(); + let index = builder.compress(None); black_box(index.columns_used()) }); }, diff --git a/crates/fff-core/src/file_picker.rs b/crates/fff-core/src/file_picker.rs index 0f41d32..a9a4df1 100644 --- a/crates/fff-core/src/file_picker.rs +++ b/crates/fff-core/src/file_picker.rs @@ -1213,11 +1213,14 @@ pub fn build_bigram_index( }); let cols = builder.columns_used(); - let mut index = builder.compress(); - // Skip index: skip bigrams are inherently less specific than consecutive - // bigrams, so relevant columns are almost always dense. Dense-only saves - // ~20% memory vs all columns with no loss in filtering. - let skip_index = skip_builder.compress(); + let mut index = builder.compress(None); + + // Skip bigrams are supplementary — the consecutive index does the heavy + // lifting. Rare skip columns (< 12% of files) add virtually no filtering + // on either homogeneous (kernel) or polyglot (monorepo) codebases, but + // cost ~25-30% of total index memory. Using a higher sparse cutoff for + // the skip index drops these dead-weight columns with negligible loss. + let skip_index = skip_builder.compress(Some(12)); index.set_skip_index(skip_index); // The builder just freed ~276 MB (for 500k files) of atomic bitsets. diff --git a/crates/fff-core/src/types.rs b/crates/fff-core/src/types.rs index a95b2ef..0c369dd 100644 --- a/crates/fff-core/src/types.rs +++ b/crates/fff-core/src/types.rs @@ -547,7 +547,7 @@ impl BigramIndexBuilder { } } } - // NOTE: populated count tracked by the consecutive builder, not here. + self.populated.fetch_add(1, Ordering::Relaxed); } pub fn is_ready(&self) -> bool { @@ -562,17 +562,16 @@ impl BigramIndexBuilder { /// Compress the dense builder into a compact `BigramFilter`. /// - /// Only dense columns (bitsets) are retained — sparse bigrams (those - /// appearing in fewer files than the dense threshold) are dropped. - /// The skip-1 bigram index provides equivalent or better filtering - /// for those cases, and dense-only storage enables SIMD-vectorized - /// AND operations with no per-column indirection. + /// Retains columns where the bigram appears in ≥`min_density_pct`% (or + /// the default ~3.1% heuristic when `None`) and <90% of indexed files. + /// Sparse columns carry too little data to justify their memory; + /// ubiquitous columns (≥90%) are nearly all-ones and barely filter. /// /// Each column's `Box<[AtomicU64]>` (~60 KB for 500k files) is freed /// immediately after compression via `OnceLock::take`, so peak memory /// during compress is roughly `max(builder, result)` instead of /// `builder + result`. - pub fn compress(self) -> BigramFilter { + pub fn compress(self, min_density_pct: Option) -> BigramFilter { let cols = self.columns_used() as usize; let words = self.words; let file_count = self.file_count; @@ -596,14 +595,27 @@ impl BigramIndexBuilder { continue; }; - // Count set bits to decide if this column is dense enough to keep. + // Count set bits to decide if this column is worth keeping. let mut popcount = 0u32; for w in 0..words { popcount += bitset[w].load(Ordering::Relaxed).count_ones(); } - // Skip sparse bigrams — not worth storing. - if (popcount as usize * 4) < dense_bytes { + // Sparse threshold — drop bigrams appearing in too few files. + let sparse_ok = if let Some(min_pct) = min_density_pct { + // Percentage-based: require ≥ min_pct% of populated files. + populated > 0 && (popcount as usize) * 100 >= populated * min_pct as usize + } else { + // Default heuristic: popcount ≥ words × 2 (~3.1% of files). + (popcount as usize * 4) >= dense_bytes + }; + if !sparse_ok { + continue; + } + + // Drop ubiquitous bigrams — columns ≥90% ones carry almost no + // filtering power and just waste memory + AND cycles. + if populated > 0 && (popcount as usize) * 10 >= populated * 9 { continue; } diff --git a/crates/fff-nvim/src/bin/bench_grep_query.rs b/crates/fff-nvim/src/bin/bench_grep_query.rs index 313bb79..f39bad8 100644 --- a/crates/fff-nvim/src/bin/bench_grep_query.rs +++ b/crates/fff-nvim/src/bin/bench_grep_query.rs @@ -212,8 +212,8 @@ fn main() { } }); - let mut index = builder.compress(); - let skip_index = skip_builder.compress(); + let mut index = builder.compress(None); + let skip_index = skip_builder.compress(Some(12)); let build_time = t.elapsed(); eprintln!("done in {:.2}s", build_time.as_secs_f64()); eprintln!( diff --git a/crates/fff-nvim/src/bin/grep_profiler.rs b/crates/fff-nvim/src/bin/grep_profiler.rs index 8654397..ea455cb 100644 --- a/crates/fff-nvim/src/bin/grep_profiler.rs +++ b/crates/fff-nvim/src/bin/grep_profiler.rs @@ -195,7 +195,7 @@ fn build_bigram_index(files: &[FileItem]) -> BigramFilter { } }); - builder.compress() + builder.compress(None) } fn fmt_dur(d: Duration) -> String { From b3b0b4b1a4bc50086bc492e62fe5f647a6a7a1c2 Mon Sep 17 00:00:00 2001 From: Dmitriy Kovalenko Date: Sun, 29 Mar 2026 20:11:30 +0000 Subject: [PATCH 2/2] chore: Update docs for - perf: Reduce index memory footprint --- doc/fff.nvim.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/fff.nvim.txt b/doc/fff.nvim.txt index c600c20..db28482 100644 --- a/doc/fff.nvim.txt +++ b/doc/fff.nvim.txt @@ -1,4 +1,4 @@ -*fff.nvim.txt* For Neovim >= 0.10.0 Last change: 2026 March 27 +*fff.nvim.txt* For Neovim >= 0.10.0 Last change: 2026 March 29 ============================================================================== Table of Contents *fff.nvim-table-of-contents*