From c7e16ee7e1f7905f3155f9e13e150a43c6db7eb6 Mon Sep 17 00:00:00 2001
From: Dmitriy Kovalenko <dmtr.kovalenko@outlook.com>
Date: Sun, 29 Mar 2026 13:10:50 -0700
Subject: [PATCH 1/2] perf: Reduce index memory footprint

---
 crates/fff-core/benches/bigram_bench.rs     |  4 +--
 crates/fff-core/src/file_picker.rs          | 13 +++++----
 crates/fff-core/src/types.rs                | 32 ++++++++++++++-------
 crates/fff-nvim/src/bin/bench_grep_query.rs |  4 +--
 crates/fff-nvim/src/bin/grep_profiler.rs    |  2 +-
 5 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/crates/fff-core/benches/bigram_bench.rs b/crates/fff-core/benches/bigram_bench.rs
index c1a972f..bad7284 100644
--- a/crates/fff-core/benches/bigram_bench.rs
+++ b/crates/fff-core/benches/bigram_bench.rs
@@ -14,7 +14,7 @@ fn build_test_index(file_count: usize) -> BigramFilter {
         builder.add_file_content(i, content.as_bytes());
     }
 
-    builder.compress()
+    builder.compress(None)
 }
 
 fn bench_bigram_query(c: &mut Criterion) {
@@ -97,7 +97,7 @@ fn bench_bigram_build(c: &mut Criterion) {
                     for (i, content) in contents.iter().enumerate() {
                         builder.add_file_content(i, content.as_bytes());
                     }
-                    let index = builder.compress();
+                    let index = builder.compress(None);
                     black_box(index.columns_used())
                 });
             },
diff --git a/crates/fff-core/src/file_picker.rs b/crates/fff-core/src/file_picker.rs
index 0f41d32..a9a4df1 100644
--- a/crates/fff-core/src/file_picker.rs
+++ b/crates/fff-core/src/file_picker.rs
@@ -1213,11 +1213,14 @@ pub fn build_bigram_index(
     });
 
     let cols = builder.columns_used();
-    let mut index = builder.compress();
-    // Skip index: skip bigrams are inherently less specific than consecutive
-    // bigrams, so relevant columns are almost always dense. Dense-only saves
-    // ~20% memory vs all columns with no loss in filtering.
-    let skip_index = skip_builder.compress();
+    let mut index = builder.compress(None);
+
+    // Skip bigrams are supplementary — the consecutive index does the heavy
+    // lifting. Rare skip columns (< 12% of files) add virtually no filtering
+    // on either homogeneous (kernel) or polyglot (monorepo) codebases, but
+    // cost ~25-30% of total index memory. Using a higher sparse cutoff for
+    // the skip index drops these dead-weight columns with negligible loss.
+    let skip_index = skip_builder.compress(Some(12));
     index.set_skip_index(skip_index);
 
     // The builder just freed ~276 MB (for 500k files) of atomic bitsets.
diff --git a/crates/fff-core/src/types.rs b/crates/fff-core/src/types.rs
index a95b2ef..0c369dd 100644
--- a/crates/fff-core/src/types.rs
+++ b/crates/fff-core/src/types.rs
@@ -547,7 +547,7 @@ impl BigramIndexBuilder {
                 }
             }
         }
-        // NOTE: populated count tracked by the consecutive builder, not here.
+        self.populated.fetch_add(1, Ordering::Relaxed);
     }
 
     pub fn is_ready(&self) -> bool {
@@ -562,17 +562,16 @@ impl BigramIndexBuilder {
 
     /// Compress the dense builder into a compact `BigramFilter`.
     ///
-    /// Only dense columns (bitsets) are retained — sparse bigrams (those
-    /// appearing in fewer files than the dense threshold) are dropped.
-    /// The skip-1 bigram index provides equivalent or better filtering
-    /// for those cases, and dense-only storage enables SIMD-vectorized
-    /// AND operations with no per-column indirection.
+    /// Retains a column when its bigram appears in ≥`min_density_pct`% of
+    /// populated files (`None`: default ~3.1% heuristic) and in <90% of them.
+    /// Sparse columns carry too little data to justify their memory;
+    /// ubiquitous columns (≥90%) are nearly all-ones and barely filter.
     ///
     /// Each column's `Box<[AtomicU64]>` (~60 KB for 500k files) is freed
     /// immediately after compression via `OnceLock::take`, so peak memory
     /// during compress is roughly `max(builder, result)` instead of
     /// `builder + result`.
-    pub fn compress(self) -> BigramFilter {
+    pub fn compress(self, min_density_pct: Option<u32>) -> BigramFilter {
         let cols = self.columns_used() as usize;
         let words = self.words;
         let file_count = self.file_count;
@@ -596,14 +595,27 @@ impl BigramIndexBuilder {
                 continue;
             };
 
-            // Count set bits to decide if this column is dense enough to keep.
+            // Count set bits to decide if this column is worth keeping.
             let mut popcount = 0u32;
             for w in 0..words {
                 popcount += bitset[w].load(Ordering::Relaxed).count_ones();
             }
 
-            // Skip sparse bigrams — not worth storing.
-            if (popcount as usize * 4) < dense_bytes {
+            // Sparse threshold — drop bigrams appearing in too few files.
+            let sparse_ok = if let Some(min_pct) = min_density_pct {
+                // Percentage-based: require ≥ min_pct% of populated files.
+                populated > 0 && (popcount as usize) * 100 >= populated * min_pct as usize
+            } else {
+                // Default heuristic: popcount ≥ words × 2 (~3.1% of files).
+                (popcount as usize * 4) >= dense_bytes
+            };
+            if !sparse_ok {
+                continue;
+            }
+
+            // Drop ubiquitous bigrams — columns ≥90% ones carry almost no
+            // filtering power and just waste memory + AND cycles.
+            if populated > 0 && (popcount as usize) * 10 >= populated * 9 {
                 continue;
             }
 
diff --git a/crates/fff-nvim/src/bin/bench_grep_query.rs b/crates/fff-nvim/src/bin/bench_grep_query.rs
index 313bb79..f39bad8 100644
--- a/crates/fff-nvim/src/bin/bench_grep_query.rs
+++ b/crates/fff-nvim/src/bin/bench_grep_query.rs
@@ -212,8 +212,8 @@ fn main() {
         }
     });
 
-    let mut index = builder.compress();
-    let skip_index = skip_builder.compress();
+    let mut index = builder.compress(None);
+    let skip_index = skip_builder.compress(Some(12));
     let build_time = t.elapsed();
     eprintln!("done in {:.2}s", build_time.as_secs_f64());
     eprintln!(
diff --git a/crates/fff-nvim/src/bin/grep_profiler.rs b/crates/fff-nvim/src/bin/grep_profiler.rs
index 8654397..ea455cb 100644
--- a/crates/fff-nvim/src/bin/grep_profiler.rs
+++ b/crates/fff-nvim/src/bin/grep_profiler.rs
@@ -195,7 +195,7 @@ fn build_bigram_index(files: &[FileItem]) -> BigramFilter {
         }
     });
 
-    builder.compress()
+    builder.compress(None)
 }
 
 fn fmt_dur(d: Duration) -> String {

From b3b0b4b1a4bc50086bc492e62fe5f647a6a7a1c2 Mon Sep 17 00:00:00 2001
From: Dmitriy Kovalenko <dmtr.kovalenko@outlook.com>
Date: Sun, 29 Mar 2026 20:11:30 +0000
Subject: [PATCH 2/2] chore: Update docs for - perf: Reduce index memory
 footprint

---
 doc/fff.nvim.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/fff.nvim.txt b/doc/fff.nvim.txt
index c600c20..db28482 100644
--- a/doc/fff.nvim.txt
+++ b/doc/fff.nvim.txt
@@ -1,4 +1,4 @@
-*fff.nvim.txt*         For Neovim >= 0.10.0         Last change: 2026 March 27
+*fff.nvim.txt*         For Neovim >= 0.10.0         Last change: 2026 March 29
 
 ==============================================================================
 Table of Contents                                 *fff.nvim-table-of-contents*