dmtrKovalenko · dmtrKovalenko · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/crates/fff-core/benches/bigram_bench.rs b/crates/fff-core/benches/bigram_bench.rs
@@ -14,7 +14,7 @@ fn build_test_index(file_count: usize) -> BigramFilter {
         builder.add_file_content(i, content.as_bytes());
     }
 
-    builder.compress()
+    builder.compress(None)
 }
 
 fn bench_bigram_query(c: &mut Criterion) {
@@ -97,7 +97,7 @@ fn bench_bigram_build(c: &mut Criterion) {
                     for (i, content) in contents.iter().enumerate() {
                         builder.add_file_content(i, content.as_bytes());
                     }
-                    let index = builder.compress();
+                    let index = builder.compress(None);
                     black_box(index.columns_used())
                 });
             },

diff --git a/crates/fff-core/src/file_picker.rs b/crates/fff-core/src/file_picker.rs
@@ -1213,11 +1213,14 @@ pub fn build_bigram_index(
     });
 
     let cols = builder.columns_used();
-    let mut index = builder.compress();
-    // Skip index: skip bigrams are inherently less specific than consecutive
-    // bigrams, so relevant columns are almost always dense. Dense-only saves
-    // ~20% memory vs all columns with no loss in filtering.
-    let skip_index = skip_builder.compress();
+    let mut index = builder.compress(None);
+
+    // Skip bigrams are supplementary — the consecutive index does the heavy
+    // lifting. Rare skip columns (< 12% of files) add virtually no filtering
+    // on either homogeneous (kernel) or polyglot (monorepo) codebases, but
+    // cost ~25-30% of total index memory. Using a higher sparse cutoff for
+    // the skip index drops these dead-weight columns with negligible loss.
+    let skip_index = skip_builder.compress(Some(12));
     index.set_skip_index(skip_index);
 
     // The builder just freed ~276 MB (for 500k files) of atomic bitsets.

diff --git a/crates/fff-core/src/types.rs b/crates/fff-core/src/types.rs
@@ -229,7 +229,7 @@
 #[cfg(target_arch = "aarch64")]
 const MMAP_THRESHOLD: u64 = 16 * 1024;
 #[cfg(not(target_arch = "aarch64"))]
 const MMAP_THRESHOLD: u64 = 4 * 1024;

 /// Load file contents: small files are read into a heap buffer to avoid
 /// mmap page alignment waste; large files use mmap for zero-copy access.
@@ -547,7 +547,7 @@
                 }
             }
         }
-        // NOTE: populated count tracked by the consecutive builder, not here.
+        self.populated.fetch_add(1, Ordering::Relaxed);
     }
 
     pub fn is_ready(&self) -> bool {
@@ -562,17 +562,16 @@
 
     /// Compress the dense builder into a compact `BigramFilter`.
     ///
-    /// Only dense columns (bitsets) are retained — sparse bigrams (those
-    /// appearing in fewer files than the dense threshold) are dropped.
-    /// The skip-1 bigram index provides equivalent or better filtering
-    /// for those cases, and dense-only storage enables SIMD-vectorized
-    /// AND operations with no per-column indirection.
+    /// Retains columns where the bigram appears in ≥`min_density_pct`% (or
+    /// the default ~3.1% heuristic when `None`) and <90% of populated files.
+    /// Sparse columns carry too little data to justify their memory;
+    /// ubiquitous columns (≥90%) are nearly all-ones and barely filter.
     ///
     /// Each column's `Box<[AtomicU64]>` (~60 KB for 500k files) is freed
     /// immediately after compression via `OnceLock::take`, so peak memory
     /// during compress is roughly `max(builder, result)` instead of
     /// `builder + result`.
-    pub fn compress(self) -> BigramFilter {
+    pub fn compress(self, min_density_pct: Option<u32>) -> BigramFilter {
         let cols = self.columns_used() as usize;
         let words = self.words;
         let file_count = self.file_count;
@@ -596,14 +595,27 @@
                 continue;
             };
 
-            // Count set bits to decide if this column is dense enough to keep.
+            // Count set bits to decide if this column is worth keeping.
             let mut popcount = 0u32;
             for w in 0..words {
                 popcount += bitset[w].load(Ordering::Relaxed).count_ones();
             }
 
-            // Skip sparse bigrams — not worth storing.
-            if (popcount as usize * 4) < dense_bytes {
+            // Sparse threshold — drop bigrams appearing in too few files.
+            let sparse_ok = if let Some(min_pct) = min_density_pct {
+                // Percentage-based: require ≥ min_pct% of populated files.
+                populated > 0 && (popcount as usize) * 100 >= populated * min_pct as usize
+            } else {
+                // Default heuristic: popcount ≥ words × 2 (~3.1% of files).
+                (popcount as usize * 4) >= dense_bytes
+            };
+            if !sparse_ok {
+                continue;
+            }
+
+            // Drop ubiquitous bigrams — columns ≥90% ones carry almost no
+            // filtering power and just waste memory + AND cycles.
+            if populated > 0 && (popcount as usize) * 10 >= populated * 9 {
                 continue;
             }
 

diff --git a/crates/fff-nvim/src/bin/bench_grep_query.rs b/crates/fff-nvim/src/bin/bench_grep_query.rs
@@ -212,8 +212,8 @@ fn main() {
         }
     });
 
-    let mut index = builder.compress();
-    let skip_index = skip_builder.compress();
+    let mut index = builder.compress(None);
+    let skip_index = skip_builder.compress(Some(12));
     let build_time = t.elapsed();
     eprintln!("done in {:.2}s", build_time.as_secs_f64());
     eprintln!(

diff --git a/crates/fff-nvim/src/bin/grep_profiler.rs b/crates/fff-nvim/src/bin/grep_profiler.rs
@@ -195,7 +195,7 @@ fn build_bigram_index(files: &[FileItem]) -> BigramFilter {
         }
     });
 
-    builder.compress()
+    builder.compress(None)
 }
 
 fn fmt_dur(d: Duration) -> String {

diff --git a/doc/fff.nvim.txt b/doc/fff.nvim.txt
@@ -1,4 +1,4 @@
-*fff.nvim.txt*         For Neovim >= 0.10.0         Last change: 2026 March 27
+*fff.nvim.txt*         For Neovim >= 0.10.0         Last change: 2026 March 29
 
 ==============================================================================
 Table of Contents                                 *fff.nvim-table-of-contents*
-Original file line number
+Diff line change
@@ Expand Up / @@ -195,7 +195,7 @@ fn build_bigram_index(files: &[FileItem]) -> BigramFilter { @@
             }
         });
-        builder.compress()
+        builder.compress(None)
     }
     fn fmt_dur(d: Duration) -> String {
@@ Expand Down @@