Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/fff-core/benches/bigram_bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ fn build_test_index(file_count: usize) -> BigramFilter {
builder.add_file_content(i, content.as_bytes());
}

builder.compress()
builder.compress(None)
}

fn bench_bigram_query(c: &mut Criterion) {
Expand Down Expand Up @@ -97,7 +97,7 @@ fn bench_bigram_build(c: &mut Criterion) {
for (i, content) in contents.iter().enumerate() {
builder.add_file_content(i, content.as_bytes());
}
let index = builder.compress();
let index = builder.compress(None);
black_box(index.columns_used())
});
},
Expand Down
13 changes: 8 additions & 5 deletions crates/fff-core/src/file_picker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1213,11 +1213,14 @@ pub fn build_bigram_index(
});

let cols = builder.columns_used();
let mut index = builder.compress();
// Skip index: skip bigrams are inherently less specific than consecutive
// bigrams, so relevant columns are almost always dense. Dense-only saves
// ~20% memory vs all columns with no loss in filtering.
let skip_index = skip_builder.compress();
let mut index = builder.compress(None);

// Skip bigrams are supplementary — the consecutive index does the heavy
// lifting. Rare skip columns (< 12% of files) add virtually no filtering
// on either homogeneous (kernel) or polyglot (monorepo) codebases, but
// cost ~25-30% of total index memory. Using a higher sparse cutoff for
// the skip index drops these dead-weight columns with negligible loss.
let skip_index = skip_builder.compress(Some(12));
index.set_skip_index(skip_index);

// The builder just freed ~276 MB (for 500k files) of atomic bitsets.
Expand Down
32 changes: 22 additions & 10 deletions crates/fff-core/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@
#[cfg(target_arch = "aarch64")]
const MMAP_THRESHOLD: u64 = 16 * 1024;
#[cfg(not(target_arch = "aarch64"))]
const MMAP_THRESHOLD: u64 = 4 * 1024;

Check warning on line 232 in crates/fff-core/src/types.rs

View workflow job for this annotation

GitHub Actions / e2e (windows-latest)

constant `MMAP_THRESHOLD` is never used

Check warning on line 232 in crates/fff-core/src/types.rs

View workflow job for this annotation

GitHub Actions / e2e (windows-latest)

constant `MMAP_THRESHOLD` is never used

Check warning on line 232 in crates/fff-core/src/types.rs

View workflow job for this annotation

GitHub Actions / e2e (windows-latest)

constant `MMAP_THRESHOLD` is never used

/// Load file contents: small files are read into a heap buffer to avoid
/// mmap page alignment waste; large files use mmap for zero-copy access.
Expand Down Expand Up @@ -547,7 +547,7 @@
}
}
}
// NOTE: populated count tracked by the consecutive builder, not here.
self.populated.fetch_add(1, Ordering::Relaxed);
}

pub fn is_ready(&self) -> bool {
Expand All @@ -562,17 +562,16 @@

/// Compress the dense builder into a compact `BigramFilter`.
///
/// Only dense columns (bitsets) are retained — sparse bigrams (those
/// appearing in fewer files than the dense threshold) are dropped.
/// The skip-1 bigram index provides equivalent or better filtering
/// for those cases, and dense-only storage enables SIMD-vectorized
/// AND operations with no per-column indirection.
/// Retains columns where the bigram appears in ≥`min_density_pct`% of
/// populated files (or passes the default ~3.1% heuristic when `None`)
/// and in <90% of populated files.
/// Sparse columns carry too little data to justify their memory;
/// ubiquitous columns (≥90%) are nearly all-ones and barely filter.
///
/// Each column's `Box<[AtomicU64]>` (~60 KB for 500k files) is freed
/// immediately after compression via `OnceLock::take`, so peak memory
/// during compress is roughly `max(builder, result)` instead of
/// `builder + result`.
pub fn compress(self) -> BigramFilter {
pub fn compress(self, min_density_pct: Option<u32>) -> BigramFilter {
let cols = self.columns_used() as usize;
let words = self.words;
let file_count = self.file_count;
Expand All @@ -596,14 +595,27 @@
continue;
};

// Count set bits to decide if this column is dense enough to keep.
// Count set bits to decide if this column is worth keeping.
let mut popcount = 0u32;
for w in 0..words {
popcount += bitset[w].load(Ordering::Relaxed).count_ones();
}

// Skip sparse bigrams — not worth storing.
if (popcount as usize * 4) < dense_bytes {
// Sparse threshold — drop bigrams appearing in too few files.
let sparse_ok = if let Some(min_pct) = min_density_pct {
// Percentage-based: require ≥ min_pct% of populated files.
populated > 0 && (popcount as usize) * 100 >= populated * min_pct as usize
} else {
// Default heuristic: popcount ≥ words × 2 (~3.1% of files).
(popcount as usize * 4) >= dense_bytes
};
if !sparse_ok {
continue;
}

// Drop ubiquitous bigrams — columns ≥90% ones carry almost no
// filtering power and just waste memory + AND cycles.
if populated > 0 && (popcount as usize) * 10 >= populated * 9 {
continue;
}

Expand Down
4 changes: 2 additions & 2 deletions crates/fff-nvim/src/bin/bench_grep_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@ fn main() {
}
});

let mut index = builder.compress();
let skip_index = skip_builder.compress();
let mut index = builder.compress(None);
let skip_index = skip_builder.compress(Some(12));
let build_time = t.elapsed();
eprintln!("done in {:.2}s", build_time.as_secs_f64());
eprintln!(
Expand Down
2 changes: 1 addition & 1 deletion crates/fff-nvim/src/bin/grep_profiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ fn build_bigram_index(files: &[FileItem]) -> BigramFilter {
}
});

builder.compress()
builder.compress(None)
}

fn fmt_dur(d: Duration) -> String {
Expand Down
2 changes: 1 addition & 1 deletion doc/fff.nvim.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
*fff.nvim.txt* For Neovim >= 0.10.0 Last change: 2026 March 27
*fff.nvim.txt* For Neovim >= 0.10.0 Last change: 2026 March 29

==============================================================================
Table of Contents *fff.nvim-table-of-contents*
Expand Down
Loading