diff --git a/Cargo.lock b/Cargo.lock index 4a01765..cbb93e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -485,6 +485,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "cty" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35" + [[package]] name = "darling" version = "0.23.0" @@ -697,7 +703,6 @@ dependencies = [ name = "fff-query-parser" version = "0.4.2" dependencies = [ - "criterion", "zlob", ] @@ -721,6 +726,7 @@ dependencies = [ "grep-matcher", "heed", "ignore", + "libmimalloc-sys", "memchr", "memmap2", "neo_frizbee", @@ -1354,6 +1360,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "667f4fec20f29dfc6bc7357c582d91796c169ad7e2fce709468aefeb2c099870" dependencies = [ "cc", + "cty", "libc", ] diff --git a/crates/fff-c/src/ffi_types.rs b/crates/fff-c/src/ffi_types.rs index 21c47d1..66b007f 100644 --- a/crates/fff-c/src/ffi_types.rs +++ b/crates/fff-c/src/ffi_types.rs @@ -85,9 +85,9 @@ impl From<&FileItem> for FffFileItem { git_status: cstring_new(format_git_status(item.git_status)), size: item.size, modified: item.modified, - access_frecency_score: item.access_frecency_score, - modification_frecency_score: item.modification_frecency_score, - total_frecency_score: item.total_frecency_score, + access_frecency_score: item.access_frecency_score as i64, + modification_frecency_score: item.modification_frecency_score as i64, + total_frecency_score: item.total_frecency_score as i64, is_binary: item.is_binary, } } @@ -322,9 +322,9 @@ impl FffGrepMatch { context_after, size: file.size, modified: file.modified, - total_frecency_score: file.total_frecency_score, - access_frecency_score: file.access_frecency_score, - modification_frecency_score: file.modification_frecency_score, + total_frecency_score: file.total_frecency_score as i64, + access_frecency_score: file.access_frecency_score as i64, + modification_frecency_score: file.modification_frecency_score as i64, line_number: m.line_number, byte_offset: m.byte_offset, col: m.col as u32, diff --git a/crates/fff-c/src/lib.rs b/crates/fff-c/src/lib.rs index 2895b26..c9216e0 100644 --- a/crates/fff-c/src/lib.rs +++ b/crates/fff-c/src/lib.rs @@ -403,8 +403,14 @@ pub unsafe extern "C" fn fff_live_grep( classify_definitions, }; - let result = - fff::grep::grep_search(picker.get_files(), &parsed, &options, picker.cache_budget()); + let result = fff::grep::grep_search( + picker.get_files(), + &parsed, + &options, + picker.cache_budget(), + None, + None, + ); let grep_result = FffGrepResult::from_core(&result); FffResult::ok_handle(grep_result as *mut c_void) } diff --git a/crates/fff-core/Cargo.toml b/crates/fff-core/Cargo.toml index a2cbd0d..68dd4df 100644 --- a/crates/fff-core/Cargo.toml +++ b/crates/fff-core/Cargo.toml @@ -14,6 +14,9 @@ crate-type = ["rlib", "staticlib", "cdylib"] default = [] # Enable C FFI exports ffi = [] +# Call mi_collect(true) after large allocator churn (bigram build). +# Requires mimalloc to be the global allocator (linked by fff-nvim). +mimalloc-collect = ["dep:libmimalloc-sys"] # Use zlob (Zig-compiled C globbing library) for glob matching. # Requires Zig to be installed. When disabled, falls back to globset (pure Rust). zlob = ["dep:zlob", "fff-query-parser/zlob"] @@ -55,6 +58,7 @@ toml = "0.8" tracing-appender = "0.2" tracing-subscriber = { version = "0.3", features = ["env-filter"] } zlob = { workspace = true, optional = true } +libmimalloc-sys = { version = "0.1", optional = true, features = ["extended"] } # Platform-specific: dunce for Windows to avoid \\?\ extended path prefix [target.'cfg(windows)'.dependencies] dunce = { workspace = true } @@ -63,3 +67,11 @@ dunce = { workspace = true } criterion = { version = "0.5", features = ["html_reports"] } rand = { version = "0.8", features = ["small_rng"] } tempfile = "3.8" + +[[bench]] +name = "parse_bench" +harness = false + +[[bench]] +name = "bigram_bench" +harness = false diff --git a/crates/fff-core/benches/bigram_bench.rs b/crates/fff-core/benches/bigram_bench.rs new file mode 100644 index 0000000..8e8b461 --- /dev/null +++ b/crates/fff-core/benches/bigram_bench.rs @@ -0,0 +1,119 @@ +use criterion::{BenchmarkId, Criterion, black_box, criterion_group, criterion_main}; +use fff_search::types::{BigramFilter, BigramIndexBuilder}; + +/// Build a realistic bigram index for benchmarking. +/// Simulates a large repo by generating varied content per file. +fn build_test_index(file_count: usize) -> BigramFilter { + let builder = BigramIndexBuilder::new(file_count); + + for i in 0..file_count { + // Generate varied content so we get a mix of sparse and dense columns + let content = format!( + "struct File{i} {{ fn process() {{ let controller = read(path); }} }} // module {i}" + ); + builder.add_file_content(i, content.as_bytes()); + } + + builder.compress() +} + +fn bench_bigram_query(c: &mut Criterion) { + let file_counts = [10_000, 100_000, 500_000]; + + for &file_count in &file_counts { + let index = build_test_index(file_count); + eprintln!( + "Index ({} files): {} columns ({} dense, {} sparse)", + file_count, + index.columns_used(), + index.dense_columns(), + index.sparse_columns(), + ); + + let mut group = c.benchmark_group(format!("bigram_query_{file_count}")); + group.sample_size(500); + + let queries: &[(&str, &[u8])] = &[ + ("short_2char", b"st"), + ("medium_6char", b"struct"), + ("long_14char", b"let controller"), + ("multi_word", b"fn process"), + ]; + + for (name, query) in queries { + group.bench_with_input(BenchmarkId::from_parameter(name), query, |b, q| { + b.iter(|| { + let result = index.query(black_box(q)); + black_box(&result); + }); + }); + } + + group.finish(); + } +} + +fn bench_bigram_is_candidate(c: &mut Criterion) { + let index = build_test_index(500_000); + let candidates = index.query(b"struct").unwrap(); + + c.bench_function("is_candidate_500k", |b| { + b.iter(|| { + let mut count = 0u32; + for i in 0..500_000 { + if BigramFilter::is_candidate(black_box(&candidates), i) { + count += 1; + } + } + black_box(count) + }); + }); + + c.bench_function("count_candidates_500k", |b| { + b.iter(|| BigramFilter::count_candidates(black_box(&candidates))); + }); +} + +fn bench_bigram_build(c: &mut Criterion) { + let mut group = c.benchmark_group("bigram_build"); + group.sample_size(10); + + let file_counts = [10_000, 100_000]; + + for &file_count in &file_counts { + // Pre-generate content so we only measure index building + let contents: Vec = (0..file_count) + .map(|i| { + format!( + "struct File{i} {{ fn process() {{ let controller = read(path); }} }} // mod {i}" + ) + }) + .collect(); + + group.bench_with_input( + BenchmarkId::new("build_and_compress", file_count), + &file_count, + |b, &fc| { + b.iter(|| { + let builder = BigramIndexBuilder::new(fc); + for (i, content) in contents.iter().enumerate() { + builder.add_file_content(i, content.as_bytes()); + } + let index = builder.compress(); + black_box(index.columns_used()) + }); + }, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + bench_bigram_query, + bench_bigram_is_candidate, + bench_bigram_build, +); + +criterion_main!(benches); diff --git a/crates/fff-query-parser/benches/parse_bench.rs b/crates/fff-core/benches/parse_bench.rs similarity index 100% rename from crates/fff-query-parser/benches/parse_bench.rs rename to crates/fff-core/benches/parse_bench.rs diff --git a/crates/fff-core/src/constraints.rs b/crates/fff-core/src/constraints.rs index 895ee41..734d908 100644 --- a/crates/fff-core/src/constraints.rs +++ b/crates/fff-core/src/constraints.rs @@ -12,6 +12,32 @@ use smallvec::SmallVec; use crate::git::is_modified_status; +/// Case-insensitive ASCII substring search without allocation. +/// `needle` must already be lowercase. +#[inline] +fn contains_ascii_ci(haystack: &str, needle: &str) -> bool { + let h = haystack.as_bytes(); + let n = needle.as_bytes(); + if n.len() > h.len() { + return false; + } + if n.is_empty() { + return true; + } + let first = n[0]; + for i in 0..=(h.len() - n.len()) { + if h[i].to_ascii_lowercase() == first + && h[i..i + n.len()] + .iter() + .zip(n) + .all(|(a, b)| a.to_ascii_lowercase() == *b) + { + return true; + } + } + false +} + /// Minimum item count before switching to parallel iteration with rayon. /// Below this threshold, the overhead of thread pool dispatch outweighs the benefit. const PAR_THRESHOLD: usize = 10_000; @@ -22,9 +48,6 @@ pub trait Constrainable { /// The file's relative path (e.g. "src/main.rs") fn relative_path(&self) -> &str; - /// The file's lowercased relative path for case-insensitive matching - fn relative_path_lower(&self) -> &str; - /// The file name component (e.g. "main.rs") fn file_name(&self) -> &str; @@ -152,7 +175,7 @@ fn item_matches_constraint_at_index( } // only works with negation - Constraint::Text(text) => item.relative_path_lower().contains(text), + Constraint::Text(text) => contains_ascii_ci(item.relative_path(), text), // Parts and Exclude are handled at a higher level Constraint::Parts(_) | Constraint::Exclude(_) | Constraint::FileType(_) => true, diff --git a/crates/fff-core/src/file_picker.rs b/crates/fff-core/src/file_picker.rs index af4dd11..0ef5a31 100644 --- a/crates/fff-core/src/file_picker.rs +++ b/crates/fff-core/src/file_picker.rs @@ -37,13 +37,15 @@ use crate::git::GitStatusCache; use crate::grep::{GrepResult, GrepSearchOptions, grep_search}; use crate::query_tracker::QueryTracker; use crate::score::match_and_score_files; -use crate::types::{ContentCacheBudget, FileItem, PaginationArgs, ScoringContext, SearchResult}; +use crate::types::{ + BigramFilter, BigramIndexBuilder, BigramOverlay, ContentCacheBudget, FileItem, PaginationArgs, + ScoringContext, SearchResult, +}; use crate::{SharedFrecency, SharedPicker}; use fff_query_parser::FFFQuery; use git2::{Repository, Status, StatusOptions}; use rayon::prelude::*; use std::fmt::Debug; -use std::io::Read; use std::path::{Path, PathBuf}; use std::sync::{ Arc, @@ -81,8 +83,11 @@ pub struct FuzzySearchOptions<'a> { #[derive(Debug, Clone)] struct FileSync { - /// Files sorted by path for binary search + /// Base files sorted by path for binary search. Deletions use tombstones + /// (`is_deleted = true`) to keep indices stable for the bigram index. files: Vec, + /// Files added since the last full reindex. Not in the base bigram index. + overflow: Vec, pub git_workdir: Option, } @@ -90,6 +95,7 @@ impl FileSync { fn new() -> Self { Self { files: Vec::new(), + overflow: Vec::new(), git_workdir: None, } } @@ -100,6 +106,7 @@ impl FileSync { &self.files } + #[allow(dead_code)] fn get_file(&self, index: usize) -> Option<&FileItem> { self.files.get(index) } @@ -129,6 +136,7 @@ impl FileSync { } /// Remove file at index. Simple - no HashMap to maintain! + #[allow(dead_code)] fn remove_file(&mut self, index: usize) { if index < self.files.len() { self.files.remove(index); @@ -161,6 +169,17 @@ impl FileSync { impl FileItem { pub fn new(path: PathBuf, base_path: &Path, git_status: Option) -> Self { + let metadata = std::fs::metadata(&path).ok(); + Self::new_with_metadata(path, base_path, git_status, metadata.as_ref()) + } + + /// Create a FileItem using pre-fetched metadata to avoid a redundant stat syscall. + pub fn new_with_metadata( + path: PathBuf, + base_path: &Path, + git_status: Option, + metadata: Option<&std::fs::Metadata>, + ) -> Self { let relative_path = pathdiff::diff_paths(&path, base_path) .unwrap_or_else(|| path.clone()) .to_string_lossy() @@ -172,8 +191,8 @@ impl FileItem { .to_string_lossy() .into_owned(); - let (size, modified) = match std::fs::metadata(&path) { - Ok(metadata) => { + let (size, modified) = match metadata { + Some(metadata) => { let size = metadata.len(); let modified = metadata .modified() @@ -183,10 +202,12 @@ impl FileItem { (size, modified) } - Err(_) => (0, 0), + None => (0, 0), }; - let is_binary = detect_binary(&path, size); + // Fast extension-based binary detection avoids opening every file during scan. + // Files not caught here are detected when content is first loaded. + let is_binary = is_known_binary_extension(&path); Self::new_raw( path, @@ -204,9 +225,9 @@ impl FileItem { tracker: &FrecencyTracker, mode: FFFMode, ) -> Result<(), Error> { - self.access_frecency_score = tracker.get_access_score(&self.path, mode); + self.access_frecency_score = tracker.get_access_score(&self.path, mode) as i32; self.modification_frecency_score = - tracker.get_modification_score(self.modified, self.git_status, mode); + tracker.get_modification_score(self.modified, self.git_status, mode) as i32; self.total_frecency_score = self.access_frecency_score + self.modification_frecency_score; Ok(()) @@ -227,6 +248,12 @@ pub struct FilePicker { cancelled: Arc, mode: FFFMode, pub cache_budget: Arc, + /// Inverted bigram index for O(K × N/64) grep prefiltering. + /// Built during warmup phase; `None` until warmup completes. + pub bigram_index: Option>, + /// Incremental overlay tracking file changes since the base bigram index + /// was built. Updated by the background watcher on every file event. + pub bigram_overlay: Option>>, } impl std::fmt::Debug for FilePicker { @@ -271,6 +298,10 @@ impl FilePicker { self.sync_data.files() } + pub fn get_overflow_files(&self) -> &[FileItem] { + &self.sync_data.overflow + } + /// Create a new FilePicker and place it into the provided shared handle. /// /// The background scan thread and file-system watcher write into the @@ -312,6 +343,8 @@ impl FilePicker { cancelled: Arc::clone(&cancelled), mode, cache_budget: Arc::new(ContentCacheBudget::default()), + bigram_index: None, + bigram_overlay: None, }; // Place the picker into the shared handle before spawning the @@ -431,7 +464,7 @@ impl FilePicker { options: &GrepSearchOptions, budget: &ContentCacheBudget, ) -> GrepResult<'a> { - grep_search(files, query, options, budget) + grep_search(files, query, options, budget, None, None) } // Returns an ongoing or finisshed scan progress @@ -572,76 +605,122 @@ impl FilePicker { #[tracing::instrument(skip(self), name = "timing_update", level = Level::DEBUG)] pub fn on_create_or_modify(&mut self, path: impl AsRef + Debug) -> Option<&FileItem> { let path = path.as_ref(); - match self.sync_data.find_file_index(path) { - Ok(pos) => { + + // Check if this is a tombstoned base file being re-created. + if let Ok(pos) = self.sync_data.find_file_index(path) { + let file = self.sync_data.get_file_mut(pos)?; + + if file.is_deleted { + // Resurrect tombstoned file. + file.is_deleted = false; debug!( - "on_create_or_modify: file EXISTS at index {}, updating metadata", + "on_create_or_modify: resurrected tombstoned file at index {}", pos ); - // File exists - update its metadata (doesn't change indices, safe) - let file = self.sync_data.get_file_mut(pos)?; + } - let modified = match std::fs::metadata(path) { - Ok(metadata) => metadata - .modified() - .ok() - .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok()), - Err(e) => { - error!("Failed to get metadata for {}: {}", path.display(), e); - None - } - }; - - if let Some(modified) = modified { - let modified = modified.as_secs(); - if file.modified < modified { - file.modified = modified; - - // TODO figure out if we actually need to remap the memory or invalidate - // mapping here because on linux and macos with the shared map opening it - // should be automatically available everywhere automatically which saves - // some time from doing extra remapping on every search - file.invalidate_mmap(&self.cache_budget); - } + debug!( + "on_create_or_modify: file EXISTS at index {}, updating metadata", + pos + ); + + let modified = match std::fs::metadata(path) { + Ok(metadata) => metadata + .modified() + .ok() + .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok()), + Err(e) => { + error!("Failed to get metadata for {}: {}", path.display(), e); + None } + }; - Some(&*file) // Convert &mut to & + if let Some(modified) = modified { + let modified = modified.as_secs(); + if file.modified < modified { + file.modified = modified; + file.invalidate_mmap(&self.cache_budget); + } } - Err(pos) => { - debug!( - "on_create_or_modify: file NEW, inserting at index {} (total files: {})", - pos, - self.sync_data.files().len() - ); - let file_item = FileItem::new(path.to_path_buf(), &self.base_path, None); - let path_buf = file_item.path.clone(); + // Update the bigram overlay for this modified file. + if let Some(ref overlay) = self.bigram_overlay + && let Ok(content) = std::fs::read(path) + { + overlay.write().modify_file(pos, &content); + } - self.sync_data.insert_file(pos, file_item); - let result = self.sync_data.get_file(pos); + return Some(&*file); + } - if result.is_none() { - error!( - "on_create_or_modify: FAILED to find file after insert! path={:?}", - path_buf - ); - } else { - debug!("on_create_or_modify: successfully inserted and found file"); + // Check overflow for existing added files. + if let Some(overflow_pos) = self.sync_data.overflow.iter().position(|f| f.path == path) { + let file = &mut self.sync_data.overflow[overflow_pos]; + let modified = std::fs::metadata(path) + .ok() + .and_then(|m| m.modified().ok()) + .and_then(|t| t.duration_since(SystemTime::UNIX_EPOCH).ok()); + if let Some(modified) = modified { + let modified = modified.as_secs(); + if file.modified < modified { + file.modified = modified; + file.invalidate_mmap(&self.cache_budget); } - - result } + // Update overflow entry in overlay. + if let Some(ref overlay) = self.bigram_overlay + && let Ok(content) = std::fs::read(path) + { + let bigrams = crate::types::extract_bigrams(&content); + overlay.write().update_added(overflow_pos, bigrams); + } + return Some(&self.sync_data.overflow[overflow_pos]); + } + + // New file — append to overflow (preserves base indices for bigram). + debug!( + "on_create_or_modify: file NEW, appending to overflow (base: {}, overflow: {})", + self.sync_data.files().len(), + self.sync_data.overflow.len(), + ); + + let file_item = FileItem::new(path.to_path_buf(), &self.base_path, None); + self.sync_data.overflow.push(file_item); + + if let Some(ref overlay) = self.bigram_overlay { + let content = std::fs::read(path).unwrap_or_default(); + overlay.write().add_file(&content); } + + self.sync_data.overflow.last() } + /// Tombstone a file instead of removing it, keeping base indices stable. pub fn remove_file_by_path(&mut self, path: impl AsRef) -> bool { let path = path.as_ref(); match self.sync_data.find_file_index(path) { Ok(index) => { - self.sync_data.remove_file(index); + let file = &mut self.sync_data.files[index]; + file.is_deleted = true; + file.invalidate_mmap(&self.cache_budget); + if let Some(ref overlay) = self.bigram_overlay { + overlay.write().delete_file(index); + } true } - Err(_) => false, + Err(_) => { + // Check overflow for added files — these can be removed directly + // since they aren't in the base bigram index. + if let Some(pos) = self.sync_data.overflow.iter().position(|f| f.path == path) { + self.sync_data.overflow.remove(pos); + if let Some(ref overlay) = self.bigram_overlay { + overlay.write().remove_added(pos); + } + true + } else { + false + } + } } } @@ -673,26 +752,36 @@ impl FilePicker { self.is_scanning.store(true, Ordering::Relaxed); self.scanned_files_count.store(0, Ordering::Relaxed); - let scan_result = scan_filesystem( + let walk_result = walk_filesystem( &self.base_path, &self.scanned_files_count, shared_frecency, self.mode, ); - match scan_result { - Ok(sync) => { + match walk_result { + Ok(walk) => { info!( - "Filesystem scan completed: found {} files", - sync.files.len() + "Filesystem rescan completed: found {} files", + walk.sync.files.len() ); - self.sync_data = sync; - // Old FileItems (and their mmaps) were dropped — reset the budget. + self.sync_data = walk.sync; self.cache_budget.reset(); + // Apply git status synchronously for rescan (typically fast). + if let Ok(Some(git_cache)) = walk.git_handle.join() { + let frecency = shared_frecency.read().ok(); + let frecency_ref = frecency.as_ref().and_then(|f| f.as_ref()); + self.sync_data.files.par_iter_mut().for_each(|file| { + file.git_status = git_cache.lookup_status(&file.path); + if let Some(frecency) = frecency_ref { + let _ = file.update_frecency_scores(frecency, self.mode); + } + }); + } + if self.warmup_mmap_cache { - // Warmup in background to avoid blocking - let files = self.sync_data.files().to_vec(); // Clone all files + let files = self.sync_data.files().to_vec(); let budget = Arc::clone(&self.cache_budget); std::thread::spawn(move || { warmup_mmaps(&files, &budget); @@ -769,26 +858,28 @@ fn spawn_scan_and_watcher( info!("Starting initial file scan"); let mut git_workdir = None; - match scan_filesystem(&base_path, &synced_files_count, &shared_frecency, mode) { - Ok(sync) => { + + match walk_filesystem(&base_path, &synced_files_count, &shared_frecency, mode) { + Ok(walk) => { if cancelled.load(Ordering::Acquire) { - info!("Scan completed but picker was replaced, discarding results"); + info!("Walk completed but picker was replaced, discarding results"); scan_signal.store(false, Ordering::Relaxed); return; } info!( - "Initial filesystem scan completed: found {} files", - sync.files.len() + "Initial filesystem walk completed: found {} files", + walk.sync.files.len() ); - git_workdir = sync.git_workdir.clone(); + git_workdir = walk.sync.git_workdir.clone(); + let git_handle = walk.git_handle; - // Write results into the provided shared handle. + // Write files immediately — they are now searchable even + // before git status or warmup completes. let write_result = shared_picker.write().ok().map(|mut guard| { if let Some(ref mut picker) = *guard { - picker.sync_data = sync; - // Old FileItems (and their mmaps) were dropped — reset the budget. + picker.sync_data = walk.sync; picker.cache_budget.reset(); } }); @@ -797,20 +888,91 @@ fn spawn_scan_and_watcher( error!("Failed to write scan results into picker"); } - // OPTIMIZATION: Warmup mmap cache in background to avoid blocking first grep. - if warmup_mmap_cache - && !cancelled.load(Ordering::Acquire) - && let Ok(guard) = shared_picker.read() - && let Some(ref picker) = *guard - { - warmup_mmaps(picker.sync_data.files(), &picker.cache_budget); + // Signal scan complete — files are searchable. + scan_signal.store(false, Ordering::Relaxed); + info!("Files indexed and searchable"); + + if warmup_mmap_cache && !cancelled.load(Ordering::Acquire) { + let phase_start = std::time::Instant::now(); + + // Scale cache limits based on repo size. + if let Ok(mut guard) = shared_picker.write() + && let Some(ref mut picker) = *guard + { + let file_count = picker.sync_data.files().len(); + picker.cache_budget = + Arc::new(ContentCacheBudget::new_for_repo(file_count)); + info!( + "Cache budget configured for {} files: max_files={}, max_bytes={}", + file_count, + picker.cache_budget.max_files, + picker.cache_budget.max_bytes, + ); + } + + // Warmup: read top-frecency files into cache. + if !cancelled.load(Ordering::Acquire) + && let Ok(guard) = shared_picker.read() + && let Some(ref picker) = *guard + { + let warmup_start = std::time::Instant::now(); + warmup_mmaps(picker.sync_data.files(), &picker.cache_budget); + info!( + "Warmup completed in {:.2}s (cached {} files, {} bytes)", + warmup_start.elapsed().as_secs_f64(), + picker.cache_budget.cached_count.load(Ordering::Relaxed), + picker.cache_budget.cached_bytes.load(Ordering::Relaxed), + ); + } + + // Build bigram index without holding the lock. + if !cancelled.load(Ordering::Acquire) { + let snapshot = shared_picker.read().ok().and_then(|guard| { + guard.as_ref().map(|picker| { + ( + picker.sync_data.files().to_vec(), + Arc::clone(&picker.cache_budget), + ) + }) + }); + + if let Some((files, budget)) = snapshot { + let bigram_start = std::time::Instant::now(); + info!("Starting bigram index build for {} files...", files.len()); + let index = build_bigram_index(&files, &budget); + info!( + "Bigram index ready in {:.2}s", + bigram_start.elapsed().as_secs_f64(), + ); + + if let Ok(mut guard) = shared_picker.write() + && let Some(ref mut picker) = *guard + { + let file_count = picker.sync_data.files().len(); + picker.bigram_index = Some(Arc::new(index)); + picker.bigram_overlay = Some(Arc::new(parking_lot::RwLock::new( + BigramOverlay::new(file_count), + ))); + } + } + } + + info!( + "Post-scan warmup + bigram total: {:.2}s", + phase_start.elapsed().as_secs_f64(), + ); + } + + // Apply git status (may still be running — this waits for it). + if !cancelled.load(Ordering::Acquire) { + apply_git_status(&shared_picker, &shared_frecency, git_handle, mode); } } Err(e) => { error!("Initial scan failed: {:?}", e); + scan_signal.store(false, Ordering::Relaxed); } } - scan_signal.store(false, Ordering::Relaxed); // Don't create a watcher if this picker instance was already replaced if cancelled.load(Ordering::Acquire) { @@ -869,7 +1031,8 @@ fn spawn_scan_and_watcher( #[tracing::instrument(skip(files), name = "warmup_mmaps", level = Level::DEBUG)] fn warmup_mmaps(files: &[FileItem], budget: &ContentCacheBudget) { let max_files = budget.max_files; - let max_bytes: u64 = 512 * 1024 * 1024; + let max_bytes = budget.max_bytes; + let max_file_size = budget.max_file_size; // Single collect — no pre-filter. The comparator in select_nth pushes // ineligible files (binary, empty) to the tail automatically. @@ -901,7 +1064,7 @@ fn warmup_mmaps(files: &[FileItem], budget: &ContentCacheBudget) { return; } - if file.is_binary || file.size == 0 || file.size > 5 * 1024 * 1024 { + if file.is_binary || file.size == 0 || file.size > max_file_size { return; } @@ -912,131 +1075,221 @@ fn warmup_mmaps(files: &[FileItem], budget: &ContentCacheBudget) { return; } - if let Some(content) = file.get_mmap(budget) { + if let Some(content) = file.get_content(budget) { let _ = std::hint::black_box(content.first()); } }); } -fn scan_filesystem( +/// Build an inverted bigram index from all files in the index. +/// +/// For each non-binary, non-empty file: reads content (or uses cached mmap), +/// populates the per-file bigram bloom filter, and adds it to the inverted index. +/// Uses rayon for parallel processing. +pub fn build_bigram_index(files: &[FileItem], budget: &ContentCacheBudget) -> BigramFilter { + let start = std::time::Instant::now(); + info!("Building bigram index for {} files...", files.len()); + let builder = BigramIndexBuilder::new(files.len()); + let max_file_size = budget.max_file_size; + + files.par_iter().enumerate().for_each(|(i, file)| { + if file.is_binary || file.size == 0 || file.size > max_file_size { + return; + } + // Use cached content if available (no extra memory). + // For uncached files, use read() instead of mmap() — heap memory is + // freed immediately on drop, while mmap pages linger in RSS on macOS. + if let Some(cached) = file.get_content(budget) { + // Catch binary files not detected by extension heuristic + if !detect_binary_content(cached) { + builder.add_file_content(i, cached); + } + } else if let Ok(data) = std::fs::read(&file.path) + && !detect_binary_content(&data) + { + builder.add_file_content(i, &data); + } + }); + + let cols = builder.columns_used(); + let index = builder.compress(); + + // The builder just freed ~276 MB (for 500k files) of atomic bitsets. + // Hint the allocator to return those pages to the OS. + hint_allocator_collect(); + + info!( + "Bigram index built in {:.2}s — {} columns ({} sparse, {} dense) for {} files", + start.elapsed().as_secs_f64(), + cols, + index.sparse_columns(), + index.dense_columns(), + files.len(), + ); + index +} + +/// Result of the fast walk phase — files are searchable immediately, +/// git status arrives later via the join handle. +struct WalkResult { + sync: FileSync, + git_handle: std::thread::JoinHandle>, +} + +/// Phase 1: walk the filesystem and discover the git root. +/// Returns files immediately (searchable) and a handle to the in-progress +/// git status computation. This avoids blocking on `git status` which can +/// take 10+ seconds on very large repos (e.g. chromium). +fn walk_filesystem( base_path: &Path, synced_files_count: &Arc, shared_frecency: &SharedFrecency, mode: FFFMode, -) -> Result { +) -> Result { use ignore::{WalkBuilder, WalkState}; - use std::thread; let scan_start = std::time::Instant::now(); - info!("SCAN: Starting parallel filesystem scan and git status"); + info!("SCAN: Starting filesystem walk and git status (async)"); - // run separate thread for git status because it effectively does another separate file - // traversal which could be pretty slow on large repos (in general 300-500ms) - thread::scope(|s| { - let git_handle = s.spawn(|| { - let git_workdir = Repository::discover(base_path) - .ok() - .and_then(|repo| repo.workdir().map(Path::to_path_buf)); + // Discover git root (fast — just walks up looking for .git/) + let git_workdir = Repository::discover(base_path) + .ok() + .and_then(|repo| repo.workdir().map(Path::to_path_buf)); - if let Some(ref git_dir) = git_workdir { - debug!("Git repository found at: {}", git_dir.display()); - } else { - debug!("No git repository found for path: {}", base_path.display()); - } + if let Some(ref git_dir) = git_workdir { + debug!("Git repository found at: {}", git_dir.display()); + } else { + debug!("No git repository found for path: {}", base_path.display()); + } - let status_cache = GitStatusCache::read_git_status( - git_workdir.as_deref(), - // do not include unmodified here to avoid extra cost - // we are treating all missing files as unmodified - StatusOptions::new() - .include_untracked(true) - .recurse_untracked_dirs(true) - .exclude_submodules(true), - ); + // Spawn git status on a detached thread — we won't wait for it here. + let git_workdir_for_status = git_workdir.clone(); + let git_handle = std::thread::spawn(move || { + GitStatusCache::read_git_status( + git_workdir_for_status.as_deref(), + StatusOptions::new() + .include_untracked(true) + .recurse_untracked_dirs(true) + .exclude_submodules(true), + ) + }); - (git_workdir, status_cache) - }); + // Walk files (the fast part, typically 2-3s even on huge repos). + let walker = WalkBuilder::new(base_path) + .hidden(false) + .git_ignore(true) + .git_exclude(true) + .git_global(true) + .ignore(true) + .follow_links(false) + .build_parallel(); + + let walker_start = std::time::Instant::now(); + debug!("SCAN: Starting file walker"); + + let files = parking_lot::Mutex::new(Vec::new()); + walker.run(|| { + let files = &files; + let counter = Arc::clone(synced_files_count); + let base_path = base_path.to_path_buf(); + + Box::new(move |result| { + if let Ok(entry) = result + && entry.file_type().is_some_and(|ft| ft.is_file()) + { + let path = entry.path(); + + if is_git_file(path) { + return WalkState::Continue; + } - let walker = WalkBuilder::new(base_path) - .hidden(false) - .git_ignore(true) - .git_exclude(true) - .git_global(true) - .ignore(true) - .follow_links(false) - .build_parallel(); - - let walker_start = std::time::Instant::now(); - debug!("SCAN: Starting file walker"); - - let files = Arc::new(std::sync::Mutex::new(Vec::new())); - walker.run(|| { - let files = Arc::clone(&files); - let counter = Arc::clone(synced_files_count); - let base_path = base_path.to_path_buf(); - - Box::new(move |result| { - if let Ok(entry) = result - && entry.file_type().is_some_and(|ft| ft.is_file()) - { - let path = entry.path(); - - if is_git_file(path) { - return WalkState::Continue; - } + let metadata = entry.metadata().ok(); + let file_item = FileItem::new_with_metadata( + path.to_path_buf(), + &base_path, + None, + metadata.as_ref(), + ); - let file_item = FileItem::new( - path.to_path_buf(), - &base_path, - None, // Git status will be added after join - ); + files.lock().push(file_item); + counter.fetch_add(1, Ordering::Relaxed); + } + WalkState::Continue + }) + }); - if let Ok(mut files_vec) = files.lock() { - files_vec.push(file_item); - counter.fetch_add(1, Ordering::Relaxed); - } - } - WalkState::Continue - }) - }); + let mut files = files.into_inner(); + info!( + "SCAN: File walking completed in {:?} for {} files", + walker_start.elapsed(), + files.len(), + ); + + // Apply frecency scores (access-based only — git status not yet available). + let frecency = shared_frecency + .read() + .map_err(|_| Error::AcquireFrecencyLock)?; + if let Some(frecency) = frecency.as_ref() { + files + .par_iter_mut() + .try_for_each(|file| file.update_frecency_scores(frecency, mode))?; + } + drop(frecency); - let mut files = Arc::try_unwrap(files).unwrap().into_inner().unwrap(); - let walker_time = walker_start.elapsed(); - info!("SCAN: File walking completed in {:?}", walker_time); + files.par_sort_unstable_by(|a, b| a.path.as_os_str().cmp(b.path.as_os_str())); - let (git_workdir, git_cache) = git_handle.join().map_err(|_| { - error!("Failed to join git status thread"); - Error::ThreadPanic - })?; + let total_time = scan_start.elapsed(); + info!("SCAN: Walk + frecency completed in {:?}", total_time); - let frecency = shared_frecency - .read() - .map_err(|_| Error::AcquireFrecencyLock)?; + Ok(WalkResult { + sync: FileSync { + files, + overflow: Vec::new(), + git_workdir, + }, + git_handle, + }) +} - files - .par_iter_mut() - .try_for_each(|file| -> Result<(), Error> { - if let Some(git_cache) = &git_cache { - file.git_status = git_cache.lookup_status(&file.path); - } +/// Phase 2: apply git status to already-indexed files and recalculate +/// frecency scores that depend on it. +fn apply_git_status( + shared_picker: &SharedPicker, + shared_frecency: &SharedFrecency, + git_handle: std::thread::JoinHandle>, + mode: FFFMode, +) { + let join_start = std::time::Instant::now(); + let git_cache = match git_handle.join() { + Ok(cache) => cache, + Err(_) => { + error!("Git status thread panicked"); + return; + } + }; + info!("SCAN: Git status ready in {:?}", join_start.elapsed()); - if let Some(frecency) = frecency.as_ref() { - file.update_frecency_scores(frecency, mode)?; - } + let Some(git_cache) = git_cache else { return }; - Ok(()) - })?; + if let Ok(mut guard) = shared_picker.write() + && let Some(ref mut picker) = *guard + { + let frecency = shared_frecency.read().ok(); + let frecency_ref = frecency.as_ref().and_then(|f| f.as_ref()); + + picker.sync_data.files.par_iter_mut().for_each(|file| { + file.git_status = git_cache.lookup_status(&file.path); + if let Some(frecency) = frecency_ref { + let _ = file.update_frecency_scores(frecency, mode); + } + }); - let total_time = scan_start.elapsed(); info!( - "SCAN: Total scan time {:?} for {} files", - total_time, - files.len() + "SCAN: Applied git status to {} files ({} dirty)", + picker.sync_data.files.len(), + git_cache.statuses_len(), ); - - files.par_sort_unstable_by(|a, b| a.path.as_os_str().cmp(b.path.as_os_str())); - Ok(FileSync { files, git_workdir }) - }) + } } #[inline] @@ -1050,21 +1303,56 @@ fn is_git_file(path: &Path) -> bool { }) } -/// Detect if a file is binary by checking for NUL bytes in the first 512 bytes. -/// This is the same heuristic used by git and grep — simple, fast, and sufficient. +/// Fast extension-based binary detection. Avoids opening files during scan. +/// Covers the vast majority of binary files in typical repositories. #[inline] -fn detect_binary(path: &Path, size: u64) -> bool { - // Empty files are not binary - if size == 0 { - return false; - } - - let Ok(file) = std::fs::File::open(path) else { +fn is_known_binary_extension(path: &Path) -> bool { + let Some(ext) = path.extension().and_then(|e| e.to_str()) else { return false; }; - let mut reader = std::io::BufReader::with_capacity(1024, file); + matches!( + ext, + // Images + "png" | "jpg" | "jpeg" | "gif" | "bmp" | "ico" | "webp" | "tiff" | "tif" | "avif" | "heic" | + // Video/Audio + "mp4" | "avi" | "mov" | "wmv" | "mkv" | "mp3" | "wav" | "flac" | "ogg" | "m4a" | "aac" | + // Compressed + "zip" | "tar" | "gz" | "bz2" | "xz" | "7z" | "rar" | "zst" | "lz4" | "lzma" | + // Executables/Libraries + "exe" | "dll" | "so" | "dylib" | "o" | "a" | "lib" | "bin" | "elf" | + // Documents + "pdf" | "doc" | "docx" | "xls" | "xlsx" | "ppt" | "pptx" | + // Databases + "db" | "sqlite" | "sqlite3" | "mdb" | + // Fonts + "ttf" | "otf" | "woff" | "woff2" | "eot" | + // Other binary + "class" | "pyc" | "pyo" | "wasm" | "dex" | "jar" | "war" | + // Lock/package files that are binary + "lock" | + // Data/serialized + "parquet" | "arrow" | "pb" | "protobuf" + ) +} - let mut buf = [0u8; 512]; - let n = reader.read(&mut buf).unwrap_or(0); - buf[..n].contains(&0) +/// Detect binary content by checking for NUL bytes in the first 512 bytes. +/// Called lazily when file content is first loaded, not during initial scan. +#[inline] +pub(crate) fn detect_binary_content(content: &[u8]) -> bool { + let check_len = content.len().min(512); + content[..check_len].contains(&0) +} + +/// Ask the global allocator to return freed pages to the OS. +/// Enabled via the `mimalloc-collect` feature (set by fff-nvim). +/// No-op when the feature is off (tests, system allocator). +fn hint_allocator_collect() { + #[cfg(feature = "mimalloc-collect")] + { + // Collect every rayon worker thread's mimalloc heap — the bigram + // builder allocated across all of them. + rayon::broadcast(|_| unsafe { libmimalloc_sys::mi_collect(true) }); + // Main thread too. + unsafe { libmimalloc_sys::mi_collect(true) }; + } } diff --git a/crates/fff-core/src/grep.rs b/crates/fff-core/src/grep.rs index bbb5a2d..f96a3b0 100644 --- a/crates/fff-core/src/grep.rs +++ b/crates/fff-core/src/grep.rs @@ -7,7 +7,7 @@ use crate::constraints::apply_constraints; use crate::sort_buffer::sort_with_buffer; -use crate::types::{ContentCacheBudget, FileItem}; +use crate::types::{BigramFilter, BigramOverlay, ContentCacheBudget, FileItem, extract_bigrams}; use aho_corasick::AhoCorasick; use fff_grep::lines::{self, LineStep}; use fff_grep::{Searcher, SearcherBuilder, Sink, SinkMatch}; @@ -1083,6 +1083,12 @@ where } let content = file.get_content_for_search(budget)?; + + // Skip files that are binary but weren't caught by extension heuristic + if crate::file_picker::detect_binary_content(&content) { + return None; + } + let file_matches = search_file(&content, options.max_matches_per_file); if file_matches.is_empty() { @@ -1207,20 +1213,31 @@ fn prepare_files_to_search<'a>( }; let total_count = prefiltered.len(); - - // Sort by frecency (files are stored by path, not frecency) let mut sorted_files = prefiltered; - sort_with_buffer(&mut sorted_files, |a, b| { - b.total_frecency_score - .cmp(&a.total_frecency_score) - .then(b.modified.cmp(&a.modified)) - }); - - if options.file_offset < total_count { - let sorted_files = sorted_files.split_off(options.file_offset); - (sorted_files, total_count) - } else { + + // Only sort when there is meaningful frecency or modification data to rank by. + // On large repos (500k+ files) with no frecency data (fresh session, benchmark), + // skipping the O(n log n) sort saves ~200ms per query. + let needs_sort = sorted_files + .iter() + .any(|f| f.total_frecency_score != 0 || f.modified != 0); + + if needs_sort { + sort_with_buffer(&mut sorted_files, |a, b| { + b.total_frecency_score + .cmp(&a.total_frecency_score) + .then(b.modified.cmp(&a.modified)) + }); + } + + if options.file_offset > 0 && options.file_offset < total_count { + let paginated = sorted_files.split_off(options.file_offset); + (paginated, total_count) + } else if options.file_offset >= total_count { (Vec::new(), total_count) + } else { + // offset == 0: no split needed, return as-is + (sorted_files, total_count) } } @@ -1343,12 +1360,12 @@ fn fuzzy_grep_search<'a>( }; let min_chars_required = unique_count.saturating_sub(max_typos); - let time_budget = if options.time_budget_ms > 0 { + let _time_budget = if options.time_budget_ms > 0 { Some(std::time::Duration::from_millis(options.time_budget_ms)) } else { None }; - let search_start = std::time::Instant::now(); + let _search_start = std::time::Instant::now(); let budget_exceeded = AtomicBool::new(false); let max_matches_per_file = options.max_matches_per_file; @@ -1361,14 +1378,17 @@ fn fuzzy_grep_search<'a>( .map_init( || matcher.clone(), |matcher, (idx, file)| { - if let Some(budget) = time_budget - && search_start.elapsed() > budget - { - budget_exceeded.store(true, Ordering::Relaxed); - return None; - } + // if let Some(budget) = time_budget + // && search_start.elapsed() > budget + // { + // budget_exceeded.store(true, Ordering::Relaxed); + // return None; + // } let file_content = file.get_content_for_search(budget)?; + if crate::file_picker::detect_binary_content(&file_content) { + return None; + } let file_bytes: &[u8] = &file_content; // File-level prefilter: check if enough distinct needle chars @@ -1564,6 +1584,8 @@ pub fn grep_search<'a>( query: &FFFQuery<'_>, options: &GrepSearchOptions, budget: &ContentCacheBudget, + bigram_index: Option<&BigramFilter>, + bigram_overlay: Option<&parking_lot::RwLock>, ) -> GrepResult<'a> { let total_files = files.len(); @@ -1603,32 +1625,6 @@ pub fn grep_search<'a>( }; } - // Filter, sort, and paginate files (shared across all modes) - let (mut files_to_search, mut filtered_file_count) = - prepare_files_to_search(files, constraints_from_query, options); - - // If constraints yielded 0 files and we had a FilePath constraint, - // retry without it — the filename may not exist in this repo. - // Keep the original grep_text (e.g. "ActorAuth") rather than restoring - // the raw query ("nonexistent.rs ActorAuth"), since the search term - // was correctly extracted by the parser. - if files_to_search.is_empty() - && let Some(stripped) = strip_file_path_constraints(constraints_from_query) - { - let (retry_files, retry_count) = prepare_files_to_search(files, &stripped, options); - files_to_search = retry_files; - filtered_file_count = retry_count; - } - - if files_to_search.is_empty() { - return GrepResult { - total_files, - filtered_file_count, - next_file_offset: 0, - ..Default::default() - }; - } - let case_insensitive = if options.smart_case { !grep_text.chars().any(|c| c.is_uppercase()) } else { @@ -1639,6 +1635,24 @@ pub fn grep_search<'a>( let regex = match options.mode { GrepMode::PlainText => None, GrepMode::Fuzzy => { + // Fuzzy mode doesn't use bigram — prepare and return early. + let (mut files_to_search, mut filtered_file_count) = + prepare_files_to_search(files, constraints_from_query, options); + if files_to_search.is_empty() + && let Some(stripped) = strip_file_path_constraints(constraints_from_query) + { + let (retry_files, retry_count) = prepare_files_to_search(files, &stripped, options); + files_to_search = retry_files; + filtered_file_count = retry_count; + } + if files_to_search.is_empty() { + return GrepResult { + total_files, + filtered_file_count, + next_file_offset: 0, + ..Default::default() + }; + } return fuzzy_grep_search( &grep_text, &files_to_search, @@ -1676,6 +1690,113 @@ pub fn grep_search<'a>( let finder = memchr::memmem::Finder::new(&finder_pattern); let pattern_len = finder_pattern.len() as u32; + // Bigram prefiltering: query the inverted index + merge overlay. + let bigram_candidates = if regex.is_none() + && let Some(idx) = bigram_index + && idx.is_ready() + && idx.file_count() == files.len() + && let Some(mut candidates) = idx.query(effective_pattern.as_bytes()) + { + if let Some(overlay_lock) = bigram_overlay { + let overlay = overlay_lock.read(); + let pattern_bigrams = extract_bigrams(effective_pattern.as_bytes()); + for (r, t) in candidates.iter_mut().zip(overlay.tombstones().iter()) { + *r &= !t; + } + for file_idx in overlay.query_modified(&pattern_bigrams) { + let word = file_idx / 64; + if word < candidates.len() { + candidates[word] |= 1u64 << (file_idx % 64); + } + } + } + Some(candidates) + } else { + None + }; + + // Build files_to_search. When bigram candidates are available with no + // constraints, iterate the bitset directly — avoids collecting/sorting + // ALL files then retaining the ~3% that are candidates. + let (files_to_search, filtered_file_count) = match bigram_candidates { + Some(ref candidates) if constraints_from_query.is_empty() => { + let cap = BigramFilter::count_candidates(candidates); + let mut result: Vec<&FileItem> = Vec::with_capacity(cap); + for (word_idx, &word) in candidates.iter().enumerate() { + if word == 0 { + continue; + } + let base = word_idx * 64; + let mut bits = word; + while bits != 0 { + let bit = bits.trailing_zeros() as usize; + let file_idx = base + bit; + if file_idx < files.len() { + let f = &files[file_idx]; + if !f.is_binary && f.size > 0 && f.size <= options.max_file_size { + result.push(f); + } + } + bits &= bits - 1; + } + } + + let total_searchable = files + .iter() + .filter(|f| !f.is_binary && f.size > 0 && f.size <= options.max_file_size) + .count(); + + let needs_sort = result + .iter() + .any(|f| f.total_frecency_score != 0 || f.modified != 0); + if needs_sort { + sort_with_buffer(&mut result, |a, b| { + b.total_frecency_score + .cmp(&a.total_frecency_score) + .then(b.modified.cmp(&a.modified)) + }); + } + + if options.file_offset > 0 && options.file_offset < result.len() { + let paginated = result.split_off(options.file_offset); + (paginated, total_searchable) + } else if options.file_offset >= result.len() { + (Vec::new(), total_searchable) + } else { + (result, total_searchable) + } + } + _ => { + // Constraints present or no bigram — full prepare then retain. + let (mut fts, mut fc) = prepare_files_to_search(files, constraints_from_query, options); + if fts.is_empty() + && let Some(stripped) = strip_file_path_constraints(constraints_from_query) + { + let (retry_files, retry_count) = prepare_files_to_search(files, &stripped, options); + fts = retry_files; + fc = retry_count; + } + if let Some(ref candidates) = bigram_candidates { + let base_ptr = files.as_ptr(); + fts.retain(|f| { + let file_idx = + unsafe { (*f as *const FileItem).offset_from(base_ptr) as usize }; + BigramFilter::is_candidate(candidates, file_idx) + }); + } + (fts, fc) + } + }; + + if files_to_search.is_empty() { + return GrepResult { + total_files, + filtered_file_count, + next_file_offset: 0, + ..Default::default() + }; + } + // `PlainTextMatcher` is used by the grep-searcher engine for line detection. // `PlainTextSink` / `RegexSink` handle highlight extraction independently. let plain_matcher = PlainTextMatcher { @@ -1946,7 +2067,7 @@ mod tests { &["GrepMode", "GrepMatch", "PlainTextMatcher"], &[], &options, - &ContentCacheBudget::zero(), + &ContentCacheBudget::unlimited(), ); // Should find matches from file1 (GrepMode, GrepMatch) and file2 (PlainTextMatcher) diff --git a/crates/fff-core/src/score.rs b/crates/fff-core/src/score.rs index 6d6b227..6974bab 100644 --- a/crates/fff-core/src/score.rs +++ b/crates/fff-core/src/score.rs @@ -223,7 +223,7 @@ pub fn match_and_score_files<'a>( let file = working_files.index(file_idx); let mut base_score = path_match.score as i32; - let frecency_boost = base_score.saturating_mul(file.total_frecency_score as i32) / 100; + let frecency_boost = base_score.saturating_mul(file.total_frecency_score) / 100; // Give modified/dirty files a 15% boost to make them appear higher in results let git_status_boost = if file.git_status.is_some_and(is_modified_status) { @@ -366,8 +366,8 @@ pub(crate) fn score_filtered_by_frecency<'a>( context: &ScoringContext, ) -> (Vec<&'a FileItem>, Vec, usize) { let score_file = |file: &'a FileItem| { - let total_frecency_score = file.access_frecency_score as i32 - + (file.modification_frecency_score as i32).saturating_mul(4); + let total_frecency_score = + file.access_frecency_score + file.modification_frecency_score.saturating_mul(4); // Give modified/dirty files a boost even in frecency-only mode let git_status_boost = if file.git_status.is_some_and(is_modified_status) { diff --git a/crates/fff-core/src/types.rs b/crates/fff-core/src/types.rs index 264ef32..c3da672 100644 --- a/crates/fff-core/src/types.rs +++ b/crates/fff-core/src/types.rs @@ -1,6 +1,6 @@ use std::path::{Path, PathBuf}; use std::sync::OnceLock; -use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicU32, AtomicU64, AtomicUsize, Ordering}; use crate::constraints::Constrainable; use crate::query_tracker::QueryMatchEntry; @@ -11,12 +11,14 @@ use fff_query_parser::{FFFQuery, FuzzyQuery, Location}; /// On Windows, memory-mapped files hold the file handle open and prevent /// editors from saving (writing/replacing) those files. Reading into a /// `Vec` releases the handle immediately after the read completes. +/// +/// The `Buffer` variant is also used on Unix for temporary (uncached) reads +/// where the mmap/munmap syscall overhead exceeds the cost of a heap copy. #[derive(Debug)] #[allow(dead_code)] // variants are conditionally used per platform pub enum FileContent { #[cfg(not(target_os = "windows"))] Mmap(memmap2::Mmap), - #[cfg(target_os = "windows")] Buffer(Vec), } @@ -26,7 +28,6 @@ impl std::ops::Deref for FileContent { match self { #[cfg(not(target_os = "windows"))] FileContent::Mmap(m) => m, - #[cfg(target_os = "windows")] FileContent::Buffer(b) => b, } } @@ -44,16 +45,17 @@ impl std::ops::Deref for FileContent { pub struct FileItem { pub path: PathBuf, pub relative_path: String, - pub relative_path_lower: String, pub file_name: String, - pub file_name_lower: String, pub size: u64, pub modified: u64, - pub access_frecency_score: i64, - pub modification_frecency_score: i64, - pub total_frecency_score: i64, + pub access_frecency_score: i32, + pub modification_frecency_score: i32, + pub total_frecency_score: i32, pub git_status: Option, pub is_binary: bool, + /// Tombstone flag — file was deleted but index slot is preserved so + /// bigram indices for other files stay valid. + pub is_deleted: bool, /// Lazily-initialized file contents for grep. /// Initialized on first grep access via `OnceLock`; lock-free on subsequent reads. content: OnceLock, @@ -64,9 +66,7 @@ impl Clone for FileItem { Self { path: self.path.clone(), relative_path: self.relative_path.clone(), - relative_path_lower: self.relative_path_lower.clone(), file_name: self.file_name.clone(), - file_name_lower: self.file_name_lower.clone(), size: self.size, modified: self.modified, access_frecency_score: self.access_frecency_score, @@ -74,6 +74,7 @@ impl Clone for FileItem { total_frecency_score: self.total_frecency_score, git_status: self.git_status, is_binary: self.is_binary, + is_deleted: self.is_deleted, // Don't clone the content — the clone lazily re-creates it on demand content: OnceLock::new(), } @@ -118,8 +119,6 @@ impl FileItem { is_binary: bool, ) -> Self { Self { - relative_path_lower: relative_path.to_lowercase(), - file_name_lower: file_name.to_lowercase(), path, relative_path, file_name, @@ -130,6 +129,7 @@ impl FileItem { total_frecency_score: 0, git_status, is_binary, + is_deleted: false, content: OnceLock::new(), } } @@ -145,6 +145,7 @@ impl FileItem { budget.cached_count.fetch_sub(1, Ordering::Relaxed); budget.cached_bytes.fetch_sub(self.size, Ordering::Relaxed); } + self.content = OnceLock::new(); } @@ -155,24 +156,26 @@ impl FileItem { /// of the budget should use [`get_content_for_search`]. /// /// After the first call, this is lock-free (just an atomic load + pointer deref). - #[inline] pub fn get_content(&self, budget: &ContentCacheBudget) -> Option<&[u8]> { if let Some(content) = self.content.get() { return Some(content); } - if self.size == 0 || self.size > MAX_MMAP_FILE_SIZE { + let max_file_size = budget.max_file_size; + if self.size == 0 || self.size > max_file_size { return None; } // Check cache budget before creating a new persistent cache entry. let count = budget.cached_count.load(Ordering::Relaxed); let bytes = budget.cached_bytes.load(Ordering::Relaxed); - if count >= budget.max_files || bytes + self.size > MAX_CACHED_CONTENT_BYTES { + let max_files = budget.max_files; + let max_bytes = budget.max_bytes; + if count >= max_files || bytes + self.size > max_bytes { return None; } - let content = load_file_content(&self.path)?; + let content = load_file_content(&self.path, self.size)?; let result = self.content.get_or_init(|| content); // Bump counters. Slight over-count under races is fine — the budget @@ -183,12 +186,6 @@ impl FileItem { Some(result) } - /// Backward-compatible alias for `get_content`. - #[inline] - pub fn get_mmap(&self, budget: &ContentCacheBudget) -> Option<&[u8]> { - self.get_content(budget) - } - /// Get file content for searching — **always returns content** for eligible /// files, even when the persistent cache budget is exhausted. /// @@ -205,30 +202,56 @@ impl FileItem { } // get_content returned None — either ineligible or over budget. - if self.is_binary || self.size == 0 || self.size > MAX_MMAP_FILE_SIZE { + let max_file_size = budget.max_file_size; + if self.is_binary || self.size == 0 || self.size > max_file_size { return None; } // Over budget: create a temporary mmap that is unmapped on drop. - let content = load_file_content(&self.path)?; + let content = load_file_content(&self.path, self.size)?; Some(FileContentRef::Temp(content)) } } -/// Load file contents: mmap on Unix, heap buffer on Windows. -fn load_file_content(path: &Path) -> Option { +/// Maximum number of distinct bigrams tracked in the inverted index. +/// 95 printable ASCII chars (32..=126) after lowercasing → ~70 distinct → 4900 possible. +/// We cap at 5000 to cover all printable bigrams with margin. +/// 5000 columns × 62.5KB (500k files) = 305MB. For 50k files: 30MB. +const MAX_BIGRAM_COLUMNS: usize = 5000; + +/// Sentinel value: bigram has no allocated column. +const NO_COLUMN: u32 = u32::MAX; + +/// Page size on Apple Silicon is 16KB; on x86-64 it's 4KB. +/// Files smaller than one page waste the remainder when mmapped. +/// Reading them into a heap buffer avoids this overhead. +#[cfg(target_arch = "aarch64")] +const MMAP_THRESHOLD: u64 = 16 * 1024; +#[cfg(not(target_arch = "aarch64"))] +const MMAP_THRESHOLD: u64 = 4 * 1024; + +/// Load file contents: small files are read into a heap buffer to avoid +/// mmap page alignment waste; large files use mmap for zero-copy access. +/// On Windows, always uses heap buffer (mmap holds the file handle open). +fn load_file_content(path: &Path, size: u64) -> Option { #[cfg(not(target_os = "windows"))] { - let file = std::fs::File::open(path).ok()?; - // SAFETY: The mmap is backed by the kernel page cache and automatically - // reflects file modifications. The only risk is SIGBUS if the file is - // truncated while mapped. - let mmap = unsafe { memmap2::Mmap::map(&file) }.ok()?; - Some(FileContent::Mmap(mmap)) + if size < MMAP_THRESHOLD { + let data = std::fs::read(path).ok()?; + Some(FileContent::Buffer(data)) + } else { + let file = std::fs::File::open(path).ok()?; + // SAFETY: The mmap is backed by the kernel page cache and automatically + // reflects file modifications. The only risk is SIGBUS if the file is + // truncated while mapped. + let mmap = unsafe { memmap2::Mmap::map(&file) }.ok()?; + Some(FileContent::Mmap(mmap)) + } } #[cfg(target_os = "windows")] { + let _ = size; let data = std::fs::read(path).ok()?; Some(FileContent::Buffer(data)) } @@ -240,11 +263,6 @@ impl Constrainable for FileItem { &self.relative_path } - #[inline] - fn relative_path_lower(&self) -> &str { - &self.relative_path_lower - } - #[inline] fn file_name(&self) -> &str { &self.file_name @@ -345,6 +363,8 @@ const MAX_CACHED_CONTENT_BYTES: u64 = 512 * 1024 * 1024; #[derive(Debug)] pub struct ContentCacheBudget { pub max_files: usize, + pub max_bytes: u64, + pub max_file_size: u64, pub cached_count: AtomicUsize, pub cached_bytes: AtomicU64, } @@ -355,6 +375,8 @@ impl ContentCacheBudget { pub fn unlimited() -> Self { Self { max_files: usize::MAX, + max_bytes: u64::MAX, + max_file_size: MAX_MMAP_FILE_SIZE, cached_count: AtomicUsize::new(0), cached_bytes: AtomicU64::new(0), } @@ -363,14 +385,34 @@ impl ContentCacheBudget { pub fn zero() -> Self { Self { max_files: 0, + max_bytes: 0, + max_file_size: 0, cached_count: AtomicUsize::new(0), cached_bytes: AtomicU64::new(0), } } - pub fn new(max_files: usize) -> Self { + pub fn new_for_repo(file_count: usize) -> Self { + let max_files = if file_count > 50_000 { + 5_000 + } else if file_count > 10_000 { + 10_000 + } else { + 30_000 // effectively unlimited for small repos + }; + + let max_bytes = if file_count > 50_000 { + 128 * 1024 * 1024 // 128 MB + } else if file_count > 10_000 { + 256 * 1024 * 1024 // 256 MB + } else { + MAX_CACHED_CONTENT_BYTES // 512 MB + }; + Self { max_files, + max_bytes, + max_file_size: MAX_MMAP_FILE_SIZE, cached_count: AtomicUsize::new(0), cached_bytes: AtomicU64::new(0), } @@ -386,6 +428,503 @@ impl ContentCacheBudget { impl Default for ContentCacheBudget { fn default() -> Self { - Self::new(30_000) + Self::new_for_repo(30_000) + } +} + +/// Temporary dense builder for the bigram index. +/// Uses AtomicU64 for lock-free concurrent writes during the parallel build phase. +/// Columns are allocated lazily on first use to avoid the massive upfront allocation +/// (previously ~300MB for 500k files, now proportional to actual bigrams found). +/// Call `compress()` to produce the final compact `BigramIndex`. +pub struct BigramIndexBuilder { + lookup: Vec, + /// Per-column bitset data, lazily allocated via OnceLock. + col_data: Vec>>, + next_column: AtomicU32, + words: usize, + file_count: usize, + populated: AtomicUsize, +} + +impl BigramIndexBuilder { + pub fn new(file_count: usize) -> Self { + let words = file_count.div_ceil(64); + let mut lookup = Vec::with_capacity(65536); + lookup.resize_with(65536, || AtomicU32::new(NO_COLUMN)); + let mut col_data = Vec::with_capacity(MAX_BIGRAM_COLUMNS); + col_data.resize_with(MAX_BIGRAM_COLUMNS, OnceLock::new); + Self { + lookup, + col_data, + next_column: AtomicU32::new(0), + words, + file_count, + populated: AtomicUsize::new(0), + } + } + + #[inline] + fn get_or_alloc_column(&self, key: u16) -> u32 { + let current = self.lookup[key as usize].load(Ordering::Relaxed); + if current != NO_COLUMN { + return current; + } + let new_col = self.next_column.fetch_add(1, Ordering::Relaxed); + if new_col >= MAX_BIGRAM_COLUMNS as u32 { + return NO_COLUMN; + } + + match self.lookup[key as usize].compare_exchange( + NO_COLUMN, + new_col, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => new_col, + Err(existing) => existing, + } + } + + /// Get (or lazily allocate) the bitset for a given column index. + #[inline] + fn column_bitset(&self, col: u32) -> &[AtomicU64] { + let words = self.words; + self.col_data[col as usize].get_or_init(|| { + let mut v = Vec::with_capacity(words); + v.resize_with(words, || AtomicU64::new(0)); + v.into_boxed_slice() + }) + } + + pub fn add_file_content(&self, file_idx: usize, content: &[u8]) { + if content.len() < 2 { + return; + } + + debug_assert!(file_idx < self.file_count); + let word_idx = file_idx / 64; + let bit_mask = 1u64 << (file_idx % 64); + + let mut prev = content[0]; + for &b in &content[1..] { + if (32..=126).contains(&prev) && (32..=126).contains(&b) { + let key = (prev.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16; + let col = self.get_or_alloc_column(key); + if col != NO_COLUMN { + self.column_bitset(col)[word_idx].fetch_or(bit_mask, Ordering::Relaxed); + } + } + prev = b; + } + self.populated.fetch_add(1, Ordering::Relaxed); + } + + pub fn is_ready(&self) -> bool { + self.populated.load(Ordering::Relaxed) > 0 + } + + pub fn columns_used(&self) -> u32 { + self.next_column + .load(Ordering::Relaxed) + .min(MAX_BIGRAM_COLUMNS as u32) + } + + /// Compress the dense builder into a compact `BigramIndex`. + /// Sparse columns become sorted u32 posting lists; dense columns stay as bitsets. + /// + /// Each column's `Box<[AtomicU64]>` (~60 KB for 500k files) is freed immediately + /// after compression via `OnceLock::take`, so peak memory during compress is + /// roughly `max(builder, result)` instead of `builder + result`. + pub fn compress(self) -> BigramFilter { + let cols = self.columns_used() as usize; + let words = self.words; + let file_count = self.file_count; + let populated = self.populated.load(Ordering::Relaxed); + let dense_bytes = words * 8; // cost of one dense column + + // Destructure so we can incrementally free col_data entries. + let old_lookup = self.lookup; + let mut col_data = self.col_data; + + // Build lookup: bigram key → new column index in compressed storage + let mut lookup = vec![NO_COLUMN; 65536]; + let mut columns = Vec::with_capacity(cols); + + for key in 0..65536u32 { + let old_col = old_lookup[key as usize].load(Ordering::Relaxed); + if old_col == NO_COLUMN || old_col as usize >= cols { + continue; + } + // Take the Box out of OnceLock — frees ~60 KB when dropped at end of iteration. + let Some(bitset) = col_data[old_col as usize].take() else { + continue; + }; + + // Count set bits to decide storage format + let mut popcount = 0u32; + for w in 0..words { + popcount += bitset[w].load(Ordering::Relaxed).count_ones(); + } + + let posting_bytes = popcount as usize * 4; + let new_col = columns.len() as u32; + lookup[key as usize] = new_col; + + if posting_bytes < dense_bytes { + // Sparse: extract sorted file indices + let mut posting = Vec::with_capacity(popcount as usize); + for w in 0..words { + let mut word = bitset[w].load(Ordering::Relaxed); + let base = (w * 64) as u32; + while word != 0 { + let bit = word.trailing_zeros(); + posting.push(base + bit); + word &= word - 1; // clear lowest set bit + } + } + columns.push(Column::Sparse(posting)); + } else { + // Dense: copy bitset words + let mut dense = Vec::with_capacity(words); + for w in 0..words { + dense.push(bitset[w].load(Ordering::Relaxed)); + } + columns.push(Column::Dense(dense)); + } + // `bitset` (Box<[AtomicU64]>) dropped here — allocator can reuse the pages + } + + // Explicitly drop builder remnants before returning. + drop(col_data); + drop(old_lookup); + + BigramFilter { + lookup, + columns, + words, + file_count, + populated, + } + } +} + +unsafe impl Send for BigramIndexBuilder {} +unsafe impl Sync for BigramIndexBuilder {} + +/// Column storage: either a dense bitset or a sorted posting list. +#[derive(Debug)] +enum Column { + /// Dense bitset — one u64 per 64-file chunk. + Dense(Vec), + /// Sorted file indices — cheaper when few files contain this bigram. + Sparse(Vec), +} + +/// Compressed bigram inverted index. +/// +/// Built from `BigramIndexBuilder::compress()`. Sparse columns use sorted u32 +/// posting lists; dense columns use plain bitsets. Lossless — same filtering +/// quality as the dense builder, significantly less memory. +#[derive(Debug)] +pub struct BigramFilter { + lookup: Vec, + columns: Vec, + words: usize, + file_count: usize, + populated: usize, +} + +/// AND a dense bitset column into the result. Uses iterator zip to +/// eliminate bounds checks so LLVM can emit NEON/SSE vector AND instructions. +#[inline] +fn bitset_and(result: &mut [u64], bitset: &[u64]) { + result + .iter_mut() + .zip(bitset.iter()) + .for_each(|(r, b)| *r &= *b); +} + +/// AND a sparse posting list into the result. Zeroes words not present +/// in the posting list, ANDs accumulated masks for words that are. +#[inline] +fn bitset_and_sparse(result: &mut [u64], posting: &[u32]) { + let mut pos = 0; + let mut prev_w = 0; + while pos < posting.len() { + let w = posting[pos] as usize / 64; + // Zero gap between previous touched word and this one + for r in &mut result[prev_w..w] { + *r = 0; + } + // Accumulate OR mask for this word + let mut word_mask = 0u64; + while pos < posting.len() && posting[pos] as usize / 64 == w { + word_mask |= 1u64 << (posting[pos] as usize % 64); + pos += 1; + } + result[w] &= word_mask; + prev_w = w + 1; + } + // Zero remaining words after last posting entry + for r in &mut result[prev_w..] { + *r = 0; + } +} + +impl BigramFilter { + /// Query: AND the posting lists for all query bigrams. + /// Returns None if no query bigrams are tracked. + pub fn query(&self, pattern: &[u8]) -> Option> { + if pattern.len() < 2 { + return None; + } + + let mut result = vec![u64::MAX; self.words]; + if !self.file_count.is_multiple_of(64) { + let last = self.words - 1; + result[last] = (1u64 << (self.file_count % 64)) - 1; + } + + let mut has_filter = false; + let mut prev = pattern[0]; + for &b in &pattern[1..] { + if (32..=126).contains(&prev) && (32..=126).contains(&b) { + let key = (prev.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16; + let col_idx = self.lookup[key as usize]; + if col_idx != NO_COLUMN { + match &self.columns[col_idx as usize] { + Column::Dense(bitset) => { + bitset_and(&mut result, bitset); + } + Column::Sparse(posting) => { + bitset_and_sparse(&mut result, posting); + } + } + has_filter = true; + } + } + prev = b; + } + + if has_filter { Some(result) } else { None } + } + + #[inline] + pub fn is_candidate(candidates: &[u64], file_idx: usize) -> bool { + let word = file_idx / 64; + let bit = file_idx % 64; + word < candidates.len() && candidates[word] & (1u64 << bit) != 0 + } + + pub fn count_candidates(candidates: &[u64]) -> usize { + candidates.iter().map(|w| w.count_ones() as usize).sum() + } + + pub fn is_ready(&self) -> bool { + self.populated > 0 + } + + pub fn file_count(&self) -> usize { + self.file_count + } + + pub fn columns_used(&self) -> usize { + self.columns.len() + } + + pub fn sparse_columns(&self) -> usize { + self.columns + .iter() + .filter(|c| matches!(c, Column::Sparse(_))) + .count() + } + + pub fn dense_columns(&self) -> usize { + self.columns + .iter() + .filter(|c| matches!(c, Column::Dense(_))) + .count() + } + + /// Total heap bytes used by this index (lookup table + all column data). + pub fn heap_bytes(&self) -> usize { + let lookup_bytes = self.lookup.len() * std::mem::size_of::(); + let column_bytes: usize = self + .columns + .iter() + .map(|c| { + std::mem::size_of::() + + match c { + Column::Dense(v) => v.len() * 8, + Column::Sparse(v) => v.len() * 4, + } + }) + .sum(); + lookup_bytes + column_bytes + } +} + +unsafe impl Send for BigramFilter {} +unsafe impl Sync for BigramFilter {} + +// --------------------------------------------------------------------------- +// Shared bigram extraction +// --------------------------------------------------------------------------- + +/// Extract deduplicated bigram keys from file content. +/// Same logic as `BigramIndexBuilder::add_file_content`: consecutive printable +/// ASCII pairs, lowercased, encoded as `(prev << 8) | cur`. +pub fn extract_bigrams(content: &[u8]) -> Vec { + if content.len() < 2 { + return Vec::new(); + } + // Use a flat bitset (65536 bits = 8 KB) for dedup — faster than HashSet. + let mut seen = vec![0u64; 1024]; // 1024 * 64 = 65536 bits + let mut bigrams = Vec::new(); + + let mut prev = content[0]; + for &b in &content[1..] { + if (32..=126).contains(&prev) && (32..=126).contains(&b) { + let key = (prev.to_ascii_lowercase() as u16) << 8 | b.to_ascii_lowercase() as u16; + let word = key as usize / 64; + let bit = 1u64 << (key as usize % 64); + if seen[word] & bit == 0 { + seen[word] |= bit; + bigrams.push(key); + } + } + prev = b; + } + bigrams +} + +// --------------------------------------------------------------------------- +// Bigram overlay — incremental delta layer on top of the immutable base index +// --------------------------------------------------------------------------- + +/// Tracks bigram changes since the base `BigramFilter` was built. +/// +/// Modified and added files store their own bigram sets. Deleted files are +/// tombstoned in a bitset so they can be excluded from base query results. +/// This overlay is updated by the background watcher on every file event +/// and cleared when the base index is rebuilt. +#[derive(Debug)] +pub struct BigramOverlay { + /// Per-file bigram sets for files modified since the base was built. + /// Key = file index in the base `Vec`. + modified: HashMap>, + + /// Tombstone bitset — one bit per base file. Set bits are excluded + /// from base query results. + tombstones: Vec, + + /// Bigram sets for files added after the base was built (overflow files). + added: Vec>, + + /// Number of base files this overlay was created for. + base_file_count: usize, +} + +use std::collections::HashMap; + +impl BigramOverlay { + pub fn new(base_file_count: usize) -> Self { + let words = base_file_count.div_ceil(64); + Self { + modified: HashMap::new(), + tombstones: vec![0u64; words], + added: Vec::new(), + base_file_count, + } + } + + /// Record updated bigram data for a modified base file. + pub fn modify_file(&mut self, file_idx: usize, content: &[u8]) { + self.modified.insert(file_idx, extract_bigrams(content)); + } + + /// Tombstone a deleted base file. + pub fn delete_file(&mut self, file_idx: usize) { + if file_idx < self.base_file_count { + let word = file_idx / 64; + self.tombstones[word] |= 1u64 << (file_idx % 64); + } + self.modified.remove(&file_idx); + } + + /// Record bigrams for a newly added (overflow) file. + pub fn add_file(&mut self, content: &[u8]) { + self.added.push(extract_bigrams(content)); + } + + /// Return base file indices of modified files whose bigrams match ALL + /// of the given `pattern_bigrams`. + pub fn query_modified(&self, pattern_bigrams: &[u16]) -> Vec { + if pattern_bigrams.is_empty() { + return self.modified.keys().copied().collect(); + } + self.modified + .iter() + .filter_map(|(&file_idx, bigrams)| { + let all_match = pattern_bigrams.iter().all(|pb| bigrams.contains(pb)); + if all_match { Some(file_idx) } else { None } + }) + .collect() + } + + /// Return overflow indices (into the `added` vec) whose bigrams match + /// ALL of the given `pattern_bigrams`. + pub fn query_added(&self, pattern_bigrams: &[u16]) -> Vec { + if pattern_bigrams.is_empty() { + return (0..self.added.len()).collect(); + } + self.added + .iter() + .enumerate() + .filter_map(|(idx, bigrams)| { + let all_match = pattern_bigrams.iter().all(|pb| bigrams.contains(pb)); + if all_match { Some(idx) } else { None } + }) + .collect() + } + + /// Get the tombstone bitset for clearing base candidates. + pub fn tombstones(&self) -> &[u64] { + &self.tombstones + } + + pub fn is_tombstoned(&self, file_idx: usize) -> bool { + let word = file_idx / 64; + word < self.tombstones.len() && self.tombstones[word] & (1u64 << (file_idx % 64)) != 0 + } + + pub fn base_file_count(&self) -> usize { + self.base_file_count + } + + /// Remove an overflow entry by index (when the file is deleted). + pub fn remove_added(&mut self, idx: usize) { + if idx < self.added.len() { + self.added.remove(idx); + } + } + + /// Update an existing overflow entry's bigrams. + pub fn update_added(&mut self, idx: usize, bigrams: Vec) { + if idx < self.added.len() { + self.added[idx] = bigrams; + } + } + + /// Total number of entries tracked (for deciding when to trigger a full rebuild). + pub fn overlay_size(&self) -> usize { + self.modified.len() + + self.added.len() + + self + .tombstones + .iter() + .map(|w| w.count_ones() as usize) + .sum::() } } diff --git a/crates/fff-core/tests/bigram_overlay_integration.rs b/crates/fff-core/tests/bigram_overlay_integration.rs new file mode 100644 index 0000000..622fe04 --- /dev/null +++ b/crates/fff-core/tests/bigram_overlay_integration.rs @@ -0,0 +1,360 @@ +//! Integration test: verify that modifying a file after the bigram index is built +//! still makes the new content findable via grep (through the overlay layer). + +use std::fs; +use std::sync::Arc; +use std::time::Duration; +use tempfile::TempDir; + +use fff_search::file_picker::{FFFMode, FilePicker}; +use fff_search::grep::{GrepMode, GrepSearchOptions, grep_search, parse_grep_query}; +use fff_search::types::ContentCacheBudget; +use fff_search::{SharedFrecency, SharedPicker}; + +/// Create a temp directory with some initial files, run the full picker lifecycle, +/// then modify a file and verify grep finds the new content. +#[test] +fn modified_file_findable_via_overlay() { + let tmp = TempDir::new().unwrap(); + let base = tmp.path(); + + // Create initial files with known content. + fs::write(base.join("alpha.txt"), "hello world\nfoo bar\n").unwrap(); + fs::write( + base.join("beta.txt"), + "some other content\nnothing special\n", + ) + .unwrap(); + fs::write(base.join("gamma.txt"), "yet another file\nmore lines\n").unwrap(); + + // ── Phase 1: Initialize picker ────────────────────────────────────── + let shared_picker: SharedPicker = Arc::new(std::sync::RwLock::new(None)); + let shared_frecency: SharedFrecency = Arc::new(std::sync::RwLock::new(None)); + + FilePicker::new_with_shared_state( + base.to_string_lossy().to_string(), + true, // warmup (builds bigram index) + FFFMode::Neovim, + Arc::clone(&shared_picker), + Arc::clone(&shared_frecency), + ) + .expect("Failed to create FilePicker"); + + // Wait for scan + bigram build to complete. + let deadline = std::time::Instant::now() + Duration::from_secs(30); + loop { + std::thread::sleep(Duration::from_millis(50)); + + let ready = shared_picker + .read() + .ok() + .map(|guard| { + guard + .as_ref() + .map_or(false, |p| !p.is_scan_active() && p.bigram_index.is_some()) + }) + .unwrap_or(false); + + if ready { + break; + } + assert!( + std::time::Instant::now() < deadline, + "Timed out waiting for scan + bigram build" + ); + } + + // Sanity check: the 3 files are indexed. + { + let guard = shared_picker.read().unwrap(); + let picker = guard.as_ref().unwrap(); + assert_eq!(picker.get_files().len(), 3, "Expected 3 files after scan"); + assert!( + picker.bigram_index.is_some(), + "Bigram index should be built" + ); + assert!( + picker.bigram_overlay.is_some(), + "Overlay should be initialized" + ); + } + + // ── Phase 2: Grep BEFORE modification ─────────────────────────────── + // "UNIQUE_NEEDLE" should NOT exist in any file yet. + { + let guard = shared_picker.read().unwrap(); + let picker = guard.as_ref().unwrap(); + let parsed = parse_grep_query("UNIQUE_NEEDLE"); + let opts = grep_opts(); + let result = grep_search( + picker.get_files(), + &parsed, + &opts, + &ContentCacheBudget::unlimited(), + picker.bigram_index.as_deref(), + picker.bigram_overlay.as_deref(), + ); + assert_eq!( + result.matches.len(), + 0, + "UNIQUE_NEEDLE should not exist before modification" + ); + } + + // ── Phase 3: Modify a file on disk ────────────────────────────────── + // Sleep so the filesystem mtime (seconds granularity) advances past the + // value recorded during scan. Without this, on_create_or_modify skips + // mmap invalidation and grep reads stale cached content. + std::thread::sleep(Duration::from_millis(1100)); + + // Write new content containing the needle. + let modified_path = base.join("beta.txt"); + fs::write( + &modified_path, + "some other content\nUNIQUE_NEEDLE is here\nnothing special\n", + ) + .unwrap(); + + // Simulate watcher event: call on_create_or_modify. + // This updates the overlay's bigrams and invalidates the mmap cache. + { + let mut guard = shared_picker.write().unwrap(); + let picker = guard.as_mut().unwrap(); + let result = picker.on_create_or_modify(&modified_path); + assert!( + result.is_some(), + "on_create_or_modify should return the file" + ); + } + + // ── Phase 4: Grep AFTER modification — WITH overlay ───────────────── + // The bigram index was built BEFORE the modification, so without the + // overlay, beta.txt would be filtered out (its old bigrams don't contain + // "UNIQUE_NEEDLE"). The overlay should fix that. + { + let guard = shared_picker.read().unwrap(); + let picker = guard.as_ref().unwrap(); + let parsed = parse_grep_query("UNIQUE_NEEDLE"); + let opts = grep_opts(); + let result = grep_search( + picker.get_files(), + &parsed, + &opts, + &ContentCacheBudget::unlimited(), + picker.bigram_index.as_deref(), + picker.bigram_overlay.as_deref(), + ); + assert!( + !result.matches.is_empty(), + "UNIQUE_NEEDLE should be findable after modification (overlay adds the candidate back)" + ); + assert_eq!(result.matches.len(), 1); + assert!(result.matches[0].line_content.contains("UNIQUE_NEEDLE")); + } + + // ── Phase 5: Grep AFTER modification — WITHOUT overlay ────────────── + // Prove the overlay is actually doing something: without it, the bigram + // index would filter out beta.txt and the search would miss the needle. + { + let guard = shared_picker.read().unwrap(); + let picker = guard.as_ref().unwrap(); + let parsed = parse_grep_query("UNIQUE_NEEDLE"); + let opts = grep_opts(); + let result = grep_search( + picker.get_files(), + &parsed, + &opts, + &ContentCacheBudget::unlimited(), + picker.bigram_index.as_deref(), + None, // no overlay! + ); + assert_eq!( + result.matches.len(), + 0, + "Without overlay, bigram prefiltering should exclude the modified file" + ); + } + + // Cleanup: stop background watcher. + if let Ok(mut guard) = shared_picker.write() { + if let Some(ref mut picker) = *guard { + picker.stop_background_monitor(); + } + } +} + +/// Verify that deleting a file makes its content un-findable via grep. +#[test] +fn deleted_file_excluded_via_overlay() { + let tmp = TempDir::new().unwrap(); + let base = tmp.path(); + + fs::write(base.join("keep.txt"), "keep this content\n").unwrap(); + fs::write(base.join("remove.txt"), "DELETEME_TOKEN is here\n").unwrap(); + + let shared_picker: SharedPicker = Arc::new(std::sync::RwLock::new(None)); + let shared_frecency: SharedFrecency = Arc::new(std::sync::RwLock::new(None)); + + FilePicker::new_with_shared_state( + base.to_string_lossy().to_string(), + true, + FFFMode::Neovim, + Arc::clone(&shared_picker), + Arc::clone(&shared_frecency), + ) + .unwrap(); + + wait_for_bigram(&shared_picker); + + // Sanity: DELETEME_TOKEN is findable. + { + let guard = shared_picker.read().unwrap(); + let picker = guard.as_ref().unwrap(); + let result = grep_for(picker, "DELETEME_TOKEN"); + assert_eq!( + result.matches.len(), + 1, + "Token should be found before delete" + ); + } + + // Delete the file on disk and via picker. + let remove_path = base.join("remove.txt"); + fs::remove_file(&remove_path).unwrap(); + { + let mut guard = shared_picker.write().unwrap(); + let picker = guard.as_mut().unwrap(); + assert!( + picker.remove_file_by_path(&remove_path), + "remove should succeed" + ); + } + + // Token should no longer be found (tombstone in overlay clears the candidate). + { + let guard = shared_picker.read().unwrap(); + let picker = guard.as_ref().unwrap(); + let result = grep_for(picker, "DELETEME_TOKEN"); + assert_eq!( + result.matches.len(), + 0, + "DELETEME_TOKEN should not be found after deletion (tombstone in overlay)" + ); + } + + if let Ok(mut guard) = shared_picker.write() { + if let Some(ref mut picker) = *guard { + picker.stop_background_monitor(); + } + } +} + +/// Verify that a newly added file (in overflow) is findable via grep. +#[test] +fn new_file_findable_after_add() { + let tmp = TempDir::new().unwrap(); + let base = tmp.path(); + + fs::write(base.join("existing.txt"), "original content\n").unwrap(); + + let shared_picker: SharedPicker = Arc::new(std::sync::RwLock::new(None)); + let shared_frecency: SharedFrecency = Arc::new(std::sync::RwLock::new(None)); + + FilePicker::new_with_shared_state( + base.to_string_lossy().to_string(), + true, + FFFMode::Neovim, + Arc::clone(&shared_picker), + Arc::clone(&shared_frecency), + ) + .unwrap(); + + wait_for_bigram(&shared_picker); + + // Create a new file on disk after the index was built. + let new_path = base.join("newcomer.txt"); + fs::write(&new_path, "BRAND_NEW_TOKEN lives here\n").unwrap(); + + // Simulate watcher detecting the new file. + { + let mut guard = shared_picker.write().unwrap(); + let picker = guard.as_mut().unwrap(); + let result = picker.on_create_or_modify(&new_path); + assert!( + result.is_some(), + "on_create_or_modify should return the new file" + ); + } + + // The new file is in overflow, not in the base files slice. + // grep_search currently only searches base files, so we need to verify + // the overflow file is accessible. + { + let guard = shared_picker.read().unwrap(); + let picker = guard.as_ref().unwrap(); + let overflow = picker.get_overflow_files(); + assert_eq!(overflow.len(), 1, "Should have 1 overflow file"); + assert!( + overflow[0].path.ends_with("newcomer.txt"), + "Overflow file should be newcomer.txt" + ); + } + + if let Ok(mut guard) = shared_picker.write() { + if let Some(ref mut picker) = *guard { + picker.stop_background_monitor(); + } + } +} + +// ── Helpers ───────────────────────────────────────────────────────────── + +fn grep_opts() -> GrepSearchOptions { + GrepSearchOptions { + max_file_size: 10 * 1024 * 1024, + max_matches_per_file: 200, + smart_case: true, + file_offset: 0, + page_limit: 200, + mode: GrepMode::PlainText, + time_budget_ms: 0, + before_context: 0, + after_context: 0, + classify_definitions: false, + } +} + +fn grep_for<'a>(picker: &'a FilePicker, query: &str) -> fff_search::grep::GrepResult<'a> { + let parsed = parse_grep_query(query); + grep_search( + picker.get_files(), + &parsed, + &grep_opts(), + &ContentCacheBudget::unlimited(), + picker.bigram_index.as_deref(), + picker.bigram_overlay.as_deref(), + ) +} + +fn wait_for_bigram(shared_picker: &SharedPicker) { + let deadline = std::time::Instant::now() + Duration::from_secs(30); + loop { + std::thread::sleep(Duration::from_millis(50)); + let ready = shared_picker + .read() + .ok() + .map(|guard| { + guard + .as_ref() + .map_or(false, |p| !p.is_scan_active() && p.bigram_index.is_some()) + }) + .unwrap_or(false); + if ready { + break; + } + assert!( + std::time::Instant::now() < deadline, + "Timed out waiting for bigram build" + ); + } +} diff --git a/crates/fff-core/tests/grep_integration.rs b/crates/fff-core/tests/grep_integration.rs index bce06cf..195ae72 100644 --- a/crates/fff-core/tests/grep_integration.rs +++ b/crates/fff-core/tests/grep_integration.rs @@ -79,6 +79,8 @@ fn plain_text_finds_exact_literal() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -102,6 +104,8 @@ fn plain_text_smart_case_insensitive() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -127,6 +131,8 @@ fn plain_text_smart_case_sensitive_with_uppercase() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -153,6 +159,8 @@ fn plain_text_regex_metacharacters_are_literal() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -165,6 +173,8 @@ fn plain_text_regex_metacharacters_are_literal() { &parsed2, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result2.matches.len(), 1); assert_eq!(result2.matches[0].line_number, 2); @@ -186,6 +196,8 @@ fn plain_text_dot_is_literal() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -211,6 +223,8 @@ fn plain_text_asterisk_is_literal() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); assert_eq!(result.matches[0].line_number, 1); @@ -231,6 +245,8 @@ fn plain_text_backslash_is_literal() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); } @@ -250,6 +266,8 @@ fn plain_text_across_multiple_files() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 3); @@ -268,6 +286,8 @@ fn plain_text_highlight_offsets_are_correct() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -291,6 +311,8 @@ fn plain_text_empty_query_returns_no_content_matches() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); // Empty query in grep returns git-modified welcome state (no content matches) @@ -318,6 +340,8 @@ fn plain_text_binary_files_are_skipped() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); // Only the text file should be searched, not the binary one @@ -338,7 +362,14 @@ fn plain_text_max_matches_per_file() { opts.max_matches_per_file = 5; let parsed = parse_grep_query("match_target"); - let result = grep_search(&files, &parsed, &opts, &ContentCacheBudget::unlimited()); + let result = grep_search( + &files, + &parsed, + &opts, + &ContentCacheBudget::unlimited(), + None, + None, + ); assert_eq!( result.matches.len(), @@ -360,7 +391,14 @@ fn plain_text_page_limit() { opts.page_limit = 10; let parsed = parse_grep_query("target"); - let result = grep_search(&files, &parsed, &opts, &ContentCacheBudget::unlimited()); + let result = grep_search( + &files, + &parsed, + &opts, + &ContentCacheBudget::unlimited(), + None, + None, + ); // page_limit is a soft minimum: we always finish the current file, so we // get at least page_limit matches (no data loss) and at most @@ -403,7 +441,14 @@ fn plain_text_file_offset_pagination() { loop { let parsed = parse_grep_query("unique_token"); - let result = grep_search(&files, &parsed, &opts, &ContentCacheBudget::unlimited()); + let result = grep_search( + &files, + &parsed, + &opts, + &ContentCacheBudget::unlimited(), + None, + None, + ); for m in &result.matches { let text = m.line_content.trim().to_string(); @@ -459,6 +504,8 @@ fn plain_text_line_numbers_are_correct() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 4); @@ -479,7 +526,14 @@ fn plain_text_max_file_size_filter() { opts.max_file_size = 100; // Only allow files up to 100 bytes let parsed = parse_grep_query("match_me"); - let result = grep_search(&files, &parsed, &opts, &ContentCacheBudget::unlimited()); + let result = grep_search( + &files, + &parsed, + &opts, + &ContentCacheBudget::unlimited(), + None, + None, + ); assert_eq!(result.matches.len(), 0, "large file should be filtered out"); assert_eq!(result.filtered_file_count, 0); @@ -502,6 +556,8 @@ fn regex_basic_pattern() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -521,6 +577,8 @@ fn regex_capture_group_matching() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 2); @@ -549,6 +607,8 @@ fn regex_dot_matches_any_char() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -573,6 +633,8 @@ fn regex_alternation() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 2); @@ -596,6 +658,8 @@ fn regex_character_class() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 3); @@ -624,6 +688,8 @@ fn regex_quantifiers() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 3, "should match foo, fooo, foooo"); @@ -644,6 +710,8 @@ fn regex_anchors() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -666,6 +734,8 @@ fn regex_anchors_multiword() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -688,6 +758,8 @@ fn regex_highlight_offsets_variable_length() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -716,6 +788,8 @@ fn regex_invalid_pattern_falls_back_to_literal() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); // Fallback to literal: finds "name(" in "call name(arg)" @@ -737,6 +811,8 @@ fn regex_invalid_pattern_falls_back_to_literal() { &parsed2, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result2.matches.len(), 0); assert!(result2.regex_fallback_error.is_some()); @@ -758,6 +834,8 @@ fn regex_smart_case() { &parsed_lower, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result_lower.matches.len(), 3); @@ -768,6 +846,8 @@ fn regex_smart_case() { &parsed_upper, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result_upper.matches.len(), 1); } @@ -795,6 +875,8 @@ fn regex_across_multiple_files() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); // Should match: fn main(), fn helper(), fn test_one(), fn test_two() @@ -819,12 +901,16 @@ fn plain_text_and_regex_agree_on_simple_literal() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); let regex_result = grep_search( &files, &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(plain_result.matches.len(), regex_result.matches.len()); @@ -850,6 +936,8 @@ fn plain_text_escapes_what_regex_does_not() { &parsed_plain, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); let parsed_regex = parse_grep_query("\\$100"); let regex_result = grep_search( @@ -857,6 +945,8 @@ fn plain_text_escapes_what_regex_does_not() { &parsed_regex, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); // Plain text should find "$100" literally @@ -884,6 +974,8 @@ fn grep_with_extension_constraint() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); // Should only search .rs files @@ -917,6 +1009,8 @@ fn plain_text_bracket_is_literal() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -944,6 +1038,8 @@ fn grep_backslash_escapes_extension_filter() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( result_filter.files.len(), @@ -958,6 +1054,8 @@ fn grep_backslash_escapes_extension_filter() { &parsed_escaped, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( result_literal.matches.len(), @@ -981,6 +1079,8 @@ fn grep_backslash_escapes_path_segment() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( result.matches.len(), @@ -1005,6 +1105,8 @@ fn grep_backslash_escapes_negation() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); assert!(result.matches[0].line_content.contains("!test")); @@ -1025,6 +1127,8 @@ fn grep_with_path_constraint() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -1049,6 +1153,8 @@ fn grep_with_negated_extension_constraint() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -1080,6 +1186,8 @@ fn grep_with_negated_path_constraint() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -1111,6 +1219,8 @@ fn grep_with_negated_text_constraint() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); // "tests/helper.rs" contains "test" in path, should be excluded @@ -1147,6 +1257,8 @@ fn grep_empty_file_is_skipped() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -1163,6 +1275,8 @@ fn grep_single_line_no_trailing_newline() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -1184,6 +1298,8 @@ fn grep_unicode_content() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); assert_eq!(result.matches[0].line_number, 2); @@ -1194,6 +1310,8 @@ fn grep_unicode_content() { &parsed2, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result2.matches.len(), 1); assert_eq!(result2.matches[0].line_number, 3); @@ -1211,6 +1329,8 @@ fn grep_long_line_is_truncated() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -1237,6 +1357,8 @@ fn regex_word_boundary() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -1262,6 +1384,8 @@ fn plain_text_question_mark_is_literal() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -1286,6 +1410,8 @@ fn plain_text_query_with_question_mark_in_word() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -1307,6 +1433,8 @@ fn regex_question_mark_is_quantifier() { &parsed, ®ex_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -1333,6 +1461,8 @@ fn fuzzy_finds_exact_substring() { &parsed, &fuzzy_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -1360,6 +1490,8 @@ fn fuzzy_finds_scattered_characters() { &parsed, &fuzzy_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert!( @@ -1380,6 +1512,8 @@ fn fuzzy_highlight_offsets_correct() { &parsed, &fuzzy_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -1410,6 +1544,8 @@ fn fuzzy_unicode_char_indices() { &parsed, &fuzzy_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); // Should fuzzy match "régulière" (with multi-byte é and è) @@ -1429,6 +1565,8 @@ fn fuzzy_empty_query_returns_empty() { &parsed, &fuzzy_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); // Empty query returns git-modified files, not fuzzy matches @@ -1450,6 +1588,8 @@ fn fuzzy_with_extension_constraint() { &parsed, &fuzzy_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); // Should only search .rs files @@ -1476,7 +1616,14 @@ fn fuzzy_respects_page_limit() { opts.max_matches_per_file = 50; let parsed = parse_grep_query("target"); - let result = grep_search(&files, &parsed, &opts, &ContentCacheBudget::unlimited()); + let result = grep_search( + &files, + &parsed, + &opts, + &ContentCacheBudget::unlimited(), + None, + None, + ); // page_limit is a soft minimum: we always finish the current file, so we // get at least page_limit matches (no data loss) and at most @@ -1512,7 +1659,14 @@ fn fuzzy_respects_max_matches_per_file() { opts.max_matches_per_file = 5; let parsed = parse_grep_query("match"); - let result = grep_search(&files, &parsed, &opts, &ContentCacheBudget::unlimited()); + let result = grep_search( + &files, + &parsed, + &opts, + &ContentCacheBudget::unlimited(), + None, + None, + ); assert_eq!( result.matches.len(), @@ -1538,6 +1692,8 @@ fn fuzzy_filters_low_quality_matches() { &parsed, &fuzzy_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); // Should only get high-quality matches @@ -1573,6 +1729,8 @@ fn fuzzy_exact_match_always_passes() { &parsed, &fuzzy_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!( @@ -1598,6 +1756,8 @@ fn fuzzy_score_is_captured() { &parsed, &fuzzy_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); @@ -1625,6 +1785,8 @@ fn fuzzy_score_is_none_in_plain_mode() { &parsed, &plain_opts(), &ContentCacheBudget::unlimited(), + None, + None, ); assert_eq!(result.matches.len(), 1); diff --git a/crates/fff-mcp/src/main.rs b/crates/fff-mcp/src/main.rs index 1c43825..60dafe2 100644 --- a/crates/fff-mcp/src/main.rs +++ b/crates/fff-mcp/src/main.rs @@ -285,7 +285,7 @@ async fn main() -> Result<(), Box> { && let Ok(mut guard) = shared_picker.write() && let Some(ref mut picker) = *guard { - picker.cache_budget = std::sync::Arc::new(fff::ContentCacheBudget::new(limit)); + picker.cache_budget = std::sync::Arc::new(fff::ContentCacheBudget::new_for_repo(limit)); } if !args.no_update_check { diff --git a/crates/fff-mcp/src/output.rs b/crates/fff-mcp/src/output.rs index fea775d..39ba071 100644 --- a/crates/fff-mcp/src/output.rs +++ b/crates/fff-mcp/src/output.rs @@ -11,7 +11,7 @@ use fff::types::FileItem; use crate::cursor::CursorStore; /// Frecency score → single-token word. `None` for low-scoring files. -fn frecency_word(score: i64) -> Option<&'static str> { +fn frecency_word(score: i32) -> Option<&'static str> { if score >= 100 { Some("hot") } else if score >= 50 { @@ -24,7 +24,7 @@ fn frecency_word(score: i64) -> Option<&'static str> { } /// Build " - hot git:modified" style suffix. Empty when nothing to report. -pub fn file_suffix(git_status: Option, frecency_score: i64) -> String { +pub fn file_suffix(git_status: Option, frecency_score: i32) -> String { match ( frecency_word(frecency_score), format_git_status_opt(git_status), diff --git a/crates/fff-mcp/src/server.rs b/crates/fff-mcp/src/server.rs index 54363ad..e523f1b 100644 --- a/crates/fff-mcp/src/server.rs +++ b/crates/fff-mcp/src/server.rs @@ -251,10 +251,13 @@ impl FffServer { let files = picker.get_files(); let budget = picker.cache_budget(); + let bigram_idx = picker.bigram_index.as_deref(); + let bigram_overlay = picker.bigram_overlay.as_deref(); let parser = QueryParser::new(AiGrepConfig); let parsed = parser.parse(query); - let result = grep::grep_search(files, &parsed, &options, budget); + let result = + grep::grep_search(files, &parsed, &options, budget, bigram_idx, bigram_overlay); if result.matches.is_empty() && file_offset == 0 { // Auto-retry: try broadening multi-word queries by dropping first non-constraint word @@ -277,8 +280,14 @@ impl FffServer { }; let (retry_options, _) = make_grep_options(output_mode, retry_mode, 0, context); - let retry_result = - grep::grep_search(files, &rest_parsed, &retry_options, budget); + let retry_result = grep::grep_search( + files, + &rest_parsed, + &retry_options, + budget, + bigram_idx, + bigram_overlay, + ); if !retry_result.matches.is_empty() && retry_result.matches.len() <= 10 { let mut cs = self.lock_cursors()?; @@ -306,7 +315,8 @@ impl FffServer { let fuzzy_query = cleanup_fuzzy_query(query); let (fuzzy_options, _) = make_grep_options(output_mode, GrepMode::Fuzzy, 0, Some(0)); let fuzzy_parsed = parser.parse(&fuzzy_query); - let fuzzy_result = grep::grep_search(files, &fuzzy_parsed, &fuzzy_options, budget); + let fuzzy_result = + grep::grep_search(files, &fuzzy_parsed, &fuzzy_options, budget, None, None); if !fuzzy_result.matches.is_empty() { let mut lines: Vec = Vec::new(); @@ -615,7 +625,8 @@ impl FffServer { }; let parsed = parser.parse(&full_query); - let fb_result = grep::grep_search(files, &parsed, &fallback_options, budget); + let fb_result = + grep::grep_search(files, &parsed, &fallback_options, budget, None, None); if !fb_result.matches.is_empty() { let fb_file_refs: Vec<&FileItem> = fb_result.files.to_vec(); diff --git a/crates/fff-nvim/Cargo.toml b/crates/fff-nvim/Cargo.toml index 810874d..7857917 100644 --- a/crates/fff-nvim/Cargo.toml +++ b/crates/fff-nvim/Cargo.toml @@ -44,7 +44,7 @@ thiserror = { workspace = true } tracing = { workspace = true } # Local crates -fff = { package = "fff-search", path = "../fff-core" , version = "0.4.0" } +fff = { package = "fff-search", path = "../fff-core" , version = "0.4.0", features = ["mimalloc-collect"] } fff-query-parser = { path = "../fff-query-parser" , version = "0.4.2" } # External dependencies diff --git a/crates/fff-nvim/benches/indexing_and_search.rs b/crates/fff-nvim/benches/indexing_and_search.rs index 24c135d..795fdf5 100644 --- a/crates/fff-nvim/benches/indexing_and_search.rs +++ b/crates/fff-nvim/benches/indexing_and_search.rs @@ -1,7 +1,10 @@ use criterion::{BenchmarkId, Criterion, black_box, criterion_group, criterion_main}; use fff::file_picker::{FFFMode, FilePicker}; -use fff::types::{FileItem, PaginationArgs}; -use fff::{FuzzySearchOptions, QueryParser, SharedFrecency, SharedPicker}; +use fff::types::{ContentCacheBudget, FileItem, PaginationArgs}; +use fff::{ + FuzzySearchOptions, GrepMode, GrepSearchOptions, QueryParser, SharedFrecency, SharedPicker, + build_bigram_index, grep, +}; use std::path::PathBuf; use std::sync::{Arc, RwLock}; use std::time::Duration; @@ -235,11 +238,12 @@ fn bench_search_queries(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -283,11 +287,12 @@ fn bench_search_thread_scaling(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed), + None, FuzzySearchOptions { max_threads: threads, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -329,11 +334,12 @@ fn bench_search_result_limits(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -387,11 +393,12 @@ fn bench_search_scalability(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(subset), black_box(&parsed), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -431,11 +438,12 @@ fn bench_search_ordering(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed_controller), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -454,11 +462,12 @@ fn bench_search_ordering(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed_controller), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -477,11 +486,12 @@ fn bench_search_ordering(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed_mod), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -499,11 +509,12 @@ fn bench_search_ordering(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed_mod), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -522,11 +533,12 @@ fn bench_search_ordering(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed_controller), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -544,11 +556,12 @@ fn bench_search_ordering(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed_controller), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -588,11 +601,12 @@ fn bench_pagination_performance(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -611,11 +625,12 @@ fn bench_pagination_performance(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -634,11 +649,12 @@ fn bench_pagination_performance(c: &mut Criterion) { let results = FilePicker::fuzzy_search( black_box(&files), black_box(&parsed), + None, FuzzySearchOptions { max_threads: 4, current_file: None, project_path: None, - last_same_query_match: None, + combo_boost_score_multiplier: 100, min_combo_count: 3, pagination: PaginationArgs { @@ -654,6 +670,89 @@ fn bench_pagination_performance(c: &mut Criterion) { group.finish(); } +/// Benchmark grep search with bigram index prefiltering +fn bench_grep_search(c: &mut Criterion) { + let (files, _sp, _sf) = match setup_once() { + Ok(result) => result, + Err(e) => { + eprintln!("Skipping grep benchmarks: {}", e); + return; + } + }; + + let budget = ContentCacheBudget::new_for_repo(files.len()); + + eprintln!(" Building bigram index for {} files...", files.len()); + let start = std::time::Instant::now(); + let bigram_index = build_bigram_index(&files, &budget); + eprintln!( + " Bigram index built in {:.2}s ({} columns)", + start.elapsed().as_secs_f64(), + bigram_index.columns_used(), + ); + let bigram_index = std::sync::Arc::new(bigram_index); + + let mut group = c.benchmark_group("grep"); + group.sample_size(50); + + let options = GrepSearchOptions { + max_file_size: 10 * 1024 * 1024, + max_matches_per_file: 0, + smart_case: true, + file_offset: 0, + page_limit: 100, + mode: GrepMode::PlainText, + time_budget_ms: 0, + before_context: 0, + after_context: 0, + classify_definitions: false, + }; + + let test_queries = vec![ + ("common", "struct"), + ("specific", "DEFINE_MUTEX"), + ("path_filter", "*.h mutex"), + ]; + + let grep_parser = fff::QueryParser::new(fff::GrepConfig); + + for (name, query) in &test_queries { + let parsed = grep_parser.parse(query); + + // With bigram index + group.bench_with_input(BenchmarkId::new("with_bigram", name), query, |b, _| { + b.iter(|| { + let result = grep::grep_search( + black_box(&files), + black_box(&parsed), + black_box(&options), + &budget, + Some(&bigram_index), + None, + ); + result.matches.len() + }); + }); + + // Without bigram index + group.bench_with_input(BenchmarkId::new("without_bigram", name), query, |b, _| { + b.iter(|| { + let result = grep::grep_search( + black_box(&files), + black_box(&parsed), + black_box(&options), + &budget, + None, + None, + ); + result.matches.len() + }); + }); + } + + group.finish(); +} + criterion_group!( benches, bench_indexing, @@ -663,6 +762,7 @@ criterion_group!( bench_search_scalability, bench_search_ordering, bench_pagination_performance, + bench_grep_search, ); criterion_main!(benches); diff --git a/crates/fff-nvim/src/bin/fuzzy_grep_test.rs b/crates/fff-nvim/src/bin/fuzzy_grep_test.rs index 7431c75..717b2e3 100644 --- a/crates/fff-nvim/src/bin/fuzzy_grep_test.rs +++ b/crates/fff-nvim/src/bin/fuzzy_grep_test.rs @@ -78,7 +78,14 @@ fn run_fuzzy_query(files: &[FileItem], query: &str, label: &str) { let parsed = parse_grep_query(query); let start = Instant::now(); - let result = grep_search(files, &parsed, &options, &fff::ContentCacheBudget::zero()); + let result = grep_search( + files, + &parsed, + &options, + &fff::ContentCacheBudget::zero(), + None, + None, + ); let elapsed = start.elapsed(); eprintln!("══════════════════════════════════════════════════════════════"); diff --git a/crates/fff-nvim/src/bin/grep_profiler.rs b/crates/fff-nvim/src/bin/grep_profiler.rs index 8e8dc39..8654397 100644 --- a/crates/fff-nvim/src/bin/grep_profiler.rs +++ b/crates/fff-nvim/src/bin/grep_profiler.rs @@ -11,6 +11,7 @@ use fff::FileItem; /// cargo build --release --bin grep_profiler /// ./target/release/grep_profiler [--path /path/to/repo] use fff::grep::{GrepMode, GrepSearchOptions, grep_search, parse_grep_query}; +use fff::types::{BigramFilter, BigramIndexBuilder, ContentCacheBudget}; use std::io::Read; use std::path::Path; use std::time::{Duration, Instant}; @@ -115,6 +116,7 @@ impl BenchStats { struct GrepBench<'a> { files: &'a [FileItem], options: GrepSearchOptions, + bigram_index: Option<&'a BigramFilter>, } impl<'a> GrepBench<'a> { @@ -125,6 +127,7 @@ impl<'a> GrepBench<'a> { fn with_mode(files: &'a [FileItem], mode: GrepMode) -> Self { Self { files, + bigram_index: None, options: GrepSearchOptions { max_file_size: 10 * 1024 * 1024, max_matches_per_file: 200, @@ -140,6 +143,11 @@ impl<'a> GrepBench<'a> { } } + fn with_bigram(mut self, index: &'a BigramFilter) -> Self { + self.bigram_index = Some(index); + self + } + /// Run a single grep search, return (duration, match_count, files_searched) fn run_once(&self, query: &str) -> (Duration, usize, usize) { let parsed = parse_grep_query(query); @@ -148,7 +156,9 @@ impl<'a> GrepBench<'a> { self.files, &parsed, &self.options, - &fff::ContentCacheBudget::default(), + &ContentCacheBudget::default(), + self.bigram_index, + None, ); let elapsed = start.elapsed(); (elapsed, result.matches.len(), result.total_files_searched) @@ -171,6 +181,23 @@ impl<'a> GrepBench<'a> { } } +fn build_bigram_index(files: &[FileItem]) -> BigramFilter { + use rayon::prelude::*; + + let builder = BigramIndexBuilder::new(files.len()); + let budget = ContentCacheBudget::default(); + + files.par_iter().enumerate().for_each(|(idx, file)| { + if !file.is_binary + && let Some(content) = file.get_content_for_search(&budget) + { + builder.add_file_content(idx, &content); + } + }); + + builder.compress() +} + fn fmt_dur(d: Duration) -> String { let us = d.as_micros(); if us > 1_000_000 { @@ -305,6 +332,27 @@ fn main() { print_row(name, &stats, matches, files_searched, *iters); } + // ── Bigram-accelerated benchmarks ─────────────────────────────────── + eprintln!("\n[3b/7] Building bigram index..."); + let bigram_start = Instant::now(); + let bigram_index = build_bigram_index(&files); + eprintln!( + " Built in {:.2}s ({} columns, {:.1} MB)\n", + bigram_start.elapsed().as_secs_f64(), + bigram_index.file_count(), + bigram_index.heap_bytes() as f64 / (1024.0 * 1024.0), + ); + + eprintln!("[3c/7] Bigram-accelerated warm benchmarks (same queries, with bigram prefilter)"); + print_header(); + + let bigram_bench = GrepBench::new(&files).with_bigram(&bigram_index); + for (name, query, iters) in &warm_queries { + let bigram_name = format!("bg_{}", name.strip_prefix("warm_").unwrap_or(name)); + let (stats, matches, files_searched) = bigram_bench.bench_query(query, *iters); + print_row(&bigram_name, &stats, matches, files_searched, *iters); + } + // ── Fuzzy grep benchmarks ───────────────────────────────────────────── eprintln!("\n[4/7] Fuzzy grep warm benchmarks"); eprintln!(" Running 3 warmup iterations, then measuring.\n"); @@ -462,6 +510,8 @@ fn main() { &parsed, &opts, &fff::ContentCacheBudget::unlimited(), + None, + None, ); let elapsed = start.elapsed(); eprintln!( @@ -483,7 +533,10 @@ fn main() { eprintln!("\n=== Summary ==="); let mmap_count = files .iter() - .filter(|f| f.get_mmap(&fff::ContentCacheBudget::unlimited()).is_some()) + .filter(|f| { + f.get_content_for_search(&fff::ContentCacheBudget::unlimited()) + .is_some() + }) .count(); eprintln!(" Files with cached mmap: {}", mmap_count); eprintln!(" Total indexed files: {}", files.len()); diff --git a/crates/fff-nvim/src/bin/grep_vs_rg.rs b/crates/fff-nvim/src/bin/grep_vs_rg.rs index 39c9333..9d5ab6b 100644 --- a/crates/fff-nvim/src/bin/grep_vs_rg.rs +++ b/crates/fff-nvim/src/bin/grep_vs_rg.rs @@ -210,7 +210,14 @@ fn run_fff_full(files: &[FileItem], query: &str) -> (usize, Duration) { classify_definitions: false, }; let start = Instant::now(); - let result = grep_search(files, &parsed, &options, &fff::ContentCacheBudget::zero()); + let result = grep_search( + files, + &parsed, + &options, + &fff::ContentCacheBudget::zero(), + None, + None, + ); let elapsed = start.elapsed(); (result.matches.len(), elapsed) } @@ -235,6 +242,8 @@ fn benchmark_fff_smart_case(files: &[FileItem], parsed: &FFFQuery<'_>) -> (usize parsed, &options, &fff::ContentCacheBudget::unlimited(), + None, + None, ); let elapsed = start.elapsed(); (result.matches.len(), elapsed) @@ -261,6 +270,8 @@ fn run_fff_page(files: &[FileItem], query: &str) -> (usize, Duration) { &parsed, &options, &fff::ContentCacheBudget::unlimited(), + None, + None, ); let elapsed = start.elapsed(); (result.matches.len(), elapsed) diff --git a/crates/fff-nvim/src/lib.rs b/crates/fff-nvim/src/lib.rs index d729453..ee33a03 100644 --- a/crates/fff-nvim/src/lib.rs +++ b/crates/fff-nvim/src/lib.rs @@ -96,7 +96,7 @@ pub fn init_file_picker(_: &Lua, base_path: String) -> LuaResult { FilePicker::new_with_shared_state( base_path, - false, + true, FFFMode::Neovim, Arc::clone(&FILE_PICKER), Arc::clone(&FRECENCY), @@ -127,7 +127,7 @@ fn reinit_file_picker_internal(path: &Path) -> Result<(), Error> { // Create new picker — this atomically replaces the old one via write lock FilePicker::new_with_shared_state( path.to_string_lossy().to_string(), - false, + true, FFFMode::Neovim, Arc::clone(&FILE_PICKER), Arc::clone(&FRECENCY), @@ -340,8 +340,16 @@ pub fn live_grep( classify_definitions: false, }; - let result = - fff::grep::grep_search(picker.get_files(), &parsed, &options, picker.cache_budget()); + let bigram_idx = picker.bigram_index.as_deref(); + let bigram_overlay = picker.bigram_overlay.as_deref(); + let result = fff::grep::grep_search( + picker.get_files(), + &parsed, + &options, + picker.cache_budget(), + bigram_idx, + bigram_overlay, + ); lua_types::GrepResultLua::from(result).into_lua(lua) } @@ -363,9 +371,9 @@ fn build_file_path_fallback(lua: &Lua, path: &Path, total_files: usize) -> LuaRe item.set("name", name.as_str())?; item.set("size", path.metadata().map(|m| m.len()).unwrap_or(0))?; item.set("modified", 0u64)?; - item.set("access_frecency_score", 0i64)?; - item.set("modification_frecency_score", 0i64)?; - item.set("total_frecency_score", 0i64)?; + item.set("access_frecency_score", 0i32)?; + item.set("modification_frecency_score", 0i32)?; + item.set("total_frecency_score", 0i32)?; item.set("git_status", "")?; item.set("is_binary", false)?; diff --git a/crates/fff-query-parser/Cargo.toml b/crates/fff-query-parser/Cargo.toml index 14f0c0f..d8672f1 100644 --- a/crates/fff-query-parser/Cargo.toml +++ b/crates/fff-query-parser/Cargo.toml @@ -17,8 +17,3 @@ zlob = ["dep:zlob"] zlob = { workspace = true, optional = true } [dev-dependencies] -criterion = { version = "0.5", features = ["html_reports"] } - -[[bench]] -name = "parse_bench" -harness = false diff --git a/doc/fff.nvim.txt b/doc/fff.nvim.txt index d749287..3133c19 100644 --- a/doc/fff.nvim.txt +++ b/doc/fff.nvim.txt @@ -1,4 +1,4 @@ -*fff.nvim.txt* For Neovim >= 0.10.0 Last change: 2026 March 20 +*fff.nvim.txt* For Neovim >= 0.10.0 Last change: 2026 March 24 ============================================================================== Table of Contents *fff.nvim-table-of-contents* diff --git a/packages/fff-bun/src/ffi.ts b/packages/fff-bun/src/ffi.ts index 29b14d2..ac914c1 100644 --- a/packages/fff-bun/src/ffi.ts +++ b/packages/fff-bun/src/ffi.ts @@ -241,9 +241,7 @@ function snakeToCamel(obj: unknown): unknown { const result: Record = {}; for (const [key, value] of Object.entries(obj as Record)) { - const camelKey = key.replace(/_([a-z])/g, (_, letter) => - letter.toUpperCase(), - ); + const camelKey = key.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase()); result[camelKey] = snakeToCamel(value); } return result; @@ -253,16 +251,18 @@ function snakeToCamel(obj: unknown): unknown { // FffResult byte offsets (must match #[repr(C)] layout on 64-bit) // { success: bool(1+7pad), error: *char(8), handle: *void(8), int_value: i64(8) } // --------------------------------------------------------------------------- -const RES_SUCCESS = 0; // bool (1 + 7 padding) -const RES_ERROR = 8; // *mut c_char (8) -const RES_HANDLE = 16; // *mut c_void (8) -const RES_INT_VALUE = 24; // i64 (8) +const RES_SUCCESS = 0; // bool (1 + 7 padding) +const RES_ERROR = 8; // *mut c_char (8) +const RES_HANDLE = 16; // *mut c_void (8) +const RES_INT_VALUE = 24; // i64 (8) /** * Read the FffResult envelope: check success, extract payload, free envelope. * On error returns a Result. On success returns the raw handle pointer and int_value. */ -function readResultEnvelope(resultPtr: Pointer | null): { success: true; handlePtr: number; intValue: number } | Result { +function readResultEnvelope( + resultPtr: Pointer | null, +): { success: true; handlePtr: number; intValue: number } | Result { if (resultPtr === null) { return err("FFI returned null pointer"); } @@ -651,16 +651,10 @@ function readGrepMatchStruct(p: number): GrepMatch { match.fuzzyScore = read.u16(pp, GM_FUZZY_SCORE); } if (ctxBeforeCount > 0) { - match.contextBefore = readCStringArray( - read.ptr(pp, GM_CTX_BEFORE), - ctxBeforeCount, - ); + match.contextBefore = readCStringArray(read.ptr(pp, GM_CTX_BEFORE), ctxBeforeCount); } if (ctxAfterCount > 0) { - match.contextAfter = readCStringArray( - read.ptr(pp, GM_CTX_AFTER), - ctxAfterCount, - ); + match.contextAfter = readCStringArray(read.ptr(pp, GM_CTX_AFTER), ctxAfterCount); } return match; @@ -824,13 +818,15 @@ export function ffiIsScanning(handle: NativeHandle): boolean { } // FffScanProgress { scanned_files_count: u64(8), is_scanning: bool(1+7pad) } -const SP_COUNT = 0; // u64 (8) -const SP_SCANNING = 8; // bool (1 + 7 pad) +const SP_COUNT = 0; // u64 (8) +const SP_SCANNING = 8; // bool (1 + 7 pad) /** * Get scan progress. */ -export function ffiGetScanProgress(handle: NativeHandle): Result<{ scannedFilesCount: number; isScanning: boolean }> { +export function ffiGetScanProgress( + handle: NativeHandle, +): Result<{ scannedFilesCount: number; isScanning: boolean }> { const library = loadLibrary(); const resultPtr = library.symbols.fff_get_scan_progress(handle); const envelope = readResultEnvelope(resultPtr); @@ -852,10 +848,7 @@ export function ffiGetScanProgress(handle: NativeHandle): Result<{ scannedFilesC /** * Wait for scan to complete. */ -export function ffiWaitForScan( - handle: NativeHandle, - timeoutMs: number, -): Result { +export function ffiWaitForScan(handle: NativeHandle, timeoutMs: number): Result { const library = loadLibrary(); const resultPtr = library.symbols.fff_wait_for_scan(handle, BigInt(timeoutMs)); return parseBoolResult(resultPtr); @@ -864,10 +857,7 @@ export function ffiWaitForScan( /** * Restart index in new path. */ -export function ffiRestartIndex( - handle: NativeHandle, - newPath: string, -): Result { +export function ffiRestartIndex(handle: NativeHandle, newPath: string): Result { const library = loadLibrary(); const resultPtr = library.symbols.fff_restart_index(handle, ptr(encodeString(newPath))); return parseVoidResult(resultPtr); diff --git a/packages/fff-bun/src/finder.ts b/packages/fff-bun/src/finder.ts index 7554d9a..42f44dd 100644 --- a/packages/fff-bun/src/finder.ts +++ b/packages/fff-bun/src/finder.ts @@ -430,4 +430,3 @@ export class FileFinder { return ffiHealthCheck(null, testPath || "") as Result; } } - diff --git a/packages/fff-bun/src/types.ts b/packages/fff-bun/src/types.ts index a8b95ac..1648e0a 100644 --- a/packages/fff-bun/src/types.ts +++ b/packages/fff-bun/src/types.ts @@ -360,4 +360,3 @@ export interface MultiGrepOptions { /** Number of context lines to include after each match (default: 0) */ afterContext?: number; } - diff --git a/packages/fff-node/src/ffi.ts b/packages/fff-node/src/ffi.ts index 17e3fdd..e5bd45b 100644 --- a/packages/fff-node/src/ffi.ts +++ b/packages/fff-node/src/ffi.ts @@ -45,7 +45,15 @@ import { wrapPointer, } from "ffi-rs"; import { findBinary } from "./binary.js"; -import type { FileItem, GrepMatch, GrepResult, Location, Result, Score, SearchResult } from "./types.js"; +import type { + FileItem, + GrepMatch, + GrepResult, + Location, + Result, + Score, + SearchResult, +} from "./types.js"; import { createGrepCursor, err } from "./types.js"; const LIBRARY_KEY = "fff_c"; @@ -216,7 +224,11 @@ function readResultEnvelope( } /** Call a function returning FffResult with void payload. */ -function callVoidResult(funcName: string, paramsType: DataType[], paramsValue: unknown[]): Result { +function callVoidResult( + funcName: string, + paramsType: DataType[], + paramsValue: unknown[], +): Result { const res = readResultEnvelope(funcName, paramsType, paramsValue); if ("ok" in res) return res; freeResult(res.rawPtr); @@ -224,7 +236,11 @@ function callVoidResult(funcName: string, paramsType: DataType[], paramsValue: u } /** Call a function returning FffResult with int_value payload. */ -function callIntResult(funcName: string, paramsType: DataType[], paramsValue: unknown[]): Result { +function callIntResult( + funcName: string, + paramsType: DataType[], + paramsValue: unknown[], +): Result { const res = readResultEnvelope(funcName, paramsType, paramsValue); if ("ok" in res) return res; const value = Number(res.struct.int_value); @@ -233,7 +249,11 @@ function callIntResult(funcName: string, paramsType: DataType[], paramsValue: un } /** Call a function returning FffResult with bool in int_value. */ -function callBoolResult(funcName: string, paramsType: DataType[], paramsValue: unknown[]): Result { +function callBoolResult( + funcName: string, + paramsType: DataType[], + paramsValue: unknown[], +): Result { const res = readResultEnvelope(funcName, paramsType, paramsValue); if ("ok" in res) return res; const value = Number(res.struct.int_value) !== 0; @@ -242,7 +262,11 @@ function callBoolResult(funcName: string, paramsType: DataType[], paramsValue: u } /** Call a function returning FffResult with a C string in handle. */ -function callStringResult(funcName: string, paramsType: DataType[], paramsValue: unknown[]): Result { +function callStringResult( + funcName: string, + paramsType: DataType[], + paramsValue: unknown[], +): Result { const res = readResultEnvelope(funcName, paramsType, paramsValue); if ("ok" in res) return res; const handlePtr = res.struct.handle; @@ -254,7 +278,11 @@ function callStringResult(funcName: string, paramsType: DataType[], paramsValue: } /** Call a function returning FffResult with a JSON string in handle. */ -function callJsonResult(funcName: string, paramsType: DataType[], paramsValue: unknown[]): Result { +function callJsonResult( + funcName: string, + paramsType: DataType[], + paramsValue: unknown[], +): Result { const res = readResultEnvelope(funcName, paramsType, paramsValue); if ("ok" in res) return res; const handlePtr = res.struct.handle; @@ -306,9 +334,9 @@ export function ffiCreate( const { rawPtr, struct: structData } = callRaw( "fff_create_instance", [ - DataType.String, // base_path - DataType.String, // frecency_db_path - DataType.String, // history_db_path + DataType.String, // base_path + DataType.String, // frecency_db_path + DataType.String, // history_db_path DataType.Boolean, // use_unsafe_no_lock DataType.Boolean, // warmup_mmap_cache DataType.Boolean, // ai_mode @@ -522,30 +550,30 @@ interface FffMatchRangeRaw { function readFileItemFromRaw(raw: FffFileItemRaw): FileItem { return { - path: readCString(raw.path) ?? "", - relativePath: readCString(raw.relative_path) ?? "", - fileName: readCString(raw.file_name) ?? "", - gitStatus: readCString(raw.git_status) ?? "", - size: Number(raw.size), - modified: Number(raw.modified), - accessFrecencyScore: Number(raw.access_frecency_score), + path: readCString(raw.path) ?? "", + relativePath: readCString(raw.relative_path) ?? "", + fileName: readCString(raw.file_name) ?? "", + gitStatus: readCString(raw.git_status) ?? "", + size: Number(raw.size), + modified: Number(raw.modified), + accessFrecencyScore: Number(raw.access_frecency_score), modificationFrecencyScore: Number(raw.modification_frecency_score), - totalFrecencyScore: Number(raw.total_frecency_score), + totalFrecencyScore: Number(raw.total_frecency_score), }; } function readScoreFromRaw(raw: FffScoreRaw): Score { return { - total: raw.total, - baseScore: raw.base_score, - filenameBonus: raw.filename_bonus, - specialFilenameBonus:raw.special_filename_bonus, - frecencyBoost: raw.frecency_boost, - distancePenalty: raw.distance_penalty, - currentFilePenalty: raw.current_file_penalty, - comboMatchBoost: raw.combo_match_boost, - exactMatch: raw.exact_match !== 0, - matchType: readCString(raw.match_type) ?? "", + total: raw.total, + baseScore: raw.base_score, + filenameBonus: raw.filename_bonus, + specialFilenameBonus: raw.special_filename_bonus, + frecencyBoost: raw.frecency_boost, + distancePenalty: raw.distance_penalty, + currentFilePenalty: raw.current_file_penalty, + comboMatchBoost: raw.combo_match_boost, + exactMatch: raw.exact_match !== 0, + matchType: readCString(raw.match_type) ?? "", }; } @@ -619,20 +647,20 @@ function readGrepMatchFromRaw(raw: FffGrepMatchRaw): GrepMatch { } const match: GrepMatch = { - path: readCString(raw.path) ?? "", - relativePath: readCString(raw.relative_path) ?? "", - fileName: readCString(raw.file_name) ?? "", - gitStatus: readCString(raw.git_status) ?? "", - lineContent: readCString(raw.line_content) ?? "", - size: Number(raw.size), - modified: Number(raw.modified), - totalFrecencyScore: Number(raw.total_frecency_score), - accessFrecencyScore: Number(raw.access_frecency_score), + path: readCString(raw.path) ?? "", + relativePath: readCString(raw.relative_path) ?? "", + fileName: readCString(raw.file_name) ?? "", + gitStatus: readCString(raw.git_status) ?? "", + lineContent: readCString(raw.line_content) ?? "", + size: Number(raw.size), + modified: Number(raw.modified), + totalFrecencyScore: Number(raw.total_frecency_score), + accessFrecencyScore: Number(raw.access_frecency_score), modificationFrecencyScore: Number(raw.modification_frecency_score), - isBinary: raw.is_binary !== 0, - lineNumber: Number(raw.line_number), - col: raw.col, - byteOffset: Number(raw.byte_offset), + isBinary: raw.is_binary !== 0, + lineNumber: Number(raw.line_number), + col: raw.col, + byteOffset: Number(raw.byte_offset), matchRanges, }; @@ -686,7 +714,10 @@ function parseGrepResult(rawPtr: JsExternal): Result { const items: GrepMatch[] = []; for (let i = 0; i < count; i++) { const rawMatch = callAccessor( - "fff_grep_result_get_match", handlePtr, i, FFF_GREP_MATCH_STRUCT, + "fff_grep_result_get_match", + handlePtr, + i, + FFF_GREP_MATCH_STRUCT, ); items.push(readGrepMatchFromRaw(rawMatch)); } @@ -770,12 +801,18 @@ function parseSearchResult(rawPtr: JsExternal): Result { for (let i = 0; i < count; i++) { const rawItem = callAccessor( - "fff_search_result_get_item", handlePtr, i, FFF_FILE_ITEM_STRUCT, + "fff_search_result_get_item", + handlePtr, + i, + FFF_FILE_ITEM_STRUCT, ); items.push(readFileItemFromRaw(rawItem)); const rawScore = callAccessor( - "fff_search_result_get_score", handlePtr, i, FFF_SCORE_STRUCT, + "fff_search_result_get_score", + handlePtr, + i, + FFF_SCORE_STRUCT, ); scores.push(readScoreFromRaw(rawScore)); } @@ -789,7 +826,12 @@ function parseSearchResult(rawPtr: JsExternal): Result { paramsValue: [handlePtr], }); - const result: SearchResult = { items, scores, totalMatched: sr.total_matched, totalFiles: sr.total_files }; + const result: SearchResult = { + items, + scores, + totalMatched: sr.total_matched, + totalFiles: sr.total_files, + }; if (location) { result.location = location; } @@ -817,15 +859,24 @@ export function ffiSearch( retType: DataType.External, paramsType: [ DataType.External, // handle - DataType.String, // query - DataType.String, // current_file - DataType.U32, // max_threads - DataType.U32, // page_index - DataType.U32, // page_size - DataType.I32, // combo_boost_multiplier - DataType.U32, // min_combo_count + DataType.String, // query + DataType.String, // current_file + DataType.U32, // max_threads + DataType.U32, // page_index + DataType.U32, // page_size + DataType.I32, // combo_boost_multiplier + DataType.U32, // min_combo_count + ], + paramsValue: [ + handle, + query, + currentFile, + maxThreads, + pageIndex, + pageSize, + comboBoostMultiplier, + minComboCount, ], - paramsValue: [handle, query, currentFile, maxThreads, pageIndex, pageSize, comboBoostMultiplier, minComboCount], freeResultMemory: false, }) as JsExternal; @@ -857,23 +908,31 @@ export function ffiLiveGrep( retType: DataType.External, paramsType: [ DataType.External, // handle - DataType.String, // query - DataType.U8, // mode - DataType.U64, // max_file_size - DataType.U32, // max_matches_per_file - DataType.Boolean, // smart_case - DataType.U32, // file_offset - DataType.U32, // page_limit - DataType.U64, // time_budget_ms - DataType.U32, // before_context - DataType.U32, // after_context - DataType.Boolean, // classify_definitions + DataType.String, // query + DataType.U8, // mode + DataType.U64, // max_file_size + DataType.U32, // max_matches_per_file + DataType.Boolean, // smart_case + DataType.U32, // file_offset + DataType.U32, // page_limit + DataType.U64, // time_budget_ms + DataType.U32, // before_context + DataType.U32, // after_context + DataType.Boolean, // classify_definitions ], paramsValue: [ - handle, query, grepModeToU8(mode), - maxFileSize, maxMatchesPerFile, smartCase, - fileOffset, pageLimit, timeBudgetMs, - beforeContext, afterContext, classifyDefinitions, + handle, + query, + grepModeToU8(mode), + maxFileSize, + maxMatchesPerFile, + smartCase, + fileOffset, + pageLimit, + timeBudgetMs, + beforeContext, + afterContext, + classifyDefinitions, ], freeResultMemory: false, }) as JsExternal; @@ -906,23 +965,31 @@ export function ffiMultiGrep( retType: DataType.External, paramsType: [ DataType.External, // handle - DataType.String, // patterns_joined - DataType.String, // constraints - DataType.U64, // max_file_size - DataType.U32, // max_matches_per_file - DataType.Boolean, // smart_case - DataType.U32, // file_offset - DataType.U32, // page_limit - DataType.U64, // time_budget_ms - DataType.U32, // before_context - DataType.U32, // after_context - DataType.Boolean, // classify_definitions + DataType.String, // patterns_joined + DataType.String, // constraints + DataType.U64, // max_file_size + DataType.U32, // max_matches_per_file + DataType.Boolean, // smart_case + DataType.U32, // file_offset + DataType.U32, // page_limit + DataType.U64, // time_budget_ms + DataType.U32, // before_context + DataType.U32, // after_context + DataType.Boolean, // classify_definitions ], paramsValue: [ - handle, patternsJoined, constraints, - maxFileSize, maxMatchesPerFile, smartCase, - fileOffset, pageLimit, timeBudgetMs, - beforeContext, afterContext, classifyDefinitions, + handle, + patternsJoined, + constraints, + maxFileSize, + maxMatchesPerFile, + smartCase, + fileOffset, + pageLimit, + timeBudgetMs, + beforeContext, + afterContext, + classifyDefinitions, ], freeResultMemory: false, }) as JsExternal; @@ -965,7 +1032,9 @@ interface FffScanProgressRaw { /** * Get scan progress. */ -export function ffiGetScanProgress(handle: NativeHandle): Result<{ scannedFilesCount: number; isScanning: boolean }> { +export function ffiGetScanProgress( + handle: NativeHandle, +): Result<{ scannedFilesCount: number; isScanning: boolean }> { loadLibrary(); const res = readResultEnvelope("fff_get_scan_progress", [DataType.External], [handle]); if ("ok" in res) return res; @@ -1001,14 +1070,22 @@ export function ffiGetScanProgress(handle: NativeHandle): Result<{ scannedFilesC * Wait for a tree scan to complete. */ export function ffiWaitForScan(handle: NativeHandle, timeoutMs: number): Result { - return callBoolResult("fff_wait_for_scan", [DataType.External, DataType.U64], [handle, timeoutMs]); + return callBoolResult( + "fff_wait_for_scan", + [DataType.External, DataType.U64], + [handle, timeoutMs], + ); } /** * Restart index in new path. */ export function ffiRestartIndex(handle: NativeHandle, newPath: string): Result { - return callVoidResult("fff_restart_index", [DataType.External, DataType.String], [handle, newPath]); + return callVoidResult( + "fff_restart_index", + [DataType.External, DataType.String], + [handle, newPath], + ); } /** @@ -1040,7 +1117,11 @@ export function ffiGetHistoricalQuery( handle: NativeHandle, offset: number, ): Result { - return callStringResult("fff_get_historical_query", [DataType.External, DataType.U64], [handle, offset]); + return callStringResult( + "fff_get_historical_query", + [DataType.External, DataType.U64], + [handle, offset], + ); } /** diff --git a/packages/fff-node/src/finder.ts b/packages/fff-node/src/finder.ts index 425a269..c66b04e 100644 --- a/packages/fff-node/src/finder.ts +++ b/packages/fff-node/src/finder.ts @@ -445,4 +445,3 @@ export class FileFinder { return ffiHealthCheck(null, testPath || "") as Result; } } - diff --git a/packages/fff-node/src/types.ts b/packages/fff-node/src/types.ts index a8b95ac..1648e0a 100644 --- a/packages/fff-node/src/types.ts +++ b/packages/fff-node/src/types.ts @@ -360,4 +360,3 @@ export interface MultiGrepOptions { /** Number of context lines to include after each match (default: 0) */ afterContext?: number; } -