diff --git a/crates/uffs-core/src/search/backend.rs b/crates/uffs-core/src/search/backend.rs index 41f2295f1..2651a26fa 100644 --- a/crates/uffs-core/src/search/backend.rs +++ b/crates/uffs-core/src/search/backend.rs @@ -1,16 +1,12 @@ // SPDX-License-Identifier: MPL-2.0 // Copyright (c) 2025-2026 SKY, LLC. -//! Search backend types: display rows, sort columns, filter modes, and -//! multi-drive search orchestration. +//! Search backend types: sort columns, filter modes, and multi-drive search +//! orchestration. //! -//! Exception: `file_size_policy` allows this file to exceed 800 LOC. -//! Rationale: cross-cutting facade — `DisplayRow`, `PhaseTimings`, -//! `SearchResult`, `FilterMode`, and `MultiDriveBackend` form a -//! cohesive contract surface referenced by every dispatch path, the -//! daemon wire layer, and the test harness. Splitting would -//! scatter the type definitions across files and break the -//! single-import convention downstream crates rely on. +//! The result-row type [`DisplayRow`] lives in the sibling `display_row` +//! module and is re-exported here, so the single-import convention downstream +//! crates rely on (`uffs_core::search::backend::DisplayRow`) is preserved. use alloc::sync::Arc; use std::time::Instant; @@ -27,286 +23,6 @@ use crate::search::field::FieldId; /// Sentinel: no truncation — return every matching record. const UNLIMITED: usize = usize::MAX; -/// A single displayable search result row. -/// -/// The filename is **not** stored separately — it is derived from the `path` -/// field using `name_start` (byte offset where the filename begins within -/// `path`). This avoids one heap allocation per result row. 
-/// -/// `Default` is implemented manually below: [`uffs_mft::platform::DriveLetter`] -/// has no `Default` impl (it's a validated `A..=Z` newtype with no canonical -/// zero), but the `sort_rows_with_fold` hot path uses -/// [`core::mem::take`] to move rows out of a `&mut [DisplayRow]` slice -/// as part of a Schwartzian decorate/sort/undecorate transform. The -/// take leaves a transient placeholder in the slice that's -/// immediately overwritten by the put-back step, so any consistent -/// drive letter works for the placeholder. -#[derive(Debug, Clone)] -#[expect( - clippy::partial_pub_fields, - reason = "name_start is private by design — accessed via name() method" -)] -pub struct DisplayRow { - /// Record index within the compact/cache file. - pub record_index: u32, - /// Drive letter this result belongs to. - pub drive: uffs_mft::platform::DriveLetter, - /// Full resolved path (e.g., `C:\Users\file.txt`). - pub path: String, - /// Byte offset within `path` where the filename begins. - /// - /// `self.name()` returns `&self.path[name_start..]`. - /// Computed once at construction from the last `\` separator. - name_start: u32, - /// File size in bytes. - pub size: u64, - /// Whether this is a directory. - pub is_directory: bool, - /// Last modified time (Unix microseconds). - pub modified: i64, - /// Creation time (Unix microseconds). - pub created: i64, - /// Last access time (Unix microseconds). - pub accessed: i64, - /// Raw NTFS `FILE_ATTRIBUTE_*` flags. - pub flags: u32, - /// Allocated size on disk in bytes. - pub allocated: u64, - /// Descendant count (directories only). - pub descendants: u32, - /// Sum of logical file sizes in entire subtree (directories only). - pub treesize: u64, - /// Sum of allocated sizes in entire subtree (directories only). - pub tree_allocated: u64, - /// WI-4.4 forensic flag: this record's own leaf name is ill-formed (its - /// true bytes are not valid UTF-8 — an unpaired UTF-16 surrogate). 
- /// Computed in the hot path from the lossless name bytes; the lossy - /// `path`/`name()` view cannot recover this (it is always valid UTF-8). - pub malformed: bool, - /// WI-4.4 forensic flag: some component of the resolved path is ill-formed - /// (so a clean-named file under a crooked directory is flagged). Superset - /// of [`Self::malformed`]; computed during parent-chain resolution. - pub malformed_path: bool, - /// WI-4.4 forensic evidence: hex of the true (WTF-8) leaf-name bytes. - /// `Some` for every malformed leaf and `None` otherwise, so the - /// hex-encode/allocation cost is paid only for the vanishing fraction of - /// ill-formed names — it is keyed on name validity, never on projection. - /// JSON output therefore carries it by default for malformed rows. - pub name_hex: Option, -} - -impl DisplayRow { - /// Construct a `DisplayRow`, computing `name_start` from the path. - #[must_use] - #[expect( - clippy::too_many_arguments, - reason = "flat struct — all fields are required, no logical grouping" - )] - pub fn new( - record_index: u32, - drive: uffs_mft::platform::DriveLetter, - path: String, - size: u64, - is_directory: bool, - modified: i64, - created: i64, - accessed: i64, - flags: u32, - allocated: u64, - descendants: u32, - treesize: u64, - tree_allocated: u64, - ) -> Self { - let name_start = uffs_mft::len_to_u32(path.rfind('\\').map_or(0, |pos| pos + 1)); - Self { - record_index, - drive, - path, - name_start, - size, - is_directory, - modified, - created, - accessed, - flags, - allocated, - descendants, - treesize, - tree_allocated, - // Forensic carriers default to "well-formed / not requested"; the - // hot path overwrites them via `with_forensics` when it has the - // lossless name bytes. Keeping them out of `new()`'s arg list - // leaves the many existing call sites untouched. 
- malformed: false, - malformed_path: false, - name_hex: None, - } - } - - /// Attach the WI-4.4 forensic facts computed in the hot path against the - /// lossless name bytes. Chained after [`Self::new`] at the single result- - /// materialization chokepoint so the lossy `path` boundary is never the - /// source of these values. - #[must_use] - #[inline] - pub fn with_forensics( - mut self, - malformed: bool, - malformed_path: bool, - name_hex: Option, - ) -> Self { - self.malformed = malformed; - self.malformed_path = malformed_path; - self.name_hex = name_hex; - self - } - - /// Filename portion of the path (e.g., `file.txt`). - /// - /// Zero-cost: returns a `&str` slice into the owned `path`. - /// - /// The `uffs_format::FormatRow::name` trait method forwards to - /// this inherent method — keeping the inherent impl named `name` - /// (rather than e.g. `file_name`) preserves the accessor's - /// ergonomics across the many `uffs-core` call sites that - /// predate the trait. The intentional collision with the trait - /// method silences `clippy::same_name_method` here. - #[must_use] - #[inline] - #[expect( - clippy::same_name_method, - reason = "shared name with the FormatRow trait impl is intentional — see method-level doc" - )] - pub fn name(&self) -> &str { - self.path.get(self.name_start as usize..).unwrap_or("") - } - - /// Directory portion of path (up to and including the last `\`). - /// - /// Uses `name_start` for zero-cost slicing (no `rfind` needed). - #[must_use] - #[inline] - pub fn path_dir(&self) -> &str { - self.path - .get(..self.name_start as usize) - .unwrap_or(&self.path) - } -} - -/// Feed `DisplayRow` straight into the shared `uffs-format` writer. -/// -/// The daemon holds `DisplayRow` directly on the search hot path, so -/// this impl lets `uffs_format::write_rows::` run -/// without an intermediate copy. 
Every accessor is O(1) and just -/// hands back a struct field (or the pre-computed filename slice), -/// matching the trait's inlineability requirement. -/// -/// Manual `Default` impl — see the struct doc-comment for why we -/// don't derive it. All fields default to their natural zero -/// (`0`, `String::new()`, `false`) except `drive`, which we set to -/// [`uffs_mft::platform::DriveLetter::A`] purely as a placeholder for -/// [`core::mem::take`] in the sort hot path. Callers never observe -/// this value: the take is immediately followed by a put-back. -impl Default for DisplayRow { - fn default() -> Self { - Self { - record_index: 0, - drive: uffs_mft::platform::DriveLetter::A, - path: String::new(), - name_start: 0, - size: 0, - is_directory: false, - modified: 0, - created: 0, - accessed: 0, - flags: 0, - allocated: 0, - descendants: 0, - treesize: 0, - tree_allocated: 0, - malformed: false, - malformed_path: false, - name_hex: None, - } - } -} - -/// The trait method `name()` collides with `DisplayRow::name()` (the -/// inherent accessor that pre-dates the trait); the trait impl -/// delegates to the inherent impl so the behaviour is identical. -/// The `clippy::same_name_method` lint is silenced on the inherent -/// method above — see its `#[expect]` attribute. -impl uffs_format::FormatRow for DisplayRow { - #[inline] - fn drive(&self) -> char { - // `uffs-format` is a foundation crate that intentionally - // doesn't depend on `uffs-mft`, so the trait surface - // stays `char`. `DriveLetter::as_char` is the canonical - // zero-cost conversion to the ASCII letter. 
- self.drive.as_char() - } - #[inline] - fn path(&self) -> &str { - &self.path - } - #[inline] - fn name(&self) -> &str { - Self::name(self) - } - #[inline] - fn size(&self) -> u64 { - self.size - } - #[inline] - fn is_directory(&self) -> bool { - self.is_directory - } - #[inline] - fn modified(&self) -> i64 { - self.modified - } - #[inline] - fn created(&self) -> i64 { - self.created - } - #[inline] - fn accessed(&self) -> i64 { - self.accessed - } - #[inline] - fn flags(&self) -> u32 { - self.flags - } - #[inline] - fn allocated(&self) -> u64 { - self.allocated - } - #[inline] - fn descendants(&self) -> u32 { - self.descendants - } - #[inline] - fn treesize(&self) -> u64 { - self.treesize - } - #[inline] - fn tree_allocated(&self) -> u64 { - self.tree_allocated - } - #[inline] - fn malformed(&self) -> bool { - self.malformed - } - #[inline] - fn malformed_path(&self) -> bool { - self.malformed_path - } - #[inline] - fn name_hex(&self) -> Option<&str> { - self.name_hex.as_deref() - } -} - /// Sub-phase wall-clock breakdown inside the `pattern == "*"` pipeline. /// /// Populated only when the `match_all` dispatch path is taken (via @@ -625,6 +341,10 @@ impl MultiDriveBackend { }); let needle = super::dispatch::fold_needle(case_sensitive, pattern, fold); let is_path = !is_match_all && !is_regex && crate::search::tree::is_path_pattern(&needle); + let is_prefix = !is_match_all + && !is_regex + && !is_path + && crate::search::tree::is_prefix_pattern(&needle).is_some(); if is_match_all { let (match_all_rows, match_all_timings) = super::query::collect_global_top_n( @@ -686,6 +406,35 @@ impl MultiDriveBackend { }; } } + } else if is_prefix { + // Trigram-accelerated prefix scan (`win*`). `is_prefix` already + // proved `is_prefix_pattern` holds, so the strip is infallible. 
+ if let Some(prefix) = crate::search::tree::is_prefix_pattern(&needle) { + let drive_results: Vec> = self + .drives + .par_iter() + .map(|drive| { + super::query::search_compact_drive_prefix( + drive, + prefix, + limit, + case_sensitive, + ) + }) + .collect(); + for drive_rows in drive_results { + rows.extend(drive_rows); + } + super::filters::apply_filter(&mut rows, filter_mode); + super::filters::apply_search_filters(&mut rows, search_filters); + sort_rows( + &mut rows, + self.sort_column, + self.sort_desc, + &self.extra_sort_tiers, + ); + rows.truncate(limit); + } } else { let drive_results: Vec> = self .drives @@ -733,6 +482,8 @@ impl MultiDriveBackend { "regex" } else if is_path { "tree" + } else if is_prefix { + "prefix" } else { "trigram" }; @@ -888,6 +639,10 @@ pub fn search_index( }); let needle = super::dispatch::fold_needle(case_sensitive, pattern, fold); let is_path = !is_match_all && !is_regex && crate::search::tree::is_path_pattern(&needle); + let is_prefix = !is_match_all + && !is_regex + && !is_path + && crate::search::tree::is_prefix_pattern(&needle).is_some(); tracing::debug!( pattern, @@ -935,6 +690,7 @@ pub fn search_index( &active_drives, &needle, is_path, + is_prefix, case_sensitive, whole_word, match_path, @@ -951,7 +707,7 @@ pub fn search_index( let scanned = active_drives.iter().map(|dr| dr.records.len()).sum(); let wall_ms = start.elapsed().as_millis(); - let mode = pick_mode_label(is_match_all, is_regex, is_path); + let mode = pick_mode_label(is_match_all, is_regex, is_path, is_prefix); tracing::debug!( target: "cache_profile", wall_ms = %wall_ms, @@ -1020,6 +776,7 @@ fn apply_bloom_pre_check(active_drives: &mut Vec<&DriveCompactIndex>, ext_terms: // so existing `use uffs_core::search::backend::*;` call sites see no // change. 
pub use super::dataframe_convert::{dataframe_to_display_rows, display_rows_to_dataframe}; +pub use super::display_row::DisplayRow; pub use super::sorting::{format_sort_spec, parse_sort_spec, sort_rows, sort_rows_with_fold}; #[cfg(test)] diff --git a/crates/uffs-core/src/search/dispatch.rs b/crates/uffs-core/src/search/dispatch.rs index 20dada03b..fa155b849 100644 --- a/crates/uffs-core/src/search/dispatch.rs +++ b/crates/uffs-core/src/search/dispatch.rs @@ -378,18 +378,20 @@ pub(super) fn dispatch_regex( Some(rows) } -/// Dispatch the default branch: tree-walk for path patterns, trigram -/// for name patterns, both fanned across drives then filtered + sorted -/// + truncated. +/// Dispatch the default branch: tree-walk for path patterns, trigram- +/// accelerated prefix scan for prefix patterns (`win*`), trigram for the +/// remaining name patterns — all fanned across drives then filtered + +/// sorted + truncated. #[expect(clippy::too_many_arguments, reason = "single call site, flat args")] #[expect( clippy::fn_params_excessive_bools, - reason = "the four bools (is_path / case_sensitive / whole_word / match_path) are orthogonal runtime switches, each controlling a distinct aspect of trigram vs tree matching; bundling them into an enum would lose that orthogonality" + reason = "the bools (is_path / is_prefix / case_sensitive / whole_word / match_path) are orthogonal runtime switches, each controlling a distinct aspect of trigram vs tree matching; bundling them into an enum would lose that orthogonality" )] pub(super) fn dispatch_trigram_or_tree( active_drives: &[&DriveCompactIndex], needle: &str, is_path: bool, + is_prefix: bool, case_sensitive: bool, whole_word: bool, match_path: bool, @@ -405,6 +407,30 @@ pub(super) fn dispatch_trigram_or_tree( .map(|drive| { if is_path { super::query::search_compact_drive_tree(drive, needle, limit) + } else if is_prefix { + // `is_prefix` was validated upstream via `is_prefix_pattern`; + // re-extract the prefix and fall back to 
the generic scan if + // the (should-be-impossible) re-validation ever fails. + super::tree::is_prefix_pattern(needle).map_or_else( + || { + super::query::search_compact_drive( + drive, + needle, + limit, + case_sensitive, + whole_word, + match_path, + ) + }, + |prefix| { + super::query::search_compact_drive_prefix( + drive, + prefix, + limit, + case_sensitive, + ) + }, + ) } else { super::query::search_compact_drive( drive, @@ -427,10 +453,15 @@ pub(super) fn dispatch_trigram_or_tree( /// Pick the `cache_profile` `mode` tracing label for the chosen /// dispatch branch. Pure function — no side effects. +#[expect( + clippy::fn_params_excessive_bools, + reason = "the four bools are independent runtime dispatch flags; bundling into an enum would lose orthogonality and not improve clarity" +)] pub(super) const fn pick_mode_label( is_match_all: bool, is_regex: bool, is_path: bool, + is_prefix: bool, ) -> &'static str { if is_match_all { "match-all" @@ -438,6 +469,8 @@ pub(super) const fn pick_mode_label( "regex" } else if is_path { "tree" + } else if is_prefix { + "prefix" } else { "trigram" } diff --git a/crates/uffs-core/src/search/display_row.rs b/crates/uffs-core/src/search/display_row.rs new file mode 100644 index 000000000..31b00c6f0 --- /dev/null +++ b/crates/uffs-core/src/search/display_row.rs @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright (c) 2025-2026 SKY, LLC. + +//! The [`DisplayRow`] result-row type and its `uffs_format::FormatRow` impl. +//! +//! Extracted from `backend.rs` to keep that file under the 800-LOC file-size +//! policy. `DisplayRow` is re-exported from `backend` (`pub use`) so the +//! single-import convention downstream crates rely on +//! (`uffs_core::search::backend::DisplayRow`) is preserved. + +/// A single displayable search result row. +/// +/// The filename is **not** stored separately — it is derived from the `path` +/// field using `name_start` (byte offset where the filename begins within +/// `path`). 
This avoids one heap allocation per result row. +/// +/// `Default` is implemented manually below: [`uffs_mft::platform::DriveLetter`] +/// has no `Default` impl (it's a validated `A..=Z` newtype with no canonical +/// zero), but the `sort_rows_with_fold` hot path uses +/// [`core::mem::take`] to move rows out of a `&mut [DisplayRow]` slice +/// as part of a Schwartzian decorate/sort/undecorate transform. The +/// take leaves a transient placeholder in the slice that's +/// immediately overwritten by the put-back step, so any consistent +/// drive letter works for the placeholder. +#[derive(Debug, Clone)] +#[expect( + clippy::partial_pub_fields, + reason = "name_start is private by design — accessed via name() method" +)] +pub struct DisplayRow { + /// Record index within the compact/cache file. + pub record_index: u32, + /// Drive letter this result belongs to. + pub drive: uffs_mft::platform::DriveLetter, + /// Full resolved path (e.g., `C:\Users\file.txt`). + pub path: String, + /// Byte offset within `path` where the filename begins. + /// + /// `self.name()` returns `&self.path[name_start..]`. + /// Computed once at construction from the last `\` separator. + name_start: u32, + /// File size in bytes. + pub size: u64, + /// Whether this is a directory. + pub is_directory: bool, + /// Last modified time (Unix microseconds). + pub modified: i64, + /// Creation time (Unix microseconds). + pub created: i64, + /// Last access time (Unix microseconds). + pub accessed: i64, + /// Raw NTFS `FILE_ATTRIBUTE_*` flags. + pub flags: u32, + /// Allocated size on disk in bytes. + pub allocated: u64, + /// Descendant count (directories only). + pub descendants: u32, + /// Sum of logical file sizes in entire subtree (directories only). + pub treesize: u64, + /// Sum of allocated sizes in entire subtree (directories only). 
+ pub tree_allocated: u64, + /// WI-4.4 forensic flag: this record's own leaf name is ill-formed (its + /// true bytes are not valid UTF-8 — an unpaired UTF-16 surrogate). + /// Computed in the hot path from the lossless name bytes; the lossy + /// `path`/`name()` view cannot recover this (it is always valid UTF-8). + pub malformed: bool, + /// WI-4.4 forensic flag: some component of the resolved path is ill-formed + /// (so a clean-named file under a crooked directory is flagged). Superset + /// of [`Self::malformed`]; computed during parent-chain resolution. + pub malformed_path: bool, + /// WI-4.4 forensic evidence: hex of the true (WTF-8) leaf-name bytes. + /// `Some` for every malformed leaf and `None` otherwise, so the + /// hex-encode/allocation cost is paid only for the vanishing fraction of + /// ill-formed names — it is keyed on name validity, never on projection. + /// JSON output therefore carries it by default for malformed rows. + pub name_hex: Option, +} + +impl DisplayRow { + /// Construct a `DisplayRow`, computing `name_start` from the path. + #[must_use] + #[expect( + clippy::too_many_arguments, + reason = "flat struct — all fields are required, no logical grouping" + )] + pub fn new( + record_index: u32, + drive: uffs_mft::platform::DriveLetter, + path: String, + size: u64, + is_directory: bool, + modified: i64, + created: i64, + accessed: i64, + flags: u32, + allocated: u64, + descendants: u32, + treesize: u64, + tree_allocated: u64, + ) -> Self { + let name_start = uffs_mft::len_to_u32(path.rfind('\\').map_or(0, |pos| pos + 1)); + Self { + record_index, + drive, + path, + name_start, + size, + is_directory, + modified, + created, + accessed, + flags, + allocated, + descendants, + treesize, + tree_allocated, + // Forensic carriers default to "well-formed / not requested"; the + // hot path overwrites them via `with_forensics` when it has the + // lossless name bytes. 
Keeping them out of `new()`'s arg list + // leaves the many existing call sites untouched. + malformed: false, + malformed_path: false, + name_hex: None, + } + } + + /// Attach the WI-4.4 forensic facts computed in the hot path against the + /// lossless name bytes. Chained after [`Self::new`] at the single result- + /// materialization chokepoint so the lossy `path` boundary is never the + /// source of these values. + #[must_use] + #[inline] + pub fn with_forensics( + mut self, + malformed: bool, + malformed_path: bool, + name_hex: Option, + ) -> Self { + self.malformed = malformed; + self.malformed_path = malformed_path; + self.name_hex = name_hex; + self + } + + /// Filename portion of the path (e.g., `file.txt`). + /// + /// Zero-cost: returns a `&str` slice into the owned `path`. + /// + /// The `uffs_format::FormatRow::name` trait method forwards to + /// this inherent method — keeping the inherent impl named `name` + /// (rather than e.g. `file_name`) preserves the accessor's + /// ergonomics across the many `uffs-core` call sites that + /// predate the trait. The intentional collision with the trait + /// method silences `clippy::same_name_method` here. + #[must_use] + #[inline] + #[expect( + clippy::same_name_method, + reason = "shared name with the FormatRow trait impl is intentional — see method-level doc" + )] + pub fn name(&self) -> &str { + self.path.get(self.name_start as usize..).unwrap_or("") + } + + /// Directory portion of path (up to and including the last `\`). + /// + /// Uses `name_start` for zero-cost slicing (no `rfind` needed). + #[must_use] + #[inline] + pub fn path_dir(&self) -> &str { + self.path + .get(..self.name_start as usize) + .unwrap_or(&self.path) + } +} + +/// Feed `DisplayRow` straight into the shared `uffs-format` writer. +/// +/// The daemon holds `DisplayRow` directly on the search hot path, so +/// this impl lets `uffs_format::write_rows::` run +/// without an intermediate copy. 
Every accessor is O(1) and just +/// hands back a struct field (or the pre-computed filename slice), +/// matching the trait's inlineability requirement. +/// +/// Manual `Default` impl — see the struct doc-comment for why we +/// don't derive it. All fields default to their natural zero +/// (`0`, `String::new()`, `false`) except `drive`, which we set to +/// [`uffs_mft::platform::DriveLetter::A`] purely as a placeholder for +/// [`core::mem::take`] in the sort hot path. Callers never observe +/// this value: the take is immediately followed by a put-back. +impl Default for DisplayRow { + fn default() -> Self { + Self { + record_index: 0, + drive: uffs_mft::platform::DriveLetter::A, + path: String::new(), + name_start: 0, + size: 0, + is_directory: false, + modified: 0, + created: 0, + accessed: 0, + flags: 0, + allocated: 0, + descendants: 0, + treesize: 0, + tree_allocated: 0, + malformed: false, + malformed_path: false, + name_hex: None, + } + } +} + +/// The trait method `name()` collides with `DisplayRow::name()` (the +/// inherent accessor that pre-dates the trait); the trait impl +/// delegates to the inherent impl so the behaviour is identical. +/// The `clippy::same_name_method` lint is silenced on the inherent +/// method above — see its `#[expect]` attribute. +impl uffs_format::FormatRow for DisplayRow { + #[inline] + fn drive(&self) -> char { + // `uffs-format` is a foundation crate that intentionally + // doesn't depend on `uffs-mft`, so the trait surface + // stays `char`. `DriveLetter::as_char` is the canonical + // zero-cost conversion to the ASCII letter. 
+ self.drive.as_char() + } + #[inline] + fn path(&self) -> &str { + &self.path + } + #[inline] + fn name(&self) -> &str { + Self::name(self) + } + #[inline] + fn size(&self) -> u64 { + self.size + } + #[inline] + fn is_directory(&self) -> bool { + self.is_directory + } + #[inline] + fn modified(&self) -> i64 { + self.modified + } + #[inline] + fn created(&self) -> i64 { + self.created + } + #[inline] + fn accessed(&self) -> i64 { + self.accessed + } + #[inline] + fn flags(&self) -> u32 { + self.flags + } + #[inline] + fn allocated(&self) -> u64 { + self.allocated + } + #[inline] + fn descendants(&self) -> u32 { + self.descendants + } + #[inline] + fn treesize(&self) -> u64 { + self.treesize + } + #[inline] + fn tree_allocated(&self) -> u64 { + self.tree_allocated + } + #[inline] + fn malformed(&self) -> bool { + self.malformed + } + #[inline] + fn malformed_path(&self) -> bool { + self.malformed_path + } + #[inline] + fn name_hex(&self) -> Option<&str> { + self.name_hex.as_deref() + } +} diff --git a/crates/uffs-core/src/search/mod.rs b/crates/uffs-core/src/search/mod.rs index a77cbf3d2..4654ef78b 100644 --- a/crates/uffs-core/src/search/mod.rs +++ b/crates/uffs-core/src/search/mod.rs @@ -12,6 +12,7 @@ pub mod columns; mod dataframe_convert; pub mod derived; mod dispatch; +mod display_row; pub mod field; pub mod filters; pub mod query; diff --git a/crates/uffs-core/src/search/query/mod.rs b/crates/uffs-core/src/search/query/mod.rs index 98476c154..7310ef431 100644 --- a/crates/uffs-core/src/search/query/mod.rs +++ b/crates/uffs-core/src/search/query/mod.rs @@ -10,6 +10,8 @@ mod numeric_sort_key; mod numeric_top_n; mod path_only_top_n; mod path_sorted_top_n; +mod prefix_search; +mod row_resolve; use alloc::collections::BinaryHeap; use std::sync::LazyLock; @@ -17,6 +19,8 @@ use std::sync::LazyLock; use numeric_top_n::collect_global_top_n_numeric; use path_only_top_n::collect_path_only_sorted_top_n; use path_sorted_top_n::collect_path_sorted_top_n; +pub(crate) use 
prefix_search::search_compact_drive_prefix; +use row_resolve::indices_to_rows; use super::backend::{DisplayRow, FilterMode, PhaseTimings}; use super::field::FieldId; @@ -648,7 +652,7 @@ fn hex_encode(bytes: &[u8]) -> String { /// Returns a 3-byte `&str` without heap allocation. Uses safe /// `from_utf8` with a fallback — the bytes are always valid ASCII. #[inline] -pub(super) fn stack_volume_prefix( +pub(crate) fn stack_volume_prefix( buf: &mut [u8; 4], letter: uffs_mft::platform::DriveLetter, ) -> &str { @@ -677,42 +681,6 @@ pub(super) fn heap_push_capped(heap: &mut BinaryHeap, entry: T, limit } } -/// Convert a list of record indices into `DisplayRow`s with resolved paths. -fn indices_to_rows( - drive: &DriveCompactIndex, - indices: &[u32], - volume_prefix: &str, -) -> Vec { - let mut dir_cache = tree::dir_cache_with_capacity(256); - let mut mal_cache = tree::malformed_cache_with_capacity(256); - indices - .iter() - .filter_map(|&record_idx| { - let rec = drive.records.get(record_idx as usize)?; - let name = rec.name(&drive.names); - if name.is_empty() { - return None; - } - let (path, path_malformed) = tree::resolve_path_cached_with_malformed( - drive, - record_idx as usize, - volume_prefix, - &mut dir_cache, - &mut mal_cache, - ); - let forensics = row_forensics(rec, &drive.names, path_malformed); - Some(make_display_row( - record_idx, - drive.letter, - rec, - name, - path, - forensics, - )) - }) - .collect() -} - // ════════════════════════════════════════════════════════════════════════ // REGRESSION TESTS — End-to-End Compact Search Parity // diff --git a/crates/uffs-core/src/search/query/numeric_top_n.rs b/crates/uffs-core/src/search/query/numeric_top_n.rs index fc2e28b3c..46c12d632 100644 --- a/crates/uffs-core/src/search/query/numeric_top_n.rs +++ b/crates/uffs-core/src/search/query/numeric_top_n.rs @@ -572,6 +572,16 @@ fn sort_and_localise( sort_desc: bool, limit: usize, ) -> Vec<(u16, u32, i64)> { + // Unlimited query fast-path: when `limit` admits 
every candidate, the + // value-sort is wasted work — the downstream `backend::sort_rows` re-sorts + // the materialised rows by the user's column anyway, and `truncate` is a + // no-op. Skipping it saves a full O(N log N) pass over millions of tuples + // on `*` full-scan (e.g. 7.9 M rows on C,D). We still do the cheap + // MFT-locality sort so path resolution keeps its warm-`DirCache` ordering. + if limit >= candidates.len() { + candidates.sort_unstable_by_key(|&(drive_idx, rec_idx, _)| (drive_idx, rec_idx)); + return candidates; + } if sort_desc { candidates.sort_unstable_by_key(|entry| core::cmp::Reverse(entry.2)); } else { diff --git a/crates/uffs-core/src/search/query/prefix_search.rs b/crates/uffs-core/src/search/query/prefix_search.rs new file mode 100644 index 000000000..5f5a19989 --- /dev/null +++ b/crates/uffs-core/src/search/query/prefix_search.rs @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright (c) 2025-2026 SKY, LLC. + +//! Prefix search using trigram-accelerated lookup. +//! +//! Extracted from `mod.rs` to satisfy the 800-LOC file-size policy. + +use crate::compact::DriveCompactIndex; +use crate::search::query::{indices_to_rows, stack_volume_prefix}; + +/// Whether cache profiling is enabled (`UFFS_CACHE_PROFILE` env var). +static CACHE_PROFILE: std::sync::LazyLock = + std::sync::LazyLock::new(|| std::env::var_os("UFFS_CACHE_PROFILE").is_some()); + +/// Search a single drive using trigram index for prefix queries (e.g., `win*`). +/// +/// Uses the first 3 characters of the prefix to narrow candidates via the +/// trigram index, then filters by full prefix match. Significantly faster +/// than full scan for large drives. +/// +/// The caller guarantees `prefix` came from +/// [`crate::search::tree::is_prefix_pattern`], so it is ≥ 3 bytes and free of +/// wildcards / path separators. 
+#[must_use] +pub(crate) fn search_compact_drive_prefix( + drive: &DriveCompactIndex, + prefix: &str, + limit: usize, + case_sensitive: bool, +) -> Vec { + let mut vp_buf = [0_u8; 4]; + let volume_prefix = stack_volume_prefix(&mut vp_buf, drive.letter); + let profile = *CACHE_PROFILE; + + let t_tri = std::time::Instant::now(); + + // Get trigram candidates using first 3 chars of prefix. + // get() safely handles any byte boundaries; prefix is ASCII from pattern. + let trigram_needle = prefix.get(..prefix.len().min(3)).unwrap_or(prefix); + let candidates = drive.trigram.search(trigram_needle, drive.fold); + + let tri_ms = t_tri.elapsed().as_millis(); + let tri_count = candidates.as_ref().map_or(0, Vec::len); + + let t_match = std::time::Instant::now(); + let mut match_indices = Vec::new(); + + if let Some(candidate_indices) = candidates { + // Pre-fold the prefix for case-insensitive matching. + let mut fold_buf: Vec = Vec::with_capacity(prefix.len()); + let prefix_folded = if case_sensitive { + prefix.to_owned() + } else { + drive.fold.fold_into(prefix, &mut fold_buf).to_owned() + }; + + for rec_idx in candidate_indices { + let Some(rec) = drive.records.get(rec_idx as usize) else { + continue; + }; + + let name = rec.name(&drive.names); + if name.is_empty() { + continue; + } + + // Check prefix match. 
+ let matches = if case_sensitive { + name.starts_with(prefix) + } else { + let mut name_buf: Vec = Vec::with_capacity(name.len()); + let name_folded = drive.fold.fold_into(name, &mut name_buf); + name_folded.starts_with(&prefix_folded) + }; + + if matches { + match_indices.push(rec_idx); + if match_indices.len() >= limit { + break; + } + } + } + } + + let match_ms = t_match.elapsed().as_millis(); + let match_count = match_indices.len(); + + let t_resolve = std::time::Instant::now(); + let rows = indices_to_rows(drive, &match_indices, volume_prefix); + let resolve_ms = t_resolve.elapsed().as_millis(); + + if profile { + tracing::debug!( + target: "cache_profile", + drive = %drive.letter, + tri_ms = %tri_ms, + tri_count, + match_ms = %match_ms, + match_count, + resolve_ms = %resolve_ms, + prefix = %prefix, + "search_prefix" + ); + } + + rows +} diff --git a/crates/uffs-core/src/search/query/row_resolve.rs b/crates/uffs-core/src/search/query/row_resolve.rs new file mode 100644 index 000000000..d40e25405 --- /dev/null +++ b/crates/uffs-core/src/search/query/row_resolve.rs @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright (c) 2025-2026 SKY, LLC. + +//! Record-index → [`DisplayRow`] path resolution. +//! +//! Extracted from `mod.rs` to keep that file under the 800-LOC file-size +//! policy. Hosts the size-adaptive dispatch ([`indices_to_rows`]) and its +//! sequential / parallel variants, plus the chunk-size constant that doubles +//! as the sequential-vs-parallel threshold. + +use rayon::prelude::*; + +use super::{DisplayRow, make_display_row, row_forensics}; +use crate::compact::DriveCompactIndex; +use crate::search::tree; + +/// Chunk size for parallel path resolution. At ~370 ns per candidate, +/// a 4 K chunk runs in ~1.5 ms — well above rayon's task-dispatch floor +/// (~1 μs). Also the threshold below which path resolution stays +/// sequential: tiny result sets (e.g. 
an `exact` query returning a +/// handful of rows) must NOT pay rayon's submission cost, which shows up +/// as p95 tail jitter rather than mean latency. Above it, prefix / +/// substring queries (10 K–35 K rows) fan out across workers. +const RESOLVE_CHUNK_SIZE: usize = 4096; + +/// Convert a list of record indices into `DisplayRow`s with resolved paths. +/// +/// Dispatches on result-set size: small sets (`< RESOLVE_CHUNK_SIZE`) resolve +/// sequentially to avoid rayon's task-submission cost (which would otherwise +/// surface as p95 tail jitter on tiny `exact` queries), while large sets fan +/// out across rayon workers with one `DirCache` per chunk. +pub(crate) fn indices_to_rows( + drive: &DriveCompactIndex, + indices: &[u32], + volume_prefix: &str, +) -> Vec { + // Parallel overhead is only worth it above a chunk's worth of candidates. + if indices.len() < RESOLVE_CHUNK_SIZE { + return indices_to_rows_sequential(drive, indices, volume_prefix); + } + indices_to_rows_parallel(drive, indices, volume_prefix) +} + +/// Sequential path resolution for small candidate sets (`< +/// RESOLVE_CHUNK_SIZE`). +fn indices_to_rows_sequential( + drive: &DriveCompactIndex, + indices: &[u32], + volume_prefix: &str, +) -> Vec { + let mut dir_cache = tree::dir_cache_with_capacity(256); + let mut mal_cache = tree::malformed_cache_with_capacity(256); + indices + .iter() + .filter_map(|&record_idx| { + let rec = drive.records.get(record_idx as usize)?; + let name = rec.name(&drive.names); + if name.is_empty() { + return None; + } + let (path, path_malformed) = tree::resolve_path_cached_with_malformed( + drive, + record_idx as usize, + volume_prefix, + &mut dir_cache, + &mut mal_cache, + ); + let forensics = row_forensics(rec, &drive.names, path_malformed); + Some(make_display_row( + record_idx, + drive.letter, + rec, + name, + path, + forensics, + )) + }) + .collect() +} + +/// Parallel path resolution for large candidate sets using rayon. 
+/// +/// Each chunk owns its own `DirCache` / `MalformedCache` so workers never +/// contend; sibling records within a chunk keep the cache warm. Chunk-local +/// row vectors are concatenated in order via `reduce`, preserving the input +/// ordering exactly so the downstream sort sees the same candidate sequence +/// the sequential path would produce. +fn indices_to_rows_parallel( + drive: &DriveCompactIndex, + indices: &[u32], + volume_prefix: &str, +) -> Vec { + indices + .par_chunks(RESOLVE_CHUNK_SIZE) + .map(|chunk| { + let mut dir_cache = tree::dir_cache_with_capacity(256); + let mut mal_cache = tree::malformed_cache_with_capacity(256); + let mut local_rows = Vec::with_capacity(chunk.len()); + + for &record_idx in chunk { + let Some(rec) = drive.records.get(record_idx as usize) else { + continue; + }; + let name = rec.name(&drive.names); + if name.is_empty() { + continue; + } + let (path, path_malformed) = tree::resolve_path_cached_with_malformed( + drive, + record_idx as usize, + volume_prefix, + &mut dir_cache, + &mut mal_cache, + ); + let forensics = row_forensics(rec, &drive.names, path_malformed); + local_rows.push(make_display_row( + record_idx, + drive.letter, + rec, + name, + path, + forensics, + )); + } + local_rows + }) + .reduce(Vec::new, |mut acc, mut chunk_rows| { + acc.append(&mut chunk_rows); + acc + }) +} diff --git a/crates/uffs-core/src/search/query_tests.rs b/crates/uffs-core/src/search/query_tests.rs index 4ceeb03a6..900a916d0 100644 --- a/crates/uffs-core/src/search/query_tests.rs +++ b/crates/uffs-core/src/search/query_tests.rs @@ -318,6 +318,61 @@ fn unlimited_returns_more_than_capped() { assert!(all.rows.len() > cap.rows.len(), "unlimited > capped"); } +// ═══════════════════════════════════════════════════════════════════════ +// Prefix search (search_compact_drive_prefix) — trigram fast-path +// ═══════════════════════════════════════════════════════════════════════ + +#[test] +fn prefix_search_matches_generic_glob_path() { + // The 
trigram-accelerated prefix path must return exactly the same set of + // rows as the ground-truth generic glob scan. `f000` matches f00000..f00099. + let drive = build_large_drive(1_500); + let prefix_rows = search_compact_drive_prefix(&drive, "f000", 10_000, false); + let glob_rows = search_compact_drive(&drive, "f000*", 10_000, false, false, false); + + let mut prefix_names: Vec<&str> = prefix_rows.iter().map(DisplayRow::name).collect(); + let mut glob_names: Vec<&str> = glob_rows.iter().map(DisplayRow::name).collect(); + prefix_names.sort_unstable(); + glob_names.sort_unstable(); + + assert!(!prefix_names.is_empty(), "fixture must contain f000* files"); + assert_eq!( + prefix_names, glob_names, + "prefix fast-path must return the same set as the generic glob scan" + ); +} + +#[test] +fn prefix_search_respects_limit() { + let drive = build_large_drive(1_500); + let rows = search_compact_drive_prefix(&drive, "f00", 25, false); + assert!( + rows.len() <= 25, + "prefix search must respect limit, got {}", + rows.len() + ); +} + +#[test] +fn large_glob_uses_parallel_resolve_with_correct_rows() { + // 9 000 files all share the "f0" stem, so a "f0*" glob yields more than + // RESOLVE_CHUNK_SIZE (4 096) matches and drives indices_to_rows down its + // parallel branch. Verify that path returns every match with intact paths + // (no dropped, duplicated, or misordered rows from the chunk reduce). 
+ let drive = build_large_drive(9_000); + let rows = search_compact_drive(&drive, "f0*", 20_000, false, false, false); + assert_eq!( + rows.len(), + 9_000, + "every f0* file must resolve via the parallel path" + ); + assert!( + rows.iter() + .all(|row| row.path.starts_with("C:\\") && row.name().starts_with("f0")), + "parallel-resolved rows must keep correct volume prefix and name" + ); +} + // ═══════════════════════════════════════════════════════════════════════ // Regex search (search_compact_drive_regex) // ═══════════════════════════════════════════════════════════════════════ diff --git a/crates/uffs-core/src/search/tree.rs b/crates/uffs-core/src/search/tree.rs index 85e8ad5b8..5a7397439 100644 --- a/crates/uffs-core/src/search/tree.rs +++ b/crates/uffs-core/src/search/tree.rs @@ -313,6 +313,38 @@ pub(crate) fn is_path_pattern(pattern: &str) -> bool { pattern.contains('\\') || pattern.contains('/') } +/// Returns `Some(prefix)` if the pattern is a simple prefix query (e.g., +/// `win*`). +/// +/// A prefix query: +/// - Ends with `*` +/// - Contains no other wildcards (`*`, `?`) before the trailing `*` +/// - Contains no path separators +/// - Is at least 3 characters (for trigram effectiveness) +/// +/// The 3-char minimum is what makes the trigram pre-filter worthwhile: the +/// trigram index keys on 3-byte windows, so a 1–2 char prefix would still +/// require a full scan. Below the floor we fall through to the regular +/// `search_compact_drive` path instead. +#[must_use] +pub fn is_prefix_pattern(pattern: &str) -> Option<&str> { + // Must end with exactly one trailing `*`. + let prefix = pattern.strip_suffix('*')?; + + // Must have content and be at least 3 chars (for trigram). + if prefix.len() < 3 { + return None; + } + + // Must not contain other wildcards or path separators. 
+ if prefix.contains('*') || prefix.contains('?') || prefix.contains('\\') || prefix.contains('/') + { + return None; + } + + Some(prefix) +} + /// Search using tree traversal for path patterns like `\photos\*.jpg`. /// /// Strategy: @@ -611,3 +643,42 @@ fn glob_match(text: &[u8], pattern: &[u8]) -> bool { pi == pattern.len() } + +#[cfg(test)] +mod tests { + use super::is_prefix_pattern; + + #[test] + fn prefix_accepts_simple_trailing_star() { + assert_eq!(is_prefix_pattern("win*"), Some("win")); + assert_eq!(is_prefix_pattern("kernel*"), Some("kernel")); + } + + #[test] + fn prefix_rejects_short_prefix_below_trigram_floor() { + // < 3 chars: trigram index can't accelerate, so not a prefix query. + assert_eq!(is_prefix_pattern("ab*"), None); + assert_eq!(is_prefix_pattern("a*"), None); + assert_eq!(is_prefix_pattern("*"), None); + } + + #[test] + fn prefix_rejects_missing_trailing_star() { + // No trailing `*` => exact/substring, not a prefix query. + assert_eq!(is_prefix_pattern("windows"), None); + } + + #[test] + fn prefix_rejects_interior_wildcards() { + // A second wildcard before the trailing `*` is a glob, not a prefix. + assert_eq!(is_prefix_pattern("wi*n*"), None); + assert_eq!(is_prefix_pattern("w?n*"), None); + } + + #[test] + fn prefix_rejects_path_separators() { + // Path-anchored patterns route through the tree walker instead. 
+ assert_eq!(is_prefix_pattern("C:\\Win*"), None); + assert_eq!(is_prefix_pattern("dir/sub*"), None); + } +} diff --git a/docs/benchmarks/2026-06-v0.5.120-vs-everything.md b/docs/benchmarks/2026-06-v0.5.120-vs-everything.md new file mode 100644 index 000000000..dc0d1baa2 --- /dev/null +++ b/docs/benchmarks/2026-06-v0.5.120-vs-everything.md @@ -0,0 +1,109 @@ +# UFFS v0.5.120 — Search Latency vs Everything (June 2026 snapshot) + +**Against** Everything (voidtools) 1.4.1.1032 +**Tested on** AMD Ryzen 9 3900XT · 64 GB DDR4-3600 · Windows 11 Pro 24H2 (build 26100) +**Scope** C: + D: NTFS volumes · 7 971 908 live file records +**Measured** 2026-06-09 · UFFS v0.5.120 · HOT phase, file sink, p50/p95 over the benchmark run +**Reproduces** via [`scripts/windows/cross-tool-benchmark.rs`](../../scripts/windows/cross-tool-benchmark.rs) + +> This is a **snapshot** — a factual record of one cross-tool run on the date above. It states +> results, not methods. For the fairness doctrine and how each phase is measured, see +> [`docs/benchmarks/methodology.md`](methodology.md). + +--- + +## TL;DR + +- **UFFS wins 17 of 18 targeted head-to-head cells against Everything at p50** across six pattern classes (exact, prefix, rare-extension, common-extension, regex-alternation, substring) on C:, D:, and the combined C+D index. **Median ratio ≈ 0.52× — UFFS is roughly 1.9× faster on the median interactive query.** +- The 18th cell (C: prefix) is a **1 ms statistical tie** (UFFS 72 ms vs ES 71 ms). +- **The gap widens as the result set grows.** On the combined C+D index, `*.dll` (207 K rows) is **2.6× faster** and `substring` (37 K rows) is **2.4× faster** than Everything. +- **Full-scan export is a workload Everything does not run** in this harness. UFFS streams the complete 7.97 M-row CSV in 3.3 s (combined C+D). + +--- + +## Head-to-head — interactive queries (UFFS vs Everything) + +HOT phase, file sink (`--out` / `-export-csv`). Lower is better. **UFFS/ES** < 1.0 means UFFS is faster. 
+Row counts are matched across tools within live-filesystem drift. + +| Drive | Pattern | UFFS p50 | UFFS p95 | ES p50 | ES p95 | UFFS/ES | Rows | +|-------|---------|---------:|---------:|-------:|-------:|--------:|-----:| +| C: | exact | **20 ms** | 236 ms | 46 ms | 52 ms | **0.43×** | 30 | +| C: | prefix | 72 ms | 81 ms | 71 ms | 79 ms | 1.01×* | 34 095 | +| C: | ext_rare | **20 ms** | 21 ms | 34 ms | 37 ms | **0.59×** | 1 | +| C: | ext_dll | **91 ms** | 94 ms | 199 ms | 215 ms | **0.46×** | 162 330 | +| C: | ext_regex_alt | **28 ms** | 46 ms | 53 ms | 56 ms | **0.53×** | 15 082 | +| C: | substring | **39 ms** | 41 ms | 75 ms | 77 ms | **0.52×** | 24 229 | +| D: | exact | **20 ms** | 278 ms | 43 ms | 44 ms | **0.47×** | 3 | +| D: | prefix | **38 ms** | 53 ms | 49 ms | 70 ms | **0.78×** | 8 732 | +| D: | ext_rare | **20 ms** | 20 ms | 38 ms | 41 ms | **0.53×** | 11 | +| D: | ext_dll | **36 ms** | 38 ms | 88 ms | 90 ms | **0.41×** | 44 529 | +| D: | ext_regex_alt | **28 ms** | 32 ms | 52 ms | 57 ms | **0.54×** | 10 438 | +| D: | substring | **34 ms** | 38 ms | 61 ms | 63 ms | **0.56×** | 12 458 | +| C+D: | exact | **22 ms** | 211 ms | 56 ms | 59 ms | **0.39×** | 33 | +| C+D: | prefix | **79 ms** | 82 ms | 89 ms | 91 ms | **0.89×** | 42 827 | +| C+D: | ext_rare | **20 ms** | 23 ms | 38 ms | 39 ms | **0.53×** | 12 | +| C+D: | ext_dll | **101 ms** | 114 ms | 258 ms | 264 ms | **0.39×** | 206 859 | +| C+D: | ext_regex_alt | **30 ms** | 50 ms | 68 ms | 69 ms | **0.44×** | 25 520 | +| C+D: | substring | **45 ms** | 50 ms | 110 ms | 115 ms | **0.41×** | 36 687 | + +\* C: prefix is a 1 ms statistical tie (UFFS 72 ms vs ES 71 ms), inside run-to-run noise. Counted as a tie, not a win. + +**Median p50 ratio ≈ 0.52× — UFFS is ~1.9× faster on the median interactive query.** + +### What the table shows + +- **Every targeted query is a UFFS win or tie.** The single near-tie (C: prefix) is one millisecond. 
+- **The advantage holds at both ends of the result-set range.** Small exact-match sets are a 2.1–2.6× win; the largest set measured (`C+D: *.dll`, 207 K rows) is also a 2.6× win. UFFS scales in both directions. +- **`exact` p95 spikes** (211–278 ms) reflect per-invocation CLI process spawn on a fresh `uffs.exe` each round, not query cost — the daemon-side latency for these 3–33 row queries is in the low-single-digit milliseconds. p50 is the representative interactive figure. + +--- + +## Full-scan export — the workload Everything does not run here + +Everything's command-line export is not exercised for unbounded `*` full-scan in this harness. UFFS writes the complete result set to CSV directly from the daemon: + +| Index | UFFS `*` → CSV (p50) | Rows | +|-------|---------------------:|-----:| +| C: | 1.5 s | 3 216 011 | +| D: | 2.1 s | 4 755 889 | +| C+D: | **3.3 s** | **7 971 908** | + +Combined throughput on the C+D index is **≈ 2.4 M records/sec** through the daemon → CSV pipe. `--hide-system --hide-ads` strips NTFS system files and Alternate Data Streams to match Everything's default scope. + +--- + +## Test environment + +| | | +|-|-| +| CPU | AMD Ryzen 9 3900XT, 12 cores / 24 threads | +| RAM | 64 GB DDR4-3600 | +| OS | Windows 11 Pro 24H2 (build 26100) | +| Benchmarked volumes | C: + D: NTFS, 7 971 908 total file records | +| UFFS | v0.5.120 (`uffs.exe`, Rust), `cargo build --release` | +| Everything | 1.4.1.1032 (engine), driven via the `es.exe` command-line interface | + +Both tools are measured fully warm (Everything keeps its index resident; UFFS serves from a resident daemon). Each head-to-head pair runs back-to-back in the same shell session on the same OS page-cache state, with matched patterns, drives, and output sink. A live NTFS filesystem drifts by a handful of files between runs (< 0.01 % at this scale); timings are unaffected.
+ +--- + +## Reproducing + +```powershell +# Elevated PowerShell, repository root, after `cargo build --release` +rust-script .\scripts\windows\cross-tool-benchmark.rs --drives C,D --rounds 20 --sinks file +``` + +The script emits the summary and head-to-head tables this report is built from. + +--- + +## What this snapshot does not claim + +- **"Fastest file search on Windows."** Different workloads have different winners; this snapshot measures the six targeted patterns above plus full-scan export, on C: + D:, on one machine, on one date. +- **"Best tool for every user."** Everything remains an excellent choice for desktop-interactive single-drive lookups. UFFS is the better fit when "huge", "scripted", "structured", "aggregated", or "AI-agent-accessible" describes the workload. + +--- + +*Snapshot compiled 2026-06-09 from a single cross-tool benchmark run. Numbers reflect UFFS v0.5.120 on the stated hardware and volumes.* diff --git a/scripts/ci/file_size_exceptions.txt b/scripts/ci/file_size_exceptions.txt index aafce93ee..ccd25ed0b 100644 --- a/scripts/ci/file_size_exceptions.txt +++ b/scripts/ci/file_size_exceptions.txt @@ -21,5 +21,4 @@ crates/uffs-core/src/search/query_tests.rs|PERMANENT: Integration query test sui crates/uffs-core/src/aggregate/integration_tests.rs|PERMANENT: Aggregate engine integration test suite; shared synthetic drive fixture requires cohesion crates/uffs-mft/src/platform/volume.rs|PERMANENT: Volume handle + write-protect fallback handles; splitting would fragment the handle lifecycle crates/uffs-mft/src/platform/system.rs|PERMANENT: Cross-platform module covering Windows Win32 FFI (privilege, volume, drive detection) and Unix stubs (geteuid, /proc/meminfo, sysctl); splitting by OS would duplicate shared types or add indirection for marginal gain -crates/uffs-core/src/search/backend.rs|PERMANENT: Search backend facade — DisplayRow, PhaseTimings, SearchResult, FilterMode, MultiDriveBackend orchestrator; cross-cutting types for the whole 
search pipeline crates/uffs-daemon/src/lifecycle.rs|PERMANENT: LifecycleManager + LifecycleHandle + idle-timer state machine; splitting the cohesive run_idle_timer + load_stalled_force_retire + extended_timeout_for_activity cluster fragments the active-connection / load-stall / shutdown semantics across files diff --git a/scripts/verify_parity.rs b/scripts/verify_parity.rs index 5dd000234..6d902390b 100644 --- a/scripts/verify_parity.rs +++ b/scripts/verify_parity.rs @@ -1886,15 +1886,19 @@ fn verify_single_drive( println!(" Computing streaming SHA256 + order-independent fingerprints..."); let t_hash = Instant::now(); - let golden_stats = compute_streaming_stats(&golden_baseline_file); + // The golden baseline (cpp_*.txt) is immutable across reruns, so its hash + // is cached in a sidecar keyed on (size, mtime). Only the regenerated Rust + // output is hashed every run. + let (golden_stats, golden_cached) = compute_streaming_stats_cached(&golden_baseline_file); let rust_stats = compute_streaming_stats(rust_output); let hash_elapsed = t_hash.elapsed(); println!( - " Golden baseline: {} ({} lines) [{:.1}s]", + " Golden baseline: {} ({} lines) [{:.1}s{}]", golden_stats.ordered_hash, golden_stats.line_count, - hash_elapsed.as_secs_f64() + hash_elapsed.as_secs_f64(), + if golden_cached { ", golden cached" } else { "" } ); println!( " Rust output: {} ({} lines)", @@ -3145,6 +3149,89 @@ fn compute_streaming_stats(path: &Path) -> StreamingFileStats { } } +/// File identity used to validate a cached fingerprint: (size_bytes, mtime_nanos). +fn file_identity(path: &Path) -> Option<(u64, u128)> { + let meta = fs::metadata(path).ok()?; + let mtime_ns = meta + .modified() + .ok()? + .duration_since(std::time::UNIX_EPOCH) + .ok()? + .as_nanos(); + Some((meta.len(), mtime_ns)) +} + +/// Sidecar path holding the cached streaming stats for a baseline file. 
+fn parity_hash_sidecar(path: &Path) -> PathBuf { + let mut name = path.file_name().unwrap_or_default().to_os_string(); + name.push(".parityhash"); + path.with_file_name(name) +} + +/// Load cached `StreamingFileStats` for `path`, but only if the sidecar's +/// recorded (size, mtime) matches the current file (else the baseline changed). +fn load_cached_stats(path: &Path, identity: (u64, u128)) -> Option { + let contents = fs::read_to_string(parity_hash_sidecar(path)).ok()?; + let (mut size, mut mtime_ns): (Option, Option) = (None, None); + let (mut ordered_hash, mut line_count): (Option, Option) = (None, None); + let (mut xor_fp, mut sum_fp): (Option, Option) = (None, None); + for line in contents.lines() { + let mut parts = line.splitn(2, ' '); + let key = parts.next().unwrap_or(""); + let val = parts.next().unwrap_or(""); + match key { + "size" => size = val.parse().ok(), + "mtime_ns" => mtime_ns = val.parse().ok(), + "ordered_hash" => ordered_hash = Some(val.to_string()), + "line_count" => line_count = val.parse().ok(), + "xor_fingerprint" => xor_fp = val.parse().ok(), + "sum_fingerprint" => sum_fp = val.parse().ok(), + _ => {} + } + } + if size? != identity.0 || mtime_ns? != identity.1 { + return None; + } + Some(StreamingFileStats { + ordered_hash: ordered_hash?, + line_count: line_count?, + xor_fingerprint: xor_fp?, + sum_fingerprint: sum_fp?, + }) +} + +/// Persist `StreamingFileStats` to the sidecar cache (best-effort; failures +/// just mean the next run recomputes). 
+fn store_cached_stats(path: &Path, identity: (u64, u128), stats: &StreamingFileStats) { + let body = format!( + "parityhash v1\nsize {}\nmtime_ns {}\nordered_hash {}\nline_count {}\nxor_fingerprint {}\nsum_fingerprint {}\n", + identity.0, + identity.1, + stats.ordered_hash, + stats.line_count, + stats.xor_fingerprint, + stats.sum_fingerprint, + ); + let _ = fs::write(parity_hash_sidecar(path), body); +} + +/// Like `compute_streaming_stats`, but caches the result in a `.parityhash` +/// sidecar keyed on (size, mtime). Intended for the immutable golden baseline: +/// reruns skip rehashing the multi-GB `cpp_*.txt` unless it actually changes. +/// Returns `(stats, cache_hit)`. +fn compute_streaming_stats_cached(path: &Path) -> (StreamingFileStats, bool) { + let Some(identity) = file_identity(path) else { + // Could not stat the file → fall back to an uncached compute. + return (compute_streaming_stats(path), false); + }; + if let Some(cached) = load_cached_stats(path, identity) { + return (cached, true); + } + let stats = compute_streaming_stats(path); + store_cached_stats(path, identity, &stats); + (stats, false) +} + /// Check if two files have the same lines (order-independent) using streaming stats. fn is_sorted_match(a: &StreamingFileStats, b: &StreamingFileStats) -> bool { a.line_count == b.line_count