Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 48 additions & 39 deletions src/watcher.zig
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,42 @@ const WorkerParsedResults = struct {
/// trigram (see effective_skip_trigram); past this cap the file is skipped.
/// Was 512KB, which silently dropped 512KB-1MB source files entirely.
const max_indexed_file_bytes = 2 * 1024 * 1024;

/// #635: byte threshold for the trigram index. The bound is inclusive: files up
/// to and including this size get trigram coverage; larger files (up to
/// max_indexed_file_bytes) still get outline+word indexing but skip trigrams to
/// bound memory on large repos. Replaces a bare `1024 * 1024` literal that was
/// previously duplicated across seven call sites.
const max_trigram_file_bytes = 1024 * 1024;

/// #635: shared read gate used by every index path, so the size cap and the
/// binary (null-byte) check live in exactly one place and cannot drift between
/// call sites.
///
/// `size` is the file size the caller already obtained from stat. When it
/// exceeds `max_indexed_file_bytes` the file is not read and null is returned;
/// a warning is logged first if `warn_oversize` is set. Otherwise the file is
/// read into memory owned by `alloc`. If a null byte appears within the first
/// 512 bytes the buffer is freed and null is returned. On success the caller
/// owns the returned slice. Read errors propagate to the caller.
fn readIndexableFile(
    io: std.Io,
    dir: std.Io.Dir,
    path: []const u8,
    alloc: std.mem.Allocator,
    size: u64,
    warn_oversize: bool,
) !?[]const u8 {
    // Number of leading bytes probed for a null byte when sniffing binary content.
    const binary_probe_bytes = 512;

    const over_cap = size > max_indexed_file_bytes;
    if (over_cap and warn_oversize) {
        // Still reachable via codedb_read (disk fallback) but invisible to
        // search/symbol/outline, so surface the skip rather than drop it silently.
        std.log.warn("codedb: not indexing {s} ({d} bytes > {d} cap) — reachable only via codedb_read", .{ path, size, max_indexed_file_bytes });
    }
    if (over_cap) return null;

    const bytes = try dir.readFileAlloc(io, path, alloc, .limited(max_indexed_file_bytes));

    // Treat the file as binary if any of its leading bytes is a null byte.
    const probe = bytes[0..@min(bytes.len, binary_probe_bytes)];
    for (probe) |byte| {
        if (byte == 0) {
            alloc.free(bytes);
            return null;
        }
    }
    return bytes;
}
const skip_dirs = [_][]const u8{
".git",
".claude",
Expand Down Expand Up @@ -453,23 +489,13 @@ fn parseInitialScanEntry(io: std.Io, root: []const u8, entry: InitialScanEntry,
const dir = try std.Io.Dir.cwd().openDir(io, root, .{});
defer dir.close(io);
const stat = try dir.statFile(io, entry.path, .{});
if (stat.size > max_indexed_file_bytes) {
// #635: surface the skip instead of dropping it silently. Reachable via
// codedb_read (disk fallback) but invisible to search/symbol/outline.
std.log.warn("codedb: not indexing {s} ({d} bytes > {d} cap) — reachable only via codedb_read", .{ entry.path, stat.size, max_indexed_file_bytes });
return null;
}
const content = try dir.readFileAlloc(io, entry.path, arena_alloc, .limited(max_indexed_file_bytes));
const check_len = @min(content.len, 512);
for (content[0..check_len]) |c| {
if (c == 0) return null;
}
const content = (try readIndexableFile(io, dir, entry.path, arena_alloc, stat.size, true)) orelse return null;
// Threshold for including a file in the trigram index. Bumped from 64KB to
// 1MB after the search-shootout bench (issue: large code files like
// ReactFiberCompleteWork.js at 77KB were invisible to substring search,
// causing agents to miss call sites in them). 1MB covers all reasonable
// code files; minified/generated bundles past 1MB are correctly skipped.
const effective_skip_trigram = entry.skip_trigram or (content.len > 1024 * 1024);
const effective_skip_trigram = entry.skip_trigram or (content.len > max_trigram_file_bytes);
const parsed = try explore_mod.Explorer.parseContentForIndexing(arena_alloc, entry.path, content);
return .{
.path = entry.path,
Expand Down Expand Up @@ -607,12 +633,7 @@ fn readFileEntry(io: std.Io, root: []const u8, entry: InitialScanEntry, arena_al
const dir = std.Io.Dir.cwd().openDir(io, root, .{}) catch return null;
defer dir.close(io);
const stat = dir.statFile(io, entry.path, .{}) catch return null;
if (stat.size > max_indexed_file_bytes) return null;
const c = dir.readFileAlloc(io, entry.path, arena_alloc, .limited(max_indexed_file_bytes)) catch return null;
const check_len = @min(c.len, 512);
for (c[0..check_len]) |ch| {
if (ch == 0) return null;
}
const c = (readIndexableFile(io, dir, entry.path, arena_alloc, stat.size, false) catch return null) orelse return null;
return .{ .path = entry.path, .content = c };
}

Expand Down Expand Up @@ -665,7 +686,7 @@ fn cachedTrigramExtractWorker(results: *TriExtractResults, entries: []const Cach
defer local.deinit();
local.ensureTotalCapacity(4096) catch {};
for (entries) |entry| {
if (entry.content.len > 1024 * 1024) continue;
if (entry.content.len > max_trigram_file_bytes) continue;
local.clearRetainingCapacity();
if (entry.content.len >= 3) {
for (0..entry.content.len - 2) |i| {
Expand Down Expand Up @@ -716,7 +737,7 @@ pub fn buildTrigramsFromCache(
try entries.ensureTotalCapacity(allocator, contents.count());
var iter = contents.iterator();
while (iter.next()) |e| {
if (e.value_ptr.*.len > 1024 * 1024) continue;
if (e.value_ptr.*.len > max_trigram_file_bytes) continue;
entries.appendAssumeCapacity(.{ .path = e.key_ptr.*, .content = e.value_ptr.* });
}
if (entries.items.len == 0) return tmp_tri;
Expand Down Expand Up @@ -768,7 +789,7 @@ fn trigramExtractWorker(io: std.Io, results: *TriExtractResults, root: []const u
local.ensureTotalCapacity(4096) catch {};
for (entries) |entry| {
const r = readFileEntry(io, root, entry, alloc) orelse continue;
if (r.content.len > 1024 * 1024) continue;
if (r.content.len > max_trigram_file_bytes) continue;
local.clearRetainingCapacity();
if (r.content.len >= 3) {
for (0..r.content.len - 2) |i| {
Expand Down Expand Up @@ -837,7 +858,7 @@ pub fn initialScanWithTrigrams(
explorer.commitParsedFileOwnedOutline(file.path, file.content, file.outline, true, true) catch continue;
}
// Build trigrams from same content — no re-read needed
if (file.content.len <= 1024 * 1024) {
if (file.content.len <= max_trigram_file_bytes) {
tmp_tri.indexFile(file.path, file.content) catch {};
}
}
Expand Down Expand Up @@ -903,7 +924,7 @@ pub fn initialScanWithTrigrams(
for (workers) |*worker| {
for (worker.items.items) |file| {
explorer.commitParsedFileOwnedOutline(file.path, file.content, file.outline, true, true) catch continue;
if (file.content.len <= 1024 * 1024) {
if (file.content.len <= max_trigram_file_bytes) {
tmp_tri.indexFile(file.path, file.content) catch {};
}
}
Expand Down Expand Up @@ -931,13 +952,8 @@ pub fn initialScan(io: std.Io, store: *Store, explorer: *Explorer, root: []const
/// Indexes only the outline of `path` (resolved relative to `dir`).
///
/// Returns without indexing when `shouldSkipFile(path)` is true or when
/// `readIndexableFile` yields null (file over the size cap, or binary).
/// The file content is allocated from `allocator` and freed before returning.
/// Errors from `statFile`, the read, and `indexFileOutlineOnly` propagate.
fn indexFileOutline(io: std.Io, explorer: *Explorer, dir: std.Io.Dir, path: []const u8, allocator: std.mem.Allocator) !void {
    if (shouldSkipFile(path)) return;
    const stat = try dir.statFile(io, path, .{});
    // The shared gate enforces the size cap and the null-byte check, so neither
    // is repeated here; `false` keeps the oversize skip quiet on this path.
    const content = (try readIndexableFile(io, dir, path, allocator, stat.size, false)) orelse return;
    defer allocator.free(content);
    try explorer.indexFileOutlineOnly(path, content);
}

Expand Down Expand Up @@ -1202,20 +1218,13 @@ fn indexFileContent(io: std.Io, explorer: *Explorer, dir: std.Io.Dir, path: []co
_ = allocator;
if (shouldSkipFile(path)) return;
const stat = try dir.statFile(io, path, .{});
// Skip files over 512KB (likely minified bundles or generated)
if (stat.size > max_indexed_file_bytes) return;
// Use page_allocator arena for content — pages returned to OS immediately
// via munmap on deinit, eliminating GPA page retention from content churn.
var content_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer content_arena.deinit();
const content = try dir.readFileAlloc(io, path, content_arena.allocator(), .limited(max_indexed_file_bytes));
// Skip binary content (check first 512 bytes for null bytes)
const check_len = @min(content.len, 512);
for (content[0..check_len]) |c| {
if (c == 0) return;
}
// Skip trigram indexing for files > 64KB to prevent OOM on large repos
const effective_skip_trigram = skip_trigram or (content.len > 1024 * 1024);
const content = (try readIndexableFile(io, dir, path, content_arena.allocator(), stat.size, false)) orelse return;
// Skip trigram indexing for files over the trigram cap to bound memory on large repos
const effective_skip_trigram = skip_trigram or (content.len > max_trigram_file_bytes);
if (effective_skip_trigram) {
try explorer.indexFileSkipTrigram(path, content);
} else {
Expand Down
Loading