diff --git a/src/watcher.zig b/src/watcher.zig
index 7f89e77..af7238b 100644
--- a/src/watcher.zig
+++ b/src/watcher.zig
@@ -112,6 +112,56 @@ const WorkerParsedResults = struct {
 /// trigram (see effective_skip_trigram); past this cap the file is skipped.
 /// Was 512KB, which silently dropped 512KB-1MB source files entirely.
 const max_indexed_file_bytes = 2 * 1024 * 1024;
+
+/// #635: byte threshold for the trigram index. Files up to this size get trigram
+/// coverage; larger files (up to max_indexed_file_bytes) still get outline+word
+/// indexing but skip trigrams to bound memory on large repos. Previously a bare
+/// `1024 * 1024` duplicated across seven call sites.
+const max_trigram_file_bytes = 1024 * 1024;
+
+/// #635: the single read gate every index path shares. Enforces the size cap and
+/// the binary (null-byte) check in one place, so the threshold can't drift across
+/// call sites again (the root cause of #635). Returns the file content (allocated
+/// in `alloc`), or null when the file must be skipped — over the cap (logged when
+/// `warn_oversize`) or binary. `size` is the caller's already-stat'd file size.
+///
+/// A file of exactly `max_indexed_file_bytes` is still read. A file that grows
+/// past the cap between the caller's stat and this read is skipped the same way
+/// as any other oversize file; every other read error is returned to the caller.
+fn readIndexableFile(
+    io: std.Io,
+    dir: std.Io.Dir,
+    path: []const u8,
+    alloc: std.mem.Allocator,
+    size: u64,
+    warn_oversize: bool,
+) !?[]const u8 {
+    if (size > max_indexed_file_bytes) {
+        if (warn_oversize) {
+            // Reachable via codedb_read (disk fallback) but invisible to
+            // search/symbol/outline — surface the skip instead of dropping it.
+            std.log.warn("codedb: not indexing {s} ({d} bytes > {d} cap) — reachable only via codedb_read", .{ path, size, max_indexed_file_bytes });
+        }
+        return null;
+    }
+    // The read limit fails once it is reached, not only when exceeded, so allow
+    // one byte of headroom: a file of exactly the cap must still be readable.
+    const content = dir.readFileAlloc(io, path, alloc, .limited(max_indexed_file_bytes + 1)) catch |err| switch (err) {
+        // The file grew past the cap after the caller's stat: skip, don't fail.
+        error.StreamTooLong => {
+            if (warn_oversize) std.log.warn("codedb: not indexing {s} (grew past {d} byte cap during read) — reachable only via codedb_read", .{ path, max_indexed_file_bytes });
+            return null;
+        },
+        else => |e| return e,
+    };
+    // Skip binary content (null byte within the first 512 bytes).
+    const check_len = @min(content.len, 512);
+    if (std.mem.indexOfScalar(u8, content[0..check_len], 0) != null) {
+        alloc.free(content);
+        return null;
+    }
+    return content;
+}
 const skip_dirs = [_][]const u8{
     ".git",
     ".claude",
@@ -453,23 +503,13 @@ fn parseInitialScanEntry(io: std.Io, root: []const u8, entry: InitialScanEntry,
     const dir = try std.Io.Dir.cwd().openDir(io, root, .{});
     defer dir.close(io);
     const stat = try dir.statFile(io, entry.path, .{});
-    if (stat.size > max_indexed_file_bytes) {
-        // #635: surface the skip instead of dropping it silently. Reachable via
-        // codedb_read (disk fallback) but invisible to search/symbol/outline.
-        std.log.warn("codedb: not indexing {s} ({d} bytes > {d} cap) — reachable only via codedb_read", .{ entry.path, stat.size, max_indexed_file_bytes });
-        return null;
-    }
-    const content = try dir.readFileAlloc(io, entry.path, arena_alloc, .limited(max_indexed_file_bytes));
-    const check_len = @min(content.len, 512);
-    for (content[0..check_len]) |c| {
-        if (c == 0) return null;
-    }
+    const content = (try readIndexableFile(io, dir, entry.path, arena_alloc, stat.size, true)) orelse return null;
     // Threshold for including a file in the trigram index. Bumped from 64KB to
     // 1MB after the search-shootout bench (issue: large code files like
     // ReactFiberCompleteWork.js at 77KB were invisible to substring search,
     // causing agents to miss call sites in them). 1MB covers all reasonable
     // code files; minified/generated bundles past 1MB are correctly skipped.
-    const effective_skip_trigram = entry.skip_trigram or (content.len > 1024 * 1024);
+    const effective_skip_trigram = entry.skip_trigram or (content.len > max_trigram_file_bytes);
     const parsed = try explore_mod.Explorer.parseContentForIndexing(arena_alloc, entry.path, content);
     return .{
         .path = entry.path,
@@ -607,12 +647,7 @@ fn readFileEntry(io: std.Io, root: []const u8, entry: InitialScanEntry, arena_al
     const dir = std.Io.Dir.cwd().openDir(io, root, .{}) catch return null;
     defer dir.close(io);
     const stat = dir.statFile(io, entry.path, .{}) catch return null;
-    if (stat.size > max_indexed_file_bytes) return null;
-    const c = dir.readFileAlloc(io, entry.path, arena_alloc, .limited(max_indexed_file_bytes)) catch return null;
-    const check_len = @min(c.len, 512);
-    for (c[0..check_len]) |ch| {
-        if (ch == 0) return null;
-    }
+    const c = (readIndexableFile(io, dir, entry.path, arena_alloc, stat.size, false) catch return null) orelse return null;
     return .{ .path = entry.path, .content = c };
 }
 
@@ -665,7 +700,7 @@ fn cachedTrigramExtractWorker(results: *TriExtractResults, entries: []const Cach
     defer local.deinit();
     local.ensureTotalCapacity(4096) catch {};
     for (entries) |entry| {
-        if (entry.content.len > 1024 * 1024) continue;
+        if (entry.content.len > max_trigram_file_bytes) continue;
         local.clearRetainingCapacity();
         if (entry.content.len >= 3) {
             for (0..entry.content.len - 2) |i| {
@@ -716,7 +751,7 @@ pub fn buildTrigramsFromCache(
     try entries.ensureTotalCapacity(allocator, contents.count());
     var iter = contents.iterator();
     while (iter.next()) |e| {
-        if (e.value_ptr.*.len > 1024 * 1024) continue;
+        if (e.value_ptr.*.len > max_trigram_file_bytes) continue;
         entries.appendAssumeCapacity(.{ .path = e.key_ptr.*, .content = e.value_ptr.* });
     }
     if (entries.items.len == 0) return tmp_tri;
@@ -768,7 +803,7 @@ fn trigramExtractWorker(io: std.Io, results: *TriExtractResults, root: []const u
     local.ensureTotalCapacity(4096) catch {};
     for (entries) |entry| {
         const r = readFileEntry(io, root, entry, alloc) orelse continue;
-        if (r.content.len > 1024 * 1024) continue;
+        if (r.content.len > max_trigram_file_bytes) continue;
         local.clearRetainingCapacity();
         if (r.content.len >= 3) {
             for (0..r.content.len - 2) |i| {
@@ -837,7 +872,7 @@ pub fn initialScanWithTrigrams(
                 explorer.commitParsedFileOwnedOutline(file.path, file.content, file.outline, true, true) catch continue;
             }
             // Build trigrams from same content — no re-read needed
-            if (file.content.len <= 1024 * 1024) {
+            if (file.content.len <= max_trigram_file_bytes) {
                 tmp_tri.indexFile(file.path, file.content) catch {};
             }
         }
@@ -903,7 +938,7 @@ pub fn initialScanWithTrigrams(
     for (workers) |*worker| {
         for (worker.items.items) |file| {
             explorer.commitParsedFileOwnedOutline(file.path, file.content, file.outline, true, true) catch continue;
-            if (file.content.len <= 1024 * 1024) {
+            if (file.content.len <= max_trigram_file_bytes) {
                 tmp_tri.indexFile(file.path, file.content) catch {};
             }
         }
@@ -931,13 +966,8 @@ pub fn initialScan(io: std.Io, store: *Store, explorer: *Explorer, root: []const
 fn indexFileOutline(io: std.Io, explorer: *Explorer, dir: std.Io.Dir, path: []const u8, allocator: std.mem.Allocator) !void {
     if (shouldSkipFile(path)) return;
     const stat = try dir.statFile(io, path, .{});
-    if (stat.size > max_indexed_file_bytes) return;
-    const content = try dir.readFileAlloc(io, path, allocator, .limited(max_indexed_file_bytes));
+    const content = (try readIndexableFile(io, dir, path, allocator, stat.size, false)) orelse return;
     defer allocator.free(content);
-    const check_len = @min(content.len, 512);
-    for (content[0..check_len]) |c| {
-        if (c == 0) return;
-    }
     try explorer.indexFileOutlineOnly(path, content);
 }
 
@@ -1202,20 +1232,13 @@ fn indexFileContent(io: std.Io, explorer: *Explorer, dir: std.Io.Dir, path: []co
     _ = allocator;
     if (shouldSkipFile(path)) return;
     const stat = try dir.statFile(io, path, .{});
-    // Skip files over 512KB (likely minified bundles or generated)
-    if (stat.size > max_indexed_file_bytes) return;
     // Use page_allocator arena for content — pages returned to OS immediately
     // via munmap on deinit, eliminating GPA page retention from content churn.
     var content_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
     defer content_arena.deinit();
-    const content = try dir.readFileAlloc(io, path, content_arena.allocator(), .limited(max_indexed_file_bytes));
-    // Skip binary content (check first 512 bytes for null bytes)
-    const check_len = @min(content.len, 512);
-    for (content[0..check_len]) |c| {
-        if (c == 0) return;
-    }
-    // Skip trigram indexing for files > 64KB to prevent OOM on large repos
-    const effective_skip_trigram = skip_trigram or (content.len > 1024 * 1024);
+    const content = (try readIndexableFile(io, dir, path, content_arena.allocator(), stat.size, false)) orelse return;
+    // Skip trigram indexing for files over the trigram cap to bound memory on large repos
+    const effective_skip_trigram = skip_trigram or (content.len > max_trigram_file_bytes);
     if (effective_skip_trigram) {
         try explorer.indexFileSkipTrigram(path, content);
     } else {