justrach · justrach · Jun 22, 2026
diff --git a/src/watcher.zig b/src/watcher.zig
@@ -112,6 +112,42 @@ const WorkerParsedResults = struct {
 /// trigram (see effective_skip_trigram); past this cap the file is skipped.
 /// Was 512KB, which silently dropped 512KB-1MB source files entirely.
 const max_indexed_file_bytes = 2 * 1024 * 1024;
+
+/// #635: byte threshold for the trigram index. Files up to this size get trigram
+/// coverage; larger files (up to max_indexed_file_bytes) still get outline+word
+/// indexing but skip trigrams to bound memory on large repos. Previously a bare
+/// `1024 * 1024` duplicated across seven call sites.
+const max_trigram_file_bytes = 1024 * 1024;
+
+/// #635: the single read gate every index path shares. Enforces the size cap and
+/// the binary (null-byte) check in one place, so the threshold can't drift across
+/// call sites again (the root cause of #635). Returns the file content (allocated
+/// in `alloc`; caller owns it), or null when the file must be skipped: over the
+/// cap (logged when `warn_oversize`) or binary. `size` is the caller's stat'd size.
+fn readIndexableFile(
+    io: std.Io,
+    dir: std.Io.Dir,
+    path: []const u8,
+    alloc: std.mem.Allocator,
+    size: u64,
+    warn_oversize: bool,
+) !?[]const u8 {
+    if (size > max_indexed_file_bytes) {
+        if (warn_oversize)
+            // Reachable via codedb_read (disk fallback) but invisible to
+            // search/symbol/outline — surface the skip instead of dropping it.
+            std.log.warn("codedb: not indexing {s} ({d} bytes > {d} cap) — reachable only via codedb_read", .{ path, size, max_indexed_file_bytes });
+        return null;
+    }
+    // cap + 1: readFileAlloc errors once the limit is *reached*, which would reject a file of exactly the cap.
+    const content = try dir.readFileAlloc(io, path, alloc, .limited(max_indexed_file_bytes + 1));
+    // Skip binary content (null byte within the first 512 bytes).
+    if (std.mem.indexOfScalar(u8, content[0..@min(content.len, 512)], 0) != null) {
+        alloc.free(content);
+        return null;
+    }
+    return content;
+}
 const skip_dirs = [_][]const u8{
     ".git",
     ".claude",
@@ -453,23 +489,13 @@ fn parseInitialScanEntry(io: std.Io, root: []const u8, entry: InitialScanEntry,
     const dir = try std.Io.Dir.cwd().openDir(io, root, .{});
     defer dir.close(io);
     const stat = try dir.statFile(io, entry.path, .{});
-    if (stat.size > max_indexed_file_bytes) {
-        // #635: surface the skip instead of dropping it silently. Reachable via
-        // codedb_read (disk fallback) but invisible to search/symbol/outline.
-        std.log.warn("codedb: not indexing {s} ({d} bytes > {d} cap) — reachable only via codedb_read", .{ entry.path, stat.size, max_indexed_file_bytes });
-        return null;
-    }
-    const content = try dir.readFileAlloc(io, entry.path, arena_alloc, .limited(max_indexed_file_bytes));
-    const check_len = @min(content.len, 512);
-    for (content[0..check_len]) |c| {
-        if (c == 0) return null;
-    }
+    const content = (try readIndexableFile(io, dir, entry.path, arena_alloc, stat.size, true)) orelse return null;
     // Threshold for including a file in the trigram index. Bumped from 64KB to
     // 1MB after the search-shootout bench (issue: large code files like
     // ReactFiberCompleteWork.js at 77KB were invisible to substring search,
     // causing agents to miss call sites in them). 1MB covers all reasonable
     // code files; minified/generated bundles past 1MB are correctly skipped.
-    const effective_skip_trigram = entry.skip_trigram or (content.len > 1024 * 1024);
+    const effective_skip_trigram = entry.skip_trigram or (content.len > max_trigram_file_bytes);
     const parsed = try explore_mod.Explorer.parseContentForIndexing(arena_alloc, entry.path, content);
     return .{
         .path = entry.path,
@@ -607,12 +633,7 @@ fn readFileEntry(io: std.Io, root: []const u8, entry: InitialScanEntry, arena_al
     const dir = std.Io.Dir.cwd().openDir(io, root, .{}) catch return null;
     defer dir.close(io);
     const stat = dir.statFile(io, entry.path, .{}) catch return null;
-    if (stat.size > max_indexed_file_bytes) return null;
-    const c = dir.readFileAlloc(io, entry.path, arena_alloc, .limited(max_indexed_file_bytes)) catch return null;
-    const check_len = @min(c.len, 512);
-    for (c[0..check_len]) |ch| {
-        if (ch == 0) return null;
-    }
+    const c = (readIndexableFile(io, dir, entry.path, arena_alloc, stat.size, false) catch return null) orelse return null;
     return .{ .path = entry.path, .content = c };
 }
 
@@ -665,7 +686,7 @@ fn cachedTrigramExtractWorker(results: *TriExtractResults, entries: []const Cach
     defer local.deinit();
     local.ensureTotalCapacity(4096) catch {};
     for (entries) |entry| {
-        if (entry.content.len > 1024 * 1024) continue;
+        if (entry.content.len > max_trigram_file_bytes) continue;
         local.clearRetainingCapacity();
         if (entry.content.len >= 3) {
             for (0..entry.content.len - 2) |i| {
@@ -716,7 +737,7 @@ pub fn buildTrigramsFromCache(
     try entries.ensureTotalCapacity(allocator, contents.count());
     var iter = contents.iterator();
     while (iter.next()) |e| {
-        if (e.value_ptr.*.len > 1024 * 1024) continue;
+        if (e.value_ptr.*.len > max_trigram_file_bytes) continue;
         entries.appendAssumeCapacity(.{ .path = e.key_ptr.*, .content = e.value_ptr.* });
     }
     if (entries.items.len == 0) return tmp_tri;
@@ -768,7 +789,7 @@ fn trigramExtractWorker(io: std.Io, results: *TriExtractResults, root: []const u
     local.ensureTotalCapacity(4096) catch {};
     for (entries) |entry| {
         const r = readFileEntry(io, root, entry, alloc) orelse continue;
-        if (r.content.len > 1024 * 1024) continue;
+        if (r.content.len > max_trigram_file_bytes) continue;
         local.clearRetainingCapacity();
         if (r.content.len >= 3) {
             for (0..r.content.len - 2) |i| {
@@ -837,7 +858,7 @@ pub fn initialScanWithTrigrams(
                     explorer.commitParsedFileOwnedOutline(file.path, file.content, file.outline, true, true) catch continue;
                 }
                 // Build trigrams from same content — no re-read needed
-                if (file.content.len <= 1024 * 1024) {
+                if (file.content.len <= max_trigram_file_bytes) {
                     tmp_tri.indexFile(file.path, file.content) catch {};
                 }
             }
@@ -903,7 +924,7 @@ pub fn initialScanWithTrigrams(
         for (workers) |*worker| {
             for (worker.items.items) |file| {
                 explorer.commitParsedFileOwnedOutline(file.path, file.content, file.outline, true, true) catch continue;
-                if (file.content.len <= 1024 * 1024) {
+                if (file.content.len <= max_trigram_file_bytes) {
                     tmp_tri.indexFile(file.path, file.content) catch {};
                 }
             }
@@ -931,13 +952,8 @@ pub fn initialScan(io: std.Io, store: *Store, explorer: *Explorer, root: []const
 fn indexFileOutline(io: std.Io, explorer: *Explorer, dir: std.Io.Dir, path: []const u8, allocator: std.mem.Allocator) !void {
     if (shouldSkipFile(path)) return;
     const stat = try dir.statFile(io, path, .{});
-    if (stat.size > max_indexed_file_bytes) return;
-    const content = try dir.readFileAlloc(io, path, allocator, .limited(max_indexed_file_bytes));
+    const content = (try readIndexableFile(io, dir, path, allocator, stat.size, false)) orelse return; // null: over the size cap or binary — nothing to index
     defer allocator.free(content);
-    const check_len = @min(content.len, 512);
-    for (content[0..check_len]) |c| {
-        if (c == 0) return;
-    }
     try explorer.indexFileOutlineOnly(path, content);
 }
 
@@ -1202,20 +1218,13 @@ fn indexFileContent(io: std.Io, explorer: *Explorer, dir: std.Io.Dir, path: []co
     _ = allocator;
     if (shouldSkipFile(path)) return;
     const stat = try dir.statFile(io, path, .{});
-    // Skip files over 512KB (likely minified bundles or generated)
-    if (stat.size > max_indexed_file_bytes) return;
     // Use page_allocator arena for content — pages returned to OS immediately
     // via munmap on deinit, eliminating GPA page retention from content churn.
     var content_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
     defer content_arena.deinit();
-    const content = try dir.readFileAlloc(io, path, content_arena.allocator(), .limited(max_indexed_file_bytes));
-    // Skip binary content (check first 512 bytes for null bytes)
-    const check_len = @min(content.len, 512);
-    for (content[0..check_len]) |c| {
-        if (c == 0) return;
-    }
-    // Skip trigram indexing for files > 64KB to prevent OOM on large repos
-    const effective_skip_trigram = skip_trigram or (content.len > 1024 * 1024);
+    const content = (try readIndexableFile(io, dir, path, content_arena.allocator(), stat.size, false)) orelse return;
+    // Skip trigram indexing for files over the trigram cap to bound memory on large repos
+    const effective_skip_trigram = skip_trigram or (content.len > max_trigram_file_bytes);
     if (effective_skip_trigram) {
         try explorer.indexFileSkipTrigram(path, content);
     } else {