From 16e5fe91a978c0d496598f3245dd565377b122eb Mon Sep 17 00:00:00 2001
From: justrach <54503978+justrach@users.noreply.github.com>
Date: Mon, 22 Jun 2026 12:50:10 +0800
Subject: [PATCH] perf(watcher): read changed files once on the live-update
 path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

incrementalDiff and drainNotifyFile each hashed a changed file (full read) and
then indexed it (a second full read) — two reads of the same bytes per change,
up to 2× the max_indexed_file_bytes cap of IO after #635 widened it to 2 MB.

Read once and reuse the buffer for both the content hash and indexing:

- New `hashAndIndexFile` reads a file once, hashes it, and indexes it from the
  same buffer; returns the hash to store (0 = skipped, maxInt = IO error).
- New `indexContentBuffer` is the shared "index a text buffer" primitive
  (binary check + trigram decision), reused by indexFileContent and
  hashAndIndexFile so the rules live in one place.
- incrementalDiff reads the changed file once: when there is no prior hash
  (first edit since startup — the common case) it no longer does a separate
  hash pass, and it now stores a real content hash for size-changed files too,
  so a later revert-to-same-size is detected. The streaming `hashFile` helper
  is gone (its only callers are replaced).
- drainNotifyFile (muonry instant re-index, fired on every edit) drops its
  unconditional index-then-hash double read.

Behavior-preserving for change detection: same identical-content short-circuit
(touch / git checkout), same binary-hashed-but-not-indexed split, same skip and
IO-error sentinels. Wyhash over the full buffer equals the prior chunked stream.
Full suite green; zig fmt clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/watcher.zig | 90 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 56 insertions(+), 34 deletions(-)

diff --git a/src/watcher.zig b/src/watcher.zig
index af7238b..99d56e8 100644
--- a/src/watcher.zig
+++ b/src/watcher.zig
@@ -1084,26 +1084,40 @@ pub fn incrementalLoop(io: std.Io, store: *Store, explorer: *Explorer, queue: *E
     }
 }
 
-fn hashFile(io: std.Io, dir: std.Io.Dir, path: []const u8, size: u64) !u64 {
-    // Returns 0 for intentional skip (large files, filtered extensions).
-    // Returns maxInt(u64) on IO error so the value always differs from a valid
-    // previously stored hash of 0, preventing a false "content unchanged" conclusion.
-    if (shouldSkipFile(path)) return 0;
-    if (size > max_indexed_file_bytes) return 0;
-    const file = dir.openFile(io, path, .{}) catch return std.math.maxInt(u64);
-    defer file.close(io);
 
-    var hasher = std.hash.Wyhash.init(0);
-    var buf: [16 * 1024]u8 = undefined;
-    var offset: u64 = 0;
-    while (true) {
-        const n = file.readPositionalAll(io, &buf, offset) catch return std.math.maxInt(u64);
-        if (n == 0) break;
-        hasher.update(buf[0..n]);
-        offset += n;
-        if (n < buf.len) break;
+/// Index already-read file content: skip binary (a null byte in the first 512
+/// bytes), then index with or without trigrams by size. The single place that
+/// turns a text buffer into index entries — shared by indexFileContent and
+/// hashAndIndexFile so the binary + trigram rules live in one spot.
+fn indexContentBuffer(explorer: *Explorer, path: []const u8, content: []const u8, skip_trigram: bool) !void {
+    const check_len = @min(content.len, 512);
+    if (std.mem.indexOfScalar(u8, content[0..check_len], 0) != null) return; // binary
+    const effective_skip_trigram = skip_trigram or (content.len > max_trigram_file_bytes);
+    if (effective_skip_trigram) {
+        try explorer.indexFileSkipTrigram(path, content);
+    } else {
+        try explorer.indexFile(path, content);
     }
-    return hasher.final();
+}
+
+/// Read a file once and reuse the buffer for both change detection and indexing.
+/// The live-update paths (incrementalDiff, drainNotifyFile) previously hashed and
+/// indexed in two separate full reads of a changed file — disk read twice,
+/// up to 2× max_indexed_file_bytes of IO after #635 widened the cap. This reads
+/// once. Returns the content hash to store for future change detection: 0 when the
+/// file is skipped (filtered or over the cap), maxInt(u64) on IO error (always
+/// differs from a stored hash, forcing a re-read next cycle). Binary files are
+/// hashed but not indexed, matching the prior hash-then-index split behavior.
+/// `size` is the caller's already-stat'd size, so there is no extra stat here.
+fn hashAndIndexFile(io: std.Io, explorer: *Explorer, dir: std.Io.Dir, path: []const u8, size: u64) u64 {
+    if (shouldSkipFile(path)) return 0;
+    if (size > max_indexed_file_bytes) return 0;
+    var content_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+    defer content_arena.deinit();
+    const content = dir.readFileAlloc(io, path, content_arena.allocator(), .limited(max_indexed_file_bytes)) catch return std.math.maxInt(u64);
+    const hash = std.hash.Wyhash.hash(0, content);
+    indexContentBuffer(explorer, path, content, false) catch {};
+    return hash;
 }
 
 fn pushEventOrWait(queue: *EventQueue, event: FsEvent) void {
@@ -1135,14 +1149,27 @@ fn incrementalDiff(io: std.Io, store: *Store, explorer: *Explorer, queue: *Event
             // Mtime unchanged -> skip (cheap path, no IO)
             if (old.mtime == mtime) continue;
 
-            // Size changed -> definitely changed, skip expensive hash.
+            const stable_path = known_entry.key_ptr.*;
+
+            // Mtime changed: read the file once and reuse the buffer for both the
+            // content hash (change detection) and indexing, instead of hashing
+            // (full read) and then indexing (a second full read).
             var hash: u64 = 0;
-            if (old.size == stat.size) {
-                // Same size + changed mtime -> hash to confirm content actually differs.
-                hash = hashFile(io, dir, entry.path, stat.size) catch 0;
+            var content: ?[]const u8 = null;
+            var content_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+            defer content_arena.deinit();
+            if (!shouldSkipFile(entry.path) and stat.size <= max_indexed_file_bytes) {
+                if (dir.readFileAlloc(io, entry.path, content_arena.allocator(), .limited(max_indexed_file_bytes))) |buf| {
+                    content = buf;
+                    hash = std.hash.Wyhash.hash(0, buf);
+                } else |_| {
+                    hash = std.math.maxInt(u64); // IO error -> force re-index next cycle
+                }
             }
+
+            // Same size + matching prior hash -> content identical (touch, git
+            // checkout): update metadata only, no snapshot/event/re-index.
             if (old.size == stat.size and hash != 0 and old.hash != 0 and hash == old.hash) {
-                // Content identical (e.g. touch, git checkout) -> update metadata only.
                 old.mtime = mtime;
                 old.size = stat.size;
                 continue;
@@ -1152,9 +1179,8 @@ fn incrementalDiff(io: std.Io, store: *Store, explorer: *Explorer, queue: *Event
             old.mtime = mtime;
             old.size = stat.size;
             old.hash = hash;
-            const stable_path = known_entry.key_ptr.*;
             if (FsEvent.init(stable_path, .modified, seq)) |ev| pushEventOrWait(queue, ev);
-            indexFileContent(io, explorer, dir, stable_path, tmp, false) catch {};
+            if (content) |buf| indexContentBuffer(explorer, stable_path, buf, false) catch {};
         } else {
             // New files always generate an event, so skip the extra full-file hash pass.
             const duped = try persistent.dupe(u8, entry.path);
@@ -1223,13 +1249,7 @@ fn indexFileContent(io: std.Io, explorer: *Explorer, dir: std.Io.Dir, path: []co
     var content_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
     defer content_arena.deinit();
     const content = (try readIndexableFile(io, dir, path, content_arena.allocator(), stat.size, false)) orelse return;
-    // Skip trigram indexing for files over the trigram cap to bound memory on large repos
-    const effective_skip_trigram = skip_trigram or (content.len > max_trigram_file_bytes);
-    if (effective_skip_trigram) {
-        try explorer.indexFileSkipTrigram(path, content);
-    } else {
-        try explorer.indexFile(path, content);
-    }
+    try indexContentBuffer(explorer, path, content, skip_trigram);
 }
 
 // ── muonry interop ───────────────────────────────────────────────────────────
@@ -1279,10 +1299,12 @@ fn drainNotifyFile(io: std.Io, store: *Store, explorer: *Explorer, queue: *Event
             if (existing.mtime == mtime and existing.size == stat.size) continue;
         }
 
-        indexFileContent(io, explorer, dir, rel, alloc, false) catch continue;
+        // Read once: index + hash from the same buffer (previously two separate
+        // full reads of the same file per notification).
+        const hash = hashAndIndexFile(io, explorer, dir, rel, stat.size);
+        if (hash == std.math.maxInt(u64)) continue; // read failed — retry next cycle
 
         // Update known-file state so incrementalDiff doesn't double-process
-        const hash = hashFile(io, dir, rel, stat.size) catch continue;
         if (known.getPtr(rel)) |existing| {
             existing.mtime = mtime;
             existing.size = stat.size;