diff --git a/src/test_index.zig b/src/test_index.zig index 6b60692..0b58a85 100644 --- a/src/test_index.zig +++ b/src/test_index.zig @@ -37,27 +37,6 @@ const SearchResult = @import("explore.zig").SearchResult; const SymbolKind = explore.SymbolKind; const edit_mod = @import("edit.zig"); - - - - - - - - - - - - - - - - - - - - - test "trigram index: index and candidate lookup" { var ti = TrigramIndex.init(testing.allocator); defer ti.deinit(); @@ -72,7 +51,6 @@ test "trigram index: index and candidate lookup" { try testing.expectEqualStrings("src/store.zig", cands.?[0]); } - test "trigram index: short query returns null" { var ti = TrigramIndex.init(testing.allocator); defer ti.deinit(); @@ -82,7 +60,6 @@ test "trigram index: short query returns null" { try testing.expect(cands == null); } - test "trigram index: no match returns empty" { var ti = TrigramIndex.init(testing.allocator); defer ti.deinit(); @@ -93,7 +70,6 @@ test "trigram index: no match returns empty" { try testing.expect(cands.?.len == 0); } - test "trigram index: re-index removes old trigrams" { var ti = TrigramIndex.init(testing.allocator); defer ti.deinit(); @@ -113,7 +89,6 @@ test "trigram index: re-index removes old trigrams" { try testing.expect(c3 != null and c3.?.len == 1); } - test "pairWeight: deterministic" { const w1 = pairWeight('a', 'b'); const w2 = pairWeight('a', 'b'); @@ -125,7 +100,6 @@ test "pairWeight: deterministic" { _ = w3; // just ensure it compiles and doesn't crash } - test "pairWeight: different pairs produce different values (sanity)" { // 'ab' and 'ba' should almost never collide for a reasonable hash. 
const w_ab = pairWeight('a', 'b'); @@ -135,14 +109,12 @@ test "pairWeight: different pairs produce different values (sanity)" { _ = w_ba; } - test "extractSparseNgrams: short content returns empty" { const ng = try extractSparseNgrams("ab", testing.allocator); defer testing.allocator.free(ng); try testing.expectEqual(@as(usize, 0), ng.len); } - test "extractSparseNgrams: minimum length content yields one ngram" { const ng = try extractSparseNgrams("abc", testing.allocator); defer testing.allocator.free(ng); @@ -151,7 +123,6 @@ test "extractSparseNgrams: minimum length content yields one ngram" { try testing.expectEqual(@as(usize, 0), ng[0].pos); } - test "extractSparseNgrams: deterministic across calls" { const ng1 = try extractSparseNgrams("hello world", testing.allocator); defer testing.allocator.free(ng1); @@ -166,7 +137,6 @@ test "extractSparseNgrams: deterministic across calls" { } } - test "extractSparseNgrams: case-insensitive hashing" { const ng_lower = try extractSparseNgrams("hello", testing.allocator); defer testing.allocator.free(ng_lower); @@ -179,7 +149,6 @@ test "extractSparseNgrams: case-insensitive hashing" { } } - test "extractSparseNgrams: ngrams cover entire content" { const content = "the quick brown fox"; const ng = try extractSparseNgrams(content, testing.allocator); @@ -200,7 +169,6 @@ test "extractSparseNgrams: ngrams cover entire content" { } } - test "extractSparseNgrams: coverage with force-split remainder 1 (len=17)" { // 17 identical chars → no interior local maxima → one span of length 17. // Force-split: one MAX_NGRAM_LEN=16 chunk, remainder=1 → must still cover byte 16. @@ -217,7 +185,6 @@ test "extractSparseNgrams: coverage with force-split remainder 1 (len=17)" { for (covered) |c| try testing.expect(c); } - test "extractSparseNgrams: coverage with force-split remainder 2 (len=18)" { // 18 identical chars → remainder=2 → must still cover bytes 16-17. 
const content = "aaaaaaaaaaaaaaaaaa"; // 18 'a's @@ -233,7 +200,6 @@ test "extractSparseNgrams: coverage with force-split remainder 2 (len=18)" { for (covered) |c| try testing.expect(c); } - test "extractSparseNgrams: ngram length bounds" { const content = "abcdefghijklmnopqrstuvwxyz0123456789"; const ng = try extractSparseNgrams(content, testing.allocator); @@ -245,7 +211,6 @@ test "extractSparseNgrams: ngram length bounds" { } } - test "buildCoveringSet: sliding window covers all query substrings" { // "foobar" (6 chars); lengths [3,6] yield 4+3+2+1 = 10 substrings. const ngrams = try buildCoveringSet("foobar", testing.allocator); @@ -254,14 +219,12 @@ test "buildCoveringSet: sliding window covers all query substrings" { for (ngrams) |ng| try testing.expect(ng.len >= 3 and ng.len <= 6); } - test "buildCoveringSet: short query returns empty" { const ngrams = try buildCoveringSet("ab", testing.allocator); defer testing.allocator.free(ngrams); try testing.expectEqual(@as(usize, 0), ngrams.len); } - test "sparse ngram index: index and candidate lookup" { var sni = SparseNgramIndex.init(testing.allocator); defer sni.deinit(); @@ -289,7 +252,6 @@ test "sparse ngram index: index and candidate lookup" { try testing.expect(!found_bar); } - test "sparse ngram index: short query returns null" { var sni = SparseNgramIndex.init(testing.allocator); defer sni.deinit(); @@ -299,7 +261,6 @@ test "sparse ngram index: short query returns null" { try testing.expect(cands == null); } - test "sparse ngram index: re-index removes old ngrams" { var sni = SparseNgramIndex.init(testing.allocator); defer sni.deinit(); @@ -316,7 +277,6 @@ test "sparse ngram index: re-index removes old ngrams" { if (c2) |cs| try testing.expectEqual(@as(usize, 0), cs.len); } - test "sparse ngram index: removeFile prunes entries" { var sni = SparseNgramIndex.init(testing.allocator); defer sni.deinit(); @@ -328,7 +288,6 @@ test "sparse ngram index: removeFile prunes entries" { try testing.expectEqual(@as(u32, 
0), sni.fileCount()); } - test "sparse ngram candidates: sliding window finds file with short n-gram" { var sni = SparseNgramIndex.init(testing.allocator); defer sni.deinit(); @@ -353,7 +312,6 @@ test "sparse ngram candidates: sliding window finds file with short n-gram" { try testing.expect(found_a); } - test "pairWeight: common pairs have lower weight than rare pairs" { // Common English/code pairs should have lower base weight than rare pairs. // 'th' and 'er' are in the default_pair_freq table with weight 0x1000. @@ -367,7 +325,6 @@ test "pairWeight: common pairs have lower weight than rare pairs" { try testing.expect(w_er < w_zj); } - test "pairWeight: frequency-weighted produces fewer boundaries for common text" { // A string composed of very common pairs should produce few local maxima // (interior weights are low and similar), giving fewer n-grams than a @@ -382,7 +339,6 @@ test "pairWeight: frequency-weighted produces fewer boundaries for common text" try testing.expect(ng_rare.len >= ng_common.len); } - test "pairWeight: deterministic with frequency table" { const w1 = pairWeight('a', 'b'); const w2 = pairWeight('a', 'b'); @@ -392,7 +348,6 @@ test "pairWeight: deterministic with frequency table" { try testing.expectEqual(pairWeight('q', 'x'), pairWeight('q', 'x')); } - test "buildFrequencyTable: common pairs get lower weight than absent pairs" { // Construct content where 'ab' appears many times and 'qx' never appears. 
const content = "ababababababababababab"; @@ -402,7 +357,6 @@ test "buildFrequencyTable: common pairs get lower weight than absent pairs" { try testing.expectEqual(@as(u16, 0xFE00), table['q']['x']); } - test "frequency table: disk round-trip" { var tmp_dir = testing.tmpDir(.{}); defer tmp_dir.cleanup(); @@ -430,7 +384,6 @@ test "frequency table: disk round-trip" { ); } - test "frequency table: little-endian byte order on disk" { var tmp_dir = testing.tmpDir(.{}); defer tmp_dir.cleanup(); @@ -461,7 +414,6 @@ test "frequency table: little-endian byte order on disk" { try testing.expectEqual(@as(u16, 0xABCD), loaded.?[0][1]); } - test "setFrequencyTable / resetFrequencyTable: pairWeight output changes" { // Build a table where 'th' is rare (high weight) — opposite of default. var custom: [256][256]u16 = .{.{0x1000} ** 256} ** 256; // all common @@ -486,7 +438,6 @@ test "setFrequencyTable / resetFrequencyTable: pairWeight output changes" { _ = after_qx; } - test "file versions: append and latest" { var fv = version.FileVersions.init(testing.allocator, "test.zig"); defer fv.deinit(); @@ -513,7 +464,6 @@ test "file versions: append and latest" { try testing.expect(latest.size == 150); } - test "file versions: countSince" { var fv = version.FileVersions.init(testing.allocator, "test.zig"); defer fv.deinit(); @@ -549,7 +499,6 @@ test "file versions: countSince" { try testing.expect(fv.countSince(10) == 0); } - test "watcher: queue overflow is explicit" { var queue = watcher.EventQueue{}; @@ -569,7 +518,6 @@ test "watcher: queue overflow is explicit" { try testing.expect(popped == pushed); } - test "watcher: queue event copies path bytes" { var queue = watcher.EventQueue{}; const original = try testing.allocator.dupe(u8, "tmp/deleted.zig"); @@ -582,7 +530,6 @@ test "watcher: queue event copies path bytes" { try testing.expect(event.seq == 99); } - test "watcher: parallel initial scan matches sequential results" { var tmp_dir = testing.tmpDir(.{}); defer tmp_dir.cleanup(); 
@@ -625,7 +572,6 @@ test "watcher: parallel initial scan matches sequential results" { try testing.expectEqual(explorer_seq.outlines.count(), explorer_par.outlines.count()); } - test "watcher: parallel word-index shards match sequential (skip_file_words)" { // Exercises the per-worker WordIndex shard + serial mergeShard path // (use_shards requires word_index.enabled and skip_file_words). Asserts the @@ -692,7 +638,6 @@ test "watcher: parallel word-index shards match sequential (skip_file_words)" { } } - test "edit: range_start zero is invalid" { var tmp = testing.tmpDir(.{}); defer tmp.cleanup(); @@ -719,7 +664,6 @@ test "edit: range_start zero is invalid" { })); } - test "edit: range_start beyond file is invalid" { var tmp = testing.tmpDir(.{}); defer tmp.cleanup(); @@ -746,7 +690,6 @@ test "edit: range_start beyond file is invalid" { })); } - test "regression #2: searchContent frees trigram candidate slice" { // Verifies that the candidates() return value is freed by searchContent. // If the defer is missing, the GPA will detect the leak and fail. @@ -768,7 +711,6 @@ test "regression #2: searchContent frees trigram candidate slice" { try testing.expectEqualStrings("leak-check.zig", results[0].path); } - test "regression #2: searchContent no leak on zero results" { // Even when trigram narrows to candidates but none match full text, // the candidate slice must be freed. @@ -789,7 +731,6 @@ test "regression #2: searchContent no leak on zero results" { try testing.expect(results.len == 0); } - test "regression #2: searchContent short query skips trigrams" { // Queries < 3 chars can't use trigram index — ensure no leak from null path. 
var explorer = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); @@ -808,7 +749,6 @@ test "regression #2: searchContent short query skips trigrams" { try testing.expect(results.len == 1); } - test "regression #5: getHotFiles does not deadlock" { // getHotFiles used to hold explorer.mu while calling store.getLatest() // which locks store.mu — a lock ordering violation. The fix collects @@ -840,7 +780,6 @@ test "regression #5: getHotFiles does not deadlock" { try testing.expectEqualStrings("hot-c.zig", hot[1]); } - test "regression #5: getHotFiles with no store entries" { var explorer = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); defer explorer.deinit(); @@ -860,7 +799,6 @@ test "regression #5: getHotFiles with no store entries" { try testing.expectEqualStrings("orphan.zig", hot[0]); } - test "regression: concurrent hot/read with remove" { var explorer = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); defer explorer.deinit(); @@ -909,7 +847,6 @@ test "regression: concurrent hot/read with remove" { stop.store(true, .release); } - test "regression #5: store getLatestSeqUnlocked" { var store = Store.init(testing.allocator); defer store.deinit(); @@ -926,7 +863,6 @@ test "regression #5: store getLatestSeqUnlocked" { try testing.expect(missing == 0); } - test "regression #7: tree shows directory nodes" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -948,7 +884,6 @@ test "regression #7: tree shows directory nodes" { try testing.expect(std.mem.indexOf(u8, tree, "build.zig") != null); } - test "regression #7: tree handles nested directories" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -967,7 +902,6 @@ test "regression #7: tree handles nested directories" { try testing.expect(std.mem.indexOf(u8, tree, " hash.zig") != null); } - test "regression #7: tree shows only basenames" { var arena = 
std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -984,7 +918,6 @@ test "regression #7: tree shows only basenames" { try testing.expect(std.mem.indexOf(u8, tree, "bar.zig") != null); } - test "regression: searchWord empty result is allocator-owned" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -997,7 +930,6 @@ test "regression: searchWord empty result is allocator-owned" { try testing.expect(hits.len == 0); } - test "regression: searchContent frees empty trigram candidate slice" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -1016,7 +948,6 @@ test "regression: searchContent frees empty trigram candidate slice" { try testing.expect(results.len == 0); } - test "regression: queue push stays non-blocking when full" { var queue = watcher.EventQueue{}; @@ -1036,14 +967,12 @@ test "regression: queue push stays non-blocking when full" { try testing.expect(elapsed < 50 * std.time.ns_per_ms); } - test "isPathSafe: rejects absolute paths" { const mcp = @import("mcp.zig"); try testing.expect(!mcp.isPathSafe("/etc/passwd")); try testing.expect(!mcp.isPathSafe("/")); } - test "isPathSafe: rejects parent traversal" { const mcp = @import("mcp.zig"); try testing.expect(!mcp.isPathSafe("../secret")); @@ -1051,13 +980,11 @@ test "isPathSafe: rejects parent traversal" { try testing.expect(!mcp.isPathSafe("..")); } - test "isPathSafe: rejects empty path" { const mcp = @import("mcp.zig"); try testing.expect(!mcp.isPathSafe("")); } - test "isPathSafe: accepts valid relative paths" { const mcp = @import("mcp.zig"); try testing.expect(mcp.isPathSafe("src/main.zig")); @@ -1065,7 +992,6 @@ test "isPathSafe: accepts valid relative paths" { try testing.expect(mcp.isPathSafe("a/b/c/d.txt")); } - test "findSymbol: returned data is owned copy" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -1085,7 +1011,6 @@ test "findSymbol: returned data is owned 
copy" { try testing.expectEqualStrings("myFunc", result.?.symbol.name); } - test "findAllSymbols: returned data survives source removal" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -1108,7 +1033,6 @@ test "findAllSymbols: returned data survives source removal" { } } - test "searchContent: returned paths are owned copies" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -1127,7 +1051,6 @@ test "searchContent: returned paths are owned copies" { try testing.expectEqualStrings("src/hello.zig", results[0].path); } - test "trigram index: removeFile prunes empty sets" { var ti = TrigramIndex.init(testing.allocator); defer ti.deinit(); @@ -1147,7 +1070,6 @@ test "trigram index: removeFile prunes empty sets" { } } - test "edit: atomic write leaves no temp files on success" { // Create a temp file to edit var tmp_dir = testing.tmpDir(.{}); @@ -1169,7 +1091,6 @@ test "edit: atomic write leaves no temp files on success" { return error.TempFileNotCleaned; } - test "getBool: returns true for bool true" { var map: std.json.ObjectMap = .empty; defer map.deinit(testing.allocator); @@ -1178,7 +1099,6 @@ test "getBool: returns true for bool true" { try testing.expect(mcp_getBool(&map, "flag") == true); } - test "getBool: returns false for bool false" { var map: std.json.ObjectMap = .empty; defer map.deinit(testing.allocator); @@ -1187,7 +1107,6 @@ test "getBool: returns false for bool false" { try testing.expect(mcp_getBool(&map, "flag") == false); } - test "getBool: returns false for missing key" { var map: std.json.ObjectMap = .empty; defer map.deinit(testing.allocator); @@ -1195,7 +1114,6 @@ test "getBool: returns false for missing key" { try testing.expect(mcp_getBool(&map, "missing") == false); } - test "getBool: returns false for non-bool value" { var map: std.json.ObjectMap = .empty; defer map.deinit(testing.allocator); @@ -1204,7 +1122,6 @@ test "getBool: returns false for non-bool value" { try 
testing.expect(mcp_getBool(&map, "flag") == false); } - test "Tool enum: all valid tool names parse" { const Tool = @import("mcp.zig").Tool; try testing.expect(std.meta.stringToEnum(Tool, "codedb_tree") != null); @@ -1222,7 +1139,6 @@ test "Tool enum: all valid tool names parse" { try testing.expect(std.meta.stringToEnum(Tool, "codedb_bundle") != null); } - test "Tool enum: invalid names return null" { const Tool = @import("mcp.zig").Tool; try testing.expect(std.meta.stringToEnum(Tool, "codedb_invalid") == null); @@ -1230,7 +1146,6 @@ test "Tool enum: invalid names return null" { try testing.expect(std.meta.stringToEnum(Tool, "tree") == null); } - test "decomposeRegex: pure literal extracts trigrams" { var q = try decomposeRegex("hello", testing.allocator); defer q.deinit(); @@ -1239,14 +1154,12 @@ test "decomposeRegex: pure literal extracts trigrams" { try testing.expectEqual(@as(usize, 0), q.or_groups.len); } - test "decomposeRegex: short literal yields no trigrams" { var q = try decomposeRegex("ab", testing.allocator); defer q.deinit(); try testing.expectEqual(@as(usize, 0), q.and_trigrams.len); } - test "decomposeRegex: dot breaks trigram chain" { var q = try decomposeRegex("he.lo", testing.allocator); defer q.deinit(); @@ -1254,7 +1167,6 @@ test "decomposeRegex: dot breaks trigram chain" { try testing.expectEqual(@as(usize, 0), q.and_trigrams.len); } - test "decomposeRegex: dot in longer literal" { var q = try decomposeRegex("hello.world", testing.allocator); defer q.deinit(); @@ -1262,7 +1174,6 @@ test "decomposeRegex: dot in longer literal" { try testing.expectEqual(@as(usize, 6), q.and_trigrams.len); } - test "decomposeRegex: alternation creates OR groups" { var q = try decomposeRegex("foo|bar", testing.allocator); defer q.deinit(); @@ -1295,7 +1206,6 @@ test "issue-628: alternation with a no-trigram branch falls back to scan-all" { try testing.expectEqual(@as(usize, 1), q3.or_groups.len); } - test "decomposeRegex: quantifier removes preceding char" { var q 
= try decomposeRegex("hel+o", testing.allocator); defer q.deinit(); @@ -1303,7 +1213,6 @@ test "decomposeRegex: quantifier removes preceding char" { try testing.expectEqual(@as(usize, 0), q.and_trigrams.len); } - test "decomposeRegex: escaped literal preserved" { var q = try decomposeRegex("a\\.bc", testing.allocator); defer q.deinit(); @@ -1311,7 +1220,6 @@ test "decomposeRegex: escaped literal preserved" { try testing.expectEqual(@as(usize, 2), q.and_trigrams.len); } - test "decomposeRegex: character class breaks chain" { var q = try decomposeRegex("abc[xy]def", testing.allocator); defer q.deinit(); @@ -1319,7 +1227,6 @@ test "decomposeRegex: character class breaks chain" { try testing.expectEqual(@as(usize, 2), q.and_trigrams.len); } - test "decomposeRegex: backslash-w breaks chain" { var q = try decomposeRegex("abc\\wdef", testing.allocator); defer q.deinit(); @@ -1327,7 +1234,6 @@ test "decomposeRegex: backslash-w breaks chain" { try testing.expectEqual(@as(usize, 2), q.and_trigrams.len); } - test "candidatesRegex: finds files with AND trigrams" { var ti = TrigramIndex.init(testing.allocator); defer ti.deinit(); @@ -1352,7 +1258,6 @@ test "candidatesRegex: finds files with AND trigrams" { try testing.expect(found_foo); } - test "candidatesRegex: OR groups union posting lists" { var ti = TrigramIndex.init(testing.allocator); defer ti.deinit(); @@ -1379,53 +1284,45 @@ test "candidatesRegex: OR groups union posting lists" { try testing.expect(found_alpha or found_beta); } - test "regexMatch: literal match" { try testing.expect(regexMatch("hello world", "hello")); try testing.expect(regexMatch("hello world", "world")); try testing.expect(!regexMatch("hello world", "xyz")); } - test "regexMatch: dot matches any char" { try testing.expect(regexMatch("hello", "h.llo")); try testing.expect(regexMatch("hello", "h..lo")); try testing.expect(!regexMatch("hello", "h...lo")); } - test "regexMatch: star quantifier" { try testing.expect(regexMatch("helllo", "hel*o")); try 
testing.expect(regexMatch("heo", "hel*o")); try testing.expect(regexMatch("aab", "a*b")); } - test "regexMatch: plus quantifier" { try testing.expect(regexMatch("helllo", "hel+o")); try testing.expect(!regexMatch("heo", "hel+o")); } - test "regexMatch: question quantifier" { try testing.expect(regexMatch("color", "colou?r")); try testing.expect(regexMatch("colour", "colou?r")); } - test "regexMatch: character class" { try testing.expect(regexMatch("cat", "c[aeiou]t")); try testing.expect(regexMatch("cot", "c[aeiou]t")); try testing.expect(!regexMatch("cxt", "c[aeiou]t")); } - test "regexMatch: negated character class" { try testing.expect(!regexMatch("cat", "c[^aeiou]t")); try testing.expect(regexMatch("cxt", "c[^aeiou]t")); } - test "regexMatch: anchors" { try testing.expect(regexMatch("hello", "^hello")); try testing.expect(!regexMatch("say hello", "^hello")); @@ -1433,7 +1330,6 @@ test "regexMatch: anchors" { try testing.expect(!regexMatch("hello world", "hello$")); } - test "regexMatch: escape sequences" { try testing.expect(regexMatch("abc123", "\\d+")); try testing.expect(regexMatch("hello world", "\\w+\\s\\w+")); @@ -1441,14 +1337,12 @@ test "regexMatch: escape sequences" { try testing.expect(!regexMatch("axb", "a\\.b")); } - test "regexMatch: alternation" { try testing.expect(regexMatch("foo", "foo|bar")); try testing.expect(regexMatch("bar", "foo|bar")); try testing.expect(!regexMatch("baz", "foo|bar")); } - test "regexMatch: alternation with many branches does not stack overflow" { // 300 branches: 4 chars each + 299 separators = 1499 bytes max var buf: [1500]u8 = undefined; @@ -1474,13 +1368,11 @@ test "regexMatch: alternation with many branches does not stack overflow" { try testing.expect(!regexMatch("a999", pattern)); } - test "regexMatch: dot-star" { try testing.expect(regexMatch("hello world", "hello.*world")); try testing.expect(regexMatch("helloworld", "hello.*world")); } - test "issue-454: regex \\b word boundary matches whole-word, not literal 
'b'" { // \b is a word-boundary assertion: should match "foo" as a whole word // but not when it appears as a substring inside another word. @@ -1491,7 +1383,6 @@ test "issue-454: regex \\b word boundary matches whole-word, not literal 'b'" { try testing.expect(!regexMatch("foobarbaz", "\\bbar\\b")); } - test "bloom: PostingMask is populated during indexing" { // Verify that indexing actually sets mask bits, not just zeros. var ti = TrigramIndex.init(testing.allocator); @@ -1512,7 +1403,6 @@ test "bloom: PostingMask is populated during indexing" { try testing.expect(mask.?.next_mask != 0); } - test "bloom: loc_mask records correct position bits" { var ti = TrigramIndex.init(testing.allocator); defer ti.deinit(); @@ -1529,7 +1419,6 @@ test "bloom: loc_mask records correct position bits" { try testing.expect(mask.loc_mask & 1 != 0); // bit 0 set } - test "bloom: next_mask records the following character" { var ti = TrigramIndex.init(testing.allocator); defer ti.deinit(); @@ -1545,7 +1434,6 @@ test "bloom: next_mask records the following character" { try testing.expect(mask.next_mask & expected_bit != 0); } - test "bloom: soundness — never rejects actual matches" { // The bloom filter must NEVER produce false negatives. // Every file that actually contains the query must appear in candidates. @@ -1576,7 +1464,6 @@ test "bloom: soundness — never rejects actual matches" { try testing.expect(found2); } - test "bloom: reduces candidates vs pure trigram intersection" { // This is the key test: prove bloom filtering actually eliminates // files that trigram intersection alone would not. @@ -1613,7 +1500,6 @@ test "bloom: reduces candidates vs pure trigram intersection" { try testing.expect(cands.?.len < 4); } - test "bloom: loc_mask adjacency filtering works" { // Construct a scenario where two trigrams exist in a file but at // positions where they can't be adjacent. 
The loc_mask check should @@ -1647,7 +1533,6 @@ test "bloom: loc_mask adjacency filtering works" { try testing.expect(cands.?.len >= 1); // at least the real match } - test "bloom: masks accumulate across multiple positions" { // If a trigram appears at many positions in a file, both masks should // have multiple bits set (OR'd together, never replaced). @@ -1667,7 +1552,6 @@ test "bloom: masks accumulate across multiple positions" { try testing.expect(mask.next_mask != 0); } - test "bloom: regression — candidate count for known queries" { // Regression benchmark: index a controlled set of files and assert // specific candidate counts. If bloom filtering breaks or regresses, @@ -1735,7 +1619,6 @@ test "bloom: regression — candidate count for known queries" { } } - test "regex regression: trigram extraction counts" { // Verify exact trigram counts for known patterns. // If decomposition logic changes, these catch it. @@ -1764,7 +1647,6 @@ test "regex regression: trigram extraction counts" { } } - test "regex regression: regexMatch edge cases" { // Empty pattern matches anything try testing.expect(regexMatch("anything", "")); @@ -1785,7 +1667,6 @@ test "regex regression: regexMatch edge cases" { try testing.expect(!regexMatch("abc", "abc\\")); } - test "regex regression: candidatesRegex reduces vs brute force" { var ti = TrigramIndex.init(testing.allocator); defer ti.deinit(); @@ -1822,7 +1703,6 @@ test "regex regression: candidatesRegex reduces vs brute force" { try testing.expect(cands.?.len <= 2); } - test "perf regression: indexing 200 files under 200ms" { var ti = TrigramIndex.init(testing.allocator); defer ti.deinit(); @@ -1873,7 +1753,6 @@ test "perf regression: indexing 200 files under 200ms" { try testing.expect(elapsed_ms < 500.0); } - test "perf regression: trigram candidate lookup under 1ms per query" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -1919,7 +1798,6 @@ test "perf regression: trigram candidate lookup 
under 1ms per query" { try testing.expect(ns_per_query < 1_000_000); } - test "perf regression: word index lookup under 100ns per query" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -1953,7 +1831,6 @@ test "perf regression: word index lookup under 100ns per query" { try testing.expect(ns_per_query < 500); } - test "perf regression: bloom filter reduces scan work" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -1988,7 +1865,6 @@ test "perf regression: bloom filter reduces scan work" { try testing.expect(cands.?.len < 25); // must eliminate at least half } - test "disk word index: round-trip write and read preserves hits" { const alloc = testing.allocator; var wi = WordIndex.init(alloc); @@ -2035,7 +1911,6 @@ test "disk word index: round-trip write and read preserves hits" { try testing.expect(found_store); } - test "disk word index: skip_file_words still writes file table" { const alloc = testing.allocator; var wi = WordIndex.init(alloc); @@ -2068,7 +1943,6 @@ test "disk word index: skip_file_words still writes file table" { try testing.expectEqualStrings("src/a.zig", loaded_wi.hitPath(hits[0])); } - test "disk index: round-trip write and read preserves candidates" { const alloc = testing.allocator; var ti = TrigramIndex.init(alloc); @@ -2113,13 +1987,11 @@ test "disk index: round-trip write and read preserves candidates" { try testing.expect(found); } - test "disk index: readFromDisk returns null for missing files" { const loaded = TrigramIndex.readFromDisk(io, "/tmp/codedb_nonexistent_dir_12345", testing.allocator); try testing.expect(loaded == null); } - test "disk index: readFromDisk returns null for corrupt magic" { var tmp = testing.tmpDir(.{}); defer tmp.cleanup(); @@ -2148,7 +2020,6 @@ test "disk index: readFromDisk returns null for corrupt magic" { try testing.expect(loaded == null); } - test "disk index: empty index round-trips correctly" { const alloc = testing.allocator; 
var ti = TrigramIndex.init(alloc); @@ -2170,7 +2041,6 @@ test "disk index: empty index round-trips correctly" { try testing.expectEqual(@as(u32, 0), loaded_ti.fileCount()); } - test "disk index: bloom masks preserved after round-trip" { const alloc = testing.allocator; var ti = TrigramIndex.init(alloc); @@ -2203,7 +2073,6 @@ test "disk index: bloom masks preserved after round-trip" { try testing.expectEqual(orig_mask.loc_mask, loaded_mask.loc_mask); } - test "disk index: fileCount matches after round-trip" { const alloc = testing.allocator; var ti = TrigramIndex.init(alloc); @@ -2231,7 +2100,6 @@ test "disk index: fileCount matches after round-trip" { try testing.expectEqual(@as(u32, 3), loaded_ti.fileCount()); } - test "disk index: writeToDisk stores git_head, readGitHead retrieves it" { const alloc = testing.allocator; var ti = TrigramIndex.init(alloc); @@ -2253,7 +2121,6 @@ test "disk index: writeToDisk stores git_head, readGitHead retrieves it" { try testing.expectEqualSlices(u8, &fake_head, &retrieved.?); } - test "disk index: writeToDisk with null git_head, readGitHead returns null" { const alloc = testing.allocator; var ti = TrigramIndex.init(alloc); @@ -2271,7 +2138,6 @@ test "disk index: writeToDisk with null git_head, readGitHead returns null" { try testing.expect(retrieved == null); } - test "disk index: readDiskHeader returns file_count and git_head" { const alloc = testing.allocator; var ti = TrigramIndex.init(alloc); @@ -2334,7 +2200,6 @@ test "issue-553: status reads file_count from disk header without loading the in try testing.expectEqualSlices(u8, &fake_head, &meta.git_head.?); } - test "disk index: v1 format (no git_head) still loads and readGitHead returns null" { const alloc = testing.allocator; @@ -2380,7 +2245,6 @@ test "disk index: v1 format (no git_head) still loads and readGitHead returns nu try testing.expectEqual(@as(u32, 0), loaded_ti.fileCount()); } - test "issue-105: large files skip trigram indexing to prevent OOM" { var explorer = 
Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); defer explorer.deinit(); @@ -2405,7 +2269,6 @@ test "issue-105: large files skip trigram indexing to prevent OOM" { try testing.expect(explorer.trigram_index.fileCount() == 1); } - test "issue-107: codedb_deps returns results for Python files" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -2424,7 +2287,6 @@ test "issue-107: codedb_deps returns results for Python files" { try testing.expectEqualStrings("consumer.py", deps[0]); } - test "regression-142: trigram index finds all matching files" { var exp = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); defer exp.deinit(); @@ -2445,7 +2307,6 @@ test "regression-142: trigram index finds all matching files" { try testing.expect(results.len == 2); } - test "regression-142: trigram index returns no false positives" { var exp = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); defer exp.deinit(); @@ -2459,7 +2320,6 @@ test "regression-142: trigram index returns no false positives" { try testing.expect(results.len == 0); } - test "regression-142: trigram intersection narrows correctly" { var exp = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); defer exp.deinit(); @@ -2481,7 +2341,6 @@ test "regression-142: trigram intersection narrows correctly" { try testing.expectEqualStrings("match.zig", results[0].path); } - test "regression-142: trigram handles file removal" { var exp = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); defer exp.deinit(); @@ -2507,7 +2366,6 @@ test "regression-142: trigram handles file removal" { try testing.expect(results2.len == 1); } - test "regression-142: trigram handles re-indexing same file" { var exp = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); defer exp.deinit(); @@ -2530,7 +2388,6 @@ test "regression-142: trigram handles re-indexing same file" 
{ try testing.expect(new.len == 1); } - test "regression-142: trigram disk roundtrip preserves results" { var tmp = testing.tmpDir(.{}); defer tmp.cleanup(); @@ -2557,7 +2414,6 @@ test "regression-142: trigram disk roundtrip preserves results" { try testing.expect(cands.len == 1); } - test "regression-142: many files don't corrupt index" { var exp = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); defer exp.deinit(); @@ -2585,7 +2441,6 @@ test "regression-142: many files don't corrupt index" { try testing.expectEqualStrings("file_250.zig", results[0].path); } - test "regression-142: short queries fall back gracefully" { var exp = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); defer exp.deinit(); @@ -2604,7 +2459,6 @@ test "regression-142: short queries fall back gracefully" { try testing.expect(results.len == 1); } - test "regression-142: word index still works alongside trigram" { var exp = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); defer exp.deinit(); @@ -2616,7 +2470,6 @@ test "regression-142: word index still works alongside trigram" { try testing.expect(hits.len == 1); } - test "issue-164: mmap trigram index returns same candidates as heap index" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -2658,7 +2511,6 @@ test "issue-164: mmap trigram index returns same candidates as heap index" { try testing.expect(!mmap_idx.containsFile("nonexistent.zig")); } - test "issue-164: mmap binary search on sorted lookup table" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -2695,13 +2547,11 @@ test "issue-164: mmap binary search on sorted lookup table" { } } - test "issue-164: mmap handles missing files gracefully" { const result = MmapTrigramIndex.initFromDisk(io, "/tmp/nonexistent-codedb-test-dir-164", testing.allocator); try testing.expect(result == null); } - test "issue-164: AnyTrigramIndex dispatches to 
mmap variant" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -2733,7 +2583,6 @@ test "issue-164: AnyTrigramIndex dispatches to mmap variant" { try testing.expect(!explorer.trigram_index.containsFile("bar.zig")); } - test "issue-246: TrigramIndex.removeFile cleans stale path_to_id left by failed indexFile" { // Reproduces the corrupted state an OOM mid-way through indexFile leaves: // removeFile cleared file_trigrams, getOrCreateDocId wrote to path_to_id, @@ -2754,7 +2603,6 @@ test "issue-246: TrigramIndex.removeFile cleans stale path_to_id left by failed try testing.expectEqual(@as(usize, 0), idx.path_to_id.count()); } - test "issue-247: TrigramIndex.id_to_path does not grow on re-index of same file" { // removeFile removes path_to_id[path] but leaves the id_to_path slot intact. // getOrCreateDocId then appends a new slot since path_to_id misses. @@ -2772,7 +2620,6 @@ test "issue-247: TrigramIndex.id_to_path does not grow on re-index of same file" try testing.expectEqual(@as(usize, 1), idx.id_to_path.items.len); } - test "issue-227: TrigramIndex.id_to_path stays bounded across many files re-indexed" { // Broader regression: ensure re-indexing multiple distinct files also doesn't // accumulate dead id_to_path slots. @@ -2789,7 +2636,6 @@ test "issue-227: TrigramIndex.id_to_path stays bounded across many files re-inde try testing.expectEqual(@as(usize, files.len), idx.id_to_path.items.len); } - test "issue-248: PostingList.removeDocId removes target and preserves sorted order" { // Documents the correctness contract for the O(log n) binary-search replacement. // Currently correct but O(n); fix replaces linear scan with bsearch + single remove. @@ -2815,7 +2661,6 @@ test "issue-248: PostingList.removeDocId removes target and preserves sorted ord } } - test "issue-250: searchContent finds content in files skipped by trigram index" { // Files indexed with skip_trigram=true (e.g. 
past the 15k cap) must still be // reachable via the fallback full-scan path in searchContent. @@ -2835,8 +2680,6 @@ test "issue-250: searchContent finds content in files skipped by trigram index" try testing.expectEqual(@as(usize, 1), results.len); } - - test "issue-263: skip_trigram_files searched before max_results exhausted" { // Files indexed with skip_trigram=true are only searched after all // trigram/sparse/word paths are exhausted. When a single normal file @@ -2876,7 +2719,6 @@ test "issue-263: skip_trigram_files searched before max_results exhausted" { try testing.expect(found_large); } - test "search: BM25 ranks higher-frequency line first" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -2901,7 +2743,6 @@ test "search: BM25 ranks higher-frequency line first" { try testing.expectEqual(@as(u32, 2), results[0].line_num); } - test "issue-388: TrigramIndex.removeFile frees owned path on tombstone" { // owns_paths=true means getOrCreateDocId duped the path so callers can // free their copy. removeFile must release that dup before tombstoning @@ -2920,7 +2761,6 @@ test "issue-388: TrigramIndex.removeFile frees owned path on tombstone" { // (cleared to ""), so deinit's `if (p.len > 0) free(p)` misses it. 
} - test "bm25-persistence: writeToDisk/readFromDisk preserves total_tokens and doc_lengths" { const alloc = testing.allocator; var wi = WordIndex.init(alloc); @@ -2988,7 +2828,6 @@ test "bm25-persistence: writeToDisk/readFromDisk preserves total_tokens and doc_ try testing.expectEqual(pre_total, wi2.total_tokens); } - test "issue-451: scope search surfaces skip-trigram canonical file" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -3028,7 +2867,6 @@ test "issue-451: scope search surfaces skip-trigram canonical file" { try testing.expect(found_canonical); } - test "issue-447: searchContent surfaces large (>64KB) skip-trigram files for common identifiers" { var arena = std.heap.ArenaAllocator.init(testing.allocator); defer arena.deinit(); @@ -3067,7 +2905,6 @@ test "issue-447: searchContent surfaces large (>64KB) skip-trigram files for com try testing.expect(found_canonical); } - test "issue-583: disk-loaded word index — re-index and removeFile must drop stale postings" { // readFromDisk/mmapFromDisk set skip_file_words=true, which made removeFile // a silent no-op (file_words is empty). In a daemon that fast-loads the @@ -3341,3 +3178,45 @@ test "issue-606: doc_id reuse survives a persist/reload round-trip" { defer alloc.free(alpha_hits); try testing.expectEqual(@as(usize, 0), alpha_hits.len); } + +test "issue-635: files between 512KB and 1MB are silently dropped from the index" { + // watcher.parseInitialScanEntry hard-drops any file > 512KB (src/watcher.zig:451), + // even though the trigram threshold (line 462) was deliberately raised to 1MB so + // "large code files aren't invisible to search". So a 600KB source file (well + // under the documented 1MB cap) gets no outline / no symbol / no word+search at + // all — silently. (codedb_read still works via the disk fallback.) 
+ var tmp_dir = testing.tmpDir(.{}); + defer tmp_dir.cleanup(); + try tmp_dir.dir.createDirPath(io, "src"); + + // small control file + try tmp_dir.dir.writeFile(io, .{ .sub_path = "src/small.zig", .data = "pub fn tinyMarker() void {}\n" }); + + // ~600KB file: > 512KB hard-skip, < 1MB documented trigram cap. Unique token. + var big: std.ArrayList(u8) = .empty; + defer big.deinit(testing.allocator); + try big.appendSlice(testing.allocator, "pub fn bigMarkerXYZ() void {}\n"); + while (big.items.len < 600 * 1024) try big.appendSlice(testing.allocator, "pub fn filler() void {}\n"); + try tmp_dir.dir.writeFile(io, .{ .sub_path = "src/big.zig", .data = big.items }); + + var root_buf: [std.fs.max_path_bytes]u8 = undefined; + const root_len = try tmp_dir.dir.realPathFile(io, ".", &root_buf); + const root = root_buf[0..root_len]; + + var store = Store.init(testing.allocator); + defer store.deinit(); + var explorer = Explorer.init(testing.allocator, Explorer.DEFAULT_CONTENT_CACHE_CAPACITY); + defer explorer.deinit(); + explorer.setRoot(io, root); + try watcher.initialScanWithWorkerCount(io, &store, &explorer, root, testing.allocator, false, 1); + + // control: the small file is indexed and searchable + const small_hits = try explorer.searchWord("tinyMarker", testing.allocator); + defer testing.allocator.free(small_hits); + try testing.expect(small_hits.len >= 1); + + // the 600KB file must ALSO be indexed (it's under the documented 1MB cap). + const big_hits = try explorer.searchWord("bigMarkerXYZ", testing.allocator); + defer testing.allocator.free(big_hits); + try testing.expect(big_hits.len >= 1); // fails on main: big.zig dropped at the 512KB gate +} diff --git a/src/watcher.zig b/src/watcher.zig index c57102a..7f89e77 100644 --- a/src/watcher.zig +++ b/src/watcher.zig @@ -107,6 +107,11 @@ const WorkerParsedResults = struct { } }; +/// #635: max file size codedb will read + index (outline/symbol/word). 
Files up +/// to 1MB also get trigram coverage; 1MB..this cap get outline+word but skip +/// trigram (see effective_skip_trigram); past this cap the file is skipped. +/// Was 512KB, which silently dropped 512KB-1MB source files entirely. +const max_indexed_file_bytes = 2 * 1024 * 1024; const skip_dirs = [_][]const u8{ ".git", ".claude", @@ -448,8 +453,13 @@ fn parseInitialScanEntry(io: std.Io, root: []const u8, entry: InitialScanEntry, const dir = try std.Io.Dir.cwd().openDir(io, root, .{}); defer dir.close(io); const stat = try dir.statFile(io, entry.path, .{}); - if (stat.size > 512 * 1024) return null; - const content = try dir.readFileAlloc(io, entry.path, arena_alloc, .limited(512 * 1024)); + if (stat.size > max_indexed_file_bytes) { + // #635: surface the skip instead of dropping it silently. Reachable via + // codedb_read (disk fallback) but invisible to search/symbol/outline. + std.log.warn("codedb: not indexing {s} ({d} bytes > {d} cap) — reachable only via codedb_read", .{ entry.path, stat.size, max_indexed_file_bytes }); + return null; + } + const content = try dir.readFileAlloc(io, entry.path, arena_alloc, .limited(max_indexed_file_bytes)); const check_len = @min(content.len, 512); for (content[0..check_len]) |c| { if (c == 0) return null; @@ -597,8 +607,8 @@ fn readFileEntry(io: std.Io, root: []const u8, entry: InitialScanEntry, arena_al const dir = std.Io.Dir.cwd().openDir(io, root, .{}) catch return null; defer dir.close(io); const stat = dir.statFile(io, entry.path, .{}) catch return null; - if (stat.size > 512 * 1024) return null; - const c = dir.readFileAlloc(io, entry.path, arena_alloc, .limited(512 * 1024)) catch return null; + if (stat.size > max_indexed_file_bytes) return null; + const c = dir.readFileAlloc(io, entry.path, arena_alloc, .limited(max_indexed_file_bytes)) catch return null; const check_len = @min(c.len, 512); for (c[0..check_len]) |ch| { if (ch == 0) return null; @@ -921,8 +931,8 @@ pub fn initialScan(io: std.Io, store: *Store, 
explorer: *Explorer, root: []const fn indexFileOutline(io: std.Io, explorer: *Explorer, dir: std.Io.Dir, path: []const u8, allocator: std.mem.Allocator) !void { if (shouldSkipFile(path)) return; const stat = try dir.statFile(io, path, .{}); - if (stat.size > 512 * 1024) return; - const content = try dir.readFileAlloc(io, path, allocator, .limited(512 * 1024)); + if (stat.size > max_indexed_file_bytes) return; + const content = try dir.readFileAlloc(io, path, allocator, .limited(max_indexed_file_bytes)); defer allocator.free(content); const check_len = @min(content.len, 512); for (content[0..check_len]) |c| { @@ -1063,7 +1073,7 @@ fn hashFile(io: std.Io, dir: std.Io.Dir, path: []const u8, size: u64) !u64 { // Returns maxInt(u64) on IO error so the value always differs from a valid // previously stored hash of 0, preventing a false "content unchanged" conclusion. if (shouldSkipFile(path)) return 0; - if (size > 512 * 1024) return 0; + if (size > max_indexed_file_bytes) return 0; const file = dir.openFile(io, path, .{}) catch return std.math.maxInt(u64); defer file.close(io); @@ -1193,12 +1203,12 @@ fn indexFileContent(io: std.Io, explorer: *Explorer, dir: std.Io.Dir, path: []co if (shouldSkipFile(path)) return; const stat = try dir.statFile(io, path, .{}); - // Skip files over 512KB (likely minified bundles or generated) - if (stat.size > 512 * 1024) return; + // Skip files over the max_indexed_file_bytes cap (likely minified bundles or generated) + if (stat.size > max_indexed_file_bytes) return; // Use page_allocator arena for content — pages returned to OS immediately // via munmap on deinit, eliminating GPA page retention from content churn.
var content_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); defer content_arena.deinit(); - const content = try dir.readFileAlloc(io, path, content_arena.allocator(), .limited(512 * 1024)); + const content = try dir.readFileAlloc(io, path, content_arena.allocator(), .limited(max_indexed_file_bytes)); // Skip binary content (check first 512 bytes for null bytes) const check_len = @min(content.len, 512); for (content[0..check_len]) |c| {