From 794720bcd4522df7d5ddac832ad7d39a2edd582d Mon Sep 17 00:00:00 2001 From: Keegan Thompson Date: Fri, 10 Apr 2026 10:17:01 -0500 Subject: [PATCH] fix: search index corruption on incremental updates and compile indexing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The incremental search index (used by ingest → addDocument → save) had two compounding bugs that caused search to silently degrade after multiple ingests: 1. serialize() used d.tokens.length instead of d.tokenCount — loaded docs have tokens: [] (by design), so every save overwrote real token counts with 0. 2. recomputeIdf() iterated doc.tokens (empty for loaded docs) instead of doc.termFreqs.keys(), so IDF was only computed from the most recently added document, making all prior terms unsearchable. Additionally: - Compile now updates the search index with wiki articles (previously only ingest updated it, so --wiki searches returned nothing after compile). - --wiki/--raw scope flags now filter results from cached indexes (previously scope was only applied during build, not when loading from cache). - Fix all strict TypeScript errors across core and CLI packages. Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/cli/src/commands/search.ts | 6 +++++ packages/cli/src/commands/watch.test.ts | 5 +++- packages/core/src/compile/compiler.ts | 29 ++++++++++++++++++++++ packages/core/src/daemon/folder-watcher.ts | 6 ++--- packages/core/src/recovery.ts | 2 +- packages/core/src/search/engine.ts | 17 ++++++------- packages/core/src/skills/registry.ts | 6 ++--- 7 files changed, 54 insertions(+), 17 deletions(-) diff --git a/packages/cli/src/commands/search.ts b/packages/cli/src/commands/search.ts index fcd0c0a..f163017 100644 --- a/packages/cli/src/commands/search.ts +++ b/packages/cli/src/commands/search.ts @@ -135,6 +135,12 @@ export async function search(term: string, opts: SearchOpts) { spinner.stop(); + // Filter results by scope when loaded from cache (cache contains all scopes) + if (scope !== "all") { + const scopeDir = scope === "wiki" ? "/wiki/" : "/raw/"; + results = results.filter((r) => r.path.includes(scopeDir)); + } + if (opts.json) { console.log(JSON.stringify(results, null, 2)); return; diff --git a/packages/cli/src/commands/watch.test.ts b/packages/cli/src/commands/watch.test.ts index dd3c31b..e5de777 100644 --- a/packages/cli/src/commands/watch.test.ts +++ b/packages/cli/src/commands/watch.test.ts @@ -57,7 +57,10 @@ describe("watch: HTTP server /ingest", () => { }); test("builds correct markdown without url", () => { - const body = { content: "Body text", title: "Title Only" }; + const body: { content: string; title: string; url?: string } = { + content: "Body text", + title: "Title Only", + }; const fullContent = body.title ? `# ${body.title}\n\n${body.url ? `Source: ${body.url}\n\n` : ""}${body.content}` : body.content; diff --git a/packages/core/src/compile/compiler.ts b/packages/core/src/compile/compiler.ts index 08cfedf..d06b5cc 100644 --- a/packages/core/src/compile/compiler.ts +++ b/packages/core/src/compile/compiler.ts @@ -700,6 +700,35 @@ async function compileVaultInner( await saveManifest(root, manifest); + // Update search index with newly compiled wiki articles + try { + const { SearchIndex } = await import("../search/engine.js"); + const searchIndex = new SearchIndex(); + await searchIndex.load(root); + for (const op of allOperations) { + if ((op.op === "create" || op.op === "update") && op.content) { + const { frontmatter, body } = parseFrontmatter(op.content); + const title = + (frontmatter.title as string) ?? op.path.split("/").pop()?.replace(/\.md$/, "") ?? ""; + const tags = Array.isArray(frontmatter.tags) ? (frontmatter.tags as string[]) : []; + const date = + (frontmatter.created as string) ?? + (frontmatter.updated as string) ?? + new Date().toISOString().slice(0, 10); + searchIndex.addDocument({ + path: join(root, op.path), + title, + content: body, + tags, + date, + }); + } + } + await searchIndex.save(root); + } catch { + // Search index update is best-effort — don't fail the compile + } + // Log compile activity const parts = [`${sourcesToCompile.length} sources compiled`]; if (totalCreated > 0) parts.push(`${totalCreated} articles created`); diff --git a/packages/core/src/daemon/folder-watcher.ts b/packages/core/src/daemon/folder-watcher.ts index 1533f47..cef9e63 100644 --- a/packages/core/src/daemon/folder-watcher.ts +++ b/packages/core/src/daemon/folder-watcher.ts @@ -25,14 +25,14 @@ export function matchGlob(filename: string, pattern: string): boolean { // Handle *.{ext1,ext2} pattern const braceMatch = pattern.match(/^\*\.\{(.+)\}$/); - if (braceMatch) { + if (braceMatch?.[1]) { const extensions = braceMatch[1].split(",").map((e) => `.${e.trim()}`); return extensions.includes(extname(filename).toLowerCase()); } // Handle *.ext pattern const extMatch = pattern.match(/^\*(\..+)$/); - if (extMatch) { + if (extMatch?.[1]) { return extname(filename).toLowerCase() === extMatch[1].toLowerCase(); } @@ -105,7 +105,7 @@ export async function scanFolder(folder: WatchFolder): Promise { try { const files = await readdir(absPath, { recursive: folder.recursive }); for (const file of files) { - const name = typeof file === "string" ? file : file.toString(); + const name = String(file); const base = name.includes("/") ? name.split("/").pop()! : name; if (!base.startsWith(".") && matchGlob(base, folder.glob)) { matches.push(join(absPath, name)); diff --git a/packages/core/src/recovery.ts b/packages/core/src/recovery.ts index 5b559b0..558c2c3 100644 --- a/packages/core/src/recovery.ts +++ b/packages/core/src/recovery.ts @@ -145,7 +145,7 @@ export async function repairVault(root: string): Promise { .sort() .reverse(); - if (backups.length > 0) { + if (backups[0]) { const latest = join(backupsDir, backups[0]); try { const backup = await readFile(latest, "utf-8"); diff --git a/packages/core/src/search/engine.ts b/packages/core/src/search/engine.ts index b9d8b55..e026555 100644 --- a/packages/core/src/search/engine.ts +++ b/packages/core/src/search/engine.ts @@ -406,7 +406,7 @@ export class SearchIndex { path: d.path, title: d.title, snippet: d.content.slice(0, 200), - tokenCount: d.tokens.length, + tokenCount: d.tokenCount, termFreqs: [...d.termFreqs.entries()], tags: d.tags, date: d.date, @@ -436,9 +436,10 @@ export class SearchIndex { try { const raw = await readFile(path, "utf-8"); - const data = JSON.parse(raw) as SerializedIndex & { version: number }; + const data = JSON.parse(raw) as SerializedIndex; + const version = data.version as number; - if (data.version !== 1 && data.version !== 2) return false; + if (version !== 1 && version !== 2) return false; this.documents = data.documents.map((d) => ({ path: d.path, @@ -507,12 +508,10 @@ export class SearchIndex { const docFreq = new Map(); for (const doc of this.documents) { - const seen = new Set(); - for (const token of doc.tokens) { - if (!seen.has(token)) { - docFreq.set(token, (docFreq.get(token) ?? 0) + 1); - seen.add(token); - } + // Use termFreqs.keys() instead of tokens — loaded docs have tokens: [] + // but termFreqs is always populated (from serialization or addDocument) + for (const term of doc.termFreqs.keys()) { + docFreq.set(term, (docFreq.get(term) ?? 0) + 1); } } diff --git a/packages/core/src/skills/registry.ts b/packages/core/src/skills/registry.ts index 3cae0e4..42098c4 100644 --- a/packages/core/src/skills/registry.ts +++ b/packages/core/src/skills/registry.ts @@ -235,13 +235,13 @@ async function installFromGitHub( skillsDir: string, ): Promise { // repo format: "user/repo" or "user/repo#branch" - const [repoPath, branch] = repo.split("#"); + const [repoPath, branch] = repo.split("#") as [string, string | undefined]; const parts = repoPath.split("/"); - if (parts.length !== 2) { + if (parts.length !== 2 || !parts[1]) { throw new Error(`Invalid GitHub repo format: "${repo}". Expected "user/repo"`); } - const repoName = parts[1]; + const repoName: string = parts[1]; const destDir = join(skillsDir, repoName); if (existsSync(destDir)) {