preflight-dev · TerminalGravity · Mar 11, 2026
diff --git a/src/lib/embeddings.ts b/src/lib/embeddings.ts
@@ -1,3 +1,24 @@
+// =============================================================================
+// Embedding Providers for Preflight Timeline Search
+// =============================================================================
+// Converts text (prompts, commits, corrections) into dense vectors for
+// semantic similarity search in LanceDB. Two providers are supported:
+//
+//   1. **Local** (default) — Xenova/all-MiniLM-L6-v2 (384-dim, ~90MB download
+//      on first use, runs entirely on-device, no API key needed)
+//   2. **OpenAI** — text-embedding-3-small (1536-dim, requires OPENAI_API_KEY,
+//      batches up to 100 texts per request)
+//
+// Text is preprocessed before embedding: markdown is stripped, whitespace is
+// normalized, and content is truncated to ~2048 chars (~512 tokens) to stay
+// within model context limits.
+//
+// Usage:
+//   const provider = createEmbeddingProvider({ provider: "local" });
+//   const vector = await provider.embed("fix the auth bug");
+//   const vectors = await provider.embedBatch(["text1", "text2"]);
+// =============================================================================
+
 import { pipeline } from "@xenova/transformers";
 
 // --- Types ---
@@ -26,9 +47,9 @@

 // --- Local Provider (Xenova/transformers) ---

 let extractor: any = null;

 async function getExtractor(): Promise<any> {
  if (!extractor) {
    extractor = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");
  }
@@ -92,7 +113,7 @@

      const data = await resp.json();
      // Sort by index to preserve order
      const sorted = data.data.sort((a: any, b: any) => a.index - b.index);
      for (const item of sorted) {
        results.push(item.embedding);
      }

diff --git a/src/lib/timeline-db.ts b/src/lib/timeline-db.ts
@@ -1,3 +1,34 @@
+// =============================================================================
+// Timeline Database — LanceDB-backed Semantic Search for Session History
+// =============================================================================
+// This is the persistence and query layer for preflight's "Timeline Intelligence"
+// tools (search_history, timeline_view, scan_sessions, onboard_project).
+//
+// Architecture:
+//   ~/.preflight/                     ← global preflight data directory
+//   ├── config.json                   ← embedding provider settings + legacy index
+//   └── projects/
+//       ├── index.json                ← registry mapping project paths → hashes
+//       └── <hash>/                   ← per-project data (hash of absolute path)
+//           ├── timeline.lance/       ← LanceDB vector database
+//           └── meta.json             ← project metadata (event counts, etc.)
+//
+// Each project gets its own LanceDB instance keyed by a SHA-256 hash of its
+// absolute path. Events (prompts, responses, commits, corrections, errors) are
+// embedded via the configured provider (see embeddings.ts) and stored with
+// metadata for filtered retrieval.
+//
+// Key functions:
+//   insertEvents()        — embed and store timeline events
+//   searchSemantic()      — vector similarity search across projects
+//   searchExact()         — SQL LIKE text search (no embeddings)
+//   getTimeline()         — chronological event retrieval with filters
+//   listIndexedProjects() — list all onboarded projects and their stats
+//
+// Connection caching: LanceDB connections are cached per project directory
+// in the `_connections` Map. The embedding provider is lazily initialized once.
+// =============================================================================
+
 import * as lancedb from "@lancedb/lancedb";
 import { randomUUID } from "node:crypto";
 import { readFile, writeFile, mkdir, stat } from "node:fs/promises";
@@ -191,6 +222,12 @@ async function getEmbedder(): Promise<EmbeddingProvider> {
   return _embedder;
 }
 
+/**
+ * Get or create the "events" table for a project's LanceDB instance.
+ * On first call for a new project, creates the table with a seed record
+ * (then deletes it) because LanceDB requires at least one record to infer
+ * the schema and vector dimensions.
+ */
 export async function getEventsTable(projectDir: string): Promise<lancedb.Table> {
   const db = await getDb(projectDir);
   try {
@@ -222,6 +259,12 @@ export async function getEventsTable(projectDir: string): Promise<lancedb.Table>
 
 // --- Core Operations ---
 
+/**
+ * Embed and insert timeline events into LanceDB. Events are grouped by project
+ * directory (from event.project or the explicit projectDir param), embedded in
+ * batch via the configured provider, and stored with full metadata. Also updates
+ * the project registry and per-project metadata counts.
+ */
 export async function insertEvents(events: TimelineEvent[], projectDir?: string): Promise<void> {
   if (events.length === 0) return;
 
@@ -289,6 +332,7 @@ export async function insertEvents(events: TimelineEvent[], projectDir?: string)
   }
 }
 
+/** Build a SQL WHERE clause for LanceDB filtering from search options. NOTE: `opts.project` is interpolated as-is (no escaping). */
 function buildWhereFilter(opts: SearchOptions): string | undefined {
   const clauses: string[] = [];
   if (opts.project) clauses.push(`project = '${opts.project}'`);