Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions src/lib/embeddings.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,24 @@
// =============================================================================
// Embedding Providers for Preflight Timeline Search
// =============================================================================
// Converts text (prompts, commits, corrections) into dense vectors for
// semantic similarity search in LanceDB. Two providers are supported:
//
// 1. **Local** (default) — Xenova/all-MiniLM-L6-v2 (384-dim, ~90MB download
// on first use, runs entirely on-device, no API key needed)
// 2. **OpenAI** — text-embedding-3-small (1536-dim, requires OPENAI_API_KEY,
// batches up to 100 texts per request)
//
// Text is preprocessed before embedding: markdown is stripped, whitespace is
// normalized, and content is truncated to ~2048 chars (~512 tokens) to stay
// within model context limits.
//
// Usage:
// const provider = createEmbeddingProvider({ provider: "local" });
// const vector = await provider.embed("fix the auth bug");
// const vectors = await provider.embedBatch(["text1", "text2"]);
// =============================================================================

import { pipeline } from "@xenova/transformers";

// --- Types ---
Expand Down Expand Up @@ -26,9 +47,9 @@

// --- Local Provider (Xenova/transformers) ---

let extractor: any = null;

Check warning on line 50 in src/lib/embeddings.ts

View workflow job for this annotation

GitHub Actions / build-and-test (20)

Unexpected any. Specify a different type

Check warning on line 50 in src/lib/embeddings.ts

View workflow job for this annotation

GitHub Actions / build-and-test (22)

Unexpected any. Specify a different type

async function getExtractor(): Promise<any> {

Check warning on line 52 in src/lib/embeddings.ts

View workflow job for this annotation

GitHub Actions / build-and-test (20)

Unexpected any. Specify a different type

Check warning on line 52 in src/lib/embeddings.ts

View workflow job for this annotation

GitHub Actions / build-and-test (22)

Unexpected any. Specify a different type
if (!extractor) {
extractor = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");
}
Expand Down Expand Up @@ -92,7 +113,7 @@

const data = await resp.json();
// Sort by index to preserve order
const sorted = data.data.sort((a: any, b: any) => a.index - b.index);

Check warning on line 116 in src/lib/embeddings.ts

View workflow job for this annotation

GitHub Actions / build-and-test (20)

Unexpected any. Specify a different type

Check warning on line 116 in src/lib/embeddings.ts

View workflow job for this annotation

GitHub Actions / build-and-test (20)

Unexpected any. Specify a different type

Check warning on line 116 in src/lib/embeddings.ts

View workflow job for this annotation

GitHub Actions / build-and-test (22)

Unexpected any. Specify a different type

Check warning on line 116 in src/lib/embeddings.ts

View workflow job for this annotation

GitHub Actions / build-and-test (22)

Unexpected any. Specify a different type
for (const item of sorted) {
results.push(item.embedding);
}
Expand Down
44 changes: 44 additions & 0 deletions src/lib/timeline-db.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,34 @@
// =============================================================================
// Timeline Database — LanceDB-backed Semantic Search for Session History
// =============================================================================
// This is the persistence and query layer for preflight's "Timeline Intelligence"
// tools (search_history, timeline_view, scan_sessions, onboard_project).
//
// Architecture:
//   ~/.preflight/                  ← global preflight data directory
//   ├── config.json                ← embedding provider settings + legacy index
//   └── projects/
//       ├── index.json             ← registry mapping project paths → hashes
//       └── <hash>/                ← per-project data (hash of absolute path)
//           ├── timeline.lance/    ← LanceDB vector database
//           └── meta.json          ← project metadata (event counts, etc.)
//
// Each project gets its own LanceDB instance keyed by a SHA-256 hash of its
// absolute path. Events (prompts, responses, commits, corrections, errors) are
// embedded via the configured provider (see embeddings.ts) and stored with
// metadata for filtered retrieval.
//
// Key functions:
// insertEvents() — embed and store timeline events
// searchSemantic() — vector similarity search across projects
// searchExact() — SQL LIKE text search (no embeddings)
// getTimeline() — chronological event retrieval with filters
// listIndexedProjects()— list all onboarded projects and their stats
//
// Connection caching: LanceDB connections are cached per project directory
// in the `_connections` Map. The embedding provider is lazily initialized once.
// =============================================================================

import * as lancedb from "@lancedb/lancedb";
import { randomUUID } from "node:crypto";
import { readFile, writeFile, mkdir, stat } from "node:fs/promises";
Expand Down Expand Up @@ -191,6 +222,12 @@ async function getEmbedder(): Promise<EmbeddingProvider> {
return _embedder;
}

/**
* Get or create the "events" table for a project's LanceDB instance.
* On first call for a new project, creates the table with a seed record
* (then deletes it) because LanceDB requires at least one record to infer
* the schema and vector dimensions.
*/
export async function getEventsTable(projectDir: string): Promise<lancedb.Table> {
const db = await getDb(projectDir);
try {
Expand Down Expand Up @@ -222,6 +259,12 @@ export async function getEventsTable(projectDir: string): Promise<lancedb.Table>

// --- Core Operations ---

/**
* Embed and insert timeline events into LanceDB. Events are grouped by project
* directory (from event.project or the explicit projectDir param), embedded in
* batch via the configured provider, and stored with full metadata. Also updates
* the project registry and per-project metadata counts.
*/
export async function insertEvents(events: TimelineEvent[], projectDir?: string): Promise<void> {
if (events.length === 0) return;

Expand Down Expand Up @@ -289,6 +332,7 @@ export async function insertEvents(events: TimelineEvent[], projectDir?: string)
}
}

/** Build a SQL WHERE clause from search options for LanceDB filtering. */
function buildWhereFilter(opts: SearchOptions): string | undefined {
const clauses: string[] = [];
if (opts.project) clauses.push(`project = '${opts.project}'`);
Expand Down
Loading