diff --git a/.jules/bolt.md b/.jules/bolt.md new file mode 100644 index 0000000..e69de29 diff --git a/packages/ai/src/index.test.ts b/packages/ai/src/index.test.ts new file mode 100644 index 0000000..08e2af7 --- /dev/null +++ b/packages/ai/src/index.test.ts @@ -0,0 +1,61 @@ + +import { describe, it, expect } from "vitest"; +import { + generateEmbeddings, + cosineSimilarity, + normalizeEmbeddingText, + embeddingContentHash, + embeddingDimensions +} from "./index.js"; + +describe("AI Embedding Utils", () => { + it("should normalize text correctly", () => { + expect(normalizeEmbeddingText(" hello world ")).toBe("hello world"); + }); + + it("should generate consistent hashes", () => { + const text = "test message"; + const hash1 = embeddingContentHash(text); + const hash2 = embeddingContentHash(text); + expect(hash1).toBe(hash2); + expect(hash1).toMatch(/^[a-f0-9]{64}$/); + }); + + it("should generate deterministic synthetic embeddings", async () => { + const text = "deterministic test"; + const result1 = await generateEmbeddings([text], { forceSynthetic: true }); + const result2 = await generateEmbeddings([text], { forceSynthetic: true }); + + expect(result1[0].values).toEqual(result2[0].values); + expect(result1[0].dimensions).toBe(embeddingDimensions); + }); + + it("should produce normalized synthetic vectors", async () => { + const text = "normalized test"; + const [record] = await generateEmbeddings([text], { forceSynthetic: true }); + + const magnitude = Math.sqrt( + record.values.reduce((sum, val) => sum + val * val, 0) + ); + // Should be very close to 1 + expect(magnitude).toBeGreaterThan(0.999999); + expect(magnitude).toBeLessThan(1.000001); + }); + + it("should calculate cosine similarity correctly", () => { + const v1 = [1, 0, 0]; + const v2 = [1, 0, 0]; + const v3 = [0, 1, 0]; + const v4 = [-1, 0, 0]; + + expect(cosineSimilarity(v1, v2)).toBeCloseTo(1); + expect(cosineSimilarity(v1, v3)).toBeCloseTo(0); + expect(cosineSimilarity(v1, v4)).toBeCloseTo(-1); + }); + + it("should handle empty or mismatched vectors in cosineSimilarity", () => { + expect(cosineSimilarity([], [1, 2, 3])).toBe(0); + expect(cosineSimilarity([1, 2], [1, 2, 3])).toBe(0); + expect(cosineSimilarity(undefined, [1])).toBe(0); + }); +}); diff --git a/packages/ai/src/index.ts b/packages/ai/src/index.ts index 6eb88fc..111b465 100644 --- a/packages/ai/src/index.ts +++ b/packages/ai/src/index.ts @@ -15,21 +15,33 @@ const normalizeText = (value: string) => value.replace(/\s+/g, " ").trim(); const contentHashFor = (value: string) => crypto.createHash("sha256").update(normalizeText(value)).digest("hex"); -const seededUnitValue = (seed: string, index: number) => { - const digest = crypto.createHash("sha256").update(`${seed}:${index}`).digest(); - const int = digest.readUInt32BE(0); - return int / 0xffffffff; +/** + * Fast, deterministic PRNG based on Mulberry32. + * @param seed - 32-bit integer seed + */ +const mulberry32 = (seed: number) => { + return () => { + /* biome-ignore lint/style/noParameterAssign: optimization */ + seed |= 0; + /* biome-ignore lint/style/noParameterAssign: optimization */ + seed = (seed + 0x6d2b79f5) | 0; + let t = Math.imul(seed ^ (seed >>> 15), 1 | seed); + t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t; + return ((t ^ (t >>> 14)) >>> 0) / 4294967296; + }; }; const syntheticVector = (text: string, dimensions = DEFAULT_EMBEDDING_DIMENSIONS) => { const normalized = normalizeText(text); - const hash = contentHashFor(normalized); - const values = Array.from({ - length: dimensions - }, (_, index) => { - const centered = seededUnitValue(hash, index) * 2 - 1; - return Number(centered.toFixed(8)); - }); + const hash = crypto.createHash("sha256").update(normalized).digest(); + const view = new DataView(hash.buffer, hash.byteOffset, hash.byteLength); + const seed = view.getUint32(0); + const prng = mulberry32(seed); + + const values = new Array(dimensions); + for (let i = 0; i < dimensions; i++) { + values[i] = prng() * 2 - 1; + } return normalizeVector(values); }; @@ -43,7 +55,12 @@ const normalizeVector = (values: number[]) => { return values.map(() => 0); } - return values.map((value) => Number((value / magnitude).toFixed(8))); + const invMagnitude = 1 / magnitude; + return values.map((value) => { + const v = value * invMagnitude; + // Fast rounding to 8 decimal places + return Math.sign(v) * Math.round(Math.abs(v) * 1e8) / 1e8; + }); }; const toEmbeddingVectorRecord = (