Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Bolt's Performance Journal

## 2025-05-14 - Optimized AI Vector Generation and Normalization
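**Learning:** `toFixed(n)` is extremely slow for precision rounding in hot paths because it converts the number to a string that then has to be parsed back. Pure mathematical rounding using `Math.round(x * 10 ** n) / 10 ** n` is significantly faster (note that `10^n` is bitwise XOR in JavaScript, not exponentiation). Because `Math.round` rounds ties toward +∞, round `Math.abs(x)` and reapply the sign when negative values must round symmetrically. In Node 22+, `crypto.hash` is preferred over `crypto.createHash` for single-shot hashing (the API was added in v20.12.0 / v21.7.0). Consolidating redundant normalization calls in vector pipelines yields measurable gains.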
**Action:** Always prefer mathematical rounding over `toFixed` in hot paths. Use `crypto.hash` for one-off hashes in modern Node.js environments. Look for redundant O(N) operations in data processing pipelines.
21 changes: 21 additions & 0 deletions packages/ai/src/benchmark.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@

import { generateEmbeddings } from "./index.js";

/**
 * Times synthetic embedding generation for a fixed batch of sample texts and
 * prints the total time, the per-embedding average, and the throughput.
 *
 * A small warmup batch is generated first; it runs before the timer starts
 * and is therefore excluded from the reported numbers.
 */
async function runBenchmark(): Promise<void> {
  const sampleCount = 50;
  const warmupCount = 5;
  const inputs = Array.from(
    { length: sampleCount },
    (_, i) => `This is some sample text for embedding generation number ${i} to test performance.`
  );

  console.log(`Starting benchmark: ${sampleCount} synthetic embeddings...`);

  // Warmup
  await generateEmbeddings(inputs.slice(0, warmupCount), { forceSynthetic: true });

  // performance.now() is monotonic with sub-millisecond resolution, whereas
  // Date.now() only ticks once per millisecond and follows the wall clock.
  const start = performance.now();
  await generateEmbeddings(inputs, { forceSynthetic: true });
  const elapsedMs = performance.now() - start;

  console.log(`Time taken for ${sampleCount} embeddings: ${elapsedMs.toFixed(3)}ms`);
  console.log(`Average time per embedding: ${(elapsedMs / sampleCount).toFixed(3)}ms`);

  // Guard the division so a zero reading does not print "Infinity".
  const throughput = elapsedMs > 0 ? (sampleCount / (elapsedMs / 1000)).toFixed(1) : "n/a";
  console.log(`Throughput: ${throughput} embeddings/sec`);
}

// Log the failure and exit non-zero so scripts and CI notice a broken benchmark
// (logging alone would leave the exit status at 0).
runBenchmark().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
28 changes: 28 additions & 0 deletions packages/ai/src/index.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@

import { describe, it, expect } from "vitest";
import { generateEmbeddings, embeddingContentHash } from "./index.js";

// Behavioral checks for the synthetic (forceSynthetic) embedding path.
describe("synthetic embeddings", () => {
  it("should be deterministic", async () => {
    const text = "hello world";
    const [res1] = await generateEmbeddings([text], { forceSynthetic: true });
    const [res2] = await generateEmbeddings([text], { forceSynthetic: true });

    expect(res1.values).toEqual(res2.values);
    expect(res1.contentHash).toBe(embeddingContentHash(text));
  });

  it("should differ for different inputs", async () => {
    const [first, second] = await generateEmbeddings(["first input", "second input"], {
      forceSynthetic: true
    });

    // Guards against a regression where the seed stops depending on the text.
    expect(first.values).not.toEqual(second.values);
  });

  it("should have correct dimensions", async () => {
    const text = "test dimensions";
    const [res] = await generateEmbeddings([text], { forceSynthetic: true });
    expect(res.values).toHaveLength(1536);
    expect(res.dimensions).toBe(1536);
  });

  it("should be normalized", async () => {
    const text = "test normalization";
    const [res] = await generateEmbeddings([text], { forceSynthetic: true });
    const magnitude = Math.sqrt(res.values.reduce((sum, v) => sum + v * v, 0));
    // Each of the 1536 components is rounded to 8 decimals, so the magnitude
    // can drift from 1 by up to sqrt(1536) * 0.5e-8 ≈ 1.96e-7. Precision 6
    // (tolerance 5e-7) covers that bound; precision 7 (5e-8) does not.
    expect(magnitude).toBeCloseTo(1, 6);
  });
});
46 changes: 32 additions & 14 deletions packages/ai/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,37 +13,54 @@ const MAX_RETRIES = 2;
/**
 * Collapses every run of whitespace in `value` to a single space and strips
 * leading and trailing whitespace.
 */
const normalizeText = (value: string): string => {
  const collapsed = value.replace(/\s+/g, " ");
  return collapsed.trim();
};

/**
 * Returns the hex-encoded SHA-256 digest of `value` after whitespace
 * normalization, so texts that differ only in spacing share one hash.
 *
 * Uses the single-shot `crypto.hash` helper rather than building a Hash
 * object with `crypto.createHash` for a one-off digest.
 */
const contentHashFor = (value: string): string =>
  crypto.hash("sha256", normalizeText(value), "hex");

/**
 * Deterministically maps a `(seed, index)` pair to a number in [0, 1].
 *
 * The string `${seed}:${index}` is hashed with SHA-256 and the first four
 * digest bytes, read as a big-endian unsigned 32-bit integer, are divided by
 * the largest 32-bit value.
 *
 * @param seed - Seed string shared by every component of one vector.
 * @param index - Component index mixed into the hashed string.
 * @returns A value between 0 and 1 inclusive.
 */
const seededUnitValue = (seed: string, index: number): number => {
  // Single-shot hash; the "buffer" output encoding yields the raw digest bytes.
  const digest = crypto.hash("sha256", `${seed}:${index}`, "buffer");
  // Only the leading 4 of the 32 digest bytes are consumed.
  const int = digest.readUInt32BE(0);
  return int / 0xffffffff;
};

/**
 * Builds a deterministic pseudo-embedding for `text`.
 *
 * Each component is derived from the text's content hash and the component
 * index, then mapped from [0, 1] to [-1, 1]. The result is neither rounded
 * nor normalized here; that is done once in `toEmbeddingVectorRecord`.
 *
 * @param text - Source text; whitespace differences do not affect the result.
 * @param dimensions - Number of components to generate.
 * @returns An array of `dimensions` values in [-1, 1].
 */
const syntheticVector = (text: string, dimensions = DEFAULT_EMBEDDING_DIMENSIONS): number[] => {
  // contentHashFor normalizes whitespace itself, so a separate normalizeText
  // pass over the text would be redundant work.
  const hash = contentHashFor(text);
  return Array.from({ length: dimensions }, (_, index) => seededUnitValue(hash, index) * 2 - 1);
};

/**
 * Scales `values` to unit Euclidean length and rounds every component to
 * 8 decimal places.
 *
 * @param values - Vector components.
 * @returns The input array itself when it is empty; an all-zero array of the
 *   same length when the magnitude is zero; otherwise a new array holding the
 *   scaled, rounded components.
 */
const normalizeVector = (values: number[]): number[] => {
  const len = values.length;
  if (len === 0) {
    return values;
  }

  let sumOfSquares = 0;
  for (const value of values) {
    sumOfSquares += value * value;
  }

  const magnitude = Math.sqrt(sumOfSquares);
  if (magnitude === 0) {
    return new Array<number>(len).fill(0);
  }

  // Multiply by the reciprocal once computed instead of dividing per element.
  const invMag = 1 / magnitude;
  const result = new Array<number>(len);
  let i = 0;
  for (const value of values) {
    const scaled = value * invMag;
    // Arithmetic rounding to 8 decimals (cheaper than toFixed(8)); rounding the
    // absolute value and reapplying the sign keeps ties symmetric around zero.
    result[i++] = (Math.sign(scaled) * Math.round(Math.abs(scaled) * 1e8)) / 1e8;
  }
  return result;
};

const toEmbeddingVectorRecord = (
Expand All @@ -52,6 +69,7 @@ const toEmbeddingVectorRecord = (
provider: EmbeddingProvider,
model: string
): EmbeddingVectorRecord => ({
// Centralized normalization and rounding
values: normalizeVector(values),
dimensions: values.length,
provider,
Expand Down Expand Up @@ -127,7 +145,7 @@ const callOpenAiEmbeddings = async (
const data = payload.data ?? [];
return data
.sort((left, right) => (left.index ?? 0) - (right.index ?? 0))
.map((record) => normalizeVector(record.embedding ?? []));
.map((record) => record.embedding ?? []); // Normalization happens in toEmbeddingVectorRecord
};

const embedBatchLive = async (
Expand Down
Loading