Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions backend/src/api/powersync.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ const powersyncSettings: Settings = {
mistralApiKey: '',
anthropicApiKey: '',
exaApiKey: '',
thunderboltInferenceUrl: '',
thunderboltInferenceApiKey: '',
tinfoilApiKey: '',
tinfoilEnclaveUrl: 'https://inference.tinfoil.sh/v1',
monitoringToken: '',
Expand Down
4 changes: 0 additions & 4 deletions backend/src/config/settings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ const settingsSchema = z
mistralApiKey: z.string().default(''),
anthropicApiKey: z.string().default(''),
exaApiKey: z.string().default(''),
thunderboltInferenceUrl: z.string().default(''),
thunderboltInferenceApiKey: z.string().default(''),
tinfoilApiKey: z.string().default(''),
// Include the `/v1` API prefix — Tinfoil's OpenAI-compatible endpoints live
// under `/v1/chat/completions`, `/v1/models`, etc.
Expand Down Expand Up @@ -148,8 +146,6 @@ const parseSettings = (): Settings => {
mistralApiKey: process.env.MISTRAL_API_KEY || '',
anthropicApiKey: process.env.ANTHROPIC_API_KEY || '',
exaApiKey: process.env.EXA_API_KEY || '',
thunderboltInferenceUrl: process.env.THUNDERBOLT_INFERENCE_URL || '',
thunderboltInferenceApiKey: process.env.THUNDERBOLT_INFERENCE_API_KEY || '',
tinfoilApiKey: process.env.TINFOIL_API_KEY || '',
tinfoilEnclaveUrl: process.env.TINFOIL_ENCLAVE_URL || 'https://inference.tinfoil.sh/v1',
monitoringToken: process.env.MONITORING_TOKEN || '',
Expand Down
45 changes: 1 addition & 44 deletions backend/src/inference/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { getPostHogClient, isPostHogConfigured } from '@/posthog/client'
import { OpenAI as PostHogOpenAI } from '@posthog/ai'
import OpenAI from 'openai'

export type InferenceProvider = 'fireworks' | 'thunderbolt' | 'mistral' | 'anthropic'
export type InferenceProvider = 'fireworks' | 'mistral' | 'anthropic'

type InferenceClient = {
client: OpenAI | PostHogOpenAI
Expand All @@ -19,11 +19,6 @@ type InferenceClient = {
*/
let fireworksClient: OpenAI | PostHogOpenAI | null = null

/**
* Lazily initialized Thunderbolt client
*/
let thunderboltClient: OpenAI | PostHogOpenAI | null = null

/**
* Lazily initialized Mistral client
*/
Expand Down Expand Up @@ -70,42 +65,6 @@ const getFireworksClient = (fetchFn?: typeof fetch): OpenAI | PostHogOpenAI => {
return client
}

/**
 * Build (or reuse) the OpenAI-compatible client for the Thunderbolt inference
 * endpoint that serves gpt-oss.
 *
 * @param fetchFn - Optional custom `fetch` implementation. When supplied, the
 *   module-level cache is neither read nor written, so every call produces a
 *   fresh client (primarily for testing).
 * @returns A PostHog-wrapped client when PostHog is configured, otherwise a
 *   plain `OpenAI` client.
 * @throws Error when the Thunderbolt inference URL or API key is empty.
 */
const getThunderboltClient = (fetchFn?: typeof fetch): OpenAI | PostHogOpenAI => {
  // The cache is only valid for the default transport.
  const cacheable = !fetchFn
  if (cacheable && thunderboltClient) {
    return thunderboltClient
  }

  const { thunderboltInferenceUrl: baseURL, thunderboltInferenceApiKey: apiKey } = getSettings()
  if (!(baseURL && apiKey)) {
    throw new Error('Thunderbolt inference URL or API key not configured')
  }

  // `fetch` is only added to the options when a custom implementation was passed in.
  const clientOptions = {
    apiKey,
    baseURL,
    ...(fetchFn && { fetch: fetchFn }),
  }

  let instance: OpenAI | PostHogOpenAI
  if (isPostHogConfigured()) {
    instance = new PostHogOpenAI({
      ...clientOptions,
      posthog: getPostHogClient(fetchFn),
    })
  } else {
    instance = new OpenAI(clientOptions)
  }

  if (cacheable) {
    thunderboltClient = instance
  }

  return instance
}

/**
* Get the Mistral AI client using OpenAI-compatible API
*/
Expand Down Expand Up @@ -180,7 +139,6 @@ const getAnthropicClient = (fetchFn?: typeof fetch): OpenAI | PostHogOpenAI => {
*/
export const getInferenceClient = (provider: InferenceProvider, fetchFn?: typeof fetch): InferenceClient => {
const clientMap: Record<InferenceProvider, () => OpenAI | PostHogOpenAI> = {
thunderbolt: () => getThunderboltClient(fetchFn),
mistral: () => getMistralClient(fetchFn),
anthropic: () => getAnthropicClient(fetchFn),
fireworks: () => getFireworksClient(fetchFn),
Expand All @@ -200,7 +158,6 @@ export const getInferenceClient = (provider: InferenceProvider, fetchFn?: typeof
*/
export const clearInferenceClientCache = () => {
  // Reset every module-level provider client slot to null in one chained
  // assignment, so the next lookup has to construct a new client.
  fireworksClient = thunderboltClient = mistralClient = anthropicClient = null
}
Expand Down
2 changes: 0 additions & 2 deletions backend/src/inference/posthog-privacy.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ describe('Inference Routes - PostHog Privacy Integration', () => {
POSTHOG_API_KEY: process.env.POSTHOG_API_KEY,
POSTHOG_HOST: process.env.POSTHOG_HOST,
FIREWORKS_API_KEY: process.env.FIREWORKS_API_KEY,
THUNDERBOLT_INFERENCE_URL: process.env.THUNDERBOLT_INFERENCE_URL,
THUNDERBOLT_INFERENCE_API_KEY: process.env.THUNDERBOLT_INFERENCE_API_KEY,
}

capturedFetches = []
Expand Down
69 changes: 1 addition & 68 deletions backend/src/inference/routes.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -121,37 +121,6 @@ describe('Inference Routes', () => {
expect(createSSEStreamSpy).toHaveBeenCalledWith(mockCompletion)
})

// Verifies that a request for the public model id 'gpt-oss-120b' is served by
// the 'thunderbolt' provider and forwarded under its internal model name.
it('should route gpt-oss-120b model to thunderbolt provider', async () => {
// Stub the client factory so it hands back the shared mock client labelled as 'thunderbolt'.
getInferenceClientSpy.mockReturnValue({
client: mockOpenAIClient as unknown as OpenAI,
provider: 'thunderbolt',
})

// The mocked completion call resolves to a mock stream.
const mockCompletion = createMockStream()
mockCreateCompletion.mockImplementation(() => Promise.resolve(mockCompletion))

// Same payload as the baseline request, with only the model id swapped.
const gptOssRequest = {
...validRequestBody,
model: 'gpt-oss-120b',
}

const response = await app.handle(
new Request('http://localhost/chat/completions', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(gptOssRequest),
}),
)

expect(response.status).toBe(200)
// The route must ask the factory for the 'thunderbolt' provider...
expect(getInferenceClientSpy).toHaveBeenCalledWith('thunderbolt')
// ...and send the provider-facing model name, not the public id.
expect(mockCreateCompletion).toHaveBeenCalledWith(
expect.objectContaining({
model: 'openai/gpt-oss-120b',
}),
)
})

it('should route mistral models to mistral provider', async () => {
const mockCompletion = createMockStream()
mockCreateCompletion.mockImplementation(() => Promise.resolve(mockCompletion))
Expand Down Expand Up @@ -229,42 +198,6 @@ describe('Inference Routes', () => {
isPostHogConfiguredSpy.mockReturnValue(false)
})

// Verifies that, with PostHog reported as configured, a 'gpt-oss-120b' request
// passes `posthogProperties.model_provider: 'thunderbolt'` to the completion call.
it('should include correct provider in PostHog properties for gpt-oss-120b', async () => {
// Pretend PostHog is configured for the duration of this test.
isPostHogConfiguredSpy.mockReturnValue(true)
// Stub the client factory so it hands back the shared mock client labelled as 'thunderbolt'.
getInferenceClientSpy.mockReturnValue({
client: mockOpenAIClient as unknown as OpenAI,
provider: 'thunderbolt',
})

// The mocked completion call resolves to a mock stream.
const mockCompletion = createMockStream()
mockCreateCompletion.mockImplementation(() => Promise.resolve(mockCompletion))

// Same payload as the baseline request, with only the model id swapped.
const gptOssRequest = {
...validRequestBody,
model: 'gpt-oss-120b',
}

const response = await app.handle(
new Request('http://localhost/chat/completions', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(gptOssRequest),
}),
)

expect(response.status).toBe(200)
// Only the provider tag inside posthogProperties is pinned; other fields are left unconstrained.
expect(mockCreateCompletion).toHaveBeenCalledWith(
expect.objectContaining({
posthogProperties: expect.objectContaining({
model_provider: 'thunderbolt',
}),
}),
)

// Reset for other tests
isPostHogConfiguredSpy.mockReturnValue(false)
})

it('should reject non-streaming requests', async () => {
const nonStreamingRequest = {
...validRequestBody,
Expand Down Expand Up @@ -330,7 +263,7 @@ describe('Inference Routes', () => {
})

it('should validate all supported models', () => {
const expectedModels = ['gpt-oss-120b', 'mistral-medium-3.1', 'mistral-large-3', 'sonnet-4.5', 'opus-4.8']
const expectedModels = ['mistral-medium-3.1', 'mistral-large-3', 'sonnet-4.5', 'opus-4.8']
expect(Object.keys(supportedModels)).toEqual(expectedModels)
})

Expand Down
4 changes: 0 additions & 4 deletions backend/src/inference/routes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,6 @@ type ModelConfig = {
}

export const supportedModels: Record<string, ModelConfig> = {
'gpt-oss-120b': {
provider: 'thunderbolt',
internalName: 'openai/gpt-oss-120b',
},
'mistral-medium-3.1': {
provider: 'mistral',
internalName: 'mistral-medium-2508',
Expand Down
2 changes: 0 additions & 2 deletions backend/src/test-utils/settings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ export const createTestSettings = (overrides: Partial<Settings> = {}): Settings
mistralApiKey: '',
anthropicApiKey: '',
exaApiKey: '',
thunderboltInferenceUrl: '',
thunderboltInferenceApiKey: '',
tinfoilApiKey: '',
tinfoilEnclaveUrl: 'https://inference.tinfoil.sh/v1',
monitoringToken: '',
Expand Down
61 changes: 28 additions & 33 deletions src/ai/eval/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,20 @@ Embedded E2E test runner that validates AI response quality across all models an
## Quick Start

```bash
# Run all 135 scenarios (3 models x 3 modes x 15 prompts)
# Run all scenarios
bun run eval

# Test only GPT-OSS
EVAL_MODELS=gpt-oss bun run eval
# Test only Opus
EVAL_MODELS=opus bun run eval

# Test only Chat mode across all models
EVAL_MODES=chat bun run eval

# Verbose mode — shows the full system prompt and model response for each scenario
EVAL_MODELS=gpt-oss EVAL_MODES=chat bun run eval -- --verbose
EVAL_MODELS=opus EVAL_MODES=chat bun run eval -- --verbose

# Test GPT-OSS in Search mode only
EVAL_MODELS=gpt-oss EVAL_MODES=search bun run eval

# Test Mistral and Sonnet in Chat and Search modes
EVAL_MODELS=mistral,sonnet EVAL_MODES=chat,search bun run eval
# Test Opus in Search mode only
EVAL_MODELS=opus EVAL_MODES=search bun run eval
```

> **Prerequisite**: The backend must be running at `localhost:8000` (or at whatever address `cloud_url` is configured to point to). The eval runner makes real API calls to the models.
Expand Down Expand Up @@ -55,20 +52,20 @@ Each scenario checks a combination of criteria depending on the mode:
Thunderbolt AI Eval Runner
========================================
Scenarios: 15
Models: gpt-oss
Models: opus
Modes: chat
Parallel: 3 (one per model)
Timeout: 120000ms per scenario
========================================

Starting batch: gpt-oss
Starting batch: opus

--- GPT-OSS (15 scenarios) ---
PASS gpt-oss/chat/C1 (2.1s)
PASS gpt-oss/chat/C2 (4.3s)
PASS gpt-oss/chat/C3 (1.8s)
FAIL gpt-oss/chat/C4 (60.0s) — Empty response — no text output produced
PASS gpt-oss/chat/C5 (1.2s)
--- OPUS (15 scenarios) ---
PASS opus/chat/C1 (2.1s)
PASS opus/chat/C2 (4.3s)
PASS opus/chat/C3 (1.8s)
FAIL opus/chat/C4 (60.0s) — Empty response — no text output produced
PASS opus/chat/C5 (1.2s)
...

============================================================
Expand All @@ -78,17 +75,17 @@ EVAL REPORT
Overall: 12/15 passed (80%)

By Model:
gpt-oss: 12/15 (80%)
opus: 12/15 (80%)

By Mode:
chat: 12/15 (80%)

Failures (3):
FAIL gpt-oss/chat/C4
FAIL opus/chat/C4
- Empty response — no text output produced
FAIL gpt-oss/chat/C11
FAIL opus/chat/C11
- Insufficient citations: 0 found, 2 required
FAIL gpt-oss/chat/C15
FAIL opus/chat/C15
- Empty response — no text output produced

============================================================
Expand All @@ -100,7 +97,7 @@ Report saved to: evals/eval-results.md

| Variable | Default | Example | Description |
| ------------------------ | ----------------------- | ----------------- | ------------------------------- |
| `EVAL_MODELS` | all | `gpt-oss,mistral` | Which models to test |
| `EVAL_MODELS` | all | `opus` | Which models to test |
| `EVAL_MODES` | all | `chat,search` | Which modes to test |
| `EVAL_SCENARIO_PARALLEL` | `3` | `1` | Concurrent scenarios per worker |
| `EVAL_TIMEOUT` | `120000` | `60000` | Timeout per scenario (ms) |
Expand All @@ -116,12 +113,12 @@ Report saved to: evals/eval-results.md
Example with detailed report:

```
$ EVAL_MODELS=gpt-oss EVAL_MODES=chat bun run eval -- --detailed
$ EVAL_MODELS=opus EVAL_MODES=chat bun run eval -- --detailed

# The markdown report at evals/eval-results.md will include:
## Failures

### gpt-oss/chat/C4
### opus/chat/C4

- **Prompt**: Compare the iPhone 16 Pro and Samsung Galaxy S25 Ultra
- **Duration**: 60.0s
Expand All @@ -134,10 +131,10 @@ $ EVAL_MODELS=gpt-oss EVAL_MODES=chat bun run eval -- --detailed
Example with verbose:

```
$ EVAL_MODELS=gpt-oss EVAL_MODES=chat bun run eval -- --verbose
$ EVAL_MODELS=opus EVAL_MODES=chat bun run eval -- --verbose

--- SYSTEM PROMPT (gpt-oss/chat/C1) ---
You are an executive assistant using the **GPT OSS** model...
--- SYSTEM PROMPT (opus/chat/C1) ---
You are an executive assistant using the **Opus 4.8** model...
# Principles
...
# Active Mode (follow these instructions)
Expand All @@ -146,9 +143,9 @@ Make quick decisions—don't overthink...
What are the top 3 news stories today?
--- END PROMPT ---

PASS gpt-oss/chat/C1 (2.1s)
PASS opus/chat/C1 (2.1s)

--- RESPONSE (gpt-oss/chat/C1) ---
--- RESPONSE (opus/chat/C1) ---
Here are the three leading stories on AP News for February 16, 2026:
- **Europeans push back at the U.S...** [1]
- **"First feline" Larry marks 15 years...** [2]
Expand All @@ -160,9 +157,7 @@ Here are the three leading stories on AP News for February 16, 2026:

Use these names in `EVAL_MODELS`:

- `gpt-oss` — GPT OSS 120B (self-hosted)
- `mistral` — Mistral Medium 3.1
- `sonnet` — Sonnet 4.5
- `opus` — Opus 4.8

### Mode names

Expand All @@ -174,7 +169,7 @@ Use these names in `EVAL_MODES`:

## Scenarios

135 total scenarios: 15 prompts per mode, tested against each of 3 models.
15 prompts per mode, tested against each registered model.

**Chat mode** covers: news queries, product recommendations, factual lookups, comparisons, multi-part travel queries, medical info, stock market data, and more.

Expand Down
Loading
Loading