From f2c80d7d619e2f684d2172a59871ba84c23870f5 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 19 Apr 2026 09:45:01 -0700 Subject: [PATCH 1/4] Raise provider headers timeout from 10m to 30m Deep-thinking models (Minimax M2.5, Kimi K2.5, GLM-5.1, GPT-5) can spend 15+ minutes in the reasoning phase before emitting the first token. The 10-min headersTimeout was cutting them off mid-think and surfacing as "Agent run error: The operation timed out." Co-Authored-By: Claude Opus 4.7 (1M context) --- web/src/llm-api/canopywave.ts | 2 +- web/src/llm-api/fireworks.ts | 2 +- web/src/llm-api/openai.ts | 2 +- web/src/llm-api/openrouter.ts | 2 +- web/src/llm-api/siliconflow.ts | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/web/src/llm-api/canopywave.ts b/web/src/llm-api/canopywave.ts index 52fe1885c3..0db3e0f9cb 100644 --- a/web/src/llm-api/canopywave.ts +++ b/web/src/llm-api/canopywave.ts @@ -19,7 +19,7 @@ const CANOPYWAVE_BASE_URL = 'https://inference.canopywave.io/v1' // Extended timeout for deep-thinking models that can take // a long time to start streaming. -const CANOPYWAVE_HEADERS_TIMEOUT_MS = 10 * 60 * 1000 +const CANOPYWAVE_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 const canopywaveAgent = new Agent({ headersTimeout: CANOPYWAVE_HEADERS_TIMEOUT_MS, diff --git a/web/src/llm-api/fireworks.ts b/web/src/llm-api/fireworks.ts index 83b99abcc9..6e304638d7 100644 --- a/web/src/llm-api/fireworks.ts +++ b/web/src/llm-api/fireworks.ts @@ -20,7 +20,7 @@ const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1' // Extended timeout for deep-thinking models that can take // a long time to start streaming. 
-const FIREWORKS_HEADERS_TIMEOUT_MS = 10 * 60 * 1000 +const FIREWORKS_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 const fireworksAgent = new Agent({ headersTimeout: FIREWORKS_HEADERS_TIMEOUT_MS, diff --git a/web/src/llm-api/openai.ts b/web/src/llm-api/openai.ts index 8f619e8357..960ef63c99 100644 --- a/web/src/llm-api/openai.ts +++ b/web/src/llm-api/openai.ts @@ -62,7 +62,7 @@ const OUTPUT_TOKEN_COSTS: Record = { // Extended timeout for deep-thinking models (e.g., gpt-5.x) that can take // a long time to start streaming. -const OPENAI_HEADERS_TIMEOUT_MS = 10 * 60 * 1000 +const OPENAI_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 const openaiAgent = new Agent({ headersTimeout: OPENAI_HEADERS_TIMEOUT_MS, diff --git a/web/src/llm-api/openrouter.ts b/web/src/llm-api/openrouter.ts index a8528764fa..2762a60d8d 100644 --- a/web/src/llm-api/openrouter.ts +++ b/web/src/llm-api/openrouter.ts @@ -42,7 +42,7 @@ const GENERATION_LOOKUP_DELAY_MS = 500 // Extended timeout for deep-thinking models (e.g., gpt-5) that can take // a long time to start streaming. -const OPENROUTER_HEADERS_TIMEOUT_MS = 10 * 60 * 1000 +const OPENROUTER_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 const openrouterAgent = new Agent({ headersTimeout: OPENROUTER_HEADERS_TIMEOUT_MS, diff --git a/web/src/llm-api/siliconflow.ts b/web/src/llm-api/siliconflow.ts index 6398fe184f..936c3f7b28 100644 --- a/web/src/llm-api/siliconflow.ts +++ b/web/src/llm-api/siliconflow.ts @@ -19,7 +19,7 @@ const SILICONFLOW_BASE_URL = 'https://api.siliconflow.com/v1' // Extended timeout for deep-thinking models that can take // a long time to start streaming. 
-const SILICONFLOW_HEADERS_TIMEOUT_MS = 10 * 60 * 1000 +const SILICONFLOW_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 const siliconflowAgent = new Agent({ headersTimeout: SILICONFLOW_HEADERS_TIMEOUT_MS, From 35021d817ba366ce5f65e6644edfb68b7dcce9b6 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 19 Apr 2026 09:45:53 -0700 Subject: [PATCH 2/4] Make prefill queue health more strict --- web/src/server/free-session/fireworks-health.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/src/server/free-session/fireworks-health.ts b/web/src/server/free-session/fireworks-health.ts index 73cec6cbb3..c102e721c0 100644 --- a/web/src/server/free-session/fireworks-health.ts +++ b/web/src/server/free-session/fireworks-health.ts @@ -18,7 +18,7 @@ export type FireworksHealth = 'healthy' | 'degraded' | 'unhealthy' /** Degrade once median prefill-queue latency crosses this bound. Strict by * design — a 1s queue on top of ~1s prefill already means users feel 2s+ * before first token. */ -export const PREFILL_QUEUE_DEGRADED_MS = 200 +export const PREFILL_QUEUE_DEGRADED_MS = 125 /** Leading indicator of load — responds instantly to memory pressure, while * prefill-queue p50 is a lagging window statistic. Degrading here lets us From 2bbd3b1bfe5da8b0c987b8e53a7514b0d36627fd Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 19 Apr 2026 09:48:13 -0700 Subject: [PATCH 3/4] Estimate waiting room wait as 1 minute per spot ahead Decouples the user-facing wait estimate from the admission tick rate. The estimate is now a rough one-minute-per-spot rule of thumb, which reads more intuitively in the CLI than a tick-derived number that shifts with deployment cadence. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/freebuff-waiting-room.md | 8 +++---- .../session/__tests__/session.test.ts | 1 - .../free-session/__tests__/public-api.test.ts | 2 -- .../__tests__/session-view.test.ts | 16 ++++++------- web/src/server/free-session/public-api.ts | 4 ---- web/src/server/free-session/session-view.ts | 23 ++++++++----------- 6 files changed, 20 insertions(+), 34 deletions(-) diff --git a/docs/freebuff-waiting-room.md b/docs/freebuff-waiting-room.md index 5dfe3d5a99..604046715e 100644 --- a/docs/freebuff-waiting-room.md +++ b/docs/freebuff-waiting-room.md @@ -246,16 +246,16 @@ This is a **trust-the-client** design: the server still admits requests during t ## Estimated Wait Time -Computed in `session-view.ts` from the drip-admission rate: +Computed in `session-view.ts` as a rough one-minute-per-spot-ahead estimate: ``` -waitMs = (position - 1) * admissionTickMs +waitMs = (position - 1) * 60_000 ``` - Position 1 → 0 (next tick admits you) -- Position 2 → one tick, and so on. +- Position 2 → one minute, and so on. -This estimate **ignores health-gated pauses**: during a Fireworks incident admission halts entirely, so the actual wait can be longer. We choose to under-report here because showing "unknown" / "indefinite" is worse UX for the common case where the deployment is healthy. +This estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence and health-gated pauses (during a Fireworks incident admission halts entirely), so the real wait can be longer or shorter. 
## CLI Integration (frontend-side contract) diff --git a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts index d9cfb3ea48..83e0dc2995 100644 --- a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts +++ b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts @@ -33,7 +33,6 @@ function makeSessionDeps(overrides: Partial = {}): SessionDeps & { return { rows, isWaitingRoomEnabled: () => true, - admissionTickMs: 15_000, graceMs: 30 * 60 * 1000, now: () => now, getSessionRow: async (userId) => rows.get(userId) ?? null, diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts index 2e307d62c9..df34b75567 100644 --- a/web/src/server/free-session/__tests__/public-api.test.ts +++ b/web/src/server/free-session/__tests__/public-api.test.ts @@ -11,7 +11,6 @@ import type { SessionDeps } from '../public-api' import type { InternalSessionRow } from '../types' const SESSION_LEN = 60 * 60 * 1000 -const TICK_MS = 15_000 const GRACE_MS = 30 * 60 * 1000 function makeDeps(overrides: Partial = {}): SessionDeps & { @@ -36,7 +35,6 @@ function makeDeps(overrides: Partial = {}): SessionDeps & { }, _now: () => currentNow, isWaitingRoomEnabled: () => true, - admissionTickMs: TICK_MS, graceMs: GRACE_MS, now: () => currentNow, getSessionRow: async (userId) => rows.get(userId) ?? 
null, diff --git a/web/src/server/free-session/__tests__/session-view.test.ts b/web/src/server/free-session/__tests__/session-view.test.ts index 57d9d1e7d5..b3bdade6ab 100644 --- a/web/src/server/free-session/__tests__/session-view.test.ts +++ b/web/src/server/free-session/__tests__/session-view.test.ts @@ -4,7 +4,7 @@ import { estimateWaitMs, toSessionStateResponse } from '../session-view' import type { InternalSessionRow } from '../types' -const TICK_MS = 15_000 +const WAIT_PER_SPOT_MS = 60_000 const GRACE_MS = 30 * 60_000 function row(overrides: Partial = {}): InternalSessionRow { @@ -24,24 +24,22 @@ function row(overrides: Partial = {}): InternalSessionRow { describe('estimateWaitMs', () => { test('position 1 → 0 wait (next tick picks you up)', () => { - expect(estimateWaitMs({ position: 1, admissionTickMs: TICK_MS })).toBe(0) + expect(estimateWaitMs({ position: 1 })).toBe(0) }) - test('position N → (N-1) ticks ahead', () => { - expect(estimateWaitMs({ position: 2, admissionTickMs: TICK_MS })).toBe(TICK_MS) - expect(estimateWaitMs({ position: 10, admissionTickMs: TICK_MS })).toBe(9 * TICK_MS) + test('position N → (N-1) minutes ahead', () => { + expect(estimateWaitMs({ position: 2 })).toBe(WAIT_PER_SPOT_MS) + expect(estimateWaitMs({ position: 10 })).toBe(9 * WAIT_PER_SPOT_MS) }) test('degenerate inputs return 0', () => { - expect(estimateWaitMs({ position: 0, admissionTickMs: TICK_MS })).toBe(0) - expect(estimateWaitMs({ position: 5, admissionTickMs: 0 })).toBe(0) + expect(estimateWaitMs({ position: 0 })).toBe(0) }) }) describe('toSessionStateResponse', () => { const now = new Date('2026-04-17T12:00:00Z') const baseArgs = { - admissionTickMs: TICK_MS, graceMs: GRACE_MS, } @@ -69,7 +67,7 @@ describe('toSessionStateResponse', () => { instanceId: 'inst-1', position: 3, queueDepth: 10, - estimatedWaitMs: 2 * TICK_MS, + estimatedWaitMs: 2 * WAIT_PER_SPOT_MS, queuedAt: now.toISOString(), }) }) diff --git a/web/src/server/free-session/public-api.ts 
b/web/src/server/free-session/public-api.ts index c3b09b3b0e..759a516d73 100644 --- a/web/src/server/free-session/public-api.ts +++ b/web/src/server/free-session/public-api.ts @@ -1,5 +1,4 @@ import { - ADMISSION_TICK_MS, getSessionGraceMs, isWaitingRoomEnabled, } from './config' @@ -25,7 +24,6 @@ export interface SessionDeps { /** Plain values, not getters: these never change at runtime. The deps * interface uses values rather than thunks so tests can pass numbers * inline without wrapping. */ - admissionTickMs: number graceMs: number now?: () => Date } @@ -37,7 +35,6 @@ const defaultDeps: SessionDeps = { queueDepth, queuePositionFor, isWaitingRoomEnabled, - admissionTickMs: ADMISSION_TICK_MS, get graceMs() { // Read-through getter so test overrides via env still work; the value // itself is materialized once per call. Cheaper than a thunk because @@ -64,7 +61,6 @@ async function viewForRow( row, position, queueDepth: depth, - admissionTickMs: deps.admissionTickMs, graceMs: deps.graceMs, now: nowOf(deps), }) diff --git a/web/src/server/free-session/session-view.ts b/web/src/server/free-session/session-view.ts index b154e177b3..7ce1f75fe7 100644 --- a/web/src/server/free-session/session-view.ts +++ b/web/src/server/free-session/session-view.ts @@ -13,11 +13,10 @@ export function toSessionStateResponse(params: { row: InternalSessionRow | null position: number queueDepth: number - admissionTickMs: number graceMs: number now: Date }): SessionStateResponse | null { - const { row, position, queueDepth, admissionTickMs, graceMs, now } = params + const { row, position, queueDepth, graceMs, now } = params if (!row) return null if (row.status === 'active' && row.expires_at) { @@ -51,7 +50,7 @@ export function toSessionStateResponse(params: { instanceId: row.active_instance_id, position, queueDepth, - estimatedWaitMs: estimateWaitMs({ position, admissionTickMs }), + estimatedWaitMs: estimateWaitMs({ position }), queuedAt: row.queued_at.toISOString(), } } @@ -60,18 +59,14 @@ 
export function toSessionStateResponse(params: { return null } +const WAIT_MS_PER_SPOT_AHEAD = 60_000 + /** - * Wait-time estimate under the drip-admission model: one user per - * `admissionTickMs`, gated by Fireworks health. Ignoring health pauses, the - * user at position P waits roughly `(P - 1) * admissionTickMs`. - * + * Rough wait-time estimate shown to queued users: one minute per spot ahead. * Position 1 → 0ms (next tick picks you up). */ -export function estimateWaitMs(params: { - position: number - admissionTickMs: number -}): number { - const { position, admissionTickMs } = params - if (position <= 1 || admissionTickMs <= 0) return 0 - return (position - 1) * admissionTickMs +export function estimateWaitMs(params: { position: number }): number { + const { position } = params + if (position <= 1) return 0 + return (position - 1) * WAIT_MS_PER_SPOT_AHEAD } From 12ed322b4a1d5f026cb01f5fab959fc8a4a45d09 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 19 Apr 2026 09:52:47 -0700 Subject: [PATCH 4/4] Hide web referral banner --- web/src/app/layout.tsx | 2 -- 1 file changed, 2 deletions(-) diff --git a/web/src/app/layout.tsx b/web/src/app/layout.tsx index 05c0ee71ae..eecfa69b85 100644 --- a/web/src/app/layout.tsx +++ b/web/src/app/layout.tsx @@ -8,7 +8,6 @@ import { LayoutWrapper } from '@/components/layout-wrapper' import { Navbar } from '@/components/navbar/navbar' import QueryProvider from '@/components/providers/query-client-provider' import { ThemeProvider } from '@/components/theme-provider' -import { Banner } from '@/components/ui/banner' import { Toaster } from '@/components/ui/toaster' import { siteConfig } from '@/lib/constant' import { fonts } from '@/lib/fonts' @@ -67,7 +66,6 @@ export default function RootLayout({ -
{children}