diff --git a/docs/freebuff-waiting-room.md b/docs/freebuff-waiting-room.md index 5dfe3d5a9..604046715 100644 --- a/docs/freebuff-waiting-room.md +++ b/docs/freebuff-waiting-room.md @@ -246,16 +246,16 @@ This is a **trust-the-client** design: the server still admits requests during t ## Estimated Wait Time -Computed in `session-view.ts` from the drip-admission rate: +Computed in `session-view.ts` as a rough one-minute-per-spot-ahead estimate: ``` -waitMs = (position - 1) * admissionTickMs +waitMs = (position - 1) * 60_000 ``` - Position 1 → 0 (next tick admits you) -- Position 2 → one tick, and so on. +- Position 2 → one minute, and so on. -This estimate **ignores health-gated pauses**: during a Fireworks incident admission halts entirely, so the actual wait can be longer. We choose to under-report here because showing "unknown" / "indefinite" is worse UX for the common case where the deployment is healthy. +This estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence and health-gated pauses (during a Fireworks incident admission halts entirely), so the real wait can be longer or shorter. ## CLI Integration (frontend-side contract) diff --git a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts index d9cfb3ea4..83e0dc299 100644 --- a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts +++ b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts @@ -33,7 +33,6 @@ function makeSessionDeps(overrides: Partial = {}): SessionDeps & { return { rows, isWaitingRoomEnabled: () => true, - admissionTickMs: 15_000, graceMs: 30 * 60 * 1000, now: () => now, getSessionRow: async (userId) => rows.get(userId) ?? null, diff --git a/web/src/app/layout.tsx b/web/src/app/layout.tsx index 05c0ee71a..eecfa69b8 100644 --- a/web/src/app/layout.tsx +++ b/web/src/app/layout.tsx @@ -8,7 +8,6 @@ import { LayoutWrapper } from '@/components/layout-wrapper' import { Navbar } from '@/components/navbar/navbar' import QueryProvider from '@/components/providers/query-client-provider' import { ThemeProvider } from '@/components/theme-provider' -import { Banner } from '@/components/ui/banner' import { Toaster } from '@/components/ui/toaster' import { siteConfig } from '@/lib/constant' import { fonts } from '@/lib/fonts' @@ -67,7 +66,6 @@ export default function RootLayout({ -
{children} diff --git a/web/src/llm-api/canopywave.ts b/web/src/llm-api/canopywave.ts index 52fe1885c..0db3e0f9c 100644 --- a/web/src/llm-api/canopywave.ts +++ b/web/src/llm-api/canopywave.ts @@ -19,7 +19,7 @@ const CANOPYWAVE_BASE_URL = 'https://inference.canopywave.io/v1' // Extended timeout for deep-thinking models that can take // a long time to start streaming. -const CANOPYWAVE_HEADERS_TIMEOUT_MS = 10 * 60 * 1000 +const CANOPYWAVE_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 const canopywaveAgent = new Agent({ headersTimeout: CANOPYWAVE_HEADERS_TIMEOUT_MS, diff --git a/web/src/llm-api/fireworks.ts b/web/src/llm-api/fireworks.ts index 83b99abcc..6e304638d 100644 --- a/web/src/llm-api/fireworks.ts +++ b/web/src/llm-api/fireworks.ts @@ -20,7 +20,7 @@ const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1' // Extended timeout for deep-thinking models that can take // a long time to start streaming. -const FIREWORKS_HEADERS_TIMEOUT_MS = 10 * 60 * 1000 +const FIREWORKS_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 const fireworksAgent = new Agent({ headersTimeout: FIREWORKS_HEADERS_TIMEOUT_MS, diff --git a/web/src/llm-api/openai.ts b/web/src/llm-api/openai.ts index 8f619e835..960ef63c9 100644 --- a/web/src/llm-api/openai.ts +++ b/web/src/llm-api/openai.ts @@ -62,7 +62,7 @@ const OUTPUT_TOKEN_COSTS: Record = { // Extended timeout for deep-thinking models (e.g., gpt-5.x) that can take // a long time to start streaming. -const OPENAI_HEADERS_TIMEOUT_MS = 10 * 60 * 1000 +const OPENAI_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 const openaiAgent = new Agent({ headersTimeout: OPENAI_HEADERS_TIMEOUT_MS, diff --git a/web/src/llm-api/openrouter.ts b/web/src/llm-api/openrouter.ts index a8528764f..2762a60d8 100644 --- a/web/src/llm-api/openrouter.ts +++ b/web/src/llm-api/openrouter.ts @@ -42,7 +42,7 @@ const GENERATION_LOOKUP_DELAY_MS = 500 // Extended timeout for deep-thinking models (e.g., gpt-5) that can take // a long time to start streaming. -const OPENROUTER_HEADERS_TIMEOUT_MS = 10 * 60 * 1000 +const OPENROUTER_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 const openrouterAgent = new Agent({ headersTimeout: OPENROUTER_HEADERS_TIMEOUT_MS, diff --git a/web/src/llm-api/siliconflow.ts b/web/src/llm-api/siliconflow.ts index 6398fe184..936c3f7b2 100644 --- a/web/src/llm-api/siliconflow.ts +++ b/web/src/llm-api/siliconflow.ts @@ -19,7 +19,7 @@ const SILICONFLOW_BASE_URL = 'https://api.siliconflow.com/v1' // Extended timeout for deep-thinking models that can take // a long time to start streaming. -const SILICONFLOW_HEADERS_TIMEOUT_MS = 10 * 60 * 1000 +const SILICONFLOW_HEADERS_TIMEOUT_MS = 30 * 60 * 1000 const siliconflowAgent = new Agent({ headersTimeout: SILICONFLOW_HEADERS_TIMEOUT_MS, diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts index 2e307d62c..df34b7556 100644 --- a/web/src/server/free-session/__tests__/public-api.test.ts +++ b/web/src/server/free-session/__tests__/public-api.test.ts @@ -11,7 +11,6 @@ import type { SessionDeps } from '../public-api' import type { InternalSessionRow } from '../types' const SESSION_LEN = 60 * 60 * 1000 -const TICK_MS = 15_000 const GRACE_MS = 30 * 60 * 1000 function makeDeps(overrides: Partial = {}): SessionDeps & { @@ -36,7 +35,6 @@ function makeDeps(overrides: Partial = {}): SessionDeps & { }, _now: () => currentNow, isWaitingRoomEnabled: () => true, - admissionTickMs: TICK_MS, graceMs: GRACE_MS, now: () => currentNow, getSessionRow: async (userId) => rows.get(userId) ?? null, diff --git a/web/src/server/free-session/__tests__/session-view.test.ts b/web/src/server/free-session/__tests__/session-view.test.ts index 57d9d1e7d..b3bdade6a 100644 --- a/web/src/server/free-session/__tests__/session-view.test.ts +++ b/web/src/server/free-session/__tests__/session-view.test.ts @@ -4,7 +4,7 @@ import { estimateWaitMs, toSessionStateResponse } from '../session-view' import type { InternalSessionRow } from '../types' -const TICK_MS = 15_000 +const WAIT_PER_SPOT_MS = 60_000 const GRACE_MS = 30 * 60_000 function row(overrides: Partial = {}): InternalSessionRow { @@ -24,24 +24,22 @@ function row(overrides: Partial = {}): InternalSessionRow { describe('estimateWaitMs', () => { test('position 1 → 0 wait (next tick picks you up)', () => { - expect(estimateWaitMs({ position: 1, admissionTickMs: TICK_MS })).toBe(0) + expect(estimateWaitMs({ position: 1 })).toBe(0) }) - test('position N → (N-1) ticks ahead', () => { - expect(estimateWaitMs({ position: 2, admissionTickMs: TICK_MS })).toBe(TICK_MS) - expect(estimateWaitMs({ position: 10, admissionTickMs: TICK_MS })).toBe(9 * TICK_MS) + test('position N → (N-1) minutes ahead', () => { + expect(estimateWaitMs({ position: 2 })).toBe(WAIT_PER_SPOT_MS) + expect(estimateWaitMs({ position: 10 })).toBe(9 * WAIT_PER_SPOT_MS) }) test('degenerate inputs return 0', () => { - expect(estimateWaitMs({ position: 0, admissionTickMs: TICK_MS })).toBe(0) - expect(estimateWaitMs({ position: 5, admissionTickMs: 0 })).toBe(0) + expect(estimateWaitMs({ position: 0 })).toBe(0) }) }) describe('toSessionStateResponse', () => { const now = new Date('2026-04-17T12:00:00Z') const baseArgs = { - admissionTickMs: TICK_MS, graceMs: GRACE_MS, } @@ -69,7 +67,7 @@ describe('toSessionStateResponse', () => { instanceId: 'inst-1', position: 3, queueDepth: 10, - estimatedWaitMs: 2 * TICK_MS, + estimatedWaitMs: 2 * WAIT_PER_SPOT_MS, queuedAt: now.toISOString(), }) }) diff --git a/web/src/server/free-session/fireworks-health.ts b/web/src/server/free-session/fireworks-health.ts index 73cec6cbb..c102e721c 100644 --- a/web/src/server/free-session/fireworks-health.ts +++ b/web/src/server/free-session/fireworks-health.ts @@ -18,7 +18,7 @@ export type FireworksHealth = 'healthy' | 'degraded' | 'unhealthy' /** Degrade once median prefill-queue latency crosses this bound. Strict by * design — a 1s queue on top of ~1s prefill already means users feel 2s+ * before first token. */ -export const PREFILL_QUEUE_DEGRADED_MS = 200 +export const PREFILL_QUEUE_DEGRADED_MS = 125 /** Leading indicator of load — responds instantly to memory pressure, while * prefill-queue p50 is a lagging window statistic. Degrading here lets us diff --git a/web/src/server/free-session/public-api.ts b/web/src/server/free-session/public-api.ts index c3b09b3b0..759a516d7 100644 --- a/web/src/server/free-session/public-api.ts +++ b/web/src/server/free-session/public-api.ts @@ -1,5 +1,4 @@ import { - ADMISSION_TICK_MS, getSessionGraceMs, isWaitingRoomEnabled, } from './config' @@ -25,7 +24,6 @@ export interface SessionDeps { /** Plain values, not getters: these never change at runtime. The deps * interface uses values rather than thunks so tests can pass numbers * inline without wrapping. */ - admissionTickMs: number graceMs: number now?: () => Date } @@ -37,7 +35,6 @@ const defaultDeps: SessionDeps = { queueDepth, queuePositionFor, isWaitingRoomEnabled, - admissionTickMs: ADMISSION_TICK_MS, get graceMs() { // Read-through getter so test overrides via env still work; the value // itself is materialized once per call. Cheaper than a thunk because @@ -64,7 +61,6 @@ async function viewForRow( row, position, queueDepth: depth, - admissionTickMs: deps.admissionTickMs, graceMs: deps.graceMs, now: nowOf(deps), }) diff --git a/web/src/server/free-session/session-view.ts b/web/src/server/free-session/session-view.ts index b154e177b..7ce1f75fe 100644 --- a/web/src/server/free-session/session-view.ts +++ b/web/src/server/free-session/session-view.ts @@ -13,11 +13,10 @@ export function toSessionStateResponse(params: { row: InternalSessionRow | null position: number queueDepth: number - admissionTickMs: number graceMs: number now: Date }): SessionStateResponse | null { - const { row, position, queueDepth, admissionTickMs, graceMs, now } = params + const { row, position, queueDepth, graceMs, now } = params if (!row) return null if (row.status === 'active' && row.expires_at) { @@ -51,7 +50,7 @@ export function toSessionStateResponse(params: { instanceId: row.active_instance_id, position, queueDepth, - estimatedWaitMs: estimateWaitMs({ position, admissionTickMs }), + estimatedWaitMs: estimateWaitMs({ position }), queuedAt: row.queued_at.toISOString(), } } @@ -60,18 +59,14 @@ export function toSessionStateResponse(params: { return null } +const WAIT_MS_PER_SPOT_AHEAD = 60_000 + /** - * Wait-time estimate under the drip-admission model: one user per - * `admissionTickMs`, gated by Fireworks health. Ignoring health pauses, the - * user at position P waits roughly `(P - 1) * admissionTickMs`. - * + * Rough wait-time estimate shown to queued users: one minute per spot ahead. * Position 1 → 0ms (next tick picks you up). */ -export function estimateWaitMs(params: { - position: number - admissionTickMs: number -}): number { - const { position, admissionTickMs } = params - if (position <= 1 || admissionTickMs <= 0) return 0 - return (position - 1) * admissionTickMs +export function estimateWaitMs(params: { position: number }): number { + const { position } = params + if (position <= 1) return 0 + return (position - 1) * WAIT_MS_PER_SPOT_AHEAD }