Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/freebuff-waiting-room.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,16 +246,16 @@ This is a **trust-the-client** design: the server still admits requests during t

## Estimated Wait Time

Computed in `session-view.ts` from the drip-admission rate:
Computed in `session-view.ts` as a rough one-minute-per-spot-ahead estimate:

```
waitMs = (position - 1) * admissionTickMs
waitMs = (position - 1) * 60_000
```

- Position 1 → 0 (you're next in line)
- Position 2 → one tick, and so on.
- Position 2 → one minute, and so on.

This estimate **ignores health-gated pauses**: during a Fireworks incident admission halts entirely, so the actual wait can be longer. We choose to under-report here because showing "unknown" / "indefinite" is worse UX for the common case where the deployment is healthy.
This estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence and health-gated pauses (during a Fireworks incident admission halts entirely), so the real wait can be longer or shorter.

## CLI Integration (frontend-side contract)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ function makeSessionDeps(overrides: Partial<SessionDeps> = {}): SessionDeps & {
return {
rows,
isWaitingRoomEnabled: () => true,
admissionTickMs: 15_000,
graceMs: 30 * 60 * 1000,
now: () => now,
getSessionRow: async (userId) => rows.get(userId) ?? null,
Expand Down
2 changes: 0 additions & 2 deletions web/src/app/layout.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import { LayoutWrapper } from '@/components/layout-wrapper'
import { Navbar } from '@/components/navbar/navbar'
import QueryProvider from '@/components/providers/query-client-provider'
import { ThemeProvider } from '@/components/theme-provider'
import { Banner } from '@/components/ui/banner'
import { Toaster } from '@/components/ui/toaster'
import { siteConfig } from '@/lib/constant'
import { fonts } from '@/lib/fonts'
Expand Down Expand Up @@ -67,7 +66,6 @@ export default function RootLayout({
<SessionProvider>
<QueryProvider>
<PostHogProvider>
<Banner />
<Navbar />
<div className="flex-grow">
<LayoutWrapper>{children}</LayoutWrapper>
Expand Down
2 changes: 1 addition & 1 deletion web/src/llm-api/canopywave.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const CANOPYWAVE_BASE_URL = 'https://inference.canopywave.io/v1'

// Extended timeout for deep-thinking models that can take
// a long time to start streaming.
const CANOPYWAVE_HEADERS_TIMEOUT_MS = 10 * 60 * 1000
const CANOPYWAVE_HEADERS_TIMEOUT_MS = 30 * 60 * 1000

const canopywaveAgent = new Agent({
headersTimeout: CANOPYWAVE_HEADERS_TIMEOUT_MS,
Expand Down
2 changes: 1 addition & 1 deletion web/src/llm-api/fireworks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1'

// Extended timeout for deep-thinking models that can take
// a long time to start streaming.
const FIREWORKS_HEADERS_TIMEOUT_MS = 10 * 60 * 1000
const FIREWORKS_HEADERS_TIMEOUT_MS = 30 * 60 * 1000

const fireworksAgent = new Agent({
headersTimeout: FIREWORKS_HEADERS_TIMEOUT_MS,
Expand Down
2 changes: 1 addition & 1 deletion web/src/llm-api/openai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ const OUTPUT_TOKEN_COSTS: Record<string, number> = {

// Extended timeout for deep-thinking models (e.g., gpt-5.x) that can take
// a long time to start streaming.
const OPENAI_HEADERS_TIMEOUT_MS = 10 * 60 * 1000
const OPENAI_HEADERS_TIMEOUT_MS = 30 * 60 * 1000

const openaiAgent = new Agent({
headersTimeout: OPENAI_HEADERS_TIMEOUT_MS,
Expand Down
2 changes: 1 addition & 1 deletion web/src/llm-api/openrouter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ const GENERATION_LOOKUP_DELAY_MS = 500

// Extended timeout for deep-thinking models (e.g., gpt-5) that can take
// a long time to start streaming.
const OPENROUTER_HEADERS_TIMEOUT_MS = 10 * 60 * 1000
const OPENROUTER_HEADERS_TIMEOUT_MS = 30 * 60 * 1000

const openrouterAgent = new Agent({
headersTimeout: OPENROUTER_HEADERS_TIMEOUT_MS,
Expand Down
2 changes: 1 addition & 1 deletion web/src/llm-api/siliconflow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const SILICONFLOW_BASE_URL = 'https://api.siliconflow.com/v1'

// Extended timeout for deep-thinking models that can take
// a long time to start streaming.
const SILICONFLOW_HEADERS_TIMEOUT_MS = 10 * 60 * 1000
const SILICONFLOW_HEADERS_TIMEOUT_MS = 30 * 60 * 1000

const siliconflowAgent = new Agent({
headersTimeout: SILICONFLOW_HEADERS_TIMEOUT_MS,
Expand Down
2 changes: 0 additions & 2 deletions web/src/server/free-session/__tests__/public-api.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import type { SessionDeps } from '../public-api'
import type { InternalSessionRow } from '../types'

const SESSION_LEN = 60 * 60 * 1000
const TICK_MS = 15_000
const GRACE_MS = 30 * 60 * 1000

function makeDeps(overrides: Partial<SessionDeps> = {}): SessionDeps & {
Expand All @@ -36,7 +35,6 @@ function makeDeps(overrides: Partial<SessionDeps> = {}): SessionDeps & {
},
_now: () => currentNow,
isWaitingRoomEnabled: () => true,
admissionTickMs: TICK_MS,
graceMs: GRACE_MS,
now: () => currentNow,
getSessionRow: async (userId) => rows.get(userId) ?? null,
Expand Down
16 changes: 7 additions & 9 deletions web/src/server/free-session/__tests__/session-view.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { estimateWaitMs, toSessionStateResponse } from '../session-view'

import type { InternalSessionRow } from '../types'

const TICK_MS = 15_000
const WAIT_PER_SPOT_MS = 60_000
const GRACE_MS = 30 * 60_000

function row(overrides: Partial<InternalSessionRow> = {}): InternalSessionRow {
Expand All @@ -24,24 +24,22 @@ function row(overrides: Partial<InternalSessionRow> = {}): InternalSessionRow {

describe('estimateWaitMs', () => {
test('position 1 → 0 wait (next tick picks you up)', () => {
expect(estimateWaitMs({ position: 1, admissionTickMs: TICK_MS })).toBe(0)
expect(estimateWaitMs({ position: 1 })).toBe(0)
})

test('position N → (N-1) ticks ahead', () => {
expect(estimateWaitMs({ position: 2, admissionTickMs: TICK_MS })).toBe(TICK_MS)
expect(estimateWaitMs({ position: 10, admissionTickMs: TICK_MS })).toBe(9 * TICK_MS)
test('position N → (N-1) minutes ahead', () => {
expect(estimateWaitMs({ position: 2 })).toBe(WAIT_PER_SPOT_MS)
expect(estimateWaitMs({ position: 10 })).toBe(9 * WAIT_PER_SPOT_MS)
})

test('degenerate inputs return 0', () => {
expect(estimateWaitMs({ position: 0, admissionTickMs: TICK_MS })).toBe(0)
expect(estimateWaitMs({ position: 5, admissionTickMs: 0 })).toBe(0)
expect(estimateWaitMs({ position: 0 })).toBe(0)
})
})

describe('toSessionStateResponse', () => {
const now = new Date('2026-04-17T12:00:00Z')
const baseArgs = {
admissionTickMs: TICK_MS,
graceMs: GRACE_MS,
}

Expand Down Expand Up @@ -69,7 +67,7 @@ describe('toSessionStateResponse', () => {
instanceId: 'inst-1',
position: 3,
queueDepth: 10,
estimatedWaitMs: 2 * TICK_MS,
estimatedWaitMs: 2 * WAIT_PER_SPOT_MS,
queuedAt: now.toISOString(),
})
})
Expand Down
2 changes: 1 addition & 1 deletion web/src/server/free-session/fireworks-health.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ export type FireworksHealth = 'healthy' | 'degraded' | 'unhealthy'
/** Degrade once median prefill-queue latency crosses this bound. Strict by
* design — a 1s queue on top of ~1s prefill already means users feel 2s+
* before first token. */
export const PREFILL_QUEUE_DEGRADED_MS = 200
export const PREFILL_QUEUE_DEGRADED_MS = 125

/** Leading indicator of load — responds instantly to memory pressure, while
* prefill-queue p50 is a lagging window statistic. Degrading here lets us
Expand Down
4 changes: 0 additions & 4 deletions web/src/server/free-session/public-api.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import {
ADMISSION_TICK_MS,
getSessionGraceMs,
isWaitingRoomEnabled,
} from './config'
Expand All @@ -25,7 +24,6 @@ export interface SessionDeps {
  /** Plain values on the interface (not thunks) so tests can pass numbers
   * inline without wrapping. The default implementation may still back a
   * field with a property getter (see graceMs below). */
admissionTickMs: number
graceMs: number
now?: () => Date
}
Expand All @@ -37,7 +35,6 @@ const defaultDeps: SessionDeps = {
queueDepth,
queuePositionFor,
isWaitingRoomEnabled,
admissionTickMs: ADMISSION_TICK_MS,
get graceMs() {
// Read-through getter so test overrides via env still work; the value
// itself is materialized once per call. Cheaper than a thunk because
Expand All @@ -64,7 +61,6 @@ async function viewForRow(
row,
position,
queueDepth: depth,
admissionTickMs: deps.admissionTickMs,
graceMs: deps.graceMs,
now: nowOf(deps),
})
Expand Down
23 changes: 9 additions & 14 deletions web/src/server/free-session/session-view.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@ export function toSessionStateResponse(params: {
row: InternalSessionRow | null
position: number
queueDepth: number
admissionTickMs: number
graceMs: number
now: Date
}): SessionStateResponse | null {
const { row, position, queueDepth, admissionTickMs, graceMs, now } = params
const { row, position, queueDepth, graceMs, now } = params
if (!row) return null

if (row.status === 'active' && row.expires_at) {
Expand Down Expand Up @@ -51,7 +50,7 @@ export function toSessionStateResponse(params: {
instanceId: row.active_instance_id,
position,
queueDepth,
estimatedWaitMs: estimateWaitMs({ position, admissionTickMs }),
estimatedWaitMs: estimateWaitMs({ position }),
queuedAt: row.queued_at.toISOString(),
}
}
Expand All @@ -60,18 +59,14 @@ export function toSessionStateResponse(params: {
return null
}

const WAIT_MS_PER_SPOT_AHEAD = 60_000

/**
* Wait-time estimate under the drip-admission model: one user per
* `admissionTickMs`, gated by Fireworks health. Ignoring health pauses, the
* user at position P waits roughly `(P - 1) * admissionTickMs`.
*
* Rough wait-time estimate shown to queued users: one minute per spot ahead.
 * Position 1 → 0ms (you're next in line).
*/
export function estimateWaitMs(params: {
position: number
admissionTickMs: number
}): number {
const { position, admissionTickMs } = params
if (position <= 1 || admissionTickMs <= 0) return 0
return (position - 1) * admissionTickMs
export function estimateWaitMs(params: { position: number }): number {
const { position } = params
if (position <= 1) return 0
return (position - 1) * WAIT_MS_PER_SPOT_AHEAD
}
Loading