From f2c80d7d619e2f684d2172a59871ba84c23870f5 Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Sun, 19 Apr 2026 09:45:01 -0700
Subject: [PATCH 1/4] Raise provider headers timeout from 10m to 30m

Deep-thinking models (Minimax M2.5, Kimi K2.5, GLM-5.1, GPT-5) can spend
15+ minutes in the reasoning phase before emitting the first token.
The 10-min headersTimeout was cutting them off mid-think and surfacing
as "Agent run error: The operation timed out."

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 web/src/llm-api/canopywave.ts  | 2 +-
 web/src/llm-api/fireworks.ts   | 2 +-
 web/src/llm-api/openai.ts      | 2 +-
 web/src/llm-api/openrouter.ts  | 2 +-
 web/src/llm-api/siliconflow.ts | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/web/src/llm-api/canopywave.ts b/web/src/llm-api/canopywave.ts
index 52fe1885c3..0db3e0f9cb 100644
--- a/web/src/llm-api/canopywave.ts
+++ b/web/src/llm-api/canopywave.ts
@@ -19,7 +19,7 @@ const CANOPYWAVE_BASE_URL = 'https://inference.canopywave.io/v1'
 
 // Extended timeout for deep-thinking models that can take
 // a long time to start streaming.
-const CANOPYWAVE_HEADERS_TIMEOUT_MS = 10 * 60 * 1000
+const CANOPYWAVE_HEADERS_TIMEOUT_MS = 30 * 60 * 1000
 
 const canopywaveAgent = new Agent({
   headersTimeout: CANOPYWAVE_HEADERS_TIMEOUT_MS,
diff --git a/web/src/llm-api/fireworks.ts b/web/src/llm-api/fireworks.ts
index 83b99abcc9..6e304638d7 100644
--- a/web/src/llm-api/fireworks.ts
+++ b/web/src/llm-api/fireworks.ts
@@ -20,7 +20,7 @@ const FIREWORKS_BASE_URL = 'https://api.fireworks.ai/inference/v1'
 
 // Extended timeout for deep-thinking models that can take
 // a long time to start streaming.
-const FIREWORKS_HEADERS_TIMEOUT_MS = 10 * 60 * 1000
+const FIREWORKS_HEADERS_TIMEOUT_MS = 30 * 60 * 1000
 
 const fireworksAgent = new Agent({
   headersTimeout: FIREWORKS_HEADERS_TIMEOUT_MS,
diff --git a/web/src/llm-api/openai.ts b/web/src/llm-api/openai.ts
index 8f619e8357..960ef63c99 100644
--- a/web/src/llm-api/openai.ts
+++ b/web/src/llm-api/openai.ts
@@ -62,7 +62,7 @@ const OUTPUT_TOKEN_COSTS: Record<string, number> = {
 
 // Extended timeout for deep-thinking models (e.g., gpt-5.x) that can take
 // a long time to start streaming.
-const OPENAI_HEADERS_TIMEOUT_MS = 10 * 60 * 1000
+const OPENAI_HEADERS_TIMEOUT_MS = 30 * 60 * 1000
 
 const openaiAgent = new Agent({
   headersTimeout: OPENAI_HEADERS_TIMEOUT_MS,
diff --git a/web/src/llm-api/openrouter.ts b/web/src/llm-api/openrouter.ts
index a8528764fa..2762a60d8d 100644
--- a/web/src/llm-api/openrouter.ts
+++ b/web/src/llm-api/openrouter.ts
@@ -42,7 +42,7 @@ const GENERATION_LOOKUP_DELAY_MS = 500
 
 // Extended timeout for deep-thinking models (e.g., gpt-5) that can take
 // a long time to start streaming.
-const OPENROUTER_HEADERS_TIMEOUT_MS = 10 * 60 * 1000
+const OPENROUTER_HEADERS_TIMEOUT_MS = 30 * 60 * 1000
 
 const openrouterAgent = new Agent({
   headersTimeout: OPENROUTER_HEADERS_TIMEOUT_MS,
diff --git a/web/src/llm-api/siliconflow.ts b/web/src/llm-api/siliconflow.ts
index 6398fe184f..936c3f7b28 100644
--- a/web/src/llm-api/siliconflow.ts
+++ b/web/src/llm-api/siliconflow.ts
@@ -19,7 +19,7 @@ const SILICONFLOW_BASE_URL = 'https://api.siliconflow.com/v1'
 
 // Extended timeout for deep-thinking models that can take
 // a long time to start streaming.
-const SILICONFLOW_HEADERS_TIMEOUT_MS = 10 * 60 * 1000
+const SILICONFLOW_HEADERS_TIMEOUT_MS = 30 * 60 * 1000
 
 const siliconflowAgent = new Agent({
   headersTimeout: SILICONFLOW_HEADERS_TIMEOUT_MS,

From 35021d817ba366ce5f65e6644edfb68b7dcce9b6 Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Sun, 19 Apr 2026 09:45:53 -0700
Subject: [PATCH 2/4] Make prefill queue health more strict

---
 web/src/server/free-session/fireworks-health.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web/src/server/free-session/fireworks-health.ts b/web/src/server/free-session/fireworks-health.ts
index 73cec6cbb3..c102e721c0 100644
--- a/web/src/server/free-session/fireworks-health.ts
+++ b/web/src/server/free-session/fireworks-health.ts
@@ -18,7 +18,7 @@ export type FireworksHealth = 'healthy' | 'degraded' | 'unhealthy'
 /** Degrade once median prefill-queue latency crosses this bound. Strict by
  *  design — a 1s queue on top of ~1s prefill already means users feel 2s+
  *  before first token. */
-export const PREFILL_QUEUE_DEGRADED_MS = 200
+export const PREFILL_QUEUE_DEGRADED_MS = 125
 
 /** Leading indicator of load — responds instantly to memory pressure, while
  *  prefill-queue p50 is a lagging window statistic. Degrading here lets us

From 2bbd3b1bfe5da8b0c987b8e53a7514b0d36627fd Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Sun, 19 Apr 2026 09:48:13 -0700
Subject: [PATCH 3/4] Estimate waiting room wait as 1 minute per spot ahead

Decouples the user-facing wait estimate from the admission tick rate.
The estimate is now a rough one-minute-per-spot rule of thumb, which
reads more intuitively in the CLI than a tick-derived number that
shifts with deployment cadence.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/freebuff-waiting-room.md                 |  8 +++----
 .../session/__tests__/session.test.ts         |  1 -
 .../free-session/__tests__/public-api.test.ts |  2 --
 .../__tests__/session-view.test.ts            | 16 ++++++-------
 web/src/server/free-session/public-api.ts     |  4 ----
 web/src/server/free-session/session-view.ts   | 23 ++++++++-----------
 6 files changed, 20 insertions(+), 34 deletions(-)

diff --git a/docs/freebuff-waiting-room.md b/docs/freebuff-waiting-room.md
index 5dfe3d5a99..604046715e 100644
--- a/docs/freebuff-waiting-room.md
+++ b/docs/freebuff-waiting-room.md
@@ -246,16 +246,16 @@ This is a **trust-the-client** design: the server still admits requests during t
 
 ## Estimated Wait Time
 
-Computed in `session-view.ts` from the drip-admission rate:
+Computed in `session-view.ts` as a rough one-minute-per-spot-ahead estimate:
 
 ```
-waitMs = (position - 1) * admissionTickMs
+waitMs = (position - 1) * 60_000
 ```
 
 - Position 1 → 0 (next tick admits you)
-- Position 2 → one tick, and so on.
+- Position 2 → one minute, and so on.
 
-This estimate **ignores health-gated pauses**: during a Fireworks incident admission halts entirely, so the actual wait can be longer. We choose to under-report here because showing "unknown" / "indefinite" is worse UX for the common case where the deployment is healthy.
+This estimate is intentionally decoupled from the admission tick — it's a human-friendly rule of thumb for the UI, not a precise projection. The actual wait depends on admission-tick cadence and health-gated pauses (during a Fireworks incident, admission halts entirely), so the real wait can be longer or shorter.
 
 ## CLI Integration (frontend-side contract)
 
diff --git a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts
index d9cfb3ea48..83e0dc2995 100644
--- a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts
+++ b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts
@@ -33,7 +33,6 @@ function makeSessionDeps(overrides: Partial<SessionDeps> = {}): SessionDeps & {
   return {
     rows,
     isWaitingRoomEnabled: () => true,
-    admissionTickMs: 15_000,
     graceMs: 30 * 60 * 1000,
     now: () => now,
     getSessionRow: async (userId) => rows.get(userId) ?? null,
diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts
index 2e307d62c9..df34b75567 100644
--- a/web/src/server/free-session/__tests__/public-api.test.ts
+++ b/web/src/server/free-session/__tests__/public-api.test.ts
@@ -11,7 +11,6 @@ import type { SessionDeps } from '../public-api'
 import type { InternalSessionRow } from '../types'
 
 const SESSION_LEN = 60 * 60 * 1000
-const TICK_MS = 15_000
 const GRACE_MS = 30 * 60 * 1000
 
 function makeDeps(overrides: Partial<SessionDeps> = {}): SessionDeps & {
@@ -36,7 +35,6 @@ function makeDeps(overrides: Partial<SessionDeps> = {}): SessionDeps & {
     },
     _now: () => currentNow,
     isWaitingRoomEnabled: () => true,
-    admissionTickMs: TICK_MS,
     graceMs: GRACE_MS,
     now: () => currentNow,
     getSessionRow: async (userId) => rows.get(userId) ?? null,
diff --git a/web/src/server/free-session/__tests__/session-view.test.ts b/web/src/server/free-session/__tests__/session-view.test.ts
index 57d9d1e7d5..b3bdade6ab 100644
--- a/web/src/server/free-session/__tests__/session-view.test.ts
+++ b/web/src/server/free-session/__tests__/session-view.test.ts
@@ -4,7 +4,7 @@ import { estimateWaitMs, toSessionStateResponse } from '../session-view'
 
 import type { InternalSessionRow } from '../types'
 
-const TICK_MS = 15_000
+const WAIT_PER_SPOT_MS = 60_000
 const GRACE_MS = 30 * 60_000
 
 function row(overrides: Partial<InternalSessionRow> = {}): InternalSessionRow {
@@ -24,24 +24,22 @@ function row(overrides: Partial<InternalSessionRow> = {}): InternalSessionRow {
 
 describe('estimateWaitMs', () => {
   test('position 1 → 0 wait (next tick picks you up)', () => {
-    expect(estimateWaitMs({ position: 1, admissionTickMs: TICK_MS })).toBe(0)
+    expect(estimateWaitMs({ position: 1 })).toBe(0)
   })
 
-  test('position N → (N-1) ticks ahead', () => {
-    expect(estimateWaitMs({ position: 2, admissionTickMs: TICK_MS })).toBe(TICK_MS)
-    expect(estimateWaitMs({ position: 10, admissionTickMs: TICK_MS })).toBe(9 * TICK_MS)
+  test('position N → (N-1) minutes ahead', () => {
+    expect(estimateWaitMs({ position: 2 })).toBe(WAIT_PER_SPOT_MS)
+    expect(estimateWaitMs({ position: 10 })).toBe(9 * WAIT_PER_SPOT_MS)
   })
 
   test('degenerate inputs return 0', () => {
-    expect(estimateWaitMs({ position: 0, admissionTickMs: TICK_MS })).toBe(0)
-    expect(estimateWaitMs({ position: 5, admissionTickMs: 0 })).toBe(0)
+    expect(estimateWaitMs({ position: 0 })).toBe(0)
   })
 })
 
 describe('toSessionStateResponse', () => {
   const now = new Date('2026-04-17T12:00:00Z')
   const baseArgs = {
-    admissionTickMs: TICK_MS,
     graceMs: GRACE_MS,
   }
 
@@ -69,7 +67,7 @@ describe('toSessionStateResponse', () => {
       instanceId: 'inst-1',
       position: 3,
       queueDepth: 10,
-      estimatedWaitMs: 2 * TICK_MS,
+      estimatedWaitMs: 2 * WAIT_PER_SPOT_MS,
       queuedAt: now.toISOString(),
     })
   })
diff --git a/web/src/server/free-session/public-api.ts b/web/src/server/free-session/public-api.ts
index c3b09b3b0e..759a516d73 100644
--- a/web/src/server/free-session/public-api.ts
+++ b/web/src/server/free-session/public-api.ts
@@ -1,5 +1,4 @@
 import {
-  ADMISSION_TICK_MS,
   getSessionGraceMs,
   isWaitingRoomEnabled,
 } from './config'
@@ -25,7 +24,6 @@ export interface SessionDeps {
   /** Plain values, not getters: these never change at runtime. The deps
    *  interface uses values rather than thunks so tests can pass numbers
    *  inline without wrapping. */
-  admissionTickMs: number
   graceMs: number
   now?: () => Date
 }
@@ -37,7 +35,6 @@ const defaultDeps: SessionDeps = {
   queueDepth,
   queuePositionFor,
   isWaitingRoomEnabled,
-  admissionTickMs: ADMISSION_TICK_MS,
   get graceMs() {
     // Read-through getter so test overrides via env still work; the value
     // itself is materialized once per call. Cheaper than a thunk because
@@ -64,7 +61,6 @@ async function viewForRow(
     row,
     position,
     queueDepth: depth,
-    admissionTickMs: deps.admissionTickMs,
     graceMs: deps.graceMs,
     now: nowOf(deps),
   })
diff --git a/web/src/server/free-session/session-view.ts b/web/src/server/free-session/session-view.ts
index b154e177b3..7ce1f75fe7 100644
--- a/web/src/server/free-session/session-view.ts
+++ b/web/src/server/free-session/session-view.ts
@@ -13,11 +13,10 @@ export function toSessionStateResponse(params: {
   row: InternalSessionRow | null
   position: number
   queueDepth: number
-  admissionTickMs: number
   graceMs: number
   now: Date
 }): SessionStateResponse | null {
-  const { row, position, queueDepth, admissionTickMs, graceMs, now } = params
+  const { row, position, queueDepth, graceMs, now } = params
   if (!row) return null
 
   if (row.status === 'active' && row.expires_at) {
@@ -51,7 +50,7 @@ export function toSessionStateResponse(params: {
       instanceId: row.active_instance_id,
       position,
       queueDepth,
-      estimatedWaitMs: estimateWaitMs({ position, admissionTickMs }),
+      estimatedWaitMs: estimateWaitMs({ position }),
       queuedAt: row.queued_at.toISOString(),
     }
   }
@@ -60,18 +59,14 @@ export function toSessionStateResponse(params: {
   return null
 }
 
+const WAIT_MS_PER_SPOT_AHEAD = 60_000
+
 /**
- * Wait-time estimate under the drip-admission model: one user per
- * `admissionTickMs`, gated by Fireworks health. Ignoring health pauses, the
- * user at position P waits roughly `(P - 1) * admissionTickMs`.
- *
+ * Rough wait-time estimate shown to queued users: one minute per spot ahead.
  * Position 1 → 0ms (next tick picks you up).
  */
-export function estimateWaitMs(params: {
-  position: number
-  admissionTickMs: number
-}): number {
-  const { position, admissionTickMs } = params
-  if (position <= 1 || admissionTickMs <= 0) return 0
-  return (position - 1) * admissionTickMs
+export function estimateWaitMs(params: { position: number }): number {
+  const { position } = params
+  if (position <= 1) return 0
+  return (position - 1) * WAIT_MS_PER_SPOT_AHEAD
 }

From 12ed322b4a1d5f026cb01f5fab959fc8a4a45d09 Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Sun, 19 Apr 2026 09:52:47 -0700
Subject: [PATCH 4/4] Hide web referral banner

---
 web/src/app/layout.tsx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/web/src/app/layout.tsx b/web/src/app/layout.tsx
index 05c0ee71ae..eecfa69b85 100644
--- a/web/src/app/layout.tsx
+++ b/web/src/app/layout.tsx
@@ -8,7 +8,6 @@ import { LayoutWrapper } from '@/components/layout-wrapper'
 import { Navbar } from '@/components/navbar/navbar'
 import QueryProvider from '@/components/providers/query-client-provider'
 import { ThemeProvider } from '@/components/theme-provider'
-import { Banner } from '@/components/ui/banner'
 import { Toaster } from '@/components/ui/toaster'
 import { siteConfig } from '@/lib/constant'
 import { fonts } from '@/lib/fonts'
@@ -67,7 +66,6 @@ export default function RootLayout({
           <SessionProvider>
             <QueryProvider>
               <PostHogProvider>
-                <Banner />
                 <Navbar />
                 <div className="flex-grow">
                   <LayoutWrapper>{children}</LayoutWrapper>