Remove GLM from freebuff sessions (#737)

jahooma · web-flow · commit 250d4aaf669a · 2026-05-23T16:38:25.000-07:00
diff --git a/cli/src/hooks/use-freebuff-session.ts b/cli/src/hooks/use-freebuff-session.ts
@@ -514,9 +514,8 @@ export function useFreebuffSession(): UseFreebuffSessionResult {
           return
         }
         if (next.status === 'model_unavailable') {
-          // Server says the requested model isn't available right now (e.g.
-          // legacy GLM 5.1 outside deployment hours). Flip to the
-          // always-available fallback for this run. In-memory only —
+          // Server says the requested model isn't available right now. Flip
+          // to the always-available fallback for this run. In-memory only —
           // `setSelectedModel` doesn't persist, so the user's saved preference
           // is preserved for their next launch.
           useFreebuffModelStore
@@ -637,15 +636,15 @@ export function useFreebuffSession(): UseFreebuffSessionResult {
               if (response.status === 'none' || response.status === 'queued') {
                 apply({
                   status: 'none',
-                  accessTier:
-                    response.accessTier ?? landingSession.accessTier,
+                  accessTier: response.accessTier ?? landingSession.accessTier,
                   queueDepthByModel:
                     response.queueDepthByModel ??
                     landingSession.queueDepthByModel,
                   rateLimitsByModel:
                     response.rateLimitsByModel ??
                     landingSession.rateLimitsByModel,
-                  countryCode: response.countryCode ?? landingSession.countryCode,
+                  countryCode:
+                    response.countryCode ?? landingSession.countryCode,
                   countryBlockReason:
                     response.countryBlockReason ??
                     landingSession.countryBlockReason,
diff --git a/common/src/__tests__/freebuff-models.test.ts b/common/src/__tests__/freebuff-models.test.ts
@@ -5,7 +5,6 @@ import {
   DEFAULT_FREEBUFF_MODEL_ID,
   FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
   FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
-  FREEBUFF_GLM_MODEL_ID,
   FREEBUFF_KIMI_MODEL_ID,
   LIMITED_FREEBUFF_MODEL_ID,
   FREEBUFF_MINIMAX_MODEL_ID,
@@ -84,15 +83,14 @@ describe('freebuff model availability', () => {
     ).toBe(false)
   })
 
-  test('supports GLM 5.1 as a legacy server-side model without selecting it for new clients', () => {
-    expect(FREEBUFF_MODELS.map((model) => model.id)).not.toContain(
-      FREEBUFF_GLM_MODEL_ID,
+  test('does not support GLM 5.1 for freebuff sessions', () => {
+    const glm = 'z-ai/glm-5.1'
+    expect(FREEBUFF_MODELS.map((model) => model.id)).not.toContain(glm)
+    expect(SUPPORTED_FREEBUFF_MODELS.map((model) => model.id)).not.toContain(
+      glm,
     )
-    expect(SUPPORTED_FREEBUFF_MODELS.map((model) => model.id)).toContain(
-      FREEBUFF_GLM_MODEL_ID,
-    )
-    expect(isFreebuffModelId(FREEBUFF_GLM_MODEL_ID)).toBe(false)
-    expect(isSupportedFreebuffModelId(FREEBUFF_GLM_MODEL_ID)).toBe(true)
+    expect(isFreebuffModelId(glm)).toBe(false)
+    expect(isSupportedFreebuffModelId(glm)).toBe(false)
   })
 
   test('formats the close time in the user local timezone while deployment is open', () => {
diff --git a/common/src/constants/free-agents.ts b/common/src/constants/free-agents.ts
@@ -5,7 +5,6 @@ import {
   FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
   FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
   FREEBUFF_GEMINI_PRO_MODEL_ID,
-  FREEBUFF_GLM_MODEL_ID,
   FREEBUFF_KIMI_MODEL_ID,
   FREEBUFF_MINIMAX_MODEL_ID,
   SUPPORTED_FREEBUFF_MODELS,
@@ -68,7 +67,6 @@ export const FREE_MODE_AGENT_MODELS: Record<string, Set<string>> = {
   // Root orchestrator
   'base2-free': new Set([
     FREEBUFF_MINIMAX_MODEL_ID,
-    FREEBUFF_GLM_MODEL_ID,
     FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
     FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
     FREEBUFF_KIMI_MODEL_ID,
@@ -94,10 +92,7 @@ export const FREE_MODE_AGENT_MODELS: Record<string, Set<string>> = {
   'tmux-cli': new Set([FREEBUFF_MINIMAX_MODEL_ID]),
 
   // Code reviewer for free mode
-  'code-reviewer-minimax': new Set([
-    FREEBUFF_MINIMAX_MODEL_ID,
-    FREEBUFF_GLM_MODEL_ID,
-  ]),
+  'code-reviewer-minimax': new Set([FREEBUFF_MINIMAX_MODEL_ID]),
   'code-reviewer-kimi': new Set([FREEBUFF_KIMI_MODEL_ID]),
   'code-reviewer-deepseek': new Set([FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID]),
   'code-reviewer-deepseek-flash': new Set([
diff --git a/common/src/constants/freebuff-models.ts b/common/src/constants/freebuff-models.ts
@@ -35,7 +35,6 @@ export const FREEBUFF_DEPLOYMENT_HOURS_LABEL = '9am ET-5pm PT every day'
 export const FREEBUFF_GEMINI_PRO_MODEL_ID = 'google/gemini-3.1-pro-preview'
 export const FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID = 'deepseek/deepseek-v4-pro'
 export const FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID = 'deepseek/deepseek-v4-flash'
-export const FREEBUFF_GLM_MODEL_ID = 'z-ai/glm-5.1'
 export const FREEBUFF_KIMI_MODEL_ID = 'moonshotai/kimi-k2.6'
 export const FREEBUFF_MINIMAX_MODEL_ID = 'minimax/minimax-m2.7'
 export const FREEBUFF_PREMIUM_SESSION_LIMIT = 5
@@ -102,29 +101,15 @@ export const FREEBUFF_MODELS = [
   },
 ] as const satisfies readonly FreebuffModelOption[]
 
-export const LEGACY_FREEBUFF_MODELS = [
-  {
-    id: FREEBUFF_GLM_MODEL_ID,
-    displayName: 'GLM 5.1',
-    tagline: 'Legacy',
-    availability: 'deployment_hours',
-  },
-] as const satisfies readonly FreebuffModelOption[]
-
 export const FREEBUFF_PREMIUM_MODEL_IDS = [
   FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
   FREEBUFF_KIMI_MODEL_ID,
-  FREEBUFF_GLM_MODEL_ID,
 ] as const
 
-export const SUPPORTED_FREEBUFF_MODELS = [
-  ...FREEBUFF_MODELS,
-  ...LEGACY_FREEBUFF_MODELS,
-] as const satisfies readonly FreebuffModelOption[]
+export const SUPPORTED_FREEBUFF_MODELS = FREEBUFF_MODELS
 
 export type FreebuffModelId = (typeof FREEBUFF_MODELS)[number]['id']
-export type SupportedFreebuffModelId =
-  (typeof SUPPORTED_FREEBUFF_MODELS)[number]['id']
+export type SupportedFreebuffModelId = FreebuffModelId
 export type FreebuffPremiumModelId = (typeof FREEBUFF_PREMIUM_MODEL_IDS)[number]
 
 /** What new freebuff users see selected in the picker. MiniMax is the
diff --git a/docs/freebuff-waiting-room.md b/docs/freebuff-waiting-room.md
@@ -5,7 +5,7 @@
 The waiting room is the admission control layer for **free-mode** requests against the freebuff Fireworks deployments. It has three jobs:
 
 1. **Drip-admit users per model** — each selectable freebuff model has its own FIFO queue. Admission runs one tick (default `ADMISSION_TICK_MS`, 15s) that tries to admit one user per model, so heavier models can sit cold without starving lighter ones.
-2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; GLM 5.1 is available during 9am ET-5pm PT on weekdays, while MiniMax M2.7 is serverless and always available.
+2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; models without a dedicated deployment are treated as serverless and always available.
 3. **One instance per account** — prevent a single user from running N concurrent freebuff CLIs to get N× throughput.
 
 Users who cannot be admitted immediately are placed in the queue for their chosen model and given an estimated wait time. Admitted users get a fixed-length session (default 1h) bound to the model they were admitted on; chat completions use that model for the life of the session.
@@ -153,18 +153,18 @@ The final tick result carries a `queueDepthByModel` map and a single `skipped` r
 
 ### Tunables
 
-| Constant                     | Location                                  | Default                                                             | Purpose                                                                                                                                                                       |
-| ---------------------------- | ----------------------------------------- | ------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `ADMISSION_TICK_MS`          | `config.ts`                               | 15000                                                               | How often the ticker fires. Up to one user is admitted per model per tick.                                                                                                    |
-| `FREEBUFF_MODELS`            | `common/src/constants/freebuff-models.ts` | `deepseek-v4-pro`, `kimi-k2.6`, `minimax-m2.7`, `deepseek-v4-flash` | Selectable models; each gets its own queue and admission slot.                                                                                                                |
-| `FIREWORKS_DEPLOYMENT_MAP`   | `web/src/llm-api/fireworks-config.ts`     | `glm-5.1`                                                           | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. |
-| `HEALTH_CACHE_TTL_MS`        | `fireworks-health.ts`                     | 25000                                                               | Fleet probe cache TTL. Sits just under the Fireworks 30s exporter cadence and 6 req/min rate limit.                                                                           |
-| `FREEBUFF_SESSION_LENGTH_MS` | env                                       | 3_600_000                                                           | Session lifetime                                                                                                                                                              |
-| `SESSION_GRACE_MS`           | `web/src/server/free-session/config.ts`   | 1_800_000                                                           | Drain window after expiry — gate still admits requests so an in-flight agent can finish, but the CLI is expected to block new prompts. Hard cutoff at `expires_at + grace`.   |
+| Constant                     | Location                                  | Default                                                             | Purpose                                                                                                                                                                     |
+| ---------------------------- | ----------------------------------------- | ------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `ADMISSION_TICK_MS`          | `config.ts`                               | 15000                                                               | How often the ticker fires. Up to one user is admitted per model per tick.                                                                                                  |
+| `FREEBUFF_MODELS`            | `common/src/constants/freebuff-models.ts` | `deepseek-v4-pro`, `kimi-k2.6`, `minimax-m2.7`, `deepseek-v4-flash` | Selectable models; each gets its own queue and admission slot.                                                                                                              |
+| `FIREWORKS_DEPLOYMENT_MAP`   | `web/src/llm-api/fireworks-config.ts`     | none for current freebuff models                                    | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback).                                                              |
+| `HEALTH_CACHE_TTL_MS`        | `fireworks-health.ts`                     | 25000                                                               | Fleet probe cache TTL. Sits just under the Fireworks 30s exporter cadence and 6 req/min rate limit.                                                                         |
+| `FREEBUFF_SESSION_LENGTH_MS` | env                                       | 3_600_000                                                           | Session lifetime                                                                                                                                                            |
+| `SESSION_GRACE_MS`           | `web/src/server/free-session/config.ts`   | 1_800_000                                                           | Drain window after expiry — gate still admits requests so an in-flight agent can finish, but the CLI is expected to block new prompts. Hard cutoff at `expires_at + grace`. |
 
 ### Premium Session Quota
 
-DeepSeek V4 Pro, Kimi, and legacy GLM share a per-user premium quota. The server counts `free_session_admit` rows from the last midnight in `America/Los_Angeles`; when the user reaches `FREEBUFF_PREMIUM_SESSION_LIMIT`, the next premium `POST /session` is rejected until the next Pacific midnight reset. MiniMax and DeepSeek V4 Flash remain unlimited.
+DeepSeek V4 Pro and Kimi share a per-user premium quota. The server counts `free_session_admit` rows from the last midnight in `America/Los_Angeles`; when the user reaches `FREEBUFF_PREMIUM_SESSION_LIMIT`, the next premium `POST /session` is rejected until the next Pacific midnight reset. MiniMax and DeepSeek V4 Flash remain unlimited.
 
 ## HTTP API
 
@@ -198,7 +198,7 @@ Response shapes:
   "queueDepth": 43,        // size of this model's queue
   "queueDepthByModel": {   // snapshot of every model's queue — powers the
     "minimax/minimax-m2.7": 43, //  "N ahead" hint in the selector. Missing
-    "z-ai/glm-5.1": 4   //  entries should be treated as 0.
+    "deepseek/deepseek-v4-pro": 4 // entries should be treated as 0.
   },
   "estimatedWaitMs": 384000,
   "queuedAt": "2026-04-17T12:00:00Z"
@@ -298,7 +298,7 @@ waitMs = (position - 1) * 24_000
 - Position 1 → 0 (next tick admits you)
 - Position 2 → 24s, and so on.
 
-`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `z-ai/glm-5.1` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence, health-gated pauses, and deployment-hours availability (during a GLM Fireworks incident or outside 9am ET-5pm PT, only GLM's queue stalls; MiniMax keeps draining), so the real wait can be longer or shorter.
+`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `deepseek/deepseek-v4-pro` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence and health-gated pauses, so the real wait can be longer or shorter.
 
 ## CLI Integration (frontend-side contract)
 
@@ -337,7 +337,7 @@ The `disabled` response means the server has the waiting room turned off. CLI tr
 | Spamming POST/GET to starve admission tick                    | Admission uses per-model Postgres advisory locks; DDoS protection is upstream (Next's global rate limits). Consider adding a per-user limiter on `/session` if traffic warrants. |
 | Repeatedly POSTing different models to get across every queue | Single row per user (PK on `user_id`); switching models moves the row, never clones it. A user holds exactly one queue slot at any time.                                         |
 | Fireworks metrics endpoint down / slow                        | `getFleetHealth()` fails closed (timeout, non-OK, or missing API key) → every dedicated-deployment model is flagged `unhealthy` and its queue pauses.                            |
-| One deployment degraded while others are fine                 | Health is classified per-deployment; only the affected model's queue pauses, so a degraded GLM deployment doesn't block MiniMax admissions.                                      |
+| One deployment degraded while others are fine                 | Health is classified per-deployment; only the affected model's queue pauses, so a degraded dedicated deployment doesn't block admissions for other models.                       |
 | Zombie expired sessions holding capacity                      | Swept on every admission tick, even when upstream is unhealthy                                                                                                                   |
 
 ## Testing
diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
@@ -7,8 +7,6 @@ import {
   FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
   FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
   FREEBUFF_GEMINI_PRO_MODEL_ID,
-  FREEBUFF_GLM_MODEL_ID,
-  isFreebuffDeploymentHours,
 } from '@codebuff/common/constants/freebuff-models'
 import { openCodeZenModels } from '@codebuff/common/constants/model-config'
 import { postChatCompletions } from '../_post'
@@ -963,7 +961,7 @@ describe('/api/v1/chat/completions POST endpoint', () => {
     })
 
     it(
-      'lets old freebuff clients keep using GLM 5.1 through Fireworks availability rules',
+      'rejects removed GLM 5.1 for free mode before provider calls',
       async () => {
         const fetchedBodies: Record<string, unknown>[] = []
         const fetchViaFireworks = mock(
@@ -994,7 +992,7 @@ describe('/api/v1/chat/completions POST endpoint', () => {
             method: 'POST',
             headers: allowedFreeModeHeaders('test-api-key-new-free'),
             body: JSON.stringify({
-              model: FREEBUFF_GLM_MODEL_ID,
+              model: 'z-ai/glm-5.1',
               stream: false,
               codebuff_metadata: {
                 run_id: 'run-free',
@@ -1019,19 +1017,9 @@ describe('/api/v1/chat/completions POST endpoint', () => {
         })
 
         const body = await response.json()
-        if (isFreebuffDeploymentHours()) {
-          expect(response.status).toBe(200)
-          expect(fetchedBodies).toHaveLength(1)
-          expect(fetchedBodies[0].model).toBe(
-            'accounts/fireworks/models/glm-5p1',
-          )
-          expect(body.model).toBe(FREEBUFF_GLM_MODEL_ID)
-          expect(body.provider).toBe('Fireworks')
-        } else {
-          expect(response.status).toBe(503)
-          expect(fetchedBodies).toHaveLength(0)
-          expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS')
-        }
+        expect(response.status).toBe(403)
+        expect(fetchedBodies).toHaveLength(0)
+        expect(body.error).toBe('free_mode_invalid_agent_model')
       },
       FETCH_PATH_TEST_TIMEOUT_MS,
     )
diff --git a/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts b/web/src/app/api/v1/freebuff/session/__tests__/session.test.ts
@@ -380,17 +380,17 @@ describe('POST /api/v1/freebuff/session', () => {
     expect(body.ipPrivacySignals).toBeUndefined()
   })
 
-  test('returns model_unavailable for legacy GLM 5.1 outside deployment hours', async () => {
+  test('falls back for removed GLM 5.1 requests', async () => {
     const sessionDeps = makeSessionDeps()
     const resp = await postFreebuffSession(
       makeReq('ok', { model: 'z-ai/glm-5.1' }),
       makeDeps(sessionDeps, 'u1'),
     )
-    expect(resp.status).toBe(409)
+    expect(resp.status).toBe(200)
     const body = await resp.json()
-    expect(body.status).toBe('model_unavailable')
-    expect(body.availableHours).toBe('9am ET-5pm PT every day')
-    expect(sessionDeps.rows.size).toBe(0)
+    expect(body.status).toBe('queued')
+    expect(body.model).toBe('minimax/minimax-m2.7')
+    expect(sessionDeps.rows.get('u1')?.model).toBe('minimax/minimax-m2.7')
   })
 
   // Banned bots with valid API keys were POSTing every few seconds and
diff --git a/web/src/server/free-session/__tests__/admission.test.ts b/web/src/server/free-session/__tests__/admission.test.ts
diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts
diff --git a/web/src/server/free-session/config.ts b/web/src/server/free-session/config.ts