Skip to content

Commit 250d4aa

Browse files
authored
Remove GLM from freebuff sessions (#737)
1 parent b22d244 commit 250d4aa

10 files changed

Lines changed: 74 additions & 125 deletions

File tree

cli/src/hooks/use-freebuff-session.ts

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -514,9 +514,8 @@ export function useFreebuffSession(): UseFreebuffSessionResult {
514514
return
515515
}
516516
if (next.status === 'model_unavailable') {
517-
// Server says the requested model isn't available right now (e.g.
518-
// legacy GLM 5.1 outside deployment hours). Flip to the
519-
// always-available fallback for this run. In-memory only —
517+
// Server says the requested model isn't available right now. Flip
518+
// to the always-available fallback for this run. In-memory only —
520519
// `setSelectedModel` doesn't persist, so the user's saved preference
521520
// is preserved for their next launch.
522521
useFreebuffModelStore
@@ -637,15 +636,15 @@ export function useFreebuffSession(): UseFreebuffSessionResult {
637636
if (response.status === 'none' || response.status === 'queued') {
638637
apply({
639638
status: 'none',
640-
accessTier:
641-
response.accessTier ?? landingSession.accessTier,
639+
accessTier: response.accessTier ?? landingSession.accessTier,
642640
queueDepthByModel:
643641
response.queueDepthByModel ??
644642
landingSession.queueDepthByModel,
645643
rateLimitsByModel:
646644
response.rateLimitsByModel ??
647645
landingSession.rateLimitsByModel,
648-
countryCode: response.countryCode ?? landingSession.countryCode,
646+
countryCode:
647+
response.countryCode ?? landingSession.countryCode,
649648
countryBlockReason:
650649
response.countryBlockReason ??
651650
landingSession.countryBlockReason,

common/src/__tests__/freebuff-models.test.ts

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import {
55
DEFAULT_FREEBUFF_MODEL_ID,
66
FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
77
FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
8-
FREEBUFF_GLM_MODEL_ID,
98
FREEBUFF_KIMI_MODEL_ID,
109
LIMITED_FREEBUFF_MODEL_ID,
1110
FREEBUFF_MINIMAX_MODEL_ID,
@@ -84,15 +83,14 @@ describe('freebuff model availability', () => {
8483
).toBe(false)
8584
})
8685

87-
test('supports GLM 5.1 as a legacy server-side model without selecting it for new clients', () => {
88-
expect(FREEBUFF_MODELS.map((model) => model.id)).not.toContain(
89-
FREEBUFF_GLM_MODEL_ID,
86+
test('does not support GLM 5.1 for freebuff sessions', () => {
87+
const glm = 'z-ai/glm-5.1'
88+
expect(FREEBUFF_MODELS.map((model) => model.id)).not.toContain(glm)
89+
expect(SUPPORTED_FREEBUFF_MODELS.map((model) => model.id)).not.toContain(
90+
glm,
9091
)
91-
expect(SUPPORTED_FREEBUFF_MODELS.map((model) => model.id)).toContain(
92-
FREEBUFF_GLM_MODEL_ID,
93-
)
94-
expect(isFreebuffModelId(FREEBUFF_GLM_MODEL_ID)).toBe(false)
95-
expect(isSupportedFreebuffModelId(FREEBUFF_GLM_MODEL_ID)).toBe(true)
92+
expect(isFreebuffModelId(glm)).toBe(false)
93+
expect(isSupportedFreebuffModelId(glm)).toBe(false)
9694
})
9795

9896
test('formats the close time in the user local timezone while deployment is open', () => {

common/src/constants/free-agents.ts

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import {
55
FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
66
FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
77
FREEBUFF_GEMINI_PRO_MODEL_ID,
8-
FREEBUFF_GLM_MODEL_ID,
98
FREEBUFF_KIMI_MODEL_ID,
109
FREEBUFF_MINIMAX_MODEL_ID,
1110
SUPPORTED_FREEBUFF_MODELS,
@@ -68,7 +67,6 @@ export const FREE_MODE_AGENT_MODELS: Record<string, Set<string>> = {
6867
// Root orchestrator
6968
'base2-free': new Set([
7069
FREEBUFF_MINIMAX_MODEL_ID,
71-
FREEBUFF_GLM_MODEL_ID,
7270
FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
7371
FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
7472
FREEBUFF_KIMI_MODEL_ID,
@@ -94,10 +92,7 @@ export const FREE_MODE_AGENT_MODELS: Record<string, Set<string>> = {
9492
'tmux-cli': new Set([FREEBUFF_MINIMAX_MODEL_ID]),
9593

9694
// Code reviewer for free mode
97-
'code-reviewer-minimax': new Set([
98-
FREEBUFF_MINIMAX_MODEL_ID,
99-
FREEBUFF_GLM_MODEL_ID,
100-
]),
95+
'code-reviewer-minimax': new Set([FREEBUFF_MINIMAX_MODEL_ID]),
10196
'code-reviewer-kimi': new Set([FREEBUFF_KIMI_MODEL_ID]),
10297
'code-reviewer-deepseek': new Set([FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID]),
10398
'code-reviewer-deepseek-flash': new Set([

common/src/constants/freebuff-models.ts

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ export const FREEBUFF_DEPLOYMENT_HOURS_LABEL = '9am ET-5pm PT every day'
3535
export const FREEBUFF_GEMINI_PRO_MODEL_ID = 'google/gemini-3.1-pro-preview'
3636
export const FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID = 'deepseek/deepseek-v4-pro'
3737
export const FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID = 'deepseek/deepseek-v4-flash'
38-
export const FREEBUFF_GLM_MODEL_ID = 'z-ai/glm-5.1'
3938
export const FREEBUFF_KIMI_MODEL_ID = 'moonshotai/kimi-k2.6'
4039
export const FREEBUFF_MINIMAX_MODEL_ID = 'minimax/minimax-m2.7'
4140
export const FREEBUFF_PREMIUM_SESSION_LIMIT = 5
@@ -102,29 +101,15 @@ export const FREEBUFF_MODELS = [
102101
},
103102
] as const satisfies readonly FreebuffModelOption[]
104103

105-
export const LEGACY_FREEBUFF_MODELS = [
106-
{
107-
id: FREEBUFF_GLM_MODEL_ID,
108-
displayName: 'GLM 5.1',
109-
tagline: 'Legacy',
110-
availability: 'deployment_hours',
111-
},
112-
] as const satisfies readonly FreebuffModelOption[]
113-
114104
export const FREEBUFF_PREMIUM_MODEL_IDS = [
115105
FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
116106
FREEBUFF_KIMI_MODEL_ID,
117-
FREEBUFF_GLM_MODEL_ID,
118107
] as const
119108

120-
export const SUPPORTED_FREEBUFF_MODELS = [
121-
...FREEBUFF_MODELS,
122-
...LEGACY_FREEBUFF_MODELS,
123-
] as const satisfies readonly FreebuffModelOption[]
109+
export const SUPPORTED_FREEBUFF_MODELS = FREEBUFF_MODELS
124110

125111
export type FreebuffModelId = (typeof FREEBUFF_MODELS)[number]['id']
126-
export type SupportedFreebuffModelId =
127-
(typeof SUPPORTED_FREEBUFF_MODELS)[number]['id']
112+
export type SupportedFreebuffModelId = FreebuffModelId
128113
export type FreebuffPremiumModelId = (typeof FREEBUFF_PREMIUM_MODEL_IDS)[number]
129114

130115
/** What new freebuff users see selected in the picker. MiniMax is the

docs/freebuff-waiting-room.md

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
The waiting room is the admission control layer for **free-mode** requests against the freebuff Fireworks deployments. It has three jobs:
66

77
1. **Drip-admit users per model** — each selectable freebuff model has its own FIFO queue. Admission runs one tick (default `ADMISSION_TICK_MS`, 15s) that tries to admit one user per model, so heavier models can sit cold without starving lighter ones.
8-
2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; GLM 5.1 is available during 9am ET-5pm PT on weekdays, while MiniMax M2.7 is serverless and always available.
8+
2. **Gate on per-deployment health and hours** — a single fleet probe per tick (`getFleetHealth` in `web/src/server/free-session/fireworks-health.ts`) hits the Fireworks metrics endpoint and classifies each dedicated deployment as `healthy | degraded | unhealthy`. Only models whose deployment is `healthy` and currently available admit that tick; models without a dedicated deployment are treated as serverless and always available.
99
3. **One instance per account** — prevent a single user from running N concurrent freebuff CLIs to get N× throughput.
1010

1111
Users who cannot be admitted immediately are placed in the queue for their chosen model and given an estimated wait time. Admitted users get a fixed-length session (default 1h) bound to the model they were admitted on; chat completions use that model for the life of the session.
@@ -153,18 +153,18 @@ The final tick result carries a `queueDepthByModel` map and a single `skipped` r
153153

154154
### Tunables
155155

156-
| Constant | Location | Default | Purpose |
157-
| ---------------------------- | ----------------------------------------- | ------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
158-
| `ADMISSION_TICK_MS` | `config.ts` | 15000 | How often the ticker fires. Up to one user is admitted per model per tick. |
159-
| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `deepseek-v4-pro`, `kimi-k2.6`, `minimax-m2.7`, `deepseek-v4-flash` | Selectable models; each gets its own queue and admission slot. |
160-
| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | `glm-5.1` | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback) — drop this default when they migrate to their own deployments. |
161-
| `HEALTH_CACHE_TTL_MS` | `fireworks-health.ts` | 25000 | Fleet probe cache TTL. Sits just under the Fireworks 30s exporter cadence and 6 req/min rate limit. |
162-
| `FREEBUFF_SESSION_LENGTH_MS` | env | 3_600_000 | Session lifetime |
163-
| `SESSION_GRACE_MS` | `web/src/server/free-session/config.ts` | 1_800_000 | Drain window after expiry — gate still admits requests so an in-flight agent can finish, but the CLI is expected to block new prompts. Hard cutoff at `expires_at + grace`. |
156+
| Constant | Location | Default | Purpose |
157+
| ---------------------------- | ----------------------------------------- | ------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
158+
| `ADMISSION_TICK_MS` | `config.ts` | 15000 | How often the ticker fires. Up to one user is admitted per model per tick. |
159+
| `FREEBUFF_MODELS` | `common/src/constants/freebuff-models.ts` | `deepseek-v4-pro`, `kimi-k2.6`, `minimax-m2.7`, `deepseek-v4-flash` | Selectable models; each gets its own queue and admission slot. |
160+
| `FIREWORKS_DEPLOYMENT_MAP` | `web/src/llm-api/fireworks-config.ts` | none for current freebuff models | Models with dedicated Fireworks deployments. Models not listed are treated as `healthy` (serverless fallback). |
161+
| `HEALTH_CACHE_TTL_MS` | `fireworks-health.ts` | 25000 | Fleet probe cache TTL. Sits just under the Fireworks 30s exporter cadence and 6 req/min rate limit. |
162+
| `FREEBUFF_SESSION_LENGTH_MS` | env | 3_600_000 | Session lifetime |
163+
| `SESSION_GRACE_MS` | `web/src/server/free-session/config.ts` | 1_800_000 | Drain window after expiry — gate still admits requests so an in-flight agent can finish, but the CLI is expected to block new prompts. Hard cutoff at `expires_at + grace`. |
164164

165165
### Premium Session Quota
166166

167-
DeepSeek V4 Pro, Kimi, and legacy GLM share a per-user premium quota. The server counts `free_session_admit` rows from the last midnight in `America/Los_Angeles`; when the user reaches `FREEBUFF_PREMIUM_SESSION_LIMIT`, the next premium `POST /session` is rejected until the next Pacific midnight reset. MiniMax and DeepSeek V4 Flash remain unlimited.
167+
DeepSeek V4 Pro and Kimi share a per-user premium quota. The server counts `free_session_admit` rows from the last midnight in `America/Los_Angeles`; when the user reaches `FREEBUFF_PREMIUM_SESSION_LIMIT`, the next premium `POST /session` is rejected until the next Pacific midnight reset. MiniMax and DeepSeek V4 Flash remain unlimited.
168168

169169
## HTTP API
170170

@@ -198,7 +198,7 @@ Response shapes:
198198
"queueDepth": 43, // size of this model's queue
199199
"queueDepthByModel": { // snapshot of every model's queue — powers the
200200
"minimax/minimax-m2.7": 43, // "N ahead" hint in the selector. Missing
201-
"z-ai/glm-5.1": 4 // entries should be treated as 0.
201+
"deepseek/deepseek-v4-pro": 4 // entries should be treated as 0.
202202
},
203203
"estimatedWaitMs": 384000,
204204
"queuedAt": "2026-04-17T12:00:00Z"
@@ -298,7 +298,7 @@ waitMs = (position - 1) * 24_000
298298
- Position 1 → 0 (next tick admits you)
299299
- Position 2 → 24s, and so on.
300300

301-
`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `z-ai/glm-5.1` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence, health-gated pauses, and deployment-hours availability (during a GLM Fireworks incident or outside 9am ET-5pm PT, only GLM's queue stalls; MiniMax keeps draining), so the real wait can be longer or shorter.
301+
`position` is scoped to this model's queue — a user at position 1 in the `minimax/minimax-m2.7` queue is not affected by the depth of the `deepseek/deepseek-v4-pro` queue. The estimate is intentionally decoupled from the admission tick — it's a human-friendly rule-of-thumb for the UI, not a precise projection. Actual wait depends on admission-tick cadence and health-gated pauses, so the real wait can be longer or shorter.
302302

303303
## CLI Integration (frontend-side contract)
304304

@@ -337,7 +337,7 @@ The `disabled` response means the server has the waiting room turned off. CLI tr
337337
| Spamming POST/GET to starve admission tick | Admission uses per-model Postgres advisory locks; DDoS protection is upstream (Next's global rate limits). Consider adding a per-user limiter on `/session` if traffic warrants. |
338338
| Repeatedly POSTing different models to get across every queue | Single row per user (PK on `user_id`); switching models moves the row, never clones it. A user holds exactly one queue slot at any time. |
339339
| Fireworks metrics endpoint down / slow | `getFleetHealth()` fails closed (timeout, non-OK, or missing API key) → every dedicated-deployment model is flagged `unhealthy` and its queue pauses. |
340-
| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded GLM deployment doesn't block MiniMax admissions. |
340+
| One deployment degraded while others are fine | Health is classified per-deployment; only the affected model's queue pauses, so a degraded dedicated deployment doesn't block admissions for any other model, whether dedicated or serverless. |
341341
| Zombie expired sessions holding capacity | Swept on every admission tick, even when upstream is unhealthy |
342342

343343
## Testing

web/src/app/api/v1/chat/completions/__tests__/completions.test.ts

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ import {
77
FREEBUFF_DEEPSEEK_V4_FLASH_MODEL_ID,
88
FREEBUFF_DEEPSEEK_V4_PRO_MODEL_ID,
99
FREEBUFF_GEMINI_PRO_MODEL_ID,
10-
FREEBUFF_GLM_MODEL_ID,
11-
isFreebuffDeploymentHours,
1210
} from '@codebuff/common/constants/freebuff-models'
1311
import { openCodeZenModels } from '@codebuff/common/constants/model-config'
1412
import { postChatCompletions } from '../_post'
@@ -963,7 +961,7 @@ describe('/api/v1/chat/completions POST endpoint', () => {
963961
})
964962

965963
it(
966-
'lets old freebuff clients keep using GLM 5.1 through Fireworks availability rules',
964+
'rejects removed GLM 5.1 for free mode before provider calls',
967965
async () => {
968966
const fetchedBodies: Record<string, unknown>[] = []
969967
const fetchViaFireworks = mock(
@@ -994,7 +992,7 @@ describe('/api/v1/chat/completions POST endpoint', () => {
994992
method: 'POST',
995993
headers: allowedFreeModeHeaders('test-api-key-new-free'),
996994
body: JSON.stringify({
997-
model: FREEBUFF_GLM_MODEL_ID,
995+
model: 'z-ai/glm-5.1',
998996
stream: false,
999997
codebuff_metadata: {
1000998
run_id: 'run-free',
@@ -1019,19 +1017,9 @@ describe('/api/v1/chat/completions POST endpoint', () => {
10191017
})
10201018

10211019
const body = await response.json()
1022-
if (isFreebuffDeploymentHours()) {
1023-
expect(response.status).toBe(200)
1024-
expect(fetchedBodies).toHaveLength(1)
1025-
expect(fetchedBodies[0].model).toBe(
1026-
'accounts/fireworks/models/glm-5p1',
1027-
)
1028-
expect(body.model).toBe(FREEBUFF_GLM_MODEL_ID)
1029-
expect(body.provider).toBe('Fireworks')
1030-
} else {
1031-
expect(response.status).toBe(503)
1032-
expect(fetchedBodies).toHaveLength(0)
1033-
expect(body.error.code).toBe('DEPLOYMENT_OUTSIDE_HOURS')
1034-
}
1020+
expect(response.status).toBe(403)
1021+
expect(fetchedBodies).toHaveLength(0)
1022+
expect(body.error).toBe('free_mode_invalid_agent_model')
10351023
},
10361024
FETCH_PATH_TEST_TIMEOUT_MS,
10371025
)

web/src/app/api/v1/freebuff/session/__tests__/session.test.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -380,17 +380,17 @@ describe('POST /api/v1/freebuff/session', () => {
380380
expect(body.ipPrivacySignals).toBeUndefined()
381381
})
382382

383-
test('returns model_unavailable for legacy GLM 5.1 outside deployment hours', async () => {
383+
test('falls back for removed GLM 5.1 requests', async () => {
384384
const sessionDeps = makeSessionDeps()
385385
const resp = await postFreebuffSession(
386386
makeReq('ok', { model: 'z-ai/glm-5.1' }),
387387
makeDeps(sessionDeps, 'u1'),
388388
)
389-
expect(resp.status).toBe(409)
389+
expect(resp.status).toBe(200)
390390
const body = await resp.json()
391-
expect(body.status).toBe('model_unavailable')
392-
expect(body.availableHours).toBe('9am ET-5pm PT every day')
393-
expect(sessionDeps.rows.size).toBe(0)
391+
expect(body.status).toBe('queued')
392+
expect(body.model).toBe('minimax/minimax-m2.7')
393+
expect(sessionDeps.rows.get('u1')?.model).toBe('minimax/minimax-m2.7')
394394
})
395395

396396
// Banned bots with valid API keys were POSTing every few seconds and

0 commit comments

Comments
 (0)