diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 000000000..9b82e92e3
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,5 @@
+{
+ "permissions": {
+ "defaultMode": "auto"
+ }
+}
diff --git a/cli/src/app.tsx b/cli/src/app.tsx
index 5c93cd8f6..616e7b890 100644
--- a/cli/src/app.tsx
+++ b/cli/src/app.tsx
@@ -285,17 +285,6 @@ export const App = ({
)
}
- // Render chat history screen when requested
- if (showChatHistory) {
- return (
-
- )
- }
-
// Use key to force remount when resuming a different chat from history
const chatKey = resumeChatId ?? 'current'
@@ -316,6 +305,10 @@ export const App = ({
initialMode={initialMode}
gitRoot={gitRoot}
onSwitchToGitRoot={handleSwitchToGitRoot}
+ showChatHistory={showChatHistory}
+ onSelectChat={handleResumeChat}
+ onCancelChatHistory={closeChatHistory}
+ onNewChat={handleNewChat}
/>
)
}
@@ -336,6 +329,10 @@ interface AuthedSurfaceProps {
initialMode: AgentMode | undefined
gitRoot: string | null | undefined
onSwitchToGitRoot: () => void
+ showChatHistory: boolean
+ onSelectChat: (chatId: string) => void
+ onCancelChatHistory: () => void
+ onNewChat: () => void
}
/**
@@ -359,6 +356,10 @@ const AuthedSurface = ({
initialMode,
gitRoot,
onSwitchToGitRoot,
+ showChatHistory,
+ onSelectChat,
+ onCancelChatHistory,
+ onNewChat,
}: AuthedSurfaceProps) => {
const { session, error: sessionError } = useFreebuffSession()
@@ -388,6 +389,20 @@ const AuthedSurface = ({
return
}
+ // Chat history renders inside AuthedSurface so the freebuff session stays
+ // mounted while the user browses history. Unmounting this surface would
+ // DELETE the session row and drop the user back into the waiting room on
+ // return.
+ if (showChatHistory) {
+ return (
+
+ )
+ }
+
return (
- ) : isFreebuffSessionOver ? (
+ ) : isFreebuffSessionOver && !askUserState ? (
diff --git a/cli/src/hooks/helpers/send-message.ts b/cli/src/hooks/helpers/send-message.ts
index 01f6880b6..02e419b30 100644
--- a/cli/src/hooks/helpers/send-message.ts
+++ b/cli/src/hooks/helpers/send-message.ts
@@ -510,10 +510,16 @@ function handleFreebuffGateError(
switch (kind) {
case 'session_expired':
case 'waiting_room_required':
- // Our seat is gone mid-chat. Flip to `ended` instead of auto re-queuing:
- // the Chat surface stays mounted so any in-flight agent work can finish
- // under the server-side grace period, and the session-ended banner
- // prompts the user to press Enter when they're ready to rejoin.
+ // Our seat is gone mid-chat. Finalize the AI message so its streaming
+ // indicator stops — otherwise `isComplete` stays false and the message
+ // keeps rendering a blinking cursor forever, making the user think the
+ // agent is still working even though the SessionEndedBanner is visible
+ // and actionable. Also disposes the batched-updater flush interval.
+ updater.markComplete()
+ // Flip to `ended` instead of auto re-queuing: the Chat surface stays
+ // mounted so any in-flight agent work can finish under the server-side
+ // grace period, and the session-ended banner prompts the user to press
+ // Enter when they're ready to rejoin.
markFreebuffSessionEnded()
return
case 'waiting_room_queued':
diff --git a/docs/error-schema.md b/docs/error-schema.md
index 6f7e2e177..56a735654 100644
--- a/docs/error-schema.md
+++ b/docs/error-schema.md
@@ -34,7 +34,7 @@ Used for errors that the client needs to identify programmatically:
| Status | `error` code | Example `message` |
|--------|-------------|-------------------|
-| 403 | `account_suspended` | `"Your account has been suspended due to billing issues. Please contact support@codebuff.com to resolve this."` |
+| 403 | `account_suspended` | `"Your account has been suspended. Please contact support@codebuff.com if you did not expect this."` |
| 403 | `free_mode_unavailable` | `"Free mode is not available in your country."` (Freebuff: `"Freebuff is not available in your country."`) |
| 429 | `rate_limit_exceeded` | `"Subscription weekly limit reached. Your limit resets in 2 hours. Enable 'Continue with credits' in the CLI to use a-la-carte credits."` |
diff --git a/packages/agent-runtime/src/__tests__/main-prompt.test.ts b/packages/agent-runtime/src/__tests__/main-prompt.test.ts
index 17b4f99e1..f68e13147 100644
--- a/packages/agent-runtime/src/__tests__/main-prompt.test.ts
+++ b/packages/agent-runtime/src/__tests__/main-prompt.test.ts
@@ -375,6 +375,7 @@ describe('mainPrompt', () => {
it('should update consecutiveAssistantMessages when new prompt is received', async () => {
const sessionState = getInitialSessionState(mockFileContext)
sessionState.mainAgentState.stepsRemaining = 12
+ const initialStepsRemaining = sessionState.mainAgentState.stepsRemaining
const action = {
type: 'prompt' as const,
@@ -394,7 +395,7 @@ describe('mainPrompt', () => {
// When there's a new prompt, consecutiveAssistantMessages should be set to 1
expect(newSessionState.mainAgentState.stepsRemaining).toBe(
- sessionState.mainAgentState.stepsRemaining - 1,
+ initialStepsRemaining - 1,
)
})
diff --git a/packages/agent-runtime/src/run-agent-step.ts b/packages/agent-runtime/src/run-agent-step.ts
index 704cedf3a..4b8267033 100644
--- a/packages/agent-runtime/src/run-agent-step.ts
+++ b/packages/agent-runtime/src/run-agent-step.ts
@@ -536,6 +536,17 @@ export const runAgentStep = async (
}
}
+/**
+ * Runs the agent loop.
+ *
+ * IMPORTANT: This function mutates `params.agentState` in place throughout the
+ * run (not just at return time). Fields like `messageHistory`, `systemPrompt`,
+ * `toolDefinitions`, `creditsUsed`, and `output` are updated as work progresses
+ * so that callers holding a reference to the same object (e.g. the SDK's
+ * `sessionState.mainAgentState`) see in-progress work immediately — which
+ * matters when an error is thrown mid-run and the normal return path is
+ * skipped.
+ */
export async function loopAgentSteps(
params: {
addAgentStep: AddAgentStepFn
@@ -800,12 +811,13 @@ export async function loopAgentSteps(
return cachedAdditionalToolDefinitions
}
- let currentAgentState: AgentState = {
- ...initialAgentState,
- messageHistory: initialMessages,
- systemPrompt: system,
- toolDefinitions,
- }
+ // Mutate initialAgentState so that in-progress work propagates back to the
+ // caller's shared reference (e.g. SDK's sessionState.mainAgentState) even if
+ // an error is thrown before we return.
+ initialAgentState.messageHistory = initialMessages
+ initialAgentState.systemPrompt = system
+ initialAgentState.toolDefinitions = toolDefinitions
+ let currentAgentState: AgentState = initialAgentState
// Convert tool definitions to Anthropic format for accurate token counting
// Tool definitions are stored as { [name]: { description, inputSchema } }
@@ -908,7 +920,8 @@ export async function loopAgentSteps(
} = programmaticResult
n = generateN
- currentAgentState = programmaticAgentState
+ Object.assign(initialAgentState, programmaticAgentState)
+ currentAgentState = initialAgentState
totalSteps = stepNumber
shouldEndTurn = endTurn
@@ -989,7 +1002,8 @@ export async function loopAgentSteps(
logger.error('No runId found for agent state after finishing agent run')
}
- currentAgentState = newAgentState
+ Object.assign(initialAgentState, newAgentState)
+ currentAgentState = initialAgentState
shouldEndTurn = llmShouldEndTurn
nResponses = generatedResponses
diff --git a/sdk/src/__tests__/run-error-preserves-history.test.ts b/sdk/src/__tests__/run-error-preserves-history.test.ts
new file mode 100644
index 000000000..95b72ead2
--- /dev/null
+++ b/sdk/src/__tests__/run-error-preserves-history.test.ts
@@ -0,0 +1,315 @@
+import * as mainPromptModule from '@codebuff/agent-runtime/main-prompt'
+import { getInitialSessionState } from '@codebuff/common/types/session-state'
+import { getStubProjectFileContext } from '@codebuff/common/util/file'
+import { assistantMessage, userMessage } from '@codebuff/common/util/messages'
+import { afterEach, describe, expect, it, mock, spyOn } from 'bun:test'
+
+import { CodebuffClient } from '../client'
+import * as databaseModule from '../impl/database'
+
+interface ToolCallContentBlock {
+ type: 'tool-call'
+ toolCallId: string
+ toolName: string
+ input: Record<string, unknown>
+}
+
+const setupDatabaseMocks = () => {
+ spyOn(databaseModule, 'getUserInfoFromApiKey').mockResolvedValue({
+ id: 'user-123',
+ email: 'test@example.com',
+ discord_id: null,
+ referral_code: null,
+ stripe_customer_id: null,
+ banned: false,
+ created_at: new Date('2024-01-01T00:00:00Z'),
+ })
+ spyOn(databaseModule, 'fetchAgentFromDatabase').mockResolvedValue(null)
+ spyOn(databaseModule, 'startAgentRun').mockResolvedValue('run-1')
+ spyOn(databaseModule, 'finishAgentRun').mockResolvedValue(undefined)
+ spyOn(databaseModule, 'addAgentStep').mockResolvedValue('step-1')
+}
+
+describe('Error preserves in-progress message history', () => {
+ afterEach(() => {
+ mock.restore()
+ })
+
+ it('preserves in-progress assistant work on error (simulated via shared state mutation)', async () => {
+ setupDatabaseMocks()
+
+ // Simulate the agent runtime:
+ // 1. Mutates the shared session state with the user message and partial work
+ // 2. Then throws due to a downstream timeout/service error
+ spyOn(mainPromptModule, 'callMainPrompt').mockImplementation(
+ async (params: Parameters[0]) => {
+ const mainAgentState = params.action.sessionState.mainAgentState
+
+ // Match the real runtime's behavior: replace messageHistory with a new
+ // array that includes the user prompt as its first entry. The SDK
+ // detects runtime progress via reference inequality, so we must
+ // reassign the array rather than pushing into it.
+ mainAgentState.messageHistory = [
+ ...mainAgentState.messageHistory,
+ {
+ role: 'user',
+ content: [{ type: 'text', text: 'Fix the bug in auth.ts' }],
+ tags: ['USER_PROMPT'],
+ },
+ {
+ role: 'assistant',
+ content: [
+ { type: 'text', text: 'Let me read the auth file first.' },
+ {
+ type: 'tool-call',
+ toolCallId: 'read-1',
+ toolName: 'read_files',
+ input: { paths: ['auth.ts'] },
+ } as ToolCallContentBlock,
+ ],
+ },
+ {
+ role: 'tool',
+ toolCallId: 'read-1',
+ toolName: 'read_files',
+ content: [
+ {
+ type: 'json',
+ value: [{ path: 'auth.ts', content: 'const auth = ...' }],
+ },
+ ],
+ },
+ {
+ role: 'assistant',
+ content: [
+ { type: 'text', text: 'Found the issue, writing the fix now.' },
+ {
+ type: 'tool-call',
+ toolCallId: 'write-1',
+ toolName: 'write_file',
+ input: { path: 'auth.ts', content: 'const auth = fixed' },
+ } as ToolCallContentBlock,
+ ],
+ },
+ {
+ role: 'tool',
+ toolCallId: 'write-1',
+ toolName: 'write_file',
+ content: [{ type: 'json', value: { file: 'auth.ts', message: 'File written' } }],
+ },
+ ]
+
+ // Now simulate a server timeout on the next LLM call
+ const timeoutError = new Error('Service Unavailable') as Error & {
+ statusCode: number
+ responseBody: string
+ }
+ timeoutError.statusCode = 503
+ timeoutError.responseBody = JSON.stringify({
+ message: 'Request timeout after 30s',
+ })
+ throw timeoutError
+ },
+ )
+
+ const client = new CodebuffClient({ apiKey: 'test-key' })
+ const result = await client.run({
+ agent: 'base2',
+ prompt: 'Fix the bug in auth.ts',
+ })
+
+ // Error output with correct status code
+ expect(result.output.type).toBe('error')
+ const errorOutput = result.output as {
+ type: 'error'
+ message: string
+ statusCode?: number
+ }
+ expect(errorOutput.statusCode).toBe(503)
+
+ const history = result.sessionState!.mainAgentState.messageHistory
+
+ // The user's prompt should appear exactly once
+ const userPromptMessages = history.filter(
+ (m) =>
+ m.role === 'user' &&
+ (m.content as Array<{ type: string; text?: string }>).some(
+ (c) => c.type === 'text' && c.text?.includes('Fix the bug'),
+ ),
+ )
+ expect(userPromptMessages.length).toBe(1)
+
+ // Assistant text messages from both steps should be preserved
+ const firstAssistantText = history.find(
+ (m) =>
+ m.role === 'assistant' &&
+ (m.content as Array<{ type: string; text?: string }>).some(
+ (c) => c.type === 'text' && c.text?.includes('read the auth file'),
+ ),
+ )
+ expect(firstAssistantText).toBeDefined()
+
+ const secondAssistantText = history.find(
+ (m) =>
+ m.role === 'assistant' &&
+ (m.content as Array<{ type: string; text?: string }>).some(
+ (c) => c.type === 'text' && c.text?.includes('writing the fix'),
+ ),
+ )
+ expect(secondAssistantText).toBeDefined()
+
+ // Both tool calls and both tool results should be preserved
+ const readToolCall = history.find(
+ (m) =>
+ m.role === 'assistant' &&
+ (m.content as Array<{ type: string; toolCallId?: string }>).some(
+ (c) => c.type === 'tool-call' && c.toolCallId === 'read-1',
+ ),
+ )
+ expect(readToolCall).toBeDefined()
+
+ const writeToolCall = history.find(
+ (m) =>
+ m.role === 'assistant' &&
+ (m.content as Array<{ type: string; toolCallId?: string }>).some(
+ (c) => c.type === 'tool-call' && c.toolCallId === 'write-1',
+ ),
+ )
+ expect(writeToolCall).toBeDefined()
+
+ const readToolResult = history.find(
+ (m) => m.role === 'tool' && m.toolCallId === 'read-1',
+ )
+ expect(readToolResult).toBeDefined()
+
+ const writeToolResult = history.find(
+ (m) => m.role === 'tool' && m.toolCallId === 'write-1',
+ )
+ expect(writeToolResult).toBeDefined()
+ })
+
+ it('a subsequent run after error includes the preserved in-progress history', async () => {
+ setupDatabaseMocks()
+
+ // Run 1: agent does some work then hits an error
+ spyOn(mainPromptModule, 'callMainPrompt').mockImplementation(
+ async (params: Parameters[0]) => {
+ const mainAgentState = params.action.sessionState.mainAgentState
+
+ mainAgentState.messageHistory = [
+ ...mainAgentState.messageHistory,
+ {
+ role: 'user',
+ content: [{ type: 'text', text: 'Investigate the login bug' }],
+ tags: ['USER_PROMPT'],
+ },
+ assistantMessage('I found the problem in auth.ts on line 42.'),
+ {
+ role: 'assistant',
+ content: [
+ {
+ type: 'tool-call',
+ toolCallId: 'read-login',
+ toolName: 'read_files',
+ input: { paths: ['login.ts'] },
+ } as ToolCallContentBlock,
+ ],
+ },
+ {
+ role: 'tool',
+ toolCallId: 'read-login',
+ toolName: 'read_files',
+ content: [{ type: 'json', value: [{ path: 'login.ts', content: 'login code' }] }],
+ },
+ ]
+
+ const error = new Error('Service Unavailable') as Error & {
+ statusCode: number
+ }
+ error.statusCode = 503
+ throw error
+ },
+ )
+
+ const client = new CodebuffClient({ apiKey: 'test-key' })
+ const firstResult = await client.run({
+ agent: 'base2',
+ prompt: 'Investigate the login bug',
+ })
+
+ expect(firstResult.output.type).toBe('error')
+
+ // Run 2: use the failed run as previousRun
+ mock.restore()
+ setupDatabaseMocks()
+
+ let historyReceivedByRuntime: unknown[] | undefined
+ spyOn(mainPromptModule, 'callMainPrompt').mockImplementation(
+ async (params: Parameters<typeof mainPromptModule.callMainPrompt>[0]) => {
+ const { sendAction, promptId } = params
+ historyReceivedByRuntime = [
+ ...params.action.sessionState.mainAgentState.messageHistory,
+ ]
+
+ const responseSessionState = getInitialSessionState(
+ getStubProjectFileContext(),
+ )
+ responseSessionState.mainAgentState.messageHistory = [
+ ...params.action.sessionState.mainAgentState.messageHistory,
+ userMessage('Now try again'),
+ assistantMessage('Continuing with the fix.'),
+ ]
+
+ await sendAction({
+ action: {
+ type: 'prompt-response',
+ promptId,
+ sessionState: responseSessionState,
+ output: { type: 'lastMessage', value: [] },
+ },
+ })
+
+ return {
+ sessionState: responseSessionState,
+ output: { type: 'lastMessage' as const, value: [] },
+ }
+ },
+ )
+
+ const secondResult = await client.run({
+ agent: 'base2',
+ prompt: 'Now try again',
+ previousRun: firstResult,
+ })
+
+ // The runtime should have received history containing the work from the first run
+ expect(historyReceivedByRuntime).toBeDefined()
+ const receivedReadCall = historyReceivedByRuntime!.find(
+ (m) =>
+ (m as { role: string }).role === 'assistant' &&
+ ((m as { content: Array<{ type: string; toolCallId?: string }> })
+ .content ?? []).some(
+ (c) => c.type === 'tool-call' && c.toolCallId === 'read-login',
+ ),
+ )
+ expect(receivedReadCall).toBeDefined()
+
+ const receivedToolResult = historyReceivedByRuntime!.find(
+ (m) =>
+ (m as { role: string }).role === 'tool' &&
+ (m as { toolCallId: string }).toolCallId === 'read-login',
+ )
+ expect(receivedToolResult).toBeDefined()
+
+ // Final result should preserve history
+ const finalHistory = secondResult.sessionState!.mainAgentState.messageHistory
+ const finalReadCall = finalHistory.find(
+ (m) =>
+ m.role === 'assistant' &&
+ (m.content as Array<{ type: string; toolCallId?: string }>).some(
+ (c) => c.type === 'tool-call' && c.toolCallId === 'read-login',
+ ),
+ )
+ expect(finalReadCall).toBeDefined()
+ })
+})
diff --git a/sdk/src/run.ts b/sdk/src/run.ts
index 5a18f7025..2dfcef553 100644
--- a/sdk/src/run.ts
+++ b/sdk/src/run.ts
@@ -282,16 +282,27 @@ async function runOnce({
}
}
+ // The agent runtime mutates sessionState.mainAgentState as it progresses,
+ // replacing messageHistory with a new array once it adds the user prompt.
+ // Comparing array identity detects progress more robustly than length:
+ // context pruning could shrink history below its starting length without
+ // meaning the runtime never ran.
+ const initialMessageHistory = sessionState.mainAgentState.messageHistory
+
/** Calculates the current session state if cancelled.
*
- * This is used when callMainPrompt throws an error (the server never processed the request).
- * We need to add the user's message here since the server didn't get a chance to add it.
+ * This is used when callMainPrompt throws an error. If the agent runtime made
+ * any progress (replaced the shared messageHistory), those messages are
+ * preserved. Otherwise the user's message is added so it isn't lost.
*/
function getCancelledSessionState(message: string): SessionState {
+ const runtimeMadeProgress =
+ sessionState.mainAgentState.messageHistory !== initialMessageHistory
+
const state = cloneDeep(sessionState)
- // Add the user's message since the server never processed it
- if (prompt || preparedContent) {
+ // Only add the user's message if the runtime didn't get a chance to add it.
+ if (!runtimeMadeProgress && (prompt || preparedContent)) {
state.mainAgentState.messageHistory.push({
role: 'user' as const,
content: buildUserMessageContent(prompt, params, preparedContent),
diff --git a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
index 2c6d5bb27..43b431f29 100644
--- a/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
+++ b/web/src/app/api/v1/chat/completions/__tests__/completions.test.ts
@@ -412,8 +412,8 @@ describe('/api/v1/chat/completions POST endpoint', () => {
expect(response.status).toBe(403)
const body = await response.json()
expect(body.error).toBe('account_suspended')
- expect(body.message).toContain('Your account has been suspended due to billing issues')
- expect(body.message).toContain('to resolve this')
+ expect(body.message).toContain('Your account has been suspended')
+ expect(body.message).toContain('if you did not expect this')
})
})
diff --git a/web/src/app/api/v1/chat/completions/_post.ts b/web/src/app/api/v1/chat/completions/_post.ts
index 85e10437a..c9b616846 100644
--- a/web/src/app/api/v1/chat/completions/_post.ts
+++ b/web/src/app/api/v1/chat/completions/_post.ts
@@ -260,7 +260,7 @@ export async function postChatCompletions(params: {
return NextResponse.json(
{
error: 'account_suspended',
- message: `Your account has been suspended due to billing issues. Please contact ${env.NEXT_PUBLIC_SUPPORT_EMAIL} to resolve this.`,
+ message: `Your account has been suspended. Please contact ${env.NEXT_PUBLIC_SUPPORT_EMAIL} if you did not expect this.`,
},
{ status: 403 },
)
@@ -413,7 +413,11 @@ export async function postChatCompletions(params: {
if (isFreeModeRequest) {
const claimedInstanceId =
typedBody.codebuff_metadata?.freebuff_instance_id
- const gate = await checkSession({ userId, claimedInstanceId })
+ const gate = await checkSession({
+ userId,
+ userEmail: userInfo.email,
+ claimedInstanceId,
+ })
if (!gate.ok) {
trackEvent({
event: AnalyticsEvent.CHAT_COMPLETIONS_VALIDATION_ERROR,
@@ -468,19 +472,19 @@ export async function postChatCompletions(params: {
if (ensureSubscriberBlockGrant) {
try {
const blockGrantResult = await ensureSubscriberBlockGrant({ userId, logger })
-
+
// Check if user hit subscription limit and should be rate-limited
if (blockGrantResult && (isWeeklyLimitError(blockGrantResult) || isBlockExhaustedError(blockGrantResult))) {
// Fetch user's preference for falling back to a-la-carte credits
const preferences = getUserPreferences
? await getUserPreferences({ userId, logger })
: { fallbackToALaCarte: true } // Default to allowing a-la-carte if no preference function
-
+
if (!preferences.fallbackToALaCarte && !isFreeModeRequest) {
const resetTime = blockGrantResult.resetsAt
const resetCountdown = formatQuotaResetCountdown(resetTime.toISOString())
const limitType = isWeeklyLimitError(blockGrantResult) ? 'weekly' : '5-hour session'
-
+
trackEvent({
event: AnalyticsEvent.CHAT_COMPLETIONS_INSUFFICIENT_CREDITS,
userId,
@@ -491,7 +495,7 @@ export async function postChatCompletions(params: {
},
logger,
})
-
+
return NextResponse.json(
{
error: 'rate_limit_exceeded',
@@ -553,54 +557,54 @@ export async function postChatCompletions(params: {
const useOpenAIDirect = !useFireworks && isOpenAIDirectModel(typedBody.model)
const stream = useSiliconFlow
? await handleSiliconFlowStream({
- body: typedBody,
- userId,
- stripeCustomerId,
- agentId,
- fetch,
- logger,
- insertMessageBigquery,
- })
+ body: typedBody,
+ userId,
+ stripeCustomerId,
+ agentId,
+ fetch,
+ logger,
+ insertMessageBigquery,
+ })
: useCanopyWave
- ? await handleCanopyWaveStream({
- body: typedBody,
- userId,
- stripeCustomerId,
- agentId,
- fetch,
- logger,
- insertMessageBigquery,
- })
- : useFireworks
- ? await handleFireworksStream({
- body: typedBody,
- userId,
- stripeCustomerId,
- agentId,
- fetch,
- logger,
- insertMessageBigquery,
- })
- : useOpenAIDirect
- ? await handleOpenAIStream({
- body: typedBody,
- userId,
- stripeCustomerId,
- agentId,
- fetch,
- logger,
- insertMessageBigquery,
- })
- : await handleOpenRouterStream({
+ ? await handleCanopyWaveStream({
body: typedBody,
userId,
stripeCustomerId,
agentId,
- openrouterApiKey,
fetch,
logger,
insertMessageBigquery,
})
+ : useFireworks
+ ? await handleFireworksStream({
+ body: typedBody,
+ userId,
+ stripeCustomerId,
+ agentId,
+ fetch,
+ logger,
+ insertMessageBigquery,
+ })
+ : useOpenAIDirect
+ ? await handleOpenAIStream({
+ body: typedBody,
+ userId,
+ stripeCustomerId,
+ agentId,
+ fetch,
+ logger,
+ insertMessageBigquery,
+ })
+ : await handleOpenRouterStream({
+ body: typedBody,
+ userId,
+ stripeCustomerId,
+ agentId,
+ openrouterApiKey,
+ fetch,
+ logger,
+ insertMessageBigquery,
+ })
trackEvent({
event: AnalyticsEvent.CHAT_COMPLETIONS_STREAM_STARTED,
@@ -631,26 +635,16 @@ export async function postChatCompletions(params: {
const nonStreamRequest = useSiliconFlow
? handleSiliconFlowNonStream({
- body: typedBody,
- userId,
- stripeCustomerId,
- agentId,
- fetch,
- logger,
- insertMessageBigquery,
- })
+ body: typedBody,
+ userId,
+ stripeCustomerId,
+ agentId,
+ fetch,
+ logger,
+ insertMessageBigquery,
+ })
: useCanopyWave
- ? handleCanopyWaveNonStream({
- body: typedBody,
- userId,
- stripeCustomerId,
- agentId,
- fetch,
- logger,
- insertMessageBigquery,
- })
- : useFireworks
- ? handleFireworksNonStream({
+ ? handleCanopyWaveNonStream({
body: typedBody,
userId,
stripeCustomerId,
@@ -659,26 +653,36 @@ export async function postChatCompletions(params: {
logger,
insertMessageBigquery,
})
- : shouldUseOpenAIEndpoint
- ? handleOpenAINonStream({
- body: typedBody,
- userId,
- stripeCustomerId,
- agentId,
- fetch,
- logger,
- insertMessageBigquery,
- })
- : handleOpenRouterNonStream({
+ : useFireworks
+ ? handleFireworksNonStream({
body: typedBody,
userId,
stripeCustomerId,
agentId,
- openrouterApiKey,
fetch,
logger,
insertMessageBigquery,
})
+ : shouldUseOpenAIEndpoint
+ ? handleOpenAINonStream({
+ body: typedBody,
+ userId,
+ stripeCustomerId,
+ agentId,
+ fetch,
+ logger,
+ insertMessageBigquery,
+ })
+ : handleOpenRouterNonStream({
+ body: typedBody,
+ userId,
+ stripeCustomerId,
+ agentId,
+ openrouterApiKey,
+ fetch,
+ logger,
+ insertMessageBigquery,
+ })
const result = await nonStreamRequest
trackEvent({
diff --git a/web/src/app/api/v1/freebuff/session/_handlers.ts b/web/src/app/api/v1/freebuff/session/_handlers.ts
index 54157c0b8..5bed8e9c9 100644
--- a/web/src/app/api/v1/freebuff/session/_handlers.ts
+++ b/web/src/app/api/v1/freebuff/session/_handlers.ts
@@ -22,7 +22,9 @@ export interface FreebuffSessionDeps {
sessionDeps?: SessionDeps
}
-type AuthResult = { error: NextResponse } | { userId: string }
+type AuthResult =
+ | { error: NextResponse }
+ | { userId: string; userEmail: string | null }
async function resolveUser(req: NextRequest, deps: FreebuffSessionDeps): Promise<AuthResult> {
const apiKey = extractApiKeyFromHeader(req)
@@ -39,7 +39,7 @@ async function resolveUser(req: NextRequest, deps: FreebuffSessionDeps): Promise<AuthResult>
}
const userInfo = await deps.getUserInfoFromApiKey({
apiKey,
- fields: ['id'],
+ fields: ['id', 'email'],
logger: deps.logger,
})
if (!userInfo?.id) {
@@ -50,7 +52,7 @@ async function resolveUser(req: NextRequest, deps: FreebuffSessionDeps): Promise
),
}
}
- return { userId: String(userInfo.id) }
+ return { userId: String(userInfo.id), userEmail: userInfo.email ?? null }
}
function serverError(
@@ -96,6 +98,7 @@ export async function postFreebuffSession(
try {
const state = await requestSession({
userId: auth.userId,
+ userEmail: auth.userEmail,
deps: deps.sessionDeps,
})
return NextResponse.json(state, { status: 200 })
@@ -118,6 +121,7 @@ export async function getFreebuffSession(
const claimedInstanceId = req.headers.get(FREEBUFF_INSTANCE_HEADER) ?? undefined
const state = await getSessionState({
userId: auth.userId,
+ userEmail: auth.userEmail,
claimedInstanceId,
deps: deps.sessionDeps,
})
@@ -142,7 +146,11 @@ export async function deleteFreebuffSession(
if ('error' in auth) return auth.error
try {
- await endUserSession({ userId: auth.userId, deps: deps.sessionDeps })
+ await endUserSession({
+ userId: auth.userId,
+ userEmail: auth.userEmail,
+ deps: deps.sessionDeps,
+ })
return NextResponse.json({ status: 'ended' }, { status: 200 })
} catch (error) {
return serverError(deps, 'DELETE', auth.userId, error)
diff --git a/web/src/server/free-session/__tests__/fireworks-health.test.ts b/web/src/server/free-session/__tests__/fireworks-health.test.ts
index 6120731cf..3475769cd 100644
--- a/web/src/server/free-session/__tests__/fireworks-health.test.ts
+++ b/web/src/server/free-session/__tests__/fireworks-health.test.ts
@@ -3,7 +3,7 @@ import { describe, expect, test } from 'bun:test'
import {
KV_BLOCKS_DEGRADED_FRACTION,
KV_BLOCKS_UNHEALTHY_FRACTION,
- PREFILL_QUEUE_DEGRADED_MS,
+ PREFILL_QUEUE_P90_DEGRADED_MS,
classify,
} from '../fireworks-health'
@@ -19,20 +19,22 @@ function kvBlocks(value: number): PromSample {
}
}
-/** Emit a minimal cumulative-counts histogram for prefill queue where every
- * event lands in exactly one bucket `le`. */
-function prefillQueueBuckets(p50Ms: number): PromSample[] {
+/** Emit a cumulative-counts histogram for the prefill queue where the 90th
+ * percentile falls in the bucket with le ≥ p90Ms (i.e. p90 ≥ p90Ms).
+ * Uses 10 total events, all landing in that bucket, so the 90th percentile
+ * interpolates within that bucket, above its lower boundary. */
+function prefillQueueBuckets(p90Ms: number): PromSample[] {
const les = [50, 150, 300, 500, 750, 1000, 1500, 3000, 5000, 7500, 10000]
const name = 'latency_prefill_queue_ms_bucket:sum_by_deployment'
- // cumulative count = 0 below p50, 1 at and above p50
+ const total = 10
return les.map((le) => ({
name,
labels: { deployment_id: DEPLOY, le: String(le) },
- value: le >= p50Ms ? 1 : 0,
+ value: le >= p90Ms ? total : 0,
})).concat({
name,
labels: { deployment_id: DEPLOY, le: '+Inf' },
- value: 1,
+ value: total,
})
}
@@ -58,10 +60,10 @@ describe('fireworks health classifier', () => {
expect(classify(samples, [DEPLOY])).toBe('healthy')
})
- test('degraded when prefill queue p50 exceeds the threshold', () => {
+ test('degraded when prefill queue p90 exceeds the threshold', () => {
const samples: PromSample[] = [
kvBlocks(0.5),
- ...prefillQueueBuckets(PREFILL_QUEUE_DEGRADED_MS + 500),
+ ...prefillQueueBuckets(PREFILL_QUEUE_P90_DEGRADED_MS + 500),
]
expect(classify(samples, [DEPLOY])).toBe('degraded')
})
@@ -110,7 +112,7 @@ describe('fireworks health classifier', () => {
const other = 'other123'
const samples: PromSample[] = [
kvBlocks(0.5),
- ...prefillQueueBuckets(PREFILL_QUEUE_DEGRADED_MS + 500),
+ ...prefillQueueBuckets(PREFILL_QUEUE_P90_DEGRADED_MS + 500),
{
name: 'generator_kv_blocks_fraction:avg_by_deployment',
labels: { deployment_id: other },
diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts
index df34b7556..b19f24ea0 100644
--- a/web/src/server/free-session/__tests__/public-api.test.ts
+++ b/web/src/server/free-session/__tests__/public-api.test.ts
@@ -281,6 +281,29 @@ describe('checkSessionAdmissible', () => {
expect(result.code).toBe('waiting_room_required')
})
+ test('bypassed email (team@codebuff.com) → ok with reason=disabled, no DB read', async () => {
+ const result = await checkSessionAdmissible({
+ userId: 'u1',
+ userEmail: 'team@codebuff.com',
+ claimedInstanceId: undefined,
+ deps,
+ })
+ expect(result.ok).toBe(true)
+ if (!result.ok) throw new Error('unreachable')
+ expect(result.reason).toBe('disabled')
+ expect(deps.rows.size).toBe(0)
+ })
+
+ test('bypassed email is case-insensitive', async () => {
+ const result = await checkSessionAdmissible({
+ userId: 'u1',
+ userEmail: 'Team@Codebuff.COM',
+ claimedInstanceId: undefined,
+ deps,
+ })
+ expect(result.ok).toBe(true)
+ })
+
test('queued session → waiting_room_queued', async () => {
await requestSession({ userId: 'u1', deps })
const result = await checkSessionAdmissible({
diff --git a/web/src/server/free-session/__tests__/session-view.test.ts b/web/src/server/free-session/__tests__/session-view.test.ts
index b3bdade6a..681072b30 100644
--- a/web/src/server/free-session/__tests__/session-view.test.ts
+++ b/web/src/server/free-session/__tests__/session-view.test.ts
@@ -4,7 +4,7 @@ import { estimateWaitMs, toSessionStateResponse } from '../session-view'
import type { InternalSessionRow } from '../types'
-const WAIT_PER_SPOT_MS = 60_000
+const WAIT_PER_SPOT_MS = 24_000
const GRACE_MS = 30 * 60_000
function row(overrides: Partial = {}): InternalSessionRow {
diff --git a/web/src/server/free-session/config.ts b/web/src/server/free-session/config.ts
index 4e9e729c1..e70e1b5c6 100644
--- a/web/src/server/free-session/config.ts
+++ b/web/src/server/free-session/config.ts
@@ -16,6 +16,18 @@ export function isWaitingRoomEnabled(): boolean {
return env.FREEBUFF_WAITING_ROOM_ENABLED
}
+/** Per-account override on top of the global kill switch. The internal
+ * `team@codebuff.com` account drives e2e tests in CI; landing it in the
+ * queue would make those tests flake whenever the waiting room is warm.
+ * Bypassed users behave exactly as if the waiting room were disabled. */
+const WAITING_ROOM_BYPASS_EMAILS = new Set(['team@codebuff.com'])
+export function isWaitingRoomBypassedForEmail(
+ email: string | null | undefined,
+): boolean {
+ if (!email) return false
+ return WAITING_ROOM_BYPASS_EMAILS.has(email.toLowerCase())
+}
+
export function getSessionLengthMs(): number {
return env.FREEBUFF_SESSION_LENGTH_MS
}
diff --git a/web/src/server/free-session/fireworks-health.ts b/web/src/server/free-session/fireworks-health.ts
index c102e721c..7d8e115e4 100644
--- a/web/src/server/free-session/fireworks-health.ts
+++ b/web/src/server/free-session/fireworks-health.ts
@@ -1,5 +1,6 @@
-import { FIREWORKS_ACCOUNT_ID, FIREWORKS_DEPLOYMENT_MAP } from '@/llm-api/fireworks-config'
import { env } from '@codebuff/internal/env'
+
+import { FIREWORKS_ACCOUNT_ID, FIREWORKS_DEPLOYMENT_MAP } from '@/llm-api/fireworks-config'
import { logger } from '@/util/logger'
/**
@@ -15,13 +16,14 @@ import { logger } from '@/util/logger'
*/
export type FireworksHealth = 'healthy' | 'degraded' | 'unhealthy'
-/** Degrade once median prefill-queue latency crosses this bound. Strict by
- * design — a 1s queue on top of ~1s prefill already means users feel 2s+
- * before first token. */
-export const PREFILL_QUEUE_DEGRADED_MS = 125
+/** Degrade once p90 prefill-queue latency crosses this bound. Using p90
+ * instead of p50 gives a better early-warning signal — the tail starts
+ * rising before the median does, so we can halt admission before most
+ * users feel it. */
+export const PREFILL_QUEUE_P90_DEGRADED_MS = 1000
/** Leading indicator of load — responds instantly to memory pressure, while
- * prefill-queue p50 is a lagging window statistic. Degrading here lets us
+ * prefill-queue p90 is a lagging window statistic. Degrading here lets us
* halt admission *before* users feel it. */
export const KV_BLOCKS_DEGRADED_FRACTION = 0.8
@@ -160,16 +162,16 @@ function classifyOne(samples: PromSample[], deploymentId: string): FireworksHeal
return 'unhealthy'
}
- const p50 = histogramPercentile(
+ const p90 = histogramPercentile(
samples,
'latency_prefill_queue_ms_bucket:sum_by_deployment',
deploymentId,
- 50,
+ 90,
)
- if (p50 !== undefined && p50 > PREFILL_QUEUE_DEGRADED_MS) {
+ if (p90 !== undefined && p90 > PREFILL_QUEUE_P90_DEGRADED_MS) {
logger.info(
- { deploymentId, prefillQueueP50Ms: Math.round(p50), kvBlocks },
- '[FireworksHealth] degraded: prefill queue p50 over threshold',
+ { deploymentId, prefillQueueP90Ms: Math.round(p90), kvBlocks },
+ '[FireworksHealth] degraded: prefill queue p90 over threshold',
)
return 'degraded'
}
diff --git a/web/src/server/free-session/public-api.ts b/web/src/server/free-session/public-api.ts
index 759a516d7..74af009cc 100644
--- a/web/src/server/free-session/public-api.ts
+++ b/web/src/server/free-session/public-api.ts
@@ -1,5 +1,6 @@
import {
getSessionGraceMs,
+ isWaitingRoomBypassedForEmail,
isWaitingRoomEnabled,
} from './config'
import {
@@ -79,10 +80,16 @@ async function viewForRow(
*/
export async function requestSession(params: {
userId: string
+ userEmail?: string | null | undefined
deps?: SessionDeps
}): Promise {
const deps = params.deps ?? defaultDeps
- if (!deps.isWaitingRoomEnabled()) return { status: 'disabled' }
+ if (
+ !deps.isWaitingRoomEnabled() ||
+ isWaitingRoomBypassedForEmail(params.userEmail)
+ ) {
+ return { status: 'disabled' }
+ }
const row = await deps.joinOrTakeOver({ userId: params.userId, now: nowOf(deps) })
const view = await viewForRow(params.userId, deps, row)
@@ -109,11 +116,17 @@ export async function requestSession(params: {
*/
export async function getSessionState(params: {
userId: string
+ userEmail?: string | null | undefined
claimedInstanceId?: string | null | undefined
deps?: SessionDeps
}): Promise {
const deps = params.deps ?? defaultDeps
- if (!deps.isWaitingRoomEnabled()) return { status: 'disabled' }
+ if (
+ !deps.isWaitingRoomEnabled() ||
+ isWaitingRoomBypassedForEmail(params.userEmail)
+ ) {
+ return { status: 'disabled' }
+ }
const row = await deps.getSessionRow(params.userId)
if (!row) return { status: 'none' }
@@ -132,10 +145,16 @@ export async function getSessionState(params: {
export async function endUserSession(params: {
userId: string
+ userEmail?: string | null | undefined
deps?: SessionDeps
 }): Promise<void> {
   const deps = params.deps ?? defaultDeps
-  if (!deps.isWaitingRoomEnabled()) return
+ if (
+ !deps.isWaitingRoomEnabled() ||
+ isWaitingRoomBypassedForEmail(params.userEmail)
+ ) {
+ return
+ }
await deps.endSession(params.userId)
}
@@ -169,11 +188,17 @@ export type SessionGateResult =
*/
export async function checkSessionAdmissible(params: {
userId: string
+ userEmail?: string | null | undefined
claimedInstanceId: string | null | undefined
deps?: SessionDeps
 }): Promise<SessionGateResult> {
   const deps = params.deps ?? defaultDeps
-  if (!deps.isWaitingRoomEnabled()) return { ok: true, reason: 'disabled' }
+ if (
+ !deps.isWaitingRoomEnabled() ||
+ isWaitingRoomBypassedForEmail(params.userEmail)
+ ) {
+ return { ok: true, reason: 'disabled' }
+ }
// Pre-waiting-room CLIs never send a freebuff_instance_id. Classify that up
// front so the caller gets a distinct code (→ 426 Upgrade Required) and the
diff --git a/web/src/server/free-session/session-view.ts b/web/src/server/free-session/session-view.ts
index 7ce1f75fe..582e78814 100644
--- a/web/src/server/free-session/session-view.ts
+++ b/web/src/server/free-session/session-view.ts
@@ -59,10 +59,10 @@ export function toSessionStateResponse(params: {
return null
}
-const WAIT_MS_PER_SPOT_AHEAD = 60_000
+const WAIT_MS_PER_SPOT_AHEAD = 24_000
/**
- * Rough wait-time estimate shown to queued users: one minute per spot ahead.
+ * Rough wait-time estimate shown to queued users: 24 seconds per spot ahead.
* Position 1 → 0ms (next tick picks you up).
*/
export function estimateWaitMs(params: { position: number }): number {