diff --git a/agents/context-pruner.ts b/agents/context-pruner.ts
index 99b57a7a59..fd98630d3a 100644
--- a/agents/context-pruner.ts
+++ b/agents/context-pruner.ts
@@ -14,7 +14,7 @@ const definition: AgentDefinition = {
   id: 'context-pruner',
   publisher,
   displayName: 'Context Pruner',
-  model: 'openai/gpt-5-mini',
+  model: 'anthropic/claude-sonnet-4.6',
   spawnerPrompt: `Spawn this agent between steps to prune context, summarizing the conversation into a condensed format when context exceeds the limit.`,
diff --git a/cli/release/package.json b/cli/release/package.json
index 5ccbe9c048..1eb51b176f 100644
--- a/cli/release/package.json
+++ b/cli/release/package.json
@@ -1,6 +1,6 @@
 {
   "name": "codebuff",
-  "version": "1.0.640",
+  "version": "1.0.641",
   "description": "AI coding agent",
   "license": "MIT",
   "bin": {
diff --git a/freebuff/cli/release/package.json b/freebuff/cli/release/package.json
index f4eed9d22d..dc00bf86cd 100644
--- a/freebuff/cli/release/package.json
+++ b/freebuff/cli/release/package.json
@@ -1,6 +1,6 @@
 {
   "name": "freebuff",
-  "version": "0.0.32",
+  "version": "0.0.33",
   "description": "The world's strongest free coding agent",
   "license": "MIT",
   "bin": {
diff --git a/packages/agent-runtime/src/llm-api/codebuff-web-api.ts b/packages/agent-runtime/src/llm-api/codebuff-web-api.ts
index cf0947f49c..61b77fd752 100644
--- a/packages/agent-runtime/src/llm-api/codebuff-web-api.ts
+++ b/packages/agent-runtime/src/llm-api/codebuff-web-api.ts
@@ -230,13 +230,14 @@ export async function callTokenCountAPI(params: {
   messages: unknown[]
   system?: string
   model?: string
+  tools?: Array<{ name: string; description?: string; input_schema?: unknown }>
   fetch: typeof globalThis.fetch
   logger: Logger
   env: CodebuffWebApiEnv
   baseUrl?: string
   apiKey?: string
 }): Promise<{ inputTokens?: number; error?: string }> {
-  const { messages, system, model, fetch, logger, env } = params
+  const { messages, system, model, tools, fetch, logger, env } = params
   const baseUrl = params.baseUrl ?? env.clientEnv.NEXT_PUBLIC_CODEBUFF_APP_URL
   const apiKey = params.apiKey ?? env.ciEnv.CODEBUFF_API_KEY

@@ -248,6 +249,7 @@ export async function callTokenCountAPI(params: {
   const payload: Record<string, unknown> = { messages }
   if (system) payload.system = system
   if (model) payload.model = model
+  if (tools) payload.tools = tools

   try {
     const res = await withTimeout(
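For illustration, a minimal sketch of how a caller might exercise the widened `callTokenCountAPI` signature. The message content, tool name, and schema are invented for the example, and `logger`/`env` are assumed to be supplied by the surrounding runtime:

```ts
// Sketch only: assumes callTokenCountAPI is imported from codebuff-web-api.ts.
const { inputTokens, error } = await callTokenCountAPI({
  messages: [{ role: 'user', content: 'Summarize the repo' }],
  system: 'You are a coding agent.',
  model: 'anthropic/claude-sonnet-4.6',
  // Tools must already be in Anthropic's count_tokens shape (snake_case input_schema).
  tools: [
    {
      name: 'read_file', // hypothetical tool
      description: 'Read a file from the workspace',
      input_schema: { type: 'object', properties: { path: { type: 'string' } } },
    },
  ],
  fetch: globalThis.fetch,
  logger,
  env,
})
if (error) logger.error({ error })
```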
diff --git a/packages/agent-runtime/src/run-agent-step.ts b/packages/agent-runtime/src/run-agent-step.ts
index 992db72aa7..704cedf3a6 100644
--- a/packages/agent-runtime/src/run-agent-step.ts
+++ b/packages/agent-runtime/src/run-agent-step.ts
@@ -806,6 +806,18 @@ export async function loopAgentSteps(
     systemPrompt: system,
     toolDefinitions,
   }
+
+  // Convert tool definitions to Anthropic format for accurate token counting
+  // Tool definitions are stored as { [name]: { description, inputSchema } }
+  // Anthropic count_tokens API expects [{ name, description, input_schema }]
+  const toolsForTokenCount = Object.entries(toolDefinitions).map(
+    ([name, def]) => ({
+      name,
+      ...(def.description && { description: def.description }),
+      ...(def.inputSchema && { input_schema: def.inputSchema }),
+    }),
+  )
+
   let shouldEndTurn = false
   let hasRetriedOutputSchema = false
   let currentPrompt = prompt
@@ -845,6 +857,7 @@ export async function loopAgentSteps(
         messages: messagesWithStepPrompt,
         system,
         model: agentTemplate.model,
+        tools: toolsForTokenCount,
         fetch,
         logger,
         env: { clientEnv, ciEnv },
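To make the conversion concrete, a small worked example of the shape change described in the comments above (the tool name and schema are hypothetical):

```ts
// Input: tool definitions as stored on the agent, keyed by name.
const toolDefinitions = {
  read_file: {
    description: 'Read a file from the workspace',
    inputSchema: { type: 'object', properties: { path: { type: 'string' } } },
  },
}

// Output of the Object.entries(...).map(...) above: the array form that
// Anthropic's count_tokens API expects, with snake_case input_schema.
// [
//   {
//     name: 'read_file',
//     description: 'Read a file from the workspace',
//     input_schema: { type: 'object', properties: { path: { type: 'string' } } },
//   },
// ]
```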
diff --git a/web/src/app/api/v1/token-count/_post.ts b/web/src/app/api/v1/token-count/_post.ts
index ceb3d71e4a..1daea67723 100644
--- a/web/src/app/api/v1/token-count/_post.ts
+++ b/web/src/app/api/v1/token-count/_post.ts
@@ -3,6 +3,7 @@ import {
   isClaudeModel,
   toAnthropicModelId,
 } from '@codebuff/common/constants/claude-oauth'
+import { isOpenAIProviderModel } from '@codebuff/common/constants/chatgpt-oauth'
 import { getErrorObject } from '@codebuff/common/util/error'
 import { env } from '@codebuff/internal/env'
 import { NextResponse } from 'next/server'
@@ -22,6 +23,11 @@ const tokenCountRequestSchema = z.object({
   messages: z.array(z.any()),
   system: z.string().optional(),
   model: z.string().optional(),
+  tools: z.array(z.object({
+    name: z.string(),
+    description: z.string().optional(),
+    input_schema: z.any().optional(),
+  })).optional(),
 })

 type TokenCountRequest = z.infer<typeof tokenCountRequestSchema>
@@ -74,24 +80,27 @@ export async function postTokenCount(params: {
     return bodyResult.response
   }

-  const { messages, system, model } = bodyResult.data
+  const { messages, system, model, tools } = bodyResult.data

   try {
     const useOpenAI = model != null && false // isOpenAIProviderModel(model)
     const inputTokens = useOpenAI
       ? await countTokensViaOpenAI({ messages, system, model, fetch, logger })
       : await countTokensViaAnthropic({
-        messages,
-        system,
-        model,
-        fetch,
-        logger,
-      })
+          messages,
+          system,
+          model,
+          tools,
+          fetch,
+          logger,
+        })

     logger.info({
       userId,
       messageCount: messages.length,
       hasSystem: !!system,
+      hasTools: !!tools,
+      toolCount: tools?.length,
       model: model ?? DEFAULT_ANTHROPIC_MODEL,
       tokenCount: inputTokens,
       provider: useOpenAI ? 'openai' : 'anthropic',
@@ -285,10 +294,11 @@ async function countTokensViaAnthropic(params: {
   messages: TokenCountRequest['messages']
   system: string | undefined
   model: string | undefined
+  tools: TokenCountRequest['tools']
   fetch: typeof globalThis.fetch
   logger: Logger
 }): Promise<number> {
-  const { messages, system, model, fetch, logger } = params
+  const { messages, system, model, tools, fetch, logger } = params

   // Convert messages to Anthropic format
   const anthropicMessages = convertToAnthropicMessages(messages)
@@ -315,6 +325,7 @@ async function countTokensViaAnthropic(params: {
         model: anthropicModelId,
         messages: anthropicMessages,
         ...(system && { system }),
+        ...(tools && { tools }),
       }),
     },
   )
@@ -337,8 +348,12 @@ async function countTokensViaAnthropic(params: {
   const data = await response.json()
   const baseTokens = data.input_tokens

-  // Add 30% buffer for non-Anthropic models since tokenizers differ
-  if (isNonAnthropicModel) {
+  // Add 30% buffer for OpenAI and Gemini models since their tokenizers differ from Anthropic's
+  // Other non-Anthropic models (x-ai, qwen, deepseek, etc.) are routed through providers that
+  // use similar tokenization, so the buffer is not needed and was causing premature context pruning.
+  const isOpenAIModel = model ? isOpenAIProviderModel(model) : false
+  const isGeminiModel = model?.startsWith('google/') ?? false
+  if (isOpenAIModel || isGeminiModel) {
     return Math.ceil(baseTokens * (1 + NON_ANTHROPIC_TOKEN_BUFFER))
   }

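For context, the endpoint this route delegates to is Anthropic's public `POST /v1/messages/count_tokens` API, which accepts the same `tools` array as the Messages API and returns an `input_tokens` count. Below is a hedged sketch of the raw call plus the buffer math, assuming `NON_ANTHROPIC_TOKEN_BUFFER` is `0.3` per the "30%" comment; the API key source and resolved model ID are placeholders, not this codebase's actual wiring:

```ts
const res = await fetch('https://api.anthropic.com/v1/messages/count_tokens', {
  method: 'POST',
  headers: {
    'x-api-key': process.env.ANTHROPIC_API_KEY ?? '', // placeholder key source
    'anthropic-version': '2023-06-01',
    'content-type': 'application/json',
  },
  body: JSON.stringify({
    model: 'claude-sonnet-4-5', // hypothetical resolved Anthropic model ID
    messages: [{ role: 'user', content: 'hello' }],
    tools: [
      {
        name: 'read_file', // hypothetical tool
        description: 'Read a file from the workspace',
        input_schema: { type: 'object', properties: { path: { type: 'string' } } },
      },
    ],
  }),
})
const { input_tokens } = (await res.json()) as { input_tokens: number }

// Buffer now applies only to OpenAI and Gemini models:
const NON_ANTHROPIC_TOKEN_BUFFER = 0.3 // assumed from the "30%" comment
const buffered = Math.ceil(input_tokens * (1 + NON_ANTHROPIC_TOKEN_BUFFER))
// e.g. 1000 base tokens -> 1300 for an openai/* or google/* model
```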
diff --git a/web/src/llm-api/__tests__/fireworks-deployment.test.ts b/web/src/llm-api/__tests__/fireworks-deployment.test.ts
index d7e3f1727a..717b5c9990 100644
--- a/web/src/llm-api/__tests__/fireworks-deployment.test.ts
+++ b/web/src/llm-api/__tests__/fireworks-deployment.test.ts
@@ -11,8 +11,8 @@ import {

 import type { Logger } from '@codebuff/common/types/contracts/logger'

-const STANDARD_MODEL_ID = 'accounts/fireworks/models/minimax-m2p5'
-const DEPLOYMENT_MODEL_ID = 'accounts/james-65d217/deployments/lnfid5h9'
+const STANDARD_MODEL_ID = 'accounts/fireworks/models/glm-5p1'
+const DEPLOYMENT_MODEL_ID = 'accounts/james-65d217/deployments/mjb4i7ea'

 function createMockLogger(): Logger {
   return {
@@ -78,7 +78,7 @@ describe('Fireworks deployment routing', () => {
   })

   const minimalBody = {
-    model: 'minimax/minimax-m2.5',
+    model: 'z-ai/glm-5.1',
     messages: [{ role: 'user' as const, content: 'test' }],
   }

@@ -115,7 +115,7 @@ describe('Fireworks deployment routing', () => {

     const response = await createFireworksRequestWithFallback({
       body: minimalBody as never,
-      originalModel: 'minimax/minimax-m2.5',
+      originalModel: 'z-ai/glm-5.1',
       fetch: mockFetch,
       logger,
       useCustomDeployment: false,
@@ -140,7 +140,7 @@ describe('Fireworks deployment routing', () => {
     try {
       const response = await createFireworksRequestWithFallback({
         body: minimalBody as never,
-        originalModel: 'minimax/minimax-m2.5',
+        originalModel: 'z-ai/glm-5.1',
         fetch: mockFetch,
         logger,
         useCustomDeployment: true,
@@ -184,7 +184,7 @@ describe('Fireworks deployment routing', () => {
     try {
       const response = await createFireworksRequestWithFallback({
         body: minimalBody as never,
-        originalModel: 'minimax/minimax-m2.5',
+        originalModel: 'z-ai/glm-5.1',
         fetch: mockFetch,
         logger,
         useCustomDeployment: true,
@@ -231,7 +231,7 @@ describe('Fireworks deployment routing', () => {
     try {
       const response = await createFireworksRequestWithFallback({
         body: minimalBody as never,
-        originalModel: 'minimax/minimax-m2.5',
+        originalModel: 'z-ai/glm-5.1',
         fetch: mockFetch,
         logger,
         useCustomDeployment: true,
@@ -272,7 +272,7 @@ describe('Fireworks deployment routing', () => {
     try {
       const response = await createFireworksRequestWithFallback({
         body: minimalBody as never,
-        originalModel: 'minimax/minimax-m2.5',
+        originalModel: 'z-ai/glm-5.1',
         fetch: mockFetch,
         logger,
         useCustomDeployment: true,
@@ -303,7 +303,7 @@ describe('Fireworks deployment routing', () => {
     try {
       const response = await createFireworksRequestWithFallback({
         body: minimalBody as never,
-        originalModel: 'minimax/minimax-m2.5',
+        originalModel: 'z-ai/glm-5.1',
         fetch: mockFetch,
         logger,
         useCustomDeployment: true,
@@ -363,7 +363,7 @@ describe('Fireworks deployment routing', () => {
     try {
       const response = await createFireworksRequestWithFallback({
         body: minimalBody as never,
-        originalModel: 'minimax/minimax-m2.5',
+        originalModel: 'z-ai/glm-5.1',
         fetch: mockFetch,
         logger,
         useCustomDeployment: true,
@@ -403,7 +403,7 @@ describe('Fireworks deployment routing', () => {
     try {
       await createFireworksRequestWithFallback({
         body: minimalBody as never,
-        originalModel: 'minimax/minimax-m2.5',
+        originalModel: 'z-ai/glm-5.1',
         fetch: mockFetch,
         logger,
         useCustomDeployment: true,
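A note on the test fixture swap above: the two constants exercise the two Fireworks ID shapes the routing must distinguish. The predicates below are illustrative only; the real decision lives in `createFireworksRequestWithFallback` and its `useCustomDeployment` flag:

```ts
// Standard serverless models:  accounts/fireworks/models/<model>
// Custom deployments:          accounts/<account>/deployments/<deployment-id>
const isStandardFireworksModel = (id: string): boolean =>
  /^accounts\/fireworks\/models\/[\w.-]+$/.test(id)
const isCustomDeployment = (id: string): boolean =>
  /^accounts\/[\w-]+\/deployments\/[\w-]+$/.test(id)

isStandardFireworksModel('accounts/fireworks/models/glm-5p1') // true
isCustomDeployment('accounts/james-65d217/deployments/mjb4i7ea') // true
```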