diff --git a/.changeset/compaction-output-token-cap.md b/.changeset/compaction-output-token-cap.md
new file mode 100644
index 000000000..6f26f5c64
--- /dev/null
+++ b/.changeset/compaction-output-token-cap.md
@@ -0,0 +1,5 @@
+---
+"@moonshot-ai/agent-core": patch
+---
+
+Cap compaction output tokens to a conservative fallback when maxOutputSize is not configured, preventing APIContextOverflowError on providers that do not auto-clamp max_tokens.
diff --git a/packages/agent-core/src/agent/compaction/full.ts b/packages/agent-core/src/agent/compaction/full.ts
index e444aee52..65b99a80f 100644
--- a/packages/agent-core/src/agent/compaction/full.ts
+++ b/packages/agent-core/src/agent/compaction/full.ts
@@ -261,9 +261,19 @@ export class FullCompaction {
     await this.triggerPreCompactHook(data, tokensBefore, signal);
 
     const model = this.agent.config.model;
+    const maxOutputSize = this.agent.config.maxOutputSize;
+    const maxCtx = this.agent.config.modelCapabilities.max_context_tokens ?? 0;
+    // When maxOutputSize is not configured (the default), fall back to a
+    // conservative cap so compaction never requests the full context window
+    // as max_completion_tokens. 1/4 of the context window (capped at 8192)
+    // is generous for a summary while preventing overflow on providers that
+    // do not auto-clamp max_tokens server-side.
+    const compactionOutputCap =
+      maxOutputSize ?? (maxCtx > 0 ? Math.min(Math.floor(maxCtx / 4), 8192) : undefined);
     const provider = applyCompletionBudget({
       provider: this.agent.config.provider,
       budget: resolveCompletionBudget({
+        maxOutputSize: compactionOutputCap,
         reservedContextSize: this.agent.kimiConfig?.loopControl?.reservedContextSize,
       }),
       capability: this.agent.config.modelCapabilities,
diff --git a/packages/agent-core/test/utils/compaction-overflow-verification.test.ts b/packages/agent-core/test/utils/compaction-overflow-verification.test.ts
new file mode 100644
index 000000000..1072f0071
--- /dev/null
+++ b/packages/agent-core/test/utils/compaction-overflow-verification.test.ts
@@ -0,0 +1,148 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+  applyCompletionBudget,
+  resolveCompletionBudget,
+} from '../../src/utils/completion-budget';
+
+import type { ChatProvider, ModelCapability } from '@moonshot-ai/kosong';
+
+/**
+ * Builds a ModelCapability fixture with the given context window; media
+ * inputs and thinking are disabled, tool use is enabled.
+ */
+function makeCapability(maxContextTokens: number): ModelCapability {
+  return {
+    image_in: false,
+    video_in: false,
+    audio_in: false,
+    thinking: false,
+    tool_use: true,
+    max_context_tokens: maxContextTokens,
+  };
+}
+
+/**
+ * Builds a stub ChatProvider that records the value passed to
+ * withMaxCompletionTokens; getCap() returns the last recorded value, or null
+ * if it was never called.
+ */
+function makeMockProvider(): { provider: ChatProvider; getCap: () => number | null } {
+  let cap: number | null = null;
+  const provider = {
+    name: 'mock',
+    modelName: 'mock-model',
+    thinkingEffort: null,
+    generate: (() => {}) as unknown as ChatProvider['generate'],
+    withThinking: (() => {}) as unknown as ChatProvider['withThinking'],
+    withMaxCompletionTokens: ((n: number) => {
+      cap = n;
+      return { ...provider, _cap: n } as unknown as ChatProvider;
+    }) as unknown as (n: number) => ChatProvider,
+  } as ChatProvider;
+  return { provider, getCap: () => cap };
+}
+
+/**
+ * Simulates the ORIGINAL compactionRound() budget logic (before fix):
+ * does NOT pass maxOutputSize to resolveCompletionBudget.
+ */
+function originalCompactionMaxTokens(args: {
+  maxOutputSize?: number;
+  maxCtx: number;
+  reservedContextSize?: number;
+}): number {
+  const { provider, getCap } = makeMockProvider();
+  applyCompletionBudget({
+    provider,
+    budget: resolveCompletionBudget({
+      // maxOutputSize intentionally omitted — this is the bug
+      reservedContextSize: args.reservedContextSize,
+    }),
+    capability: makeCapability(args.maxCtx),
+  });
+  return getCap() ?? 0;
+}
+
+/**
+ * Simulates the PATCHED compactionRound() budget logic (after fix):
+ * passes maxOutputSize or a conservative fallback cap.
+ */
+function patchedCompactionMaxTokens(args: {
+  maxOutputSize?: number;
+  maxCtx: number;
+  reservedContextSize?: number;
+}): number {
+  const compactionOutputCap =
+    args.maxOutputSize ?? (args.maxCtx > 0 ? Math.min(Math.floor(args.maxCtx / 4), 8192) : undefined);
+  const { provider, getCap } = makeMockProvider();
+  applyCompletionBudget({
+    provider,
+    budget: resolveCompletionBudget({
+      maxOutputSize: compactionOutputCap,
+      reservedContextSize: args.reservedContextSize,
+    }),
+    capability: makeCapability(args.maxCtx),
+  });
+  return getCap() ?? 0;
+}
+
+describe('compaction overflow verification (before vs after fix)', () => {
+  // Simulated compaction input size: a typical compaction prompt contains
+  // the entire conversation history being compacted.
+  const COMPACTION_INPUT_TOKENS = 80_000;
+
+  const testModels = [
+    {
+      name: 'stepfun/step-3.7-flash (maxOutputSize not configured)',
+      maxOutputSize: undefined,
+      maxCtx: 256_000,
+      reservedContextSize: 50_000,
+    },
+    {
+      name: 'kimi-for-coding (maxOutputSize not configured)',
+      maxOutputSize: undefined,
+      maxCtx: 262_144,
+      reservedContextSize: 50_000,
+    },
+    {
+      name: 'zhipu/glm-5.2 (maxOutputSize=131072)',
+      maxOutputSize: 131_072,
+      maxCtx: 1_000_000,
+      reservedContextSize: 50_000,
+    },
+  ];
+
+  for (const model of testModels) {
+    it(`${model.name}: original overflows, patched is safe`, () => {
+      const origCap = originalCompactionMaxTokens(model);
+      const patchedCap = patchedCompactionMaxTokens(model);
+      const origTotal = COMPACTION_INPUT_TOKENS + origCap;
+      const patchedTotal = COMPACTION_INPUT_TOKENS + patchedCap;
+
+      // --- Original code ---
+      // The original compaction code does NOT pass maxOutputSize to
+      // resolveCompletionBudget, so it always falls back to using the full
+      // context window as max_completion_tokens — regardless of whether
+      // maxOutputSize is configured. This is the core bug.
+      expect(origCap).toBe(model.maxCtx); // bug: always uses full context as max_tokens
+      expect(origTotal).toBeGreaterThan(model.maxCtx); // overflow!
+
+      // --- Patched code ---
+      // The patched code always uses a safe cap (either maxOutputSize or min(maxCtx/4, 8192))
+      expect(patchedTotal).toBeLessThanOrEqual(model.maxCtx);
+      expect(patchedCap).toBeLessThan(model.maxCtx);
+
+      // Print a comparison table for manual verification
+      console.log(`
+  ${model.name}
+    max_context_tokens: ${model.maxCtx.toLocaleString()}
+    maxOutputSize: ${model.maxOutputSize?.toLocaleString() ?? 'undefined'}
+    compaction input est: ${COMPACTION_INPUT_TOKENS.toLocaleString()}
+
+    Original: max_tokens=${origCap.toLocaleString()} total=${origTotal.toLocaleString()} overflow=${origTotal > model.maxCtx ? 'YES ❌' : 'NO'}
+    Patched: max_tokens=${patchedCap.toLocaleString()} total=${patchedTotal.toLocaleString()} overflow=${patchedTotal > model.maxCtx ? 'YES ❌' : 'NO ✅'}
+`);
+    });
+  }
+});
diff --git a/packages/agent-core/test/utils/completion-budget.test.ts b/packages/agent-core/test/utils/completion-budget.test.ts
index 7df91f5d0..f89034889 100644
--- a/packages/agent-core/test/utils/completion-budget.test.ts
+++ b/packages/agent-core/test/utils/completion-budget.test.ts
@@ -248,3 +248,92 @@ describe('resolveCompletionBudget', () => {
     expect(budget?.fallback).toBe(32000);
   });
 });
+
+describe('compaction budget resolution', () => {
+  // Simulates the budget resolution logic from full.ts compactionRound():
+  //   const compactionOutputCap =
+  //     maxOutputSize ?? (maxCtx > 0 ? Math.min(Math.floor(maxCtx / 4), 8192) : undefined);
+  // This ensures compaction never requests the full context window as
+  // max_completion_tokens when maxOutputSize is not explicitly configured.
+  function resolveCompactionBudget(args: {
+    readonly maxOutputSize?: number;
+    readonly maxCtx: number;
+    readonly reservedContextSize?: number;
+    readonly env?: NodeJS.ProcessEnv;
+  }): ReturnType<typeof resolveCompletionBudget> {
+    const compactionOutputCap =
+      args.maxOutputSize ?? (args.maxCtx > 0 ? Math.min(Math.floor(args.maxCtx / 4), 8192) : undefined);
+    return resolveCompletionBudget({
+      maxOutputSize: compactionOutputCap,
+      reservedContextSize: args.reservedContextSize,
+      env: args.env,
+    });
+  }
+
+  it('uses a conservative fallback cap when maxOutputSize is undefined', () => {
+    const budget = resolveCompactionBudget({
+      maxCtx: 262_144,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    // 262144 / 4 = 65536, min(65536, 8192) = 8192
+    expect(budget?.hardCap).toBe(8192);
+  });
+
+  it('caps at 8192 even for very large context windows', () => {
+    const budget = resolveCompactionBudget({
+      maxCtx: 1_000_000,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    expect(budget?.hardCap).toBe(8192);
+  });
+
+  it('uses 1/4 of context when context is small', () => {
+    const budget = resolveCompactionBudget({
+      maxCtx: 20_000,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    // 20000 / 4 = 5000, min(5000, 8192) = 5000
+    expect(budget?.hardCap).toBe(5000);
+  });
+
+  it('uses explicit maxOutputSize when configured', () => {
+    const budget = resolveCompactionBudget({
+      maxOutputSize: 131_072,
+      maxCtx: 1_000_000,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    expect(budget?.hardCap).toBe(131_072);
+  });
+
+  it('respects KIMI_MODEL_MAX_COMPLETION_TOKENS over the fallback cap', () => {
+    const budget = resolveCompactionBudget({
+      maxCtx: 262_144,
+      reservedContextSize: 50_000,
+      env: { KIMI_MODEL_MAX_COMPLETION_TOKENS: '4096' },
+    });
+    expect(budget?.hardCap).toBe(4096);
+  });
+
+  it('produces a hardCap that computeCompletionBudgetCap will use instead of maxCtx', () => {
+    const maxCtx = 262_144;
+    const budget = resolveCompactionBudget({
+      maxCtx,
+      reservedContextSize: 50_000,
+      env: {},
+    });
+    // The budget should have a hardCap, not just a fallback
+    expect(budget?.hardCap).toBeDefined();
+    expect(budget?.hardCap).not.toBe(maxCtx);
+    // computeCompletionBudgetCap should use the hardCap, not the context window
+    const cap = computeCompletionBudgetCap({
+      budget: budget!,
+      capability: makeCapability(maxCtx),
+    });
+    expect(cap).toBe(8192);
+    expect(cap).toBeLessThan(maxCtx);
+  });
+});