From 1c510292d5ea81389cfc8b3c3269210ee3e24f71 Mon Sep 17 00:00:00 2001 From: tiffanychum <71036662+tiffanychum@users.noreply.github.com> Date: Sun, 26 Apr 2026 03:03:45 +0800 Subject: [PATCH] fix(zen): stop double-counting reasoning_tokens in oa-compat usage (#24268) The OpenAI chat-completions usage spec says `completion_tokens` already includes `completion_tokens_details.reasoning_tokens`. Zen's downstream `calculateCost` bills `outputCost + reasoningCost` separately, so when the oa-compat normalizer reported `outputTokens = completion_tokens` and `reasoningTokens = reasoning_tokens`, reasoning was billed twice. Mirror the OpenAI Responses helper (openai.ts) and subtract reasoning from completion before returning. Clamp at 0 because some providers (e.g. Moonshot Kimi K2.6) report `reasoning_tokens > completion_tokens`. Adds unit tests for the reporter's exact payloads: - Kimi K2.6 "Hi": prompt 22 / completion 77 / reasoning 78 -> output 0 - Real session: prompt N / completion 1226 / reasoning 790 -> output 436 - No-reasoning case: outputTokens unchanged - Parity with `openaiHelper.normalizeUsage` for the same logical usage --- .../zen/util/provider/openai-compatible.ts | 14 +++- packages/console/app/test/zen-usage.test.ts | 80 +++++++++++++++++++ 2 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 packages/console/app/test/zen-usage.test.ts diff --git a/packages/console/app/src/routes/zen/util/provider/openai-compatible.ts b/packages/console/app/src/routes/zen/util/provider/openai-compatible.ts index 97b0abc64f31..6c0694f46c7a 100644 --- a/packages/console/app/src/routes/zen/util/provider/openai-compatible.ts +++ b/packages/console/app/src/routes/zen/util/provider/openai-compatible.ts @@ -61,8 +61,16 @@ export const oaCompatHelper: ProviderHelper = ({ adjustCacheUsage, safetyIdentif }, normalizeUsage: (usage: Usage) => { let inputTokens = usage.prompt_tokens ?? 0 - const outputTokens = usage.completion_tokens ?? 
0 - const reasoningTokens = usage.completion_tokens_details?.reasoning_tokens ?? undefined + const completionTokens = usage.completion_tokens ?? 0 + const reasoningTokensRaw = usage.completion_tokens_details?.reasoning_tokens + // Per OpenAI chat-completions spec, completion_tokens already includes reasoning_tokens. + // Downstream cost calculation bills outputCost + reasoningCost separately, so we must + // subtract here to avoid double-counting reasoning. Some providers (e.g. Moonshot Kimi + // K2.6) report reasoning_tokens > completion_tokens; in that case clamp reasoning down + // to completion so the invariant `outputTokens + reasoningTokens === completion_tokens` + // holds and we charge the same total the upstream API billed (no over-charge). + const reasoningTokens = + reasoningTokensRaw !== undefined ? Math.min(reasoningTokensRaw, completionTokens) : undefined let cacheReadTokens = usage.cached_tokens ?? usage.prompt_tokens_details?.cached_tokens ?? undefined const cacheWriteTokens = usage.prompt_tokens_details?.cache_creation_input_tokens ?? undefined @@ -72,7 +80,7 @@ export const oaCompatHelper: ProviderHelper = ({ adjustCacheUsage, safetyIdentif return { inputTokens: inputTokens - (cacheReadTokens ?? 0), - outputTokens, + outputTokens: completionTokens - (reasoningTokens ?? 
0), reasoningTokens, cacheReadTokens, cacheWrite5mTokens: cacheWriteTokens, diff --git a/packages/console/app/test/zen-usage.test.ts b/packages/console/app/test/zen-usage.test.ts new file mode 100644 index 000000000000..addcca4fca86 --- /dev/null +++ b/packages/console/app/test/zen-usage.test.ts @@ -0,0 +1,80 @@ +import { describe, expect, test } from "bun:test" +import { oaCompatHelper } from "../src/routes/zen/util/provider/openai-compatible" +import { openaiHelper } from "../src/routes/zen/util/provider/openai" + +const helper = (h: ReturnType<typeof oaCompatHelper>) => h +const ctx = { reqModel: "kimi-k2.6", providerModel: "moonshotai/kimi-k2.6-20260420" } + +describe("oaCompatHelper.normalizeUsage (#24268)", () => { + test("subtracts reasoning_tokens from completion_tokens so billing does not double-count", () => { + const h = helper(oaCompatHelper(ctx)) + + const usage = { + prompt_tokens: 22, + completion_tokens: 1226, + total_tokens: 1248, + completion_tokens_details: { reasoning_tokens: 790 }, + } + + const result = h.normalizeUsage(usage) + + expect(result.outputTokens).toBe(436) + expect(result.reasoningTokens).toBe(790) + expect(result.outputTokens + (result.reasoningTokens ?? 0)).toBe(1226) + }) + + test("clamps reasoning to completion when reasoning_tokens > completion_tokens (reporter's 'Hi' example)", () => { + const h = helper(oaCompatHelper(ctx)) + + const usage = { + prompt_tokens: 22, + completion_tokens: 77, + total_tokens: 99, + completion_tokens_details: { reasoning_tokens: 78 }, + } + + const result = h.normalizeUsage(usage) + + // outputTokens floors at 0; reasoningTokens is clamped to completion_tokens so the + // invariant `outputTokens + reasoningTokens === completion_tokens` holds and we bill + // exactly what the upstream API billed (no over-charge of the extra reasoning unit). + expect(result.outputTokens).toBe(0) + expect(result.reasoningTokens).toBe(77) + expect(result.outputTokens + (result.reasoningTokens ??
0)).toBe(77) + }) + + test("leaves outputTokens unchanged when no reasoning_tokens are reported", () => { + const h = helper(oaCompatHelper(ctx)) + + const usage = { + prompt_tokens: 22, + completion_tokens: 77, + total_tokens: 99, + } + + const result = h.normalizeUsage(usage) + + expect(result.outputTokens).toBe(77) + expect(result.reasoningTokens).toBeUndefined() + }) + + test("matches OpenAI Responses helper convention for the same logical usage", () => { + const compat = helper(oaCompatHelper(ctx)) + const responses = openaiHelper(ctx) + + const compatResult = compat.normalizeUsage({ + prompt_tokens: 22, + completion_tokens: 1226, + completion_tokens_details: { reasoning_tokens: 790 }, + }) + const responsesResult = responses.normalizeUsage({ + input_tokens: 22, + output_tokens: 1226, + output_tokens_details: { reasoning_tokens: 790 }, + }) + + expect(compatResult.outputTokens).toBe(responsesResult.outputTokens) + expect(compatResult.reasoningTokens).toBe(responsesResult.reasoningTokens) + expect(compatResult.inputTokens).toBe(responsesResult.inputTokens) + }) +})