Skip to content

Commit e7a728f

Browse files
committed
fix(webapp,llm-model-catalog): stop double-counting cached input tokens
input_tokens is the total prompt count, inclusive of cache-read and cache-creation tokens. The cost pipeline charged the full input count at the input price and then added a separate cache line, so cached tokens were billed twice (e.g. ~2.4x on OpenAI), and the cache hit-rate metric divided cached reads by input + cached, understating the rate. Charge the input price only on the fresh (non-cached) remainder, resolve cache prices across provider alias keys (falling back to input price so cache tokens are never free), and compute the hit rate as cached / input.
1 parent adf0684 commit e7a728f

5 files changed

Lines changed: 173 additions & 8 deletions

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
area: webapp
3+
type: fix
4+
---
5+
6+
LLM cost no longer double-counts cached input tokens. Prompt-cache reads and writes are now billed once at their cache rate instead of also being charged at the full input price, and the cache hit rate is now calculated as cached reads divided by total input tokens (which already include the cached ones). Cost and cache hit-rate figures on the AI metrics dashboard and Models page are now accurate.

apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -496,7 +496,7 @@ const llmDashboard: BuiltInDashboard = {
496496
"llm-cache-hit": {
497497
title: "Cache hit rate over time",
498498
query:
499-
"SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens) + sum(cached_read_tokens), 0), 0), 1) AS cache_hit_pct FROM llm_metrics GROUP BY timeBucket ORDER BY timeBucket",
499+
"SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens), 0), 0), 1) AS cache_hit_pct FROM llm_metrics GROUP BY timeBucket ORDER BY timeBucket",
500500
display: {
501501
type: "chart",
502502
chartType: "line",
@@ -528,7 +528,7 @@ const llmDashboard: BuiltInDashboard = {
528528
"llm-cache-savings": {
529529
title: "Cache savings over time",
530530
query:
531-
"SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * (sum(input_cost) / nullIf(sum(input_tokens), 0)) - sum(cached_read_cost), 0), 4) AS cache_savings FROM llm_metrics WHERE cached_read_tokens > 0 GROUP BY timeBucket ORDER BY timeBucket",
531+
"SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * (sum(input_cost) / nullIf(sum(input_tokens) - sum(cached_read_tokens) - sum(cache_creation_tokens), 0)) - sum(cached_read_cost), 0), 4) AS cache_savings FROM llm_metrics WHERE cached_read_tokens > 0 GROUP BY timeBucket ORDER BY timeBucket",
532532
display: {
533533
type: "chart",
534534
chartType: "bar",
@@ -544,7 +544,7 @@ const llmDashboard: BuiltInDashboard = {
544544
"llm-cache-by-model": {
545545
title: "Cache hit rate by model",
546546
query:
547-
"SELECT response_model, round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens) + sum(cached_read_tokens), 0), 0), 1) AS cache_hit_pct, sum(cached_read_tokens) AS cached_tokens FROM llm_metrics GROUP BY response_model ORDER BY cached_tokens DESC LIMIT 20",
547+
"SELECT response_model, round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens), 0), 0), 1) AS cache_hit_pct, sum(cached_read_tokens) AS cached_tokens FROM llm_metrics GROUP BY response_model ORDER BY cached_tokens DESC LIMIT 20",
548548
display: { type: "table", prettyFormatting: true, sorting: [] },
549549
},
550550
},

apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.models._index/route.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1171,7 +1171,7 @@ function DetailYourUsageTab({
11711171
<MetricWidget
11721172
widgetKey={`${modelName}-user-cache-hit`}
11731173
title="Cache hit rate over time"
1174-
query={`SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens) + sum(cached_read_tokens), 0), 0), 1) AS cache_hit_pct FROM llm_metrics WHERE response_model = '${escapeTSQL(
1174+
query={`SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens), 0), 0), 1) AS cache_hit_pct FROM llm_metrics WHERE response_model = '${escapeTSQL(
11751175
modelName
11761176
)}' GROUP BY timeBucket ORDER BY timeBucket`}
11771177
config={chartConfig({

internal-packages/llm-model-catalog/src/registry.test.ts

Lines changed: 101 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,59 @@ const claudeSonnet: LlmModelWithPricing = {
6969
],
7070
};
7171

72+
// Prices cache reads under the Anthropic-style alias `cache_read_input_tokens` (not
73+
// `input_cached_tokens`) plus a cache-creation price, to exercise alias resolution.
74+
const claudeWithCache: LlmModelWithPricing = {
75+
id: "model-claude-with-cache",
76+
friendlyId: "llm_model_claude_with_cache",
77+
modelName: "claude-with-cache",
78+
matchPattern: "^claude-with-cache$",
79+
startDate: null,
80+
pricingTiers: [
81+
{
82+
id: "tier-claude-with-cache",
83+
name: "Standard",
84+
isDefault: true,
85+
priority: 0,
86+
conditions: [],
87+
prices: [
88+
{ usageType: "input", price: 0.000003 },
89+
{ usageType: "output", price: 0.000015 },
90+
{ usageType: "cache_read_input_tokens", price: 0.0000003 },
91+
{ usageType: "cache_creation_input_tokens", price: 0.00000375 },
92+
],
93+
},
94+
],
95+
};
96+
97+
// No cache prices at all — cached tokens should fall back to the input price.
98+
const noCachePrice: LlmModelWithPricing = {
99+
id: "model-no-cache-price",
100+
friendlyId: "llm_model_no_cache_price",
101+
modelName: "no-cache-price",
102+
matchPattern: "^no-cache-price$",
103+
startDate: null,
104+
pricingTiers: [
105+
{
106+
id: "tier-no-cache-price",
107+
name: "Standard",
108+
isDefault: true,
109+
priority: 0,
110+
conditions: [],
111+
prices: [
112+
{ usageType: "input", price: 0.000003 },
113+
{ usageType: "output", price: 0.000015 },
114+
],
115+
},
116+
],
117+
};
118+
72119
describe("ModelPricingRegistry", () => {
73120
let registry: TestableRegistry;
74121

75122
beforeEach(() => {
76123
registry = new TestableRegistry(null as any);
77-
registry.loadPatterns([gpt4o, claudeSonnet]);
124+
registry.loadPatterns([gpt4o, claudeSonnet, claudeWithCache, noCachePrice]);
78125
});
79126

80127
describe("match", () => {
@@ -129,18 +176,68 @@ describe("ModelPricingRegistry", () => {
129176
expect(result!.totalCost).toBeCloseTo(0.0035);
130177
});
131178

132-
it("should include cached token costs", () => {
179+
it("should include cached token costs and charge input only on the fresh portion", () => {
180+
// input_tokens (500) is inclusive of the 200 cached read tokens, so the input price
181+
// applies to the 300 fresh tokens and the cache price to the 200 cached tokens — the
182+
// cached tokens must not be billed twice.
133183
const result = registry.calculateCost("gpt-4o", {
134184
input: 500,
135185
output: 50,
136186
input_cached_tokens: 200,
137187
});
138188

139189
expect(result).not.toBeNull();
140-
expect(result!.costDetails["input"]).toBeCloseTo(0.00125); // 500 * 0.0000025
190+
expect(result!.costDetails["input"]).toBeCloseTo(0.00075); // (500 - 200) * 0.0000025
141191
expect(result!.costDetails["output"]).toBeCloseTo(0.0005); // 50 * 0.00001
142192
expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.00025); // 200 * 0.00000125
143-
expect(result!.totalCost).toBeCloseTo(0.002);
193+
expect(result!.totalCost).toBeCloseTo(0.0015);
194+
});
195+
196+
it("should not double-charge cache creation tokens (subset of input)", () => {
197+
// input (1000) is inclusive of both the 400 cache-read and 300 cache-creation tokens.
198+
const result = registry.calculateCost("claude-with-cache", {
199+
input: 1000,
200+
output: 100,
201+
input_cached_tokens: 400,
202+
cache_creation_input_tokens: 300,
203+
});
204+
205+
expect(result).not.toBeNull();
206+
// fresh input = 1000 - 400 - 300 = 300
207+
expect(result!.costDetails["input"]).toBeCloseTo(0.0009); // 300 * 0.000003
208+
expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.00012); // 400 * 0.0000003
209+
expect(result!.costDetails["cache_creation_input_tokens"]).toBeCloseTo(0.001125); // 300 * 0.00000375
210+
expect(result!.costDetails["output"]).toBeCloseTo(0.0015); // 100 * 0.000015
211+
// 0.0009 + 0.00012 + 0.001125 + 0.0015
212+
expect(result!.totalCost).toBeCloseTo(0.003645);
213+
});
214+
215+
it("should apply the cache-read discount when priced under a provider alias key", () => {
216+
// The usage is normalized to `input_cached_tokens` but this model prices cache reads
217+
// under `cache_read_input_tokens` — the discount must still apply.
218+
const result = registry.calculateCost("claude-with-cache", {
219+
input: 1000,
220+
input_cached_tokens: 400,
221+
});
222+
223+
expect(result).not.toBeNull();
224+
expect(result!.costDetails["input"]).toBeCloseTo(0.0018); // (1000 - 400) * 0.000003
225+
expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.00012); // 400 * 0.0000003
226+
expect(result!.totalCost).toBeCloseTo(0.00192);
227+
});
228+
229+
it("should fall back to the input price for cache tokens when no cache price exists", () => {
230+
// no-cache-price model has only input/output prices; cached tokens must still be billed
231+
// (at the input price) — never free, never double-charged. Total equals input * price.
232+
const result = registry.calculateCost("no-cache-price", {
233+
input: 1000,
234+
input_cached_tokens: 400,
235+
});
236+
237+
expect(result).not.toBeNull();
238+
expect(result!.costDetails["input"]).toBeCloseTo(0.0018); // (1000 - 400) * 0.000003
239+
expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.0012); // 400 * 0.000003
240+
expect(result!.totalCost).toBeCloseTo(0.003); // 1000 * 0.000003 — unchanged from no-cache behavior
144241
});
145242

146243
it("should return null for unknown model", () => {

internal-packages/llm-model-catalog/src/registry.ts

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,69 @@ export class ModelPricingRegistry {
147147
const costDetails: Record<string, number> = {};
148148
let totalCost = 0;
149149

150+
// `input_tokens` (the "input" usage value) is the TOTAL prompt token count and is
151+
// inclusive of cache-read and cache-creation tokens — providers report it that way and
152+
// the AI SDK passes it through (verified: total_tokens == input + output, never the
153+
// sum of the decomposed parts). Cache reads/writes are therefore a SUBSET of input, not
154+
// additional to it. Charging the full input count at the input price AND charging a
155+
// separate cache line double-counts those tokens, so the input price must apply only to
156+
// the fresh (non-cached) remainder.
157+
const priceByType = new Map(tier.prices.map((p) => [p.usageType, p.price]));
158+
const resolvePrice = (aliases: string[]): number | undefined => {
159+
for (const alias of aliases) {
160+
const price = priceByType.get(alias);
161+
if (price !== undefined) return price;
162+
}
163+
return undefined;
164+
};
165+
166+
const inputPrice = resolvePrice(["input", "input_tokens"]) ?? 0;
167+
const cacheReadTokens = usageDetails["input_cached_tokens"] ?? 0;
168+
const cacheCreationTokens = usageDetails["cache_creation_input_tokens"] ?? 0;
169+
170+
// Providers price cache reads/writes under provider-specific keys, but our usage details
171+
// normalize them to `input_cached_tokens` / `cache_creation_input_tokens`. Resolve the
172+
// matching price across the known aliases, falling back to the input price so cache tokens
173+
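// are never billed for free and never dropped when a model lacks a dedicated cache price.
// (Exception: if the tier has no input price either, the fallback is 0 and `addCost` below
// skips the line, so cache tokens are then not charged at all.)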
174+
const cacheReadPrice =
175+
resolvePrice(["input_cached_tokens", "input_cache_read", "cache_read_input_tokens"]) ??
176+
inputPrice;
177+
const cacheCreationPrice =
178+
resolvePrice([
179+
"cache_creation_input_tokens",
180+
"input_cache_creation",
181+
"input_cache_creation_5m",
182+
]) ?? inputPrice;
183+
184+
const totalInputTokens = usageDetails["input"] ?? usageDetails["input_tokens"] ?? 0;
185+
const freshInputTokens = Math.max(0, totalInputTokens - cacheReadTokens - cacheCreationTokens);
186+
187+
const addCost = (usageType: string, tokenCount: number, price: number) => {
188+
if (tokenCount <= 0 || price <= 0) return;
189+
const cost = tokenCount * price;
190+
costDetails[usageType] = (costDetails[usageType] ?? 0) + cost;
191+
totalCost += cost;
192+
};
193+
194+
addCost("input", freshInputTokens, inputPrice);
195+
addCost("input_cached_tokens", cacheReadTokens, cacheReadPrice);
196+
addCost("cache_creation_input_tokens", cacheCreationTokens, cacheCreationPrice);
197+
198+
// Charge every remaining usage type generically. The input + cache types are handled
199+
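// above (and their alias keys skipped here) so they are never charged twice.
// NOTE(review): `input_cache_creation_1h` is skipped here but is not among the aliases
// consulted for the cache-creation price above, so a price listed only under that key is
// never applied — confirm this is intentional.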
200+
const handledUsageTypes = new Set([
201+
"input",
202+
"input_tokens",
203+
"input_cached_tokens",
204+
"input_cache_read",
205+
"cache_read_input_tokens",
206+
"cache_creation_input_tokens",
207+
"input_cache_creation",
208+
"input_cache_creation_5m",
209+
"input_cache_creation_1h",
210+
]);
150211
for (const priceEntry of tier.prices) {
212+
if (handledUsageTypes.has(priceEntry.usageType)) continue;
151213
const tokenCount = usageDetails[priceEntry.usageType] ?? 0;
152214
if (tokenCount === 0) continue;
153215
const cost = tokenCount * priceEntry.price;

0 commit comments

Comments
 (0)