fix(webapp,llm-model-catalog): stop double-counting cached input tokens

ericallam · ericallam · commit e7a728ffd9e3 · 2026-06-16T10:45:49.000+01:00
input_tokens is the total prompt count, inclusive of cache-read and
cache-creation tokens. The cost pipeline charged the full input count at
the input price and then added a separate cache line, so cached tokens
were billed twice (e.g. ~2.4x on OpenAI), and the cache hit-rate metric
divided cached reads by input + cached, understating the rate. Charge
the input price only on the fresh (non-cached) remainder, resolve cache
prices across provider alias keys (falling back to input price so cache
tokens are never free), and compute the hit rate as cached / input.
diff --git a/.server-changes/llm-cost-cached-token-double-charge.md b/.server-changes/llm-cost-cached-token-double-charge.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: fix
+---
+
+LLM cost no longer double-counts cached input tokens. Prompt-cache reads and writes are now billed once at their cache rate instead of also being charged at the full input price, so cost and cache hit-rate figures on the AI metrics dashboard and Models page are accurate.
diff --git a/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts b/apps/webapp/app/presenters/v3/BuiltInDashboards.server.ts
@@ -496,7 +496,7 @@ const llmDashboard: BuiltInDashboard = {
       "llm-cache-hit": {
         title: "Cache hit rate over time",
         query:
-          "SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens) + sum(cached_read_tokens), 0), 0), 1) AS cache_hit_pct FROM llm_metrics GROUP BY timeBucket ORDER BY timeBucket",
+          "SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens), 0), 0), 1) AS cache_hit_pct FROM llm_metrics GROUP BY timeBucket ORDER BY timeBucket",
         display: {
           type: "chart",
           chartType: "line",
@@ -528,7 +528,7 @@ const llmDashboard: BuiltInDashboard = {
       "llm-cache-savings": {
         title: "Cache savings over time",
         query:
-          "SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * (sum(input_cost) / nullIf(sum(input_tokens), 0)) - sum(cached_read_cost), 0), 4) AS cache_savings FROM llm_metrics WHERE cached_read_tokens > 0 GROUP BY timeBucket ORDER BY timeBucket",
+          "SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * (sum(input_cost) / nullIf(sum(input_tokens) - sum(cached_read_tokens) - sum(cache_creation_tokens), 0)) - sum(cached_read_cost), 0), 4) AS cache_savings FROM llm_metrics WHERE cached_read_tokens > 0 GROUP BY timeBucket ORDER BY timeBucket",
         display: {
           type: "chart",
           chartType: "bar",
@@ -544,7 +544,7 @@ const llmDashboard: BuiltInDashboard = {
       "llm-cache-by-model": {
         title: "Cache hit rate by model",
         query:
-          "SELECT response_model, round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens) + sum(cached_read_tokens), 0), 0), 1) AS cache_hit_pct, sum(cached_read_tokens) AS cached_tokens FROM llm_metrics GROUP BY response_model ORDER BY cached_tokens DESC LIMIT 20",
+          "SELECT response_model, round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens), 0), 0), 1) AS cache_hit_pct, sum(cached_read_tokens) AS cached_tokens FROM llm_metrics GROUP BY response_model ORDER BY cached_tokens DESC LIMIT 20",
         display: { type: "table", prettyFormatting: true, sorting: [] },
       },
     },
diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.models._index/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.models._index/route.tsx
@@ -1171,7 +1171,7 @@ function DetailYourUsageTab({
         <MetricWidget
           widgetKey={`${modelName}-user-cache-hit`}
           title="Cache hit rate over time"
-          query={`SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens) + sum(cached_read_tokens), 0), 0), 1) AS cache_hit_pct FROM llm_metrics WHERE response_model = '${escapeTSQL(
+          query={`SELECT timeBucket(), round(ifNull(sum(cached_read_tokens) * 100.0 / nullIf(sum(input_tokens), 0), 0), 1) AS cache_hit_pct FROM llm_metrics WHERE response_model = '${escapeTSQL(
             modelName
           )}' GROUP BY timeBucket ORDER BY timeBucket`}
           config={chartConfig({
diff --git a/internal-packages/llm-model-catalog/src/registry.test.ts b/internal-packages/llm-model-catalog/src/registry.test.ts
@@ -69,12 +69,59 @@ const claudeSonnet: LlmModelWithPricing = {
   ],
 };
 
+// Prices cache reads under the Anthropic-style alias `cache_read_input_tokens` (not
+// `input_cached_tokens`) plus a cache-creation price, to exercise alias resolution.
+const claudeWithCache: LlmModelWithPricing = {
+  id: "model-claude-with-cache",
+  friendlyId: "llm_model_claude_with_cache",
+  modelName: "claude-with-cache",
+  matchPattern: "^claude-with-cache$",
+  startDate: null,
+  pricingTiers: [
+    {
+      id: "tier-claude-with-cache",
+      name: "Standard",
+      isDefault: true,
+      priority: 0,
+      conditions: [],
+      prices: [
+        { usageType: "input", price: 0.000003 }, // per-token price; the tests multiply it by a token count
+        { usageType: "output", price: 0.000015 },
+        { usageType: "cache_read_input_tokens", price: 0.0000003 }, // one tenth of the input price
+        { usageType: "cache_creation_input_tokens", price: 0.00000375 }, // 1.25x the input price
+      ],
+    },
+  ],
+};
+
+// No cache prices at all — cached tokens should fall back to the input price.
+const noCachePrice: LlmModelWithPricing = {
+  id: "model-no-cache-price",
+  friendlyId: "llm_model_no_cache_price",
+  modelName: "no-cache-price",
+  matchPattern: "^no-cache-price$",
+  startDate: null,
+  pricingTiers: [
+    {
+      id: "tier-no-cache-price",
+      name: "Standard",
+      isDefault: true,
+      priority: 0,
+      conditions: [],
+      prices: [
+        { usageType: "input", price: 0.000003 }, // same input price as claudeWithCache
+        { usageType: "output", price: 0.000015 }, // deliberately no cache-read / cache-creation entries
+      ],
+    },
+  ],
+};
+
 describe("ModelPricingRegistry", () => {
   let registry: TestableRegistry;
 
   beforeEach(() => {
     registry = new TestableRegistry(null as any);
-    registry.loadPatterns([gpt4o, claudeSonnet]);
+    registry.loadPatterns([gpt4o, claudeSonnet, claudeWithCache, noCachePrice]);
   });
 
   describe("match", () => {
@@ -129,18 +176,68 @@ describe("ModelPricingRegistry", () => {
       expect(result!.totalCost).toBeCloseTo(0.0035);
     });
 
-    it("should include cached token costs", () => {
+    it("should include cached token costs and charge input only on the fresh portion", () => {
+      // input_tokens (500) includes the 200 cached reads: input price applies to the 300 fresh
+      // tokens, cache price to the 200 cached ones. Explicit precision is required because
+      // toBeCloseTo defaults to 2 digits (±0.005), which the old double-charged values also pass.
       const result = registry.calculateCost("gpt-4o", {
         input: 500,
         output: 50,
         input_cached_tokens: 200,
       });
 
       expect(result).not.toBeNull();
-      expect(result!.costDetails["input"]).toBeCloseTo(0.00125); // 500 * 0.0000025
+      expect(result!.costDetails["input"]).toBeCloseTo(0.00075, 9); // (500 - 200) * 0.0000025
       expect(result!.costDetails["output"]).toBeCloseTo(0.0005); // 50 * 0.00001
       expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.00025); // 200 * 0.00000125
-      expect(result!.totalCost).toBeCloseTo(0.002);
+      expect(result!.totalCost).toBeCloseTo(0.0015, 9);
+    });
+
+    it("should not double-charge cache creation tokens (subset of input)", () => {
+      // input (1000) is inclusive of both the 400 cache-read and 300 cache-creation tokens.
+      const result = registry.calculateCost("claude-with-cache", {
+        input: 1000,
+        output: 100,
+        input_cached_tokens: 400,
+        cache_creation_input_tokens: 300,
+      });
+
+      expect(result).not.toBeNull();
+      // fresh input = 1000 - 400 - 300 = 300
+      expect(result!.costDetails["input"]).toBeCloseTo(0.0009, 9); // 300 * 0.000003
+      expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.00012, 9); // 400 * 0.0000003
+      expect(result!.costDetails["cache_creation_input_tokens"]).toBeCloseTo(0.001125, 9); // 300 * 0.00000375
+      expect(result!.costDetails["output"]).toBeCloseTo(0.0015, 9); // 100 * 0.000015
+      // 0.0009 + 0.00012 + 0.001125 + 0.0015 (precision 9: the 2-digit default accepts ±0.005)
+      expect(result!.totalCost).toBeCloseTo(0.003645, 9);
+    });
+
+    it("should apply the cache-read discount when priced under a provider alias key", () => {
+      // The usage is normalized to `input_cached_tokens` but this model prices cache reads
+      // under `cache_read_input_tokens` — the discount must still apply.
+      const result = registry.calculateCost("claude-with-cache", {
+        input: 1000,
+        input_cached_tokens: 400,
+      });
+
+      expect(result).not.toBeNull();
+      expect(result!.costDetails["input"]).toBeCloseTo(0.0018, 9); // (1000 - 400) * 0.000003
+      expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.00012, 9); // 400 * 0.0000003
+      expect(result!.totalCost).toBeCloseTo(0.00192, 9); // precision 9: the 2-digit default accepts ±0.005
+    });
+
+    it("should fall back to the input price for cache tokens when no cache price exists", () => {
+      // no-cache-price model has only input/output prices; cached tokens must still be billed
+      // (at the input price): never free, never double-charged. Precision 9, as the 2-digit default accepts ±0.005.
+      const result = registry.calculateCost("no-cache-price", {
+        input: 1000,
+        input_cached_tokens: 400,
+      });
+
+      expect(result).not.toBeNull();
+      expect(result!.costDetails["input"]).toBeCloseTo(0.0018, 9); // (1000 - 400) * 0.000003
+      expect(result!.costDetails["input_cached_tokens"]).toBeCloseTo(0.0012, 9); // 400 * 0.000003
+      expect(result!.totalCost).toBeCloseTo(0.003, 9); // 1000 * 0.000003 — unchanged from no-cache behavior
     });
 
     it("should return null for unknown model", () => {
diff --git a/internal-packages/llm-model-catalog/src/registry.ts b/internal-packages/llm-model-catalog/src/registry.ts
@@ -147,7 +147,69 @@ export class ModelPricingRegistry {
     const costDetails: Record<string, number> = {};
     let totalCost = 0;
 
+    // `input_tokens` (the "input" usage value) is the TOTAL prompt token count and is
+    // inclusive of cache-read and cache-creation tokens — providers report it that way and
+    // the AI SDK passes it through (verified: total_tokens == input + output, never the
+    // sum of the decomposed parts). Cache reads/writes are therefore a SUBSET of input, not
+    // additional to it. Charging the full input count at the input price AND charging a
+    // separate cache line double-counts those tokens, so the input price must apply only to
+    // the fresh (non-cached) remainder.
+    const priceByType = new Map(tier.prices.map((p) => [p.usageType, p.price]));
+    const resolvePrice = (aliases: string[]): number | undefined => {
+      for (const alias of aliases) {
+        const price = priceByType.get(alias);
+        if (price !== undefined) return price;
+      }
+      return undefined;
+    };
+
+    const inputPrice = resolvePrice(["input", "input_tokens"]) ?? 0;
+    const cacheReadTokens = usageDetails["input_cached_tokens"] ?? 0;
+    const cacheCreationTokens = usageDetails["cache_creation_input_tokens"] ?? 0;
+
+    // Providers price cache reads/writes under provider-specific keys, but our usage details
+    // normalize them to `input_cached_tokens` / `cache_creation_input_tokens`. Resolve the
+    // matching price across the known aliases, falling back to the input price so cache tokens
+    // are never billed for free and never dropped when a model lacks a dedicated cache price.
+    const cacheReadPrice =
+      resolvePrice(["input_cached_tokens", "input_cache_read", "cache_read_input_tokens"]) ??
+      inputPrice;
+    const cacheCreationPrice =
+      resolvePrice([
+        "cache_creation_input_tokens",
+        "input_cache_creation",
+        "input_cache_creation_5m",
+      ]) ?? inputPrice;
+
+    const totalInputTokens = usageDetails["input"] ?? usageDetails["input_tokens"] ?? 0;
+    const freshInputTokens = Math.max(0, totalInputTokens - cacheReadTokens - cacheCreationTokens);
+
+    const addCost = (usageType: string, tokenCount: number, price: number) => {
+      if (tokenCount <= 0 || price <= 0) return;
+      const cost = tokenCount * price;
+      costDetails[usageType] = (costDetails[usageType] ?? 0) + cost;
+      totalCost += cost;
+    };
+
+    addCost("input", freshInputTokens, inputPrice);
+    addCost("input_cached_tokens", cacheReadTokens, cacheReadPrice);
+    addCost("cache_creation_input_tokens", cacheCreationTokens, cacheCreationPrice);
+
+    // Charge every remaining usage type generically. The input + cache types are handled
+    // above (and their alias keys skipped here) so they are never charged twice.
+    const handledUsageTypes = new Set([
+      "input",
+      "input_tokens",
+      "input_cached_tokens",
+      "input_cache_read",
+      "cache_read_input_tokens",
+      "cache_creation_input_tokens",
+      "input_cache_creation",
+      "input_cache_creation_5m",
+      "input_cache_creation_1h",
+    ]);
     for (const priceEntry of tier.prices) {
+      if (handledUsageTypes.has(priceEntry.usageType)) continue;
       const tokenCount = usageDetails[priceEntry.usageType] ?? 0;
       if (tokenCount === 0) continue;
       const cost = tokenCount * priceEntry.price;