diff --git a/packages/cli/CHANGELOG.md b/packages/cli/CHANGELOG.md index c5f1b45..2cd2ac1 100644 --- a/packages/cli/CHANGELOG.md +++ b/packages/cli/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- **`burn limits` honors fidelity on its 5-hour forecast** ([#105](https://github.com/AgentWorkforce/burn/issues/105)). The forecast still consumes every windowed turn — partial / aggregate-only / cost-only data still contributes to the running token total — but `burn limits` now classifies the contributing slice via `summarizeFidelity` and surfaces a binary `high` / `low` confidence flag. Text mode appends a `forecast: low-confidence (N of M contributing turns lack per-turn token data)` notice when at least one contributing turn is missing per-turn token coverage; full-fidelity windows print no notice. `--json` output gains a `forecast.fidelity` block carrying the `confidence` flag and the underlying `FidelitySummary`. `--watch` re-evaluates confidence on each tick so the flag flips as fresher full-fidelity turns land. 
+ ## [0.33.0] - 2026-04-27 ### Added diff --git a/packages/cli/src/commands/limits.test.ts b/packages/cli/src/commands/limits.test.ts index 27fdc00..363d537 100644 --- a/packages/cli/src/commands/limits.test.ts +++ b/packages/cli/src/commands/limits.test.ts @@ -1,12 +1,15 @@ import { strict as assert } from 'node:assert'; import { describe, it } from 'node:test'; +import type { Fidelity } from '@relayburn/reader'; import { emptyFidelitySummary } from '@relayburn/analyze'; import { + deriveForecastFidelity, makeCachingFetcher, runLimits, type ForecastInput, + type ForecastResult, type LimitsDeps, type UsageResponse, } from './limits.js'; @@ -46,6 +49,49 @@ function fakeNow(): Date { return new Date(FIXED_NOW); } +const FULL_FIDELITY: Fidelity = { + granularity: 'per-turn', + class: 'full', + coverage: { + hasInputTokens: true, + hasOutputTokens: true, + hasReasoningTokens: true, + hasCacheReadTokens: true, + hasCacheCreateTokens: true, + hasToolCalls: true, + hasToolResultEvents: true, + hasSessionRelationships: true, + hasRawContent: true, + }, +}; + +const PARTIAL_FIDELITY: Fidelity = { + granularity: 'per-turn', + class: 'partial', + coverage: { + hasInputTokens: false, + hasOutputTokens: false, + hasReasoningTokens: false, + hasCacheReadTokens: false, + hasCacheCreateTokens: false, + hasToolCalls: true, + hasToolResultEvents: false, + hasSessionRelationships: false, + hasRawContent: false, + }, +}; + +// Wrap a `ForecastInput` in a high-confidence `ForecastResult` so existing +// tests (which only care about the numeric forecast) get a benign fidelity +// block by default. Tests that exercise low-confidence behavior pass an +// explicit `ForecastResult` instead. 
+function highConfidence(input: ForecastInput): ForecastResult { + return { + input, + fidelity: deriveForecastFidelity([{ fidelity: FULL_FIDELITY }]), + }; +} + function noTokenDeps(): LimitsDeps { return { loadToken: async () => null, @@ -61,7 +107,7 @@ function tokenDeps(usage: UsageResponse, forecast: ForecastInput | null = null): loadToken: async () => 'fake-token', fetchUsage: async () => usage, now: fakeNow, - loadForecast: async () => forecast, + loadForecast: async () => (forecast ? highConfidence(forecast) : null), loadPlanStatuses: async () => [], }; } @@ -152,7 +198,7 @@ describe('burn limits', () => { throw new Error('should not be called when --no-api'); }, now: fakeNow, - loadForecast: async () => forecast, + loadForecast: async () => highConfidence(forecast), }; const { result, stdout } = await captureStdout(() => runLimits(args({ 'no-api': true }), deps), @@ -335,6 +381,161 @@ describe('burn limits', () => { assert.equal(parsed.plans[0].limitedData, false); }); + it('high-confidence forecast (all full) renders no fidelity notice', async () => { + // Acceptance criteria #105: full-fidelity windows show no notice. 
+ const usage: UsageResponse = { + five_hour: { percent_used: 40, reset_at: '2026-04-24T14:00:00.000Z' }, + }; + const forecast: ForecastInput = { + tokensSoFar: 600_000, + elapsedMs: 2 * 60 * 60 * 1000, + remainingMs: 2 * 60 * 60 * 1000, + }; + const result: ForecastResult = { + input: forecast, + fidelity: deriveForecastFidelity([ + { fidelity: FULL_FIDELITY }, + { fidelity: FULL_FIDELITY }, + { fidelity: FULL_FIDELITY }, + ]), + }; + const deps: LimitsDeps = { + loadToken: async () => 'tok', + fetchUsage: async () => usage, + now: fakeNow, + loadForecast: async () => result, + loadPlanStatuses: async () => [], + }; + const { stdout } = await captureStdout(() => runLimits(args(), deps)); + assert.match(stdout, /burn rate/); + assert.doesNotMatch(stdout, /low-confidence/); + }); + + it('low-confidence forecast (one partial turn) appends a notice without refusing the projection', async () => { + // Acceptance criteria #105: rendered output shows a low-confidence notice + // when any contributing turn lacks per-turn token coverage; the forecast + // number itself is unchanged (still rendered). + const usage: UsageResponse = { + five_hour: { percent_used: 40, reset_at: '2026-04-24T14:00:00.000Z' }, + }; + const forecast: ForecastInput = { + tokensSoFar: 600_000, + elapsedMs: 2 * 60 * 60 * 1000, + remainingMs: 2 * 60 * 60 * 1000, + }; + const result: ForecastResult = { + input: forecast, + fidelity: deriveForecastFidelity([ + { fidelity: FULL_FIDELITY }, + { fidelity: FULL_FIDELITY }, + { fidelity: PARTIAL_FIDELITY }, + ]), + }; + const deps: LimitsDeps = { + loadToken: async () => 'tok', + fetchUsage: async () => usage, + now: fakeNow, + loadForecast: async () => result, + loadPlanStatuses: async () => [], + }; + const { stdout } = await captureStdout(() => runLimits(args(), deps)); + // Forecast is still rendered with both burn rate and projection. 
+ assert.match(stdout, /burn rate 5\.0k tok\/min/); + assert.match(stdout, /projected 80% at reset/); + // And a low-confidence notice is appended naming the count. + assert.match( + stdout, + /forecast: low-confidence \(1 of 3 contributing turns lack per-turn token data\)/, + ); + }); + + it('--json forecast block carries a fidelity sub-object with confidence + summary', async () => { + // Acceptance criteria #105: --json emits a fidelity block with confidence + // and the underlying FidelitySummary. + const usage: UsageResponse = { + five_hour: { percent_used: 40, reset_at: '2026-04-24T14:00:00.000Z' }, + }; + const forecast: ForecastInput = { + tokensSoFar: 600_000, + elapsedMs: 2 * 60 * 60 * 1000, + remainingMs: 2 * 60 * 60 * 1000, + }; + const result: ForecastResult = { + input: forecast, + fidelity: deriveForecastFidelity([ + { fidelity: FULL_FIDELITY }, + { fidelity: PARTIAL_FIDELITY }, + ]), + }; + const deps: LimitsDeps = { + loadToken: async () => 'tok', + fetchUsage: async () => usage, + now: fakeNow, + loadForecast: async () => result, + loadPlanStatuses: async () => [], + }; + const { stdout } = await captureStdout(() => runLimits(args({ json: true }), deps)); + const parsed = JSON.parse(stdout); + assert.ok(parsed.forecast.fidelity, 'forecast.fidelity present'); + assert.equal(parsed.forecast.fidelity.confidence, 'low'); + assert.equal(parsed.forecast.fidelity.summary.total, 2); + assert.equal(parsed.forecast.fidelity.summary.byClass.full, 1); + assert.equal(parsed.forecast.fidelity.summary.byClass.partial, 1); + assert.equal(parsed.forecast.fidelity.summary.unknown, 0); + }); + + it('--json forecast fidelity reports high confidence when every turn is full', async () => { + const usage: UsageResponse = { + five_hour: { percent_used: 40, reset_at: '2026-04-24T14:00:00.000Z' }, + }; + const forecast: ForecastInput = { + tokensSoFar: 600_000, + elapsedMs: 2 * 60 * 60 * 1000, + remainingMs: 2 * 60 * 60 * 1000, + }; + const { stdout } = await 
captureStdout(() => + runLimits(args({ json: true }), tokenDeps(usage, forecast)), + ); + const parsed = JSON.parse(stdout); + assert.equal(parsed.forecast.fidelity.confidence, 'high'); + assert.equal(parsed.forecast.fidelity.summary.total, 1); + assert.equal(parsed.forecast.fidelity.summary.byClass.full, 1); + }); + + it('--watch re-evaluates confidence each tick (low → high as full turns arrive)', async () => { + // Acceptance criteria #105: --watch re-evaluates confidence on each tick. + // We exercise renderOnce indirectly by toggling the loadForecast result + // between calls and checking that runLimits picks up the change. (We + // don't actually run the watch loop here — the loop just calls + // renderOnce repeatedly, which is what we test below.) + const usage: UsageResponse = { + five_hour: { percent_used: 40, reset_at: '2026-04-24T14:00:00.000Z' }, + }; + const forecastInput: ForecastInput = { + tokensSoFar: 600_000, + elapsedMs: 2 * 60 * 60 * 1000, + remainingMs: 2 * 60 * 60 * 1000, + }; + let tick = 0; + const deps: LimitsDeps = { + loadToken: async () => 'tok', + fetchUsage: async () => usage, + now: fakeNow, + loadForecast: async () => { + const turns = + tick++ === 0 + ? [{ fidelity: FULL_FIDELITY }, { fidelity: PARTIAL_FIDELITY }] + : [{ fidelity: FULL_FIDELITY }, { fidelity: FULL_FIDELITY }]; + return { input: forecastInput, fidelity: deriveForecastFidelity(turns) }; + }, + loadPlanStatuses: async () => [], + }; + const { stdout: first } = await captureStdout(() => runLimits(args(), deps)); + assert.match(first, /low-confidence/); + const { stdout: second } = await captureStdout(() => runLimits(args(), deps)); + assert.doesNotMatch(second, /low-confidence/); + }); + it('renders very-low projected % without double-normalizing back to 0..1', async () => { // Regression: projectFromOauth returns a value already on the 0..100 scale // (and capped at 100). 
If the renderer pipes that through the same diff --git a/packages/cli/src/commands/limits.ts b/packages/cli/src/commands/limits.ts index 8f168cc..291fd68 100644 --- a/packages/cli/src/commands/limits.ts +++ b/packages/cli/src/commands/limits.ts @@ -3,8 +3,11 @@ import { readFile } from 'node:fs/promises'; import { homedir } from 'node:os'; import * as path from 'node:path'; +import { summarizeFidelity } from '@relayburn/analyze'; +import type { FidelitySummary } from '@relayburn/analyze'; import { loadPlans, queryAll } from '@relayburn/ledger'; import type { Plan } from '@relayburn/ledger'; +import type { TurnRecord } from '@relayburn/reader'; import type { ParsedArgs } from '../args.js'; import { formatUsd } from '../format.js'; @@ -38,11 +41,35 @@ export interface ForecastInput { remainingMs: number; } +// Binary "is this forecast trustworthy?" flag, derived from the fidelity of +// the turns that contributed to `tokensSoFar`. `high` means every contributing +// turn has full per-turn token coverage (`full` or `usage-only` with input + +// output token counts). `low` means at least one turn is `partial` / +// `aggregate-only` / `cost-only` / `unknown` — the running token total still +// represents *something* meaningful (cost totals, summed aggregates), but the +// per-turn shape is fuzzy enough that the projection should be flagged. +export type ForecastConfidence = 'high' | 'low'; + +export interface ForecastFidelity { + confidence: ForecastConfidence; + // Count of turns whose per-turn token data is unreliable for forecasting. + // Equivalent to `total - (full + qualified usage-only)` — unknowns (records + // with no `fidelity` field) are counted here too, not excluded. + // Surfaced separately so the rendered notice can read "N of M". 
+ lowConfidenceTurns: number; + summary: FidelitySummary; +} + +export interface ForecastResult { + input: ForecastInput; + fidelity: ForecastFidelity; +} + export interface LimitsDeps { loadToken?: () => Promise<string | null>; fetchUsage?: (token: string) => Promise<UsageResponse>; now?: () => Date; - loadForecast?: (windowStartMs: number, nowMs: number) => Promise<ForecastInput | null>; + loadForecast?: (windowStartMs: number, nowMs: number) => Promise<ForecastResult | null>; loadPlanStatuses?: () => Promise<PlanStatus[]>; } @@ -87,12 +114,12 @@ export async function runLimits(args: ParsedArgs, deps: LimitsDeps = {}): Promis } const nowDate = now(); - let forecast: { window: ForecastWindow; data: ForecastInput } | null = null; + let forecast: { window: ForecastWindow; data: ForecastInput; fidelity: ForecastFidelity } | null = null; if (!noForecast) { const windowStartMs = forecastWindowStartMs(usage?.five_hour, nowDate); - const data = await loadForecast(windowStartMs, nowDate.getTime()); - if (data) { - forecast = { window: { startMs: windowStartMs }, data }; + const result = await loadForecast(windowStartMs, nowDate.getTime()); + if (result) { + forecast = { window: { startMs: windowStartMs }, data: result.input, fidelity: result.fidelity }; } } @@ -128,6 +155,10 @@ export async function runLimits(args: ParsedArgs, deps: LimitsDeps = {}): Promis remainingMs: forecast.data.remainingMs, burnRateTokensPerMinute: burnRatePerMinute(forecast.data), projectedPercentAtReset: projectedPercent, + fidelity: { + confidence: forecast.fidelity.confidence, + summary: forecast.fidelity.summary, + }, } : null, plans: planStatuses.map((s) => ({ @@ -181,7 +212,7 @@ interface ForecastWindow { function renderTty(opts: { usage: UsageResponse | null; usageError: string | null; - forecast: { window: ForecastWindow; data: ForecastInput } | null; + forecast: { window: ForecastWindow; data: ForecastInput; fidelity: ForecastFidelity } | null; projectedPercent: number | null; planStatuses: PlanStatus[]; now: Date; @@ -235,6 +266,18 @@ function renderTty(opts: { } lines.push(` 
${parts.join(', ')}`); } + // Low-confidence notice (#105): when one or more contributing turns lack + // per-turn token coverage the forecast number itself is unchanged — we + // still sum what's there — but the user should know the per-turn shape is + // fuzzy enough that the rate could be off. Full-fidelity windows print + // nothing here so the common case stays quiet. + if (forecast.fidelity.confidence === 'low') { + const total = forecast.fidelity.summary.total; + const lowN = forecast.fidelity.lowConfidenceTurns; + lines.push( + ` forecast: low-confidence (${lowN} of ${total} contributing turns lack per-turn token data)`, + ); + } } for (const status of planStatuses) { @@ -487,7 +530,7 @@ async function defaultLoadPlanStatuses(): Promise<PlanStatus[]> { async function loadForecastFromLedger( windowStartMs: number, nowMs: number, -): Promise<ForecastInput | null> { +): Promise<ForecastResult | null> { // Match the convention used by every other read-only command (summary, // by-tool, diagnose, …): sweep new session logs into the ledger before // querying, so the forecast reflects what just happened in the active // every ~5s stays cheap on a steady-state ledger. await ingestAll(); const since = new Date(windowStartMs).toISOString(); + // Permissive filter (#105): unlike `burn compare`, `limits` consumes the + // entire windowed slice — partial / aggregate-only / cost-only turns still + // contribute meaningful spend totals. We surface confidence separately + // rather than refusing data. 
const turns = await queryAll({ since, source: 'claude-code' }); if (turns.length === 0) return null; let tokens = 0; @@ -511,5 +558,39 @@ } const elapsedMs = Math.max(0, nowMs - windowStartMs); const remainingMs = Math.max(0, windowStartMs + SESSION_DURATION_MS - nowMs); - return { tokensSoFar: tokens, elapsedMs, remainingMs }; + return { + input: { tokensSoFar: tokens, elapsedMs, remainingMs }, + fidelity: deriveForecastFidelity(turns), + }; +} + +// Walk the windowed slice and decide whether the forecast deserves a "low +// confidence" flag. A turn is forecast-trustworthy when its fidelity class is +// `full`, or `usage-only` *and* both input and output token counts are +// covered. Anything else — `partial`, `aggregate-only`, `cost-only`, or a +// pre-#41 record with no fidelity at all — gets counted toward +// `lowConfidenceTurns` and flips the overall flag to `low`. +export function deriveForecastFidelity( + turns: ReadonlyArray<Pick<TurnRecord, 'fidelity'>>, +): ForecastFidelity { + const summary = summarizeFidelity(turns); + let lowConfidenceTurns = 0; + for (const t of turns) { + const f = t.fidelity; + if (!f) { + lowConfidenceTurns++; + continue; + } + if (f.class === 'full') continue; + if ( + f.class === 'usage-only' && + f.coverage.hasInputTokens && + f.coverage.hasOutputTokens + ) { + continue; + } + lowConfidenceTurns++; + } + const confidence: ForecastConfidence = lowConfidenceTurns === 0 ? 'high' : 'low'; + return { confidence, lowConfidenceTurns, summary }; }