diff --git a/packages/analyze/CHANGELOG.md b/packages/analyze/CHANGELOG.md index 05988d6..f647833 100644 --- a/packages/analyze/CHANGELOG.md +++ b/packages/analyze/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- **`compareFromArchive(query, opts)`** ([#88](https://github.com/AgentWorkforce/burn/issues/88)). New helper that builds a `CompareTable` directly from `archive.sqlite` via a single grouped `SELECT … GROUP BY model, activity, source` plus a tiny per-(model, activity) follow-up for median retries, instead of streaming every `EnrichedTurn` through `buildCompareTable` in memory. Returns `{ table, analyzedTurns }` so the caller can populate the same "turns analyzed" header the legacy path uses. Output is byte-identical to `buildCompareTable(await queryAll(q), opts)` for the parity fixture; per-source reasoning-mode handling (Codex's `included_in_output`) is preserved by grouping on `source` alongside `(model, activity)`. Powers the migration of `burn compare` to the archive read model. + ## [0.27.0] - 2026-04-26 ### Changed diff --git a/packages/analyze/src/compare-archive.test.ts b/packages/analyze/src/compare-archive.test.ts new file mode 100644 index 0000000..c02a76a --- /dev/null +++ b/packages/analyze/src/compare-archive.test.ts @@ -0,0 +1,560 @@ +import { strict as assert } from 'node:assert'; +import { mkdtemp, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import * as path from 'node:path'; +import { after, before, beforeEach, describe, it } from 'node:test'; + +import type { TurnRecord } from '@relayburn/reader'; +import { + appendTurns, + buildArchive, + queryAll, + stamp, + type Query, +} from '@relayburn/ledger'; + +import { buildCompareTable, type CompareOptions } from './compare.js'; +import { compareFromArchive } from './compare-archive.js'; +import { loadBuiltinPricing } from './pricing.js'; + +describe('compareFromArchive', () => { + let tmpDir: string; + const originalHome = process.env['RELAYBURN_HOME']; + // Per-test unique suffix folded into every messageId AND every default + // token count. Two layers of dedup live in the writer (`turnIdHash` keyed on + // (source|sessionId|messageId) and `turnContentFingerprint` keyed on + // (ts|model|input+output|cacheRead|cacheCreate*|firstToolArgsPrefix)), and + // both caches are module-scoped — they outlive a `RELAYBURN_HOME` reset. + // Bumping the suffix per test makes both ids and content fingerprints + // unique so the second test's appendTurns isn't silently deduped against + // the first's. Avoids needing to export the private + // __resetIndexCacheForTesting hook from @relayburn/ledger. + let testIdSuffix = 0; + function uid(label: string): string { + return `${label}-${testIdSuffix}`; + } + + // Build a TurnRecord with the minimum required fields. Tests override only + // the dimensions they care about; everything else defaults to a stable shape + // so parity comparisons aren't perturbed by unrelated fields. The default + // `usage.input` is keyed off `testIdSuffix` so two tests that don't override + // tokens still produce distinct content fingerprints (see comment above). + function fakeTurn(overrides: Partial = {}): TurnRecord { + return { + v: 1, + source: 'claude-code', + sessionId: uid('s-default'), + messageId: `m-${Math.random().toString(36).slice(2)}`, + turnIndex: 0, + ts: '2026-04-20T00:00:00.000Z', + model: 'claude-sonnet-4-6', + usage: { + input: 1000 + testIdSuffix * 17, + output: 500, + reasoning: 0, + cacheRead: 0, + cacheCreate5m: 0, + cacheCreate1h: 0, + }, + toolCalls: [], + project: '/tmp/project', + ...overrides, + }; + } + + before(async () => { + tmpDir = await mkdtemp(path.join(tmpdir(), 'relayburn-compare-archive-test-')); + }); + + beforeEach(async () => { + await rm(tmpDir, { recursive: true, force: true }); + tmpDir = await mkdtemp(path.join(tmpdir(), 'relayburn-compare-archive-test-')); + process.env['RELAYBURN_HOME'] = tmpDir; + testIdSuffix++; + }); + + after(async () => { + if (originalHome !== undefined) { + process.env['RELAYBURN_HOME'] = originalHome; + } else { + delete process.env['RELAYBURN_HOME']; + } + await rm(tmpDir, { recursive: true, force: true }); + }); + + // The headline acceptance criterion: same fixture, both code paths, same + // CompareTable + same analyzedTurns. We use a deliberately mixed fixture + // (multiple models, edit/non-edit categories, retries variation, + // cache-heavy turns, and an unpriced model) so the parity check exercises + // every cell-level metric and the sort orderings. + it('parity: matches in-memory buildCompareTable for a mixed fixture', async () => { + const pricing = await loadBuiltinPricing(); + const turns: TurnRecord[] = []; + let mid = 0; + const next = (): string => uid(`m-${++mid}`); + + // 6 Sonnet coding turns, 4 one-shot, varying retries & token weights. + const sSonnet = uid('s-sonnet'); + for (let i = 0; i < 4; i++) { + turns.push( + fakeTurn({ + messageId: next(), + sessionId: sSonnet, + turnIndex: i, + ts: `2026-04-20T00:00:${String(i).padStart(2, '0')}.000Z`, + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + retries: 0, + usage: { + input: 5000, + output: 800, + reasoning: 0, + cacheRead: 12000, + cacheCreate5m: 0, + cacheCreate1h: 0, + }, + }), + ); + } + turns.push( + fakeTurn({ + messageId: next(), + sessionId: sSonnet, + turnIndex: 4, + ts: '2026-04-20T00:00:04.000Z', + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + retries: 2, + }), + fakeTurn({ + messageId: next(), + sessionId: sSonnet, + turnIndex: 5, + ts: '2026-04-20T00:00:05.000Z', + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + retries: 1, + }), + ); + + // 5 Haiku coding turns, 2 one-shot. Shape the cacheRead/input mix + // differently so cacheHitRate parity is non-trivial. + const sHaiku = uid('s-haiku'); + for (let i = 0; i < 5; i++) { + turns.push( + fakeTurn({ + messageId: next(), + sessionId: sHaiku, + turnIndex: i, + ts: `2026-04-20T01:00:${String(i).padStart(2, '0')}.000Z`, + model: 'claude-haiku-4-5', + activity: 'coding', + hasEdits: true, + retries: i < 2 ? 0 : i, + usage: { + input: 2000, + output: 400, + reasoning: 0, + cacheRead: i % 2 === 0 ? 6000 : 0, + cacheCreate5m: 0, + cacheCreate1h: 0, + }, + }), + ); + } + + // Sonnet exploration (no edits). + const sExpl = uid('s-expl'); + turns.push( + fakeTurn({ + messageId: next(), + sessionId: sExpl, + model: 'claude-sonnet-4-6', + activity: 'exploration', + hasEdits: false, + }), + fakeTurn({ + messageId: next(), + sessionId: sExpl, + turnIndex: 1, + ts: '2026-04-20T02:00:01.000Z', + model: 'claude-sonnet-4-6', + activity: 'exploration', + hasEdits: false, + }), + ); + + // Unpriced model — exercises the costPerTurn=null branch for both paths. + turns.push( + fakeTurn({ + messageId: next(), + sessionId: uid('s-unpriced'), + model: 'definitely-not-a-model', + activity: 'coding', + hasEdits: true, + retries: 1, + }), + ); + + // Codex-source turn — exercises the per-source `included_in_output` + // override that compareFromArchive folds via grouping on (model, + // activity, source). + turns.push( + fakeTurn({ + messageId: next(), + sessionId: uid('s-codex'), + source: 'codex', + model: 'gpt-5-codex', + activity: 'coding', + hasEdits: true, + retries: 0, + usage: { + input: 10000, + output: 2000, + reasoning: 800, + cacheRead: 0, + cacheCreate5m: 0, + cacheCreate1h: 0, + }, + }), + ); + + await appendTurns(turns); + await buildArchive(); + + const opts: CompareOptions = { pricing, minSample: 5 }; + const inMemoryTurns = await queryAll({}); + const inMemory = buildCompareTable(inMemoryTurns, opts); + const fromArchive = await compareFromArchive({}, opts); + + assert.deepEqual(fromArchive.table.models, inMemory.models, 'models order'); + assert.deepEqual(fromArchive.table.categories, inMemory.categories, 'categories order'); + assert.deepEqual(fromArchive.table.minSample, inMemory.minSample); + assert.deepEqual(fromArchive.analyzedTurns, inMemoryTurns.length, 'analyzedTurns'); + + // Per-cell deep equality across every (model, category) pair. + for (const m of inMemory.models) { + for (const cat of inMemory.categories) { + const a = fromArchive.table.cells[m]![cat]!; + const b = inMemory.cells[m]![cat]!; + assert.equal(a.turns, b.turns, `${m}/${cat} turns`); + assert.equal(a.editTurns, b.editTurns, `${m}/${cat} editTurns`); + assert.equal(a.oneShotTurns, b.oneShotTurns, `${m}/${cat} oneShotTurns`); + assert.equal(a.pricedTurns, b.pricedTurns, `${m}/${cat} pricedTurns`); + assertNumNear(a.totalCost, b.totalCost, `${m}/${cat} totalCost`); + assertNumNear(a.costPerTurn, b.costPerTurn, `${m}/${cat} costPerTurn`); + assertNumNear(a.oneShotRate, b.oneShotRate, `${m}/${cat} oneShotRate`); + assertNumNear(a.cacheHitRate, b.cacheHitRate, `${m}/${cat} cacheHitRate`); + assert.equal(a.medianRetries, b.medianRetries, `${m}/${cat} medianRetries`); + assert.equal(a.noData, b.noData, `${m}/${cat} noData`); + assert.equal(a.insufficientSample, b.insufficientSample, `${m}/${cat} insufficientSample`); + } + } + + // Per-model totals must match (turns + totalCost) within float epsilon. + for (const m of inMemory.models) { + const a = fromArchive.table.totals[m]!; + const b = inMemory.totals[m]!; + assert.equal(a.turns, b.turns, `${m} totals.turns`); + assertNumNear(a.totalCost, b.totalCost, `${m} totals.totalCost`); + } + }); + + it('honors --models filter and pre-seeds requested-but-absent models', async () => { + const pricing = await loadBuiltinPricing(); + await appendTurns([ + fakeTurn({ + messageId: uid('m-1'), + sessionId: uid('s-1'), + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + retries: 0, + }), + fakeTurn({ + messageId: uid('m-2'), + sessionId: uid('s-2'), + model: 'claude-opus-4-7', + activity: 'coding', + hasEdits: true, + retries: 0, + }), + ]); + await buildArchive(); + + const result = await compareFromArchive( + {}, + { pricing, models: ['claude-sonnet-4-6', 'claude-haiku-4-5'] }, + ); + // Opus filtered out; Haiku pre-seeded as an empty column. + assert.deepEqual(result.table.models.sort(), ['claude-haiku-4-5', 'claude-sonnet-4-6']); + assert.equal(result.table.cells['claude-haiku-4-5']!['coding']!.noData, true); + assert.equal(result.table.totals['claude-haiku-4-5']!.turns, 0); + // analyzedTurns is the pre-`--models` count and must include both + // ledger turns (matches the legacy `queryAll(q).length` semantics). + assert.equal(result.analyzedTurns, 2); + }); + + it('honors --since', async () => { + const pricing = await loadBuiltinPricing(); + await appendTurns([ + fakeTurn({ + messageId: uid('m-old'), + sessionId: uid('s-1'), + ts: '2026-04-19T00:00:00.000Z', + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + retries: 0, + }), + fakeTurn({ + messageId: uid('m-new'), + sessionId: uid('s-2'), + ts: '2026-04-21T00:00:00.000Z', + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + retries: 0, + }), + ]); + await buildArchive(); + + const q: Query = { since: '2026-04-20T00:00:00.000Z' }; + const result = await compareFromArchive(q, { pricing }); + assert.equal(result.analyzedTurns, 1); + assert.equal(result.table.cells['claude-sonnet-4-6']!['coding']!.turns, 1); + }); + + it('honors --project (matches both literal project path and projectKey)', async () => { + const pricing = await loadBuiltinPricing(); + await appendTurns([ + fakeTurn({ + messageId: uid('m-a'), + sessionId: uid('s-1'), + project: '/tmp/proj-a', + projectKey: 'github.com/me/a', + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + }), + fakeTurn({ + messageId: uid('m-b'), + sessionId: uid('s-2'), + project: '/tmp/proj-b', + projectKey: 'github.com/me/b', + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + }), + ]); + await buildArchive(); + + // Literal path filter. + const byPath = await compareFromArchive({ project: '/tmp/proj-a' }, { pricing }); + assert.equal(byPath.analyzedTurns, 1); + + // projectKey filter. + const byKey = await compareFromArchive({ project: 'github.com/me/b' }, { pricing }); + assert.equal(byKey.analyzedTurns, 1); + }); + + it('honors --session', async () => { + const pricing = await loadBuiltinPricing(); + const sx = uid('s-X'); + await appendTurns([ + fakeTurn({ + messageId: uid('m-x'), + sessionId: sx, + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + }), + fakeTurn({ + messageId: uid('m-y'), + sessionId: uid('s-Y'), + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + }), + ]); + await buildArchive(); + + const result = await compareFromArchive({ sessionId: sx }, { pricing }); + assert.equal(result.analyzedTurns, 1); + }); + + it('honors --workflow and --agent (enrichment from stamps)', async () => { + const pricing = await loadBuiltinPricing(); + const sw = uid('s-W'); + await appendTurns([ + fakeTurn({ + messageId: uid('m-w'), + sessionId: sw, + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + }), + fakeTurn({ + messageId: uid('m-other'), + sessionId: uid('s-other'), + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + }), + ]); + await stamp({ sessionId: sw }, { workflowId: 'wf-42', agentId: 'agent-7' }); + await buildArchive(); + + const byWorkflow = await compareFromArchive( + { enrichment: { workflowId: 'wf-42' } }, + { pricing }, + ); + assert.equal(byWorkflow.analyzedTurns, 1); + + const byAgent = await compareFromArchive( + { enrichment: { agentId: 'agent-7' } }, + { pricing }, + ); + assert.equal(byAgent.analyzedTurns, 1); + + // A workflow that doesn't exist returns an empty table without throwing. + const empty = await compareFromArchive( + { enrichment: { workflowId: 'wf-nope' } }, + { pricing }, + ); + assert.equal(empty.analyzedTurns, 0); + assert.deepEqual(empty.table.models, []); + assert.deepEqual(empty.table.categories, []); + }); + + it('honors --min-sample (flags low-sample cells as insufficientSample)', async () => { + const pricing = await loadBuiltinPricing(); + await appendTurns([ + fakeTurn({ + messageId: uid('m-1'), + sessionId: uid('s-1'), + model: 'claude-sonnet-4-6', + activity: 'refactoring', + hasEdits: true, + }), + fakeTurn({ + messageId: uid('m-2'), + sessionId: uid('s-2'), + model: 'claude-sonnet-4-6', + activity: 'refactoring', + hasEdits: true, + }), + ]); + await buildArchive(); + + const result = await compareFromArchive({}, { pricing, minSample: 5 }); + const cell = result.table.cells['claude-sonnet-4-6']!['refactoring']!; + assert.equal(cell.turns, 2); + assert.equal(cell.insufficientSample, true); + assert.equal(cell.noData, false); + }); + + it('empty archive yields an empty table with analyzedTurns=0', async () => { + const pricing = await loadBuiltinPricing(); + await buildArchive(); + const result = await compareFromArchive({}, { pricing }); + assert.equal(result.analyzedTurns, 0); + assert.deepEqual(result.table.models, []); + assert.deepEqual(result.table.categories, []); + assert.deepEqual(result.table.totals, {}); + }); + + it('single-cell archive: one (model, activity) pair populates exactly one cell', async () => { + const pricing = await loadBuiltinPricing(); + await appendTurns([ + fakeTurn({ + messageId: uid('m-only'), + sessionId: uid('s-only'), + model: 'claude-sonnet-4-6', + activity: 'coding', + hasEdits: true, + retries: 0, + }), + ]); + await buildArchive(); + + const result = await compareFromArchive({}, { pricing }); + assert.deepEqual(result.table.models, ['claude-sonnet-4-6']); + assert.deepEqual(result.table.categories, ['coding']); + const cell = result.table.cells['claude-sonnet-4-6']!['coding']!; + assert.equal(cell.turns, 1); + assert.equal(cell.editTurns, 1); + assert.equal(cell.oneShotTurns, 1); + assert.equal(cell.medianRetries, 0); + assert.equal(cell.noData, false); + // Default minSample (5) makes this insufficient, which is the documented + // behavior — the cell still reports its metrics, just flagged. + assert.equal(cell.insufficientSample, true); + }); + + it('groups turns missing activity under "unclassified"', async () => { + const pricing = await loadBuiltinPricing(); + await appendTurns([ + fakeTurn({ messageId: uid('m-u1'), sessionId: uid('s-1'), model: 'claude-sonnet-4-6' }), + fakeTurn({ messageId: uid('m-u2'), sessionId: uid('s-2'), model: 'claude-sonnet-4-6' }), + ]); + await buildArchive(); + + const result = await compareFromArchive({}, { pricing }); + assert.ok(result.table.categories.includes('unclassified')); + assert.equal(result.table.cells['claude-sonnet-4-6']!['unclassified']!.turns, 2); + }); + + it('Codex turns bill reasoning as included_in_output (parity with costForTurn)', async () => { + // Regression guard for the source-aware reasoning override: Codex's + // output_tokens already includes reasoning, so reasoning_tokens must NOT + // be billed separately. compareFromArchive groups on `source` to apply + // this per-row before folding into the cell. If the override regressed, + // archive-path totalCost would diverge from in-memory costForTurn for + // any Codex turn with reasoning_tokens > 0. + const pricing = await loadBuiltinPricing(); + await appendTurns([ + fakeTurn({ + messageId: uid('m-cx'), + sessionId: uid('s-cx'), + source: 'codex', + model: 'gpt-5-codex', + activity: 'coding', + hasEdits: true, + retries: 0, + usage: { + input: 10000, + output: 2000, + reasoning: 800, + cacheRead: 0, + cacheCreate5m: 0, + cacheCreate1h: 0, + }, + }), + ]); + await buildArchive(); + + const opts: CompareOptions = { pricing }; + const inMemoryTurns = await queryAll({}); + const inMemory = buildCompareTable(inMemoryTurns, opts); + const fromArchive = await compareFromArchive({}, opts); + + const expected = inMemory.cells['gpt-5-codex']?.['coding']?.totalCost ?? 0; + const got = fromArchive.table.cells['gpt-5-codex']?.['coding']?.totalCost ?? 0; + assertNumNear(got, expected, 'Codex reasoning-mode parity'); + }); +}); + +// Numeric near-equality that handles `null` symmetrically. We use a generous +// epsilon (1e-9) because both paths sum floats but in different orders, so +// strict equality is brittle at the cent fraction. +function assertNumNear(a: number | null, b: number | null, msg: string): void { + if (a === null || b === null) { + assert.equal(a, b, msg); + return; + } + assert.ok(Math.abs(a - b) < 1e-9, `${msg}: ${a} != ${b}`); +} diff --git a/packages/analyze/src/compare-archive.ts b/packages/analyze/src/compare-archive.ts new file mode 100644 index 0000000..4f7ded3 --- /dev/null +++ b/packages/analyze/src/compare-archive.ts @@ -0,0 +1,370 @@ +import { openArchive, type Query } from '@relayburn/ledger'; + +import { + DEFAULT_MIN_SAMPLE, + type CompareCell, + type CompareOptions, + type CompareTable, +} from './compare.js'; +import { costForUsage, lookupModelRate } from './cost.js'; + +export interface CompareFromArchiveResult { + table: CompareTable; + /** + * Total turn count (pre-`--models` filter) matching `q`, used to populate + * the "turns analyzed" header line in text mode and the `analyzedTurns` + * field in `--json`. Mirrors `(await queryAll(q)).length` from the + * legacy path. + */ + analyzedTurns: number; +} + +/** + * Build a `CompareTable` from the analytics archive (`archive.sqlite`) + * instead of streaming the full ledger. Issued as a single grouped SQL + * query over `turns` plus a tiny per-(model, activity) follow-up for the + * median-retries quantile, and one top-level `COUNT(*)` for the + * `analyzedTurns` header. See issue #88. + * + * Behavior is byte-identical to `buildCompareTable(await queryAll(q), opts)` + * for the fixtures the parity tests cover. Cost math is computed in JS over + * SQL-aggregated token sums (linear, so equivalent to per-turn summation), + * with the source-specific reasoning-mode override (Codex's + * `included_in_output`) preserved by grouping on `source` alongside + * (model, activity) and folding into cells afterwards. + * + * Falling back to the legacy in-memory path is the caller's responsibility: + * see `runCompare` in `@relayburn/cli` for the `--no-archive` / + * `RELAYBURN_ARCHIVE=0` switch. + */ +export async function compareFromArchive( + q: Query, + opts: CompareOptions, +): Promise { + const minSample = opts.minSample ?? DEFAULT_MIN_SAMPLE; + const modelFilter = opts.models && opts.models.length > 0 ? new Set(opts.models) : null; + + const db = await openArchive(); + try { + const where = buildWhere(q); + // Group on (model, activity, source). Source is included so we can apply + // the per-source reasoning-mode override (Codex bills reasoning inside + // output) without losing per-turn fidelity. Within a single model cell, + // pricedTurns either equals turns (model has a rate) or 0 (no rate), + // because `lookupModelRate` is a function of model only. + const sql = ` + SELECT + COALESCE(NULLIF(model, ''), 'unknown') AS model, + COALESCE(activity, 'unclassified') AS activity, + source AS source, + COUNT(*) AS turns, + SUM(CASE WHEN has_edits = 1 THEN 1 ELSE 0 END) AS edit_turns, + SUM(CASE WHEN has_edits = 1 AND COALESCE(retries, 0) = 0 + THEN 1 ELSE 0 END) AS one_shot_turns, + SUM(input_tokens) AS input_tokens, + SUM(output_tokens) AS output_tokens, + SUM(reasoning_tokens) AS reasoning_tokens, + SUM(cache_read_tokens) AS cache_read_tokens, + SUM(cache_create_5m_tokens) AS cache_create_5m_tokens, + SUM(cache_create_1h_tokens) AS cache_create_1h_tokens + FROM turns + ${where.sql} + GROUP BY COALESCE(NULLIF(model, ''), 'unknown'), + COALESCE(activity, 'unclassified'), + source + `; + const rows = db.prepare(sql).all(...where.params) as Array<{ + model: string; + activity: string; + source: string; + turns: number | bigint; + edit_turns: number | bigint; + one_shot_turns: number | bigint; + input_tokens: number | bigint; + output_tokens: number | bigint; + reasoning_tokens: number | bigint; + cache_read_tokens: number | bigint; + cache_create_5m_tokens: number | bigint; + cache_create_1h_tokens: number | bigint; + }>; + + const byModelCategory = new Map>(); + const modelTotals = new Map(); + const modelSet = new Set(); + const categorySet = new Set(); + + // Pre-seed modelSet from the --models filter so a model the user + // explicitly asked about stays visible (as an all-empty column with + // coverage notes) even if zero turns matched. Mirrors the in-memory + // `buildCompareTable` behavior. + if (modelFilter) { + for (const m of modelFilter) { + modelSet.add(m); + modelTotals.set(m, { turns: 0, totalCost: 0 }); + } + } + + for (const r of rows) { + const model = r.model; + if (modelFilter && !modelFilter.has(model)) continue; + const cat = r.activity; + modelSet.add(model); + categorySet.add(cat); + + let byCat = byModelCategory.get(model); + if (!byCat) { + byCat = new Map(); + byModelCategory.set(model, byCat); + } + let acc = byCat.get(cat); + if (!acc) { + acc = newAccum(); + byCat.set(cat, acc); + } + + const turns = Number(r.turns); + const editTurns = Number(r.edit_turns); + const oneShotTurns = Number(r.one_shot_turns); + const inputTokens = Number(r.input_tokens); + const outputTokens = Number(r.output_tokens); + const reasoningTokens = Number(r.reasoning_tokens); + const cacheReadTokens = Number(r.cache_read_tokens); + const cacheCreate5mTokens = Number(r.cache_create_5m_tokens); + const cacheCreate1hTokens = Number(r.cache_create_1h_tokens); + + acc.turns += turns; + acc.editTurns += editTurns; + acc.oneShotTurns += oneShotTurns; + acc.cacheRead += cacheReadTokens; + acc.tokenDenominator += + inputTokens + cacheReadTokens + cacheCreate5mTokens + cacheCreate1hTokens; + + const mt = modelTotals.get(model) ?? { turns: 0, totalCost: 0 }; + mt.turns += turns; + + // Cost: applied per (source, model) group. lookupModelRate is a + // function of model only, so within a model cell pricedTurns is + // either `turns_in_group` (model priced) or 0 (model unpriced). + const rate = lookupModelRate(model, opts.pricing); + if (rate) { + const reasoningMode = r.source === 'codex' ? 'included_in_output' : undefined; + const breakdown = costForUsage( + { + input: inputTokens, + output: outputTokens, + reasoning: reasoningTokens, + cacheRead: cacheReadTokens, + cacheCreate5m: cacheCreate5mTokens, + cacheCreate1h: cacheCreate1hTokens, + }, + model, + opts.pricing, + reasoningMode ? { reasoningMode } : {}, + ); + if (breakdown) { + acc.pricedTurns += turns; + acc.totalCost += breakdown.total; + mt.totalCost += breakdown.total; + } + } + modelTotals.set(model, mt); + } + + // Per-cell median-retries follow-up. Only run for cells with editTurns > 0 + // — empty edit cells return medianRetries: null and need no follow-up. + // Each query is keyed on (source, session_id) indexes only at the WHERE + // level via (model, activity), but the planner uses idx_turns_model and + // idx_turns_activity, which is the documented use case for those + // indexes. For a typical cell this returns at most a few hundred rows. + const medianStmt = db.prepare( + `SELECT retries + FROM turns + ${where.sql ? where.sql + ' AND ' : 'WHERE '} + COALESCE(NULLIF(model, ''), 'unknown') = ? + AND COALESCE(activity, 'unclassified') = ? + AND has_edits = 1`, + ); + const cells: CompareTable['cells'] = {}; + const models = [...modelSet].sort((a, b) => { + const ca = modelTotals.get(a)?.totalCost ?? 0; + const cb = modelTotals.get(b)?.totalCost ?? 0; + if (cb !== ca) return cb - ca; + return a.localeCompare(b); + }); + const categories = [...categorySet].sort((a, b) => { + let ta = 0; + let tb = 0; + for (const m of models) { + ta += byModelCategory.get(m)?.get(a)?.turns ?? 0; + tb += byModelCategory.get(m)?.get(b)?.turns ?? 0; + } + if (tb !== ta) return tb - ta; + return a.localeCompare(b); + }); + for (const m of models) { + cells[m] = {}; + for (const cat of categories) { + const acc = byModelCategory.get(m)?.get(cat); + let medianRetries: number | null = null; + if (acc && acc.editTurns > 0) { + const retryRows = medianStmt.all( + ...where.params, + m, + cat, + ) as Array<{ retries: number | bigint | null }>; + const samples: number[] = []; + for (const rr of retryRows) { + samples.push(Number(rr.retries ?? 0)); + } + medianRetries = median(samples); + } + cells[m]![cat] = toCell(acc, minSample, medianRetries); + } + } + + const totals: CompareTable['totals'] = {}; + for (const [m, v] of modelTotals) totals[m] = v; + + // analyzedTurns is the pre-`--models` count over the same WHERE clause. + // Pre-models because the legacy path computes it as `turns.length` from + // queryAll(q), which doesn't see opts.models. + const countRow = db + .prepare(`SELECT COUNT(*) AS n FROM turns ${where.sql}`) + .get(...where.params) as { n: number | bigint }; + const analyzedTurns = Number(countRow.n); + + return { + table: { models, categories, cells, totals, minSample }, + analyzedTurns, + }; + } finally { + db.close(); + } +} + +interface BuiltWhere { + sql: string; + params: Array; +} + +function buildWhere(q: Query): BuiltWhere { + const conditions: string[] = []; + const params: string[] = []; + if (q.since) { + conditions.push('ts >= ?'); + params.push(q.since); + } + if (q.until) { + conditions.push('ts <= ?'); + params.push(q.until); + } + if (q.project) { + // Mirror the in-memory Query semantics: a project filter matches either + // the literal `project` path or the git-canonical `project_key`. + conditions.push('(project = ? OR project_key = ?)'); + params.push(q.project, q.project); + } + if (q.sessionId) { + conditions.push('session_id = ?'); + params.push(q.sessionId); + } + if (q.source) { + conditions.push('source = ?'); + params.push(q.source); + } + if (q.enrichment) { + for (const [k, v] of Object.entries(q.enrichment)) { + if (v === undefined) continue; + const col = ENRICHMENT_COLUMN[k]; + if (!col) { + // Unknown enrichment key — fall back to JSON_EXTRACT on the + // materialized `enrichment_json` column so callers keep working + // when they query a non-canonical key (matches the in-memory + // path's "any key in enrichment" semantics). + conditions.push(`COALESCE(json_extract(enrichment_json, ?), '') = ?`); + params.push(`$.${k}`, v); + continue; + } + conditions.push(`${col} = ?`); + params.push(v); + } + } + return { + sql: conditions.length === 0 ? '' : `WHERE ${conditions.join(' AND ')}`, + params, + }; +} + +// Canonical enrichment keys that have dedicated columns in the archive. +// Anything else falls through to a json_extract on `enrichment_json`. +const ENRICHMENT_COLUMN: Record = { + workflowId: 'workflow_id', + agentId: 'agent_id', + persona: 'persona', + tier: 'tier', +}; + +interface CellAccum { + turns: number; + editTurns: number; + oneShotTurns: number; + pricedTurns: number; + totalCost: number; + cacheRead: number; + tokenDenominator: number; +} + +function newAccum(): CellAccum { + return { + turns: 0, + editTurns: 0, + oneShotTurns: 0, + pricedTurns: 0, + totalCost: 0, + cacheRead: 0, + tokenDenominator: 0, + }; +} + +function toCell( + acc: CellAccum | undefined, + minSample: number, + medianRetries: number | null, +): CompareCell { + if (!acc || acc.turns === 0) { + return { + turns: 0, + editTurns: 0, + oneShotTurns: 0, + pricedTurns: 0, + totalCost: 0, + costPerTurn: null, + oneShotRate: null, + cacheHitRate: null, + medianRetries: null, + noData: true, + insufficientSample: false, + }; + } + return { + turns: acc.turns, + editTurns: acc.editTurns, + oneShotTurns: acc.oneShotTurns, + pricedTurns: acc.pricedTurns, + totalCost: acc.totalCost, + costPerTurn: acc.pricedTurns > 0 ? acc.totalCost / acc.pricedTurns : null, + oneShotRate: acc.editTurns > 0 ? acc.oneShotTurns / acc.editTurns : null, + cacheHitRate: acc.tokenDenominator > 0 ? acc.cacheRead / acc.tokenDenominator : null, + medianRetries: acc.editTurns > 0 ? medianRetries : null, + noData: false, + insufficientSample: acc.turns < minSample, + }; +} + +function median(xs: number[]): number { + if (xs.length === 0) return 0; + const s = [...xs].sort((a, b) => a - b); + const mid = Math.floor(s.length / 2); + return s.length % 2 === 0 ? (s[mid - 1]! + s[mid]!) / 2 : s[mid]!; +} + diff --git a/packages/analyze/src/index.ts b/packages/analyze/src/index.ts index e4c244b..455e92f 100644 --- a/packages/analyze/src/index.ts +++ b/packages/analyze/src/index.ts @@ -4,6 +4,8 @@ export { costForTurn, costForUsage, sumCosts } from './cost.js'; export type { CostBreakdown, CostForUsageOptions } from './cost.js'; export { buildCompareTable, DEFAULT_MIN_SAMPLE } from './compare.js'; export type { CompareCategory, CompareCell, CompareOptions, CompareTable } from './compare.js'; +export { compareFromArchive } from './compare-archive.js'; +export type { CompareFromArchiveResult } from './compare-archive.js'; export { attributeWaste, aggregateByFile, diff --git a/packages/cli/CHANGELOG.md b/packages/cli/CHANGELOG.md index f765751..83b9b4e 100644 --- a/packages/cli/CHANGELOG.md +++ b/packages/cli/CHANGELOG.md @@ -45,6 +45,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Execution-graph passthrough for Codex ingest** ([#87](https://github.com/AgentWorkforce/burn/issues/87)). `burn ingest` (and `burn codex`) now persist Codex `SessionRelationshipRecord`s and `ToolResultEventRecord`s the reader emits, alongside the existing turns / content lines, mirroring the Claude path landed in the previous release. The Codex cursor (`~/.relayburn/cursors.json`) gains `rootSessionEmitted`, `nextEventIndex`, and `toolResultCounters` so dedup of execution-graph rows survives across `burn` invocations even when the writer-side index isn't warm. +### Changed + +- **`burn compare` now reads from the SQLite archive by default** ([#88](https://github.com/AgentWorkforce/burn/issues/88)). The compare table is built from a single grouped `SELECT … GROUP BY model, activity, source` over `archive.sqlite` plus a tiny per-cell median-retries follow-up, instead of streaming every turn through `queryAll()` + an in-memory reduce. Output (text, CSV, `--json`) is byte-identical to the legacy path for the parity fixture; all existing flags (`--models`, `--since`, `--project`, `--session`, `--workflow`, `--agent`, `--min-sample`) work through the SQL path. New `--no-archive` flag (also honored via `RELAYBURN_ARCHIVE=0`) preserves the in-memory path as a parity-validation / safety-net fallback. + ## [0.25.0] - 2026-04-26 ### Added diff --git a/packages/cli/src/commands/compare.ts b/packages/cli/src/commands/compare.ts index cb55381..965410f 100644 --- a/packages/cli/src/commands/compare.ts +++ b/packages/cli/src/commands/compare.ts @@ -1,11 +1,12 @@ import { buildCompareTable, + compareFromArchive, DEFAULT_MIN_SAMPLE, loadPricing, type CompareCell, type CompareTable, } from '@relayburn/analyze'; -import { queryAll, type Query } from '@relayburn/ledger'; +import { buildArchive, queryAll, type Query } from '@relayburn/ledger'; import { ingestAll } from '../ingest.js'; import { formatInt, formatUsd, parseSinceArg } from '../format.js'; @@ -15,7 +16,8 @@ const COMPARE_HELP = `burn compare — per-(model, activity) comparison table Usage: burn compare [--models a,b] [--since 7d] [--project ] [--session ] - [--workflow ] [--agent ] [--min-sample ] [--json|--csv] + [--workflow ] [--agent ] [--min-sample ] + [--json|--csv] [--no-archive] Flags: --models comma-separated list of model names to include (default: all) @@ -29,6 +31,8 @@ Flags: --json emit a stable JSON object (analyzedTurns, models, categories, totals, cells[]) --csv emit a CSV with one row per (model, category) pair + --no-archive bypass the SQLite archive and stream the ledger directly + (legacy path; honored when env RELAYBURN_ARCHIVE=0) --help, -h show this message Cell metrics: @@ -84,14 +88,33 @@ export async function runCompare(args: ParsedArgs): Promise { await ingestAll(); const pricing = await loadPricing(); - const turns = await queryAll(q); const opts: Parameters[1] = { pricing, minSample }; if (models) opts.models = models; - const table = buildCompareTable(turns, opts); + + // Archive path is the default (#88). Fallback to the in-memory `queryAll` + // + `buildCompareTable` path is preserved behind `--no-archive` and the + // env override `RELAYBURN_ARCHIVE=0` for parity validation and as a + // safety net when the archive is missing or corrupt. + const useArchive = !shouldBypassArchive(args); + let table: CompareTable; + let analyzedTurns: number; + if (useArchive) { + // Materialize the ledger tail before reading. ingestAll() above only + // writes to the JSONL ledger; the archive is a derived read model that + // needs an explicit catch-up build to reflect the just-appended turns. + await buildArchive(); + const result = await compareFromArchive(q, opts); + table = result.table; + analyzedTurns = result.analyzedTurns; + } else { + const turns = await queryAll(q); + table = buildCompareTable(turns, opts); + analyzedTurns = turns.length; + } if (wantJson) { - process.stdout.write(JSON.stringify(toJson(table, turns.length), null, 2) + '\n'); + process.stdout.write(JSON.stringify(toJson(table, analyzedTurns), null, 2) + '\n'); return 0; } if (wantCsv) { @@ -99,10 +122,17 @@ export async function runCompare(args: ParsedArgs): Promise { return 0; } - process.stdout.write(renderTty(table, turns.length)); + process.stdout.write(renderTty(table, analyzedTurns)); return 0; } +function shouldBypassArchive(args: ParsedArgs): boolean { + if (args.flags['no-archive'] === true) return true; + const env = process.env['RELAYBURN_ARCHIVE']; + if (env === '0' || env === 'false' || env === 'no') return true; + return false; +} + function toJson(t: CompareTable, analyzedTurns: number): object { const cells: Array> = []; for (const m of t.models) { diff --git a/packages/ledger/src/archive.ts b/packages/ledger/src/archive.ts index 7dad762..0676f53 100644 --- a/packages/ledger/src/archive.ts +++ b/packages/ledger/src/archive.ts @@ -126,7 +126,7 @@ CREATE TABLE IF NOT EXISTS turns ( parent_tool_use_id TEXT, subagent_type TEXT, -- Free-form subagent description (Claude reader populates this from the - -- spawn payload). Persisted so `burn summary --subagent-tree --json` + -- spawn payload). Persisted so 'burn summary --subagent-tree --json' -- preserves SubagentTreeNode.description on archive-backed reads. subagent_description TEXT, input_tokens INTEGER NOT NULL DEFAULT 0,