diff --git a/CLAUDE.md b/CLAUDE.md
index 08e57f3..77cb80c 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -20,7 +20,9 @@ This is a Node 20+ TypeScript monorepo using npm workspaces and project referenc
 - Dev CLI (no build, via tsx): `npm.cmd run dev -- <args>`
 - Built CLI: `npm.cmd run myagent -- <args>` (requires `build` first)
 
-The CLI binary entry is `packages/cli/dist/index.js` (exposed as `myagent`). The full subcommand surface is documented by `myagent --help` — notable: `chat`, `agent`, `tui`, `memory`, `skill`, `mcp`, `task`, `remote`, `profile`, `resume`, `compact`, `week12 audit`, `week18 finalize`.
+The CLI binary entry is `packages/cli/dist/index.js` (exposed as `myagent`). The full subcommand surface is documented by `myagent --help` — notable: `chat`, `agent`, `tui`, `memory`, `skill`, `mcp`, `task`, `remote`, `profile`, `usage`, `resume`, `compact`, `week12 audit`, `week18 finalize`, `eval run`.
+
+`myagent eval run` is the offline fixture-based agent regression suite ([packages/cli/src/eval.ts](packages/cli/src/eval.ts)): 5 inline `EvalTask`s drive the real `query()` loop with deterministic `FakeModel` scripts (read-only analysis, safe edit, read-only Bash, plan-mode-blocks-Write permission enforcement, explore sub-agent). Token and cost numbers are deterministic because `FakeModelStep`'s `assistant_message` accepts an optional scripted `usage` and cost is computed from fixed reference rates (`EVAL_REFERENCE_RATES`) rather than the caller's environment; the suite reports pass/turns/in/out/cache/cost per task plus totals, written under `.myagent/evals/runs/<runId>/` (`REPORT.md` plus one JSON result per task in `transcripts/`). It is also a CI gate via `packages/cli/test/eval.test.ts` (asserts all tasks pass and pins the metric fingerprint) — a behavior regression in the agent loop flips it red.
 
 ### Environment
 
diff --git a/packages/cli/src/eval.ts b/packages/cli/src/eval.ts
new file mode 100644
index 0000000..7d59edf
--- /dev/null
+++ b/packages/cli/src/eval.ts
@@ -0,0 +1,523 @@
+import { mkdir, readFile, writeFile } from "node:fs/promises";
+import { join } from "node:path";
+
+import {
+  FakeModel,
+  collectQuery,
+  estimateUsageCostUsd,
+  type CostRates,
+  type FakeModelStep,
+  type LoopEvent,
+  type ModelUsage,
+  type PermissionMode,
+  type TerminalState
+} from "@mini-claude-code/core";
+import { createProjectToolRegistry } from "@mini-claude-code/tools";
+
+// Fixed reference pricing so the eval's cost column is a deterministic,
+// reproducible regression signal regardless of the caller's environment.
+// Roughly Sonnet-class rates; the absolute number does not matter, only
+// that it never changes between runs unless token counts do.
+const EVAL_REFERENCE_RATES: CostRates = {
+  inputUsdPerMillionTokens: 3,
+  outputUsdPerMillionTokens: 15,
+  cacheWriteUsdPerMillionTokens: 3.75,
+  cacheReadUsdPerMillionTokens: 0.3
+};
+
+export type EvalSuiteOptions = {
+  cwd: string;
+  outputRootDir?: string;
+  now?: Date;
+};
+
+export type EvalTaskMetrics = {
+  turns: number;
+  inputTokens: number;
+  outputTokens: number;
+  cacheCreationInputTokens: number;
+  cacheReadInputTokens: number;
+  costUsd: number;
+};
+
+export type EvalTaskResult = {
+  taskId: string;
+  title: string;
+  category: "read_only" | "safe_edit" | "bash" | "permission" | "sub_agent";
+  prompt: string;
+  permissionMode: PermissionMode;
+  transcriptPath: string;
+  terminalState: TerminalState;
+  passed: boolean;
+  notes: string[];
+  metrics: EvalTaskMetrics;
+};
+
+export type EvalSuiteReport = {
+  runId: string;
+  rootDir: string;
+  fixtureDir: string;
+  status: "passed" | "failed";
+  tasks: EvalTaskResult[];
+  totals: EvalTaskMetrics & { taskCount: number; passedCount: number };
+  reportPath: string;
+};
+
+type EvalTask = {
+  taskId: string;
+  title: string;
+  category: EvalTaskResult["category"];
+  prompt: string;
+  permissionMode: PermissionMode;
+  maxTurns?: number;
+  script: FakeModelStep[];
+  validate(fixtureDir: string, events: readonly LoopEvent[]): Promise<string[]>;
+};
+
+export async function runEvalSuite(options: EvalSuiteOptions): Promise<EvalSuiteReport> {
+  const now = options.now ?? new Date();
+  const runId = `eval_${compactTimestamp(now.toISOString())}`;
+  const rootDir = join(options.outputRootDir ?? join(options.cwd, ".myagent", "evals", "runs"), runId);
+  const fixtureDir = join(rootDir, "fixture-project");
+  const transcriptDir = join(rootDir, "transcripts");
+
+  await writeEvalFixture(fixtureDir);
+  await mkdir(transcriptDir, { recursive: true });
+
+  const tasks: EvalTaskResult[] = [];
+  for (const task of createEvalTasks()) {
+    const toolRegistry = createProjectToolRegistry();
+    const events = await collectQuery({
+      model: new FakeModel(task.script),
+      initialMessages: [{ role: "user", content: task.prompt }],
+      tools: toolRegistry,
+      toolContext: { cwd: fixtureDir },
+      permissionMode: task.permissionMode,
+      maxTurns: task.maxTurns ?? 8
+    });
+
+    const terminalState = finalTerminalState(events);
+    const notes = [
+      ...terminalNotes(terminalState),
+      ...toolResultNotes(events, task),
+      ...(await task.validate(fixtureDir, events))
+    ];
+    const passed = terminalState.status === "completed" && notes.length === 0;
+    const metrics = computeMetrics(events);
+    const transcriptPath = join(transcriptDir, `${task.taskId}.json`);
+    const result: EvalTaskResult = {
+      taskId: task.taskId,
+      title: task.title,
+      category: task.category,
+      prompt: task.prompt,
+      permissionMode: task.permissionMode,
+      transcriptPath,
+      terminalState,
+      passed,
+      notes,
+      metrics
+    };
+    await writeFile(transcriptPath, `${JSON.stringify(result, null, 2)}\n`, "utf8");
+    tasks.push(result);
+  }
+
+  const totals = tasks.reduce(
+    (acc, t) => ({
+      taskCount: acc.taskCount + 1,
+      passedCount: acc.passedCount + (t.passed ? 1 : 0),
+      turns: acc.turns + t.metrics.turns,
+      inputTokens: acc.inputTokens + t.metrics.inputTokens,
+      outputTokens: acc.outputTokens + t.metrics.outputTokens,
+      cacheCreationInputTokens: acc.cacheCreationInputTokens + t.metrics.cacheCreationInputTokens,
+      cacheReadInputTokens: acc.cacheReadInputTokens + t.metrics.cacheReadInputTokens,
+      costUsd: acc.costUsd + t.metrics.costUsd
+    }),
+    {
+      taskCount: 0,
+      passedCount: 0,
+      turns: 0,
+      inputTokens: 0,
+      outputTokens: 0,
+      cacheCreationInputTokens: 0,
+      cacheReadInputTokens: 0,
+      costUsd: 0
+    }
+  );
+
+  const reportPath = join(rootDir, "REPORT.md");
+  const report: EvalSuiteReport = {
+    runId,
+    rootDir,
+    fixtureDir,
+    status: tasks.every((t) => t.passed) ? "passed" : "failed",
+    tasks,
+    totals,
+    reportPath
+  };
+  await writeFile(reportPath, renderEvalMarkdown(report), "utf8");
+  return report;
+}
+
+export function formatEvalReport(report: EvalSuiteReport): string {
+  const lines = [
+    `[eval] ${report.status}`,
+    `run: ${report.rootDir}`,
+    `fixture: ${report.fixtureDir}`,
+    "tasks:"
+  ];
+  for (const t of report.tasks) {
+    const m = t.metrics;
+    lines.push(
+      `- ${t.taskId}: ${t.passed ? "passed" : "failed"} (${t.category}) ` +
+        `turns=${m.turns} in=${m.inputTokens} out=${m.outputTokens} ` +
+        `cache_w=${m.cacheCreationInputTokens} cache_r=${m.cacheReadInputTokens} ` +
+        `cost=$${m.costUsd.toFixed(4)} -> ${t.transcriptPath}`
+    );
+    if (!t.passed && t.notes.length > 0) {
+      lines.push(`  notes: ${t.notes.join("; ")}`);
+    }
+  }
+  const tot = report.totals;
+  lines.push(
+    `totals: tasks=${tot.taskCount} passed=${tot.passedCount} turns=${tot.turns} ` +
+      `in=${tot.inputTokens} out=${tot.outputTokens} ` +
+      `cache_w=${tot.cacheCreationInputTokens} cache_r=${tot.cacheReadInputTokens} ` +
+      `cost=$${tot.costUsd.toFixed(4)}`
+  );
+  lines.push(`report: ${report.reportPath}`);
+  return `${lines.join("\n")}\n`;
+}
+
+function computeMetrics(events: readonly LoopEvent[]): EvalTaskMetrics {
+  let turns = 0;
+  const summed: ModelUsage = {
+    inputTokens: 0,
+    outputTokens: 0,
+    cacheCreationInputTokens: 0,
+    cacheReadInputTokens: 0
+  };
+  for (const event of events) {
+    if (event.type !== "assistant_message") {
+      continue;
+    }
+    turns += 1;
+    const u = event.usage;
+    if (u) {
+      summed.inputTokens = (summed.inputTokens ?? 0) + (u.inputTokens ?? 0);
+      summed.outputTokens = (summed.outputTokens ?? 0) + (u.outputTokens ?? 0);
+      summed.cacheCreationInputTokens =
+        (summed.cacheCreationInputTokens ?? 0) + (u.cacheCreationInputTokens ?? 0);
+      summed.cacheReadInputTokens =
+        (summed.cacheReadInputTokens ?? 0) + (u.cacheReadInputTokens ?? 0);
+    }
+  }
+  return {
+    turns,
+    inputTokens: summed.inputTokens ?? 0,
+    outputTokens: summed.outputTokens ?? 0,
+    cacheCreationInputTokens: summed.cacheCreationInputTokens ?? 0,
+    cacheReadInputTokens: summed.cacheReadInputTokens ?? 0,
+    costUsd: estimateUsageCostUsd(summed, EVAL_REFERENCE_RATES)
+  };
+}
+
+function usage(
+  inputTokens: number,
+  outputTokens: number,
+  cacheCreationInputTokens = 0,
+  cacheReadInputTokens = 0
+): ModelUsage {
+  return { inputTokens, outputTokens, cacheCreationInputTokens, cacheReadInputTokens };
+}
+
+function createEvalTasks(): EvalTask[] {
+  return [
+    {
+      taskId: "read-only-analysis",
+      title: "Read-only project analysis (Glob + Read in plan mode)",
+      category: "read_only",
+      prompt: "Summarize the fixture project and its math helper.",
+      permissionMode: "plan",
+      script: [
+        {
+          type: "assistant_message",
+          content: "I will inspect the fixture without modifying anything.",
+          usage: usage(1500, 60, 1500, 0)
+        },
+        { type: "tool_use", toolUse: { id: "ev_glob", name: "Glob", input: { pattern: "**/*" } } },
+        { type: "tool_use", toolUse: { id: "ev_read", name: "Read", input: { path: "README.md" } } },
+        { type: "turn_break" },
+        {
+          type: "assistant_message",
+          content: "The fixture has README.md and src/math.ts (an add helper).",
+          usage: usage(300, 120, 0, 1500)
+        }
+      ],
+      async validate(_fixtureDir, events) {
+        const names = toolUseNames(events);
+        return names.includes("Glob") && names.includes("Read")
+          ? []
+          : ["read-only analysis did not exercise both Glob and Read"];
+      }
+    },
+    {
+      taskId: "safe-edit",
+      title: "Read-before-write safe edit (bypassPermissions)",
+      category: "safe_edit",
+      prompt: "Fix the add helper in src/math.ts so it actually adds.",
+      permissionMode: "bypassPermissions",
+      script: [
+        {
+          type: "assistant_message",
+          content: "I will read the file before editing it.",
+          usage: usage(1600, 40, 1600, 0)
+        },
+        { type: "tool_use", toolUse: { id: "ev_read_math", name: "Read", input: { path: "src/math.ts" } } },
+        { type: "turn_break" },
+        {
+          type: "assistant_message",
+          content: "It subtracts; I'll make a minimal one-line edit.",
+          usage: usage(400, 50, 0, 1600)
+        },
+        {
+          type: "tool_use",
+          toolUse: {
+            id: "ev_edit_math",
+            name: "Edit",
+            input: { path: "src/math.ts", oldString: "return a - b;", newString: "return a + b;" }
+          }
+        },
+        { type: "turn_break" },
+        {
+          type: "assistant_message",
+          content: "The add helper now returns a + b.",
+          usage: usage(450, 30, 0, 1600)
+        }
+      ],
+      async validate(fixtureDir, _events) {
+        const content = await readFile(join(fixtureDir, "src", "math.ts"), "utf8");
+        return content.includes("return a + b;")
+          ? []
+          : ["safe-edit did not produce the corrected add helper"];
+      }
+    },
+    {
+      taskId: "bash-readonly",
+      title: "Read-only Bash whitelist (pwd) in plan mode",
+      category: "bash",
+      prompt: "Show the working directory.",
+      permissionMode: "plan",
+      script: [
+        {
+          type: "assistant_message",
+          content: "Running pwd, a whitelisted read-only command.",
+          usage: usage(900, 30, 0, 0)
+        },
+        { type: "tool_use", toolUse: { id: "ev_bash_pwd", name: "Bash", input: { command: "pwd" } } },
+        { type: "turn_break" },
+        {
+          type: "assistant_message",
+          content: "Reported the working directory.",
+          usage: usage(150, 20, 0, 0)
+        }
+      ],
+      async validate(_fixtureDir, events) {
+        const ok = events.some(
+          (e) => e.type === "tool_result" && e.result.status === "success"
+        );
+        return ok ? [] : ["bash pwd did not succeed"];
+      }
+    },
+    {
+      taskId: "plan-mode-blocks-write",
+      title: "Permission enforcement: Write denied in plan mode",
+      category: "permission",
+      prompt: "Try to create a file (should be blocked in plan mode).",
+      permissionMode: "plan",
+      script: [
+        {
+          type: "assistant_message",
+          content: "Attempting a Write; plan mode should block it.",
+          usage: usage(800, 25, 0, 0)
+        },
+        {
+          type: "tool_use",
+          toolUse: {
+            id: "ev_write_blocked",
+            name: "Write",
+            input: { path: "should-not-exist.txt", content: "nope" }
+          }
+        },
+        { type: "turn_break" },
+        {
+          type: "assistant_message",
+          content: "As expected, the write was denied by the plan-mode policy.",
+          usage: usage(200, 30, 0, 0)
+        }
+      ],
+      async validate(fixtureDir, events) {
+        // Only the scripted Write's own result counts; an unrelated tool error must not pass the gate.
+        const write = events.find(
+          (e) => e.type === "tool_result" && e.result.toolUseId === "ev_write_blocked"
+        );
+        if (write?.type !== "tool_result" || write.result.status !== "error") {
+          return ["plan-mode Write was NOT denied (permission regression!)"];
+        }
+        const leaked = await readFile(join(fixtureDir, "should-not-exist.txt"), "utf8")
+          .then(() => true, () => false);
+        return leaked ? ["plan-mode Write leaked a file to disk"] : [];
+      }
+    },
+    {
+      taskId: "subagent-explore",
+      title: "Explore sub-agent runs through the same query loop",
+      category: "sub_agent",
+      prompt: "Delegate a read-only exploration of the math helper.",
+      permissionMode: "plan",
+      script: [
+        {
+          type: "assistant_message",
+          content: "Delegating to a read-only explore sub-agent.",
+          usage: usage(1800, 40, 1800, 0)
+        },
+        {
+          type: "tool_use",
+          toolUse: {
+            id: "ev_agent",
+            name: "Agent",
+            input: {
+              description: "explore math helper",
+              prompt: "Read src/math.ts and report what add does",
+              subagent_type: "explore"
+            }
+          }
+        },
+        { type: "turn_break" },
+        {
+          type: "tool_use",
+          toolUse: { id: "ev_child_read", name: "Read", input: { path: "src/math.ts" } }
+        },
+        { type: "turn_break" },
+        {
+          type: "assistant_message",
+          content: "src/math.ts defines an add helper.",
+          usage: usage(250, 60, 0, 1800)
+        },
+        { type: "turn_break" },
+        {
+          type: "assistant_message",
+          content: "The sub-agent confirmed src/math.ts has an add helper.",
+          usage: usage(300, 40, 0, 1800)
+        }
+      ],
+      async validate(_fixtureDir, events) {
+        const ranAgent = events.some(
+          (e) =>
+            e.type === "tool_result" &&
+            e.result.toolUseId === "ev_agent" &&
+            e.result.status === "success"
+        );
+        return ranAgent ? [] : ["explore sub-agent did not complete successfully"];
+      }
+    }
+  ];
+}
+
+async function writeEvalFixture(fixtureDir: string): Promise<void> {
+  await mkdir(join(fixtureDir, "src"), { recursive: true });
+  await writeFile(
+    join(fixtureDir, "README.md"),
+    ["# Eval fixture", "", "A tiny project used by the offline eval regression suite.", ""].join("\n"),
+    "utf8"
+  );
+  await writeFile(
+    join(fixtureDir, "src", "math.ts"),
+    ["export function add(a: number, b: number): number {", "  return a - b;", "}", ""].join("\n"),
+    "utf8"
+  );
+}
+
+function finalTerminalState(events: readonly LoopEvent[]): TerminalState {
+  const terminal = [...events].reverse().find((event) => event.type === "terminal_state");
+  return terminal?.type === "terminal_state"
+    ? terminal.state
+    : { status: "error", error: "query ended without terminal_state" };
+}
+
+function terminalNotes(state: TerminalState): string[] {
+  if (state.status === "completed") {
+    return [];
+  }
+  return [
+    `terminal_state=${state.status}${state.reason ? ` reason=${state.reason}` : ""}${
+      state.error ? ` error=${state.error}` : ""
+    }`
+  ];
+}
+
+function toolUseNames(events: readonly LoopEvent[]): string[] {
+  return events
+    .filter((event) => event.type === "tool_use")
+    .map((event) => event.toolUse.name);
+}
+
+function toolResultNotes(events: readonly LoopEvent[], task: EvalTask): string[] {
+  // The permission task expects a tool_result error (the denied Write),
+  // so its validate() owns the semantics — don't flag errors here.
+  if (task.category === "permission") {
+    return [];
+  }
+  const notes: string[] = [];
+  for (const event of events) {
+    if (event.type === "tool_result" && event.result.status === "error") {
+      notes.push(
+        `tool_result error for ${event.result.toolUseId}: ${event.result.error ?? "unknown error"}`
+      );
+    }
+  }
+  return notes;
+}
+
+function renderEvalMarkdown(report: EvalSuiteReport): string {
+  const lines = [
+    "# Eval Regression Report",
+    "",
+    `Run: ${report.runId}`,
+    `Status: ${report.status}`,
+    `Fixture: ${report.fixtureDir}`,
+    "",
+    "| Task | Category | Result | Turns | In | Out | Cache W | Cache R | Cost |",
+    "|---|---|---|--:|--:|--:|--:|--:|--:|"
+  ];
+  for (const t of report.tasks) {
+    const m = t.metrics;
+    lines.push(
+      `| ${t.taskId} | ${t.category} | ${t.passed ? "pass" : "FAIL"} | ${m.turns} | ${m.inputTokens} | ${m.outputTokens} | ${m.cacheCreationInputTokens} | ${m.cacheReadInputTokens} | $${m.costUsd.toFixed(4)} |`
+    );
+  }
+  const tot = report.totals;
+  lines.push(
+    `| **total** | (${tot.taskCount} tasks) | ${tot.passedCount}/${tot.taskCount} | ${tot.turns} | ${tot.inputTokens} | ${tot.outputTokens} | ${tot.cacheCreationInputTokens} | ${tot.cacheReadInputTokens} | $${tot.costUsd.toFixed(4)} |`
+  );
+  lines.push("");
+  const failed = report.tasks.filter((t) => !t.passed);
+  if (failed.length > 0) {
+    lines.push("## Failures", "");
+    for (const t of failed) {
+      lines.push(`- **${t.taskId}**: ${t.notes.join("; ") || t.terminalState.status}`);
+    }
+    lines.push("");
+  }
+  lines.push(
+    "## Notes",
+    "",
+    "Offline, deterministic. Token counts are scripted via FakeModel so the",
+    "cost column is a stable regression signal — a diff in tokens/turns means",
+    "the agent loop's behavior changed, not the model's."
+  );
+  return `${lines.join("\n")}\n`;
+}
+
+function compactTimestamp(iso: string): string {
+  return iso.replace(/[-:.TZ]/g, "").slice(0, 14);
+}
diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts
index 295b08d..abf4763 100644
--- a/packages/cli/src/index.ts
+++ b/packages/cli/src/index.ts
@@ -74,6 +74,7 @@ import {
 } from "@mini-claude-code/tools";
 import { formatWeek12AuditReport, runWeek12Audit } from "./week12.js";
 import { formatWeek18FinalReport, runWeek18Final } from "./week18.js";
+import { formatEvalReport, runEvalSuite } from "./eval.js";
 
 const VERSION = "0.0.0";
 const DEFAULT_TOOL_RESULT_BUDGET_CHARS = 8_192;
@@ -91,6 +92,7 @@ Usage:
   myagent mcp <list|tools>
   myagent week12 audit
   myagent week18 finalize
+  myagent eval run
   myagent usage <sessionId>
   myagent profile <startup|list|show>
   myagent task <start-bash|list|read|kill|notify>
@@ -121,6 +123,7 @@ Week 18 scope:
   profile startup records fast-path and cold-path checkpoints under .myagent/profiles.
   usage <sessionId> renders a per-turn token + cost breakdown from the saved transcript.
   week18 finalize runs the final offline smoke suite and writes a portfolio report.
+  eval run executes the offline fixture-based agent regression suite (pass/turns/tokens/cost) under .myagent/evals.
   memory list shows editable long-term memory entries that will be recalled into future turns.
   resume <sessionId> prints a saved transcript, or continues it when a prompt is provided.
   resume <sessionId> --show-compactions lists every compaction event with its archive path.
@@ -253,6 +256,10 @@ export async function runCli(
     return runWeek18(argv.slice(1), stdout, stderr, dependencies);
   }
 
+  if (argv[0] === "eval") {
+    return runEval(argv.slice(1), stdout, stderr, dependencies);
+  }
+
   if (argv[0] === "profile") {
     return runProfile(argv.slice(1), stdout, stderr, dependencies);
   }
@@ -497,6 +504,24 @@ async function runWeek18(
   return report.status === "passed" ? 0 : 1;
 }
 
+async function runEval(
+  args: readonly string[],
+  stdout: WritableLike,
+  stderr: WritableLike,
+  dependencies: CliDependencies
+): Promise<number> {
+  const [command] = args;
+  if (command !== "run") {
+    stderr.write("Usage: myagent eval run\n");
+    return 1;
+  }
+
+  const cwd = dependencies.cwd ?? process.cwd();
+  const report = await runEvalSuite({ cwd });
+  stdout.write(formatEvalReport(report));
+  return report.status === "passed" ? 0 : 1;
+}
+
 async function runProfile(
   args: readonly string[],
   stdout: WritableLike,
diff --git a/packages/cli/test/eval.test.ts b/packages/cli/test/eval.test.ts
new file mode 100644
index 0000000..f349689
--- /dev/null
+++ b/packages/cli/test/eval.test.ts
@@ -0,0 +1,95 @@
+import { mkdtempSync, readFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+import { describe, expect, it } from "vitest";
+
+import { formatEvalReport, runEvalSuite } from "../src/eval.js";
+
+// B1: this file IS the regression gate. It runs the full offline eval
+// suite and fails the build if any task regresses or the deterministic
+// metrics drift.
+describe("M2.3 eval regression suite", () => {
+  it("all eval tasks pass and metrics are deterministic", async () => {
+    const cwd = mkdtempSync(join(tmpdir(), "myagent-eval-gate-"));
+    const outputRootDir = join(cwd, ".myagent", "evals", "runs");
+
+    const report = await runEvalSuite({
+      cwd,
+      outputRootDir,
+      now: new Date("2026-05-15T00:00:00.000Z")
+    });
+
+    // Hard gate: every task must pass.
+    expect(report.status).toBe("passed");
+    expect(report.totals.passedCount).toBe(report.totals.taskCount);
+    expect(report.totals.taskCount).toBe(5);
+
+    // The five tasks we expect, by id + category.
+    const byId = new Map(report.tasks.map((t) => [t.taskId, t]));
+    expect([...byId.keys()].sort()).toEqual(
+      [
+        "bash-readonly",
+        "plan-mode-blocks-write",
+        "read-only-analysis",
+        "safe-edit",
+        "subagent-explore"
+      ].sort()
+    );
+    expect(byId.get("plan-mode-blocks-write")?.category).toBe("permission");
+    expect(byId.get("subagent-explore")?.category).toBe("sub_agent");
+
+    // Deterministic metric pins — these are scripted via FakeModel usage,
+    // so any change here means the agent loop's behavior changed, which
+    // is exactly what this gate is meant to catch.
+    const readOnly = byId.get("read-only-analysis");
+    expect(readOnly?.metrics).toMatchObject({
+      turns: 2,
+      inputTokens: 1800,
+      outputTokens: 180,
+      cacheCreationInputTokens: 1500,
+      cacheReadInputTokens: 1500
+    });
+    // cost = 1800/1e6*3 + 180/1e6*15 + 1500/1e6*3.75 + 1500/1e6*0.3
+    //      = 0.0054 + 0.0027 + 0.005625 + 0.00045 = 0.014175
+    expect(readOnly?.metrics.costUsd).toBeCloseTo(0.0142, 4);
+
+    // Totals fingerprint the whole suite; cost is the sum of per-task reference-rate costs (~0.0528).
+    expect(report.totals).toMatchObject({ turns: 11, inputTokens: 8400, outputTokens: 485 });
+    expect(report.totals.cacheCreationInputTokens).toBe(4900);
+    expect(report.totals.cacheReadInputTokens).toBe(6500);
+    expect(report.totals.costUsd).toBeCloseTo(0.0528, 3);
+
+    // The markdown report file exists and has the summary table.
+    const md = readFileSync(report.reportPath, "utf8");
+    expect(md).toContain("# Eval Regression Report");
+    expect(md).toContain("| **total** |");
+    expect(md).toContain("Status: passed");
+  });
+
+  it("permission task actually denies the plan-mode Write (no leaked file)", async () => {
+    const cwd = mkdtempSync(join(tmpdir(), "myagent-eval-perm-"));
+    const report = await runEvalSuite({
+      cwd,
+      outputRootDir: join(cwd, ".myagent", "evals", "runs")
+    });
+    const perm = report.tasks.find((t) => t.taskId === "plan-mode-blocks-write");
+    expect(perm?.passed).toBe(true);
+    expect(perm?.notes).toEqual([]);
+    // If the Write had leaked, validate() would have produced a note and
+    // the task would be failed — the green pass IS the assertion that
+    // plan mode held.
+  });
+
+  it("formatEvalReport renders a stable human summary", async () => {
+    const cwd = mkdtempSync(join(tmpdir(), "myagent-eval-fmt-"));
+    const report = await runEvalSuite({
+      cwd,
+      outputRootDir: join(cwd, ".myagent", "evals", "runs")
+    });
+    const text = formatEvalReport(report);
+    expect(text.startsWith("[eval] passed")).toBe(true);
+    expect(text).toContain("totals: tasks=5 passed=5 turns=11");
+    expect(text).toContain("read-only-analysis: passed (read_only)");
+  });
+});
diff --git a/packages/core/src/model.ts b/packages/core/src/model.ts
index 81aa916..d778d56 100644
--- a/packages/core/src/model.ts
+++ b/packages/core/src/model.ts
@@ -120,6 +120,13 @@ export type FakeModelStep =
   | {
       type: "assistant_message";
       content: string;
+      /**
+       * Optional scripted token usage for this assistant turn. Lets
+       * offline tests (esp. the M2.3 eval suite) assert deterministic
+       * token / cost accounting without a live model. When omitted the
+       * stream event carries no usage, exactly as before.
+       */
+      usage?: ModelUsage;
     }
   | {
       type: "tool_use";
@@ -207,6 +214,7 @@ export class FakeModel implements ModelClient {
             role: "assistant",
             content: step.content
           },
+          ...(step.usage ? { usage: step.usage } : {}),
           requestId
         };
         continue;
diff --git a/packages/core/test/fake-model-usage.test.ts b/packages/core/test/fake-model-usage.test.ts
new file mode 100644
index 0000000..17524b5
--- /dev/null
+++ b/packages/core/test/fake-model-usage.test.ts
@@ -0,0 +1,66 @@
+import { describe, expect, it } from "vitest";
+
+import { FakeModel, type ModelStreamEvent } from "../src/index.js";
+
+async function streamEvents(model: FakeModel): Promise<ModelStreamEvent[]> {
+  const events: ModelStreamEvent[] = [];
+  for await (const event of model.stream({ messages: [{ role: "user", content: "hi" }] })) {
+    events.push(event);
+  }
+  return events;
+}
+
+describe("FakeModel scripted usage (M2.3 A1 extension)", () => {
+  it("carries scripted usage on the assistant_message stream event", async () => {
+    const model = new FakeModel([
+      {
+        type: "assistant_message",
+        content: "done",
+        usage: {
+          inputTokens: 1234,
+          outputTokens: 56,
+          cacheCreationInputTokens: 1000,
+          cacheReadInputTokens: 200
+        }
+      }
+    ]);
+
+    const events = await streamEvents(model);
+    const assistant = events.find((e) => e.type === "assistant_message");
+    expect(assistant).toMatchObject({
+      type: "assistant_message",
+      usage: {
+        inputTokens: 1234,
+        outputTokens: 56,
+        cacheCreationInputTokens: 1000,
+        cacheReadInputTokens: 200
+      }
+    });
+  });
+
+  it("omits usage entirely when the step does not script it (back-compat)", async () => {
+    const model = new FakeModel([{ type: "assistant_message", content: "no usage here" }]);
+    const events = await streamEvents(model);
+    const assistant = events.find((e) => e.type === "assistant_message");
+    expect(assistant?.type).toBe("assistant_message");
+    expect(assistant && "usage" in assistant ? assistant.usage : undefined).toBeUndefined();
+  });
+
+  it("scripts independent usage across multiple turns", async () => {
+    const model = new FakeModel([
+      { type: "assistant_message", content: "turn 1", usage: { inputTokens: 10, outputTokens: 1 } },
+      { type: "turn_break" },
+      { type: "assistant_message", content: "turn 2", usage: { inputTokens: 20, outputTokens: 2 } }
+    ]);
+
+    const first = await streamEvents(model);
+    expect(first.find((e) => e.type === "assistant_message")).toMatchObject({
+      usage: { inputTokens: 10, outputTokens: 1 }
+    });
+
+    const second = await streamEvents(model);
+    expect(second.find((e) => e.type === "assistant_message")).toMatchObject({
+      usage: { inputTokens: 20, outputTokens: 2 }
+    });
+  });
+});
diff --git a/packages/core/test/security/README.md b/packages/core/test/security/README.md
index 10a338e..e22f79d 100644
--- a/packages/core/test/security/README.md
+++ b/packages/core/test/security/README.md
@@ -73,6 +73,16 @@ Tests live in two trees because of the package boundary
 | `executeToolBatch` never overlaps two non-concurrency-safe tools | `packages/core/test/security/scheduler-write-serialization.test.ts` |
 | Sibling read tools cancel when a Bash sibling errors with cancel-on-error | `packages/core/test/scheduler.test.ts` |
 
+### Eval regression suite (M2.3)
+
+| Invariant | Test |
+|---|---|
+| The offline eval suite runs all 5 fixture tasks through the real query loop and every one passes | `packages/cli/test/eval.test.ts` |
+| Per-task metrics are deterministic (turns + scripted token totals + reproducible cost) | `packages/cli/test/eval.test.ts` |
+| The permission task actually denies a plan-mode Write (no leaked file) — a real permission regression flips it red | `packages/cli/test/eval.test.ts` |
+| `formatEvalReport` renders a stable `[eval] passed` summary | `packages/cli/test/eval.test.ts` |
+| `FakeModel` carries scripted `usage` on the assistant_message event; omitting it stays back-compat (no usage) | `packages/core/test/fake-model-usage.test.ts` |
+
 ### Remote session ownership
 
 | Invariant | Test |