From 6ef7d0c506ef4269e73cc410efb24fce686b4dec Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 11:03:57 +0200
Subject: [PATCH 01/14] docs: update AGENTS.md to specify schemas.ts for
 pipeline classes

---
 AGENTS.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/AGENTS.md b/AGENTS.md
index c67c81f..ce87b40 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -7,7 +7,7 @@
 **Repo:** `cli-agent-sandbox` — minimal TypeScript CLI sandbox built with `@openai/agents` and tool sandboxing under `tmp/`.
 
 1. Start at `src/cli/<cli>/main.ts` and the matching `src/cli/<cli>/README.md`.
-2. Follow the pipeline classes under `src/cli/<cli>/clients/*` and schemas under `src/cli/<cli>/types/*`.
+2. Follow the pipeline classes under `src/cli/<cli>/clients/*` and schemas under `src/cli/<cli>/types/schemas.ts`.
 3. Reuse shared helpers: `src/utils/parse-args.ts`, `src/utils/question-handler.ts`, `src/clients/logger.ts`.
 4. Keep `main.ts` focused on the basic agent flow; move non-trivial logic into `clients/` or `utils/`.
 5. Keep changes minimal; add/update **Vitest** tests (`*.test.ts`) when behavior changes.
@@ -117,7 +117,7 @@ All file tools are sandboxed to `tmp/` using path validation (`src/tools/utils/f
 - Prefer TypeScript path aliases over deep relative imports: `~tools/*`, `~clients/*`, `~utils/*`.
 - Use Zod schemas for CLI args and tool IO.
 - Keep object field names in `camelCase` (e.g., `trainSamples`), not `snake_case`.
-- Keep Zod schemas in a dedicated `schemas.ts` file for each CLI (avoid inline schemas in `main.ts`).
+- Keep Zod schemas in a dedicated `types/schemas.ts` file for each CLI (avoid inline schemas in `main.ts`).
 - Keep constants in a dedicated `constants.ts` file for each CLI.
 - Move hardcoded numeric values into `constants.ts` (treat numbers as configuration).
 - For HTTP fetching in code, prefer `Fetch` (sanitized) or `PlaywrightScraper` for JS-heavy pages.

From 9b503be94f4161a9ff0b9c799fb305746d5b9b59 Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 13:02:06 +0200
Subject: [PATCH 02/14] feat: add agent evaluation CLI with reporting and
 assertion framework

- Implement evaluation runner to execute agent test suites
- Create report generator for JSON and Markdown outputs
- Add assertion evaluation utilities for various assertion types
- Introduce suite loader for loading evaluation suites from JSON files
- Update README and add checklist for post-scaffold tasks
---
 .claude/settings.json                         |  23 +-
 package.json                                  |   1 +
 src/cli/agent-evals/CHECKLIST.md              |  25 ++
 src/cli/agent-evals/README.md                 | 103 +++++++++
 src/cli/agent-evals/clients/eval-runner.ts    | 209 +++++++++++++++++
 .../agent-evals/clients/report-generator.ts   | 190 ++++++++++++++++
 src/cli/agent-evals/clients/suite-loader.ts   |  86 +++++++
 src/cli/agent-evals/constants.ts              |  41 ++++
 src/cli/agent-evals/main.ts                   | 104 +++++++++
 src/cli/agent-evals/schemas.ts                | 203 +++++++++++++++++
 src/cli/agent-evals/suites/example.json       |  79 +++++++
 src/cli/agent-evals/utils/assertions.test.ts  | 214 ++++++++++++++++++
 src/cli/agent-evals/utils/assertions.ts       | 164 ++++++++++++++
 src/clients/agent-runner.ts                   |   5 +-
 14 files changed, 1434 insertions(+), 13 deletions(-)
 create mode 100644 src/cli/agent-evals/CHECKLIST.md
 create mode 100644 src/cli/agent-evals/README.md
 create mode 100644 src/cli/agent-evals/clients/eval-runner.ts
 create mode 100644 src/cli/agent-evals/clients/report-generator.ts
 create mode 100644 src/cli/agent-evals/clients/suite-loader.ts
 create mode 100644 src/cli/agent-evals/constants.ts
 create mode 100644 src/cli/agent-evals/main.ts
 create mode 100644 src/cli/agent-evals/schemas.ts
 create mode 100644 src/cli/agent-evals/suites/example.json
 create mode 100644 src/cli/agent-evals/utils/assertions.test.ts
 create mode 100644 src/cli/agent-evals/utils/assertions.ts

diff --git a/.claude/settings.json b/.claude/settings.json
index e47738f..414b774 100644
--- a/.claude/settings.json
+++ b/.claude/settings.json
@@ -1,8 +1,6 @@
 {
   "$schema": "https://json.schemastore.org/claude-code-settings.json",
-  "description": "Settings for Claude code agents",
   "permissions": {
-    "defaultMode": "default",
     "allow": [
       "Bash(pnpm lint:*)",
       "Bash(pnpm lint:fix:*)",
@@ -10,13 +8,8 @@
       "Bash(pnpm build:*)",
       "Bash(pnpm format:*)",
       "Bash(pnpm format:check:*)",
-      "Bash(pnpm test:*)"
-    ],
-    "ask": [
-      "Bash(pnpm install:*)",
-      "Bash(pnpm add:*)",
-      "Bash(pnpm remove:*)",
-      "Bash(git commit:*)"
+      "Bash(pnpm test:*)",
+      "Bash(tsx scripts/scaffold-cli.ts:*)"
     ],
     "deny": [
       "Bash(curl:*)",
@@ -29,6 +22,14 @@
       "Read(**/secrets/**)",
       "Bash(git push:*)",
       "Bash(gh pr create:*)"
-    ]
-  }
+    ],
+    "ask": [
+      "Bash(pnpm install:*)",
+      "Bash(pnpm add:*)",
+      "Bash(pnpm remove:*)",
+      "Bash(git commit:*)"
+    ],
+    "defaultMode": "default"
+  },
+  "description": "Settings for Claude code agents"
 }
diff --git a/package.json b/package.json
index e7a9c88..af04036 100644
--- a/package.json
+++ b/package.json
@@ -8,6 +8,7 @@
     "run:name-explorer": "pnpm -s node:tsx -- src/cli/name-explorer/main.ts",
     "run:scrape-publications": "tsx src/cli/scrape-publications/main.ts",
     "run:etf-backtest": "tsx src/cli/etf-backtest/main.ts",
+    "run:agent-evals": "tsx src/cli/agent-evals/main.ts",
     "scaffold:cli": "tsx scripts/scaffold-cli.ts",
     "node:tsx": "node --disable-warning=ExperimentalWarning --import tsx",
     "typecheck": "tsc --noEmit",
diff --git a/src/cli/agent-evals/CHECKLIST.md b/src/cli/agent-evals/CHECKLIST.md
new file mode 100644
index 0000000..986ce0d
--- /dev/null
+++ b/src/cli/agent-evals/CHECKLIST.md
@@ -0,0 +1,25 @@
+# Post-Scaffold Checklist
+
+## Setup
+
+- [ ] Update `main.ts` with CLI logic
+- [ ] Add CLI arguments to the Zod schema
+- [ ] Update `README.md` description and flowchart
+
+## Optional Structure
+
+- [ ] Create `./clients/` for pipeline/client classes
+- [ ] Create `./types/` for Zod schemas
+- [ ] Create `./tools/` for CLI-specific agent tools
+
+## Before Committing
+
+- [ ] `pnpm typecheck`
+- [ ] `pnpm lint`
+- [ ] `pnpm format:check`
+- [ ] Add tests if behavior is testable
+- [ ] `pnpm test`
+
+## Cleanup
+
+- [ ] Delete this CHECKLIST.md when done
diff --git a/src/cli/agent-evals/README.md b/src/cli/agent-evals/README.md
new file mode 100644
index 0000000..7ea7245
--- /dev/null
+++ b/src/cli/agent-evals/README.md
@@ -0,0 +1,103 @@
+# Agent Evals
+
+Run automated evaluation cases for AI agents with PASS/FAIL results and reports.
+
+## Run
+
+```bash
+# Run a single suite
+pnpm run:agent-evals -- --suite=example
+
+# Run all suites
+pnpm run:agent-evals -- --all
+
+# With options
+pnpm run:agent-evals -- --suite=example --verbose --report=both
+```
+
+## Arguments
+
+- `--suite <name>`: Run a specific suite by name (without `.json` extension)
+- `--all`: Run all suites in the `suites/` directory
+- `--report <format>`: Report format: `json`, `md`, or `both` (default: `json`)
+- `--out <path>`: Output directory under `tmp/` (default: `agent-evals`)
+- `--verbose`: Enable verbose logging with assertion details
+
+Either `--suite` or `--all` is required.
+
+## Output
+
+Reports are written to `tmp/<out>/` (`tmp/agent-evals/` by default):
+- `report-{timestamp}.json`: Machine-readable results
+- `report-{timestamp}.md`: Human-readable markdown report
+
+Exit code is 1 if any case fails or errors.
+
+## Creating Evaluation Suites
+
+Add JSON files to `suites/` directory. Example structure:
+
+```json
+{
+  "name": "my-suite",
+  "description": "Test suite description",
+  "version": "1.0.0",
+  "agent": {
+    "name": "MyTestAgent",
+    "model": "gpt-5-mini",
+    "instructions": "Agent system prompt here",
+    "tools": [],
+    "maxTurns": 3
+  },
+  "defaults": {
+    "timeout": 15000
+  },
+  "cases": [
+    {
+      "id": "case-1",
+      "name": "Test case name",
+      "prompt": "User prompt to test",
+      "assertions": [
+        { "type": "contains", "value": "expected text" }
+      ]
+    }
+  ]
+}
+```
+
+## Assertion Types
+
+- **contains**: Check if output contains a string
+  ```json
+  { "type": "contains", "value": "text", "caseSensitive": false }
+  ```
+
+- **matchesRegex**: Check if output matches a regex pattern
+  ```json
+  { "type": "matchesRegex", "pattern": "\\d+", "flags": "i" }
+  ```
+
+- **equals**: Deep equality check
+  ```json
+  { "type": "equals", "expected": { "key": "value" } }
+  ```
+
+- **jsonPath**: Extract and compare nested values
+  ```json
+  { "type": "jsonPath", "path": "$.response.status", "expected": "success" }
+  ```
+
+## Flowchart
+
+```mermaid
+flowchart TD
+  A["Start"] --> B["Parse args"]
+  B --> C["Load suites"]
+  C --> D["Run each suite"]
+  D --> E["Run each case"]
+  E --> F["Evaluate assertions"]
+  F --> G["Collect results"]
+  G --> H["Generate reports"]
+  H --> I["Print summary"]
+  I --> J["Exit"]
+```
diff --git a/src/cli/agent-evals/clients/eval-runner.ts b/src/cli/agent-evals/clients/eval-runner.ts
new file mode 100644
index 0000000..94f64e6
--- /dev/null
+++ b/src/cli/agent-evals/clients/eval-runner.ts
@@ -0,0 +1,209 @@
+import { AgentRunner } from "~clients/agent-runner";
+import type { Logger } from "~clients/logger";
+
+import {
+  DEFAULT_CASE_TIMEOUT_MS,
+  DEFAULT_MAX_TURNS,
+  STATUS_SYMBOLS,
+  ZERO,
+} from "../constants";
+import type {
+  AssertionResult,
+  CaseResult,
+  CaseStatus,
+  EvalCase,
+  EvalSuite,
+  SuiteResult,
+  SuiteSummary,
+} from "../schemas";
+import { evaluateAssertion } from "../utils/assertions";
+
+export type EvalRunnerConfig = {
+  logger: Logger;
+  verbose?: boolean;
+};
+
+/**
+ * Executes evaluation suites and collects results.
+ * Creates an AgentRunner for each suite based on its agent config,
+ * runs each case, validates outputs, and collects PASS/FAIL results.
+ */
+export class EvalRunner {
+  private logger: Logger;
+  private verbose: boolean;
+
+  constructor(config: EvalRunnerConfig) {
+    this.logger = config.logger;
+    this.verbose = config.verbose ?? false;
+  }
+
+  /**
+   * Run a single evaluation suite.
+   */
+  async runSuite(suite: EvalSuite): Promise<SuiteResult> {
+    const startedAt = new Date();
+    this.logger.info("Running suite", {
+      name: suite.name,
+      caseCount: suite.cases.length,
+    });
+
+    const agentRunner = this.createAgentRunner(suite);
+
+    const caseResults: CaseResult[] = [];
+    let passed = ZERO;
+    let failed = ZERO;
+    let errors = ZERO;
+    let skipped = ZERO;
+
+    for (const evalCase of suite.cases) {
+      const caseResult = await this.runCase(evalCase, agentRunner, suite);
+      caseResults.push(caseResult);
+
+      switch (caseResult.status) {
+        case "pass":
+          passed++;
+          break;
+        case "fail":
+          failed++;
+          break;
+        case "error":
+          errors++;
+          break;
+        case "skip":
+          skipped++;
+          break;
+      }
+
+      this.logCaseResult(caseResult);
+    }
+
+    const completedAt = new Date();
+    const total = suite.cases.length;
+    const summary: SuiteSummary = {
+      total,
+      passed,
+      failed,
+      errors,
+      skipped,
+      passRate: total > ZERO ? passed / total : ZERO,
+    };
+
+    return {
+      suiteName: suite.name,
+      suiteVersion: suite.version,
+      startedAt: startedAt.toISOString(),
+      completedAt: completedAt.toISOString(),
+      durationMs: completedAt.getTime() - startedAt.getTime(),
+      summary,
+      cases: caseResults,
+    };
+  }
+
+  private logCaseResult(caseResult: CaseResult): void {
+    const symbol = STATUS_SYMBOLS[caseResult.status];
+    const message = `${symbol} ${caseResult.caseId}: ${caseResult.caseName}`;
+
+    if (caseResult.status === "pass") {
+      this.logger.info(message, { durationMs: caseResult.durationMs });
+    } else {
+      this.logger.warn(message, {
+        durationMs: caseResult.durationMs,
+        error: caseResult.error,
+      });
+      if (this.verbose && caseResult.assertionResults.length > ZERO) {
+        const failedAssertions = caseResult.assertionResults.filter(
+          (r) => !r.passed
+        );
+        for (const ar of failedAssertions) {
+          this.logger.debug("  Assertion failed", { message: ar.message });
+        }
+      }
+    }
+  }
+
+  /**
+   * Run one case under a timeout; any thrown error or timeout yields "error".
+   */
+  private async runCase(
+    evalCase: EvalCase,
+    agentRunner: AgentRunner<unknown>,
+    suite: EvalSuite
+  ): Promise<CaseResult> {
+    const startTime = Date.now();
+    const timeout =
+      evalCase.timeout ?? suite.defaults?.timeout ?? DEFAULT_CASE_TIMEOUT_MS;
+    let timer: ReturnType<typeof setTimeout> | undefined;
+
+    this.logger.debug("Running case", { id: evalCase.id, name: evalCase.name });
+
+    try {
+      const runPromise = agentRunner.run({
+        prompt: evalCase.prompt,
+        maxTurns: suite.agent.maxTurns ?? DEFAULT_MAX_TURNS,
+      });
+      const timeoutPromise = new Promise<never>((_, reject) => {
+        timer = setTimeout(() => {
+          reject(new Error("Case timed out"));
+        }, timeout);
+      });
+
+      const result = await Promise.race([runPromise, timeoutPromise]);
+      const output: unknown = result.finalOutput;
+      const durationMs = Date.now() - startTime;
+      const assertionResults = this.runAssertions(evalCase.assertions, output);
+      const allAssertionsPassed = assertionResults.every((r) => r.passed);
+      const status: CaseStatus = allAssertionsPassed ? "pass" : "fail";
+
+      return {
+        caseId: evalCase.id,
+        caseName: evalCase.name,
+        status,
+        durationMs,
+        output,
+        assertionResults,
+        error: null,
+      };
+    } catch (err) {
+      const durationMs = Date.now() - startTime;
+      const errorMessage = err instanceof Error ? err.message : String(err);
+      return {
+        caseId: evalCase.id,
+        caseName: evalCase.name,
+        status: "error",
+        durationMs,
+        output: null,
+        assertionResults: [],
+        error: errorMessage,
+      };
+    } finally {
+      // Cancel the timer so a finished case leaves no pending timeout behind.
+      clearTimeout(timer);
+    }
+  }
+
+  /**
+   * Create an AgentRunner from the suite's agent name, model and instructions.
+   * Omits outputType (plain text output); suite.agent.tools is not passed on.
+   */
+  private createAgentRunner(suite: EvalSuite): AgentRunner<unknown> {
+    return new AgentRunner({
+      name: suite.agent.name,
+      model: suite.agent.model,
+      tools: [],
+      instructions: suite.agent.instructions,
+      logger: this.logger,
+      logToolResults: this.verbose,
+      stateless: true,
+    });
+  }
+
+  /**
+   * Run all assertions on the output.
+   */
+  private runAssertions(
+    assertions: EvalCase["assertions"],
+    output: unknown
+  ): AssertionResult[] {
+    return assertions.map((assertion) => evaluateAssertion(assertion, output));
+  }
+}
diff --git a/src/cli/agent-evals/clients/report-generator.ts b/src/cli/agent-evals/clients/report-generator.ts
new file mode 100644
index 0000000..ce4462e
--- /dev/null
+++ b/src/cli/agent-evals/clients/report-generator.ts
@@ -0,0 +1,190 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import type { Logger } from "~clients/logger";
+import { resolveTmpPathForWrite } from "~tools/utils/fs";
+
+import {
+  DECIMAL_PLACES,
+  PERCENT_MULTIPLIER,
+  STATUS_SYMBOLS,
+} from "../constants";
+import type { EvalReport, ReportSummary, SuiteResult } from "../schemas";
+
+export type ReportFormat = "json" | "md" | "both";
+
+export type ReportGeneratorConfig = {
+  logger: Logger;
+  outputDir: string;
+  format: ReportFormat;
+};
+
+/**
+ * Generates evaluation reports in JSON and/or Markdown format.
+ * Reports are written to the configured output directory under tmp/.
+ */
+export class ReportGenerator {
+  private logger: Logger;
+  private outputDir: string;
+  private format: ReportFormat;
+
+  constructor(config: ReportGeneratorConfig) {
+    this.logger = config.logger;
+    this.outputDir = config.outputDir;
+    this.format = config.format;
+  }
+
+  /**
+   * Generate and save report(s) from suite results.
+   * Returns the paths of saved reports.
+   */
+  async generate(suiteResults: SuiteResult[]): Promise<string[]> {
+    const report = this.buildReport(suiteResults);
+    const savedPaths: string[] = [];
+
+    if (this.format === "json" || this.format === "both") {
+      const jsonPath = await this.writeJson(report);
+      savedPaths.push(jsonPath);
+    }
+
+    if (this.format === "md" || this.format === "both") {
+      const mdPath = await this.writeMarkdown(report);
+      savedPaths.push(mdPath);
+    }
+
+    return savedPaths;
+  }
+
+  private buildReport(suiteResults: SuiteResult[]): EvalReport {
+    // Adds up one numeric field across all suite results (0 for no suites).
+    const sumOf = (pick: (s: SuiteResult) => number): number =>
+      suiteResults.reduce((acc, s) => acc + pick(s), 0);
+    const totalCases = sumOf((s) => s.summary.total);
+    const passed = sumOf((s) => s.summary.passed);
+    const failed = sumOf((s) => s.summary.failed);
+    const errors = sumOf((s) => s.summary.errors);
+    const skipped = sumOf((s) => s.summary.skipped);
+    const durationMs = sumOf((s) => s.durationMs);
+
+    const summary: ReportSummary = {
+      totalSuites: suiteResults.length,
+      totalCases,
+      passed,
+      failed,
+      errors,
+      skipped,
+      passRate: totalCases > 0 ? passed / totalCases : 0,
+    };
+
+    return {
+      generatedAt: new Date().toISOString(),
+      durationMs,
+      summary,
+      suites: suiteResults,
+    };
+  }
+
+  private async writeJson(report: EvalReport): Promise<string> {
+    const timestamp = this.getTimestamp();
+    const filename = `report-${timestamp}.json`;
+    const relativePath = path.join(this.outputDir, filename);
+    const fullPath = await resolveTmpPathForWrite(relativePath);
+
+    await fs.writeFile(fullPath, JSON.stringify(report, null, 2), "utf8");
+    this.logger.info("JSON report saved", { path: fullPath });
+    return fullPath;
+  }
+
+  private async writeMarkdown(report: EvalReport): Promise<string> {
+    const timestamp = this.getTimestamp();
+    const filename = `report-${timestamp}.md`;
+    const relativePath = path.join(this.outputDir, filename);
+    const fullPath = await resolveTmpPathForWrite(relativePath);
+
+    const markdown = this.formatMarkdown(report);
+    await fs.writeFile(fullPath, markdown, "utf8");
+    this.logger.info("Markdown report saved", { path: fullPath });
+    return fullPath;
+  }
+
+  private formatMarkdown(report: EvalReport): string {
+    const lines: string[] = [];
+
+    lines.push("# Agent Evaluation Report");
+    lines.push("");
+    lines.push(`Generated: ${report.generatedAt}`);
+    lines.push(`Duration: ${report.durationMs}ms`);
+    lines.push("");
+
+    lines.push("## Summary");
+    lines.push("");
+    lines.push("| Metric | Value |");
+    lines.push("|--------|-------|");
+    lines.push(`| Total Suites | ${report.summary.totalSuites} |`);
+    lines.push(`| Total Cases | ${report.summary.totalCases} |`);
+    lines.push(`| Passed | ${report.summary.passed} |`);
+    lines.push(`| Failed | ${report.summary.failed} |`);
+    lines.push(`| Errors | ${report.summary.errors} |`);
+    lines.push(`| Skipped | ${report.summary.skipped} |`);
+    lines.push(
+      `| Pass Rate | ${this.formatPercent(report.summary.passRate)} |`
+    );
+    lines.push("");
+
+    for (const suite of report.suites) {
+      lines.push(`## Suite: ${suite.suiteName}`);
+      lines.push("");
+      lines.push(`Version: ${suite.suiteVersion}`);
+      lines.push(`Duration: ${suite.durationMs}ms`);
+      lines.push(
+        `Pass Rate: ${this.formatPercent(suite.summary.passRate)} (${suite.summary.passed}/${suite.summary.total})`
+      );
+      lines.push("");
+
+      lines.push("### Cases");
+      lines.push("");
+      lines.push("| Status | ID | Name | Duration |");
+      lines.push("|--------|-----|------|----------|");
+
+      for (const caseResult of suite.cases) {
+        const status = STATUS_SYMBOLS[caseResult.status];
+        lines.push(
+          `| ${status} | ${caseResult.caseId} | ${caseResult.caseName} | ${caseResult.durationMs}ms |`
+        );
+      }
+      lines.push("");
+
+      const problemCases = suite.cases.filter(
+        (c) => c.status === "fail" || c.status === "error"
+      );
+      if (problemCases.length > 0) {
+        lines.push("### Details");
+        lines.push("");
+        for (const caseResult of problemCases) {
+          lines.push(`#### ${caseResult.caseId}: ${caseResult.caseName}`);
+          lines.push("");
+          if (caseResult.error) {
+            lines.push(`**Error:** ${caseResult.error}`);
+          }
+          if (caseResult.assertionResults.length > 0) {
+            lines.push("**Assertion Results:**");
+            for (const ar of caseResult.assertionResults) {
+              const icon = ar.passed ? "OK" : "FAIL";
+              lines.push(`- [${icon}] ${ar.assertion.type}: ${ar.message}`);
+            }
+          }
+          lines.push("");
+        }
+      }
+    }
+
+    return lines.join("\n");
+  }
+
+  private formatPercent(value: number): string {
+    return `${(value * PERCENT_MULTIPLIER).toFixed(DECIMAL_PLACES.passRate)}%`;
+  }
+
+  private getTimestamp(): string {
+    return new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19);
+  }
+}
diff --git a/src/cli/agent-evals/clients/suite-loader.ts b/src/cli/agent-evals/clients/suite-loader.ts
new file mode 100644
index 0000000..4cde5fa
--- /dev/null
+++ b/src/cli/agent-evals/clients/suite-loader.ts
@@ -0,0 +1,86 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import type { Logger } from "~clients/logger";
+
+import { SUITE_FILE_EXTENSION, SUITES_DIR } from "../constants";
+import type { EvalSuite } from "../schemas";
+import { EvalSuiteSchema } from "../schemas";
+
+export type SuiteLoaderConfig = {
+  logger: Logger;
+  suitesDir?: string;
+};
+
+/**
+ * Loads evaluation suite definitions from JSON files.
+ * Suite files are stored in the suites/ directory with .json extension.
+ */
+export class SuiteLoader {
+  private logger: Logger;
+  private suitesDir: string;
+
+  constructor(config: SuiteLoaderConfig) {
+    this.logger = config.logger;
+    this.suitesDir = config.suitesDir ?? SUITES_DIR;
+  }
+
+  /**
+   * Load a single suite by name.
+   * @param name Suite name (without .json extension)
+   */
+  async load(name: string): Promise<EvalSuite> {
+    const filePath = path.join(
+      this.suitesDir,
+      `${name}${SUITE_FILE_EXTENSION}`
+    );
+    this.logger.debug("Loading suite", { name, path: filePath });
+
+    const content = await fs.readFile(filePath, "utf8");
+    const json = JSON.parse(content) as unknown;
+    const suite = EvalSuiteSchema.parse(json);
+
+    this.logger.info("Suite loaded", {
+      name: suite.name,
+      caseCount: suite.cases.length,
+    });
+    return suite;
+  }
+
+  /**
+   * List all available suite names (file names without the suite extension).
+   * Returns an empty list when the suites directory does not exist.
+   */
+  async listSuites(): Promise<string[]> {
+    try {
+      const entries = await fs.readdir(this.suitesDir, { withFileTypes: true });
+      const suiteNames = entries
+        .filter((e) => e.isFile() && e.name.endsWith(SUITE_FILE_EXTENSION))
+        // Strip only the trailing extension (replace() removed the first match).
+        .map((e) => e.name.slice(0, -SUITE_FILE_EXTENSION.length));
+
+      this.logger.debug("Available suites", { suites: suiteNames });
+      return suiteNames;
+    } catch (err) {
+      if ((err as NodeJS.ErrnoException).code === "ENOENT") {
+        this.logger.warn("Suites directory not found", { dir: this.suitesDir });
+        return [];
+      }
+      throw err;
+    }
+  }
+
+  /**
+   * Load every suite returned by listSuites(), one at a time, in that order.
+   */
+  async loadAll(): Promise<EvalSuite[]> {
+    const loaded: EvalSuite[] = [];
+    const suiteNames = await this.listSuites();
+
+    for (const suiteName of suiteNames) {
+      // Awaited inside the loop, so suites are read sequentially.
+      loaded.push(await this.load(suiteName));
+    }
+
+    return loaded;
+  }
+}
diff --git a/src/cli/agent-evals/constants.ts b/src/cli/agent-evals/constants.ts
new file mode 100644
index 0000000..6e6216b
--- /dev/null
+++ b/src/cli/agent-evals/constants.ts
@@ -0,0 +1,41 @@
+import path from "node:path";
+
+// CLI defaults
+export const DEFAULT_VERBOSE = false;
+export const DEFAULT_REPORT_FORMAT = "json" as const;
+export const DEFAULT_OUT_PATH = "agent-evals";
+
+// Paths
+export const SUITES_DIR = path.join(
+  process.cwd(),
+  "src",
+  "cli",
+  "agent-evals",
+  "suites"
+);
+export const SUITE_FILE_EXTENSION = ".json";
+
+// Execution defaults
+export const DEFAULT_CASE_TIMEOUT_MS = 30000;
+export const DEFAULT_MAX_TURNS = 5;
+
+// Numeric constants
+export const ZERO = 0;
+export const ONE = 1;
+export const PERCENT_MULTIPLIER = 100;
+
+// Report formatting
+export const DECIMAL_PLACES = {
+  passRate: 1,
+  duration: 0,
+} as const;
+
+export const LINE_WIDTH = 60;
+
+// Status symbols for console output
+export const STATUS_SYMBOLS = {
+  pass: "[PASS]",
+  fail: "[FAIL]",
+  error: "[ERROR]",
+  skip: "[SKIP]",
+} as const;
diff --git a/src/cli/agent-evals/main.ts b/src/cli/agent-evals/main.ts
new file mode 100644
index 0000000..a5d2659
--- /dev/null
+++ b/src/cli/agent-evals/main.ts
@@ -0,0 +1,104 @@
+// pnpm run:agent-evals
+
+// Run automated evaluation cases for AI agents with PASS/FAIL results and reports
+
+import "dotenv/config";
+
+import { Logger } from "~clients/logger";
+import { parseArgs } from "~utils/parse-args";
+
+import { EvalRunner } from "./clients/eval-runner";
+import { ReportGenerator } from "./clients/report-generator";
+import { SuiteLoader } from "./clients/suite-loader";
+import { LINE_WIDTH, PERCENT_MULTIPLIER, ZERO } from "./constants";
+import type { SuiteResult } from "./schemas";
+import { CliArgsSchema } from "./schemas";
+
+const logger = new Logger();
+
+logger.info("Agent Evals starting...");
+
+const { suite, all, report, out, verbose } = parseArgs({
+  logger,
+  schema: CliArgsSchema,
+});
+
+if (verbose) {
+  logger.debug("Verbose mode enabled");
+}
+
+const suiteLoader = new SuiteLoader({ logger });
+const evalRunner = new EvalRunner({ logger, verbose });
+const reportGenerator = new ReportGenerator({
+  logger,
+  outputDir: out,
+  format: report,
+});
+
+let suitesToRun;
+if (all) {
+  logger.info("Loading all suites...");
+  suitesToRun = await suiteLoader.loadAll();
+} else if (suite) {
+  logger.info("Loading suite", { name: suite });
+  const singleSuite = await suiteLoader.load(suite);
+  suitesToRun = [singleSuite];
+} else {
+  logger.error("Either --suite or --all is required");
+  process.exit(1);
+}
+
+if (suitesToRun.length === ZERO) {
+  logger.warn("No suites found to run");
+  process.exit(0);
+}
+
+logger.info("Suites to run", { count: suitesToRun.length });
+
+const suiteResults: SuiteResult[] = [];
+const separator = "=".repeat(LINE_WIDTH);
+
+for (const evalSuite of suitesToRun) {
+  logger.info(separator);
+  const result = await evalRunner.runSuite(evalSuite);
+  suiteResults.push(result);
+}
+
+logger.info(separator);
+logger.info("Generating reports...");
+const reportPaths = await reportGenerator.generate(suiteResults);
+
+const totalCases = suiteResults.reduce((sum, s) => sum + s.summary.total, ZERO);
+const totalPassed = suiteResults.reduce(
+  (sum, s) => sum + s.summary.passed,
+  ZERO
+);
+const totalFailed = suiteResults.reduce(
+  (sum, s) => sum + s.summary.failed,
+  ZERO
+);
+const totalErrors = suiteResults.reduce(
+  (sum, s) => sum + s.summary.errors,
+  ZERO
+);
+const passRate =
+  totalCases > ZERO ? (totalPassed / totalCases) * PERCENT_MULTIPLIER : ZERO;
+
+logger.info(separator);
+logger.info("EVALUATION COMPLETE");
+logger.info(separator);
+logger.info("Summary", {
+  suites: suiteResults.length,
+  cases: totalCases,
+  passed: totalPassed,
+  failed: totalFailed,
+  errors: totalErrors,
+  passRate: `${passRate.toFixed(1)}%`,
+});
+logger.info("Reports saved", { paths: reportPaths });
+
+if (totalFailed > ZERO || totalErrors > ZERO) {
+  process.exit(1);
+}
+
+logger.info("Agent Evals completed.");
diff --git a/src/cli/agent-evals/schemas.ts b/src/cli/agent-evals/schemas.ts
new file mode 100644
index 0000000..5a12ab1
--- /dev/null
+++ b/src/cli/agent-evals/schemas.ts
@@ -0,0 +1,203 @@
+import { z } from "zod";
+
+import {
+  DEFAULT_OUT_PATH,
+  DEFAULT_REPORT_FORMAT,
+  DEFAULT_VERBOSE,
+} from "./constants";
+
+// ============================================
+// CLI Arguments
+// ============================================
+
+export const CliArgsSchema = z
+  .object({
+    suite: z.string().optional(),
+    all: z.coerce.boolean().default(false),
+    report: z.enum(["json", "md", "both"]).default(DEFAULT_REPORT_FORMAT),
+    out: z.string().default(DEFAULT_OUT_PATH),
+    verbose: z.coerce.boolean().default(DEFAULT_VERBOSE),
+  })
+  // `||` on booleans: an empty --suite value must not mask --all as `??` did.
+  .refine((data) => Boolean(data.suite) || data.all, {
+    message: "Either --suite <name> or --all is required",
+  });
+export type CliArgs = z.infer<typeof CliArgsSchema>;
+
+// ============================================
+// Assertion Types
+// ============================================
+
+export const ContainsAssertionSchema = z.object({
+  type: z.literal("contains"),
+  value: z.string(),
+  caseSensitive: z.boolean().optional(),
+  description: z.string().optional(),
+});
+
+export const MatchesRegexAssertionSchema = z.object({
+  type: z.literal("matchesRegex"),
+  pattern: z.string(),
+  flags: z.string().optional(),
+  description: z.string().optional(),
+});
+
+export const EqualsAssertionSchema = z.object({
+  type: z.literal("equals"),
+  expected: z.unknown(),
+  description: z.string().optional(),
+});
+
+export const JsonPathAssertionSchema = z.object({
+  type: z.literal("jsonPath"),
+  path: z.string(),
+  expected: z.unknown(),
+  description: z.string().optional(),
+});
+
+export const AssertionSchema = z.discriminatedUnion("type", [
+  ContainsAssertionSchema,
+  MatchesRegexAssertionSchema,
+  EqualsAssertionSchema,
+  JsonPathAssertionSchema,
+]);
+
+export type Assertion = z.infer<typeof AssertionSchema>;
+export type ContainsAssertion = z.infer<typeof ContainsAssertionSchema>;
+export type MatchesRegexAssertion = z.infer<typeof MatchesRegexAssertionSchema>;
+export type EqualsAssertion = z.infer<typeof EqualsAssertionSchema>;
+export type JsonPathAssertion = z.infer<typeof JsonPathAssertionSchema>;
+
+// ============================================
+// Eval Case
+// ============================================
+
+export const EvalCaseSchema = z.object({
+  id: z.string(),
+  name: z.string(),
+  description: z.string().optional(),
+  prompt: z.string(),
+  assertions: z.array(AssertionSchema).default([]),
+  timeout: z.number().optional(),
+  tags: z.array(z.string()).default([]),
+});
+
+export type EvalCase = z.infer<typeof EvalCaseSchema>;
+
+// ============================================
+// Agent Config (for suite)
+// ============================================
+
+export const AgentConfigSchema = z.object({
+  name: z.string(),
+  model: z.literal("gpt-5-mini"),
+  instructions: z.string(),
+  tools: z.array(z.string()).default([]),
+  maxTurns: z.number().optional(),
+});
+
+export type AgentConfig = z.infer<typeof AgentConfigSchema>;
+
+// ============================================
+// Eval Suite
+// ============================================
+
+export const EvalSuiteSchema = z.object({
+  name: z.string(),
+  description: z.string().optional(),
+  version: z.string().default("1.0.0"),
+  agent: AgentConfigSchema,
+  defaults: z
+    .object({
+      timeout: z.number().optional(),
+    })
+    .optional(),
+  cases: z.array(EvalCaseSchema).min(1),
+});
+
+export type EvalSuite = z.infer<typeof EvalSuiteSchema>;
+
+// ============================================
+// Assertion Result
+// ============================================
+
+export const AssertionResultSchema = z.object({
+  assertion: AssertionSchema,
+  passed: z.boolean(),
+  message: z.string(),
+  actual: z.unknown().optional(),
+  expected: z.unknown().optional(),
+});
+
+export type AssertionResult = z.infer<typeof AssertionResultSchema>;
+
+// ============================================
+// Case Result
+// ============================================
+
+export const CaseStatusSchema = z.enum(["pass", "fail", "error", "skip"]);
+export type CaseStatus = z.infer<typeof CaseStatusSchema>;
+
+export const CaseResultSchema = z.object({
+  caseId: z.string(),
+  caseName: z.string(),
+  status: CaseStatusSchema,
+  durationMs: z.number(),
+  output: z.unknown().nullable(),
+  assertionResults: z.array(AssertionResultSchema),
+  error: z.string().nullable(),
+});
+
+export type CaseResult = z.infer<typeof CaseResultSchema>;
+
+// ============================================
+// Suite Result
+// ============================================
+
+export const SuiteSummarySchema = z.object({
+  total: z.number(),
+  passed: z.number(),
+  failed: z.number(),
+  errors: z.number(),
+  skipped: z.number(),
+  passRate: z.number(),
+});
+
+export type SuiteSummary = z.infer<typeof SuiteSummarySchema>;
+
+export const SuiteResultSchema = z.object({
+  suiteName: z.string(),
+  suiteVersion: z.string(),
+  startedAt: z.string(),
+  completedAt: z.string(),
+  durationMs: z.number(),
+  summary: SuiteSummarySchema,
+  cases: z.array(CaseResultSchema),
+});
+
+export type SuiteResult = z.infer<typeof SuiteResultSchema>;
+
+// ============================================
+// Full Report (multiple suites)
+// ============================================
+
+export const ReportSummarySchema = z.object({
+  totalSuites: z.number(),
+  totalCases: z.number(),
+  passed: z.number(),
+  failed: z.number(),
+  errors: z.number(),
+  skipped: z.number(),
+  passRate: z.number(),
+});
+
+export type ReportSummary = z.infer<typeof ReportSummarySchema>;
+
+export const EvalReportSchema = z.object({
+  generatedAt: z.string(),
+  durationMs: z.number(),
+  summary: ReportSummarySchema,
+  suites: z.array(SuiteResultSchema),
+});
+
+export type EvalReport = z.infer<typeof EvalReportSchema>;
diff --git a/src/cli/agent-evals/suites/example.json b/src/cli/agent-evals/suites/example.json
new file mode 100644
index 0000000..b3aee2a
--- /dev/null
+++ b/src/cli/agent-evals/suites/example.json
@@ -0,0 +1,79 @@
+{
+  "name": "example-suite",
+  "description": "Example evaluation suite demonstrating the eval case pattern",
+  "version": "1.0.0",
+  "agent": {
+    "name": "SimpleTestAgent",
+    "model": "gpt-5-mini",
+    "instructions": "You are a helpful assistant. Always respond with valid JSON in the format: {\"answer\": \"your answer here\", \"confidence\": 0.0-1.0}. Be concise and accurate.",
+    "tools": [],
+    "maxTurns": 3
+  },
+  "defaults": {
+    "timeout": 15000
+  },
+  "cases": [
+    {
+      "id": "simple-math",
+      "name": "Simple arithmetic",
+      "description": "Tests basic math reasoning - 2 + 2",
+      "prompt": "What is 2 + 2? Respond with your answer and confidence level.",
+      "assertions": [
+        {
+          "type": "contains",
+          "value": "4",
+          "description": "Response should contain the number 4"
+        }
+      ],
+      "tags": ["math", "basic"]
+    },
+    {
+      "id": "json-format",
+      "name": "JSON format validation",
+      "description": "Tests that agent follows JSON format instructions",
+      "prompt": "Say hello. Respond with your greeting and confidence level.",
+      "assertions": [
+        {
+          "type": "matchesRegex",
+          "pattern": "\"answer\"\\s*:",
+          "description": "Response should have answer field"
+        },
+        {
+          "type": "matchesRegex",
+          "pattern": "\"confidence\"\\s*:",
+          "description": "Response should have confidence field"
+        }
+      ],
+      "tags": ["format", "basic"]
+    },
+    {
+      "id": "capital-france",
+      "name": "Basic knowledge - capital of France",
+      "description": "Tests knowledge retrieval",
+      "prompt": "What is the capital of France? Respond with your answer and confidence level.",
+      "assertions": [
+        {
+          "type": "contains",
+          "value": "Paris",
+          "caseSensitive": false,
+          "description": "Answer should mention Paris"
+        }
+      ],
+      "tags": ["knowledge", "geography"]
+    },
+    {
+      "id": "larger-number",
+      "name": "Comparison task",
+      "description": "Tests basic comparison reasoning",
+      "prompt": "Which is larger: 100 or 50? Respond with your answer and confidence level.",
+      "assertions": [
+        {
+          "type": "contains",
+          "value": "100",
+          "description": "Response should identify 100 as larger"
+        }
+      ],
+      "tags": ["math", "comparison"]
+    }
+  ]
+}
diff --git a/src/cli/agent-evals/utils/assertions.test.ts b/src/cli/agent-evals/utils/assertions.test.ts
new file mode 100644
index 0000000..0396d4b
--- /dev/null
+++ b/src/cli/agent-evals/utils/assertions.test.ts
@@ -0,0 +1,214 @@
+import { describe, expect, it } from "vitest";
+
+import type { Assertion } from "../schemas";
+import { evaluateAssertion } from "./assertions";
+
+describe("evaluateAssertion", () => {
+  describe("contains", () => {
+    it("passes when output contains the value", () => {
+      const assertion: Assertion = {
+        type: "contains",
+        value: "hello",
+      };
+      const result = evaluateAssertion(assertion, { message: "hello world" });
+      expect(result.passed).toBe(true);
+      expect(result.message).toContain("contains");
+    });
+
+    it("fails when output does not contain the value", () => {
+      const assertion: Assertion = {
+        type: "contains",
+        value: "goodbye",
+      };
+      const result = evaluateAssertion(assertion, { message: "hello world" });
+      expect(result.passed).toBe(false);
+      expect(result.message).toContain("does not contain");
+    });
+
+    it("is case sensitive by default", () => {
+      const assertion: Assertion = {
+        type: "contains",
+        value: "HELLO",
+      };
+      const result = evaluateAssertion(assertion, "hello world");
+      expect(result.passed).toBe(false);
+    });
+
+    it("respects caseSensitive: false", () => {
+      const assertion: Assertion = {
+        type: "contains",
+        value: "HELLO",
+        caseSensitive: false,
+      };
+      const result = evaluateAssertion(assertion, "hello world");
+      expect(result.passed).toBe(true);
+    });
+
+    it("works with string output", () => {
+      const assertion: Assertion = {
+        type: "contains",
+        value: "test",
+      };
+      const result = evaluateAssertion(assertion, "this is a test string");
+      expect(result.passed).toBe(true);
+    });
+  });
+
+  describe("matchesRegex", () => {
+    it("passes when output matches pattern", () => {
+      const assertion: Assertion = {
+        type: "matchesRegex",
+        pattern: "\\d{3}-\\d{4}",
+      };
+      const result = evaluateAssertion(assertion, "Call 555-1234");
+      expect(result.passed).toBe(true);
+    });
+
+    it("fails when output does not match pattern", () => {
+      const assertion: Assertion = {
+        type: "matchesRegex",
+        pattern: "\\d{3}-\\d{4}",
+      };
+      const result = evaluateAssertion(assertion, "No number here");
+      expect(result.passed).toBe(false);
+    });
+
+    it("supports regex flags", () => {
+      const assertion: Assertion = {
+        type: "matchesRegex",
+        pattern: "hello",
+        flags: "i",
+      };
+      const result = evaluateAssertion(assertion, "HELLO WORLD");
+      expect(result.passed).toBe(true);
+    });
+
+    it("handles invalid regex gracefully", () => {
+      const assertion: Assertion = {
+        type: "matchesRegex",
+        pattern: "[invalid",
+      };
+      const result = evaluateAssertion(assertion, "test");
+      expect(result.passed).toBe(false);
+      expect(result.message).toContain("Invalid regex");
+    });
+  });
+
+  describe("equals", () => {
+    it("passes for equal primitive values", () => {
+      const assertion: Assertion = {
+        type: "equals",
+        expected: 42,
+      };
+      const result = evaluateAssertion(assertion, 42);
+      expect(result.passed).toBe(true);
+    });
+
+    it("fails for different primitive values", () => {
+      const assertion: Assertion = {
+        type: "equals",
+        expected: 42,
+      };
+      const result = evaluateAssertion(assertion, 43);
+      expect(result.passed).toBe(false);
+    });
+
+    it("passes for equal objects", () => {
+      const assertion: Assertion = {
+        type: "equals",
+        expected: { a: 1, b: 2 },
+      };
+      const result = evaluateAssertion(assertion, { a: 1, b: 2 });
+      expect(result.passed).toBe(true);
+    });
+
+    it("fails for different objects", () => {
+      const assertion: Assertion = {
+        type: "equals",
+        expected: { a: 1, b: 2 },
+      };
+      const result = evaluateAssertion(assertion, { a: 1, b: 3 });
+      expect(result.passed).toBe(false);
+    });
+
+    it("passes for equal strings", () => {
+      const assertion: Assertion = {
+        type: "equals",
+        expected: "hello",
+      };
+      const result = evaluateAssertion(assertion, "hello");
+      expect(result.passed).toBe(true);
+    });
+  });
+
+  describe("jsonPath", () => {
+    it("extracts and compares nested values", () => {
+      const assertion: Assertion = {
+        type: "jsonPath",
+        path: "response.status",
+        expected: "success",
+      };
+      const result = evaluateAssertion(assertion, {
+        response: { status: "success" },
+      });
+      expect(result.passed).toBe(true);
+    });
+
+    it("supports $. prefix in path", () => {
+      const assertion: Assertion = {
+        type: "jsonPath",
+        path: "$.response.status",
+        expected: "success",
+      };
+      const result = evaluateAssertion(assertion, {
+        response: { status: "success" },
+      });
+      expect(result.passed).toBe(true);
+    });
+
+    it("fails when path value does not match", () => {
+      const assertion: Assertion = {
+        type: "jsonPath",
+        path: "response.status",
+        expected: "success",
+      };
+      const result = evaluateAssertion(assertion, {
+        response: { status: "error" },
+      });
+      expect(result.passed).toBe(false);
+    });
+
+    it("fails for missing path", () => {
+      const assertion: Assertion = {
+        type: "jsonPath",
+        path: "missing.path",
+        expected: "value",
+      };
+      const result = evaluateAssertion(assertion, { other: "data" });
+      expect(result.passed).toBe(false);
+      expect(result.message).toContain("Failed to evaluate path");
+    });
+
+    it("handles deeply nested paths", () => {
+      const assertion: Assertion = {
+        type: "jsonPath",
+        path: "a.b.c.d",
+        expected: 123,
+      };
+      const result = evaluateAssertion(assertion, {
+        a: { b: { c: { d: 123 } } },
+      });
+      expect(result.passed).toBe(true);
+    });
+
+    it("compares arrays correctly", () => {
+      const assertion: Assertion = {
+        type: "jsonPath",
+        path: "items",
+        expected: [1, 2, 3],
+      };
+      const result = evaluateAssertion(assertion, { items: [1, 2, 3] });
+      expect(result.passed).toBe(true);
+    });
+  });
+});
diff --git a/src/cli/agent-evals/utils/assertions.ts b/src/cli/agent-evals/utils/assertions.ts
new file mode 100644
index 0000000..ca870e4
--- /dev/null
+++ b/src/cli/agent-evals/utils/assertions.ts
@@ -0,0 +1,164 @@
+import type {
+  Assertion,
+  AssertionResult,
+  ContainsAssertion,
+  EqualsAssertion,
+  JsonPathAssertion,
+  MatchesRegexAssertion,
+} from "../schemas";
+
+/**
+ * Evaluate a single assertion against the agent output.
+ */
+export const evaluateAssertion = (
+  assertion: Assertion,
+  output: unknown
+): AssertionResult => {
+  switch (assertion.type) {
+    case "contains":
+      return evaluateContainsAssertion(assertion, output);
+    case "matchesRegex":
+      return evaluateMatchesRegexAssertion(assertion, output);
+    case "equals":
+      return evaluateEqualsAssertion(assertion, output);
+    case "jsonPath":
+      return evaluateJsonPathAssertion(assertion, output);
+  }
+};
+
+const evaluateContainsAssertion = (
+  assertion: ContainsAssertion,
+  output: unknown
+): AssertionResult => {
+  const outputStr = stringifyOutput(output);
+  const caseSensitive = assertion.caseSensitive ?? true;
+  const searchValue = caseSensitive
+    ? assertion.value
+    : assertion.value.toLowerCase();
+  const searchIn = caseSensitive ? outputStr : outputStr.toLowerCase();
+  const passed = searchIn.includes(searchValue);
+
+  return {
+    assertion,
+    passed,
+    message: passed
+      ? `Output contains "${assertion.value}"`
+      : `Output does not contain "${assertion.value}"`,
+    actual: outputStr,
+    expected: assertion.value,
+  };
+};
+
+const evaluateMatchesRegexAssertion = (
+  assertion: MatchesRegexAssertion,
+  output: unknown
+): AssertionResult => {
+  const outputStr = stringifyOutput(output);
+
+  try {
+    const regex = new RegExp(assertion.pattern, assertion.flags);
+    const passed = regex.test(outputStr);
+
+    return {
+      assertion,
+      passed,
+      message: passed
+        ? `Output matches pattern /${assertion.pattern}/${assertion.flags ?? ""}`
+        : `Output does not match pattern /${assertion.pattern}/${assertion.flags ?? ""}`,
+      actual: outputStr,
+      expected: assertion.pattern,
+    };
+  } catch (err) {
+    return {
+      assertion,
+      passed: false,
+      message: `Invalid regex pattern: ${err instanceof Error ? err.message : String(err)}`,
+      actual: outputStr,
+      expected: assertion.pattern,
+    };
+  }
+};
+
+const evaluateEqualsAssertion = (
+  assertion: EqualsAssertion,
+  output: unknown
+): AssertionResult => {
+  const passed = deepEquals(output, assertion.expected);
+
+  return {
+    assertion,
+    passed,
+    message: passed
+      ? "Output equals expected value"
+      : "Output does not equal expected value",
+    actual: output,
+    expected: assertion.expected,
+  };
+};
+
+const evaluateJsonPathAssertion = (
+  assertion: JsonPathAssertion,
+  output: unknown
+): AssertionResult => {
+  try {
+    const value = getJsonPath(output, assertion.path);
+    const passed = deepEquals(value, assertion.expected);
+
+    return {
+      assertion,
+      passed,
+      message: passed
+        ? `Value at ${assertion.path} equals expected`
+        : `Value at ${assertion.path} does not equal expected`,
+      actual: value,
+      expected: assertion.expected,
+    };
+  } catch (err) {
+    return {
+      assertion,
+      passed: false,
+      message: `Failed to evaluate path ${assertion.path}: ${err instanceof Error ? err.message : String(err)}`,
+      actual: output,
+      expected: assertion.expected,
+    };
+  }
+};
+
+/**
+ * Convert output to string for text-based assertions.
+ */
+const stringifyOutput = (output: unknown): string => {
+  if (typeof output === "string") {
+    return output;
+  }
+  return JSON.stringify(output, null, 2);
+};
+
+/**
+ * Deep equality check using JSON serialization.
+ */
+const deepEquals = (a: unknown, b: unknown): boolean => {
+  return JSON.stringify(a) === JSON.stringify(b);
+};
+
+/**
+ * Simple JSON path getter supporting dot notation.
+ * Supports paths like "response.status" or "$.response.status"
+ */
+const getJsonPath = (obj: unknown, path: string): unknown => {
+  const normalizedPath = path.startsWith("$.") ? path.slice(2) : path;
+  const parts = normalizedPath.split(".");
+
+  let current: unknown = obj;
+  for (const part of parts) {
+    if (current === null || current === undefined) {
+      throw new Error(`Cannot read property "${part}" of ${String(current)}`);
+    }
+    if (typeof current !== "object") {
+      throw new Error(`Cannot read property "${part}" of non-object`);
+    }
+    current = (current as Record<string, unknown>)[part];
+  }
+
+  return current;
+};
diff --git a/src/clients/agent-runner.ts b/src/clients/agent-runner.ts
index ff8350c..46b0553 100644
--- a/src/clients/agent-runner.ts
+++ b/src/clients/agent-runner.ts
@@ -11,7 +11,8 @@ export type AgentRunnerConfig<TOutput> = {
   name: string;
   model: "gpt-5-mini";
   tools: Tool[];
-  outputType: ZodType<TOutput>;
+  /** Zod schema for structured output. Omit for plain text responses. */
+  outputType?: ZodType<TOutput>;
   instructions: string;
 
   // Logging config
@@ -65,7 +66,7 @@ export class AgentRunner<TOutput> {
       name: config.name,
       model: config.model,
       tools: config.tools,
-      outputType: config.outputType,
+      ...(config.outputType ? { outputType: config.outputType } : {}),
       instructions: config.instructions,
     });
 

From 01afdc931d9841a94743a56df9ee6beaf42db9e8 Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 13:06:46 +0200
Subject: [PATCH 03/14] test: add unit tests for parseArgs function with Zod
 validation

---
 src/utils/parse-args.test.ts | 44 ++++++++++++++++++++++++++++++++++++
 src/utils/parse-args.ts      | 10 ++++++--
 2 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 src/utils/parse-args.test.ts

diff --git a/src/utils/parse-args.test.ts b/src/utils/parse-args.test.ts
new file mode 100644
index 0000000..b5e394d
--- /dev/null
+++ b/src/utils/parse-args.test.ts
@@ -0,0 +1,44 @@
+import { describe, expect, it } from "vitest";
+import { z } from "zod";
+
+import { Logger } from "~clients/logger";
+import { parseArgs } from "~utils/parse-args";
+
+const TestSchema = z
+  .object({
+    suite: z.string().optional(),
+    all: z.coerce.boolean().default(false),
+  })
+  .refine((data) => data.suite ?? data.all, {
+    message: "Either --suite <name> or --all is required",
+  });
+
+describe("parseArgs", () => {
+  const logger = new Logger({
+    level: "error",
+    useColors: false,
+    useTimestamps: false,
+  });
+
+  it("parses args after a standalone double-dash separator", () => {
+    const args = parseArgs({
+      logger,
+      schema: TestSchema,
+      rawArgs: ["--", "--suite=example"],
+    });
+
+    expect(args.suite).toBe("example");
+    expect(args.all).toBe(false);
+  });
+
+  it("parses --all even when preceded by a double-dash separator", () => {
+    const args = parseArgs({
+      logger,
+      schema: TestSchema,
+      rawArgs: ["--", "--all"],
+    });
+
+    expect(args.suite).toBeUndefined();
+    expect(args.all).toBe(true);
+  });
+});
diff --git a/src/utils/parse-args.ts b/src/utils/parse-args.ts
index f97c2c6..b777282 100644
--- a/src/utils/parse-args.ts
+++ b/src/utils/parse-args.ts
@@ -1,12 +1,16 @@
 import type { Logger } from "~clients/logger";
 import type { z } from "zod";
-import { argv } from "zx";
+import { parseArgv } from "zx";
 
 export type ParseArgsOptions<T extends z.ZodType> = {
   logger: Logger;
   schema: T;
+  rawArgs?: string[];
 };
 
+const sanitizeArgs = (rawArgs: string[]): string[] =>
+  rawArgs.filter((arg) => arg !== "--");
+
 /**
  * Parses and validates CLI arguments using a Zod schema.
  * @param options - Logger and Zod schema for validation
@@ -16,9 +20,11 @@ export type ParseArgsOptions<T extends z.ZodType> = {
 export const parseArgs = <T extends z.ZodType>({
   logger,
   schema,
+  rawArgs,
 }: ParseArgsOptions<T>): z.infer<T> => {
   logger.debug("Parsing CLI arguments...");
-  const args = schema.parse(argv);
+  const parsedArgs = parseArgv(sanitizeArgs(rawArgs ?? process.argv.slice(2)));
+  const args = schema.parse(parsedArgs);
   logger.debug("Parsed args", { args });
   return args;
 };

From 4e16806d4d1cfc36df0480c8c224971b2520ec14 Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 13:51:18 +0200
Subject: [PATCH 04/14] feat: add file assertion types and enhance evaluation
 framework

- Implement fileExists, fileContains, and fileJsonPath assertions
- Update assertion evaluation to handle async file checks
- Create tool registry for managing file-related tools
---
 src/cli/agent-evals/README.md                 |  12 +-
 src/cli/agent-evals/clients/eval-runner.ts    |  22 +-
 .../agent-evals/clients/report-generator.ts   |   8 +-
 src/cli/agent-evals/clients/tool-registry.ts  |  35 +++
 src/cli/agent-evals/constants.ts              |   1 +
 src/cli/agent-evals/schemas.ts                |  32 +++
 src/cli/agent-evals/suites/tools.json         |  89 ++++++
 src/cli/agent-evals/utils/assertions.test.ts  | 259 +++++++++++++++---
 src/cli/agent-evals/utils/assertions.ts       |  16 +-
 src/cli/agent-evals/utils/file-assertions.ts  | 138 ++++++++++
 src/tools/list-files/list-files-tool.ts       |  11 +-
 src/utils/parse-args.test.ts                  |  86 +++++-
 12 files changed, 634 insertions(+), 75 deletions(-)
 create mode 100644 src/cli/agent-evals/clients/tool-registry.ts
 create mode 100644 src/cli/agent-evals/suites/tools.json
 create mode 100644 src/cli/agent-evals/utils/file-assertions.ts

diff --git a/src/cli/agent-evals/README.md b/src/cli/agent-evals/README.md
index 7ea7245..5c19b4a 100644
--- a/src/cli/agent-evals/README.md
+++ b/src/cli/agent-evals/README.md
@@ -20,14 +20,15 @@ pnpm run:agent-evals -- --suite=example --verbose --report=both
 - `--suite <name>`: Run a specific suite by name (without `.json` extension)
 - `--all`: Run all suites in the `suites/` directory
 - `--report <format>`: Report format: `json`, `md`, or `both` (default: `json`)
-- `--out <path>`: Output directory under `tmp/` (default: `agent-evals`)
+- `--out <path>`: Output base directory under `tmp/` (default: `agent-evals`)
 - `--verbose`: Enable verbose logging with assertion details
 
 Either `--suite` or `--all` is required.
 
 ## Output
 
-Reports are written to `tmp/agent-evals/`:
+Reports are written to `tmp/<out>/reports/` (default: `tmp/agent-evals/reports/`):
+
 - `report-{timestamp}.json`: Machine-readable results
 - `report-{timestamp}.md`: Human-readable markdown report
 
@@ -57,9 +58,7 @@ Add JSON files to `suites/` directory. Example structure:
       "id": "case-1",
       "name": "Test case name",
       "prompt": "User prompt to test",
-      "assertions": [
-        { "type": "contains", "value": "expected text" }
-      ]
+      "assertions": [{ "type": "contains", "value": "expected text" }]
     }
   ]
 }
@@ -68,16 +67,19 @@ Add JSON files to `suites/` directory. Example structure:
 ## Assertion Types
 
 - **contains**: Check if output contains a string
+
   ```json
   { "type": "contains", "value": "text", "caseSensitive": false }
   ```
 
 - **matchesRegex**: Check if output matches a regex pattern
+
   ```json
   { "type": "matchesRegex", "pattern": "\\d+", "flags": "i" }
   ```
 
 - **equals**: Deep equality check
+
   ```json
   { "type": "equals", "expected": { "key": "value" } }
   ```
diff --git a/src/cli/agent-evals/clients/eval-runner.ts b/src/cli/agent-evals/clients/eval-runner.ts
index 94f64e6..7205b24 100644
--- a/src/cli/agent-evals/clients/eval-runner.ts
+++ b/src/cli/agent-evals/clients/eval-runner.ts
@@ -17,6 +17,7 @@ import type {
   SuiteSummary,
 } from "../schemas";
 import { evaluateAssertion } from "../utils/assertions";
+import { createToolsFromNames } from "./tool-registry";
 
 export type EvalRunnerConfig = {
   logger: Logger;
@@ -151,7 +152,10 @@ export class EvalRunner {
       const output: unknown = result.finalOutput;
       const durationMs = Date.now() - startTime;
 
-      const assertionResults = this.runAssertions(evalCase.assertions, output);
+      const assertionResults = await this.runAssertions(
+        evalCase.assertions,
+        output
+      );
 
       const allAssertionsPassed = assertionResults.every((r) => r.passed);
       const status: CaseStatus = allAssertionsPassed ? "pass" : "fail";
@@ -183,13 +187,17 @@ export class EvalRunner {
 
   /**
    * Create an AgentRunner from suite's agent config.
-   * Omits outputType to get plain text responses (no structured output).
+   * Instantiates tools from the tool registry based on suite.agent.tools.
    */
   private createAgentRunner(suite: EvalSuite): AgentRunner<unknown> {
+    const tools = createToolsFromNames(suite.agent.tools, {
+      logger: this.logger,
+    });
+
     return new AgentRunner({
       name: suite.agent.name,
       model: suite.agent.model,
-      tools: [],
+      tools,
       instructions: suite.agent.instructions,
       logger: this.logger,
       logToolResults: this.verbose,
@@ -200,10 +208,12 @@ export class EvalRunner {
   /**
    * Run all assertions on the output.
    */
-  private runAssertions(
+  private async runAssertions(
     assertions: EvalCase["assertions"],
     output: unknown
-  ): AssertionResult[] {
-    return assertions.map((assertion) => evaluateAssertion(assertion, output));
+  ): Promise<AssertionResult[]> {
+    return Promise.all(
+      assertions.map((assertion) => evaluateAssertion(assertion, output))
+    );
   }
 }
diff --git a/src/cli/agent-evals/clients/report-generator.ts b/src/cli/agent-evals/clients/report-generator.ts
index ce4462e..5437cb7 100644
--- a/src/cli/agent-evals/clients/report-generator.ts
+++ b/src/cli/agent-evals/clients/report-generator.ts
@@ -6,6 +6,7 @@ import { resolveTmpPathForWrite } from "~tools/utils/fs";
 import {
   DECIMAL_PLACES,
   PERCENT_MULTIPLIER,
+  REPORTS_SUBDIR,
   STATUS_SYMBOLS,
 } from "../constants";
 import type { EvalReport, ReportSummary, SuiteResult } from "../schemas";
@@ -20,7 +21,8 @@ export type ReportGeneratorConfig = {
 
 /**
  * Generates evaluation reports in JSON and/or Markdown format.
- * Reports are written to the configured output directory under tmp/.
+ * Reports are written to the configured output directory under tmp/,
+ * inside a dedicated reports/ subfolder.
  */
 export class ReportGenerator {
   private logger: Logger;
@@ -86,7 +88,7 @@ export class ReportGenerator {
   private async writeJson(report: EvalReport): Promise<string> {
     const timestamp = this.getTimestamp();
     const filename = `report-${timestamp}.json`;
-    const relativePath = path.join(this.outputDir, filename);
+    const relativePath = path.join(this.outputDir, REPORTS_SUBDIR, filename);
     const fullPath = await resolveTmpPathForWrite(relativePath);
 
     await fs.writeFile(fullPath, JSON.stringify(report, null, 2), "utf8");
@@ -97,7 +99,7 @@ export class ReportGenerator {
   private async writeMarkdown(report: EvalReport): Promise<string> {
     const timestamp = this.getTimestamp();
     const filename = `report-${timestamp}.md`;
-    const relativePath = path.join(this.outputDir, filename);
+    const relativePath = path.join(this.outputDir, REPORTS_SUBDIR, filename);
     const fullPath = await resolveTmpPathForWrite(relativePath);
 
     const markdown = this.formatMarkdown(report);
diff --git a/src/cli/agent-evals/clients/tool-registry.ts b/src/cli/agent-evals/clients/tool-registry.ts
new file mode 100644
index 0000000..b4d0386
--- /dev/null
+++ b/src/cli/agent-evals/clients/tool-registry.ts
@@ -0,0 +1,35 @@
+import type { Tool } from "@openai/agents";
+import type { Logger } from "~clients/logger";
+import { createListFilesTool } from "~tools/list-files/list-files-tool";
+import { createReadFileTool } from "~tools/read-file/read-file-tool";
+import { createWriteFileTool } from "~tools/write-file/write-file-tool";
+
+export type ToolFactoryConfig = {
+  logger: Logger;
+};
+
+type ToolFactory = (config: ToolFactoryConfig) => Tool;
+
+const toolFactories: Record<string, ToolFactory> = {
+  readFile: ({ logger }) => createReadFileTool({ logger }),
+  writeFile: ({ logger }) => createWriteFileTool({ logger }),
+  listFiles: ({ logger }) => createListFilesTool({ logger }),
+};
+
+/**
+ * Creates tool instances from an array of tool names.
+ * Throws if an unknown tool name is provided.
+ */
+export const createToolsFromNames = (
+  names: string[],
+  config: ToolFactoryConfig
+): Tool[] => {
+  return names.map((name) => {
+    const factory = toolFactories[name];
+    if (!factory) {
+      const available = Object.keys(toolFactories).join(", ");
+      throw new Error(`Unknown tool: ${name}. Available: ${available}`);
+    }
+    return factory(config);
+  });
+};
diff --git a/src/cli/agent-evals/constants.ts b/src/cli/agent-evals/constants.ts
index 6e6216b..6a92493 100644
--- a/src/cli/agent-evals/constants.ts
+++ b/src/cli/agent-evals/constants.ts
@@ -4,6 +4,7 @@ import path from "node:path";
 export const DEFAULT_VERBOSE = false;
 export const DEFAULT_REPORT_FORMAT = "json" as const;
 export const DEFAULT_OUT_PATH = "agent-evals";
+export const REPORTS_SUBDIR = "reports";
 
 // Paths
 export const SUITES_DIR = path.join(
diff --git a/src/cli/agent-evals/schemas.ts b/src/cli/agent-evals/schemas.ts
index 5a12ab1..9b56a40 100644
--- a/src/cli/agent-evals/schemas.ts
+++ b/src/cli/agent-evals/schemas.ts
@@ -55,11 +55,40 @@ export const JsonPathAssertionSchema = z.object({
   description: z.string().optional(),
 });
 
+// ============================================
+// File Assertion Types (for verifying tool side effects)
+// ============================================
+
+export const FileExistsAssertionSchema = z.object({
+  type: z.literal("fileExists"),
+  path: z.string(),
+  description: z.string().optional(),
+});
+
+export const FileContainsAssertionSchema = z.object({
+  type: z.literal("fileContains"),
+  path: z.string(),
+  value: z.string(),
+  caseSensitive: z.boolean().optional(),
+  description: z.string().optional(),
+});
+
+export const FileJsonPathAssertionSchema = z.object({
+  type: z.literal("fileJsonPath"),
+  path: z.string(),
+  jsonPath: z.string(),
+  expected: z.unknown(),
+  description: z.string().optional(),
+});
+
 export const AssertionSchema = z.discriminatedUnion("type", [
   ContainsAssertionSchema,
   MatchesRegexAssertionSchema,
   EqualsAssertionSchema,
   JsonPathAssertionSchema,
+  FileExistsAssertionSchema,
+  FileContainsAssertionSchema,
+  FileJsonPathAssertionSchema,
 ]);
 
 export type Assertion = z.infer<typeof AssertionSchema>;
@@ -67,6 +96,9 @@ export type ContainsAssertion = z.infer<typeof ContainsAssertionSchema>;
 export type MatchesRegexAssertion = z.infer<typeof MatchesRegexAssertionSchema>;
 export type EqualsAssertion = z.infer<typeof EqualsAssertionSchema>;
 export type JsonPathAssertion = z.infer<typeof JsonPathAssertionSchema>;
+export type FileExistsAssertion = z.infer<typeof FileExistsAssertionSchema>;
+export type FileContainsAssertion = z.infer<typeof FileContainsAssertionSchema>;
+export type FileJsonPathAssertion = z.infer<typeof FileJsonPathAssertionSchema>;
 
 // ============================================
 // Eval Case
diff --git a/src/cli/agent-evals/suites/tools.json b/src/cli/agent-evals/suites/tools.json
new file mode 100644
index 0000000..0a765b4
--- /dev/null
+++ b/src/cli/agent-evals/suites/tools.json
@@ -0,0 +1,89 @@
+{
+  "name": "tools-suite",
+  "description": "Tests shared agent tools (readFile, writeFile, listFiles)",
+  "version": "1.0.0",
+  "agent": {
+    "name": "ToolTestAgent",
+    "model": "gpt-5-mini",
+    "instructions": "You are an assistant that tests file tools. Use the tools provided to complete tasks. After using a tool, report results concisely.",
+    "tools": ["readFile", "writeFile", "listFiles"],
+    "maxTurns": 3
+  },
+  "defaults": {
+    "timeout": 20000
+  },
+  "cases": [
+    {
+      "id": "write-file",
+      "name": "writeFile creates a file",
+      "prompt": "Write the text 'Hello World' to a file called 'agent-evals/tool-test-output.txt'",
+      "assertions": [
+        {
+          "type": "fileExists",
+          "path": "agent-evals/tool-test-output.txt",
+          "description": "File should be created"
+        },
+        {
+          "type": "fileContains",
+          "path": "agent-evals/tool-test-output.txt",
+          "value": "Hello World",
+          "description": "File should contain the written text"
+        }
+      ],
+      "tags": ["writeFile"]
+    },
+    {
+      "id": "read-file",
+      "name": "readFile reads file content",
+      "prompt": "Read the file 'agent-evals/tool-test-output.txt' and tell me its contents",
+      "assertions": [
+        {
+          "type": "contains",
+          "value": "Hello World",
+          "description": "Agent response should include the file contents"
+        }
+      ],
+      "tags": ["readFile"]
+    },
+    {
+      "id": "list-files",
+      "name": "listFiles shows directory contents",
+      "prompt": "List the files in the tmp/agent-evals directory",
+      "assertions": [
+        {
+          "type": "contains",
+          "value": "tool-test-output.txt",
+          "description": "Agent response should include the previously created file"
+        }
+      ],
+      "tags": ["listFiles"]
+    },
+    {
+      "id": "write-json",
+      "name": "writeFile with JSON content",
+      "prompt": "Write a JSON file called 'agent-evals/tool-test-data.json' with this exact content: {\"name\": \"test\", \"value\": 42}",
+      "assertions": [
+        {
+          "type": "fileExists",
+          "path": "agent-evals/tool-test-data.json",
+          "description": "JSON file should be created"
+        },
+        {
+          "type": "fileJsonPath",
+          "path": "agent-evals/tool-test-data.json",
+          "jsonPath": "$.name",
+          "expected": "test",
+          "description": "JSON name field should match"
+        },
+        {
+          "type": "fileJsonPath",
+          "path": "agent-evals/tool-test-data.json",
+          "jsonPath": "$.value",
+          "expected": 42,
+          "description": "JSON value field should match"
+        }
+      ],
+      "tags": ["writeFile", "json"]
+    }
+  ]
+}
diff --git a/src/cli/agent-evals/utils/assertions.test.ts b/src/cli/agent-evals/utils/assertions.test.ts
index 0396d4b..f57c86c 100644
--- a/src/cli/agent-evals/utils/assertions.test.ts
+++ b/src/cli/agent-evals/utils/assertions.test.ts
@@ -1,214 +1,391 @@
-import { describe, expect, it } from "vitest";
+import fs from "node:fs/promises";
+import path from "node:path";
+import { TMP_ROOT } from "~tools/utils/fs";
+import { afterAll, beforeAll, describe, expect, it } from "vitest";
 
 import type { Assertion } from "../schemas";
 import { evaluateAssertion } from "./assertions";
 
 describe("evaluateAssertion", () => {
   describe("contains", () => {
-    it("passes when output contains the value", () => {
+    it("passes when output contains the value", async () => {
       const assertion: Assertion = {
         type: "contains",
         value: "hello",
       };
-      const result = evaluateAssertion(assertion, { message: "hello world" });
+      const result = await evaluateAssertion(assertion, {
+        message: "hello world",
+      });
       expect(result.passed).toBe(true);
       expect(result.message).toContain("contains");
     });
 
-    it("fails when output does not contain the value", () => {
+    it("fails when output does not contain the value", async () => {
       const assertion: Assertion = {
         type: "contains",
         value: "goodbye",
       };
-      const result = evaluateAssertion(assertion, { message: "hello world" });
+      const result = await evaluateAssertion(assertion, {
+        message: "hello world",
+      });
       expect(result.passed).toBe(false);
       expect(result.message).toContain("does not contain");
     });
 
-    it("is case sensitive by default", () => {
+    it("is case sensitive by default", async () => {
       const assertion: Assertion = {
         type: "contains",
         value: "HELLO",
       };
-      const result = evaluateAssertion(assertion, "hello world");
+      const result = await evaluateAssertion(assertion, "hello world");
       expect(result.passed).toBe(false);
     });
 
-    it("respects caseSensitive: false", () => {
+    it("respects caseSensitive: false", async () => {
       const assertion: Assertion = {
         type: "contains",
         value: "HELLO",
         caseSensitive: false,
       };
-      const result = evaluateAssertion(assertion, "hello world");
+      const result = await evaluateAssertion(assertion, "hello world");
       expect(result.passed).toBe(true);
     });
 
-    it("works with string output", () => {
+    it("works with string output", async () => {
       const assertion: Assertion = {
         type: "contains",
         value: "test",
       };
-      const result = evaluateAssertion(assertion, "this is a test string");
+      const result = await evaluateAssertion(
+        assertion,
+        "this is a test string"
+      );
       expect(result.passed).toBe(true);
     });
   });
 
   describe("matchesRegex", () => {
-    it("passes when output matches pattern", () => {
+    it("passes when output matches pattern", async () => {
       const assertion: Assertion = {
         type: "matchesRegex",
         pattern: "\\d{3}-\\d{4}",
       };
-      const result = evaluateAssertion(assertion, "Call 555-1234");
+      const result = await evaluateAssertion(assertion, "Call 555-1234");
       expect(result.passed).toBe(true);
     });
 
-    it("fails when output does not match pattern", () => {
+    it("fails when output does not match pattern", async () => {
       const assertion: Assertion = {
         type: "matchesRegex",
         pattern: "\\d{3}-\\d{4}",
       };
-      const result = evaluateAssertion(assertion, "No number here");
+      const result = await evaluateAssertion(assertion, "No number here");
       expect(result.passed).toBe(false);
     });
 
-    it("supports regex flags", () => {
+    it("supports regex flags", async () => {
       const assertion: Assertion = {
         type: "matchesRegex",
         pattern: "hello",
         flags: "i",
       };
-      const result = evaluateAssertion(assertion, "HELLO WORLD");
+      const result = await evaluateAssertion(assertion, "HELLO WORLD");
       expect(result.passed).toBe(true);
     });
 
-    it("handles invalid regex gracefully", () => {
+    it("handles invalid regex gracefully", async () => {
       const assertion: Assertion = {
         type: "matchesRegex",
         pattern: "[invalid",
       };
-      const result = evaluateAssertion(assertion, "test");
+      const result = await evaluateAssertion(assertion, "test");
       expect(result.passed).toBe(false);
       expect(result.message).toContain("Invalid regex");
     });
   });
 
   describe("equals", () => {
-    it("passes for equal primitive values", () => {
+    it("passes for equal primitive values", async () => {
       const assertion: Assertion = {
         type: "equals",
         expected: 42,
       };
-      const result = evaluateAssertion(assertion, 42);
+      const result = await evaluateAssertion(assertion, 42);
       expect(result.passed).toBe(true);
     });
 
-    it("fails for different primitive values", () => {
+    it("fails for different primitive values", async () => {
       const assertion: Assertion = {
         type: "equals",
         expected: 42,
       };
-      const result = evaluateAssertion(assertion, 43);
+      const result = await evaluateAssertion(assertion, 43);
       expect(result.passed).toBe(false);
     });
 
-    it("passes for equal objects", () => {
+    it("passes for equal objects", async () => {
       const assertion: Assertion = {
         type: "equals",
         expected: { a: 1, b: 2 },
       };
-      const result = evaluateAssertion(assertion, { a: 1, b: 2 });
+      const result = await evaluateAssertion(assertion, { a: 1, b: 2 });
       expect(result.passed).toBe(true);
     });
 
-    it("fails for different objects", () => {
+    it("fails for different objects", async () => {
       const assertion: Assertion = {
         type: "equals",
         expected: { a: 1, b: 2 },
       };
-      const result = evaluateAssertion(assertion, { a: 1, b: 3 });
+      const result = await evaluateAssertion(assertion, { a: 1, b: 3 });
       expect(result.passed).toBe(false);
     });
 
-    it("passes for equal strings", () => {
+    it("passes for equal strings", async () => {
       const assertion: Assertion = {
         type: "equals",
         expected: "hello",
       };
-      const result = evaluateAssertion(assertion, "hello");
+      const result = await evaluateAssertion(assertion, "hello");
       expect(result.passed).toBe(true);
     });
   });
 
   describe("jsonPath", () => {
-    it("extracts and compares nested values", () => {
+    it("extracts and compares nested values", async () => {
       const assertion: Assertion = {
         type: "jsonPath",
         path: "response.status",
         expected: "success",
       };
-      const result = evaluateAssertion(assertion, {
+      const result = await evaluateAssertion(assertion, {
         response: { status: "success" },
       });
       expect(result.passed).toBe(true);
     });
 
-    it("supports $. prefix in path", () => {
+    it("supports $. prefix in path", async () => {
       const assertion: Assertion = {
         type: "jsonPath",
         path: "$.response.status",
         expected: "success",
       };
-      const result = evaluateAssertion(assertion, {
+      const result = await evaluateAssertion(assertion, {
         response: { status: "success" },
       });
       expect(result.passed).toBe(true);
     });
 
-    it("fails when path value does not match", () => {
+    it("fails when path value does not match", async () => {
       const assertion: Assertion = {
         type: "jsonPath",
         path: "response.status",
         expected: "success",
       };
-      const result = evaluateAssertion(assertion, {
+      const result = await evaluateAssertion(assertion, {
         response: { status: "error" },
       });
       expect(result.passed).toBe(false);
     });
 
-    it("fails for missing path", () => {
+    it("fails for missing path", async () => {
       const assertion: Assertion = {
         type: "jsonPath",
         path: "missing.path",
         expected: "value",
       };
-      const result = evaluateAssertion(assertion, { other: "data" });
+      const result = await evaluateAssertion(assertion, { other: "data" });
       expect(result.passed).toBe(false);
       expect(result.message).toContain("Failed to evaluate path");
     });
 
-    it("handles deeply nested paths", () => {
+    it("handles deeply nested paths", async () => {
       const assertion: Assertion = {
         type: "jsonPath",
         path: "a.b.c.d",
         expected: 123,
       };
-      const result = evaluateAssertion(assertion, {
+      const result = await evaluateAssertion(assertion, {
         a: { b: { c: { d: 123 } } },
       });
       expect(result.passed).toBe(true);
     });
 
-    it("compares arrays correctly", () => {
+    it("compares arrays correctly", async () => {
       const assertion: Assertion = {
         type: "jsonPath",
         path: "items",
         expected: [1, 2, 3],
       };
-      const result = evaluateAssertion(assertion, { items: [1, 2, 3] });
+      const result = await evaluateAssertion(assertion, { items: [1, 2, 3] });
       expect(result.passed).toBe(true);
     });
   });
+
+  describe("file assertions", () => {
+    const TEST_DIR = path.join(TMP_ROOT, "assertion-tests"); // fixture directory under TMP_ROOT; assertion paths below are given relative to TMP_ROOT
+    const TEST_FILE = path.join(TEST_DIR, "test-file.txt");
+    const TEST_JSON_FILE = path.join(TEST_DIR, "test-data.json");
+
+    beforeAll(async () => {
+      await fs.mkdir(TEST_DIR, { recursive: true });
+      await fs.writeFile(TEST_FILE, "Hello World\nThis is test content.");
+      await fs.writeFile(
+        TEST_JSON_FILE,
+        JSON.stringify({ name: "test", value: 42, nested: { key: "value" } })
+      );
+    });
+
+    afterAll(async () => {
+      await fs.rm(TEST_DIR, { recursive: true, force: true }); // remove the fixture directory once the suite has finished
+    });
+
+    describe("fileExists", () => {
+      it("passes when file exists", async () => {
+        const assertion: Assertion = {
+          type: "fileExists",
+          path: "assertion-tests/test-file.txt",
+        };
+        const result = await evaluateAssertion(assertion, null); // file assertions are dispatched with the assertion only, so the output argument is unused (null here)
+        expect(result.passed).toBe(true);
+        expect(result.message).toContain("exists");
+      });
+
+      it("fails when file does not exist", async () => {
+        const assertion: Assertion = {
+          type: "fileExists",
+          path: "assertion-tests/nonexistent.txt",
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(false);
+        expect(result.message).toContain("does not exist");
+      });
+    });
+
+    describe("fileContains", () => {
+      it("passes when file contains the value", async () => {
+        const assertion: Assertion = {
+          type: "fileContains",
+          path: "assertion-tests/test-file.txt",
+          value: "Hello World",
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(true);
+        expect(result.message).toContain("contains");
+      });
+
+      it("fails when file does not contain the value", async () => {
+        const assertion: Assertion = {
+          type: "fileContains",
+          path: "assertion-tests/test-file.txt",
+          value: "Goodbye",
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(false);
+        expect(result.message).toContain("does not contain");
+      });
+
+      it("is case sensitive by default", async () => {
+        const assertion: Assertion = {
+          type: "fileContains",
+          path: "assertion-tests/test-file.txt",
+          value: "HELLO WORLD",
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(false);
+      });
+
+      it("respects caseSensitive: false", async () => {
+        const assertion: Assertion = {
+          type: "fileContains",
+          path: "assertion-tests/test-file.txt",
+          value: "HELLO WORLD",
+          caseSensitive: false,
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(true);
+      });
+
+      it("fails gracefully when file does not exist", async () => {
+        const assertion: Assertion = {
+          type: "fileContains",
+          path: "assertion-tests/nonexistent.txt",
+          value: "test",
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(false);
+        expect(result.message).toContain("Failed to read file");
+      });
+    });
+
+    describe("fileJsonPath", () => {
+      it("extracts and compares JSON values", async () => {
+        const assertion: Assertion = {
+          type: "fileJsonPath",
+          path: "assertion-tests/test-data.json",
+          jsonPath: "name",
+          expected: "test",
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(true);
+      });
+
+      it("supports $. prefix in jsonPath", async () => {
+        const assertion: Assertion = {
+          type: "fileJsonPath",
+          path: "assertion-tests/test-data.json",
+          jsonPath: "$.value",
+          expected: 42,
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(true);
+      });
+
+      it("handles nested paths", async () => {
+        const assertion: Assertion = {
+          type: "fileJsonPath",
+          path: "assertion-tests/test-data.json",
+          jsonPath: "nested.key",
+          expected: "value",
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(true);
+      });
+
+      it("fails when value does not match", async () => {
+        const assertion: Assertion = {
+          type: "fileJsonPath",
+          path: "assertion-tests/test-data.json",
+          jsonPath: "value",
+          expected: 100,
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(false);
+      });
+
+      it("fails gracefully for missing path", async () => {
+        const assertion: Assertion = {
+          type: "fileJsonPath",
+          path: "assertion-tests/test-data.json",
+          jsonPath: "missing.path", // "missing" resolves to undefined, so reading "path" from it throws inside the evaluator
+          expected: "value",
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(false);
+        expect(result.message).toContain("Failed to evaluate");
+      });
+
+      it("fails gracefully when file does not exist", async () => {
+        const assertion: Assertion = {
+          type: "fileJsonPath",
+          path: "assertion-tests/nonexistent.json",
+          jsonPath: "key",
+          expected: "value",
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(false);
+        expect(result.message).toContain("Failed to evaluate");
+      });
+    });
+  });
 });
diff --git a/src/cli/agent-evals/utils/assertions.ts b/src/cli/agent-evals/utils/assertions.ts
index ca870e4..bdc50d1 100644
--- a/src/cli/agent-evals/utils/assertions.ts
+++ b/src/cli/agent-evals/utils/assertions.ts
@@ -6,14 +6,20 @@ import type {
   JsonPathAssertion,
   MatchesRegexAssertion,
 } from "../schemas";
+import {
+  evaluateFileContainsAssertion,
+  evaluateFileExistsAssertion,
+  evaluateFileJsonPathAssertion,
+} from "./file-assertions";
 
 /**
  * Evaluate a single assertion against the agent output.
+ * File assertions are async (require filesystem access).
  */
-export const evaluateAssertion = (
+export const evaluateAssertion = async (
   assertion: Assertion,
   output: unknown
-): AssertionResult => {
+): Promise<AssertionResult> => {
   switch (assertion.type) {
     case "contains":
       return evaluateContainsAssertion(assertion, output);
@@ -23,6 +29,12 @@ export const evaluateAssertion = (
       return evaluateEqualsAssertion(assertion, output);
     case "jsonPath":
       return evaluateJsonPathAssertion(assertion, output);
+    case "fileExists":
+      return evaluateFileExistsAssertion(assertion);
+    case "fileContains":
+      return evaluateFileContainsAssertion(assertion);
+    case "fileJsonPath":
+      return evaluateFileJsonPathAssertion(assertion);
   }
 };
 
diff --git a/src/cli/agent-evals/utils/file-assertions.ts b/src/cli/agent-evals/utils/file-assertions.ts
new file mode 100644
index 0000000..be8f360
--- /dev/null
+++ b/src/cli/agent-evals/utils/file-assertions.ts
@@ -0,0 +1,138 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { TMP_ROOT } from "~tools/utils/fs";
+
+import type {
+  AssertionResult,
+  FileContainsAssertion,
+  FileExistsAssertion,
+  FileJsonPathAssertion,
+} from "../schemas";
+
+/**
+ * Evaluate a fileExists assertion: passes when fs.access succeeds on TMP_ROOT joined with assertion.path; any rejection yields a failed result.
+ */
+export const evaluateFileExistsAssertion = async (
+  assertion: FileExistsAssertion
+): Promise<AssertionResult> => {
+  const fullPath = path.join(TMP_ROOT, assertion.path); // NOTE(review): path.join does not stop "../" segments from leaving TMP_ROOT — confirm suite paths are trusted
+
+  try {
+    await fs.access(fullPath); // no mode argument: only checks that the path is visible, so a directory passes as well
+    return {
+      assertion,
+      passed: true,
+      message: `File exists: ${assertion.path}`,
+      actual: assertion.path,
+      expected: "file to exist",
+    };
+  } catch { // every access error (not only a missing file) is reported as "does not exist"
+    return {
+      assertion,
+      passed: false,
+      message: `File does not exist: ${assertion.path}`,
+      actual: "file not found",
+      expected: "file to exist",
+    };
+  }
+};
+
+/**
+ * Evaluate a fileContains assertion: read TMP_ROOT joined with assertion.path as UTF-8 and check that it includes assertion.value.
+ */
+export const evaluateFileContainsAssertion = async (
+  assertion: FileContainsAssertion
+): Promise<AssertionResult> => {
+  const fullPath = path.join(TMP_ROOT, assertion.path); // NOTE(review): "../" segments are not rejected here — confirm suite paths are trusted
+
+  try {
+    const content = await fs.readFile(fullPath, "utf8");
+    const caseSensitive = assertion.caseSensitive ?? true; // case-sensitive unless the assertion explicitly sets false
+    const searchValue = caseSensitive
+      ? assertion.value
+      : assertion.value.toLowerCase();
+    const searchIn = caseSensitive ? content : content.toLowerCase(); // both sides are lowercased for a case-insensitive match
+    const passed = searchIn.includes(searchValue);
+
+    return {
+      assertion,
+      passed,
+      message: passed
+        ? `File contains "${assertion.value}"`
+        : `File does not contain "${assertion.value}"`,
+      actual: content.length > 500 ? `${content.slice(0, 500)}...` : content, // only the first 500 characters are reported for long files
+      expected: assertion.value,
+    };
+  } catch (err) { // read failures (e.g. a missing file) become a failed result instead of propagating
+    return {
+      assertion,
+      passed: false,
+      message: `Failed to read file: ${err instanceof Error ? err.message : String(err)}`,
+      actual: "file read error",
+      expected: assertion.value,
+    };
+  }
+};
+
+/**
+ * Evaluate a fileJsonPath assertion: parse TMP_ROOT joined with assertion.path as JSON and compare the value at assertion.jsonPath with assertion.expected.
+ */
+export const evaluateFileJsonPathAssertion = async (
+  assertion: FileJsonPathAssertion
+): Promise<AssertionResult> => {
+  const fullPath = path.join(TMP_ROOT, assertion.path); // NOTE(review): "../" segments are not rejected here — confirm suite paths are trusted
+
+  try {
+    const content = await fs.readFile(fullPath, "utf8");
+    const json = JSON.parse(content) as unknown; // invalid JSON throws and is reported by the catch below
+    const value = getJsonPath(json, assertion.jsonPath); // throws when a segment has to be read from null, undefined or a non-object
+    const passed = deepEquals(value, assertion.expected); // comparison is done on the JSON serialization of both sides
+
+    return {
+      assertion,
+      passed,
+      message: passed
+        ? `Value at ${assertion.jsonPath} equals expected`
+        : `Value at ${assertion.jsonPath} does not equal expected`,
+      actual: value,
+      expected: assertion.expected,
+    };
+  } catch (err) { // read, parse and path-lookup errors all surface as the same kind of failed result
+    return {
+      assertion,
+      passed: false,
+      message: `Failed to evaluate: ${err instanceof Error ? err.message : String(err)}`,
+      actual: "evaluation error",
+      expected: assertion.expected,
+    };
+  }
+};
+
+/**
+ * Deep equality check using JSON serialization.
+ */
+const deepEquals = (a: unknown, b: unknown): boolean => {
+  return JSON.stringify(a) === JSON.stringify(b); // NOTE(review): serialization follows key order, so objects with the same keys in a different order compare unequal
+};
+
+/**
+ * Walk `obj` along a dot-separated path (an optional leading "$." is stripped), e.g. "name" or "$.response.status".
+ * Throws when a segment must be read from null, undefined or a non-object; a missing final key yields undefined.
+ */
+const getJsonPath = (obj: unknown, pathStr: string): unknown => {
+  const normalizedPath = pathStr.startsWith("$.") ? pathStr.slice(2) : pathStr;
+  const parts = normalizedPath.split("."); // each segment is used as a plain property key; there is no bracket syntax
+
+  let current: unknown = obj;
+  for (const part of parts) {
+    if (current === null || current === undefined) {
+      throw new Error(`Cannot read property "${part}" of ${String(current)}`);
+    }
+    if (typeof current !== "object") {
+      throw new Error(`Cannot read property "${part}" of non-object`);
+    }
+    current = (current as Record<string, unknown>)[part]; // may be undefined; the next iteration (if any) rejects it
+  }
+
+  return current;
+};
diff --git a/src/tools/list-files/list-files-tool.ts b/src/tools/list-files/list-files-tool.ts
index e11bddf..d0a08cc 100644
--- a/src/tools/list-files/list-files-tool.ts
+++ b/src/tools/list-files/list-files-tool.ts
@@ -19,15 +19,16 @@ export const createListFilesTool = ({ logger }: ListFilesToolOptions) =>
         path: {
           type: "string",
           description:
-            "Relative path within the repo tmp directory (optional, defaults to tmp root)",
+            'Relative path within the repo tmp directory. Use empty string "" to list tmp root.',
         },
       },
-      required: [],
+      required: ["path"],
       additionalProperties: false,
     },
-    execute: async ({ path: dirPath }: { path?: string }) => {
-      logger.tool("Listing files", { path: dirPath ?? "tmp root" });
-      const targetPath = await resolveTmpPathForList(dirPath);
+    execute: async ({ path: dirPath }: { path: string }) => {
+      const effectivePath = dirPath || undefined;
+      logger.tool("Listing files", { path: effectivePath ?? "tmp root" });
+      const targetPath = await resolveTmpPathForList(effectivePath);
 
       const entries = await fs.readdir(targetPath, { withFileTypes: true });
       const lines = entries.map((entry) => {
diff --git a/src/utils/parse-args.test.ts b/src/utils/parse-args.test.ts
index b5e394d..460be98 100644
--- a/src/utils/parse-args.test.ts
+++ b/src/utils/parse-args.test.ts
@@ -1,17 +1,8 @@
-import { describe, expect, it } from "vitest";
-import { z } from "zod";
-
 import { Logger } from "~clients/logger";
 import { parseArgs } from "~utils/parse-args";
+import { describe, expect, it } from "vitest";
 
-const TestSchema = z
-  .object({
-    suite: z.string().optional(),
-    all: z.coerce.boolean().default(false),
-  })
-  .refine((data) => data.suite ?? data.all, {
-    message: "Either --suite <name> or --all is required",
-  });
+import { CliArgsSchema } from "../cli/agent-evals/schemas";
 
 describe("parseArgs", () => {
   const logger = new Logger({
@@ -23,22 +14,91 @@ describe("parseArgs", () => {
   it("parses args after a standalone double-dash separator", () => {
     const args = parseArgs({
       logger,
-      schema: TestSchema,
+      schema: CliArgsSchema,
       rawArgs: ["--", "--suite=example"],
     });
 
     expect(args.suite).toBe("example");
     expect(args.all).toBe(false);
+    expect(args.report).toBe("json");
+    expect(args.out).toBe("agent-evals");
+    expect(args.verbose).toBe(false);
   });
 
   it("parses --all even when preceded by a double-dash separator", () => {
     const args = parseArgs({
       logger,
-      schema: TestSchema,
+      schema: CliArgsSchema,
       rawArgs: ["--", "--all"],
     });
 
     expect(args.suite).toBeUndefined();
     expect(args.all).toBe(true);
+    expect(args.report).toBe("json");
+    expect(args.out).toBe("agent-evals");
+    expect(args.verbose).toBe(false);
+  });
+
+  it("parses --report with valid enum values", () => {
+    const argsJson = parseArgs({
+      logger,
+      schema: CliArgsSchema,
+      rawArgs: ["--all", "--report=json"],
+    });
+    expect(argsJson.report).toBe("json");
+
+    const argsMd = parseArgs({
+      logger,
+      schema: CliArgsSchema,
+      rawArgs: ["--all", "--report=md"],
+    });
+    expect(argsMd.report).toBe("md");
+
+    const argsBoth = parseArgs({
+      logger,
+      schema: CliArgsSchema,
+      rawArgs: ["--all", "--report=both"],
+    });
+    expect(argsBoth.report).toBe("both");
+  });
+
+  it("parses --out with custom path", () => {
+    const args = parseArgs({
+      logger,
+      schema: CliArgsSchema,
+      rawArgs: ["--all", "--out=custom/output/path"],
+    });
+
+    expect(args.out).toBe("custom/output/path");
+  });
+
+  it("parses --verbose flag", () => {
+    const args = parseArgs({
+      logger,
+      schema: CliArgsSchema,
+      rawArgs: ["--all", "--verbose"],
+    });
+
+    expect(args.verbose).toBe(true);
+  });
+
+  it("throws on invalid --report value", () => {
+    expect(() =>
+      parseArgs({
+        logger,
+        schema: CliArgsSchema,
+        rawArgs: ["--all", "--report=invalid"],
+      })
+    ).toThrow();
+  });
+
+  it("throws when neither --suite nor --all is provided", () => {
+    expect(() =>
+      parseArgs({
+        logger,
+        schema: CliArgsSchema,
+        rawArgs: [],
+      })
+    ).toThrow("Either --suite <name> or --all is required");
   });
 });

From 7a1e10d8ed3ccf9740c3a5806fe16a2c69670da3 Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 14:19:07 +0200
Subject: [PATCH 05/14] docs: update README with new suite field notes and file
 assertion types

---
 src/cli/agent-evals/README.md | 45 +++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/src/cli/agent-evals/README.md b/src/cli/agent-evals/README.md
index 5c19b4a..bd1cf4a 100644
--- a/src/cli/agent-evals/README.md
+++ b/src/cli/agent-evals/README.md
@@ -21,7 +21,7 @@ pnpm run:agent-evals -- --suite=example --verbose --report=both
 - `--all`: Run all suites in the `suites/` directory
 - `--report <format>`: Report format: `json`, `md`, or `both` (default: `json`)
 - `--out <path>`: Output base directory under `tmp/` (default: `agent-evals`)
-- `--verbose`: Enable verbose logging with assertion details
+- `--verbose`: Enable verbose logging with per-assertion failure details
 
 Either `--suite` or `--all` is required.
 
@@ -64,6 +64,15 @@ Add JSON files to `suites/` directory. Example structure:
 }
 ```
 
+### Suite Field Notes
+
+- `agent.model` is currently fixed to `gpt-5-mini`.
+- `agent.tools` accepts tool names from the registry: `readFile`, `writeFile`, `listFiles`.
+- `agent.maxTurns` defaults to `5` if omitted.
+- `defaults.timeout` applies per-case when the case does not provide `timeout`.
+- `cases[].timeout` defaults to `defaults.timeout`, then `30000` (ms).
+- `cases[].tags` is optional metadata for filtering/labeling (not used at runtime yet).
+
 ## Assertion Types
 
 - **contains**: Check if output contains a string
@@ -84,11 +93,43 @@ Add JSON files to `suites/` directory. Example structure:
   { "type": "equals", "expected": { "key": "value" } }
   ```
 
-- **jsonPath**: Extract and compare nested values
+- **jsonPath**: Extract and compare nested values (dot notation like `response.status` or `$.response.status`)
   ```json
   { "type": "jsonPath", "path": "$.response.status", "expected": "success" }
   ```
 
+### File Assertions (tmp/ only)
+
+These assertions read files under `tmp/` to verify tool side effects. Paths are relative to `tmp/`.
+
+- **fileExists**: Check that a file exists
+
+  ```json
+  { "type": "fileExists", "path": "agent-evals/tool-test-output.txt" }
+  ```
+
+- **fileContains**: Check that a file contains a string
+
+  ```json
+  {
+    "type": "fileContains",
+    "path": "agent-evals/tool-test-output.txt",
+    "value": "Hello World",
+    "caseSensitive": false
+  }
+  ```
+
+- **fileJsonPath**: Read a JSON file and compare a path (dot notation like `$.name`)
+
+  ```json
+  {
+    "type": "fileJsonPath",
+    "path": "agent-evals/tool-test-data.json",
+    "jsonPath": "$.name",
+    "expected": "test"
+  }
+  ```
+
 ## Flowchart
 
 ```mermaid

From d3d07b03ad80c7d247718a99fc3ce77f1ff379d2 Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 14:35:39 +0200
Subject: [PATCH 06/14] feat: add delete file tool and related assertions to
 evaluation framework

- Implement delete file tool with path validation
- Add fileNotExists assertion schema and evaluation logic
- Update tools suite to include delete file tests
---
 src/cli/agent-evals/clients/tool-registry.ts  |  2 +
 src/cli/agent-evals/schemas.ts                | 10 ++
 src/cli/agent-evals/suites/tools.json         | 20 +++-
 src/cli/agent-evals/utils/assertions.ts       |  3 +
 src/cli/agent-evals/utils/file-assertions.ts  | 29 ++++++
 .../delete-file/delete-file-tool.test.ts      | 92 +++++++++++++++++++
 src/tools/delete-file/delete-file-tool.ts     | 36 ++++++++
 src/tools/utils/fs.ts                         | 28 ++++++
 8 files changed, 219 insertions(+), 1 deletion(-)
 create mode 100644 src/tools/delete-file/delete-file-tool.test.ts
 create mode 100644 src/tools/delete-file/delete-file-tool.ts

diff --git a/src/cli/agent-evals/clients/tool-registry.ts b/src/cli/agent-evals/clients/tool-registry.ts
index b4d0386..73e1fe0 100644
--- a/src/cli/agent-evals/clients/tool-registry.ts
+++ b/src/cli/agent-evals/clients/tool-registry.ts
@@ -1,5 +1,6 @@
 import type { Tool } from "@openai/agents";
 import type { Logger } from "~clients/logger";
+import { createDeleteFileTool } from "~tools/delete-file/delete-file-tool";
 import { createListFilesTool } from "~tools/list-files/list-files-tool";
 import { createReadFileTool } from "~tools/read-file/read-file-tool";
 import { createWriteFileTool } from "~tools/write-file/write-file-tool";
@@ -14,6 +15,7 @@ const toolFactories: Record<string, ToolFactory> = {
   readFile: ({ logger }) => createReadFileTool({ logger }),
   writeFile: ({ logger }) => createWriteFileTool({ logger }),
   listFiles: ({ logger }) => createListFilesTool({ logger }),
+  deleteFile: ({ logger }) => createDeleteFileTool({ logger }),
 };
 
 /**
diff --git a/src/cli/agent-evals/schemas.ts b/src/cli/agent-evals/schemas.ts
index 9b56a40..501c37f 100644
--- a/src/cli/agent-evals/schemas.ts
+++ b/src/cli/agent-evals/schemas.ts
@@ -81,6 +81,12 @@ export const FileJsonPathAssertionSchema = z.object({
   description: z.string().optional(),
 });
 
+export const FileNotExistsAssertionSchema = z.object({
+  type: z.literal("fileNotExists"), // discriminator value matched by AssertionSchema's discriminated union on "type"
+  path: z.string(), // NOTE(review): presumably relative to tmp/ like the other file assertions — confirm against the evaluator
+  description: z.string().optional(),
+});
+
 export const AssertionSchema = z.discriminatedUnion("type", [
   ContainsAssertionSchema,
   MatchesRegexAssertionSchema,
@@ -89,6 +95,7 @@ export const AssertionSchema = z.discriminatedUnion("type", [
   FileExistsAssertionSchema,
   FileContainsAssertionSchema,
   FileJsonPathAssertionSchema,
+  FileNotExistsAssertionSchema,
 ]);
 
 export type Assertion = z.infer<typeof AssertionSchema>;
@@ -99,6 +106,9 @@ export type JsonPathAssertion = z.infer<typeof JsonPathAssertionSchema>;
 export type FileExistsAssertion = z.infer<typeof FileExistsAssertionSchema>;
 export type FileContainsAssertion = z.infer<typeof FileContainsAssertionSchema>;
 export type FileJsonPathAssertion = z.infer<typeof FileJsonPathAssertionSchema>;
+export type FileNotExistsAssertion = z.infer<
+  typeof FileNotExistsAssertionSchema
+>;
 
 // ============================================
 // Eval Case
diff --git a/src/cli/agent-evals/suites/tools.json b/src/cli/agent-evals/suites/tools.json
index 0a765b4..ca26336 100644
--- a/src/cli/agent-evals/suites/tools.json
+++ b/src/cli/agent-evals/suites/tools.json
@@ -6,7 +6,7 @@
     "name": "ToolTestAgent",
     "model": "gpt-5-mini",
     "instructions": "You are an assistant that tests file tools. Use the tools provided to complete tasks. After using a tool, report results concisely.",
-    "tools": ["readFile", "writeFile", "listFiles"],
+    "tools": ["readFile", "writeFile", "listFiles", "deleteFile"],
     "maxTurns": 3
   },
   "defaults": {
@@ -84,6 +84,24 @@
         }
       ],
       "tags": ["writeFile", "json"]
+    },
+    {
+      "id": "delete-file",
+      "name": "deleteFile removes a file",
+      "prompt": "Delete the file 'agent-evals/tool-test-data.json'",
+      "assertions": [
+        {
+          "type": "fileNotExists",
+          "path": "agent-evals/tool-test-data.json",
+          "description": "File should be deleted"
+        },
+        {
+          "type": "contains",
+          "value": "Deleted",
+          "description": "Agent should confirm deletion"
+        }
+      ],
+      "tags": ["deleteFile"]
     }
   ]
 }
diff --git a/src/cli/agent-evals/utils/assertions.ts b/src/cli/agent-evals/utils/assertions.ts
index bdc50d1..5a2362b 100644
--- a/src/cli/agent-evals/utils/assertions.ts
+++ b/src/cli/agent-evals/utils/assertions.ts
@@ -10,6 +10,7 @@ import {
   evaluateFileContainsAssertion,
   evaluateFileExistsAssertion,
   evaluateFileJsonPathAssertion,
+  evaluateFileNotExistsAssertion,
 } from "./file-assertions";
 
 /**
@@ -35,6 +36,8 @@ export const evaluateAssertion = async (
       return evaluateFileContainsAssertion(assertion);
     case "fileJsonPath":
       return evaluateFileJsonPathAssertion(assertion);
+    case "fileNotExists":
+      return evaluateFileNotExistsAssertion(assertion);
   }
 };
 
diff --git a/src/cli/agent-evals/utils/file-assertions.ts b/src/cli/agent-evals/utils/file-assertions.ts
index be8f360..e5fd889 100644
--- a/src/cli/agent-evals/utils/file-assertions.ts
+++ b/src/cli/agent-evals/utils/file-assertions.ts
@@ -7,6 +7,7 @@ import type {
   FileContainsAssertion,
   FileExistsAssertion,
   FileJsonPathAssertion,
+  FileNotExistsAssertion,
 } from "../schemas";
 
 /**
@@ -108,6 +109,34 @@ export const evaluateFileJsonPathAssertion = async (
   }
 };
 
+/**
+ * Evaluate a fileNotExists assertion by checking if the file does not exist in tmp/.
+ */
+export const evaluateFileNotExistsAssertion = async (
+  assertion: FileNotExistsAssertion
+): Promise<AssertionResult> => {
+  const fullPath = path.join(TMP_ROOT, assertion.path);
+
+  try {
+    await fs.access(fullPath);
+    return {
+      assertion,
+      passed: false,
+      message: `File still exists: ${assertion.path}`,
+      actual: "file exists",
+      expected: "file to not exist",
+    };
+  } catch {
+    return {
+      assertion,
+      passed: true,
+      message: `File does not exist: ${assertion.path}`,
+      actual: "file not found",
+      expected: "file to not exist",
+    };
+  }
+};
+
 /**
  * Deep equality check using JSON serialization.
  */
diff --git a/src/tools/delete-file/delete-file-tool.test.ts b/src/tools/delete-file/delete-file-tool.test.ts
new file mode 100644
index 0000000..80a7bb9
--- /dev/null
+++ b/src/tools/delete-file/delete-file-tool.test.ts
@@ -0,0 +1,92 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { TMP_ROOT } from "~tools/utils/fs";
+import { invokeTool, tryCreateSymlink } from "~tools/utils/test-utils";
+import { afterEach, beforeEach, describe, expect, it } from "vitest";
+
+import { createDeleteFileTool } from "./delete-file-tool";
+
+describe("createDeleteFileTool tmp path safety", () => {
+  let testDir = "";
+  let relativeDir = "";
+  // eslint-disable-next-line @typescript-eslint/no-empty-function
+  const mockLogger = { tool: () => {} } as never;
+
+  beforeEach(async () => {
+    await fs.mkdir(TMP_ROOT, { recursive: true });
+    testDir = await fs.mkdtemp(path.join(TMP_ROOT, "vitest-tools-"));
+    relativeDir = path.relative(TMP_ROOT, testDir);
+  });
+
+  afterEach(async () => {
+    if (testDir) {
+      await fs.rm(testDir, { recursive: true, force: true });
+    }
+    testDir = "";
+    relativeDir = "";
+  });
+
+  it("deletes relative paths under tmp", async () => {
+    const relativePath = path.join(relativeDir, "to-delete.txt");
+    const absolutePath = path.join(TMP_ROOT, relativePath);
+    await fs.writeFile(absolutePath, "delete me", "utf8");
+
+    const deleteFileTool = createDeleteFileTool({ logger: mockLogger });
+    const result = await invokeTool<string>(deleteFileTool, {
+      path: relativePath,
+    });
+
+    expect(result).toContain("Deleted");
+    await expect(fs.access(absolutePath)).rejects.toThrow();
+  });
+
+  it("deletes absolute paths under tmp", async () => {
+    const absolutePath = path.join(testDir, "absolute-delete.txt");
+    await fs.writeFile(absolutePath, "delete me", "utf8");
+
+    const deleteFileTool = createDeleteFileTool({ logger: mockLogger });
+    const result = await invokeTool<string>(deleteFileTool, {
+      path: absolutePath,
+    });
+
+    expect(result).toContain("Deleted");
+    await expect(fs.access(absolutePath)).rejects.toThrow();
+  });
+
+  it("rejects path traversal attempts", async () => {
+    const deleteFileTool = createDeleteFileTool({ logger: mockLogger });
+    const result = await invokeTool<string>(deleteFileTool, {
+      path: "../outside.txt",
+    });
+    expect(result).toContain("Path traversal is not allowed.");
+  });
+
+  it("rejects symlink paths", async ({ skip }) => {
+    const realDir = path.join(testDir, "real");
+    await fs.mkdir(realDir, { recursive: true });
+    const realFile = path.join(realDir, "file.txt");
+    await fs.writeFile(realFile, "real content", "utf8");
+    const linkDir = path.join(testDir, "link");
+
+    const symlinkCreated = await tryCreateSymlink(realDir, linkDir);
+    if (!symlinkCreated) {
+      // Report as skipped (not passed) when the symlink could not be created.
+      skip();
+    }
+
+    const symlinkPath = path.join(relativeDir, "link", "file.txt");
+    const deleteFileTool = createDeleteFileTool({ logger: mockLogger });
+    const result = await invokeTool<string>(deleteFileTool, {
+      path: symlinkPath,
+    });
+    expect(result).toContain("Symlink paths are not allowed.");
+  });
+
+  it("returns error for non-existent files", async () => {
+    const deleteFileTool = createDeleteFileTool({ logger: mockLogger });
+    const result = await invokeTool<string>(deleteFileTool, {
+      path: path.join(relativeDir, "nonexistent.txt"),
+    });
+    expect(result).toContain("Path does not exist.");
+  });
+});
diff --git a/src/tools/delete-file/delete-file-tool.ts b/src/tools/delete-file/delete-file-tool.ts
new file mode 100644
index 0000000..7c10156
--- /dev/null
+++ b/src/tools/delete-file/delete-file-tool.ts
@@ -0,0 +1,36 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { tool } from "@openai/agents";
+import type { Logger } from "~clients/logger";
+import { resolveTmpPathForDelete, TMP_ROOT } from "~tools/utils/fs";
+
+export type DeleteFileToolOptions = {
+  logger: Logger;
+};
+
+export const createDeleteFileTool = ({ logger }: DeleteFileToolOptions) =>
+  tool({
+    name: "deleteFile",
+    description:
+      "Deletes a file under the repo tmp directory (path is relative to tmp).",
+    parameters: {
+      type: "object",
+      properties: {
+        path: {
+          type: "string",
+          description: "Relative path within the repo tmp directory",
+        },
+      },
+      required: ["path"],
+      additionalProperties: false,
+    },
+    execute: async ({ path: filePath }: { path: string }) => {
+      logger.tool("Deleting file", { path: filePath });
+      const targetPath = await resolveTmpPathForDelete(filePath);
+      await fs.unlink(targetPath);
+      const relativePath = path.relative(TMP_ROOT, targetPath);
+      const displayPath = path.join("tmp", relativePath);
+      logger.tool("Deleted file", { path: displayPath });
+      return `Deleted ${displayPath}`;
+    },
+  });
diff --git a/src/tools/utils/fs.ts b/src/tools/utils/fs.ts
index c023437..755173a 100644
--- a/src/tools/utils/fs.ts
+++ b/src/tools/utils/fs.ts
@@ -149,6 +149,34 @@ export const resolveTmpPathForRead = async (userPath: string) => {
   return candidatePath;
 };
 
+export const resolveTmpPathForDelete = async (userPath: string) => {
+  const trimmed = userPath.trim();
+  if (!trimmed) {
+    throw new Error("Path cannot be empty.");
+  }
+  if (PATH_TRAVERSAL.test(trimmed)) {
+    throw new Error("Path traversal is not allowed.");
+  }
+
+  await ensureTmpRoot({ create: false });
+  const candidatePath = resolveCandidatePath(trimmed);
+
+  await assertNoSymlinkComponents(TMP_ROOT, candidatePath);
+
+  const tmpRootReal = await fs.realpath(TMP_ROOT);
+  const parentReal = await fs.realpath(path.dirname(candidatePath));
+  if (!isPathInside(tmpRootReal, parentReal)) {
+    throw new Error("Resolved path escapes tmp directory.");
+  }
+
+  const fileStat = await fs.lstat(candidatePath);
+  if (!fileStat.isFile()) {
+    throw new Error("Path must point to a file.");
+  }
+
+  return candidatePath;
+};
+
 export const resolveTmpPathForList = async (userPath?: string) => {
   const trimmed = (userPath ?? "").trim();
 

From a66279ffe76a8187518c6110465ab05aa90e123f Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 14:36:23 +0200
Subject: [PATCH 07/14] refactor: update report generation to log display paths
 instead of full paths

- Modify JSON and Markdown report saving methods to log relative paths
- Introduce toDisplayPath method for consistent path formatting
---
 src/cli/agent-evals/clients/report-generator.ts | 17 ++++++++++++-----
 src/tools/read-file/read-file-tool.ts           |  7 +++++--
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/cli/agent-evals/clients/report-generator.ts b/src/cli/agent-evals/clients/report-generator.ts
index 5437cb7..2e97223 100644
--- a/src/cli/agent-evals/clients/report-generator.ts
+++ b/src/cli/agent-evals/clients/report-generator.ts
@@ -1,7 +1,7 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import type { Logger } from "~clients/logger";
-import { resolveTmpPathForWrite } from "~tools/utils/fs";
+import { resolveTmpPathForWrite, TMP_ROOT } from "~tools/utils/fs";
 
 import {
   DECIMAL_PLACES,
@@ -92,8 +92,9 @@ export class ReportGenerator {
     const fullPath = await resolveTmpPathForWrite(relativePath);
 
     await fs.writeFile(fullPath, JSON.stringify(report, null, 2), "utf8");
-    this.logger.info("JSON report saved", { path: fullPath });
-    return fullPath;
+    const displayPath = this.toDisplayPath(fullPath);
+    this.logger.info("JSON report saved", { path: displayPath });
+    return displayPath;
   }
 
   private async writeMarkdown(report: EvalReport): Promise<string> {
@@ -104,8 +105,14 @@ export class ReportGenerator {
 
     const markdown = this.formatMarkdown(report);
     await fs.writeFile(fullPath, markdown, "utf8");
-    this.logger.info("Markdown report saved", { path: fullPath });
-    return fullPath;
+    const displayPath = this.toDisplayPath(fullPath);
+    this.logger.info("Markdown report saved", { path: displayPath });
+    return displayPath;
+  }
+
+  private toDisplayPath(fullPath: string): string {
+    const relativePath = path.relative(TMP_ROOT, fullPath);
+    return path.join("tmp", relativePath);
   }
 
   private formatMarkdown(report: EvalReport): string {
diff --git a/src/tools/read-file/read-file-tool.ts b/src/tools/read-file/read-file-tool.ts
index f122d48..885b6b3 100644
--- a/src/tools/read-file/read-file-tool.ts
+++ b/src/tools/read-file/read-file-tool.ts
@@ -1,7 +1,8 @@
 import fs from "node:fs/promises";
+import path from "node:path";
 import { tool } from "@openai/agents";
 import type { Logger } from "~clients/logger";
-import { resolveTmpPathForRead } from "~tools/utils/fs";
+import { resolveTmpPathForRead, TMP_ROOT } from "~tools/utils/fs";
 
 export type ReadFileToolOptions = {
   logger: Logger;
@@ -26,7 +27,9 @@ export const createReadFileTool = ({ logger }: ReadFileToolOptions) =>
     execute: async ({ path: filePath }: { path: string }) => {
       logger.tool("Reading file", { path: filePath });
       const targetPath = await resolveTmpPathForRead(filePath);
-      logger.tool("Read file result", { targetPath });
+      const relativePath = path.relative(TMP_ROOT, targetPath);
+      const displayPath = path.join("tmp", relativePath);
+      logger.tool("Read file result", { targetPath: displayPath });
       return fs.readFile(targetPath, "utf8");
     },
   });

From ae95f75b0f10a9edff5ada24215a043019e9019e Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 14:45:21 +0200
Subject: [PATCH 08/14] feat: add deleteFile tool and update documentation for
 agent evals

---
 AGENTS.md                     |  3 +++
 README.md                     | 45 ++++++++++++++++++++++++++---------
 src/cli/agent-evals/README.md |  2 +-
 3 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index ce87b40..0471f0b 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -94,6 +94,9 @@ All file tools are sandboxed to `tmp/` using path validation (`src/tools/utils/f
 - **`listFiles`** (`src/tools/list-files/list-files-tool.ts`)
   - Lists files/dirs under `tmp/`.
   - Params: `{ path?: string }` (defaults to `tmp/` root)
+- **`deleteFile`** (`src/tools/delete-file/delete-file-tool.ts`)
+  - Deletes a file under `tmp/`.
+  - Params: `{ path: string }` (path is **relative to `tmp/`**)
 - **`runPython`** (`src/tools/run-python/run-python-tool.ts`)
   - Runs a Python script from a configured scripts directory.
   - Params: `{ scriptName: string, input: string }` (input is JSON string; pass `""` for no input)
diff --git a/README.md b/README.md
index 2a5ca6e..27639ad 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # cli-agent-sandbox
 
-A minimal TypeScript CLI sandbox for testing agent workflows and safe web scraping. This is a single-package repo built with [`@openai/agents`](https://github.com/openai/openai-agents-js), and it includes a guestbook demo, a Finnish name explorer CLI, a publication scraping pipeline with a Playwright-based scraper for JS-rendered pages, an ETF backtest CLI, and agent tools scoped to `tmp` with strong safety checks.
+A minimal TypeScript CLI sandbox for testing agent workflows and safe web scraping. This is a single-package repo built with [`@openai/agents`](https://github.com/openai/openai-agents-js), and it includes a guestbook demo, a Finnish name explorer CLI, a publication scraping pipeline with a Playwright-based scraper for JS-rendered pages, an ETF backtest CLI, an agent evals CLI, and agent tools scoped to `tmp` with strong safety checks.
 
 ## Quick Start
 
@@ -9,9 +9,10 @@ A minimal TypeScript CLI sandbox for testing agent workflows and safe web scrapi
 3. Install Playwright system deps (Chromium): `pnpm exec playwright install-deps chromium`
 4. Set `OPENAI_API_KEY` (export it or add to `.env`)
 5. Run the demo: `pnpm run:guestbook`
-6. (Optional) Explore Finnish name stats: `pnpm run:name-explorer -- --mode ai|stats`
-7. (Optional) Run publication scraping: `pnpm run:scrape-publications -- --url="https://example.com"`
-8. (Optional) Run ETF backtest: `pnpm run:etf-backtest -- --isin=IE00B5BMR087` (requires Python setup below)
+6. (Optional) Run agent evals: `pnpm run:agent-evals -- --suite=example`
+7. (Optional) Explore Finnish name stats: `pnpm run:name-explorer -- --mode ai|stats`
+8. (Optional) Run publication scraping: `pnpm run:scrape-publications -- --url="https://example.com"`
+9. (Optional) Run ETF backtest: `pnpm run:etf-backtest -- --isin=IE00B5BMR087` (requires Python setup below)
 
 ### Python Setup (for ETF backtest)
 
@@ -29,6 +30,7 @@ pip install numpy pandas torch
 | Command                        | Description                                            |
 | ------------------------------ | ------------------------------------------------------ |
 | `pnpm run:guestbook`           | Run the interactive guestbook CLI demo                 |
+| `pnpm run:agent-evals`         | Run agent evaluation suites and generate reports       |
 | `pnpm run:name-explorer`       | Explore Finnish name statistics (AI Q&A or stats)      |
 | `pnpm run:scrape-publications` | Scrape publication links and build a review page       |
 | `pnpm run:etf-backtest`        | Run ETF backtest + feature optimizer (requires Python) |
@@ -87,17 +89,29 @@ Notes:
 - `--refresh` forces a refetch; otherwise cached data is reused.
 - Python scripts live in `src/cli/etf-backtest/scripts/`.
 
+## Agent evals
+
+The `run:agent-evals` CLI executes evaluation suites for agents and writes reports under `tmp/agent-evals/` by default.
+
+Usage:
+
+```
+pnpm run:agent-evals -- --suite=example
+pnpm run:agent-evals -- --all
+```
+
 ## Tools
 
 File tools are sandboxed to the `tmp/` directory with path validation to prevent traversal and symlink attacks. The `fetchUrl` tool adds SSRF protections and HTML sanitization, and `runPython` executes whitelisted Python scripts from a configured directory.
 
-| Tool        | Location                                  | Description                                                                    |
-| ----------- | ----------------------------------------- | ------------------------------------------------------------------------------ |
-| `fetchUrl`  | `src/tools/fetch-url/fetch-url-tool.ts`   | Fetches URLs safely and returns sanitized Markdown/text                        |
-| `readFile`  | `src/tools/read-file/read-file-tool.ts`   | Reads file content from `tmp` directory                                        |
-| `writeFile` | `src/tools/write-file/write-file-tool.ts` | Writes content to files in `tmp` directory                                     |
-| `listFiles` | `src/tools/list-files/list-files-tool.ts` | Lists files and directories under `tmp`                                        |
-| `runPython` | `src/tools/run-python/run-python-tool.ts` | Runs Python scripts from a configured scripts directory (JSON stdin supported) |
+| Tool         | Location                                    | Description                                                                    |
+| ------------ | ------------------------------------------- | ------------------------------------------------------------------------------ |
+| `fetchUrl`   | `src/tools/fetch-url/fetch-url-tool.ts`     | Fetches URLs safely and returns sanitized Markdown/text                        |
+| `readFile`   | `src/tools/read-file/read-file-tool.ts`     | Reads file content from `tmp` directory                                        |
+| `writeFile`  | `src/tools/write-file/write-file-tool.ts`   | Writes content to files in `tmp` directory                                     |
+| `listFiles`  | `src/tools/list-files/list-files-tool.ts`   | Lists files and directories under `tmp`                                        |
+| `deleteFile` | `src/tools/delete-file/delete-file-tool.ts` | Deletes files under the `tmp` directory                                        |
+| `runPython`  | `src/tools/run-python/run-python-tool.ts`   | Runs Python scripts from a configured scripts directory (JSON stdin supported) |
 
 `runPython` details:
 
@@ -109,6 +123,14 @@ File tools are sandboxed to the `tmp/` directory with path validation to prevent
 ```
 src/
 ├── cli/
+│   ├── agent-evals/
+│   │   ├── main.ts            # Agent evals CLI entry point
+│   │   ├── README.md          # Agent evals CLI docs
+│   │   ├── constants.ts       # CLI constants
+│   │   ├── schemas.ts         # CLI args + suite schemas
+│   │   ├── clients/           # Suite runner + report generator
+│   │   ├── utils/             # Assertion + formatting helpers
+│   │   └── suites/            # Example evaluation suites
 │   ├── etf-backtest/
 │   │   ├── main.ts            # ETF backtest CLI entry point
 │   │   ├── README.md          # ETF backtest docs
@@ -142,6 +164,7 @@ src/
 │   ├── parse-args.ts          # Shared CLI arg parsing helper
 │   └── question-handler.ts    # Shared CLI prompt + validation helper
 ├── tools/
+│   ├── delete-file/           # Delete file tool
 │   ├── fetch-url/             # Safe fetch tool
 │   ├── list-files/            # List files tool
 │   ├── read-file/             # Read file tool
diff --git a/src/cli/agent-evals/README.md b/src/cli/agent-evals/README.md
index bd1cf4a..12c01c2 100644
--- a/src/cli/agent-evals/README.md
+++ b/src/cli/agent-evals/README.md
@@ -67,7 +67,7 @@ Add JSON files to `suites/` directory. Example structure:
 ### Suite Field Notes
 
 - `agent.model` is currently fixed to `gpt-5-mini`.
-- `agent.tools` accepts tool names from the registry: `readFile`, `writeFile`, `listFiles`.
+- `agent.tools` accepts tool names from the registry: `readFile`, `writeFile`, `listFiles`, `deleteFile`.
 - `agent.maxTurns` defaults to `5` if omitted.
 - `defaults.timeout` applies per-case when the case does not provide `timeout`.
 - `cases[].timeout` defaults to `defaults.timeout`, then `30000` (ms).

From 19e028356eeb24ea6c06410f40ec977b4d2748d1 Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 15:08:37 +0200
Subject: [PATCH 09/14] chore: remove outdated CHECKLIST.md file

---
 src/cli/agent-evals/CHECKLIST.md | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 src/cli/agent-evals/CHECKLIST.md

diff --git a/src/cli/agent-evals/CHECKLIST.md b/src/cli/agent-evals/CHECKLIST.md
deleted file mode 100644
index 986ce0d..0000000
--- a/src/cli/agent-evals/CHECKLIST.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# Post-Scaffold Checklist
-
-## Setup
-
-- [ ] Update `main.ts` with CLI logic
-- [ ] Add CLI arguments to the Zod schema
-- [ ] Update `README.md` description and flowchart
-
-## Optional Structure
-
-- [ ] Create `./clients/` for pipeline/client classes
-- [ ] Create `./types/` for Zod schemas
-- [ ] Create `./tools/` for CLI-specific agent tools
-
-## Before Committing
-
-- [ ] `pnpm typecheck`
-- [ ] `pnpm lint`
-- [ ] `pnpm format:check`
-- [ ] Add tests if behavior is testable
-- [ ] `pnpm test`
-
-## Cleanup
-
-- [ ] Delete this CHECKLIST.md when done

From c20e80cc64a7898d94a322efcdcb7fbf830337bc Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 15:14:47 +0200
Subject: [PATCH 10/14] docs: add comment to clarify purpose of sanitizeArgs
 function

---
 src/utils/parse-args.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/utils/parse-args.ts b/src/utils/parse-args.ts
index b777282..91f3855 100644
--- a/src/utils/parse-args.ts
+++ b/src/utils/parse-args.ts
@@ -8,6 +8,7 @@ export type ParseArgsOptions<T extends z.ZodType> = {
   rawArgs?: string[];
 };
 
+// Strip standalone "--" so parseArgv doesn't treat it as a literal arg after end-of-options.
 const sanitizeArgs = (rawArgs: string[]): string[] =>
   rawArgs.filter((arg) => arg !== "--");
 

From c9a6fd926e904e1af6a5155211a468a6ef84ea23 Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 17:05:27 +0200
Subject: [PATCH 11/14] feat: add path traversal checks to file assertions and
 utilities

- Implement path traversal rejection in file assertions
- Add resolveTmpPathForAccess utility to validate paths

---
 src/cli/agent-evals/utils/assertions.test.ts | 43 +++++++++++++++++++
 src/cli/agent-evals/utils/file-assertions.ts | 45 +++++++++++++-------
 src/tools/utils/fs.ts                        | 32 ++++++++++++++
 3 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/src/cli/agent-evals/utils/assertions.test.ts b/src/cli/agent-evals/utils/assertions.test.ts
index f57c86c..594c8a3 100644
--- a/src/cli/agent-evals/utils/assertions.test.ts
+++ b/src/cli/agent-evals/utils/assertions.test.ts
@@ -226,6 +226,7 @@ describe("evaluateAssertion", () => {
     const TEST_DIR = path.join(TMP_ROOT, "assertion-tests");
     const TEST_FILE = path.join(TEST_DIR, "test-file.txt");
     const TEST_JSON_FILE = path.join(TEST_DIR, "test-data.json");
+    const TRAVERSAL_PATH = "../package.json";
 
     beforeAll(async () => {
       await fs.mkdir(TEST_DIR, { recursive: true });
@@ -260,6 +261,16 @@ describe("evaluateAssertion", () => {
         expect(result.passed).toBe(false);
         expect(result.message).toContain("does not exist");
       });
+
+      it("rejects traversal paths", async () => {
+        const assertion: Assertion = {
+          type: "fileExists",
+          path: TRAVERSAL_PATH,
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(false);
+        expect(result.actual).toContain("Path traversal is not allowed");
+      });
     });
 
     describe("fileContains", () => {
@@ -387,5 +398,37 @@ describe("evaluateAssertion", () => {
         expect(result.message).toContain("Failed to evaluate");
       });
     });
+
+    describe("fileNotExists", () => {
+      it("passes when file does not exist", async () => {
+        const assertion: Assertion = {
+          type: "fileNotExists",
+          path: "assertion-tests/missing.txt",
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(true);
+        expect(result.message).toContain("does not exist");
+      });
+
+      it("fails when file exists", async () => {
+        const assertion: Assertion = {
+          type: "fileNotExists",
+          path: "assertion-tests/test-file.txt",
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(false);
+        expect(result.message).toContain("still exists");
+      });
+
+      it("rejects traversal paths", async () => {
+        const assertion: Assertion = {
+          type: "fileNotExists",
+          path: TRAVERSAL_PATH,
+        };
+        const result = await evaluateAssertion(assertion, null);
+        expect(result.passed).toBe(false);
+        expect(result.message).toContain("Failed to check file");
+      });
+    });
   });
 });
diff --git a/src/cli/agent-evals/utils/file-assertions.ts b/src/cli/agent-evals/utils/file-assertions.ts
index e5fd889..cee6b53 100644
--- a/src/cli/agent-evals/utils/file-assertions.ts
+++ b/src/cli/agent-evals/utils/file-assertions.ts
@@ -1,6 +1,5 @@
 import fs from "node:fs/promises";
-import path from "node:path";
-import { TMP_ROOT } from "~tools/utils/fs";
+import { resolveTmpPathForAccess, resolveTmpPathForRead } from "~tools/utils/fs";
 
 import type {
   AssertionResult,
@@ -16,9 +15,8 @@ import type {
 export const evaluateFileExistsAssertion = async (
   assertion: FileExistsAssertion
 ): Promise<AssertionResult> => {
-  const fullPath = path.join(TMP_ROOT, assertion.path);
-
   try {
+    const fullPath = await resolveTmpPathForAccess(assertion.path);
     await fs.access(fullPath);
     return {
       assertion,
@@ -27,12 +25,12 @@ export const evaluateFileExistsAssertion = async (
       actual: assertion.path,
       expected: "file to exist",
     };
-  } catch {
+  } catch (err) {
     return {
       assertion,
       passed: false,
       message: `File does not exist: ${assertion.path}`,
-      actual: "file not found",
+      actual: err instanceof Error ? err.message : String(err),
       expected: "file to exist",
     };
   }
@@ -44,9 +42,8 @@ export const evaluateFileExistsAssertion = async (
 export const evaluateFileContainsAssertion = async (
   assertion: FileContainsAssertion
 ): Promise<AssertionResult> => {
-  const fullPath = path.join(TMP_ROOT, assertion.path);
-
   try {
+    const fullPath = await resolveTmpPathForRead(assertion.path);
     const content = await fs.readFile(fullPath, "utf8");
     const caseSensitive = assertion.caseSensitive ?? true;
     const searchValue = caseSensitive
@@ -81,9 +78,8 @@ export const evaluateFileContainsAssertion = async (
 export const evaluateFileJsonPathAssertion = async (
   assertion: FileJsonPathAssertion
 ): Promise<AssertionResult> => {
-  const fullPath = path.join(TMP_ROOT, assertion.path);
-
   try {
+    const fullPath = await resolveTmpPathForRead(assertion.path);
     const content = await fs.readFile(fullPath, "utf8");
     const json = JSON.parse(content) as unknown;
     const value = getJsonPath(json, assertion.jsonPath);
@@ -115,9 +111,8 @@ export const evaluateFileJsonPathAssertion = async (
 export const evaluateFileNotExistsAssertion = async (
   assertion: FileNotExistsAssertion
 ): Promise<AssertionResult> => {
-  const fullPath = path.join(TMP_ROOT, assertion.path);
-
   try {
+    const fullPath = await resolveTmpPathForAccess(assertion.path);
     await fs.access(fullPath);
     return {
       assertion,
@@ -126,17 +121,35 @@ export const evaluateFileNotExistsAssertion = async (
       actual: "file exists",
       expected: "file to not exist",
     };
-  } catch {
+  } catch (err) {
+    if (isErrno(err, "ENOENT")) {
+      return {
+        assertion,
+        passed: true,
+        message: `File does not exist: ${assertion.path}`,
+        actual: "file not found",
+        expected: "file to not exist",
+      };
+    }
     return {
       assertion,
-      passed: true,
-      message: `File does not exist: ${assertion.path}`,
-      actual: "file not found",
+      passed: false,
+      message: `Failed to check file: ${err instanceof Error ? err.message : String(err)}`,
+      actual: err instanceof Error ? err.message : String(err),
       expected: "file to not exist",
     };
   }
 };
 
+const isErrno = (
+  error: unknown,
+  code: string
+): error is NodeJS.ErrnoException =>
+  typeof error === "object" &&
+  error !== null &&
+  "code" in error &&
+  (error as NodeJS.ErrnoException).code === code;
+
 /**
  * Deep equality check using JSON serialization.
  */
diff --git a/src/tools/utils/fs.ts b/src/tools/utils/fs.ts
index 755173a..8755033 100644
--- a/src/tools/utils/fs.ts
+++ b/src/tools/utils/fs.ts
@@ -149,6 +149,38 @@ export const resolveTmpPathForRead = async (userPath: string) => {
   return candidatePath;
 };
 
+export const resolveTmpPathForAccess = async (userPath: string) => {
+  const trimmed = userPath.trim();
+  if (!trimmed) {
+    throw new Error("Path cannot be empty.");
+  }
+  if (PATH_TRAVERSAL.test(trimmed)) {
+    throw new Error("Path traversal is not allowed.");
+  }
+
+  await ensureTmpRoot({ create: false });
+  const candidatePath = resolveCandidatePath(trimmed);
+
+  await assertNoSymlinkComponents(TMP_ROOT, candidatePath, {
+    allowMissing: true,
+  });
+
+  const tmpRootReal = await fs.realpath(TMP_ROOT);
+  const parentDir = path.dirname(candidatePath);
+  try {
+    const parentReal = await fs.realpath(parentDir);
+    if (!isPathInside(tmpRootReal, parentReal)) {
+      throw new Error("Resolved path escapes tmp directory.");
+    }
+  } catch (error) {
+    if (!isErrno(error, "ENOENT")) {
+      throw error;
+    }
+  }
+
+  return candidatePath;
+};
+
 export const resolveTmpPathForDelete = async (userPath: string) => {
   const trimmed = userPath.trim();
   if (!trimmed) {

From 880f072e0153d4bfa6e6a27310778ac66627a2f1 Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 17:10:58 +0200
Subject: [PATCH 12/14] refactor: update description and test case for empty
 path handling

---
 src/tools/list-files/list-files-tool.test.ts | 4 ++--
 src/tools/list-files/list-files-tool.ts      | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/tools/list-files/list-files-tool.test.ts b/src/tools/list-files/list-files-tool.test.ts
index 106a32d..15b65b8 100644
--- a/src/tools/list-files/list-files-tool.test.ts
+++ b/src/tools/list-files/list-files-tool.test.ts
@@ -52,9 +52,9 @@ describe("createListFilesTool tmp path safety", () => {
     expect(result).toContain("[file] absolute.txt");
   });
 
-  it("lists root of tmp when no path provided", async () => {
+  it("lists root of tmp when empty path provided", async () => {
     const listFilesTool = createListFilesTool({ logger: mockLogger });
-    const result = await invokeTool<string>(listFilesTool, {});
+    const result = await invokeTool<string>(listFilesTool, { path: "" });
 
     expect(result).toContain("Contents of tmp:");
     expect(result).toContain(path.basename(testDir));
diff --git a/src/tools/list-files/list-files-tool.ts b/src/tools/list-files/list-files-tool.ts
index d0a08cc..a719ab6 100644
--- a/src/tools/list-files/list-files-tool.ts
+++ b/src/tools/list-files/list-files-tool.ts
@@ -12,14 +12,14 @@ export const createListFilesTool = ({ logger }: ListFilesToolOptions) =>
   tool({
     name: "listFiles",
     description:
-      "Lists files and directories under the repo tmp directory (path is relative to tmp). If no path provided, lists root of tmp.",
+      "Lists files and directories under the repo tmp directory (path is relative to tmp). Use an empty path to list the tmp root.",
     parameters: {
       type: "object",
       properties: {
         path: {
           type: "string",
           description:
-            'Relative path within the repo tmp directory. Use empty string "" to list tmp root.',
+            "Relative path within the repo tmp directory.",
         },
       },
       required: ["path"],

From 8659764e396db0d8c3cf656d8a3d6f20744d8908 Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 17:17:37 +0200
Subject: [PATCH 13/14] feat: add lodash-es for deep equality checks in
 assertions

- Replace custom deep equality function with lodash's isEqual
- Update assertions to handle objects with different key ordering
- Add lodash-es and its type definitions to package.json and pnpm-lock.yaml

---
 package.json                                 |  2 ++
 pnpm-lock.yaml                               | 23 ++++++++++++++++++++
 src/cli/agent-evals/utils/assertions.test.ts |  9 ++++++++
 src/cli/agent-evals/utils/assertions.ts      | 13 ++++-------
 src/cli/agent-evals/utils/file-assertions.ts |  5 ++++-
 src/tools/list-files/list-files-tool.ts      |  3 +--
 6 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/package.json b/package.json
index af04036..50803be 100644
--- a/package.json
+++ b/package.json
@@ -35,6 +35,7 @@
     "@ianvs/prettier-plugin-sort-imports": "4.7.0",
     "@openai/agents": "0.3.7",
     "@types/jsdom": "27.0.0",
+    "@types/lodash-es": "4.17.12",
     "@types/node": "25.0.6",
     "@types/sanitize-html": "2.16.0",
     "@types/slug": "5.0.9",
@@ -43,6 +44,7 @@
     "eslint-plugin-import": "2.32.0",
     "jiti": "2.6.1",
     "jsdom": "27.4.0",
+    "lodash-es": "4.17.23",
     "marked": "17.0.1",
     "node-html-markdown": "2.0.0",
     "playwright": "1.57.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 9927fa0..6aa2b3a 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -23,6 +23,9 @@ importers:
       '@types/jsdom':
         specifier: 27.0.0
         version: 27.0.0
+      '@types/lodash-es':
+        specifier: 4.17.12
+        version: 4.17.12
       '@types/node':
         specifier: 25.0.6
         version: 25.0.6
@@ -47,6 +50,9 @@ importers:
       jsdom:
         specifier: 27.4.0
         version: 27.4.0
+      lodash-es:
+        specifier: 4.17.23
+        version: 4.17.23
       marked:
         specifier: 17.0.1
         version: 17.0.1
@@ -618,6 +624,12 @@ packages:
   '@types/json5@0.0.29':
     resolution: {integrity: sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ==}
 
+  '@types/lodash-es@4.17.12':
+    resolution: {integrity: sha512-0NgftHUcV4v34VhXm8QBSftKVXtbkBG3ViCjs6+eJ5a6y6Mi/jiFGPc1sC7QK+9BFhWrURE3EOggmWaSxL9OzQ==}
+
+  '@types/lodash@4.17.23':
+    resolution: {integrity: sha512-RDvF6wTulMPjrNdCoYRC8gNR880JNGT8uB+REUpC2Ns4pRqQJhGz90wh7rgdXDPpCczF3VGktDuFGVnz8zP7HA==}
+
   '@types/node@25.0.6':
     resolution: {integrity: sha512-NNu0sjyNxpoiW3YuVFfNz7mxSQ+S4X2G28uqg2s+CzoqoQjLPsWSbsFFyztIAqt2vb8kfEAsJNepMGPTxFDx3Q==}
 
@@ -1497,6 +1509,9 @@ packages:
     resolution: {integrity: sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==}
     engines: {node: '>=10'}
 
+  lodash-es@4.17.23:
+    resolution: {integrity: sha512-kVI48u3PZr38HdYz98UmfPnXl2DXrpdctLrFLCd3kOx1xUkOmpFPx7gCWWM5MPkL/fD8zb+Ph0QzjGFs4+hHWg==}
+
   lodash.merge@4.6.2:
     resolution: {integrity: sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==}
 
@@ -2604,6 +2619,12 @@ snapshots:
 
   '@types/json5@0.0.29': {}
 
+  '@types/lodash-es@4.17.12':
+    dependencies:
+      '@types/lodash': 4.17.23
+
+  '@types/lodash@4.17.23': {}
+
   '@types/node@25.0.6':
     dependencies:
       undici-types: 7.16.0
@@ -3735,6 +3756,8 @@ snapshots:
     dependencies:
       p-locate: 5.0.0
 
+  lodash-es@4.17.23: {}
+
   lodash.merge@4.6.2: {}
 
   lru-cache@11.2.4: {}
diff --git a/src/cli/agent-evals/utils/assertions.test.ts b/src/cli/agent-evals/utils/assertions.test.ts
index 594c8a3..a03e3fb 100644
--- a/src/cli/agent-evals/utils/assertions.test.ts
+++ b/src/cli/agent-evals/utils/assertions.test.ts
@@ -132,6 +132,15 @@ describe("evaluateAssertion", () => {
       expect(result.passed).toBe(true);
     });
 
+    it("passes for objects with different key ordering", async () => {
+      const assertion: Assertion = {
+        type: "equals",
+        expected: { a: 1, b: 2 },
+      };
+      const result = await evaluateAssertion(assertion, { b: 2, a: 1 });
+      expect(result.passed).toBe(true);
+    });
+
     it("fails for different objects", async () => {
       const assertion: Assertion = {
         type: "equals",
diff --git a/src/cli/agent-evals/utils/assertions.ts b/src/cli/agent-evals/utils/assertions.ts
index 5a2362b..47670fd 100644
--- a/src/cli/agent-evals/utils/assertions.ts
+++ b/src/cli/agent-evals/utils/assertions.ts
@@ -1,3 +1,5 @@
+import { isEqual } from "lodash-es";
+
 import type {
   Assertion,
   AssertionResult,
@@ -98,7 +100,7 @@ const evaluateEqualsAssertion = (
   assertion: EqualsAssertion,
   output: unknown
 ): AssertionResult => {
-  const passed = deepEquals(output, assertion.expected);
+  const passed = isEqual(output, assertion.expected);
 
   return {
     assertion,
@@ -117,7 +119,7 @@ const evaluateJsonPathAssertion = (
 ): AssertionResult => {
   try {
     const value = getJsonPath(output, assertion.path);
-    const passed = deepEquals(value, assertion.expected);
+    const passed = isEqual(value, assertion.expected);
 
     return {
       assertion,
@@ -149,13 +151,6 @@ const stringifyOutput = (output: unknown): string => {
   return JSON.stringify(output, null, 2);
 };
 
-/**
- * Deep equality check using JSON serialization.
- */
-const deepEquals = (a: unknown, b: unknown): boolean => {
-  return JSON.stringify(a) === JSON.stringify(b);
-};
-
 /**
  * Simple JSON path getter supporting dot notation.
  * Supports paths like "response.status" or "$.response.status"
diff --git a/src/cli/agent-evals/utils/file-assertions.ts b/src/cli/agent-evals/utils/file-assertions.ts
index cee6b53..bca2100 100644
--- a/src/cli/agent-evals/utils/file-assertions.ts
+++ b/src/cli/agent-evals/utils/file-assertions.ts
@@ -1,5 +1,8 @@
 import fs from "node:fs/promises";
-import { resolveTmpPathForAccess, resolveTmpPathForRead } from "~tools/utils/fs";
+import {
+  resolveTmpPathForAccess,
+  resolveTmpPathForRead,
+} from "~tools/utils/fs";
 
 import type {
   AssertionResult,
diff --git a/src/tools/list-files/list-files-tool.ts b/src/tools/list-files/list-files-tool.ts
index a719ab6..28ec137 100644
--- a/src/tools/list-files/list-files-tool.ts
+++ b/src/tools/list-files/list-files-tool.ts
@@ -18,8 +18,7 @@ export const createListFilesTool = ({ logger }: ListFilesToolOptions) =>
       properties: {
         path: {
           type: "string",
-          description:
-            "Relative path within the repo tmp directory.",
+          description: "Relative path within the repo tmp directory.",
         },
       },
       required: ["path"],

From 384b9e2d6c5d86f4fbb6094017d1a7ccb322469d Mon Sep 17 00:00:00 2001
From: Juha Kangas <42040080+valuecodes@users.noreply.github.com>
Date: Tue, 27 Jan 2026 21:20:12 +0200
Subject: [PATCH 14/14] refactor: reorganize schemas into separate types
 directory

- Move schemas from the root to a dedicated types directory
- Update import paths across multiple files to reflect new structure
---
 README.md                                     | 21 +++++++++++++------
 src/cli/agent-evals/clients/eval-runner.ts    |  2 +-
 .../agent-evals/clients/report-generator.ts   |  2 +-
 src/cli/agent-evals/clients/suite-loader.ts   |  4 ++--
 src/cli/agent-evals/main.ts                   |  4 ++--
 src/cli/agent-evals/{ => types}/schemas.ts    |  2 +-
 src/cli/agent-evals/utils/assertions.test.ts  |  2 +-
 src/cli/agent-evals/utils/assertions.ts       |  2 +-
 src/cli/agent-evals/utils/file-assertions.ts  |  2 +-
 .../etf-backtest/clients/etf-data-fetcher.ts  |  4 ++--
 .../etf-backtest/clients/learnings-manager.ts |  8 +++++--
 src/cli/etf-backtest/main.ts                  |  4 ++--
 src/cli/etf-backtest/{ => types}/schemas.ts   |  2 +-
 .../etf-backtest/utils/experiment-extract.ts  |  4 ++--
 src/cli/etf-backtest/utils/final-report.ts    |  2 +-
 .../etf-backtest/utils/learnings-formatter.ts |  2 +-
 src/cli/etf-backtest/utils/scoring.ts         |  2 +-
 src/cli/guestbook/main.ts                     |  8 ++-----
 src/cli/guestbook/types/schemas.ts            |  8 +++++++
 src/cli/name-explorer/main.ts                 |  7 ++-----
 src/cli/name-explorer/types/schemas.ts        |  8 +++++++
 src/cli/scrape-publications/main.ts           |  8 ++-----
 src/cli/scrape-publications/types/schemas.ts  |  9 ++++++++
 src/utils/parse-args.test.ts                  |  2 +-
 24 files changed, 73 insertions(+), 46 deletions(-)
 rename src/cli/agent-evals/{ => types}/schemas.ts (99%)
 rename src/cli/etf-backtest/{ => types}/schemas.ts (99%)
 create mode 100644 src/cli/guestbook/types/schemas.ts
 create mode 100644 src/cli/name-explorer/types/schemas.ts
 create mode 100644 src/cli/scrape-publications/types/schemas.ts

diff --git a/README.md b/README.md
index 27639ad..9acc8a0 100644
--- a/README.md
+++ b/README.md
@@ -127,7 +127,8 @@ src/
 │   │   ├── main.ts            # Agent evals CLI entry point
 │   │   ├── README.md          # Agent evals CLI docs
 │   │   ├── constants.ts       # CLI constants
-│   │   ├── schemas.ts         # CLI args + suite schemas
+│   │   ├── types/             # CLI schemas
+│   │   │   └── schemas.ts     # CLI args + suite schemas
 │   │   ├── clients/           # Suite runner + report generator
 │   │   ├── utils/             # Assertion + formatting helpers
 │   │   └── suites/            # Example evaluation suites
@@ -135,17 +136,24 @@ src/
 │   │   ├── main.ts            # ETF backtest CLI entry point
 │   │   ├── README.md          # ETF backtest docs
 │   │   ├── constants.ts       # CLI constants
-│   │   ├── schemas.ts         # CLI args + agent output schemas
+│   │   ├── types/             # CLI schemas
+│   │   │   └── schemas.ts     # CLI args + agent output schemas
 │   │   ├── clients/           # Data fetcher + Playwright capture
 │   │   ├── utils/             # Scoring + formatting helpers
-│   │   ├── types/             # ETF data types
 │   │   └── scripts/           # Python backtest + prediction scripts
 │   ├── guestbook/
 │   │   ├── main.ts            # Guestbook CLI entry point
-│   │   └── README.md          # Guestbook CLI docs
+│   │   ├── README.md          # Guestbook CLI docs
+│   │   └── types/             # CLI schemas
+│   │       └── schemas.ts     # Guestbook output schema
 │   ├── name-explorer/
 │   │   ├── main.ts            # Name Explorer CLI entry point
-│   │   └── README.md          # Name Explorer CLI docs
+│   │   ├── README.md          # Name Explorer CLI docs
+│   │   └── types/             # CLI schemas + data types
+│   │       ├── ai-output.ts   # Agent output schema
+│   │       ├── index.ts       # Type exports
+│   │       ├── schemas.ts     # CLI args schema
+│   │       └── stats.ts       # Statistics types
 │   └── scrape-publications/
 │       ├── main.ts            # Publication scraping CLI entry point
 │       ├── README.md          # Publication scraping docs
@@ -154,7 +162,8 @@ src/
 │       │   ├── publication-scraper.ts  # Link discovery + selector inference
 │       │   └── review-page-generator.ts # Review HTML generator
 │       └── types/
-│           └── index.ts       # Publication Zod schemas
+│           ├── index.ts       # Publication Zod schemas
+│           └── schemas.ts     # CLI args schema
 ├── clients/
 │   ├── fetch.ts               # Shared HTTP fetch + sanitization
 │   ├── logger.ts              # Shared console logger
diff --git a/src/cli/agent-evals/clients/eval-runner.ts b/src/cli/agent-evals/clients/eval-runner.ts
index 7205b24..f698660 100644
--- a/src/cli/agent-evals/clients/eval-runner.ts
+++ b/src/cli/agent-evals/clients/eval-runner.ts
@@ -15,7 +15,7 @@ import type {
   EvalSuite,
   SuiteResult,
   SuiteSummary,
-} from "../schemas";
+} from "../types/schemas";
 import { evaluateAssertion } from "../utils/assertions";
 import { createToolsFromNames } from "./tool-registry";
 
diff --git a/src/cli/agent-evals/clients/report-generator.ts b/src/cli/agent-evals/clients/report-generator.ts
index 2e97223..180bc4d 100644
--- a/src/cli/agent-evals/clients/report-generator.ts
+++ b/src/cli/agent-evals/clients/report-generator.ts
@@ -9,7 +9,7 @@ import {
   REPORTS_SUBDIR,
   STATUS_SYMBOLS,
 } from "../constants";
-import type { EvalReport, ReportSummary, SuiteResult } from "../schemas";
+import type { EvalReport, ReportSummary, SuiteResult } from "../types/schemas";
 
 export type ReportFormat = "json" | "md" | "both";
 
diff --git a/src/cli/agent-evals/clients/suite-loader.ts b/src/cli/agent-evals/clients/suite-loader.ts
index 4cde5fa..22d29e4 100644
--- a/src/cli/agent-evals/clients/suite-loader.ts
+++ b/src/cli/agent-evals/clients/suite-loader.ts
@@ -3,8 +3,8 @@ import path from "node:path";
 import type { Logger } from "~clients/logger";
 
 import { SUITE_FILE_EXTENSION, SUITES_DIR } from "../constants";
-import type { EvalSuite } from "../schemas";
-import { EvalSuiteSchema } from "../schemas";
+import type { EvalSuite } from "../types/schemas";
+import { EvalSuiteSchema } from "../types/schemas";
 
 export type SuiteLoaderConfig = {
   logger: Logger;
diff --git a/src/cli/agent-evals/main.ts b/src/cli/agent-evals/main.ts
index a5d2659..ac71fca 100644
--- a/src/cli/agent-evals/main.ts
+++ b/src/cli/agent-evals/main.ts
@@ -11,8 +11,8 @@ import { EvalRunner } from "./clients/eval-runner";
 import { ReportGenerator } from "./clients/report-generator";
 import { SuiteLoader } from "./clients/suite-loader";
 import { LINE_WIDTH, PERCENT_MULTIPLIER, ZERO } from "./constants";
-import type { SuiteResult } from "./schemas";
-import { CliArgsSchema } from "./schemas";
+import type { SuiteResult } from "./types/schemas";
+import { CliArgsSchema } from "./types/schemas";
 
 const logger = new Logger();
 
diff --git a/src/cli/agent-evals/schemas.ts b/src/cli/agent-evals/types/schemas.ts
similarity index 99%
rename from src/cli/agent-evals/schemas.ts
rename to src/cli/agent-evals/types/schemas.ts
index 501c37f..1a564fb 100644
--- a/src/cli/agent-evals/schemas.ts
+++ b/src/cli/agent-evals/types/schemas.ts
@@ -4,7 +4,7 @@ import {
   DEFAULT_OUT_PATH,
   DEFAULT_REPORT_FORMAT,
   DEFAULT_VERBOSE,
-} from "./constants";
+} from "../constants";
 
 // ============================================
 // CLI Arguments
diff --git a/src/cli/agent-evals/utils/assertions.test.ts b/src/cli/agent-evals/utils/assertions.test.ts
index a03e3fb..4e2f365 100644
--- a/src/cli/agent-evals/utils/assertions.test.ts
+++ b/src/cli/agent-evals/utils/assertions.test.ts
@@ -3,7 +3,7 @@ import path from "node:path";
 import { TMP_ROOT } from "~tools/utils/fs";
 import { afterAll, beforeAll, describe, expect, it } from "vitest";
 
-import type { Assertion } from "../schemas";
+import type { Assertion } from "../types/schemas";
 import { evaluateAssertion } from "./assertions";
 
 describe("evaluateAssertion", () => {
diff --git a/src/cli/agent-evals/utils/assertions.ts b/src/cli/agent-evals/utils/assertions.ts
index 47670fd..4cce048 100644
--- a/src/cli/agent-evals/utils/assertions.ts
+++ b/src/cli/agent-evals/utils/assertions.ts
@@ -7,7 +7,7 @@ import type {
   EqualsAssertion,
   JsonPathAssertion,
   MatchesRegexAssertion,
-} from "../schemas";
+} from "../types/schemas";
 import {
   evaluateFileContainsAssertion,
   evaluateFileExistsAssertion,
diff --git a/src/cli/agent-evals/utils/file-assertions.ts b/src/cli/agent-evals/utils/file-assertions.ts
index bca2100..c041d06 100644
--- a/src/cli/agent-evals/utils/file-assertions.ts
+++ b/src/cli/agent-evals/utils/file-assertions.ts
@@ -10,7 +10,7 @@ import type {
   FileExistsAssertion,
   FileJsonPathAssertion,
   FileNotExistsAssertion,
-} from "../schemas";
+} from "../types/schemas";
 
 /**
  * Evaluate a fileExists assertion by checking if the file exists in tmp/.
diff --git a/src/cli/etf-backtest/clients/etf-data-fetcher.ts b/src/cli/etf-backtest/clients/etf-data-fetcher.ts
index 5bc3297..4548625 100644
--- a/src/cli/etf-backtest/clients/etf-data-fetcher.ts
+++ b/src/cli/etf-backtest/clients/etf-data-fetcher.ts
@@ -14,8 +14,8 @@ import {
   getEtfApiPattern,
   JUST_ETF_BASE_URL,
 } from "../constants";
-import type { EtfDataResponse } from "../schemas";
-import { EtfDataResponseSchema, isEtfDataResponse } from "../schemas";
+import type { EtfDataResponse } from "../types/schemas";
+import { EtfDataResponseSchema, isEtfDataResponse } from "../types/schemas";
 
 export type EtfDataFetcherConfig = {
   logger: Logger;
diff --git a/src/cli/etf-backtest/clients/learnings-manager.ts b/src/cli/etf-backtest/clients/learnings-manager.ts
index cee91cd..6ed9373 100644
--- a/src/cli/etf-backtest/clients/learnings-manager.ts
+++ b/src/cli/etf-backtest/clients/learnings-manager.ts
@@ -8,8 +8,12 @@ import {
   LEARNINGS_FILENAME,
   MAX_HISTORY_ITEMS,
 } from "../constants";
-import type { ExperimentResult, IterationRecord, Learnings } from "../schemas";
-import { LearningsSchema } from "../schemas";
+import type {
+  ExperimentResult,
+  IterationRecord,
+  Learnings,
+} from "../types/schemas";
+import { LearningsSchema } from "../types/schemas";
 import { computeScore } from "../utils/scoring";
 
 export type LearningsManagerConfig = {
diff --git a/src/cli/etf-backtest/main.ts b/src/cli/etf-backtest/main.ts
index e11aca4..43d0e5f 100644
--- a/src/cli/etf-backtest/main.ts
+++ b/src/cli/etf-backtest/main.ts
@@ -30,8 +30,8 @@ import {
   TARGET_R2_NON_OVERLAPPING,
   ZERO,
 } from "./constants";
-import { AgentOutputSchema, CliArgsSchema } from "./schemas";
-import type { ExperimentResult, Learnings } from "./schemas";
+import { AgentOutputSchema, CliArgsSchema } from "./types/schemas";
+import type { ExperimentResult, Learnings } from "./types/schemas";
 import { extractLastExperimentResult } from "./utils/experiment-extract";
 import { printFinalResults } from "./utils/final-report";
 import { formatFixed, formatPercent } from "./utils/formatters";
diff --git a/src/cli/etf-backtest/schemas.ts b/src/cli/etf-backtest/types/schemas.ts
similarity index 99%
rename from src/cli/etf-backtest/schemas.ts
rename to src/cli/etf-backtest/types/schemas.ts
index 7859d09..51c746e 100644
--- a/src/cli/etf-backtest/schemas.ts
+++ b/src/cli/etf-backtest/types/schemas.ts
@@ -6,7 +6,7 @@ import {
   DEFAULT_REFRESH,
   DEFAULT_SEED,
   DEFAULT_VERBOSE,
-} from "./constants";
+} from "../constants";
 
 // ISIN validation: 2 letter country code + 10 alphanumeric characters
 const IsinSchema = z
diff --git a/src/cli/etf-backtest/utils/experiment-extract.ts b/src/cli/etf-backtest/utils/experiment-extract.ts
index 903abf2..93fc1ec 100644
--- a/src/cli/etf-backtest/utils/experiment-extract.ts
+++ b/src/cli/etf-backtest/utils/experiment-extract.ts
@@ -1,6 +1,6 @@
 import { INDEX_NOT_FOUND, JSON_SLICE_END_OFFSET, ZERO } from "../constants";
-import { ExperimentResultSchema } from "../schemas";
-import type { ExperimentResult } from "../schemas";
+import { ExperimentResultSchema } from "../types/schemas";
+import type { ExperimentResult } from "../types/schemas";
 
 const extractJsonFromStdout = (stdout: string): unknown => {
   const startIdx = stdout.indexOf("{");
diff --git a/src/cli/etf-backtest/utils/final-report.ts b/src/cli/etf-backtest/utils/final-report.ts
index 9cc76f5..9351a43 100644
--- a/src/cli/etf-backtest/utils/final-report.ts
+++ b/src/cli/etf-backtest/utils/final-report.ts
@@ -7,7 +7,7 @@ import {
   LINE_SEPARATOR,
   PREDICTION_HORIZON_MONTHS,
 } from "../constants";
-import type { ExperimentResult } from "../schemas";
+import type { ExperimentResult } from "../types/schemas";
 import { formatFixed, formatPercent } from "./formatters";
 
 export const printFinalResults = (
diff --git a/src/cli/etf-backtest/utils/learnings-formatter.ts b/src/cli/etf-backtest/utils/learnings-formatter.ts
index a800fd1..27fd9b2 100644
--- a/src/cli/etf-backtest/utils/learnings-formatter.ts
+++ b/src/cli/etf-backtest/utils/learnings-formatter.ts
@@ -1,5 +1,5 @@
 import { DECIMAL_PLACES, LEARNINGS_SUMMARY_TOP_N } from "../constants";
-import type { Learnings } from "../schemas";
+import type { Learnings } from "../types/schemas";
 import { formatFixed, formatPercent } from "./formatters";
 
 const FEATURE_PREVIEW_COUNT = 4;
diff --git a/src/cli/etf-backtest/utils/scoring.ts b/src/cli/etf-backtest/utils/scoring.ts
index 4dbd25c..4ad237f 100644
--- a/src/cli/etf-backtest/utils/scoring.ts
+++ b/src/cli/etf-backtest/utils/scoring.ts
@@ -4,7 +4,7 @@ import {
   SCORE_WEIGHTS,
   ZERO,
 } from "../constants";
-import type { ExperimentResult } from "../schemas";
+import type { ExperimentResult } from "../types/schemas";
 
 export const computeScore = (metrics: ExperimentResult["metrics"]): number => {
   // Primary: prediction accuracy on non-overlapping samples (honest assessment)
diff --git a/src/cli/guestbook/main.ts b/src/cli/guestbook/main.ts
index a4013e6..5597d1d 100644
--- a/src/cli/guestbook/main.ts
+++ b/src/cli/guestbook/main.ts
@@ -6,18 +6,14 @@ import { AgentRunner } from "~clients/agent-runner";
 import { Logger } from "~clients/logger";
 import { createReadFileTool } from "~tools/read-file/read-file-tool";
 import { createWriteFileTool } from "~tools/write-file/write-file-tool";
-import { z } from "zod";
 import { question } from "zx";
 
+import { OutputSchema } from "./types/schemas";
+
 const logger = new Logger();
 
 logger.info("Guestbook running...");
 
-const OutputSchema = z.object({
-  success: z.boolean(),
-  message: z.string(),
-});
-
 const agentRunner = new AgentRunner({
   name: "GuestbookAgent",
   model: "gpt-5-mini",
diff --git a/src/cli/guestbook/types/schemas.ts b/src/cli/guestbook/types/schemas.ts
new file mode 100644
index 0000000..a6a633f
--- /dev/null
+++ b/src/cli/guestbook/types/schemas.ts
@@ -0,0 +1,8 @@
+import { z } from "zod";
+
+export const OutputSchema = z.object({
+  success: z.boolean(),
+  message: z.string(),
+});
+
+export type Output = z.infer<typeof OutputSchema>;
diff --git a/src/cli/name-explorer/main.ts b/src/cli/name-explorer/main.ts
index 39f8247..3827248 100644
--- a/src/cli/name-explorer/main.ts
+++ b/src/cli/name-explorer/main.ts
@@ -8,7 +8,6 @@ import { AgentRunner } from "~clients/agent-runner";
 import { Logger } from "~clients/logger";
 import { parseArgs } from "~utils/parse-args";
 import { QuestionHandler } from "~utils/question-handler";
-import { z } from "zod";
 
 import { NameSuggesterPipeline } from "./clients/pipeline";
 import { StatsGenerator } from "./clients/stats-generator";
@@ -22,16 +21,14 @@ import {
   NameSuggesterOutputSchema,
   NameSuggesterOutputTypeSchema,
 } from "./types";
+import { CliArgsSchema } from "./types/schemas";
 
 const logger = new Logger();
 
 // --- Parse CLI arguments ---
 const { refetch: shouldRefetch, mode } = parseArgs({
   logger,
-  schema: z.object({
-    refetch: z.coerce.boolean().default(false),
-    mode: z.enum(["stats", "ai"]).default("ai"),
-  }),
+  schema: CliArgsSchema,
 });
 
 // --- Initialize pipeline and database ---
diff --git a/src/cli/name-explorer/types/schemas.ts b/src/cli/name-explorer/types/schemas.ts
new file mode 100644
index 0000000..0ffd2e1
--- /dev/null
+++ b/src/cli/name-explorer/types/schemas.ts
@@ -0,0 +1,8 @@
+import { z } from "zod";
+
+export const CliArgsSchema = z.object({
+  refetch: z.coerce.boolean().default(false),
+  mode: z.enum(["stats", "ai"]).default("ai"),
+});
+
+export type CliArgs = z.infer<typeof CliArgsSchema>;
diff --git a/src/cli/scrape-publications/main.ts b/src/cli/scrape-publications/main.ts
index a1630e0..aacb520 100644
--- a/src/cli/scrape-publications/main.ts
+++ b/src/cli/scrape-publications/main.ts
@@ -8,10 +8,10 @@ import path from "node:path";
 import { Logger } from "~clients/logger";
 import { parseArgs } from "~utils/parse-args";
 import slug from "slug";
-import { z } from "zod";
 import { question } from "zx";
 
 import { PublicationPipeline } from "./clients/publication-pipeline";
+import { CliArgsSchema } from "./types/schemas";
 
 const logger = new Logger({ level: "info", useColors: true });
 
@@ -24,11 +24,7 @@ const {
   filterUrl,
 } = parseArgs({
   logger,
-  schema: z.object({
-    url: z.url(),
-    refetch: z.coerce.boolean().default(false),
-    filterUrl: z.string().optional(),
-  }),
+  schema: CliArgsSchema,
 });
 
 // 2. Create slugified directory path
diff --git a/src/cli/scrape-publications/types/schemas.ts b/src/cli/scrape-publications/types/schemas.ts
new file mode 100644
index 0000000..0797752
--- /dev/null
+++ b/src/cli/scrape-publications/types/schemas.ts
@@ -0,0 +1,9 @@
+import { z } from "zod";
+
+export const CliArgsSchema = z.object({
+  url: z.url(),
+  refetch: z.coerce.boolean().default(false),
+  filterUrl: z.string().optional(),
+});
+
+export type CliArgs = z.infer<typeof CliArgsSchema>;
diff --git a/src/utils/parse-args.test.ts b/src/utils/parse-args.test.ts
index 460be98..b92137f 100644
--- a/src/utils/parse-args.test.ts
+++ b/src/utils/parse-args.test.ts
@@ -2,7 +2,7 @@ import { Logger } from "~clients/logger";
 import { parseArgs } from "~utils/parse-args";
 import { describe, expect, it } from "vitest";
 
-import { CliArgsSchema } from "../cli/agent-evals/schemas";
+import { CliArgsSchema } from "../cli/agent-evals/types/schemas";
 
 describe("parseArgs", () => {
   const logger = new Logger({