From 6ef7d0c506ef4269e73cc410efb24fce686b4dec Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 11:03:57 +0200 Subject: [PATCH 01/14] docs: update AGENTS.md to specify schemas.ts for pipeline classes --- AGENTS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index c67c81f..ce87b40 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -7,7 +7,7 @@ **Repo:** `cli-agent-sandbox` — minimal TypeScript CLI sandbox built with `@openai/agents` and tool sandboxing under `tmp/`. 1. Start at `src/cli/<cli>/main.ts` and the matching `src/cli/<cli>/README.md`. -2. Follow the pipeline classes under `src/cli/<cli>/clients/*` and schemas under `src/cli/<cli>/types/*`. +2. Follow the pipeline classes under `src/cli/<cli>/clients/*` and schemas under `src/cli/<cli>/types/schemas.ts`. 3. Reuse shared helpers: `src/utils/parse-args.ts`, `src/utils/question-handler.ts`, `src/clients/logger.ts`. 4. Keep `main.ts` focused on the basic agent flow; move non-trivial logic into `clients/` or `utils/`. 5. Keep changes minimal; add/update **Vitest** tests (`*.test.ts`) when behavior changes. @@ -117,7 +117,7 @@ All file tools are sandboxed to `tmp/` using path validation (`src/tools/utils/f - Prefer TypeScript path aliases over deep relative imports: `~tools/*`, `~clients/*`, `~utils/*`. - Use Zod schemas for CLI args and tool IO. - Keep object field names in `camelCase` (e.g., `trainSamples`), not `snake_case`. -- Keep Zod schemas in a dedicated `schemas.ts` file for each CLI (avoid inline schemas in `main.ts`). +- Keep Zod schemas in a dedicated `types/schemas.ts` file for each CLI (avoid inline schemas in `main.ts`). - Keep constants in a dedicated `constants.ts` file for each CLI. - Move hardcoded numeric values into `constants.ts` (treat numbers as configuration). - For HTTP fetching in code, prefer `Fetch` (sanitized) or `PlaywrightScraper` for JS-heavy pages. 
From 9b503be94f4161a9ff0b9c799fb305746d5b9b59 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 13:02:06 +0200 Subject: [PATCH 02/14] feat: add agent evaluation CLI with reporting and assertion framework - Implement evaluation runner to execute agent test suites - Create report generator for JSON and Markdown outputs - Add assertion evaluation utilities for various assertion types - Introduce suite loader for loading evaluation suites from JSON files - Update README and add checklist for post-scaffold tasks --- .claude/settings.json | 23 +- package.json | 1 + src/cli/agent-evals/CHECKLIST.md | 25 ++ src/cli/agent-evals/README.md | 103 +++++++++ src/cli/agent-evals/clients/eval-runner.ts | 209 +++++++++++++++++ .../agent-evals/clients/report-generator.ts | 190 ++++++++++++++++ src/cli/agent-evals/clients/suite-loader.ts | 86 +++++++ src/cli/agent-evals/constants.ts | 41 ++++ src/cli/agent-evals/main.ts | 104 +++++++++ src/cli/agent-evals/schemas.ts | 203 +++++++++++++++++ src/cli/agent-evals/suites/example.json | 79 +++++++ src/cli/agent-evals/utils/assertions.test.ts | 214 ++++++++++++++++++ src/cli/agent-evals/utils/assertions.ts | 164 ++++++++++++++ src/clients/agent-runner.ts | 5 +- 14 files changed, 1434 insertions(+), 13 deletions(-) create mode 100644 src/cli/agent-evals/CHECKLIST.md create mode 100644 src/cli/agent-evals/README.md create mode 100644 src/cli/agent-evals/clients/eval-runner.ts create mode 100644 src/cli/agent-evals/clients/report-generator.ts create mode 100644 src/cli/agent-evals/clients/suite-loader.ts create mode 100644 src/cli/agent-evals/constants.ts create mode 100644 src/cli/agent-evals/main.ts create mode 100644 src/cli/agent-evals/schemas.ts create mode 100644 src/cli/agent-evals/suites/example.json create mode 100644 src/cli/agent-evals/utils/assertions.test.ts create mode 100644 src/cli/agent-evals/utils/assertions.ts diff --git a/.claude/settings.json 
b/.claude/settings.json index e47738f..414b774 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -1,8 +1,6 @@ { "$schema": "https://json.schemastore.org/claude-code-settings.json", - "description": "Settings for Claude code agents", "permissions": { - "defaultMode": "default", "allow": [ "Bash(pnpm lint:*)", "Bash(pnpm lint:fix:*)", @@ -10,13 +8,8 @@ "Bash(pnpm build:*)", "Bash(pnpm format:*)", "Bash(pnpm format:check:*)", - "Bash(pnpm test:*)" - ], - "ask": [ - "Bash(pnpm install:*)", - "Bash(pnpm add:*)", - "Bash(pnpm remove:*)", - "Bash(git commit:*)" + "Bash(pnpm test:*)", + "Bash(tsx scripts/scaffold-cli.ts:*)" ], "deny": [ "Bash(curl:*)", @@ -29,6 +22,14 @@ "Read(**/secrets/**)", "Bash(git push:*)", "Bash(gh pr create:*)" - ] - } + ], + "ask": [ + "Bash(pnpm install:*)", + "Bash(pnpm add:*)", + "Bash(pnpm remove:*)", + "Bash(git commit:*)" + ], + "defaultMode": "default" + }, + "description": "Settings for Claude code agents" } diff --git a/package.json b/package.json index e7a9c88..af04036 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,7 @@ "run:name-explorer": "pnpm -s node:tsx -- src/cli/name-explorer/main.ts", "run:scrape-publications": "tsx src/cli/scrape-publications/main.ts", "run:etf-backtest": "tsx src/cli/etf-backtest/main.ts", + "run:agent-evals": "tsx src/cli/agent-evals/main.ts", "scaffold:cli": "tsx scripts/scaffold-cli.ts", "node:tsx": "node --disable-warning=ExperimentalWarning --import tsx", "typecheck": "tsc --noEmit", diff --git a/src/cli/agent-evals/CHECKLIST.md b/src/cli/agent-evals/CHECKLIST.md new file mode 100644 index 0000000..986ce0d --- /dev/null +++ b/src/cli/agent-evals/CHECKLIST.md @@ -0,0 +1,25 @@ +# Post-Scaffold Checklist + +## Setup + +- [ ] Update `main.ts` with CLI logic +- [ ] Add CLI arguments to the Zod schema +- [ ] Update `README.md` description and flowchart + +## Optional Structure + +- [ ] Create `./clients/` for pipeline/client classes +- [ ] Create `./types/` for Zod schemas +- [ ] Create 
`./tools/` for CLI-specific agent tools + +## Before Committing + +- [ ] `pnpm typecheck` +- [ ] `pnpm lint` +- [ ] `pnpm format:check` +- [ ] Add tests if behavior is testable +- [ ] `pnpm test` + +## Cleanup + +- [ ] Delete this CHECKLIST.md when done diff --git a/src/cli/agent-evals/README.md b/src/cli/agent-evals/README.md new file mode 100644 index 0000000..7ea7245 --- /dev/null +++ b/src/cli/agent-evals/README.md @@ -0,0 +1,103 @@ +# Agent Evals + +Run automated evaluation cases for AI agents with PASS/FAIL results and reports. + +## Run + +```bash +# Run a single suite +pnpm run:agent-evals -- --suite=example + +# Run all suites +pnpm run:agent-evals -- --all + +# With options +pnpm run:agent-evals -- --suite=example --verbose --report=both +``` + +## Arguments + +- `--suite <name>`: Run a specific suite by name (without `.json` extension) +- `--all`: Run all suites in the `suites/` directory +- `--report <format>`: Report format: `json`, `md`, or `both` (default: `json`) +- `--out <path>`: Output directory under `tmp/` (default: `agent-evals`) +- `--verbose`: Enable verbose logging with assertion details + +Either `--suite` or `--all` is required. + +## Output + +Reports are written to `tmp/agent-evals/`: +- `report-{timestamp}.json`: Machine-readable results +- `report-{timestamp}.md`: Human-readable markdown report + +Exit code is 1 if any tests fail or error. + +## Creating Evaluation Suites + +Add JSON files to the `suites/` directory. 
Example structure: + +```json +{ + "name": "my-suite", + "description": "Test suite description", + "version": "1.0.0", + "agent": { + "name": "MyTestAgent", + "model": "gpt-5-mini", + "instructions": "Agent system prompt here", + "tools": [], + "maxTurns": 3 + }, + "defaults": { + "timeout": 15000 + }, + "cases": [ + { + "id": "case-1", + "name": "Test case name", + "prompt": "User prompt to test", + "assertions": [ + { "type": "contains", "value": "expected text" } + ] + } + ] +} +``` + +## Assertion Types + +- **contains**: Check if output contains a string + ```json + { "type": "contains", "value": "text", "caseSensitive": false } + ``` + +- **matchesRegex**: Check if output matches a regex pattern + ```json + { "type": "matchesRegex", "pattern": "\\d+", "flags": "i" } + ``` + +- **equals**: Deep equality check + ```json + { "type": "equals", "expected": { "key": "value" } } + ``` + +- **jsonPath**: Extract and compare nested values + ```json + { "type": "jsonPath", "path": "$.response.status", "expected": "success" } + ``` + +## Flowchart + +```mermaid +flowchart TD + A["Start"] --> B["Parse args"] + B --> C["Load suites"] + C --> D["Run each suite"] + D --> E["Run each case"] + E --> F["Evaluate assertions"] + F --> G["Collect results"] + G --> H["Generate reports"] + H --> I["Print summary"] + I --> J["Exit"] +``` diff --git a/src/cli/agent-evals/clients/eval-runner.ts b/src/cli/agent-evals/clients/eval-runner.ts new file mode 100644 index 0000000..94f64e6 --- /dev/null +++ b/src/cli/agent-evals/clients/eval-runner.ts @@ -0,0 +1,209 @@ +import { AgentRunner } from "~clients/agent-runner"; +import type { Logger } from "~clients/logger"; + +import { + DEFAULT_CASE_TIMEOUT_MS, + DEFAULT_MAX_TURNS, + STATUS_SYMBOLS, + ZERO, +} from "../constants"; +import type { + AssertionResult, + CaseResult, + CaseStatus, + EvalCase, + EvalSuite, + SuiteResult, + SuiteSummary, +} from "../schemas"; +import { evaluateAssertion } from "../utils/assertions"; + +export type 
EvalRunnerConfig = { + logger: Logger; + verbose?: boolean; +}; + +/** + * Executes evaluation suites and collects results. + * Creates an AgentRunner for each suite based on its agent config, + * runs each case, validates outputs, and collects PASS/FAIL results. + */ +export class EvalRunner { + private logger: Logger; + private verbose: boolean; + + constructor(config: EvalRunnerConfig) { + this.logger = config.logger; + this.verbose = config.verbose ?? false; + } + + /** + * Run a single evaluation suite. + */ + async runSuite(suite: EvalSuite): Promise { + const startedAt = new Date(); + this.logger.info("Running suite", { + name: suite.name, + caseCount: suite.cases.length, + }); + + const agentRunner = this.createAgentRunner(suite); + + const caseResults: CaseResult[] = []; + let passed = ZERO; + let failed = ZERO; + let errors = ZERO; + let skipped = ZERO; + + for (const evalCase of suite.cases) { + const caseResult = await this.runCase(evalCase, agentRunner, suite); + caseResults.push(caseResult); + + switch (caseResult.status) { + case "pass": + passed++; + break; + case "fail": + failed++; + break; + case "error": + errors++; + break; + case "skip": + skipped++; + break; + } + + this.logCaseResult(caseResult); + } + + const completedAt = new Date(); + const total = suite.cases.length; + const summary: SuiteSummary = { + total, + passed, + failed, + errors, + skipped, + passRate: total > ZERO ? 
passed / total : ZERO, + }; + + return { + suiteName: suite.name, + suiteVersion: suite.version, + startedAt: startedAt.toISOString(), + completedAt: completedAt.toISOString(), + durationMs: completedAt.getTime() - startedAt.getTime(), + summary, + cases: caseResults, + }; + } + + private logCaseResult(caseResult: CaseResult): void { + const symbol = STATUS_SYMBOLS[caseResult.status]; + const message = `${symbol} ${caseResult.caseId}: ${caseResult.caseName}`; + + if (caseResult.status === "pass") { + this.logger.info(message, { durationMs: caseResult.durationMs }); + } else { + this.logger.warn(message, { + durationMs: caseResult.durationMs, + error: caseResult.error, + }); + if (this.verbose && caseResult.assertionResults.length > ZERO) { + const failedAssertions = caseResult.assertionResults.filter( + (r) => !r.passed + ); + for (const ar of failedAssertions) { + this.logger.debug(" Assertion failed", { message: ar.message }); + } + } + } + } + + /** + * Run a single evaluation case. + */ + private async runCase( + evalCase: EvalCase, + agentRunner: AgentRunner, + suite: EvalSuite + ): Promise { + const startTime = Date.now(); + const timeout = + evalCase.timeout ?? suite.defaults?.timeout ?? DEFAULT_CASE_TIMEOUT_MS; + + this.logger.debug("Running case", { id: evalCase.id, name: evalCase.name }); + + try { + const runPromise = agentRunner.run({ + prompt: evalCase.prompt, + maxTurns: suite.agent.maxTurns ?? DEFAULT_MAX_TURNS, + }); + + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => { + reject(new Error("Case timed out")); + }, timeout); + }); + + const result = await Promise.race([runPromise, timeoutPromise]); + const output: unknown = result.finalOutput; + const durationMs = Date.now() - startTime; + + const assertionResults = this.runAssertions(evalCase.assertions, output); + + const allAssertionsPassed = assertionResults.every((r) => r.passed); + const status: CaseStatus = allAssertionsPassed ? 
"pass" : "fail"; + + return { + caseId: evalCase.id, + caseName: evalCase.name, + status, + durationMs, + output, + assertionResults, + error: null, + }; + } catch (err) { + const durationMs = Date.now() - startTime; + const errorMessage = err instanceof Error ? err.message : String(err); + + return { + caseId: evalCase.id, + caseName: evalCase.name, + status: "error", + durationMs, + output: null, + assertionResults: [], + error: errorMessage, + }; + } + } + + /** + * Create an AgentRunner from suite's agent config. + * Omits outputType to get plain text responses (no structured output). + */ + private createAgentRunner(suite: EvalSuite): AgentRunner { + return new AgentRunner({ + name: suite.agent.name, + model: suite.agent.model, + tools: [], + instructions: suite.agent.instructions, + logger: this.logger, + logToolResults: this.verbose, + stateless: true, + }); + } + + /** + * Run all assertions on the output. + */ + private runAssertions( + assertions: EvalCase["assertions"], + output: unknown + ): AssertionResult[] { + return assertions.map((assertion) => evaluateAssertion(assertion, output)); + } +} diff --git a/src/cli/agent-evals/clients/report-generator.ts b/src/cli/agent-evals/clients/report-generator.ts new file mode 100644 index 0000000..ce4462e --- /dev/null +++ b/src/cli/agent-evals/clients/report-generator.ts @@ -0,0 +1,190 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import type { Logger } from "~clients/logger"; +import { resolveTmpPathForWrite } from "~tools/utils/fs"; + +import { + DECIMAL_PLACES, + PERCENT_MULTIPLIER, + STATUS_SYMBOLS, +} from "../constants"; +import type { EvalReport, ReportSummary, SuiteResult } from "../schemas"; + +export type ReportFormat = "json" | "md" | "both"; + +export type ReportGeneratorConfig = { + logger: Logger; + outputDir: string; + format: ReportFormat; +}; + +/** + * Generates evaluation reports in JSON and/or Markdown format. 
+ * Reports are written to the configured output directory under tmp/. + */ +export class ReportGenerator { + private logger: Logger; + private outputDir: string; + private format: ReportFormat; + + constructor(config: ReportGeneratorConfig) { + this.logger = config.logger; + this.outputDir = config.outputDir; + this.format = config.format; + } + + /** + * Generate and save report(s) from suite results. + * Returns the paths of saved reports. + */ + async generate(suiteResults: SuiteResult[]): Promise { + const report = this.buildReport(suiteResults); + const savedPaths: string[] = []; + + if (this.format === "json" || this.format === "both") { + const jsonPath = await this.writeJson(report); + savedPaths.push(jsonPath); + } + + if (this.format === "md" || this.format === "both") { + const mdPath = await this.writeMarkdown(report); + savedPaths.push(mdPath); + } + + return savedPaths; + } + + private buildReport(suiteResults: SuiteResult[]): EvalReport { + const totalCases = suiteResults.reduce( + (sum, s) => sum + s.summary.total, + 0 + ); + const passed = suiteResults.reduce((sum, s) => sum + s.summary.passed, 0); + const failed = suiteResults.reduce((sum, s) => sum + s.summary.failed, 0); + const errors = suiteResults.reduce((sum, s) => sum + s.summary.errors, 0); + const skipped = suiteResults.reduce((sum, s) => sum + s.summary.skipped, 0); + const durationMs = suiteResults.reduce((sum, s) => sum + s.durationMs, 0); + + const summary: ReportSummary = { + totalSuites: suiteResults.length, + totalCases, + passed, + failed, + errors, + skipped, + passRate: totalCases > 0 ? 
passed / totalCases : 0, + }; + + return { + generatedAt: new Date().toISOString(), + durationMs, + summary, + suites: suiteResults, + }; + } + + private async writeJson(report: EvalReport): Promise { + const timestamp = this.getTimestamp(); + const filename = `report-${timestamp}.json`; + const relativePath = path.join(this.outputDir, filename); + const fullPath = await resolveTmpPathForWrite(relativePath); + + await fs.writeFile(fullPath, JSON.stringify(report, null, 2), "utf8"); + this.logger.info("JSON report saved", { path: fullPath }); + return fullPath; + } + + private async writeMarkdown(report: EvalReport): Promise { + const timestamp = this.getTimestamp(); + const filename = `report-${timestamp}.md`; + const relativePath = path.join(this.outputDir, filename); + const fullPath = await resolveTmpPathForWrite(relativePath); + + const markdown = this.formatMarkdown(report); + await fs.writeFile(fullPath, markdown, "utf8"); + this.logger.info("Markdown report saved", { path: fullPath }); + return fullPath; + } + + private formatMarkdown(report: EvalReport): string { + const lines: string[] = []; + + lines.push("# Agent Evaluation Report"); + lines.push(""); + lines.push(`Generated: ${report.generatedAt}`); + lines.push(`Duration: ${report.durationMs}ms`); + lines.push(""); + + lines.push("## Summary"); + lines.push(""); + lines.push("| Metric | Value |"); + lines.push("|--------|-------|"); + lines.push(`| Total Suites | ${report.summary.totalSuites} |`); + lines.push(`| Total Cases | ${report.summary.totalCases} |`); + lines.push(`| Passed | ${report.summary.passed} |`); + lines.push(`| Failed | ${report.summary.failed} |`); + lines.push(`| Errors | ${report.summary.errors} |`); + lines.push(`| Skipped | ${report.summary.skipped} |`); + lines.push( + `| Pass Rate | ${this.formatPercent(report.summary.passRate)} |` + ); + lines.push(""); + + for (const suite of report.suites) { + lines.push(`## Suite: ${suite.suiteName}`); + lines.push(""); + 
lines.push(`Version: ${suite.suiteVersion}`); + lines.push(`Duration: ${suite.durationMs}ms`); + lines.push( + `Pass Rate: ${this.formatPercent(suite.summary.passRate)} (${suite.summary.passed}/${suite.summary.total})` + ); + lines.push(""); + + lines.push("### Cases"); + lines.push(""); + lines.push("| Status | ID | Name | Duration |"); + lines.push("|--------|-----|------|----------|"); + + for (const caseResult of suite.cases) { + const status = STATUS_SYMBOLS[caseResult.status]; + lines.push( + `| ${status} | ${caseResult.caseId} | ${caseResult.caseName} | ${caseResult.durationMs}ms |` + ); + } + lines.push(""); + + const problemCases = suite.cases.filter( + (c) => c.status === "fail" || c.status === "error" + ); + if (problemCases.length > 0) { + lines.push("### Details"); + lines.push(""); + for (const caseResult of problemCases) { + lines.push(`#### ${caseResult.caseId}: ${caseResult.caseName}`); + lines.push(""); + if (caseResult.error) { + lines.push(`**Error:** ${caseResult.error}`); + } + if (caseResult.assertionResults.length > 0) { + lines.push("**Assertion Results:**"); + for (const ar of caseResult.assertionResults) { + const icon = ar.passed ? 
"OK" : "FAIL"; + lines.push(`- [${icon}] ${ar.assertion.type}: ${ar.message}`); + } + } + lines.push(""); + } + } + } + + return lines.join("\n"); + } + + private formatPercent(value: number): string { + return `${(value * PERCENT_MULTIPLIER).toFixed(DECIMAL_PLACES.passRate)}%`; + } + + private getTimestamp(): string { + return new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19); + } +} diff --git a/src/cli/agent-evals/clients/suite-loader.ts b/src/cli/agent-evals/clients/suite-loader.ts new file mode 100644 index 0000000..4cde5fa --- /dev/null +++ b/src/cli/agent-evals/clients/suite-loader.ts @@ -0,0 +1,86 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import type { Logger } from "~clients/logger"; + +import { SUITE_FILE_EXTENSION, SUITES_DIR } from "../constants"; +import type { EvalSuite } from "../schemas"; +import { EvalSuiteSchema } from "../schemas"; + +export type SuiteLoaderConfig = { + logger: Logger; + suitesDir?: string; +}; + +/** + * Loads evaluation suite definitions from JSON files. + * Suite files are stored in the suites/ directory with .json extension. + */ +export class SuiteLoader { + private logger: Logger; + private suitesDir: string; + + constructor(config: SuiteLoaderConfig) { + this.logger = config.logger; + this.suitesDir = config.suitesDir ?? SUITES_DIR; + } + + /** + * Load a single suite by name. + * @param name Suite name (without .json extension) + */ + async load(name: string): Promise { + const filePath = path.join( + this.suitesDir, + `${name}${SUITE_FILE_EXTENSION}` + ); + this.logger.debug("Loading suite", { name, path: filePath }); + + const content = await fs.readFile(filePath, "utf8"); + const json = JSON.parse(content) as unknown; + const suite = EvalSuiteSchema.parse(json); + + this.logger.info("Suite loaded", { + name: suite.name, + caseCount: suite.cases.length, + }); + return suite; + } + + /** + * List all available suite names. 
+ */ + async listSuites(): Promise { + try { + const entries = await fs.readdir(this.suitesDir, { withFileTypes: true }); + const suiteNames = entries + .filter( + (entry) => entry.isFile() && entry.name.endsWith(SUITE_FILE_EXTENSION) + ) + .map((entry) => entry.name.replace(SUITE_FILE_EXTENSION, "")); + + this.logger.debug("Available suites", { suites: suiteNames }); + return suiteNames; + } catch (err) { + if ((err as NodeJS.ErrnoException).code === "ENOENT") { + this.logger.warn("Suites directory not found", { dir: this.suitesDir }); + return []; + } + throw err; + } + } + + /** + * Load all available suites. + */ + async loadAll(): Promise { + const names = await this.listSuites(); + const suites: EvalSuite[] = []; + + for (const name of names) { + const suite = await this.load(name); + suites.push(suite); + } + + return suites; + } +} diff --git a/src/cli/agent-evals/constants.ts b/src/cli/agent-evals/constants.ts new file mode 100644 index 0000000..6e6216b --- /dev/null +++ b/src/cli/agent-evals/constants.ts @@ -0,0 +1,41 @@ +import path from "node:path"; + +// CLI defaults +export const DEFAULT_VERBOSE = false; +export const DEFAULT_REPORT_FORMAT = "json" as const; +export const DEFAULT_OUT_PATH = "agent-evals"; + +// Paths +export const SUITES_DIR = path.join( + process.cwd(), + "src", + "cli", + "agent-evals", + "suites" +); +export const SUITE_FILE_EXTENSION = ".json"; + +// Execution defaults +export const DEFAULT_CASE_TIMEOUT_MS = 30000; +export const DEFAULT_MAX_TURNS = 5; + +// Numeric constants +export const ZERO = 0; +export const ONE = 1; +export const PERCENT_MULTIPLIER = 100; + +// Report formatting +export const DECIMAL_PLACES = { + passRate: 1, + duration: 0, +} as const; + +export const LINE_WIDTH = 60; + +// Status symbols for console output +export const STATUS_SYMBOLS = { + pass: "[PASS]", + fail: "[FAIL]", + error: "[ERROR]", + skip: "[SKIP]", +} as const; diff --git a/src/cli/agent-evals/main.ts b/src/cli/agent-evals/main.ts new file mode 
100644 index 0000000..a5d2659 --- /dev/null +++ b/src/cli/agent-evals/main.ts @@ -0,0 +1,104 @@ +// pnpm run:agent-evals + +// Run automated evaluation cases for AI agents with PASS/FAIL results and reports + +import "dotenv/config"; + +import { Logger } from "~clients/logger"; +import { parseArgs } from "~utils/parse-args"; + +import { EvalRunner } from "./clients/eval-runner"; +import { ReportGenerator } from "./clients/report-generator"; +import { SuiteLoader } from "./clients/suite-loader"; +import { LINE_WIDTH, PERCENT_MULTIPLIER, ZERO } from "./constants"; +import type { SuiteResult } from "./schemas"; +import { CliArgsSchema } from "./schemas"; + +const logger = new Logger(); + +logger.info("Agent Evals starting..."); + +const { suite, all, report, out, verbose } = parseArgs({ + logger, + schema: CliArgsSchema, +}); + +if (verbose) { + logger.debug("Verbose mode enabled"); +} + +const suiteLoader = new SuiteLoader({ logger }); +const evalRunner = new EvalRunner({ logger, verbose }); +const reportGenerator = new ReportGenerator({ + logger, + outputDir: out, + format: report, +}); + +let suitesToRun; +if (all) { + logger.info("Loading all suites..."); + suitesToRun = await suiteLoader.loadAll(); +} else if (suite) { + logger.info("Loading suite", { name: suite }); + const singleSuite = await suiteLoader.load(suite); + suitesToRun = [singleSuite]; +} else { + logger.error("Either --suite or --all is required"); + process.exit(1); +} + +if (suitesToRun.length === ZERO) { + logger.warn("No suites found to run"); + process.exit(0); +} + +logger.info("Suites to run", { count: suitesToRun.length }); + +const suiteResults: SuiteResult[] = []; +const separator = "=".repeat(LINE_WIDTH); + +for (const evalSuite of suitesToRun) { + logger.info(separator); + const result = await evalRunner.runSuite(evalSuite); + suiteResults.push(result); +} + +logger.info(separator); +logger.info("Generating reports..."); +const reportPaths = await reportGenerator.generate(suiteResults); 
+ +const totalCases = suiteResults.reduce((sum, s) => sum + s.summary.total, ZERO); +const totalPassed = suiteResults.reduce( + (sum, s) => sum + s.summary.passed, + ZERO +); +const totalFailed = suiteResults.reduce( + (sum, s) => sum + s.summary.failed, + ZERO +); +const totalErrors = suiteResults.reduce( + (sum, s) => sum + s.summary.errors, + ZERO +); +const passRate = + totalCases > ZERO ? (totalPassed / totalCases) * PERCENT_MULTIPLIER : ZERO; + +logger.info(separator); +logger.info("EVALUATION COMPLETE"); +logger.info(separator); +logger.info("Summary", { + suites: suiteResults.length, + cases: totalCases, + passed: totalPassed, + failed: totalFailed, + errors: totalErrors, + passRate: `${passRate.toFixed(1)}%`, +}); +logger.info("Reports saved", { paths: reportPaths }); + +if (totalFailed > ZERO || totalErrors > ZERO) { + process.exit(1); +} + +logger.info("Agent Evals completed."); diff --git a/src/cli/agent-evals/schemas.ts b/src/cli/agent-evals/schemas.ts new file mode 100644 index 0000000..5a12ab1 --- /dev/null +++ b/src/cli/agent-evals/schemas.ts @@ -0,0 +1,203 @@ +import { z } from "zod"; + +import { + DEFAULT_OUT_PATH, + DEFAULT_REPORT_FORMAT, + DEFAULT_VERBOSE, +} from "./constants"; + +// ============================================ +// CLI Arguments +// ============================================ + +export const CliArgsSchema = z + .object({ + suite: z.string().optional(), + all: z.coerce.boolean().default(false), + report: z.enum(["json", "md", "both"]).default(DEFAULT_REPORT_FORMAT), + out: z.string().default(DEFAULT_OUT_PATH), + verbose: z.coerce.boolean().default(DEFAULT_VERBOSE), + }) + .refine((data) => data.suite ?? 
data.all, { + message: "Either --suite or --all is required", + }); + +export type CliArgs = z.infer; + +// ============================================ +// Assertion Types +// ============================================ + +export const ContainsAssertionSchema = z.object({ + type: z.literal("contains"), + value: z.string(), + caseSensitive: z.boolean().optional(), + description: z.string().optional(), +}); + +export const MatchesRegexAssertionSchema = z.object({ + type: z.literal("matchesRegex"), + pattern: z.string(), + flags: z.string().optional(), + description: z.string().optional(), +}); + +export const EqualsAssertionSchema = z.object({ + type: z.literal("equals"), + expected: z.unknown(), + description: z.string().optional(), +}); + +export const JsonPathAssertionSchema = z.object({ + type: z.literal("jsonPath"), + path: z.string(), + expected: z.unknown(), + description: z.string().optional(), +}); + +export const AssertionSchema = z.discriminatedUnion("type", [ + ContainsAssertionSchema, + MatchesRegexAssertionSchema, + EqualsAssertionSchema, + JsonPathAssertionSchema, +]); + +export type Assertion = z.infer; +export type ContainsAssertion = z.infer; +export type MatchesRegexAssertion = z.infer; +export type EqualsAssertion = z.infer; +export type JsonPathAssertion = z.infer; + +// ============================================ +// Eval Case +// ============================================ + +export const EvalCaseSchema = z.object({ + id: z.string(), + name: z.string(), + description: z.string().optional(), + prompt: z.string(), + assertions: z.array(AssertionSchema).default([]), + timeout: z.number().optional(), + tags: z.array(z.string()).default([]), +}); + +export type EvalCase = z.infer; + +// ============================================ +// Agent Config (for suite) +// ============================================ + +export const AgentConfigSchema = z.object({ + name: z.string(), + model: z.literal("gpt-5-mini"), + instructions: z.string(), + tools: 
z.array(z.string()).default([]), + maxTurns: z.number().optional(), +}); + +export type AgentConfig = z.infer; + +// ============================================ +// Eval Suite +// ============================================ + +export const EvalSuiteSchema = z.object({ + name: z.string(), + description: z.string().optional(), + version: z.string().default("1.0.0"), + agent: AgentConfigSchema, + defaults: z + .object({ + timeout: z.number().optional(), + }) + .optional(), + cases: z.array(EvalCaseSchema).min(1), +}); + +export type EvalSuite = z.infer; + +// ============================================ +// Assertion Result +// ============================================ + +export const AssertionResultSchema = z.object({ + assertion: AssertionSchema, + passed: z.boolean(), + message: z.string(), + actual: z.unknown().optional(), + expected: z.unknown().optional(), +}); + +export type AssertionResult = z.infer; + +// ============================================ +// Case Result +// ============================================ + +export const CaseStatusSchema = z.enum(["pass", "fail", "error", "skip"]); +export type CaseStatus = z.infer; + +export const CaseResultSchema = z.object({ + caseId: z.string(), + caseName: z.string(), + status: CaseStatusSchema, + durationMs: z.number(), + output: z.unknown().nullable(), + assertionResults: z.array(AssertionResultSchema), + error: z.string().nullable(), +}); + +export type CaseResult = z.infer; + +// ============================================ +// Suite Result +// ============================================ + +export const SuiteSummarySchema = z.object({ + total: z.number(), + passed: z.number(), + failed: z.number(), + errors: z.number(), + skipped: z.number(), + passRate: z.number(), +}); + +export type SuiteSummary = z.infer; + +export const SuiteResultSchema = z.object({ + suiteName: z.string(), + suiteVersion: z.string(), + startedAt: z.string(), + completedAt: z.string(), + durationMs: z.number(), + summary: 
SuiteSummarySchema, + cases: z.array(CaseResultSchema), +}); + +export type SuiteResult = z.infer; + +// ============================================ +// Full Report (multiple suites) +// ============================================ + +export const ReportSummarySchema = z.object({ + totalSuites: z.number(), + totalCases: z.number(), + passed: z.number(), + failed: z.number(), + errors: z.number(), + skipped: z.number(), + passRate: z.number(), +}); + +export type ReportSummary = z.infer; + +export const EvalReportSchema = z.object({ + generatedAt: z.string(), + durationMs: z.number(), + summary: ReportSummarySchema, + suites: z.array(SuiteResultSchema), +}); + +export type EvalReport = z.infer; diff --git a/src/cli/agent-evals/suites/example.json b/src/cli/agent-evals/suites/example.json new file mode 100644 index 0000000..b3aee2a --- /dev/null +++ b/src/cli/agent-evals/suites/example.json @@ -0,0 +1,79 @@ +{ + "name": "example-suite", + "description": "Example evaluation suite demonstrating the eval case pattern", + "version": "1.0.0", + "agent": { + "name": "SimpleTestAgent", + "model": "gpt-5-mini", + "instructions": "You are a helpful assistant. Always respond with valid JSON in the format: {\"answer\": \"your answer here\", \"confidence\": 0.0-1.0}. Be concise and accurate.", + "tools": [], + "maxTurns": 3 + }, + "defaults": { + "timeout": 15000 + }, + "cases": [ + { + "id": "simple-math", + "name": "Simple arithmetic", + "description": "Tests basic math reasoning - 2 + 2", + "prompt": "What is 2 + 2? Respond with your answer and confidence level.", + "assertions": [ + { + "type": "contains", + "value": "4", + "description": "Response should contain the number 4" + } + ], + "tags": ["math", "basic"] + }, + { + "id": "json-format", + "name": "JSON format validation", + "description": "Tests that agent follows JSON format instructions", + "prompt": "Say hello. 
Respond with your greeting and confidence level.", + "assertions": [ + { + "type": "matchesRegex", + "pattern": "\"answer\"\\s*:", + "description": "Response should have answer field" + }, + { + "type": "matchesRegex", + "pattern": "\"confidence\"\\s*:", + "description": "Response should have confidence field" + } + ], + "tags": ["format", "basic"] + }, + { + "id": "capital-france", + "name": "Basic knowledge - capital of France", + "description": "Tests knowledge retrieval", + "prompt": "What is the capital of France? Respond with your answer and confidence level.", + "assertions": [ + { + "type": "contains", + "value": "Paris", + "caseSensitive": false, + "description": "Answer should mention Paris" + } + ], + "tags": ["knowledge", "geography"] + }, + { + "id": "larger-number", + "name": "Comparison task", + "description": "Tests basic comparison reasoning", + "prompt": "Which is larger: 100 or 50? Respond with your answer and confidence level.", + "assertions": [ + { + "type": "contains", + "value": "100", + "description": "Response should identify 100 as larger" + } + ], + "tags": ["math", "comparison"] + } + ] +} diff --git a/src/cli/agent-evals/utils/assertions.test.ts b/src/cli/agent-evals/utils/assertions.test.ts new file mode 100644 index 0000000..0396d4b --- /dev/null +++ b/src/cli/agent-evals/utils/assertions.test.ts @@ -0,0 +1,214 @@ +import { describe, expect, it } from "vitest"; + +import type { Assertion } from "../schemas"; +import { evaluateAssertion } from "./assertions"; + +describe("evaluateAssertion", () => { + describe("contains", () => { + it("passes when output contains the value", () => { + const assertion: Assertion = { + type: "contains", + value: "hello", + }; + const result = evaluateAssertion(assertion, { message: "hello world" }); + expect(result.passed).toBe(true); + expect(result.message).toContain("contains"); + }); + + it("fails when output does not contain the value", () => { + const assertion: Assertion = { + type: "contains", + 
value: "goodbye", + }; + const result = evaluateAssertion(assertion, { message: "hello world" }); + expect(result.passed).toBe(false); + expect(result.message).toContain("does not contain"); + }); + + it("is case sensitive by default", () => { + const assertion: Assertion = { + type: "contains", + value: "HELLO", + }; + const result = evaluateAssertion(assertion, "hello world"); + expect(result.passed).toBe(false); + }); + + it("respects caseSensitive: false", () => { + const assertion: Assertion = { + type: "contains", + value: "HELLO", + caseSensitive: false, + }; + const result = evaluateAssertion(assertion, "hello world"); + expect(result.passed).toBe(true); + }); + + it("works with string output", () => { + const assertion: Assertion = { + type: "contains", + value: "test", + }; + const result = evaluateAssertion(assertion, "this is a test string"); + expect(result.passed).toBe(true); + }); + }); + + describe("matchesRegex", () => { + it("passes when output matches pattern", () => { + const assertion: Assertion = { + type: "matchesRegex", + pattern: "\\d{3}-\\d{4}", + }; + const result = evaluateAssertion(assertion, "Call 555-1234"); + expect(result.passed).toBe(true); + }); + + it("fails when output does not match pattern", () => { + const assertion: Assertion = { + type: "matchesRegex", + pattern: "\\d{3}-\\d{4}", + }; + const result = evaluateAssertion(assertion, "No number here"); + expect(result.passed).toBe(false); + }); + + it("supports regex flags", () => { + const assertion: Assertion = { + type: "matchesRegex", + pattern: "hello", + flags: "i", + }; + const result = evaluateAssertion(assertion, "HELLO WORLD"); + expect(result.passed).toBe(true); + }); + + it("handles invalid regex gracefully", () => { + const assertion: Assertion = { + type: "matchesRegex", + pattern: "[invalid", + }; + const result = evaluateAssertion(assertion, "test"); + expect(result.passed).toBe(false); + expect(result.message).toContain("Invalid regex"); + }); + }); + + 
describe("equals", () => { + it("passes for equal primitive values", () => { + const assertion: Assertion = { + type: "equals", + expected: 42, + }; + const result = evaluateAssertion(assertion, 42); + expect(result.passed).toBe(true); + }); + + it("fails for different primitive values", () => { + const assertion: Assertion = { + type: "equals", + expected: 42, + }; + const result = evaluateAssertion(assertion, 43); + expect(result.passed).toBe(false); + }); + + it("passes for equal objects", () => { + const assertion: Assertion = { + type: "equals", + expected: { a: 1, b: 2 }, + }; + const result = evaluateAssertion(assertion, { a: 1, b: 2 }); + expect(result.passed).toBe(true); + }); + + it("fails for different objects", () => { + const assertion: Assertion = { + type: "equals", + expected: { a: 1, b: 2 }, + }; + const result = evaluateAssertion(assertion, { a: 1, b: 3 }); + expect(result.passed).toBe(false); + }); + + it("passes for equal strings", () => { + const assertion: Assertion = { + type: "equals", + expected: "hello", + }; + const result = evaluateAssertion(assertion, "hello"); + expect(result.passed).toBe(true); + }); + }); + + describe("jsonPath", () => { + it("extracts and compares nested values", () => { + const assertion: Assertion = { + type: "jsonPath", + path: "response.status", + expected: "success", + }; + const result = evaluateAssertion(assertion, { + response: { status: "success" }, + }); + expect(result.passed).toBe(true); + }); + + it("supports $. 
prefix in path", () => { + const assertion: Assertion = { + type: "jsonPath", + path: "$.response.status", + expected: "success", + }; + const result = evaluateAssertion(assertion, { + response: { status: "success" }, + }); + expect(result.passed).toBe(true); + }); + + it("fails when path value does not match", () => { + const assertion: Assertion = { + type: "jsonPath", + path: "response.status", + expected: "success", + }; + const result = evaluateAssertion(assertion, { + response: { status: "error" }, + }); + expect(result.passed).toBe(false); + }); + + it("fails for missing path", () => { + const assertion: Assertion = { + type: "jsonPath", + path: "missing.path", + expected: "value", + }; + const result = evaluateAssertion(assertion, { other: "data" }); + expect(result.passed).toBe(false); + expect(result.message).toContain("Failed to evaluate path"); + }); + + it("handles deeply nested paths", () => { + const assertion: Assertion = { + type: "jsonPath", + path: "a.b.c.d", + expected: 123, + }; + const result = evaluateAssertion(assertion, { + a: { b: { c: { d: 123 } } }, + }); + expect(result.passed).toBe(true); + }); + + it("compares arrays correctly", () => { + const assertion: Assertion = { + type: "jsonPath", + path: "items", + expected: [1, 2, 3], + }; + const result = evaluateAssertion(assertion, { items: [1, 2, 3] }); + expect(result.passed).toBe(true); + }); + }); +}); diff --git a/src/cli/agent-evals/utils/assertions.ts b/src/cli/agent-evals/utils/assertions.ts new file mode 100644 index 0000000..ca870e4 --- /dev/null +++ b/src/cli/agent-evals/utils/assertions.ts @@ -0,0 +1,164 @@ +import type { + Assertion, + AssertionResult, + ContainsAssertion, + EqualsAssertion, + JsonPathAssertion, + MatchesRegexAssertion, +} from "../schemas"; + +/** + * Evaluate a single assertion against the agent output. 
+ */ +export const evaluateAssertion = ( + assertion: Assertion, + output: unknown +): AssertionResult => { + switch (assertion.type) { + case "contains": + return evaluateContainsAssertion(assertion, output); + case "matchesRegex": + return evaluateMatchesRegexAssertion(assertion, output); + case "equals": + return evaluateEqualsAssertion(assertion, output); + case "jsonPath": + return evaluateJsonPathAssertion(assertion, output); + } +}; + +const evaluateContainsAssertion = ( + assertion: ContainsAssertion, + output: unknown +): AssertionResult => { + const outputStr = stringifyOutput(output); + const caseSensitive = assertion.caseSensitive ?? true; + const searchValue = caseSensitive + ? assertion.value + : assertion.value.toLowerCase(); + const searchIn = caseSensitive ? outputStr : outputStr.toLowerCase(); + const passed = searchIn.includes(searchValue); + + return { + assertion, + passed, + message: passed + ? `Output contains "${assertion.value}"` + : `Output does not contain "${assertion.value}"`, + actual: outputStr, + expected: assertion.value, + }; +}; + +const evaluateMatchesRegexAssertion = ( + assertion: MatchesRegexAssertion, + output: unknown +): AssertionResult => { + const outputStr = stringifyOutput(output); + + try { + const regex = new RegExp(assertion.pattern, assertion.flags); + const passed = regex.test(outputStr); + + return { + assertion, + passed, + message: passed + ? `Output matches pattern /${assertion.pattern}/${assertion.flags ?? ""}` + : `Output does not match pattern /${assertion.pattern}/${assertion.flags ?? ""}`, + actual: outputStr, + expected: assertion.pattern, + }; + } catch (err) { + return { + assertion, + passed: false, + message: `Invalid regex pattern: ${err instanceof Error ? 
err.message : String(err)}`, + actual: outputStr, + expected: assertion.pattern, + }; + } +}; + +const evaluateEqualsAssertion = ( + assertion: EqualsAssertion, + output: unknown +): AssertionResult => { + const passed = deepEquals(output, assertion.expected); + + return { + assertion, + passed, + message: passed + ? "Output equals expected value" + : "Output does not equal expected value", + actual: output, + expected: assertion.expected, + }; +}; + +const evaluateJsonPathAssertion = ( + assertion: JsonPathAssertion, + output: unknown +): AssertionResult => { + try { + const value = getJsonPath(output, assertion.path); + const passed = deepEquals(value, assertion.expected); + + return { + assertion, + passed, + message: passed + ? `Value at ${assertion.path} equals expected` + : `Value at ${assertion.path} does not equal expected`, + actual: value, + expected: assertion.expected, + }; + } catch (err) { + return { + assertion, + passed: false, + message: `Failed to evaluate path ${assertion.path}: ${err instanceof Error ? err.message : String(err)}`, + actual: output, + expected: assertion.expected, + }; + } +}; + +/** + * Convert output to string for text-based assertions. + */ +const stringifyOutput = (output: unknown): string => { + if (typeof output === "string") { + return output; + } + return JSON.stringify(output, null, 2); +}; + +/** + * Deep equality check using JSON serialization. + */ +const deepEquals = (a: unknown, b: unknown): boolean => { + return JSON.stringify(a) === JSON.stringify(b); +}; + +/** + * Simple JSON path getter supporting dot notation. + * Supports paths like "response.status" or "$.response.status" + */ +const getJsonPath = (obj: unknown, path: string): unknown => { + const normalizedPath = path.startsWith("$.") ? 
path.slice(2) : path; + const parts = normalizedPath.split("."); + + let current: unknown = obj; + for (const part of parts) { + if (current === null || current === undefined) { + throw new Error(`Cannot read property "${part}" of ${String(current)}`); + } + if (typeof current !== "object") { + throw new Error(`Cannot read property "${part}" of non-object`); + } + current = (current as Record)[part]; + } + + return current; +}; diff --git a/src/clients/agent-runner.ts b/src/clients/agent-runner.ts index ff8350c..46b0553 100644 --- a/src/clients/agent-runner.ts +++ b/src/clients/agent-runner.ts @@ -11,7 +11,8 @@ export type AgentRunnerConfig = { name: string; model: "gpt-5-mini"; tools: Tool[]; - outputType: ZodType; + /** Zod schema for structured output. Omit for plain text responses. */ + outputType?: ZodType; instructions: string; // Logging config @@ -65,7 +66,7 @@ export class AgentRunner { name: config.name, model: config.model, tools: config.tools, - outputType: config.outputType, + ...(config.outputType ? 
{ outputType: config.outputType } : {}), instructions: config.instructions, }); From 01afdc931d9841a94743a56df9ee6beaf42db9e8 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 13:06:46 +0200 Subject: [PATCH 03/14] test: add unit tests for parseArgs function with Zod validation --- src/utils/parse-args.test.ts | 44 ++++++++++++++++++++++++++++++++++++ src/utils/parse-args.ts | 10 ++++++-- 2 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 src/utils/parse-args.test.ts diff --git a/src/utils/parse-args.test.ts b/src/utils/parse-args.test.ts new file mode 100644 index 0000000..b5e394d --- /dev/null +++ b/src/utils/parse-args.test.ts @@ -0,0 +1,44 @@ +import { describe, expect, it } from "vitest"; +import { z } from "zod"; + +import { Logger } from "~clients/logger"; +import { parseArgs } from "~utils/parse-args"; + +const TestSchema = z + .object({ + suite: z.string().optional(), + all: z.coerce.boolean().default(false), + }) + .refine((data) => data.suite ?? 
data.all, { + message: "Either --suite or --all is required", + }); + +describe("parseArgs", () => { + const logger = new Logger({ + level: "error", + useColors: false, + useTimestamps: false, + }); + + it("parses args after a standalone double-dash separator", () => { + const args = parseArgs({ + logger, + schema: TestSchema, + rawArgs: ["--", "--suite=example"], + }); + + expect(args.suite).toBe("example"); + expect(args.all).toBe(false); + }); + + it("parses --all even when preceded by a double-dash separator", () => { + const args = parseArgs({ + logger, + schema: TestSchema, + rawArgs: ["--", "--all"], + }); + + expect(args.suite).toBeUndefined(); + expect(args.all).toBe(true); + }); +}); diff --git a/src/utils/parse-args.ts b/src/utils/parse-args.ts index f97c2c6..b777282 100644 --- a/src/utils/parse-args.ts +++ b/src/utils/parse-args.ts @@ -1,12 +1,16 @@ import type { Logger } from "~clients/logger"; import type { z } from "zod"; -import { argv } from "zx"; +import { parseArgv } from "zx"; export type ParseArgsOptions = { logger: Logger; schema: T; + rawArgs?: string[]; }; +const sanitizeArgs = (rawArgs: string[]): string[] => + rawArgs.filter((arg) => arg !== "--"); + /** * Parses and validates CLI arguments using a Zod schema. * @param options - Logger and Zod schema for validation @@ -16,9 +20,11 @@ export type ParseArgsOptions = { export const parseArgs = ({ logger, schema, + rawArgs, }: ParseArgsOptions): z.infer => { logger.debug("Parsing CLI arguments..."); - const args = schema.parse(argv); + const parsedArgs = parseArgv(sanitizeArgs(rawArgs ?? 
process.argv.slice(2))); + const args = schema.parse(parsedArgs); logger.debug("Parsed args", { args }); return args; }; From 4e16806d4d1cfc36df0480c8c224971b2520ec14 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 13:51:18 +0200 Subject: [PATCH 04/14] feat: add file assertion types and enhance evaluation framework - Implement fileExists, fileContains, and fileJsonPath assertions - Update assertion evaluation to handle async file checks - Create tool registry for managing file-related tools --- src/cli/agent-evals/README.md | 12 +- src/cli/agent-evals/clients/eval-runner.ts | 22 +- .../agent-evals/clients/report-generator.ts | 8 +- src/cli/agent-evals/clients/tool-registry.ts | 35 +++ src/cli/agent-evals/constants.ts | 1 + src/cli/agent-evals/schemas.ts | 32 +++ src/cli/agent-evals/suites/tools.json | 89 ++++++ src/cli/agent-evals/utils/assertions.test.ts | 259 +++++++++++++++--- src/cli/agent-evals/utils/assertions.ts | 16 +- src/cli/agent-evals/utils/file-assertions.ts | 138 ++++++++++ src/tools/list-files/list-files-tool.ts | 11 +- src/utils/parse-args.test.ts | 86 +++++- 12 files changed, 634 insertions(+), 75 deletions(-) create mode 100644 src/cli/agent-evals/clients/tool-registry.ts create mode 100644 src/cli/agent-evals/suites/tools.json create mode 100644 src/cli/agent-evals/utils/file-assertions.ts diff --git a/src/cli/agent-evals/README.md b/src/cli/agent-evals/README.md index 7ea7245..5c19b4a 100644 --- a/src/cli/agent-evals/README.md +++ b/src/cli/agent-evals/README.md @@ -20,14 +20,15 @@ pnpm run:agent-evals -- --suite=example --verbose --report=both - `--suite `: Run a specific suite by name (without `.json` extension) - `--all`: Run all suites in the `suites/` directory - `--report `: Report format: `json`, `md`, or `both` (default: `json`) -- `--out `: Output directory under `tmp/` (default: `agent-evals`) +- `--out `: Output base directory under `tmp/` (default: `agent-evals`) - 
`--verbose`: Enable verbose logging with assertion details Either `--suite` or `--all` is required. ## Output -Reports are written to `tmp/agent-evals/`: +Reports are written to `tmp//reports/` (default: `tmp/agent-evals/reports/`): + - `report-{timestamp}.json`: Machine-readable results - `report-{timestamp}.md`: Human-readable markdown report @@ -57,9 +58,7 @@ Add JSON files to `suites/` directory. Example structure: "id": "case-1", "name": "Test case name", "prompt": "User prompt to test", - "assertions": [ - { "type": "contains", "value": "expected text" } - ] + "assertions": [{ "type": "contains", "value": "expected text" }] } ] } @@ -68,16 +67,19 @@ Add JSON files to `suites/` directory. Example structure: ## Assertion Types - **contains**: Check if output contains a string + ```json { "type": "contains", "value": "text", "caseSensitive": false } ``` - **matchesRegex**: Check if output matches a regex pattern + ```json { "type": "matchesRegex", "pattern": "\\d+", "flags": "i" } ``` - **equals**: Deep equality check + ```json { "type": "equals", "expected": { "key": "value" } } ``` diff --git a/src/cli/agent-evals/clients/eval-runner.ts b/src/cli/agent-evals/clients/eval-runner.ts index 94f64e6..7205b24 100644 --- a/src/cli/agent-evals/clients/eval-runner.ts +++ b/src/cli/agent-evals/clients/eval-runner.ts @@ -17,6 +17,7 @@ import type { SuiteSummary, } from "../schemas"; import { evaluateAssertion } from "../utils/assertions"; +import { createToolsFromNames } from "./tool-registry"; export type EvalRunnerConfig = { logger: Logger; @@ -151,7 +152,10 @@ export class EvalRunner { const output: unknown = result.finalOutput; const durationMs = Date.now() - startTime; - const assertionResults = this.runAssertions(evalCase.assertions, output); + const assertionResults = await this.runAssertions( + evalCase.assertions, + output + ); const allAssertionsPassed = assertionResults.every((r) => r.passed); const status: CaseStatus = allAssertionsPassed ? 
"pass" : "fail"; @@ -183,13 +187,17 @@ export class EvalRunner { /** * Create an AgentRunner from suite's agent config. - * Omits outputType to get plain text responses (no structured output). + * Instantiates tools from the tool registry based on suite.agent.tools. */ private createAgentRunner(suite: EvalSuite): AgentRunner { + const tools = createToolsFromNames(suite.agent.tools, { + logger: this.logger, + }); + return new AgentRunner({ name: suite.agent.name, model: suite.agent.model, - tools: [], + tools, instructions: suite.agent.instructions, logger: this.logger, logToolResults: this.verbose, @@ -200,10 +208,12 @@ export class EvalRunner { /** * Run all assertions on the output. */ - private runAssertions( + private async runAssertions( assertions: EvalCase["assertions"], output: unknown - ): AssertionResult[] { - return assertions.map((assertion) => evaluateAssertion(assertion, output)); + ): Promise { + return Promise.all( + assertions.map((assertion) => evaluateAssertion(assertion, output)) + ); } } diff --git a/src/cli/agent-evals/clients/report-generator.ts b/src/cli/agent-evals/clients/report-generator.ts index ce4462e..5437cb7 100644 --- a/src/cli/agent-evals/clients/report-generator.ts +++ b/src/cli/agent-evals/clients/report-generator.ts @@ -6,6 +6,7 @@ import { resolveTmpPathForWrite } from "~tools/utils/fs"; import { DECIMAL_PLACES, PERCENT_MULTIPLIER, + REPORTS_SUBDIR, STATUS_SYMBOLS, } from "../constants"; import type { EvalReport, ReportSummary, SuiteResult } from "../schemas"; @@ -20,7 +21,8 @@ export type ReportGeneratorConfig = { /** * Generates evaluation reports in JSON and/or Markdown format. - * Reports are written to the configured output directory under tmp/. + * Reports are written to the configured output directory under tmp/, + * inside a dedicated reports/ subfolder. 
*/ export class ReportGenerator { private logger: Logger; @@ -86,7 +88,7 @@ export class ReportGenerator { private async writeJson(report: EvalReport): Promise { const timestamp = this.getTimestamp(); const filename = `report-${timestamp}.json`; - const relativePath = path.join(this.outputDir, filename); + const relativePath = path.join(this.outputDir, REPORTS_SUBDIR, filename); const fullPath = await resolveTmpPathForWrite(relativePath); await fs.writeFile(fullPath, JSON.stringify(report, null, 2), "utf8"); @@ -97,7 +99,7 @@ export class ReportGenerator { private async writeMarkdown(report: EvalReport): Promise { const timestamp = this.getTimestamp(); const filename = `report-${timestamp}.md`; - const relativePath = path.join(this.outputDir, filename); + const relativePath = path.join(this.outputDir, REPORTS_SUBDIR, filename); const fullPath = await resolveTmpPathForWrite(relativePath); const markdown = this.formatMarkdown(report); diff --git a/src/cli/agent-evals/clients/tool-registry.ts b/src/cli/agent-evals/clients/tool-registry.ts new file mode 100644 index 0000000..b4d0386 --- /dev/null +++ b/src/cli/agent-evals/clients/tool-registry.ts @@ -0,0 +1,35 @@ +import type { Tool } from "@openai/agents"; +import type { Logger } from "~clients/logger"; +import { createListFilesTool } from "~tools/list-files/list-files-tool"; +import { createReadFileTool } from "~tools/read-file/read-file-tool"; +import { createWriteFileTool } from "~tools/write-file/write-file-tool"; + +export type ToolFactoryConfig = { + logger: Logger; +}; + +type ToolFactory = (config: ToolFactoryConfig) => Tool; + +const toolFactories: Record = { + readFile: ({ logger }) => createReadFileTool({ logger }), + writeFile: ({ logger }) => createWriteFileTool({ logger }), + listFiles: ({ logger }) => createListFilesTool({ logger }), +}; + +/** + * Creates tool instances from an array of tool names. + * Throws if an unknown tool name is provided. 
+ */ +export const createToolsFromNames = ( + names: string[], + config: ToolFactoryConfig +): Tool[] => { + return names.map((name) => { + const factory = toolFactories[name]; + if (!factory) { + const available = Object.keys(toolFactories).join(", "); + throw new Error(`Unknown tool: ${name}. Available: ${available}`); + } + return factory(config); + }); +}; diff --git a/src/cli/agent-evals/constants.ts b/src/cli/agent-evals/constants.ts index 6e6216b..6a92493 100644 --- a/src/cli/agent-evals/constants.ts +++ b/src/cli/agent-evals/constants.ts @@ -4,6 +4,7 @@ import path from "node:path"; export const DEFAULT_VERBOSE = false; export const DEFAULT_REPORT_FORMAT = "json" as const; export const DEFAULT_OUT_PATH = "agent-evals"; +export const REPORTS_SUBDIR = "reports"; // Paths export const SUITES_DIR = path.join( diff --git a/src/cli/agent-evals/schemas.ts b/src/cli/agent-evals/schemas.ts index 5a12ab1..9b56a40 100644 --- a/src/cli/agent-evals/schemas.ts +++ b/src/cli/agent-evals/schemas.ts @@ -55,11 +55,40 @@ export const JsonPathAssertionSchema = z.object({ description: z.string().optional(), }); +// ============================================ +// File Assertion Types (for verifying tool side effects) +// ============================================ + +export const FileExistsAssertionSchema = z.object({ + type: z.literal("fileExists"), + path: z.string(), + description: z.string().optional(), +}); + +export const FileContainsAssertionSchema = z.object({ + type: z.literal("fileContains"), + path: z.string(), + value: z.string(), + caseSensitive: z.boolean().optional(), + description: z.string().optional(), +}); + +export const FileJsonPathAssertionSchema = z.object({ + type: z.literal("fileJsonPath"), + path: z.string(), + jsonPath: z.string(), + expected: z.unknown(), + description: z.string().optional(), +}); + export const AssertionSchema = z.discriminatedUnion("type", [ ContainsAssertionSchema, MatchesRegexAssertionSchema, EqualsAssertionSchema, 
JsonPathAssertionSchema, + FileExistsAssertionSchema, + FileContainsAssertionSchema, + FileJsonPathAssertionSchema, ]); export type Assertion = z.infer; @@ -67,6 +96,9 @@ export type ContainsAssertion = z.infer; export type MatchesRegexAssertion = z.infer; export type EqualsAssertion = z.infer; export type JsonPathAssertion = z.infer; +export type FileExistsAssertion = z.infer; +export type FileContainsAssertion = z.infer; +export type FileJsonPathAssertion = z.infer; // ============================================ // Eval Case diff --git a/src/cli/agent-evals/suites/tools.json b/src/cli/agent-evals/suites/tools.json new file mode 100644 index 0000000..0a765b4 --- /dev/null +++ b/src/cli/agent-evals/suites/tools.json @@ -0,0 +1,89 @@ +{ + "name": "tools-suite", + "description": "Tests shared agent tools (readFile, writeFile, listFiles)", + "version": "1.0.0", + "agent": { + "name": "ToolTestAgent", + "model": "gpt-5-mini", + "instructions": "You are an assistant that tests file tools. Use the tools provided to complete tasks. 
After using a tool, report results concisely.", + "tools": ["readFile", "writeFile", "listFiles"], + "maxTurns": 3 + }, + "defaults": { + "timeout": 20000 + }, + "cases": [ + { + "id": "write-file", + "name": "writeFile creates a file", + "prompt": "Write the text 'Hello World' to a file called 'agent-evals/tool-test-output.txt'", + "assertions": [ + { + "type": "fileExists", + "path": "agent-evals/tool-test-output.txt", + "description": "File should be created" + }, + { + "type": "fileContains", + "path": "agent-evals/tool-test-output.txt", + "value": "Hello World", + "description": "File should contain the written text" + } + ], + "tags": ["writeFile"] + }, + { + "id": "read-file", + "name": "readFile reads file content", + "prompt": "Read the file 'agent-evals/tool-test-output.txt' and tell me its contents", + "assertions": [ + { + "type": "contains", + "value": "Hello World", + "description": "Agent response should include the file contents" + } + ], + "tags": ["readFile"] + }, + { + "id": "list-files", + "name": "listFiles shows directory contents", + "prompt": "List the files in the tmp/agent-evals directory", + "assertions": [ + { + "type": "contains", + "value": "tool-test-output.txt", + "description": "Agent response should include the previously created file" + } + ], + "tags": ["listFiles"] + }, + { + "id": "write-json", + "name": "writeFile with JSON content", + "prompt": "Write a JSON file called 'agent-evals/tool-test-data.json' with this exact content: {\"name\": \"test\", \"value\": 42}", + "assertions": [ + { + "type": "fileExists", + "path": "agent-evals/tool-test-data.json", + "description": "JSON file should be created" + }, + { + "type": "fileJsonPath", + "path": "agent-evals/tool-test-data.json", + "jsonPath": "$.name", + "expected": "test", + "description": "JSON name field should match" + }, + { + "type": "fileJsonPath", + "path": "agent-evals/tool-test-data.json", + "jsonPath": "$.value", + "expected": 42, + "description": "JSON value field 
should match" + } + ], + "tags": ["writeFile", "json"] + } + ] +} diff --git a/src/cli/agent-evals/utils/assertions.test.ts b/src/cli/agent-evals/utils/assertions.test.ts index 0396d4b..f57c86c 100644 --- a/src/cli/agent-evals/utils/assertions.test.ts +++ b/src/cli/agent-evals/utils/assertions.test.ts @@ -1,214 +1,391 @@ -import { describe, expect, it } from "vitest"; +import fs from "node:fs/promises"; +import path from "node:path"; +import { TMP_ROOT } from "~tools/utils/fs"; +import { afterAll, beforeAll, describe, expect, it } from "vitest"; import type { Assertion } from "../schemas"; import { evaluateAssertion } from "./assertions"; describe("evaluateAssertion", () => { describe("contains", () => { - it("passes when output contains the value", () => { + it("passes when output contains the value", async () => { const assertion: Assertion = { type: "contains", value: "hello", }; - const result = evaluateAssertion(assertion, { message: "hello world" }); + const result = await evaluateAssertion(assertion, { + message: "hello world", + }); expect(result.passed).toBe(true); expect(result.message).toContain("contains"); }); - it("fails when output does not contain the value", () => { + it("fails when output does not contain the value", async () => { const assertion: Assertion = { type: "contains", value: "goodbye", }; - const result = evaluateAssertion(assertion, { message: "hello world" }); + const result = await evaluateAssertion(assertion, { + message: "hello world", + }); expect(result.passed).toBe(false); expect(result.message).toContain("does not contain"); }); - it("is case sensitive by default", () => { + it("is case sensitive by default", async () => { const assertion: Assertion = { type: "contains", value: "HELLO", }; - const result = evaluateAssertion(assertion, "hello world"); + const result = await evaluateAssertion(assertion, "hello world"); expect(result.passed).toBe(false); }); - it("respects caseSensitive: false", () => { + it("respects 
caseSensitive: false", async () => { const assertion: Assertion = { type: "contains", value: "HELLO", caseSensitive: false, }; - const result = evaluateAssertion(assertion, "hello world"); + const result = await evaluateAssertion(assertion, "hello world"); expect(result.passed).toBe(true); }); - it("works with string output", () => { + it("works with string output", async () => { const assertion: Assertion = { type: "contains", value: "test", }; - const result = evaluateAssertion(assertion, "this is a test string"); + const result = await evaluateAssertion( + assertion, + "this is a test string" + ); expect(result.passed).toBe(true); }); }); describe("matchesRegex", () => { - it("passes when output matches pattern", () => { + it("passes when output matches pattern", async () => { const assertion: Assertion = { type: "matchesRegex", pattern: "\\d{3}-\\d{4}", }; - const result = evaluateAssertion(assertion, "Call 555-1234"); + const result = await evaluateAssertion(assertion, "Call 555-1234"); expect(result.passed).toBe(true); }); - it("fails when output does not match pattern", () => { + it("fails when output does not match pattern", async () => { const assertion: Assertion = { type: "matchesRegex", pattern: "\\d{3}-\\d{4}", }; - const result = evaluateAssertion(assertion, "No number here"); + const result = await evaluateAssertion(assertion, "No number here"); expect(result.passed).toBe(false); }); - it("supports regex flags", () => { + it("supports regex flags", async () => { const assertion: Assertion = { type: "matchesRegex", pattern: "hello", flags: "i", }; - const result = evaluateAssertion(assertion, "HELLO WORLD"); + const result = await evaluateAssertion(assertion, "HELLO WORLD"); expect(result.passed).toBe(true); }); - it("handles invalid regex gracefully", () => { + it("handles invalid regex gracefully", async () => { const assertion: Assertion = { type: "matchesRegex", pattern: "[invalid", }; - const result = evaluateAssertion(assertion, "test"); + const 
result = await evaluateAssertion(assertion, "test"); expect(result.passed).toBe(false); expect(result.message).toContain("Invalid regex"); }); }); describe("equals", () => { - it("passes for equal primitive values", () => { + it("passes for equal primitive values", async () => { const assertion: Assertion = { type: "equals", expected: 42, }; - const result = evaluateAssertion(assertion, 42); + const result = await evaluateAssertion(assertion, 42); expect(result.passed).toBe(true); }); - it("fails for different primitive values", () => { + it("fails for different primitive values", async () => { const assertion: Assertion = { type: "equals", expected: 42, }; - const result = evaluateAssertion(assertion, 43); + const result = await evaluateAssertion(assertion, 43); expect(result.passed).toBe(false); }); - it("passes for equal objects", () => { + it("passes for equal objects", async () => { const assertion: Assertion = { type: "equals", expected: { a: 1, b: 2 }, }; - const result = evaluateAssertion(assertion, { a: 1, b: 2 }); + const result = await evaluateAssertion(assertion, { a: 1, b: 2 }); expect(result.passed).toBe(true); }); - it("fails for different objects", () => { + it("fails for different objects", async () => { const assertion: Assertion = { type: "equals", expected: { a: 1, b: 2 }, }; - const result = evaluateAssertion(assertion, { a: 1, b: 3 }); + const result = await evaluateAssertion(assertion, { a: 1, b: 3 }); expect(result.passed).toBe(false); }); - it("passes for equal strings", () => { + it("passes for equal strings", async () => { const assertion: Assertion = { type: "equals", expected: "hello", }; - const result = evaluateAssertion(assertion, "hello"); + const result = await evaluateAssertion(assertion, "hello"); expect(result.passed).toBe(true); }); }); describe("jsonPath", () => { - it("extracts and compares nested values", () => { + it("extracts and compares nested values", async () => { const assertion: Assertion = { type: "jsonPath", path: 
"response.status", expected: "success", }; - const result = evaluateAssertion(assertion, { + const result = await evaluateAssertion(assertion, { response: { status: "success" }, }); expect(result.passed).toBe(true); }); - it("supports $. prefix in path", () => { + it("supports $. prefix in path", async () => { const assertion: Assertion = { type: "jsonPath", path: "$.response.status", expected: "success", }; - const result = evaluateAssertion(assertion, { + const result = await evaluateAssertion(assertion, { response: { status: "success" }, }); expect(result.passed).toBe(true); }); - it("fails when path value does not match", () => { + it("fails when path value does not match", async () => { const assertion: Assertion = { type: "jsonPath", path: "response.status", expected: "success", }; - const result = evaluateAssertion(assertion, { + const result = await evaluateAssertion(assertion, { response: { status: "error" }, }); expect(result.passed).toBe(false); }); - it("fails for missing path", () => { + it("fails for missing path", async () => { const assertion: Assertion = { type: "jsonPath", path: "missing.path", expected: "value", }; - const result = evaluateAssertion(assertion, { other: "data" }); + const result = await evaluateAssertion(assertion, { other: "data" }); expect(result.passed).toBe(false); expect(result.message).toContain("Failed to evaluate path"); }); - it("handles deeply nested paths", () => { + it("handles deeply nested paths", async () => { const assertion: Assertion = { type: "jsonPath", path: "a.b.c.d", expected: 123, }; - const result = evaluateAssertion(assertion, { + const result = await evaluateAssertion(assertion, { a: { b: { c: { d: 123 } } }, }); expect(result.passed).toBe(true); }); - it("compares arrays correctly", () => { + it("compares arrays correctly", async () => { const assertion: Assertion = { type: "jsonPath", path: "items", expected: [1, 2, 3], }; - const result = evaluateAssertion(assertion, { items: [1, 2, 3] }); + const 
result = await evaluateAssertion(assertion, { items: [1, 2, 3] }); expect(result.passed).toBe(true); }); }); + + describe("file assertions", () => { + const TEST_DIR = path.join(TMP_ROOT, "assertion-tests"); + const TEST_FILE = path.join(TEST_DIR, "test-file.txt"); + const TEST_JSON_FILE = path.join(TEST_DIR, "test-data.json"); + + beforeAll(async () => { + await fs.mkdir(TEST_DIR, { recursive: true }); + await fs.writeFile(TEST_FILE, "Hello World\nThis is test content."); + await fs.writeFile( + TEST_JSON_FILE, + JSON.stringify({ name: "test", value: 42, nested: { key: "value" } }) + ); + }); + + afterAll(async () => { + await fs.rm(TEST_DIR, { recursive: true, force: true }); + }); + + describe("fileExists", () => { + it("passes when file exists", async () => { + const assertion: Assertion = { + type: "fileExists", + path: "assertion-tests/test-file.txt", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + expect(result.message).toContain("exists"); + }); + + it("fails when file does not exist", async () => { + const assertion: Assertion = { + type: "fileExists", + path: "assertion-tests/nonexistent.txt", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("does not exist"); + }); + }); + + describe("fileContains", () => { + it("passes when file contains the value", async () => { + const assertion: Assertion = { + type: "fileContains", + path: "assertion-tests/test-file.txt", + value: "Hello World", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + expect(result.message).toContain("contains"); + }); + + it("fails when file does not contain the value", async () => { + const assertion: Assertion = { + type: "fileContains", + path: "assertion-tests/test-file.txt", + value: "Goodbye", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); 
+ expect(result.message).toContain("does not contain"); + }); + + it("is case sensitive by default", async () => { + const assertion: Assertion = { + type: "fileContains", + path: "assertion-tests/test-file.txt", + value: "HELLO WORLD", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + }); + + it("respects caseSensitive: false", async () => { + const assertion: Assertion = { + type: "fileContains", + path: "assertion-tests/test-file.txt", + value: "HELLO WORLD", + caseSensitive: false, + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + }); + + it("fails gracefully when file does not exist", async () => { + const assertion: Assertion = { + type: "fileContains", + path: "assertion-tests/nonexistent.txt", + value: "test", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("Failed to read file"); + }); + }); + + describe("fileJsonPath", () => { + it("extracts and compares JSON values", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/test-data.json", + jsonPath: "name", + expected: "test", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + }); + + it("supports $. 
prefix in jsonPath", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/test-data.json", + jsonPath: "$.value", + expected: 42, + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + }); + + it("handles nested paths", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/test-data.json", + jsonPath: "nested.key", + expected: "value", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + }); + + it("fails when value does not match", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/test-data.json", + jsonPath: "value", + expected: 100, + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + }); + + it("fails gracefully for missing path", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/test-data.json", + jsonPath: "missing.path", + expected: "value", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("Failed to evaluate"); + }); + + it("fails gracefully when file does not exist", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/nonexistent.json", + jsonPath: "key", + expected: "value", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("Failed to evaluate"); + }); + }); + }); }); diff --git a/src/cli/agent-evals/utils/assertions.ts b/src/cli/agent-evals/utils/assertions.ts index ca870e4..bdc50d1 100644 --- a/src/cli/agent-evals/utils/assertions.ts +++ b/src/cli/agent-evals/utils/assertions.ts @@ -6,14 +6,20 @@ import type { JsonPathAssertion, MatchesRegexAssertion, } from "../schemas"; +import { + evaluateFileContainsAssertion, + 
evaluateFileExistsAssertion, + evaluateFileJsonPathAssertion, +} from "./file-assertions"; /** * Evaluate a single assertion against the agent output. + * File assertions are async (require filesystem access). */ -export const evaluateAssertion = ( +export const evaluateAssertion = async ( assertion: Assertion, output: unknown -): AssertionResult => { +): Promise => { switch (assertion.type) { case "contains": return evaluateContainsAssertion(assertion, output); @@ -23,6 +29,12 @@ export const evaluateAssertion = ( return evaluateEqualsAssertion(assertion, output); case "jsonPath": return evaluateJsonPathAssertion(assertion, output); + case "fileExists": + return evaluateFileExistsAssertion(assertion); + case "fileContains": + return evaluateFileContainsAssertion(assertion); + case "fileJsonPath": + return evaluateFileJsonPathAssertion(assertion); } }; diff --git a/src/cli/agent-evals/utils/file-assertions.ts b/src/cli/agent-evals/utils/file-assertions.ts new file mode 100644 index 0000000..be8f360 --- /dev/null +++ b/src/cli/agent-evals/utils/file-assertions.ts @@ -0,0 +1,138 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { TMP_ROOT } from "~tools/utils/fs"; + +import type { + AssertionResult, + FileContainsAssertion, + FileExistsAssertion, + FileJsonPathAssertion, +} from "../schemas"; + +/** + * Evaluate a fileExists assertion by checking if the file exists in tmp/. 
+ */ +export const evaluateFileExistsAssertion = async ( + assertion: FileExistsAssertion +): Promise => { + const fullPath = path.join(TMP_ROOT, assertion.path); + + try { + await fs.access(fullPath); + return { + assertion, + passed: true, + message: `File exists: ${assertion.path}`, + actual: assertion.path, + expected: "file to exist", + }; + } catch { + return { + assertion, + passed: false, + message: `File does not exist: ${assertion.path}`, + actual: "file not found", + expected: "file to exist", + }; + } +}; + +/** + * Evaluate a fileContains assertion by reading the file and checking for a substring. + */ +export const evaluateFileContainsAssertion = async ( + assertion: FileContainsAssertion +): Promise => { + const fullPath = path.join(TMP_ROOT, assertion.path); + + try { + const content = await fs.readFile(fullPath, "utf8"); + const caseSensitive = assertion.caseSensitive ?? true; + const searchValue = caseSensitive + ? assertion.value + : assertion.value.toLowerCase(); + const searchIn = caseSensitive ? content : content.toLowerCase(); + const passed = searchIn.includes(searchValue); + + return { + assertion, + passed, + message: passed + ? `File contains "${assertion.value}"` + : `File does not contain "${assertion.value}"`, + actual: content.length > 500 ? `${content.slice(0, 500)}...` : content, + expected: assertion.value, + }; + } catch (err) { + return { + assertion, + passed: false, + message: `Failed to read file: ${err instanceof Error ? err.message : String(err)}`, + actual: "file read error", + expected: assertion.value, + }; + } +}; + +/** + * Evaluate a fileJsonPath assertion by reading a JSON file and checking a path. 
+ */ +export const evaluateFileJsonPathAssertion = async ( + assertion: FileJsonPathAssertion +): Promise => { + const fullPath = path.join(TMP_ROOT, assertion.path); + + try { + const content = await fs.readFile(fullPath, "utf8"); + const json = JSON.parse(content) as unknown; + const value = getJsonPath(json, assertion.jsonPath); + const passed = deepEquals(value, assertion.expected); + + return { + assertion, + passed, + message: passed + ? `Value at ${assertion.jsonPath} equals expected` + : `Value at ${assertion.jsonPath} does not equal expected`, + actual: value, + expected: assertion.expected, + }; + } catch (err) { + return { + assertion, + passed: false, + message: `Failed to evaluate: ${err instanceof Error ? err.message : String(err)}`, + actual: "evaluation error", + expected: assertion.expected, + }; + } +}; + +/** + * Deep equality check using JSON serialization. + */ +const deepEquals = (a: unknown, b: unknown): boolean => { + return JSON.stringify(a) === JSON.stringify(b); +}; + +/** + * Simple JSON path getter supporting dot notation. + * Supports paths like "name" or "$.response.status" + */ +const getJsonPath = (obj: unknown, pathStr: string): unknown => { + const normalizedPath = pathStr.startsWith("$.") ? 
pathStr.slice(2) : pathStr; + const parts = normalizedPath.split("."); + + let current: unknown = obj; + for (const part of parts) { + if (current === null || current === undefined) { + throw new Error(`Cannot read property "${part}" of ${String(current)}`); + } + if (typeof current !== "object") { + throw new Error(`Cannot read property "${part}" of non-object`); + } + current = (current as Record)[part]; + } + + return current; +}; diff --git a/src/tools/list-files/list-files-tool.ts b/src/tools/list-files/list-files-tool.ts index e11bddf..d0a08cc 100644 --- a/src/tools/list-files/list-files-tool.ts +++ b/src/tools/list-files/list-files-tool.ts @@ -19,15 +19,16 @@ export const createListFilesTool = ({ logger }: ListFilesToolOptions) => path: { type: "string", description: - "Relative path within the repo tmp directory (optional, defaults to tmp root)", + 'Relative path within the repo tmp directory. Use empty string "" to list tmp root.', }, }, - required: [], + required: ["path"], additionalProperties: false, }, - execute: async ({ path: dirPath }: { path?: string }) => { - logger.tool("Listing files", { path: dirPath ?? "tmp root" }); - const targetPath = await resolveTmpPathForList(dirPath); + execute: async ({ path: dirPath }: { path: string }) => { + const effectivePath = dirPath || undefined; + logger.tool("Listing files", { path: effectivePath ?? 
"tmp root" }); + const targetPath = await resolveTmpPathForList(effectivePath); const entries = await fs.readdir(targetPath, { withFileTypes: true }); const lines = entries.map((entry) => { diff --git a/src/utils/parse-args.test.ts b/src/utils/parse-args.test.ts index b5e394d..460be98 100644 --- a/src/utils/parse-args.test.ts +++ b/src/utils/parse-args.test.ts @@ -1,17 +1,8 @@ -import { describe, expect, it } from "vitest"; -import { z } from "zod"; - import { Logger } from "~clients/logger"; import { parseArgs } from "~utils/parse-args"; +import { describe, expect, it } from "vitest"; -const TestSchema = z - .object({ - suite: z.string().optional(), - all: z.coerce.boolean().default(false), - }) - .refine((data) => data.suite ?? data.all, { - message: "Either --suite or --all is required", - }); +import { CliArgsSchema } from "../cli/agent-evals/schemas"; describe("parseArgs", () => { const logger = new Logger({ @@ -23,22 +14,91 @@ describe("parseArgs", () => { it("parses args after a standalone double-dash separator", () => { const args = parseArgs({ logger, - schema: TestSchema, + schema: CliArgsSchema, rawArgs: ["--", "--suite=example"], }); expect(args.suite).toBe("example"); expect(args.all).toBe(false); + expect(args.report).toBe("json"); + expect(args.out).toBe("agent-evals"); + expect(args.verbose).toBe(false); }); it("parses --all even when preceded by a double-dash separator", () => { const args = parseArgs({ logger, - schema: TestSchema, + schema: CliArgsSchema, rawArgs: ["--", "--all"], }); expect(args.suite).toBeUndefined(); expect(args.all).toBe(true); + expect(args.report).toBe("json"); + expect(args.out).toBe("agent-evals"); + expect(args.verbose).toBe(false); + }); + + it("parses --report with valid enum values", () => { + const argsJson = parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--all", "--report=json"], + }); + expect(argsJson.report).toBe("json"); + + const argsMd = parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: 
["--all", "--report=md"], + }); + expect(argsMd.report).toBe("md"); + + const argsBoth = parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--all", "--report=both"], + }); + expect(argsBoth.report).toBe("both"); + }); + + it("parses --out with custom path", () => { + const args = parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--all", "--out=custom/output/path"], + }); + + expect(args.out).toBe("custom/output/path"); + }); + + it("parses --verbose flag", () => { + const args = parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--all", "--verbose"], + }); + + expect(args.verbose).toBe(true); + }); + + it("throws on invalid --report value", () => { + expect(() => + parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--all", "--report=invalid"], + }) + ).toThrow(); + }); + + it("throws when neither --suite nor --all is provided", () => { + expect(() => + parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: [], + }) + ).toThrow("Either --suite or --all is required"); }); }); From 7a1e10d8ed3ccf9740c3a5806fe16a2c69670da3 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:19:07 +0200 Subject: [PATCH 05/14] docs: update README with new suite field notes and file assertion types --- src/cli/agent-evals/README.md | 45 +++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/cli/agent-evals/README.md b/src/cli/agent-evals/README.md index 5c19b4a..bd1cf4a 100644 --- a/src/cli/agent-evals/README.md +++ b/src/cli/agent-evals/README.md @@ -21,7 +21,7 @@ pnpm run:agent-evals -- --suite=example --verbose --report=both - `--all`: Run all suites in the `suites/` directory - `--report `: Report format: `json`, `md`, or `both` (default: `json`) - `--out `: Output base directory under `tmp/` (default: `agent-evals`) -- `--verbose`: Enable verbose logging with assertion details +- `--verbose`: Enable verbose logging with per-assertion 
failure details Either `--suite` or `--all` is required. @@ -64,6 +64,15 @@ Add JSON files to `suites/` directory. Example structure: } ``` +### Suite Field Notes + +- `agent.model` is currently fixed to `gpt-5-mini`. +- `agent.tools` accepts tool names from the registry: `readFile`, `writeFile`, `listFiles`. +- `agent.maxTurns` defaults to `5` if omitted. +- `defaults.timeout` applies per-case when the case does not provide `timeout`. +- `cases[].timeout` defaults to `defaults.timeout`, then `30000` (ms). +- `cases[].tags` is optional metadata for filtering/labeling (not used at runtime yet). + ## Assertion Types - **contains**: Check if output contains a string @@ -84,11 +93,43 @@ Add JSON files to `suites/` directory. Example structure: { "type": "equals", "expected": { "key": "value" } } ``` -- **jsonPath**: Extract and compare nested values +- **jsonPath**: Extract and compare nested values (dot notation like `response.status` or `$.response.status`) ```json { "type": "jsonPath", "path": "$.response.status", "expected": "success" } ``` +### File Assertions (tmp/ only) + +These assertions read files under `tmp/` to verify tool side effects. Paths are relative to `tmp/`. 
+ +- **fileExists**: Check that a file exists + + ```json + { "type": "fileExists", "path": "agent-evals/tool-test-output.txt" } + ``` + +- **fileContains**: Check that a file contains a string + + ```json + { + "type": "fileContains", + "path": "agent-evals/tool-test-output.txt", + "value": "Hello World", + "caseSensitive": false + } + ``` + +- **fileJsonPath**: Read a JSON file and compare a path (dot notation like `$.name`) + + ```json + { + "type": "fileJsonPath", + "path": "agent-evals/tool-test-data.json", + "jsonPath": "$.name", + "expected": "test" + } + ``` + ## Flowchart ```mermaid From d3d07b03ad80c7d247718a99fc3ce77f1ff379d2 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:35:39 +0200 Subject: [PATCH 06/14] feat: add delete file tool and related assertions to evaluation framework - Implement delete file tool with path validation - Add fileNotExists assertion schema and evaluation logic - Update tools suite to include delete file tests --- src/cli/agent-evals/clients/tool-registry.ts | 2 + src/cli/agent-evals/schemas.ts | 10 ++ src/cli/agent-evals/suites/tools.json | 20 +++- src/cli/agent-evals/utils/assertions.ts | 3 + src/cli/agent-evals/utils/file-assertions.ts | 29 ++++++ .../delete-file/delete-file-tool.test.ts | 92 +++++++++++++++++++ src/tools/delete-file/delete-file-tool.ts | 36 ++++++++ src/tools/utils/fs.ts | 28 ++++++ 8 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 src/tools/delete-file/delete-file-tool.test.ts create mode 100644 src/tools/delete-file/delete-file-tool.ts diff --git a/src/cli/agent-evals/clients/tool-registry.ts b/src/cli/agent-evals/clients/tool-registry.ts index b4d0386..73e1fe0 100644 --- a/src/cli/agent-evals/clients/tool-registry.ts +++ b/src/cli/agent-evals/clients/tool-registry.ts @@ -1,5 +1,6 @@ import type { Tool } from "@openai/agents"; import type { Logger } from "~clients/logger"; +import { createDeleteFileTool } from 
"~tools/delete-file/delete-file-tool"; import { createListFilesTool } from "~tools/list-files/list-files-tool"; import { createReadFileTool } from "~tools/read-file/read-file-tool"; import { createWriteFileTool } from "~tools/write-file/write-file-tool"; @@ -14,6 +15,7 @@ const toolFactories: Record = { readFile: ({ logger }) => createReadFileTool({ logger }), writeFile: ({ logger }) => createWriteFileTool({ logger }), listFiles: ({ logger }) => createListFilesTool({ logger }), + deleteFile: ({ logger }) => createDeleteFileTool({ logger }), }; /** diff --git a/src/cli/agent-evals/schemas.ts b/src/cli/agent-evals/schemas.ts index 9b56a40..501c37f 100644 --- a/src/cli/agent-evals/schemas.ts +++ b/src/cli/agent-evals/schemas.ts @@ -81,6 +81,12 @@ export const FileJsonPathAssertionSchema = z.object({ description: z.string().optional(), }); +export const FileNotExistsAssertionSchema = z.object({ + type: z.literal("fileNotExists"), + path: z.string(), + description: z.string().optional(), +}); + export const AssertionSchema = z.discriminatedUnion("type", [ ContainsAssertionSchema, MatchesRegexAssertionSchema, @@ -89,6 +95,7 @@ export const AssertionSchema = z.discriminatedUnion("type", [ FileExistsAssertionSchema, FileContainsAssertionSchema, FileJsonPathAssertionSchema, + FileNotExistsAssertionSchema, ]); export type Assertion = z.infer; @@ -99,6 +106,9 @@ export type JsonPathAssertion = z.infer; export type FileExistsAssertion = z.infer; export type FileContainsAssertion = z.infer; export type FileJsonPathAssertion = z.infer; +export type FileNotExistsAssertion = z.infer< + typeof FileNotExistsAssertionSchema +>; // ============================================ // Eval Case diff --git a/src/cli/agent-evals/suites/tools.json b/src/cli/agent-evals/suites/tools.json index 0a765b4..ca26336 100644 --- a/src/cli/agent-evals/suites/tools.json +++ b/src/cli/agent-evals/suites/tools.json @@ -6,7 +6,7 @@ "name": "ToolTestAgent", "model": "gpt-5-mini", "instructions": "You are an 
assistant that tests file tools. Use the tools provided to complete tasks. After using a tool, report results concisely.", - "tools": ["readFile", "writeFile", "listFiles"], + "tools": ["readFile", "writeFile", "listFiles", "deleteFile"], "maxTurns": 3 }, "defaults": { @@ -84,6 +84,24 @@ } ], "tags": ["writeFile", "json"] + }, + { + "id": "delete-file", + "name": "deleteFile removes a file", + "prompt": "Delete the file 'agent-evals/tool-test-data.json'", + "assertions": [ + { + "type": "fileNotExists", + "path": "agent-evals/tool-test-data.json", + "description": "File should be deleted" + }, + { + "type": "contains", + "value": "Deleted", + "description": "Agent should confirm deletion" + } + ], + "tags": ["deleteFile"] } ] } diff --git a/src/cli/agent-evals/utils/assertions.ts b/src/cli/agent-evals/utils/assertions.ts index bdc50d1..5a2362b 100644 --- a/src/cli/agent-evals/utils/assertions.ts +++ b/src/cli/agent-evals/utils/assertions.ts @@ -10,6 +10,7 @@ import { evaluateFileContainsAssertion, evaluateFileExistsAssertion, evaluateFileJsonPathAssertion, + evaluateFileNotExistsAssertion, } from "./file-assertions"; /** @@ -35,6 +36,8 @@ export const evaluateAssertion = async ( return evaluateFileContainsAssertion(assertion); case "fileJsonPath": return evaluateFileJsonPathAssertion(assertion); + case "fileNotExists": + return evaluateFileNotExistsAssertion(assertion); } }; diff --git a/src/cli/agent-evals/utils/file-assertions.ts b/src/cli/agent-evals/utils/file-assertions.ts index be8f360..e5fd889 100644 --- a/src/cli/agent-evals/utils/file-assertions.ts +++ b/src/cli/agent-evals/utils/file-assertions.ts @@ -7,6 +7,7 @@ import type { FileContainsAssertion, FileExistsAssertion, FileJsonPathAssertion, + FileNotExistsAssertion, } from "../schemas"; /** @@ -108,6 +109,34 @@ export const evaluateFileJsonPathAssertion = async ( } }; +/** + * Evaluate a fileNotExists assertion by checking if the file does not exist in tmp/. 
+ */ +export const evaluateFileNotExistsAssertion = async ( + assertion: FileNotExistsAssertion +): Promise => { + const fullPath = path.join(TMP_ROOT, assertion.path); + + try { + await fs.access(fullPath); + return { + assertion, + passed: false, + message: `File still exists: ${assertion.path}`, + actual: "file exists", + expected: "file to not exist", + }; + } catch { + return { + assertion, + passed: true, + message: `File does not exist: ${assertion.path}`, + actual: "file not found", + expected: "file to not exist", + }; + } +}; + /** * Deep equality check using JSON serialization. */ diff --git a/src/tools/delete-file/delete-file-tool.test.ts b/src/tools/delete-file/delete-file-tool.test.ts new file mode 100644 index 0000000..80a7bb9 --- /dev/null +++ b/src/tools/delete-file/delete-file-tool.test.ts @@ -0,0 +1,92 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { TMP_ROOT } from "~tools/utils/fs"; +import { invokeTool, tryCreateSymlink } from "~tools/utils/test-utils"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +import { createDeleteFileTool } from "./delete-file-tool"; + +describe("createDeleteFileTool tmp path safety", () => { + let testDir = ""; + let relativeDir = ""; + // eslint-disable-next-line @typescript-eslint/no-empty-function + const mockLogger = { tool: () => {} } as never; + + beforeEach(async () => { + await fs.mkdir(TMP_ROOT, { recursive: true }); + testDir = await fs.mkdtemp(path.join(TMP_ROOT, "vitest-tools-")); + relativeDir = path.relative(TMP_ROOT, testDir); + }); + + afterEach(async () => { + if (testDir) { + await fs.rm(testDir, { recursive: true, force: true }); + } + testDir = ""; + relativeDir = ""; + }); + + it("deletes relative paths under tmp", async () => { + const relativePath = path.join(relativeDir, "to-delete.txt"); + const absolutePath = path.join(TMP_ROOT, relativePath); + await fs.writeFile(absolutePath, "delete me", "utf8"); + + const deleteFileTool = 
createDeleteFileTool({ logger: mockLogger }); + const result = await invokeTool(deleteFileTool, { + path: relativePath, + }); + + expect(result).toContain("Deleted"); + await expect(fs.access(absolutePath)).rejects.toThrow(); + }); + + it("deletes absolute paths under tmp", async () => { + const absolutePath = path.join(testDir, "absolute-delete.txt"); + await fs.writeFile(absolutePath, "delete me", "utf8"); + + const deleteFileTool = createDeleteFileTool({ logger: mockLogger }); + const result = await invokeTool(deleteFileTool, { + path: absolutePath, + }); + + expect(result).toContain("Deleted"); + await expect(fs.access(absolutePath)).rejects.toThrow(); + }); + + it("rejects path traversal attempts", async () => { + const deleteFileTool = createDeleteFileTool({ logger: mockLogger }); + const result = await invokeTool(deleteFileTool, { + path: "../outside.txt", + }); + expect(result).toContain("Path traversal is not allowed."); + }); + + it("rejects symlink paths", async () => { + const realDir = path.join(testDir, "real"); + await fs.mkdir(realDir, { recursive: true }); + const realFile = path.join(realDir, "file.txt"); + await fs.writeFile(realFile, "real content", "utf8"); + const linkDir = path.join(testDir, "link"); + + const symlinkCreated = await tryCreateSymlink(realDir, linkDir); + if (!symlinkCreated) { + return; + } + + const symlinkPath = path.join(relativeDir, "link", "file.txt"); + + const deleteFileTool = createDeleteFileTool({ logger: mockLogger }); + const result = await invokeTool(deleteFileTool, { + path: symlinkPath, + }); + expect(result).toContain("Symlink paths are not allowed."); + }); + + it("returns error for non-existent files", async () => { + const deleteFileTool = createDeleteFileTool({ logger: mockLogger }); + const result = await invokeTool(deleteFileTool, { + path: path.join(relativeDir, "nonexistent.txt"), + }); + expect(result).toContain("Path does not exist."); + }); +}); diff --git a/src/tools/delete-file/delete-file-tool.ts 
b/src/tools/delete-file/delete-file-tool.ts new file mode 100644 index 0000000..7c10156 --- /dev/null +++ b/src/tools/delete-file/delete-file-tool.ts @@ -0,0 +1,36 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { tool } from "@openai/agents"; +import type { Logger } from "~clients/logger"; +import { resolveTmpPathForDelete, TMP_ROOT } from "~tools/utils/fs"; + +export type DeleteFileToolOptions = { + logger: Logger; +}; + +export const createDeleteFileTool = ({ logger }: DeleteFileToolOptions) => + tool({ + name: "deleteFile", + description: + "Deletes a file under the repo tmp directory (path is relative to tmp).", + parameters: { + type: "object", + properties: { + path: { + type: "string", + description: "Relative path within the repo tmp directory", + }, + }, + required: ["path"], + additionalProperties: false, + }, + execute: async ({ path: filePath }: { path: string }) => { + logger.tool("Deleting file", { path: filePath }); + const targetPath = await resolveTmpPathForDelete(filePath); + await fs.unlink(targetPath); + const relativePath = path.relative(TMP_ROOT, targetPath); + const displayPath = path.join("tmp", relativePath); + logger.tool("Deleted file", { path: displayPath }); + return `Deleted ${displayPath}`; + }, + }); diff --git a/src/tools/utils/fs.ts b/src/tools/utils/fs.ts index c023437..755173a 100644 --- a/src/tools/utils/fs.ts +++ b/src/tools/utils/fs.ts @@ -149,6 +149,34 @@ export const resolveTmpPathForRead = async (userPath: string) => { return candidatePath; }; +export const resolveTmpPathForDelete = async (userPath: string) => { + const trimmed = userPath.trim(); + if (!trimmed) { + throw new Error("Path cannot be empty."); + } + if (PATH_TRAVERSAL.test(trimmed)) { + throw new Error("Path traversal is not allowed."); + } + + await ensureTmpRoot({ create: false }); + const candidatePath = resolveCandidatePath(trimmed); + + await assertNoSymlinkComponents(TMP_ROOT, candidatePath); + + const tmpRootReal = await 
fs.realpath(TMP_ROOT); + const parentReal = await fs.realpath(path.dirname(candidatePath)); + if (!isPathInside(tmpRootReal, parentReal)) { + throw new Error("Resolved path escapes tmp directory."); + } + + const fileStat = await fs.lstat(candidatePath); + if (!fileStat.isFile()) { + throw new Error("Path must point to a file."); + } + + return candidatePath; +}; + export const resolveTmpPathForList = async (userPath?: string) => { const trimmed = (userPath ?? "").trim(); From a66279ffe76a8187518c6110465ab05aa90e123f Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:36:23 +0200 Subject: [PATCH 07/14] refactor: update report generation to log display paths instead of full paths - Modify JSON and Markdown report saving methods to log relative paths - Introduce toDisplayPath method for consistent path formatting --- src/cli/agent-evals/clients/report-generator.ts | 17 ++++++++++++----- src/tools/read-file/read-file-tool.ts | 7 +++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/cli/agent-evals/clients/report-generator.ts b/src/cli/agent-evals/clients/report-generator.ts index 5437cb7..2e97223 100644 --- a/src/cli/agent-evals/clients/report-generator.ts +++ b/src/cli/agent-evals/clients/report-generator.ts @@ -1,7 +1,7 @@ import fs from "node:fs/promises"; import path from "node:path"; import type { Logger } from "~clients/logger"; -import { resolveTmpPathForWrite } from "~tools/utils/fs"; +import { resolveTmpPathForWrite, TMP_ROOT } from "~tools/utils/fs"; import { DECIMAL_PLACES, @@ -92,8 +92,9 @@ export class ReportGenerator { const fullPath = await resolveTmpPathForWrite(relativePath); await fs.writeFile(fullPath, JSON.stringify(report, null, 2), "utf8"); - this.logger.info("JSON report saved", { path: fullPath }); - return fullPath; + const displayPath = this.toDisplayPath(fullPath); + this.logger.info("JSON report saved", { path: displayPath }); + return displayPath; } 
private async writeMarkdown(report: EvalReport): Promise { @@ -104,8 +105,14 @@ export class ReportGenerator { const markdown = this.formatMarkdown(report); await fs.writeFile(fullPath, markdown, "utf8"); - this.logger.info("Markdown report saved", { path: fullPath }); - return fullPath; + const displayPath = this.toDisplayPath(fullPath); + this.logger.info("Markdown report saved", { path: displayPath }); + return displayPath; + } + + private toDisplayPath(fullPath: string): string { + const relativePath = path.relative(TMP_ROOT, fullPath); + return path.join("tmp", relativePath); } private formatMarkdown(report: EvalReport): string { diff --git a/src/tools/read-file/read-file-tool.ts b/src/tools/read-file/read-file-tool.ts index f122d48..885b6b3 100644 --- a/src/tools/read-file/read-file-tool.ts +++ b/src/tools/read-file/read-file-tool.ts @@ -1,7 +1,8 @@ import fs from "node:fs/promises"; +import path from "node:path"; import { tool } from "@openai/agents"; import type { Logger } from "~clients/logger"; -import { resolveTmpPathForRead } from "~tools/utils/fs"; +import { resolveTmpPathForRead, TMP_ROOT } from "~tools/utils/fs"; export type ReadFileToolOptions = { logger: Logger; @@ -26,7 +27,9 @@ export const createReadFileTool = ({ logger }: ReadFileToolOptions) => execute: async ({ path: filePath }: { path: string }) => { logger.tool("Reading file", { path: filePath }); const targetPath = await resolveTmpPathForRead(filePath); - logger.tool("Read file result", { targetPath }); + const relativePath = path.relative(TMP_ROOT, targetPath); + const displayPath = path.join("tmp", relativePath); + logger.tool("Read file result", { targetPath: displayPath }); return fs.readFile(targetPath, "utf8"); }, }); From ae95f75b0f10a9edff5ada24215a043019e9019e Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:45:21 +0200 Subject: [PATCH 08/14] feat: add deleteFile tool and update documentation for agent evals --- 
AGENTS.md | 3 +++ README.md | 45 ++++++++++++++++++++++++++--------- src/cli/agent-evals/README.md | 2 +- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index ce87b40..0471f0b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -94,6 +94,9 @@ All file tools are sandboxed to `tmp/` using path validation (`src/tools/utils/f - **`listFiles`** (`src/tools/list-files/list-files-tool.ts`) - Lists files/dirs under `tmp/`. - Params: `{ path?: string }` (defaults to `tmp/` root) +- **`deleteFile`** (`src/tools/delete-file/delete-file-tool.ts`) + - Deletes a file under `tmp/`. + - Params: `{ path: string }` (path is **relative to `tmp/`**) - **`runPython`** (`src/tools/run-python/run-python-tool.ts`) - Runs a Python script from a configured scripts directory. - Params: `{ scriptName: string, input: string }` (input is JSON string; pass `""` for no input) diff --git a/README.md b/README.md index 2a5ca6e..27639ad 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # cli-agent-sandbox -A minimal TypeScript CLI sandbox for testing agent workflows and safe web scraping. This is a single-package repo built with [`@openai/agents`](https://github.com/openai/openai-agents-js), and it includes a guestbook demo, a Finnish name explorer CLI, a publication scraping pipeline with a Playwright-based scraper for JS-rendered pages, an ETF backtest CLI, and agent tools scoped to `tmp` with strong safety checks. +A minimal TypeScript CLI sandbox for testing agent workflows and safe web scraping. This is a single-package repo built with [`@openai/agents`](https://github.com/openai/openai-agents-js), and it includes a guestbook demo, a Finnish name explorer CLI, a publication scraping pipeline with a Playwright-based scraper for JS-rendered pages, an ETF backtest CLI, an agent evals CLI, and agent tools scoped to `tmp` with strong safety checks. ## Quick Start @@ -9,9 +9,10 @@ A minimal TypeScript CLI sandbox for testing agent workflows and safe web scrapi 3. 
Install Playwright system deps (Chromium): `pnpm exec playwright install-deps chromium` 4. Set `OPENAI_API_KEY` (export it or add to `.env`) 5. Run the demo: `pnpm run:guestbook` -6. (Optional) Explore Finnish name stats: `pnpm run:name-explorer -- --mode ai|stats` -7. (Optional) Run publication scraping: `pnpm run:scrape-publications -- --url="https://example.com"` -8. (Optional) Run ETF backtest: `pnpm run:etf-backtest -- --isin=IE00B5BMR087` (requires Python setup below) +6. (Optional) Run agent evals: `pnpm run:agent-evals -- --suite=example` +7. (Optional) Explore Finnish name stats: `pnpm run:name-explorer -- --mode ai|stats` +8. (Optional) Run publication scraping: `pnpm run:scrape-publications -- --url="https://example.com"` +9. (Optional) Run ETF backtest: `pnpm run:etf-backtest -- --isin=IE00B5BMR087` (requires Python setup below) ### Python Setup (for ETF backtest) @@ -29,6 +30,7 @@ pip install numpy pandas torch | Command | Description | | ------------------------------ | ------------------------------------------------------ | | `pnpm run:guestbook` | Run the interactive guestbook CLI demo | +| `pnpm run:agent-evals` | Run agent evaluation suites and generate reports | | `pnpm run:name-explorer` | Explore Finnish name statistics (AI Q&A or stats) | | `pnpm run:scrape-publications` | Scrape publication links and build a review page | | `pnpm run:etf-backtest` | Run ETF backtest + feature optimizer (requires Python) | @@ -87,17 +89,29 @@ Notes: - `--refresh` forces a refetch; otherwise cached data is reused. - Python scripts live in `src/cli/etf-backtest/scripts/`. +## Agent evals + +The `run:agent-evals` CLI executes evaluation suites for agents and writes reports under `tmp/agent-evals/` by default. + +Usage: + +``` +pnpm run:agent-evals -- --suite=example +pnpm run:agent-evals -- --all +``` + ## Tools File tools are sandboxed to the `tmp/` directory with path validation to prevent traversal and symlink attacks. 
The `fetchUrl` tool adds SSRF protections and HTML sanitization, and `runPython` executes whitelisted Python scripts from a configured directory. -| Tool | Location | Description | -| ----------- | ----------------------------------------- | ------------------------------------------------------------------------------ | -| `fetchUrl` | `src/tools/fetch-url/fetch-url-tool.ts` | Fetches URLs safely and returns sanitized Markdown/text | -| `readFile` | `src/tools/read-file/read-file-tool.ts` | Reads file content from `tmp` directory | -| `writeFile` | `src/tools/write-file/write-file-tool.ts` | Writes content to files in `tmp` directory | -| `listFiles` | `src/tools/list-files/list-files-tool.ts` | Lists files and directories under `tmp` | -| `runPython` | `src/tools/run-python/run-python-tool.ts` | Runs Python scripts from a configured scripts directory (JSON stdin supported) | +| Tool | Location | Description | +| ------------ | ------------------------------------------- | ------------------------------------------------------------------------------ | +| `fetchUrl` | `src/tools/fetch-url/fetch-url-tool.ts` | Fetches URLs safely and returns sanitized Markdown/text | +| `readFile` | `src/tools/read-file/read-file-tool.ts` | Reads file content from `tmp` directory | +| `writeFile` | `src/tools/write-file/write-file-tool.ts` | Writes content to files in `tmp` directory | +| `listFiles` | `src/tools/list-files/list-files-tool.ts` | Lists files and directories under `tmp` | +| `deleteFile` | `src/tools/delete-file/delete-file-tool.ts` | Deletes files under the `tmp` directory | +| `runPython` | `src/tools/run-python/run-python-tool.ts` | Runs Python scripts from a configured scripts directory (JSON stdin supported) | `runPython` details: @@ -109,6 +123,14 @@ File tools are sandboxed to the `tmp/` directory with path validation to prevent ``` src/ ├── cli/ +│ ├── agent-evals/ +│ │ ├── main.ts # Agent evals CLI entry point +│ │ ├── README.md # Agent evals CLI docs +│ │ 
├── constants.ts # CLI constants +│ │ ├── schemas.ts # CLI args + suite schemas +│ │ ├── clients/ # Suite runner + report generator +│ │ ├── utils/ # Assertion + formatting helpers +│ │ └── suites/ # Example evaluation suites │ ├── etf-backtest/ │ │ ├── main.ts # ETF backtest CLI entry point │ │ ├── README.md # ETF backtest docs @@ -142,6 +164,7 @@ src/ │ ├── parse-args.ts # Shared CLI arg parsing helper │ └── question-handler.ts # Shared CLI prompt + validation helper ├── tools/ +│ ├── delete-file/ # Delete file tool │ ├── fetch-url/ # Safe fetch tool │ ├── list-files/ # List files tool │ ├── read-file/ # Read file tool diff --git a/src/cli/agent-evals/README.md b/src/cli/agent-evals/README.md index bd1cf4a..12c01c2 100644 --- a/src/cli/agent-evals/README.md +++ b/src/cli/agent-evals/README.md @@ -67,7 +67,7 @@ Add JSON files to `suites/` directory. Example structure: ### Suite Field Notes - `agent.model` is currently fixed to `gpt-5-mini`. -- `agent.tools` accepts tool names from the registry: `readFile`, `writeFile`, `listFiles`. +- `agent.tools` accepts tool names from the registry: `readFile`, `writeFile`, `listFiles`, `deleteFile`. - `agent.maxTurns` defaults to `5` if omitted. - `defaults.timeout` applies per-case when the case does not provide `timeout`. - `cases[].timeout` defaults to `defaults.timeout`, then `30000` (ms). 
From 19e028356eeb24ea6c06410f40ec977b4d2748d1 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 15:08:37 +0200 Subject: [PATCH 09/14] chore: remove outdated CHECKLIST.md file --- src/cli/agent-evals/CHECKLIST.md | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 src/cli/agent-evals/CHECKLIST.md diff --git a/src/cli/agent-evals/CHECKLIST.md b/src/cli/agent-evals/CHECKLIST.md deleted file mode 100644 index 986ce0d..0000000 --- a/src/cli/agent-evals/CHECKLIST.md +++ /dev/null @@ -1,25 +0,0 @@ -# Post-Scaffold Checklist - -## Setup - -- [ ] Update `main.ts` with CLI logic -- [ ] Add CLI arguments to the Zod schema -- [ ] Update `README.md` description and flowchart - -## Optional Structure - -- [ ] Create `./clients/` for pipeline/client classes -- [ ] Create `./types/` for Zod schemas -- [ ] Create `./tools/` for CLI-specific agent tools - -## Before Committing - -- [ ] `pnpm typecheck` -- [ ] `pnpm lint` -- [ ] `pnpm format:check` -- [ ] Add tests if behavior is testable -- [ ] `pnpm test` - -## Cleanup - -- [ ] Delete this CHECKLIST.md when done From c20e80cc64a7898d94a322efcdcb7fbf830337bc Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 15:14:47 +0200 Subject: [PATCH 10/14] docs: add comment to clarify purpose of sanitizeArgs function --- src/utils/parse-args.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/utils/parse-args.ts b/src/utils/parse-args.ts index b777282..91f3855 100644 --- a/src/utils/parse-args.ts +++ b/src/utils/parse-args.ts @@ -8,6 +8,7 @@ export type ParseArgsOptions = { rawArgs?: string[]; }; +// Strip standalone "--" so parseArgv doesn't treat it as a literal arg after end-of-options. 
const sanitizeArgs = (rawArgs: string[]): string[] => rawArgs.filter((arg) => arg !== "--"); From c9a6fd926e904e1af6a5155211a468a6ef84ea23 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 17:05:27 +0200 Subject: [PATCH 11/14] feat: add path traversal checks to file assertions and utilities - Implement path traversal rejection in file assertions - Add resolveTmpPathForAccess utility to validate paths --- src/cli/agent-evals/utils/assertions.test.ts | 43 +++++++++++++++++++ src/cli/agent-evals/utils/file-assertions.ts | 45 +++++++++++++------- src/tools/utils/fs.ts | 32 ++++++++++++++ 3 files changed, 104 insertions(+), 16 deletions(-) diff --git a/src/cli/agent-evals/utils/assertions.test.ts b/src/cli/agent-evals/utils/assertions.test.ts index f57c86c..594c8a3 100644 --- a/src/cli/agent-evals/utils/assertions.test.ts +++ b/src/cli/agent-evals/utils/assertions.test.ts @@ -226,6 +226,7 @@ describe("evaluateAssertion", () => { const TEST_DIR = path.join(TMP_ROOT, "assertion-tests"); const TEST_FILE = path.join(TEST_DIR, "test-file.txt"); const TEST_JSON_FILE = path.join(TEST_DIR, "test-data.json"); + const TRAVERSAL_PATH = "../package.json"; beforeAll(async () => { await fs.mkdir(TEST_DIR, { recursive: true }); @@ -260,6 +261,16 @@ describe("evaluateAssertion", () => { expect(result.passed).toBe(false); expect(result.message).toContain("does not exist"); }); + + it("rejects traversal paths", async () => { + const assertion: Assertion = { + type: "fileExists", + path: TRAVERSAL_PATH, + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.actual).toContain("Path traversal is not allowed"); + }); }); describe("fileContains", () => { @@ -387,5 +398,37 @@ describe("evaluateAssertion", () => { expect(result.message).toContain("Failed to evaluate"); }); }); + + describe("fileNotExists", () => { + it("passes when file does not exist", async () => { + 
const assertion: Assertion = { + type: "fileNotExists", + path: "assertion-tests/missing.txt", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + expect(result.message).toContain("does not exist"); + }); + + it("fails when file exists", async () => { + const assertion: Assertion = { + type: "fileNotExists", + path: "assertion-tests/test-file.txt", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("still exists"); + }); + + it("rejects traversal paths", async () => { + const assertion: Assertion = { + type: "fileNotExists", + path: TRAVERSAL_PATH, + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("Failed to check file"); + }); + }); }); }); diff --git a/src/cli/agent-evals/utils/file-assertions.ts b/src/cli/agent-evals/utils/file-assertions.ts index e5fd889..cee6b53 100644 --- a/src/cli/agent-evals/utils/file-assertions.ts +++ b/src/cli/agent-evals/utils/file-assertions.ts @@ -1,6 +1,5 @@ import fs from "node:fs/promises"; -import path from "node:path"; -import { TMP_ROOT } from "~tools/utils/fs"; +import { resolveTmpPathForAccess, resolveTmpPathForRead } from "~tools/utils/fs"; import type { AssertionResult, @@ -16,9 +15,8 @@ import type { export const evaluateFileExistsAssertion = async ( assertion: FileExistsAssertion ): Promise => { - const fullPath = path.join(TMP_ROOT, assertion.path); - try { + const fullPath = await resolveTmpPathForAccess(assertion.path); await fs.access(fullPath); return { assertion, @@ -27,12 +25,12 @@ export const evaluateFileExistsAssertion = async ( actual: assertion.path, expected: "file to exist", }; - } catch { + } catch (err) { return { assertion, passed: false, message: `File does not exist: ${assertion.path}`, - actual: "file not found", + actual: err instanceof Error ? 
err.message : String(err), expected: "file to exist", }; } @@ -44,9 +42,8 @@ export const evaluateFileExistsAssertion = async ( export const evaluateFileContainsAssertion = async ( assertion: FileContainsAssertion ): Promise => { - const fullPath = path.join(TMP_ROOT, assertion.path); - try { + const fullPath = await resolveTmpPathForRead(assertion.path); const content = await fs.readFile(fullPath, "utf8"); const caseSensitive = assertion.caseSensitive ?? true; const searchValue = caseSensitive @@ -81,9 +78,8 @@ export const evaluateFileContainsAssertion = async ( export const evaluateFileJsonPathAssertion = async ( assertion: FileJsonPathAssertion ): Promise => { - const fullPath = path.join(TMP_ROOT, assertion.path); - try { + const fullPath = await resolveTmpPathForRead(assertion.path); const content = await fs.readFile(fullPath, "utf8"); const json = JSON.parse(content) as unknown; const value = getJsonPath(json, assertion.jsonPath); @@ -115,9 +111,8 @@ export const evaluateFileJsonPathAssertion = async ( export const evaluateFileNotExistsAssertion = async ( assertion: FileNotExistsAssertion ): Promise => { - const fullPath = path.join(TMP_ROOT, assertion.path); - try { + const fullPath = await resolveTmpPathForAccess(assertion.path); await fs.access(fullPath); return { assertion, @@ -126,17 +121,35 @@ export const evaluateFileNotExistsAssertion = async ( actual: "file exists", expected: "file to not exist", }; - } catch { + } catch (err) { + if (isErrno(err, "ENOENT")) { + return { + assertion, + passed: true, + message: `File does not exist: ${assertion.path}`, + actual: "file not found", + expected: "file to not exist", + }; + } return { assertion, - passed: true, - message: `File does not exist: ${assertion.path}`, - actual: "file not found", + passed: false, + message: `Failed to check file: ${err instanceof Error ? err.message : String(err)}`, + actual: err instanceof Error ? 
err.message : String(err), expected: "file to not exist", }; } }; +const isErrno = ( + error: unknown, + code: string +): error is NodeJS.ErrnoException => + typeof error === "object" && + error !== null && + "code" in error && + (error as NodeJS.ErrnoException).code === code; + /** * Deep equality check using JSON serialization. */ diff --git a/src/tools/utils/fs.ts b/src/tools/utils/fs.ts index 755173a..8755033 100644 --- a/src/tools/utils/fs.ts +++ b/src/tools/utils/fs.ts @@ -149,6 +149,38 @@ export const resolveTmpPathForRead = async (userPath: string) => { return candidatePath; }; +export const resolveTmpPathForAccess = async (userPath: string) => { + const trimmed = userPath.trim(); + if (!trimmed) { + throw new Error("Path cannot be empty."); + } + if (PATH_TRAVERSAL.test(trimmed)) { + throw new Error("Path traversal is not allowed."); + } + + await ensureTmpRoot({ create: false }); + const candidatePath = resolveCandidatePath(trimmed); + + await assertNoSymlinkComponents(TMP_ROOT, candidatePath, { + allowMissing: true, + }); + + const tmpRootReal = await fs.realpath(TMP_ROOT); + const parentDir = path.dirname(candidatePath); + try { + const parentReal = await fs.realpath(parentDir); + if (!isPathInside(tmpRootReal, parentReal)) { + throw new Error("Resolved path escapes tmp directory."); + } + } catch (error) { + if (!isErrno(error, "ENOENT")) { + throw error; + } + } + + return candidatePath; +}; + export const resolveTmpPathForDelete = async (userPath: string) => { const trimmed = userPath.trim(); if (!trimmed) { From 880f072e0153d4bfa6e6a27310778ac66627a2f1 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 17:10:58 +0200 Subject: [PATCH 12/14] refactor: update description and test case for empty path handling --- src/tools/list-files/list-files-tool.test.ts | 4 ++-- src/tools/list-files/list-files-tool.ts | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/src/tools/list-files/list-files-tool.test.ts b/src/tools/list-files/list-files-tool.test.ts index 106a32d..15b65b8 100644 --- a/src/tools/list-files/list-files-tool.test.ts +++ b/src/tools/list-files/list-files-tool.test.ts @@ -52,9 +52,9 @@ describe("createListFilesTool tmp path safety", () => { expect(result).toContain("[file] absolute.txt"); }); - it("lists root of tmp when no path provided", async () => { + it("lists root of tmp when empty path provided", async () => { const listFilesTool = createListFilesTool({ logger: mockLogger }); - const result = await invokeTool(listFilesTool, {}); + const result = await invokeTool(listFilesTool, { path: "" }); expect(result).toContain("Contents of tmp:"); expect(result).toContain(path.basename(testDir)); diff --git a/src/tools/list-files/list-files-tool.ts b/src/tools/list-files/list-files-tool.ts index d0a08cc..a719ab6 100644 --- a/src/tools/list-files/list-files-tool.ts +++ b/src/tools/list-files/list-files-tool.ts @@ -12,14 +12,14 @@ export const createListFilesTool = ({ logger }: ListFilesToolOptions) => tool({ name: "listFiles", description: - "Lists files and directories under the repo tmp directory (path is relative to tmp). If no path provided, lists root of tmp.", + "Lists files and directories under the repo tmp directory (path is relative to tmp). Use an empty path to list the tmp root.", parameters: { type: "object", properties: { path: { type: "string", description: - 'Relative path within the repo tmp directory. 
Use empty string "" to list tmp root.', + "Relative path within the repo tmp directory.", }, }, required: ["path"], From 8659764e396db0d8c3cf656d8a3d6f20744d8908 Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 17:17:37 +0200 Subject: [PATCH 13/14] feat: add lodash-es for deep equality checks in assertions - Replace custom deep equality function with lodash's isEqual - Update assertions to handle objects with different key ordering - Add lodash-es type definitions to package.json and pnpm-lock.yaml --- package.json | 2 ++ pnpm-lock.yaml | 23 ++++++++++++++++++++ src/cli/agent-evals/utils/assertions.test.ts | 9 ++++++++ src/cli/agent-evals/utils/assertions.ts | 13 ++++------- src/cli/agent-evals/utils/file-assertions.ts | 5 ++++- src/tools/list-files/list-files-tool.ts | 3 +-- 6 files changed, 43 insertions(+), 12 deletions(-) diff --git a/package.json b/package.json index af04036..50803be 100644 --- a/package.json +++ b/package.json @@ -35,6 +35,7 @@ "@ianvs/prettier-plugin-sort-imports": "4.7.0", "@openai/agents": "0.3.7", "@types/jsdom": "27.0.0", + "@types/lodash-es": "4.17.12", "@types/node": "25.0.6", "@types/sanitize-html": "2.16.0", "@types/slug": "5.0.9", @@ -43,6 +44,7 @@ "eslint-plugin-import": "2.32.0", "jiti": "2.6.1", "jsdom": "27.4.0", + "lodash-es": "4.17.23", "marked": "17.0.1", "node-html-markdown": "2.0.0", "playwright": "1.57.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9927fa0..6aa2b3a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -23,6 +23,9 @@ importers: '@types/jsdom': specifier: 27.0.0 version: 27.0.0 + '@types/lodash-es': + specifier: 4.17.12 + version: 4.17.12 '@types/node': specifier: 25.0.6 version: 25.0.6 @@ -47,6 +50,9 @@ importers: jsdom: specifier: 27.4.0 version: 27.4.0 + lodash-es: + specifier: 4.17.23 + version: 4.17.23 marked: specifier: 17.0.1 version: 17.0.1 @@ -618,6 +624,12 @@ packages: '@types/json5@0.0.29': resolution: {integrity: 
sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ==} + '@types/lodash-es@4.17.12': + resolution: {integrity: sha512-0NgftHUcV4v34VhXm8QBSftKVXtbkBG3ViCjs6+eJ5a6y6Mi/jiFGPc1sC7QK+9BFhWrURE3EOggmWaSxL9OzQ==} + + '@types/lodash@4.17.23': + resolution: {integrity: sha512-RDvF6wTulMPjrNdCoYRC8gNR880JNGT8uB+REUpC2Ns4pRqQJhGz90wh7rgdXDPpCczF3VGktDuFGVnz8zP7HA==} + '@types/node@25.0.6': resolution: {integrity: sha512-NNu0sjyNxpoiW3YuVFfNz7mxSQ+S4X2G28uqg2s+CzoqoQjLPsWSbsFFyztIAqt2vb8kfEAsJNepMGPTxFDx3Q==} @@ -1497,6 +1509,9 @@ packages: resolution: {integrity: sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==} engines: {node: '>=10'} + lodash-es@4.17.23: + resolution: {integrity: sha512-kVI48u3PZr38HdYz98UmfPnXl2DXrpdctLrFLCd3kOx1xUkOmpFPx7gCWWM5MPkL/fD8zb+Ph0QzjGFs4+hHWg==} + lodash.merge@4.6.2: resolution: {integrity: sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==} @@ -2604,6 +2619,12 @@ snapshots: '@types/json5@0.0.29': {} + '@types/lodash-es@4.17.12': + dependencies: + '@types/lodash': 4.17.23 + + '@types/lodash@4.17.23': {} + '@types/node@25.0.6': dependencies: undici-types: 7.16.0 @@ -3735,6 +3756,8 @@ snapshots: dependencies: p-locate: 5.0.0 + lodash-es@4.17.23: {} + lodash.merge@4.6.2: {} lru-cache@11.2.4: {} diff --git a/src/cli/agent-evals/utils/assertions.test.ts b/src/cli/agent-evals/utils/assertions.test.ts index 594c8a3..a03e3fb 100644 --- a/src/cli/agent-evals/utils/assertions.test.ts +++ b/src/cli/agent-evals/utils/assertions.test.ts @@ -132,6 +132,15 @@ describe("evaluateAssertion", () => { expect(result.passed).toBe(true); }); + it("passes for objects with different key ordering", async () => { + const assertion: Assertion = { + type: "equals", + expected: { a: 1, b: 2 }, + }; + const result = await evaluateAssertion(assertion, { b: 2, a: 1 }); + expect(result.passed).toBe(true); + }); + it("fails for different 
objects", async () => { const assertion: Assertion = { type: "equals", diff --git a/src/cli/agent-evals/utils/assertions.ts b/src/cli/agent-evals/utils/assertions.ts index 5a2362b..47670fd 100644 --- a/src/cli/agent-evals/utils/assertions.ts +++ b/src/cli/agent-evals/utils/assertions.ts @@ -1,3 +1,5 @@ +import { isEqual } from "lodash-es"; + import type { Assertion, AssertionResult, @@ -98,7 +100,7 @@ const evaluateEqualsAssertion = ( assertion: EqualsAssertion, output: unknown ): AssertionResult => { - const passed = deepEquals(output, assertion.expected); + const passed = isEqual(output, assertion.expected); return { assertion, @@ -117,7 +119,7 @@ const evaluateJsonPathAssertion = ( ): AssertionResult => { try { const value = getJsonPath(output, assertion.path); - const passed = deepEquals(value, assertion.expected); + const passed = isEqual(value, assertion.expected); return { assertion, @@ -149,13 +151,6 @@ const stringifyOutput = (output: unknown): string => { return JSON.stringify(output, null, 2); }; -/** - * Deep equality check using JSON serialization. - */ -const deepEquals = (a: unknown, b: unknown): boolean => { - return JSON.stringify(a) === JSON.stringify(b); -}; - /** * Simple JSON path getter supporting dot notation. 
* Supports paths like "response.status" or "$.response.status" diff --git a/src/cli/agent-evals/utils/file-assertions.ts b/src/cli/agent-evals/utils/file-assertions.ts index cee6b53..bca2100 100644 --- a/src/cli/agent-evals/utils/file-assertions.ts +++ b/src/cli/agent-evals/utils/file-assertions.ts @@ -1,5 +1,8 @@ import fs from "node:fs/promises"; -import { resolveTmpPathForAccess, resolveTmpPathForRead } from "~tools/utils/fs"; +import { + resolveTmpPathForAccess, + resolveTmpPathForRead, +} from "~tools/utils/fs"; import type { AssertionResult, diff --git a/src/tools/list-files/list-files-tool.ts b/src/tools/list-files/list-files-tool.ts index a719ab6..28ec137 100644 --- a/src/tools/list-files/list-files-tool.ts +++ b/src/tools/list-files/list-files-tool.ts @@ -18,8 +18,7 @@ export const createListFilesTool = ({ logger }: ListFilesToolOptions) => properties: { path: { type: "string", - description: - "Relative path within the repo tmp directory.", + description: "Relative path within the repo tmp directory.", }, }, required: ["path"], From 384b9e2d6c5d86f4fbb6094017d1a7ccb322469d Mon Sep 17 00:00:00 2001 From: Juha Kangas <42040080+valuecodes@users.noreply.github.com> Date: Tue, 27 Jan 2026 21:20:12 +0200 Subject: [PATCH 14/14] refactor: reorganize schemas into separate types directory - Move schemas from the root to a dedicated types directory - Update import paths across multiple files to reflect new structure --- README.md | 21 +++++++++++++------ src/cli/agent-evals/clients/eval-runner.ts | 2 +- .../agent-evals/clients/report-generator.ts | 2 +- src/cli/agent-evals/clients/suite-loader.ts | 4 ++-- src/cli/agent-evals/main.ts | 4 ++-- src/cli/agent-evals/{ => types}/schemas.ts | 2 +- src/cli/agent-evals/utils/assertions.test.ts | 2 +- src/cli/agent-evals/utils/assertions.ts | 2 +- src/cli/agent-evals/utils/file-assertions.ts | 2 +- .../etf-backtest/clients/etf-data-fetcher.ts | 4 ++-- .../etf-backtest/clients/learnings-manager.ts | 8 +++++-- 
src/cli/etf-backtest/main.ts | 4 ++-- src/cli/etf-backtest/{ => types}/schemas.ts | 2 +- .../etf-backtest/utils/experiment-extract.ts | 4 ++-- src/cli/etf-backtest/utils/final-report.ts | 2 +- .../etf-backtest/utils/learnings-formatter.ts | 2 +- src/cli/etf-backtest/utils/scoring.ts | 2 +- src/cli/guestbook/main.ts | 8 ++----- src/cli/guestbook/types/schemas.ts | 8 +++++++ src/cli/name-explorer/main.ts | 7 ++----- src/cli/name-explorer/types/schemas.ts | 8 +++++++ src/cli/scrape-publications/main.ts | 8 ++----- src/cli/scrape-publications/types/schemas.ts | 9 ++++++++ src/utils/parse-args.test.ts | 2 +- 24 files changed, 73 insertions(+), 46 deletions(-) rename src/cli/agent-evals/{ => types}/schemas.ts (99%) rename src/cli/etf-backtest/{ => types}/schemas.ts (99%) create mode 100644 src/cli/guestbook/types/schemas.ts create mode 100644 src/cli/name-explorer/types/schemas.ts create mode 100644 src/cli/scrape-publications/types/schemas.ts diff --git a/README.md b/README.md index 27639ad..9acc8a0 100644 --- a/README.md +++ b/README.md @@ -127,7 +127,8 @@ src/ │ │ ├── main.ts # Agent evals CLI entry point │ │ ├── README.md # Agent evals CLI docs │ │ ├── constants.ts # CLI constants -│ │ ├── schemas.ts # CLI args + suite schemas +│ │ ├── types/ # CLI schemas +│ │ │ └── schemas.ts # CLI args + suite schemas │ │ ├── clients/ # Suite runner + report generator │ │ ├── utils/ # Assertion + formatting helpers │ │ └── suites/ # Example evaluation suites @@ -135,17 +136,24 @@ src/ │ │ ├── main.ts # ETF backtest CLI entry point │ │ ├── README.md # ETF backtest docs │ │ ├── constants.ts # CLI constants -│ │ ├── schemas.ts # CLI args + agent output schemas +│ │ ├── types/ # CLI schemas +│ │ │ └── schemas.ts # CLI args + agent output schemas │ │ ├── clients/ # Data fetcher + Playwright capture │ │ ├── utils/ # Scoring + formatting helpers -│ │ ├── types/ # ETF data types │ │ └── scripts/ # Python backtest + prediction scripts │ ├── guestbook/ │ │ ├── main.ts # Guestbook CLI entry 
point -│ │ └── README.md # Guestbook CLI docs +│ │ ├── README.md # Guestbook CLI docs +│ │ └── types/ # CLI schemas +│ │ └── schemas.ts # Guestbook output schema │ ├── name-explorer/ │ │ ├── main.ts # Name Explorer CLI entry point -│ │ └── README.md # Name Explorer CLI docs +│ │ ├── README.md # Name Explorer CLI docs +│ │ └── types/ # CLI schemas + data types +│ │ ├── ai-output.ts # Agent output schema +│ │ ├── index.ts # Type exports +│ │ ├── schemas.ts # CLI args schema +│ │ └── stats.ts # Statistics types │ └── scrape-publications/ │ ├── main.ts # Publication scraping CLI entry point │ ├── README.md # Publication scraping docs @@ -154,7 +162,8 @@ src/ │ │ ├── publication-scraper.ts # Link discovery + selector inference │ │ └── review-page-generator.ts # Review HTML generator │ └── types/ -│ └── index.ts # Publication Zod schemas +│ ├── index.ts # Publication Zod schemas +│ └── schemas.ts # CLI args schema ├── clients/ │ ├── fetch.ts # Shared HTTP fetch + sanitization │ ├── logger.ts # Shared console logger diff --git a/src/cli/agent-evals/clients/eval-runner.ts b/src/cli/agent-evals/clients/eval-runner.ts index 7205b24..f698660 100644 --- a/src/cli/agent-evals/clients/eval-runner.ts +++ b/src/cli/agent-evals/clients/eval-runner.ts @@ -15,7 +15,7 @@ import type { EvalSuite, SuiteResult, SuiteSummary, -} from "../schemas"; +} from "../types/schemas"; import { evaluateAssertion } from "../utils/assertions"; import { createToolsFromNames } from "./tool-registry"; diff --git a/src/cli/agent-evals/clients/report-generator.ts b/src/cli/agent-evals/clients/report-generator.ts index 2e97223..180bc4d 100644 --- a/src/cli/agent-evals/clients/report-generator.ts +++ b/src/cli/agent-evals/clients/report-generator.ts @@ -9,7 +9,7 @@ import { REPORTS_SUBDIR, STATUS_SYMBOLS, } from "../constants"; -import type { EvalReport, ReportSummary, SuiteResult } from "../schemas"; +import type { EvalReport, ReportSummary, SuiteResult } from "../types/schemas"; export type ReportFormat = 
"json" | "md" | "both"; diff --git a/src/cli/agent-evals/clients/suite-loader.ts b/src/cli/agent-evals/clients/suite-loader.ts index 4cde5fa..22d29e4 100644 --- a/src/cli/agent-evals/clients/suite-loader.ts +++ b/src/cli/agent-evals/clients/suite-loader.ts @@ -3,8 +3,8 @@ import path from "node:path"; import type { Logger } from "~clients/logger"; import { SUITE_FILE_EXTENSION, SUITES_DIR } from "../constants"; -import type { EvalSuite } from "../schemas"; -import { EvalSuiteSchema } from "../schemas"; +import type { EvalSuite } from "../types/schemas"; +import { EvalSuiteSchema } from "../types/schemas"; export type SuiteLoaderConfig = { logger: Logger; diff --git a/src/cli/agent-evals/main.ts b/src/cli/agent-evals/main.ts index a5d2659..ac71fca 100644 --- a/src/cli/agent-evals/main.ts +++ b/src/cli/agent-evals/main.ts @@ -11,8 +11,8 @@ import { EvalRunner } from "./clients/eval-runner"; import { ReportGenerator } from "./clients/report-generator"; import { SuiteLoader } from "./clients/suite-loader"; import { LINE_WIDTH, PERCENT_MULTIPLIER, ZERO } from "./constants"; -import type { SuiteResult } from "./schemas"; -import { CliArgsSchema } from "./schemas"; +import type { SuiteResult } from "./types/schemas"; +import { CliArgsSchema } from "./types/schemas"; const logger = new Logger(); diff --git a/src/cli/agent-evals/schemas.ts b/src/cli/agent-evals/types/schemas.ts similarity index 99% rename from src/cli/agent-evals/schemas.ts rename to src/cli/agent-evals/types/schemas.ts index 501c37f..1a564fb 100644 --- a/src/cli/agent-evals/schemas.ts +++ b/src/cli/agent-evals/types/schemas.ts @@ -4,7 +4,7 @@ import { DEFAULT_OUT_PATH, DEFAULT_REPORT_FORMAT, DEFAULT_VERBOSE, -} from "./constants"; +} from "../constants"; // ============================================ // CLI Arguments diff --git a/src/cli/agent-evals/utils/assertions.test.ts b/src/cli/agent-evals/utils/assertions.test.ts index a03e3fb..4e2f365 100644 --- a/src/cli/agent-evals/utils/assertions.test.ts +++ 
b/src/cli/agent-evals/utils/assertions.test.ts @@ -3,7 +3,7 @@ import path from "node:path"; import { TMP_ROOT } from "~tools/utils/fs"; import { afterAll, beforeAll, describe, expect, it } from "vitest"; -import type { Assertion } from "../schemas"; +import type { Assertion } from "../types/schemas"; import { evaluateAssertion } from "./assertions"; describe("evaluateAssertion", () => { diff --git a/src/cli/agent-evals/utils/assertions.ts b/src/cli/agent-evals/utils/assertions.ts index 47670fd..4cce048 100644 --- a/src/cli/agent-evals/utils/assertions.ts +++ b/src/cli/agent-evals/utils/assertions.ts @@ -7,7 +7,7 @@ import type { EqualsAssertion, JsonPathAssertion, MatchesRegexAssertion, -} from "../schemas"; +} from "../types/schemas"; import { evaluateFileContainsAssertion, evaluateFileExistsAssertion, diff --git a/src/cli/agent-evals/utils/file-assertions.ts b/src/cli/agent-evals/utils/file-assertions.ts index bca2100..c041d06 100644 --- a/src/cli/agent-evals/utils/file-assertions.ts +++ b/src/cli/agent-evals/utils/file-assertions.ts @@ -10,7 +10,7 @@ import type { FileExistsAssertion, FileJsonPathAssertion, FileNotExistsAssertion, -} from "../schemas"; +} from "../types/schemas"; /** * Evaluate a fileExists assertion by checking if the file exists in tmp/. 
diff --git a/src/cli/etf-backtest/clients/etf-data-fetcher.ts b/src/cli/etf-backtest/clients/etf-data-fetcher.ts index 5bc3297..4548625 100644 --- a/src/cli/etf-backtest/clients/etf-data-fetcher.ts +++ b/src/cli/etf-backtest/clients/etf-data-fetcher.ts @@ -14,8 +14,8 @@ import { getEtfApiPattern, JUST_ETF_BASE_URL, } from "../constants"; -import type { EtfDataResponse } from "../schemas"; -import { EtfDataResponseSchema, isEtfDataResponse } from "../schemas"; +import type { EtfDataResponse } from "../types/schemas"; +import { EtfDataResponseSchema, isEtfDataResponse } from "../types/schemas"; export type EtfDataFetcherConfig = { logger: Logger; diff --git a/src/cli/etf-backtest/clients/learnings-manager.ts b/src/cli/etf-backtest/clients/learnings-manager.ts index cee91cd..6ed9373 100644 --- a/src/cli/etf-backtest/clients/learnings-manager.ts +++ b/src/cli/etf-backtest/clients/learnings-manager.ts @@ -8,8 +8,12 @@ import { LEARNINGS_FILENAME, MAX_HISTORY_ITEMS, } from "../constants"; -import type { ExperimentResult, IterationRecord, Learnings } from "../schemas"; -import { LearningsSchema } from "../schemas"; +import type { + ExperimentResult, + IterationRecord, + Learnings, +} from "../types/schemas"; +import { LearningsSchema } from "../types/schemas"; import { computeScore } from "../utils/scoring"; export type LearningsManagerConfig = { diff --git a/src/cli/etf-backtest/main.ts b/src/cli/etf-backtest/main.ts index e11aca4..43d0e5f 100644 --- a/src/cli/etf-backtest/main.ts +++ b/src/cli/etf-backtest/main.ts @@ -30,8 +30,8 @@ import { TARGET_R2_NON_OVERLAPPING, ZERO, } from "./constants"; -import { AgentOutputSchema, CliArgsSchema } from "./schemas"; -import type { ExperimentResult, Learnings } from "./schemas"; +import { AgentOutputSchema, CliArgsSchema } from "./types/schemas"; +import type { ExperimentResult, Learnings } from "./types/schemas"; import { extractLastExperimentResult } from "./utils/experiment-extract"; import { printFinalResults } from 
"./utils/final-report"; import { formatFixed, formatPercent } from "./utils/formatters"; diff --git a/src/cli/etf-backtest/schemas.ts b/src/cli/etf-backtest/types/schemas.ts similarity index 99% rename from src/cli/etf-backtest/schemas.ts rename to src/cli/etf-backtest/types/schemas.ts index 7859d09..51c746e 100644 --- a/src/cli/etf-backtest/schemas.ts +++ b/src/cli/etf-backtest/types/schemas.ts @@ -6,7 +6,7 @@ import { DEFAULT_REFRESH, DEFAULT_SEED, DEFAULT_VERBOSE, -} from "./constants"; +} from "../constants"; // ISIN validation: 2 letter country code + 10 alphanumeric characters const IsinSchema = z diff --git a/src/cli/etf-backtest/utils/experiment-extract.ts b/src/cli/etf-backtest/utils/experiment-extract.ts index 903abf2..93fc1ec 100644 --- a/src/cli/etf-backtest/utils/experiment-extract.ts +++ b/src/cli/etf-backtest/utils/experiment-extract.ts @@ -1,6 +1,6 @@ import { INDEX_NOT_FOUND, JSON_SLICE_END_OFFSET, ZERO } from "../constants"; -import { ExperimentResultSchema } from "../schemas"; -import type { ExperimentResult } from "../schemas"; +import { ExperimentResultSchema } from "../types/schemas"; +import type { ExperimentResult } from "../types/schemas"; const extractJsonFromStdout = (stdout: string): unknown => { const startIdx = stdout.indexOf("{"); diff --git a/src/cli/etf-backtest/utils/final-report.ts b/src/cli/etf-backtest/utils/final-report.ts index 9cc76f5..9351a43 100644 --- a/src/cli/etf-backtest/utils/final-report.ts +++ b/src/cli/etf-backtest/utils/final-report.ts @@ -7,7 +7,7 @@ import { LINE_SEPARATOR, PREDICTION_HORIZON_MONTHS, } from "../constants"; -import type { ExperimentResult } from "../schemas"; +import type { ExperimentResult } from "../types/schemas"; import { formatFixed, formatPercent } from "./formatters"; export const printFinalResults = ( diff --git a/src/cli/etf-backtest/utils/learnings-formatter.ts b/src/cli/etf-backtest/utils/learnings-formatter.ts index a800fd1..27fd9b2 100644 --- 
a/src/cli/etf-backtest/utils/learnings-formatter.ts +++ b/src/cli/etf-backtest/utils/learnings-formatter.ts @@ -1,5 +1,5 @@ import { DECIMAL_PLACES, LEARNINGS_SUMMARY_TOP_N } from "../constants"; -import type { Learnings } from "../schemas"; +import type { Learnings } from "../types/schemas"; import { formatFixed, formatPercent } from "./formatters"; const FEATURE_PREVIEW_COUNT = 4; diff --git a/src/cli/etf-backtest/utils/scoring.ts b/src/cli/etf-backtest/utils/scoring.ts index 4dbd25c..4ad237f 100644 --- a/src/cli/etf-backtest/utils/scoring.ts +++ b/src/cli/etf-backtest/utils/scoring.ts @@ -4,7 +4,7 @@ import { SCORE_WEIGHTS, ZERO, } from "../constants"; -import type { ExperimentResult } from "../schemas"; +import type { ExperimentResult } from "../types/schemas"; export const computeScore = (metrics: ExperimentResult["metrics"]): number => { // Primary: prediction accuracy on non-overlapping samples (honest assessment) diff --git a/src/cli/guestbook/main.ts b/src/cli/guestbook/main.ts index a4013e6..5597d1d 100644 --- a/src/cli/guestbook/main.ts +++ b/src/cli/guestbook/main.ts @@ -6,18 +6,14 @@ import { AgentRunner } from "~clients/agent-runner"; import { Logger } from "~clients/logger"; import { createReadFileTool } from "~tools/read-file/read-file-tool"; import { createWriteFileTool } from "~tools/write-file/write-file-tool"; -import { z } from "zod"; import { question } from "zx"; +import { OutputSchema } from "./types/schemas"; + const logger = new Logger(); logger.info("Guestbook running..."); -const OutputSchema = z.object({ - success: z.boolean(), - message: z.string(), -}); - const agentRunner = new AgentRunner({ name: "GuestbookAgent", model: "gpt-5-mini", diff --git a/src/cli/guestbook/types/schemas.ts b/src/cli/guestbook/types/schemas.ts new file mode 100644 index 0000000..a6a633f --- /dev/null +++ b/src/cli/guestbook/types/schemas.ts @@ -0,0 +1,8 @@ +import { z } from "zod"; + +export const OutputSchema = z.object({ + success: z.boolean(), + message: 
z.string(), +}); + +export type Output = z.infer<typeof OutputSchema>; diff --git a/src/cli/name-explorer/main.ts b/src/cli/name-explorer/main.ts index 39f8247..3827248 100644 --- a/src/cli/name-explorer/main.ts +++ b/src/cli/name-explorer/main.ts @@ -8,7 +8,6 @@ import { AgentRunner } from "~clients/agent-runner"; import { Logger } from "~clients/logger"; import { parseArgs } from "~utils/parse-args"; import { QuestionHandler } from "~utils/question-handler"; -import { z } from "zod"; import { NameSuggesterPipeline } from "./clients/pipeline"; import { StatsGenerator } from "./clients/stats-generator"; @@ -22,16 +21,14 @@ import { NameSuggesterOutputSchema, NameSuggesterOutputTypeSchema, } from "./types"; +import { CliArgsSchema } from "./types/schemas"; const logger = new Logger(); // --- Parse CLI arguments --- const { refetch: shouldRefetch, mode } = parseArgs({ logger, - schema: z.object({ - refetch: z.coerce.boolean().default(false), - mode: z.enum(["stats", "ai"]).default("ai"), - }), + schema: CliArgsSchema, }); // --- Initialize pipeline and database --- diff --git a/src/cli/name-explorer/types/schemas.ts b/src/cli/name-explorer/types/schemas.ts new file mode 100644 index 0000000..0ffd2e1 --- /dev/null +++ b/src/cli/name-explorer/types/schemas.ts @@ -0,0 +1,8 @@ +import { z } from "zod"; + +export const CliArgsSchema = z.object({ + refetch: z.coerce.boolean().default(false), + mode: z.enum(["stats", "ai"]).default("ai"), +}); + +export type CliArgs = z.infer<typeof CliArgsSchema>; diff --git a/src/cli/scrape-publications/main.ts b/src/cli/scrape-publications/main.ts index a1630e0..aacb520 100644 --- a/src/cli/scrape-publications/main.ts +++ b/src/cli/scrape-publications/main.ts @@ -8,10 +8,10 @@ import path from "node:path"; import { Logger } from "~clients/logger"; import { parseArgs } from "~utils/parse-args"; import slug from "slug"; -import { z } from "zod"; import { question } from "zx"; import { PublicationPipeline } from "./clients/publication-pipeline"; +import { CliArgsSchema } from
"./types/schemas"; const logger = new Logger({ level: "info", useColors: true }); @@ -24,11 +24,7 @@ const { filterUrl, } = parseArgs({ logger, - schema: z.object({ - url: z.url(), - refetch: z.coerce.boolean().default(false), - filterUrl: z.string().optional(), - }), + schema: CliArgsSchema, }); // 2. Create slugified directory path diff --git a/src/cli/scrape-publications/types/schemas.ts b/src/cli/scrape-publications/types/schemas.ts new file mode 100644 index 0000000..0797752 --- /dev/null +++ b/src/cli/scrape-publications/types/schemas.ts @@ -0,0 +1,9 @@ +import { z } from "zod"; + +export const CliArgsSchema = z.object({ + url: z.url(), + refetch: z.coerce.boolean().default(false), + filterUrl: z.string().optional(), +}); + +export type CliArgs = z.infer<typeof CliArgsSchema>; diff --git a/src/utils/parse-args.test.ts b/src/utils/parse-args.test.ts index 460be98..b92137f 100644 --- a/src/utils/parse-args.test.ts +++ b/src/utils/parse-args.test.ts @@ -2,7 +2,7 @@ import { Logger } from "~clients/logger"; import { parseArgs } from "~utils/parse-args"; import { describe, expect, it } from "vitest"; -import { CliArgsSchema } from "../cli/agent-evals/schemas"; +import { CliArgsSchema } from "../cli/agent-evals/types/schemas"; describe("parseArgs", () => { const logger = new Logger({