diff --git a/.claude/settings.json b/.claude/settings.json index e47738f..414b774 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -1,8 +1,6 @@ { "$schema": "https://json.schemastore.org/claude-code-settings.json", - "description": "Settings for Claude code agents", "permissions": { - "defaultMode": "default", "allow": [ "Bash(pnpm lint:*)", "Bash(pnpm lint:fix:*)", @@ -10,13 +8,8 @@ "Bash(pnpm build:*)", "Bash(pnpm format:*)", "Bash(pnpm format:check:*)", - "Bash(pnpm test:*)" - ], - "ask": [ - "Bash(pnpm install:*)", - "Bash(pnpm add:*)", - "Bash(pnpm remove:*)", - "Bash(git commit:*)" + "Bash(pnpm test:*)", + "Bash(tsx scripts/scaffold-cli.ts:*)" ], "deny": [ "Bash(curl:*)", @@ -29,6 +22,14 @@ "Read(**/secrets/**)", "Bash(git push:*)", "Bash(gh pr create:*)" - ] - } + ], + "ask": [ + "Bash(pnpm install:*)", + "Bash(pnpm add:*)", + "Bash(pnpm remove:*)", + "Bash(git commit:*)" + ], + "defaultMode": "default" + }, + "description": "Settings for Claude code agents" } diff --git a/AGENTS.md b/AGENTS.md index c67c81f..0471f0b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -7,7 +7,7 @@ **Repo:** `cli-agent-sandbox` — minimal TypeScript CLI sandbox built with `@openai/agents` and tool sandboxing under `tmp/`. 1. Start at `src/cli//main.ts` and the matching `src/cli//README.md`. -2. Follow the pipeline classes under `src/cli//clients/*` and schemas under `src/cli//types/*`. +2. Follow the pipeline classes under `src/cli//clients/*` and schemas under `src/cli//types/schemas.ts`. 3. Reuse shared helpers: `src/utils/parse-args.ts`, `src/utils/question-handler.ts`, `src/clients/logger.ts`. 4. Keep `main.ts` focused on the basic agent flow; move non-trivial logic into `clients/` or `utils/`. 5. Keep changes minimal; add/update **Vitest** tests (`*.test.ts`) when behavior changes. 
@@ -94,6 +94,9 @@ All file tools are sandboxed to `tmp/` using path validation (`src/tools/utils/f - **`listFiles`** (`src/tools/list-files/list-files-tool.ts`) - Lists files/dirs under `tmp/`. - Params: `{ path?: string }` (defaults to `tmp/` root) +- **`deleteFile`** (`src/tools/delete-file/delete-file-tool.ts`) + - Deletes a file under `tmp/`. + - Params: `{ path: string }` (path is **relative to `tmp/`**) - **`runPython`** (`src/tools/run-python/run-python-tool.ts`) - Runs a Python script from a configured scripts directory. - Params: `{ scriptName: string, input: string }` (input is JSON string; pass `""` for no input) @@ -117,7 +120,7 @@ All file tools are sandboxed to `tmp/` using path validation (`src/tools/utils/f - Prefer TypeScript path aliases over deep relative imports: `~tools/*`, `~clients/*`, `~utils/*`. - Use Zod schemas for CLI args and tool IO. - Keep object field names in `camelCase` (e.g., `trainSamples`), not `snake_case`. -- Keep Zod schemas in a dedicated `schemas.ts` file for each CLI (avoid inline schemas in `main.ts`). +- Keep Zod schemas in a dedicated `types/schemas.ts` file for each CLI (avoid inline schemas in `main.ts`). - Keep constants in a dedicated `constants.ts` file for each CLI. - Move hardcoded numeric values into `constants.ts` (treat numbers as configuration). - For HTTP fetching in code, prefer `Fetch` (sanitized) or `PlaywrightScraper` for JS-heavy pages. diff --git a/README.md b/README.md index 2a5ca6e..9acc8a0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # cli-agent-sandbox -A minimal TypeScript CLI sandbox for testing agent workflows and safe web scraping. This is a single-package repo built with [`@openai/agents`](https://github.com/openai/openai-agents-js), and it includes a guestbook demo, a Finnish name explorer CLI, a publication scraping pipeline with a Playwright-based scraper for JS-rendered pages, an ETF backtest CLI, and agent tools scoped to `tmp` with strong safety checks. 
+A minimal TypeScript CLI sandbox for testing agent workflows and safe web scraping. This is a single-package repo built with [`@openai/agents`](https://github.com/openai/openai-agents-js), and it includes a guestbook demo, a Finnish name explorer CLI, a publication scraping pipeline with a Playwright-based scraper for JS-rendered pages, an ETF backtest CLI, an agent evals CLI, and agent tools scoped to `tmp` with strong safety checks. ## Quick Start @@ -9,9 +9,10 @@ A minimal TypeScript CLI sandbox for testing agent workflows and safe web scrapi 3. Install Playwright system deps (Chromium): `pnpm exec playwright install-deps chromium` 4. Set `OPENAI_API_KEY` (export it or add to `.env`) 5. Run the demo: `pnpm run:guestbook` -6. (Optional) Explore Finnish name stats: `pnpm run:name-explorer -- --mode ai|stats` -7. (Optional) Run publication scraping: `pnpm run:scrape-publications -- --url="https://example.com"` -8. (Optional) Run ETF backtest: `pnpm run:etf-backtest -- --isin=IE00B5BMR087` (requires Python setup below) +6. (Optional) Run agent evals: `pnpm run:agent-evals -- --suite=example` +7. (Optional) Explore Finnish name stats: `pnpm run:name-explorer -- --mode ai|stats` +8. (Optional) Run publication scraping: `pnpm run:scrape-publications -- --url="https://example.com"` +9. 
(Optional) Run ETF backtest: `pnpm run:etf-backtest -- --isin=IE00B5BMR087` (requires Python setup below) ### Python Setup (for ETF backtest) @@ -29,6 +30,7 @@ pip install numpy pandas torch | Command | Description | | ------------------------------ | ------------------------------------------------------ | | `pnpm run:guestbook` | Run the interactive guestbook CLI demo | +| `pnpm run:agent-evals` | Run agent evaluation suites and generate reports | | `pnpm run:name-explorer` | Explore Finnish name statistics (AI Q&A or stats) | | `pnpm run:scrape-publications` | Scrape publication links and build a review page | | `pnpm run:etf-backtest` | Run ETF backtest + feature optimizer (requires Python) | @@ -87,17 +89,29 @@ Notes: - `--refresh` forces a refetch; otherwise cached data is reused. - Python scripts live in `src/cli/etf-backtest/scripts/`. +## Agent evals + +The `run:agent-evals` CLI executes evaluation suites for agents and writes reports under `tmp/agent-evals/` by default. + +Usage: + +``` +pnpm run:agent-evals -- --suite=example +pnpm run:agent-evals -- --all +``` + ## Tools File tools are sandboxed to the `tmp/` directory with path validation to prevent traversal and symlink attacks. The `fetchUrl` tool adds SSRF protections and HTML sanitization, and `runPython` executes whitelisted Python scripts from a configured directory. 
-| Tool | Location | Description | -| ----------- | ----------------------------------------- | ------------------------------------------------------------------------------ | -| `fetchUrl` | `src/tools/fetch-url/fetch-url-tool.ts` | Fetches URLs safely and returns sanitized Markdown/text | -| `readFile` | `src/tools/read-file/read-file-tool.ts` | Reads file content from `tmp` directory | -| `writeFile` | `src/tools/write-file/write-file-tool.ts` | Writes content to files in `tmp` directory | -| `listFiles` | `src/tools/list-files/list-files-tool.ts` | Lists files and directories under `tmp` | -| `runPython` | `src/tools/run-python/run-python-tool.ts` | Runs Python scripts from a configured scripts directory (JSON stdin supported) | +| Tool | Location | Description | +| ------------ | ------------------------------------------- | ------------------------------------------------------------------------------ | +| `fetchUrl` | `src/tools/fetch-url/fetch-url-tool.ts` | Fetches URLs safely and returns sanitized Markdown/text | +| `readFile` | `src/tools/read-file/read-file-tool.ts` | Reads file content from `tmp` directory | +| `writeFile` | `src/tools/write-file/write-file-tool.ts` | Writes content to files in `tmp` directory | +| `listFiles` | `src/tools/list-files/list-files-tool.ts` | Lists files and directories under `tmp` | +| `deleteFile` | `src/tools/delete-file/delete-file-tool.ts` | Deletes files under the `tmp` directory | +| `runPython` | `src/tools/run-python/run-python-tool.ts` | Runs Python scripts from a configured scripts directory (JSON stdin supported) | `runPython` details: @@ -109,21 +123,37 @@ File tools are sandboxed to the `tmp/` directory with path validation to prevent ``` src/ ├── cli/ +│ ├── agent-evals/ +│ │ ├── main.ts # Agent evals CLI entry point +│ │ ├── README.md # Agent evals CLI docs +│ │ ├── constants.ts # CLI constants +│ │ ├── types/ # CLI schemas +│ │ │ └── schemas.ts # CLI args + suite schemas +│ │ ├── clients/ # Suite runner + 
report generator +│ │ ├── utils/ # Assertion + formatting helpers +│ │ └── suites/ # Example evaluation suites │ ├── etf-backtest/ │ │ ├── main.ts # ETF backtest CLI entry point │ │ ├── README.md # ETF backtest docs │ │ ├── constants.ts # CLI constants -│ │ ├── schemas.ts # CLI args + agent output schemas +│ │ ├── types/ # CLI schemas +│ │ │ └── schemas.ts # CLI args + agent output schemas │ │ ├── clients/ # Data fetcher + Playwright capture │ │ ├── utils/ # Scoring + formatting helpers -│ │ ├── types/ # ETF data types │ │ └── scripts/ # Python backtest + prediction scripts │ ├── guestbook/ │ │ ├── main.ts # Guestbook CLI entry point -│ │ └── README.md # Guestbook CLI docs +│ │ ├── README.md # Guestbook CLI docs +│ │ └── types/ # CLI schemas +│ │ └── schemas.ts # Guestbook output schema │ ├── name-explorer/ │ │ ├── main.ts # Name Explorer CLI entry point -│ │ └── README.md # Name Explorer CLI docs +│ │ ├── README.md # Name Explorer CLI docs +│ │ └── types/ # CLI schemas + data types +│ │ ├── ai-output.ts # Agent output schema +│ │ ├── index.ts # Type exports +│ │ ├── schemas.ts # CLI args schema +│ │ └── stats.ts # Statistics types │ └── scrape-publications/ │ ├── main.ts # Publication scraping CLI entry point │ ├── README.md # Publication scraping docs @@ -132,7 +162,8 @@ src/ │ │ ├── publication-scraper.ts # Link discovery + selector inference │ │ └── review-page-generator.ts # Review HTML generator │ └── types/ -│ └── index.ts # Publication Zod schemas +│ ├── index.ts # Publication Zod schemas +│ └── schemas.ts # CLI args schema ├── clients/ │ ├── fetch.ts # Shared HTTP fetch + sanitization │ ├── logger.ts # Shared console logger @@ -142,6 +173,7 @@ src/ │ ├── parse-args.ts # Shared CLI arg parsing helper │ └── question-handler.ts # Shared CLI prompt + validation helper ├── tools/ +│ ├── delete-file/ # Delete file tool │ ├── fetch-url/ # Safe fetch tool │ ├── list-files/ # List files tool │ ├── read-file/ # Read file tool diff --git a/package.json b/package.json 
index e7a9c88..50803be 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,7 @@ "run:name-explorer": "pnpm -s node:tsx -- src/cli/name-explorer/main.ts", "run:scrape-publications": "tsx src/cli/scrape-publications/main.ts", "run:etf-backtest": "tsx src/cli/etf-backtest/main.ts", + "run:agent-evals": "tsx src/cli/agent-evals/main.ts", "scaffold:cli": "tsx scripts/scaffold-cli.ts", "node:tsx": "node --disable-warning=ExperimentalWarning --import tsx", "typecheck": "tsc --noEmit", @@ -34,6 +35,7 @@ "@ianvs/prettier-plugin-sort-imports": "4.7.0", "@openai/agents": "0.3.7", "@types/jsdom": "27.0.0", + "@types/lodash-es": "4.17.12", "@types/node": "25.0.6", "@types/sanitize-html": "2.16.0", "@types/slug": "5.0.9", @@ -42,6 +44,7 @@ "eslint-plugin-import": "2.32.0", "jiti": "2.6.1", "jsdom": "27.4.0", + "lodash-es": "4.17.23", "marked": "17.0.1", "node-html-markdown": "2.0.0", "playwright": "1.57.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9927fa0..6aa2b3a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -23,6 +23,9 @@ importers: '@types/jsdom': specifier: 27.0.0 version: 27.0.0 + '@types/lodash-es': + specifier: 4.17.12 + version: 4.17.12 '@types/node': specifier: 25.0.6 version: 25.0.6 @@ -47,6 +50,9 @@ importers: jsdom: specifier: 27.4.0 version: 27.4.0 + lodash-es: + specifier: 4.17.23 + version: 4.17.23 marked: specifier: 17.0.1 version: 17.0.1 @@ -618,6 +624,12 @@ packages: '@types/json5@0.0.29': resolution: {integrity: sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ==} + '@types/lodash-es@4.17.12': + resolution: {integrity: sha512-0NgftHUcV4v34VhXm8QBSftKVXtbkBG3ViCjs6+eJ5a6y6Mi/jiFGPc1sC7QK+9BFhWrURE3EOggmWaSxL9OzQ==} + + '@types/lodash@4.17.23': + resolution: {integrity: sha512-RDvF6wTulMPjrNdCoYRC8gNR880JNGT8uB+REUpC2Ns4pRqQJhGz90wh7rgdXDPpCczF3VGktDuFGVnz8zP7HA==} + '@types/node@25.0.6': resolution: {integrity: 
sha512-NNu0sjyNxpoiW3YuVFfNz7mxSQ+S4X2G28uqg2s+CzoqoQjLPsWSbsFFyztIAqt2vb8kfEAsJNepMGPTxFDx3Q==} @@ -1497,6 +1509,9 @@ packages: resolution: {integrity: sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==} engines: {node: '>=10'} + lodash-es@4.17.23: + resolution: {integrity: sha512-kVI48u3PZr38HdYz98UmfPnXl2DXrpdctLrFLCd3kOx1xUkOmpFPx7gCWWM5MPkL/fD8zb+Ph0QzjGFs4+hHWg==} + lodash.merge@4.6.2: resolution: {integrity: sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==} @@ -2604,6 +2619,12 @@ snapshots: '@types/json5@0.0.29': {} + '@types/lodash-es@4.17.12': + dependencies: + '@types/lodash': 4.17.23 + + '@types/lodash@4.17.23': {} + '@types/node@25.0.6': dependencies: undici-types: 7.16.0 @@ -3735,6 +3756,8 @@ snapshots: dependencies: p-locate: 5.0.0 + lodash-es@4.17.23: {} + lodash.merge@4.6.2: {} lru-cache@11.2.4: {} diff --git a/src/cli/agent-evals/README.md b/src/cli/agent-evals/README.md new file mode 100644 index 0000000..12c01c2 --- /dev/null +++ b/src/cli/agent-evals/README.md @@ -0,0 +1,146 @@ +# Agent Evals + +Run automated evaluation cases for AI agents with PASS/FAIL results and reports. + +## Run + +```bash +# Run a single suite +pnpm run:agent-evals -- --suite=example + +# Run all suites +pnpm run:agent-evals -- --all + +# With options +pnpm run:agent-evals -- --suite=example --verbose --report=both +``` + +## Arguments + +- `--suite `: Run a specific suite by name (without `.json` extension) +- `--all`: Run all suites in the `suites/` directory +- `--report `: Report format: `json`, `md`, or `both` (default: `json`) +- `--out `: Output base directory under `tmp/` (default: `agent-evals`) +- `--verbose`: Enable verbose logging with per-assertion failure details + +Either `--suite` or `--all` is required. 
+ +## Output + +Reports are written to `tmp/<out>/reports/` (default: `tmp/agent-evals/reports/`): + +- `report-{timestamp}.json`: Machine-readable results +- `report-{timestamp}.md`: Human-readable markdown report + +Exit code is 1 if any tests fail or error. + +## Creating Evaluation Suites + +Add JSON files to the `suites/` directory. Example structure: + +```json +{ + "name": "my-suite", + "description": "Test suite description", + "version": "1.0.0", + "agent": { + "name": "MyTestAgent", + "model": "gpt-5-mini", + "instructions": "Agent system prompt here", + "tools": [], + "maxTurns": 3 + }, + "defaults": { + "timeout": 15000 + }, + "cases": [ + { + "id": "case-1", + "name": "Test case name", + "prompt": "User prompt to test", + "assertions": [{ "type": "contains", "value": "expected text" }] + } + ] +} +``` + +### Suite Field Notes + +- `agent.model` is currently fixed to `gpt-5-mini`. +- `agent.tools` accepts tool names from the registry: `readFile`, `writeFile`, `listFiles`, `deleteFile`. +- `agent.maxTurns` defaults to `5` if omitted. +- `defaults.timeout` applies per-case when the case does not provide `timeout`. +- `cases[].timeout` defaults to `defaults.timeout`, then `30000` (ms). +- `cases[].tags` is optional metadata for filtering/labeling (not used at runtime yet). 
+ +## Assertion Types + +- **contains**: Check if output contains a string + + ```json + { "type": "contains", "value": "text", "caseSensitive": false } + ``` + +- **matchesRegex**: Check if output matches a regex pattern + + ```json + { "type": "matchesRegex", "pattern": "\\d+", "flags": "i" } + ``` + +- **equals**: Deep equality check + + ```json + { "type": "equals", "expected": { "key": "value" } } + ``` + +- **jsonPath**: Extract and compare nested values (dot notation like `response.status` or `$.response.status`) + ```json + { "type": "jsonPath", "path": "$.response.status", "expected": "success" } + ``` + +### File Assertions (tmp/ only) + +These assertions read files under `tmp/` to verify tool side effects. Paths are relative to `tmp/`. + +- **fileExists**: Check that a file exists + + ```json + { "type": "fileExists", "path": "agent-evals/tool-test-output.txt" } + ``` + +- **fileContains**: Check that a file contains a string + + ```json + { + "type": "fileContains", + "path": "agent-evals/tool-test-output.txt", + "value": "Hello World", + "caseSensitive": false + } + ``` + +- **fileJsonPath**: Read a JSON file and compare a path (dot notation like `$.name`) + + ```json + { + "type": "fileJsonPath", + "path": "agent-evals/tool-test-data.json", + "jsonPath": "$.name", + "expected": "test" + } + ``` + +## Flowchart + +```mermaid +flowchart TD + A["Start"] --> B["Parse args"] + B --> C["Load suites"] + C --> D["Run each suite"] + D --> E["Run each case"] + E --> F["Evaluate assertions"] + F --> G["Collect results"] + G --> H["Generate reports"] + H --> I["Print summary"] + I --> J["Exit"] +``` diff --git a/src/cli/agent-evals/clients/eval-runner.ts b/src/cli/agent-evals/clients/eval-runner.ts new file mode 100644 index 0000000..f698660 --- /dev/null +++ b/src/cli/agent-evals/clients/eval-runner.ts @@ -0,0 +1,219 @@ +import { AgentRunner } from "~clients/agent-runner"; +import type { Logger } from "~clients/logger"; + +import { + DEFAULT_CASE_TIMEOUT_MS, + 
DEFAULT_MAX_TURNS, + STATUS_SYMBOLS, + ZERO, +} from "../constants"; +import type { + AssertionResult, + CaseResult, + CaseStatus, + EvalCase, + EvalSuite, + SuiteResult, + SuiteSummary, +} from "../types/schemas"; +import { evaluateAssertion } from "../utils/assertions"; +import { createToolsFromNames } from "./tool-registry"; + +export type EvalRunnerConfig = { + logger: Logger; + verbose?: boolean; +}; + +/** + * Executes evaluation suites and collects results. + * Creates an AgentRunner for each suite based on its agent config, + * runs each case, validates outputs, and collects PASS/FAIL results. + */ +export class EvalRunner { + private logger: Logger; + private verbose: boolean; + + constructor(config: EvalRunnerConfig) { + this.logger = config.logger; + this.verbose = config.verbose ?? false; + } + + /** + * Run a single evaluation suite. + */ + async runSuite(suite: EvalSuite): Promise { + const startedAt = new Date(); + this.logger.info("Running suite", { + name: suite.name, + caseCount: suite.cases.length, + }); + + const agentRunner = this.createAgentRunner(suite); + + const caseResults: CaseResult[] = []; + let passed = ZERO; + let failed = ZERO; + let errors = ZERO; + let skipped = ZERO; + + for (const evalCase of suite.cases) { + const caseResult = await this.runCase(evalCase, agentRunner, suite); + caseResults.push(caseResult); + + switch (caseResult.status) { + case "pass": + passed++; + break; + case "fail": + failed++; + break; + case "error": + errors++; + break; + case "skip": + skipped++; + break; + } + + this.logCaseResult(caseResult); + } + + const completedAt = new Date(); + const total = suite.cases.length; + const summary: SuiteSummary = { + total, + passed, + failed, + errors, + skipped, + passRate: total > ZERO ? 
passed / total : ZERO, + }; + + return { + suiteName: suite.name, + suiteVersion: suite.version, + startedAt: startedAt.toISOString(), + completedAt: completedAt.toISOString(), + durationMs: completedAt.getTime() - startedAt.getTime(), + summary, + cases: caseResults, + }; + } + + private logCaseResult(caseResult: CaseResult): void { + const symbol = STATUS_SYMBOLS[caseResult.status]; + const message = `${symbol} ${caseResult.caseId}: ${caseResult.caseName}`; + + if (caseResult.status === "pass") { + this.logger.info(message, { durationMs: caseResult.durationMs }); + } else { + this.logger.warn(message, { + durationMs: caseResult.durationMs, + error: caseResult.error, + }); + if (this.verbose && caseResult.assertionResults.length > ZERO) { + const failedAssertions = caseResult.assertionResults.filter( + (r) => !r.passed + ); + for (const ar of failedAssertions) { + this.logger.debug(" Assertion failed", { message: ar.message }); + } + } + } + } + + /** + * Run a single evaluation case. + */ + private async runCase( + evalCase: EvalCase, + agentRunner: AgentRunner, + suite: EvalSuite + ): Promise { + const startTime = Date.now(); + const timeout = + evalCase.timeout ?? suite.defaults?.timeout ?? DEFAULT_CASE_TIMEOUT_MS; + + this.logger.debug("Running case", { id: evalCase.id, name: evalCase.name }); + + try { + const runPromise = agentRunner.run({ + prompt: evalCase.prompt, + maxTurns: suite.agent.maxTurns ?? DEFAULT_MAX_TURNS, + }); + + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => { + reject(new Error("Case timed out")); + }, timeout); + }); + + const result = await Promise.race([runPromise, timeoutPromise]); + const output: unknown = result.finalOutput; + const durationMs = Date.now() - startTime; + + const assertionResults = await this.runAssertions( + evalCase.assertions, + output + ); + + const allAssertionsPassed = assertionResults.every((r) => r.passed); + const status: CaseStatus = allAssertionsPassed ? 
"pass" : "fail"; + + return { + caseId: evalCase.id, + caseName: evalCase.name, + status, + durationMs, + output, + assertionResults, + error: null, + }; + } catch (err) { + const durationMs = Date.now() - startTime; + const errorMessage = err instanceof Error ? err.message : String(err); + + return { + caseId: evalCase.id, + caseName: evalCase.name, + status: "error", + durationMs, + output: null, + assertionResults: [], + error: errorMessage, + }; + } + } + + /** + * Create an AgentRunner from suite's agent config. + * Instantiates tools from the tool registry based on suite.agent.tools. + */ + private createAgentRunner(suite: EvalSuite): AgentRunner { + const tools = createToolsFromNames(suite.agent.tools, { + logger: this.logger, + }); + + return new AgentRunner({ + name: suite.agent.name, + model: suite.agent.model, + tools, + instructions: suite.agent.instructions, + logger: this.logger, + logToolResults: this.verbose, + stateless: true, + }); + } + + /** + * Run all assertions on the output. 
+ */ + private async runAssertions( + assertions: EvalCase["assertions"], + output: unknown + ): Promise { + return Promise.all( + assertions.map((assertion) => evaluateAssertion(assertion, output)) + ); + } +} diff --git a/src/cli/agent-evals/clients/report-generator.ts b/src/cli/agent-evals/clients/report-generator.ts new file mode 100644 index 0000000..180bc4d --- /dev/null +++ b/src/cli/agent-evals/clients/report-generator.ts @@ -0,0 +1,199 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import type { Logger } from "~clients/logger"; +import { resolveTmpPathForWrite, TMP_ROOT } from "~tools/utils/fs"; + +import { + DECIMAL_PLACES, + PERCENT_MULTIPLIER, + REPORTS_SUBDIR, + STATUS_SYMBOLS, +} from "../constants"; +import type { EvalReport, ReportSummary, SuiteResult } from "../types/schemas"; + +export type ReportFormat = "json" | "md" | "both"; + +export type ReportGeneratorConfig = { + logger: Logger; + outputDir: string; + format: ReportFormat; +}; + +/** + * Generates evaluation reports in JSON and/or Markdown format. + * Reports are written to the configured output directory under tmp/, + * inside a dedicated reports/ subfolder. + */ +export class ReportGenerator { + private logger: Logger; + private outputDir: string; + private format: ReportFormat; + + constructor(config: ReportGeneratorConfig) { + this.logger = config.logger; + this.outputDir = config.outputDir; + this.format = config.format; + } + + /** + * Generate and save report(s) from suite results. + * Returns the paths of saved reports. 
+ */ + async generate(suiteResults: SuiteResult[]): Promise { + const report = this.buildReport(suiteResults); + const savedPaths: string[] = []; + + if (this.format === "json" || this.format === "both") { + const jsonPath = await this.writeJson(report); + savedPaths.push(jsonPath); + } + + if (this.format === "md" || this.format === "both") { + const mdPath = await this.writeMarkdown(report); + savedPaths.push(mdPath); + } + + return savedPaths; + } + + private buildReport(suiteResults: SuiteResult[]): EvalReport { + const totalCases = suiteResults.reduce( + (sum, s) => sum + s.summary.total, + 0 + ); + const passed = suiteResults.reduce((sum, s) => sum + s.summary.passed, 0); + const failed = suiteResults.reduce((sum, s) => sum + s.summary.failed, 0); + const errors = suiteResults.reduce((sum, s) => sum + s.summary.errors, 0); + const skipped = suiteResults.reduce((sum, s) => sum + s.summary.skipped, 0); + const durationMs = suiteResults.reduce((sum, s) => sum + s.durationMs, 0); + + const summary: ReportSummary = { + totalSuites: suiteResults.length, + totalCases, + passed, + failed, + errors, + skipped, + passRate: totalCases > 0 ? 
passed / totalCases : 0, + }; + + return { + generatedAt: new Date().toISOString(), + durationMs, + summary, + suites: suiteResults, + }; + } + + private async writeJson(report: EvalReport): Promise { + const timestamp = this.getTimestamp(); + const filename = `report-${timestamp}.json`; + const relativePath = path.join(this.outputDir, REPORTS_SUBDIR, filename); + const fullPath = await resolveTmpPathForWrite(relativePath); + + await fs.writeFile(fullPath, JSON.stringify(report, null, 2), "utf8"); + const displayPath = this.toDisplayPath(fullPath); + this.logger.info("JSON report saved", { path: displayPath }); + return displayPath; + } + + private async writeMarkdown(report: EvalReport): Promise { + const timestamp = this.getTimestamp(); + const filename = `report-${timestamp}.md`; + const relativePath = path.join(this.outputDir, REPORTS_SUBDIR, filename); + const fullPath = await resolveTmpPathForWrite(relativePath); + + const markdown = this.formatMarkdown(report); + await fs.writeFile(fullPath, markdown, "utf8"); + const displayPath = this.toDisplayPath(fullPath); + this.logger.info("Markdown report saved", { path: displayPath }); + return displayPath; + } + + private toDisplayPath(fullPath: string): string { + const relativePath = path.relative(TMP_ROOT, fullPath); + return path.join("tmp", relativePath); + } + + private formatMarkdown(report: EvalReport): string { + const lines: string[] = []; + + lines.push("# Agent Evaluation Report"); + lines.push(""); + lines.push(`Generated: ${report.generatedAt}`); + lines.push(`Duration: ${report.durationMs}ms`); + lines.push(""); + + lines.push("## Summary"); + lines.push(""); + lines.push("| Metric | Value |"); + lines.push("|--------|-------|"); + lines.push(`| Total Suites | ${report.summary.totalSuites} |`); + lines.push(`| Total Cases | ${report.summary.totalCases} |`); + lines.push(`| Passed | ${report.summary.passed} |`); + lines.push(`| Failed | ${report.summary.failed} |`); + lines.push(`| Errors | 
${report.summary.errors} |`); + lines.push(`| Skipped | ${report.summary.skipped} |`); + lines.push( + `| Pass Rate | ${this.formatPercent(report.summary.passRate)} |` + ); + lines.push(""); + + for (const suite of report.suites) { + lines.push(`## Suite: ${suite.suiteName}`); + lines.push(""); + lines.push(`Version: ${suite.suiteVersion}`); + lines.push(`Duration: ${suite.durationMs}ms`); + lines.push( + `Pass Rate: ${this.formatPercent(suite.summary.passRate)} (${suite.summary.passed}/${suite.summary.total})` + ); + lines.push(""); + + lines.push("### Cases"); + lines.push(""); + lines.push("| Status | ID | Name | Duration |"); + lines.push("|--------|-----|------|----------|"); + + for (const caseResult of suite.cases) { + const status = STATUS_SYMBOLS[caseResult.status]; + lines.push( + `| ${status} | ${caseResult.caseId} | ${caseResult.caseName} | ${caseResult.durationMs}ms |` + ); + } + lines.push(""); + + const problemCases = suite.cases.filter( + (c) => c.status === "fail" || c.status === "error" + ); + if (problemCases.length > 0) { + lines.push("### Details"); + lines.push(""); + for (const caseResult of problemCases) { + lines.push(`#### ${caseResult.caseId}: ${caseResult.caseName}`); + lines.push(""); + if (caseResult.error) { + lines.push(`**Error:** ${caseResult.error}`); + } + if (caseResult.assertionResults.length > 0) { + lines.push("**Assertion Results:**"); + for (const ar of caseResult.assertionResults) { + const icon = ar.passed ? 
"OK" : "FAIL"; + lines.push(`- [${icon}] ${ar.assertion.type}: ${ar.message}`); + } + } + lines.push(""); + } + } + } + + return lines.join("\n"); + } + + private formatPercent(value: number): string { + return `${(value * PERCENT_MULTIPLIER).toFixed(DECIMAL_PLACES.passRate)}%`; + } + + private getTimestamp(): string { + return new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19); + } +} diff --git a/src/cli/agent-evals/clients/suite-loader.ts b/src/cli/agent-evals/clients/suite-loader.ts new file mode 100644 index 0000000..22d29e4 --- /dev/null +++ b/src/cli/agent-evals/clients/suite-loader.ts @@ -0,0 +1,86 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import type { Logger } from "~clients/logger"; + +import { SUITE_FILE_EXTENSION, SUITES_DIR } from "../constants"; +import type { EvalSuite } from "../types/schemas"; +import { EvalSuiteSchema } from "../types/schemas"; + +export type SuiteLoaderConfig = { + logger: Logger; + suitesDir?: string; +}; + +/** + * Loads evaluation suite definitions from JSON files. + * Suite files are stored in the suites/ directory with .json extension. + */ +export class SuiteLoader { + private logger: Logger; + private suitesDir: string; + + constructor(config: SuiteLoaderConfig) { + this.logger = config.logger; + this.suitesDir = config.suitesDir ?? SUITES_DIR; + } + + /** + * Load a single suite by name. + * @param name Suite name (without .json extension) + */ + async load(name: string): Promise { + const filePath = path.join( + this.suitesDir, + `${name}${SUITE_FILE_EXTENSION}` + ); + this.logger.debug("Loading suite", { name, path: filePath }); + + const content = await fs.readFile(filePath, "utf8"); + const json = JSON.parse(content) as unknown; + const suite = EvalSuiteSchema.parse(json); + + this.logger.info("Suite loaded", { + name: suite.name, + caseCount: suite.cases.length, + }); + return suite; + } + + /** + * List all available suite names. 
+ */ + async listSuites(): Promise { + try { + const entries = await fs.readdir(this.suitesDir, { withFileTypes: true }); + const suiteNames = entries + .filter( + (entry) => entry.isFile() && entry.name.endsWith(SUITE_FILE_EXTENSION) + ) + .map((entry) => entry.name.replace(SUITE_FILE_EXTENSION, "")); + + this.logger.debug("Available suites", { suites: suiteNames }); + return suiteNames; + } catch (err) { + if ((err as NodeJS.ErrnoException).code === "ENOENT") { + this.logger.warn("Suites directory not found", { dir: this.suitesDir }); + return []; + } + throw err; + } + } + + /** + * Load all available suites. + */ + async loadAll(): Promise { + const names = await this.listSuites(); + const suites: EvalSuite[] = []; + + for (const name of names) { + const suite = await this.load(name); + suites.push(suite); + } + + return suites; + } +} diff --git a/src/cli/agent-evals/clients/tool-registry.ts b/src/cli/agent-evals/clients/tool-registry.ts new file mode 100644 index 0000000..73e1fe0 --- /dev/null +++ b/src/cli/agent-evals/clients/tool-registry.ts @@ -0,0 +1,37 @@ +import type { Tool } from "@openai/agents"; +import type { Logger } from "~clients/logger"; +import { createDeleteFileTool } from "~tools/delete-file/delete-file-tool"; +import { createListFilesTool } from "~tools/list-files/list-files-tool"; +import { createReadFileTool } from "~tools/read-file/read-file-tool"; +import { createWriteFileTool } from "~tools/write-file/write-file-tool"; + +export type ToolFactoryConfig = { + logger: Logger; +}; + +type ToolFactory = (config: ToolFactoryConfig) => Tool; + +const toolFactories: Record = { + readFile: ({ logger }) => createReadFileTool({ logger }), + writeFile: ({ logger }) => createWriteFileTool({ logger }), + listFiles: ({ logger }) => createListFilesTool({ logger }), + deleteFile: ({ logger }) => createDeleteFileTool({ logger }), +}; + +/** + * Creates tool instances from an array of tool names. + * Throws if an unknown tool name is provided. 
/**
 * Creates tool instances from an array of tool names.
 *
 * @param names Tool names; each must be a key registered in `toolFactories`
 * @param config Shared configuration handed to every tool factory
 * @returns One tool per name, in the same order as `names`
 * @throws Error if a name is not a registered tool
 */
export const createToolsFromNames = (
  names: string[],
  config: ToolFactoryConfig
): Tool[] => {
  return names.map((name) => {
    // Own-property lookup only: a plain `toolFactories[name]` would resolve
    // names such as "toString" or "constructor" to Object.prototype members
    // and call them as if they were tool factories.
    const factory = Object.prototype.hasOwnProperty.call(toolFactories, name)
      ? toolFactories[name]
      : undefined;
    if (!factory) {
      const available = Object.keys(toolFactories).join(", ");
      throw new Error(`Unknown tool: ${name}. Available: ${available}`);
    }
    return factory(config);
  });
};
import { SuiteLoader } from "./clients/suite-loader";
import {
  DECIMAL_PLACES,
  LINE_WIDTH,
  PERCENT_MULTIPLIER,
  ZERO,
} from "./constants";
import type { EvalSuite, SuiteResult } from "./types/schemas";
import { CliArgsSchema } from "./types/schemas";

const logger = new Logger();

logger.info("Agent Evals starting...");

const { suite, all, report, out, verbose } = parseArgs({
  logger,
  schema: CliArgsSchema,
});

if (verbose) {
  logger.debug("Verbose mode enabled");
}

const suiteLoader = new SuiteLoader({ logger });
const evalRunner = new EvalRunner({ logger, verbose });
const reportGenerator = new ReportGenerator({
  logger,
  outputDir: out,
  format: report,
});

// Resolve which suites to run; --all takes precedence over --suite.
let suitesToRun: EvalSuite[];
if (all) {
  logger.info("Loading all suites...");
  suitesToRun = await suiteLoader.loadAll();
} else if (suite) {
  logger.info("Loading suite", { name: suite });
  const singleSuite = await suiteLoader.load(suite);
  suitesToRun = [singleSuite];
} else {
  logger.error("Either --suite or --all is required");
  process.exit(1);
}

// Nothing to run is not a failure: exit cleanly.
if (suitesToRun.length === ZERO) {
  logger.warn("No suites found to run");
  process.exit(0);
}

logger.info("Suites to run", { count: suitesToRun.length });

const suiteResults: SuiteResult[] = [];
const separator = "=".repeat(LINE_WIDTH);

// Suites run sequentially so their log output stays grouped per suite.
for (const evalSuite of suitesToRun) {
  logger.info(separator);
  const result = await evalRunner.runSuite(evalSuite);
  suiteResults.push(result);
}

logger.info(separator);
logger.info("Generating reports...");
const reportPaths = await reportGenerator.generate(suiteResults);

/** Sums one numeric summary counter across every suite result. */
const sumSummary = (
  counter: "total" | "passed" | "failed" | "errors"
): number =>
  suiteResults.reduce((sum, s) => sum + s.summary[counter], ZERO);

const totalCases = sumSummary("total");
const totalPassed = sumSummary("passed");
const totalFailed = sumSummary("failed");
const totalErrors = sumSummary("errors");
// Guard the division so an empty run reports 0% instead of NaN.
const passRate =
  totalCases > ZERO ? (totalPassed / totalCases) * PERCENT_MULTIPLIER : ZERO;

logger.info(separator);
logger.info("EVALUATION COMPLETE");
logger.info(separator);
logger.info("Summary", {
  suites: suiteResults.length,
  cases: totalCases,
  passed: totalPassed,
  failed: totalFailed,
  errors: totalErrors,
  // Same precision constant the report generator uses for pass rates.
  passRate: `${passRate.toFixed(DECIMAL_PLACES.passRate)}%`,
});
logger.info("Reports saved", { paths: reportPaths });

// A non-zero exit code lets CI treat failed or errored cases as a failed run.
if (totalFailed > ZERO || totalErrors > ZERO) {
  process.exit(1);
}

logger.info("Agent Evals completed.");
Respond with your greeting and confidence level.", + "assertions": [ + { + "type": "matchesRegex", + "pattern": "\"answer\"\\s*:", + "description": "Response should have answer field" + }, + { + "type": "matchesRegex", + "pattern": "\"confidence\"\\s*:", + "description": "Response should have confidence field" + } + ], + "tags": ["format", "basic"] + }, + { + "id": "capital-france", + "name": "Basic knowledge - capital of France", + "description": "Tests knowledge retrieval", + "prompt": "What is the capital of France? Respond with your answer and confidence level.", + "assertions": [ + { + "type": "contains", + "value": "Paris", + "caseSensitive": false, + "description": "Answer should mention Paris" + } + ], + "tags": ["knowledge", "geography"] + }, + { + "id": "larger-number", + "name": "Comparison task", + "description": "Tests basic comparison reasoning", + "prompt": "Which is larger: 100 or 50? Respond with your answer and confidence level.", + "assertions": [ + { + "type": "contains", + "value": "100", + "description": "Response should identify 100 as larger" + } + ], + "tags": ["math", "comparison"] + } + ] +} diff --git a/src/cli/agent-evals/suites/tools.json b/src/cli/agent-evals/suites/tools.json new file mode 100644 index 0000000..ca26336 --- /dev/null +++ b/src/cli/agent-evals/suites/tools.json @@ -0,0 +1,107 @@ +{ + "name": "tools-suite", + "description": "Tests shared agent tools (readFile, writeFile, listFiles, deleteFile)", + "version": "1.0.0", + "agent": { + "name": "ToolTestAgent", + "model": "gpt-5-mini", + "instructions": "You are an assistant that tests file tools. Use the tools provided to complete tasks. 
After using a tool, report results concisely.", + "tools": ["readFile", "writeFile", "listFiles", "deleteFile"], + "maxTurns": 3 + }, + "defaults": { + "timeout": 20000 + }, + "cases": [ + { + "id": "write-file", + "name": "writeFile creates a file", + "prompt": "Write the text 'Hello World' to a file called 'agent-evals/tool-test-output.txt'", + "assertions": [ + { + "type": "fileExists", + "path": "agent-evals/tool-test-output.txt", + "description": "File should be created" + }, + { + "type": "fileContains", + "path": "agent-evals/tool-test-output.txt", + "value": "Hello World", + "description": "File should contain the written text" + } + ], + "tags": ["writeFile"] + }, + { + "id": "read-file", + "name": "readFile reads file content", + "prompt": "Read the file 'agent-evals/tool-test-output.txt' and tell me its contents", + "assertions": [ + { + "type": "contains", + "value": "Hello World", + "description": "Agent response should include the file contents" + } + ], + "tags": ["readFile"] + }, + { + "id": "list-files", + "name": "listFiles shows directory contents", + "prompt": "List the files in the tmp/agent-evals directory", + "assertions": [ + { + "type": "contains", + "value": "tool-test-output.txt", + "description": "Agent response should include the previously created file" + } + ], + "tags": ["listFiles"] + }, + { + "id": "write-json", + "name": "writeFile with JSON content", + "prompt": "Write a JSON file called 'agent-evals/tool-test-data.json' with this exact content: {\"name\": \"test\", \"value\": 42}", + "assertions": [ + { + "type": "fileExists", + "path": "agent-evals/tool-test-data.json", + "description": "JSON file should be created" + }, + { + "type": "fileJsonPath", + "path": "agent-evals/tool-test-data.json", + "jsonPath": "$.name", + "expected": "test", + "description": "JSON name field should match" + }, + { + "type": "fileJsonPath", + "path": "agent-evals/tool-test-data.json", + "jsonPath": "$.value", + "expected": 42, + "description": 
import { z } from "zod";

import {
  DEFAULT_OUT_PATH,
  DEFAULT_REPORT_FORMAT,
  DEFAULT_VERBOSE,
} from "../constants";

// ============================================
// CLI Arguments
// ============================================

/**
 * CLI arguments for the agent-evals runner.
 * At least one of `--suite <name>` or `--all` must be supplied.
 */
export const CliArgsSchema = z
  .object({
    suite: z.string().optional(),
    // NOTE(review): z.coerce.boolean() applies Boolean(value), so the string
    // "false" coerces to true. Confirm parseArgs passes real booleans.
    all: z.coerce.boolean().default(false),
    report: z.enum(["json", "md", "both"]).default(DEFAULT_REPORT_FORMAT),
    out: z.string().default(DEFAULT_OUT_PATH),
    verbose: z.coerce.boolean().default(DEFAULT_VERBOSE),
  })
  // `suite ?? all` rejected `--suite "" --all`: an empty string is not
  // nullish, so `all` was never consulted. Test both values explicitly.
  .refine((data) => Boolean(data.suite) || data.all, {
    message: "Either --suite or --all is required",
  });

export type CliArgs = z.infer<typeof CliArgsSchema>;

// ============================================
// Assertion Types
// ============================================

/** Passes when the stringified agent output contains `value`. */
export const ContainsAssertionSchema = z.object({
  type: z.literal("contains"),
  value: z.string(),
  caseSensitive: z.boolean().optional(),
  description: z.string().optional(),
});

/** Passes when the stringified agent output matches `pattern` (with `flags`). */
export const MatchesRegexAssertionSchema = z.object({
  type: z.literal("matchesRegex"),
  pattern: z.string(),
  flags: z.string().optional(),
  description: z.string().optional(),
});

/** Passes when the agent output deep-equals `expected`. */
export const EqualsAssertionSchema = z.object({
  type: z.literal("equals"),
  expected: z.unknown(),
  description: z.string().optional(),
});

/** Passes when the value at dot-notation `path` in the output deep-equals `expected`. */
export const JsonPathAssertionSchema = z.object({
  type: z.literal("jsonPath"),
  path: z.string(),
  expected: z.unknown(),
  description: z.string().optional(),
});

// ============================================
// File Assertion Types (for verifying tool side effects)
// ============================================

export const FileExistsAssertionSchema = z.object({
  type: z.literal("fileExists"),
  path: z.string(),
  description: z.string().optional(),
});

export const FileContainsAssertionSchema = z.object({
  type: z.literal("fileContains"),
  path: z.string(),
  value: z.string(),
  caseSensitive: z.boolean().optional(),
  description: z.string().optional(),
});

export const FileJsonPathAssertionSchema = z.object({
  type: z.literal("fileJsonPath"),
  path: z.string(),
  jsonPath: z.string(),
  expected: z.unknown(),
  description: z.string().optional(),
});

export const FileNotExistsAssertionSchema = z.object({
  type: z.literal("fileNotExists"),
  path: z.string(),
  description: z.string().optional(),
});

/** Every supported assertion, discriminated by its `type` tag. */
export const AssertionSchema = z.discriminatedUnion("type", [
  ContainsAssertionSchema,
  MatchesRegexAssertionSchema,
  EqualsAssertionSchema,
  JsonPathAssertionSchema,
  FileExistsAssertionSchema,
  FileContainsAssertionSchema,
  FileJsonPathAssertionSchema,
  FileNotExistsAssertionSchema,
]);

export type Assertion = z.infer<typeof AssertionSchema>;
export type ContainsAssertion = z.infer<typeof ContainsAssertionSchema>;
export type MatchesRegexAssertion = z.infer<typeof MatchesRegexAssertionSchema>;
export type EqualsAssertion = z.infer<typeof EqualsAssertionSchema>;
export type JsonPathAssertion = z.infer<typeof JsonPathAssertionSchema>;
export type FileExistsAssertion = z.infer<typeof FileExistsAssertionSchema>;
export type FileContainsAssertion = z.infer<typeof FileContainsAssertionSchema>;
export type FileJsonPathAssertion = z.infer<typeof FileJsonPathAssertionSchema>;
export type FileNotExistsAssertion = z.infer<
  typeof FileNotExistsAssertionSchema
>;

// ============================================
// Eval Case
// ============================================

export const EvalCaseSchema = z.object({
  id: z.string(),
  name: z.string(),
  description: z.string().optional(),
  prompt: z.string(),
  assertions: z.array(AssertionSchema).default([]),
  timeout: z.number().optional(),
  tags: z.array(z.string()).default([]),
});

export type EvalCase = z.infer<typeof EvalCaseSchema>;

// ============================================
// Agent Config (for suite)
// ============================================

export const AgentConfigSchema = z.object({
  name: z.string(),
  model: z.literal("gpt-5-mini"),
  instructions: z.string(),
  tools: z.array(z.string()).default([]),
  maxTurns: z.number().optional(),
});

export type AgentConfig = z.infer<typeof AgentConfigSchema>;

// ============================================
// Eval Suite
// ============================================

export const EvalSuiteSchema = z.object({
  name: z.string(),
  description: z.string().optional(),
  version: z.string().default("1.0.0"),
  agent: AgentConfigSchema,
  defaults: z
    .object({
      timeout: z.number().optional(),
    })
    .optional(),
  // A suite without cases is rejected at load time.
  cases: z.array(EvalCaseSchema).min(1),
});

export type EvalSuite = z.infer<typeof EvalSuiteSchema>;

// ============================================
// Assertion Result
// ============================================

export const AssertionResultSchema = z.object({
  assertion: AssertionSchema,
  passed: z.boolean(),
  message: z.string(),
  actual: z.unknown().optional(),
  expected: z.unknown().optional(),
});

export type AssertionResult = z.infer<typeof AssertionResultSchema>;

// ============================================
// Case Result
// ============================================

export const CaseStatusSchema = z.enum(["pass", "fail", "error", "skip"]);
export type CaseStatus = z.infer<typeof CaseStatusSchema>;

export const CaseResultSchema = z.object({
  caseId: z.string(),
  caseName: z.string(),
  status: CaseStatusSchema,
  durationMs: z.number(),
  output: z.unknown().nullable(),
  assertionResults: z.array(AssertionResultSchema),
  error: z.string().nullable(),
});

export type CaseResult = z.infer<typeof CaseResultSchema>;

// ============================================
// Suite Result
// ============================================

export const SuiteSummarySchema = z.object({
  total: z.number(),
  passed: z.number(),
  failed: z.number(),
  errors: z.number(),
  skipped: z.number(),
  passRate: z.number(),
});

export type SuiteSummary = z.infer<typeof SuiteSummarySchema>;

export const SuiteResultSchema = z.object({
  suiteName: z.string(),
  suiteVersion: z.string(),
  startedAt: z.string(),
  completedAt: z.string(),
  durationMs: z.number(),
  summary: SuiteSummarySchema,
  cases: z.array(CaseResultSchema),
});

export type SuiteResult = z.infer<typeof SuiteResultSchema>;

// ============================================
// Full Report (multiple suites)
// ============================================

export const ReportSummarySchema = z.object({
  totalSuites: z.number(),
  totalCases: z.number(),
  passed: z.number(),
  failed: z.number(),
  errors: z.number(),
  skipped: z.number(),
  passRate: z.number(),
});

export type ReportSummary = z.infer<typeof ReportSummarySchema>;

export const EvalReportSchema = z.object({
  generatedAt: z.string(),
  durationMs: z.number(),
  summary: ReportSummarySchema,
  suites: z.array(SuiteResultSchema),
});

export type EvalReport = z.infer<typeof EvalReportSchema>;
a/src/cli/agent-evals/utils/assertions.test.ts b/src/cli/agent-evals/utils/assertions.test.ts new file mode 100644 index 0000000..4e2f365 --- /dev/null +++ b/src/cli/agent-evals/utils/assertions.test.ts @@ -0,0 +1,443 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { TMP_ROOT } from "~tools/utils/fs"; +import { afterAll, beforeAll, describe, expect, it } from "vitest"; + +import type { Assertion } from "../types/schemas"; +import { evaluateAssertion } from "./assertions"; + +describe("evaluateAssertion", () => { + describe("contains", () => { + it("passes when output contains the value", async () => { + const assertion: Assertion = { + type: "contains", + value: "hello", + }; + const result = await evaluateAssertion(assertion, { + message: "hello world", + }); + expect(result.passed).toBe(true); + expect(result.message).toContain("contains"); + }); + + it("fails when output does not contain the value", async () => { + const assertion: Assertion = { + type: "contains", + value: "goodbye", + }; + const result = await evaluateAssertion(assertion, { + message: "hello world", + }); + expect(result.passed).toBe(false); + expect(result.message).toContain("does not contain"); + }); + + it("is case sensitive by default", async () => { + const assertion: Assertion = { + type: "contains", + value: "HELLO", + }; + const result = await evaluateAssertion(assertion, "hello world"); + expect(result.passed).toBe(false); + }); + + it("respects caseSensitive: false", async () => { + const assertion: Assertion = { + type: "contains", + value: "HELLO", + caseSensitive: false, + }; + const result = await evaluateAssertion(assertion, "hello world"); + expect(result.passed).toBe(true); + }); + + it("works with string output", async () => { + const assertion: Assertion = { + type: "contains", + value: "test", + }; + const result = await evaluateAssertion( + assertion, + "this is a test string" + ); + expect(result.passed).toBe(true); + }); + }); + + 
describe("matchesRegex", () => { + it("passes when output matches pattern", async () => { + const assertion: Assertion = { + type: "matchesRegex", + pattern: "\\d{3}-\\d{4}", + }; + const result = await evaluateAssertion(assertion, "Call 555-1234"); + expect(result.passed).toBe(true); + }); + + it("fails when output does not match pattern", async () => { + const assertion: Assertion = { + type: "matchesRegex", + pattern: "\\d{3}-\\d{4}", + }; + const result = await evaluateAssertion(assertion, "No number here"); + expect(result.passed).toBe(false); + }); + + it("supports regex flags", async () => { + const assertion: Assertion = { + type: "matchesRegex", + pattern: "hello", + flags: "i", + }; + const result = await evaluateAssertion(assertion, "HELLO WORLD"); + expect(result.passed).toBe(true); + }); + + it("handles invalid regex gracefully", async () => { + const assertion: Assertion = { + type: "matchesRegex", + pattern: "[invalid", + }; + const result = await evaluateAssertion(assertion, "test"); + expect(result.passed).toBe(false); + expect(result.message).toContain("Invalid regex"); + }); + }); + + describe("equals", () => { + it("passes for equal primitive values", async () => { + const assertion: Assertion = { + type: "equals", + expected: 42, + }; + const result = await evaluateAssertion(assertion, 42); + expect(result.passed).toBe(true); + }); + + it("fails for different primitive values", async () => { + const assertion: Assertion = { + type: "equals", + expected: 42, + }; + const result = await evaluateAssertion(assertion, 43); + expect(result.passed).toBe(false); + }); + + it("passes for equal objects", async () => { + const assertion: Assertion = { + type: "equals", + expected: { a: 1, b: 2 }, + }; + const result = await evaluateAssertion(assertion, { a: 1, b: 2 }); + expect(result.passed).toBe(true); + }); + + it("passes for objects with different key ordering", async () => { + const assertion: Assertion = { + type: "equals", + expected: { a: 1, b: 2 
}, + }; + const result = await evaluateAssertion(assertion, { b: 2, a: 1 }); + expect(result.passed).toBe(true); + }); + + it("fails for different objects", async () => { + const assertion: Assertion = { + type: "equals", + expected: { a: 1, b: 2 }, + }; + const result = await evaluateAssertion(assertion, { a: 1, b: 3 }); + expect(result.passed).toBe(false); + }); + + it("passes for equal strings", async () => { + const assertion: Assertion = { + type: "equals", + expected: "hello", + }; + const result = await evaluateAssertion(assertion, "hello"); + expect(result.passed).toBe(true); + }); + }); + + describe("jsonPath", () => { + it("extracts and compares nested values", async () => { + const assertion: Assertion = { + type: "jsonPath", + path: "response.status", + expected: "success", + }; + const result = await evaluateAssertion(assertion, { + response: { status: "success" }, + }); + expect(result.passed).toBe(true); + }); + + it("supports $. prefix in path", async () => { + const assertion: Assertion = { + type: "jsonPath", + path: "$.response.status", + expected: "success", + }; + const result = await evaluateAssertion(assertion, { + response: { status: "success" }, + }); + expect(result.passed).toBe(true); + }); + + it("fails when path value does not match", async () => { + const assertion: Assertion = { + type: "jsonPath", + path: "response.status", + expected: "success", + }; + const result = await evaluateAssertion(assertion, { + response: { status: "error" }, + }); + expect(result.passed).toBe(false); + }); + + it("fails for missing path", async () => { + const assertion: Assertion = { + type: "jsonPath", + path: "missing.path", + expected: "value", + }; + const result = await evaluateAssertion(assertion, { other: "data" }); + expect(result.passed).toBe(false); + expect(result.message).toContain("Failed to evaluate path"); + }); + + it("handles deeply nested paths", async () => { + const assertion: Assertion = { + type: "jsonPath", + path: "a.b.c.d", + 
expected: 123, + }; + const result = await evaluateAssertion(assertion, { + a: { b: { c: { d: 123 } } }, + }); + expect(result.passed).toBe(true); + }); + + it("compares arrays correctly", async () => { + const assertion: Assertion = { + type: "jsonPath", + path: "items", + expected: [1, 2, 3], + }; + const result = await evaluateAssertion(assertion, { items: [1, 2, 3] }); + expect(result.passed).toBe(true); + }); + }); + + describe("file assertions", () => { + const TEST_DIR = path.join(TMP_ROOT, "assertion-tests"); + const TEST_FILE = path.join(TEST_DIR, "test-file.txt"); + const TEST_JSON_FILE = path.join(TEST_DIR, "test-data.json"); + const TRAVERSAL_PATH = "../package.json"; + + beforeAll(async () => { + await fs.mkdir(TEST_DIR, { recursive: true }); + await fs.writeFile(TEST_FILE, "Hello World\nThis is test content."); + await fs.writeFile( + TEST_JSON_FILE, + JSON.stringify({ name: "test", value: 42, nested: { key: "value" } }) + ); + }); + + afterAll(async () => { + await fs.rm(TEST_DIR, { recursive: true, force: true }); + }); + + describe("fileExists", () => { + it("passes when file exists", async () => { + const assertion: Assertion = { + type: "fileExists", + path: "assertion-tests/test-file.txt", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + expect(result.message).toContain("exists"); + }); + + it("fails when file does not exist", async () => { + const assertion: Assertion = { + type: "fileExists", + path: "assertion-tests/nonexistent.txt", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("does not exist"); + }); + + it("rejects traversal paths", async () => { + const assertion: Assertion = { + type: "fileExists", + path: TRAVERSAL_PATH, + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.actual).toContain("Path traversal is not allowed"); + }); + 
}); + + describe("fileContains", () => { + it("passes when file contains the value", async () => { + const assertion: Assertion = { + type: "fileContains", + path: "assertion-tests/test-file.txt", + value: "Hello World", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + expect(result.message).toContain("contains"); + }); + + it("fails when file does not contain the value", async () => { + const assertion: Assertion = { + type: "fileContains", + path: "assertion-tests/test-file.txt", + value: "Goodbye", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("does not contain"); + }); + + it("is case sensitive by default", async () => { + const assertion: Assertion = { + type: "fileContains", + path: "assertion-tests/test-file.txt", + value: "HELLO WORLD", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + }); + + it("respects caseSensitive: false", async () => { + const assertion: Assertion = { + type: "fileContains", + path: "assertion-tests/test-file.txt", + value: "HELLO WORLD", + caseSensitive: false, + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + }); + + it("fails gracefully when file does not exist", async () => { + const assertion: Assertion = { + type: "fileContains", + path: "assertion-tests/nonexistent.txt", + value: "test", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("Failed to read file"); + }); + }); + + describe("fileJsonPath", () => { + it("extracts and compares JSON values", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/test-data.json", + jsonPath: "name", + expected: "test", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + 
}); + + it("supports $. prefix in jsonPath", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/test-data.json", + jsonPath: "$.value", + expected: 42, + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + }); + + it("handles nested paths", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/test-data.json", + jsonPath: "nested.key", + expected: "value", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + }); + + it("fails when value does not match", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/test-data.json", + jsonPath: "value", + expected: 100, + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + }); + + it("fails gracefully for missing path", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/test-data.json", + jsonPath: "missing.path", + expected: "value", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("Failed to evaluate"); + }); + + it("fails gracefully when file does not exist", async () => { + const assertion: Assertion = { + type: "fileJsonPath", + path: "assertion-tests/nonexistent.json", + jsonPath: "key", + expected: "value", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("Failed to evaluate"); + }); + }); + + describe("fileNotExists", () => { + it("passes when file does not exist", async () => { + const assertion: Assertion = { + type: "fileNotExists", + path: "assertion-tests/missing.txt", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(true); + expect(result.message).toContain("does not exist"); + }); + 
+ it("fails when file exists", async () => { + const assertion: Assertion = { + type: "fileNotExists", + path: "assertion-tests/test-file.txt", + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("still exists"); + }); + + it("rejects traversal paths", async () => { + const assertion: Assertion = { + type: "fileNotExists", + path: TRAVERSAL_PATH, + }; + const result = await evaluateAssertion(assertion, null); + expect(result.passed).toBe(false); + expect(result.message).toContain("Failed to check file"); + }); + }); + }); +}); diff --git a/src/cli/agent-evals/utils/assertions.ts b/src/cli/agent-evals/utils/assertions.ts new file mode 100644 index 0000000..4cce048 --- /dev/null +++ b/src/cli/agent-evals/utils/assertions.ts @@ -0,0 +1,174 @@ +import { isEqual } from "lodash-es"; + +import type { + Assertion, + AssertionResult, + ContainsAssertion, + EqualsAssertion, + JsonPathAssertion, + MatchesRegexAssertion, +} from "../types/schemas"; +import { + evaluateFileContainsAssertion, + evaluateFileExistsAssertion, + evaluateFileJsonPathAssertion, + evaluateFileNotExistsAssertion, +} from "./file-assertions"; + +/** + * Evaluate a single assertion against the agent output. + * File assertions are async (require filesystem access). 
/**
 * Evaluate a single assertion against the agent output.
 * File assertions are async (require filesystem access) and ignore `output`.
 *
 * @param assertion The assertion to evaluate, discriminated by `type`
 * @param output The raw agent output
 * @returns The assertion result (pass/fail plus a human-readable message)
 * @throws Error if `assertion.type` is not one of the supported kinds
 */
export const evaluateAssertion = async (
  assertion: Assertion,
  output: unknown
): Promise<AssertionResult> => {
  switch (assertion.type) {
    case "contains":
      return evaluateContainsAssertion(assertion, output);
    case "matchesRegex":
      return evaluateMatchesRegexAssertion(assertion, output);
    case "equals":
      return evaluateEqualsAssertion(assertion, output);
    case "jsonPath":
      return evaluateJsonPathAssertion(assertion, output);
    case "fileExists":
      return evaluateFileExistsAssertion(assertion);
    case "fileContains":
      return evaluateFileContainsAssertion(assertion);
    case "fileJsonPath":
      return evaluateFileJsonPathAssertion(assertion);
    case "fileNotExists":
      return evaluateFileNotExistsAssertion(assertion);
    default: {
      // Exhaustiveness check: adding a new assertion type without a case
      // above becomes a compile error here instead of a silent `undefined`.
      const unhandled: never = assertion;
      throw new Error(
        `Unsupported assertion type: ${JSON.stringify(unhandled)}`
      );
    }
  }
};

/**
 * Check that the stringified output contains `assertion.value`.
 * Matching is case-sensitive unless `caseSensitive` is explicitly `false`.
 */
const evaluateContainsAssertion = (
  assertion: ContainsAssertion,
  output: unknown
): AssertionResult => {
  const outputStr = stringifyOutput(output);
  const caseSensitive = assertion.caseSensitive ?? true;
  // Lower-case both sides for a case-insensitive comparison.
  const normalize = (text: string): string =>
    caseSensitive ? text : text.toLowerCase();
  const passed = normalize(outputStr).includes(normalize(assertion.value));

  return {
    assertion,
    passed,
    message: passed
      ? `Output contains "${assertion.value}"`
      : `Output does not contain "${assertion.value}"`,
    actual: outputStr,
    expected: assertion.value,
  };
};
/**
 * Convert output to string for text-based assertions.
 *
 * Strings are returned unchanged; everything else is pretty-printed as JSON
 * with two-space indentation.
 *
 * @param output Any agent output value
 * @returns A string in every case where serialization does not throw
 * @throws TypeError from `JSON.stringify` for BigInt values and circular
 *   structures
 */
const stringifyOutput = (output: unknown): string => {
  if (typeof output === "string") {
    return output;
  }
  // JSON.stringify yields `undefined` (not a string) for undefined, functions
  // and symbols; fall back to String() so callers can safely use `.includes`.
  const serialized: string | undefined = JSON.stringify(output, null, 2);
  return serialized ?? String(output);
};

/**
 * Simple JSON path getter supporting dot notation.
 * Supports paths like "response.status" or "$.response.status"; a bare "$"
 * returns the root value itself.
 *
 * Only own properties are read, so inherited members such as `toString` or
 * `constructor` resolve to `undefined` like any other missing key.
 *
 * @param obj Root value to navigate
 * @param path Dot-separated property path, optionally prefixed with "$."
 * @returns The value at `path`, or `undefined` when the last segment is missing
 * @throws Error when an intermediate segment is null, undefined or not an object
 */
const getJsonPath = (obj: unknown, path: string): unknown => {
  if (path === "$") {
    return obj;
  }
  const normalizedPath = path.startsWith("$.") ? path.slice(2) : path;

  let current: unknown = obj;
  for (const part of normalizedPath.split(".")) {
    if (current === null || current === undefined) {
      throw new Error(`Cannot read property "${part}" of ${String(current)}`);
    }
    if (typeof current !== "object") {
      throw new Error(`Cannot read property "${part}" of non-object`);
    }
    current = Object.prototype.hasOwnProperty.call(current, part)
      ? (current as Record<string, unknown>)[part]
      : undefined;
  }

  return current;
};
true; + const searchValue = caseSensitive + ? assertion.value + : assertion.value.toLowerCase(); + const searchIn = caseSensitive ? content : content.toLowerCase(); + const passed = searchIn.includes(searchValue); + + return { + assertion, + passed, + message: passed + ? `File contains "${assertion.value}"` + : `File does not contain "${assertion.value}"`, + actual: content.length > 500 ? `${content.slice(0, 500)}...` : content, + expected: assertion.value, + }; + } catch (err) { + return { + assertion, + passed: false, + message: `Failed to read file: ${err instanceof Error ? err.message : String(err)}`, + actual: "file read error", + expected: assertion.value, + }; + } +}; + +/** + * Evaluate a fileJsonPath assertion by reading a JSON file and checking a path. + */ +export const evaluateFileJsonPathAssertion = async ( + assertion: FileJsonPathAssertion +): Promise => { + try { + const fullPath = await resolveTmpPathForRead(assertion.path); + const content = await fs.readFile(fullPath, "utf8"); + const json = JSON.parse(content) as unknown; + const value = getJsonPath(json, assertion.jsonPath); + const passed = deepEquals(value, assertion.expected); + + return { + assertion, + passed, + message: passed + ? `Value at ${assertion.jsonPath} equals expected` + : `Value at ${assertion.jsonPath} does not equal expected`, + actual: value, + expected: assertion.expected, + }; + } catch (err) { + return { + assertion, + passed: false, + message: `Failed to evaluate: ${err instanceof Error ? err.message : String(err)}`, + actual: "evaluation error", + expected: assertion.expected, + }; + } +}; + +/** + * Evaluate a fileNotExists assertion by checking if the file does not exist in tmp/. 
+ */ +export const evaluateFileNotExistsAssertion = async ( + assertion: FileNotExistsAssertion +): Promise => { + try { + const fullPath = await resolveTmpPathForAccess(assertion.path); + await fs.access(fullPath); + return { + assertion, + passed: false, + message: `File still exists: ${assertion.path}`, + actual: "file exists", + expected: "file to not exist", + }; + } catch (err) { + if (isErrno(err, "ENOENT")) { + return { + assertion, + passed: true, + message: `File does not exist: ${assertion.path}`, + actual: "file not found", + expected: "file to not exist", + }; + } + return { + assertion, + passed: false, + message: `Failed to check file: ${err instanceof Error ? err.message : String(err)}`, + actual: err instanceof Error ? err.message : String(err), + expected: "file to not exist", + }; + } +}; + +const isErrno = ( + error: unknown, + code: string +): error is NodeJS.ErrnoException => + typeof error === "object" && + error !== null && + "code" in error && + (error as NodeJS.ErrnoException).code === code; + +/** + * Deep equality check using JSON serialization. + */ +const deepEquals = (a: unknown, b: unknown): boolean => { + return JSON.stringify(a) === JSON.stringify(b); +}; + +/** + * Simple JSON path getter supporting dot notation. + * Supports paths like "name" or "$.response.status" + */ +const getJsonPath = (obj: unknown, pathStr: string): unknown => { + const normalizedPath = pathStr.startsWith("$.") ? 
pathStr.slice(2) : pathStr; + const parts = normalizedPath.split("."); + + let current: unknown = obj; + for (const part of parts) { + if (current === null || current === undefined) { + throw new Error(`Cannot read property "${part}" of ${String(current)}`); + } + if (typeof current !== "object") { + throw new Error(`Cannot read property "${part}" of non-object`); + } + current = (current as Record)[part]; + } + + return current; +}; diff --git a/src/cli/etf-backtest/clients/etf-data-fetcher.ts b/src/cli/etf-backtest/clients/etf-data-fetcher.ts index 5bc3297..4548625 100644 --- a/src/cli/etf-backtest/clients/etf-data-fetcher.ts +++ b/src/cli/etf-backtest/clients/etf-data-fetcher.ts @@ -14,8 +14,8 @@ import { getEtfApiPattern, JUST_ETF_BASE_URL, } from "../constants"; -import type { EtfDataResponse } from "../schemas"; -import { EtfDataResponseSchema, isEtfDataResponse } from "../schemas"; +import type { EtfDataResponse } from "../types/schemas"; +import { EtfDataResponseSchema, isEtfDataResponse } from "../types/schemas"; export type EtfDataFetcherConfig = { logger: Logger; diff --git a/src/cli/etf-backtest/clients/learnings-manager.ts b/src/cli/etf-backtest/clients/learnings-manager.ts index cee91cd..6ed9373 100644 --- a/src/cli/etf-backtest/clients/learnings-manager.ts +++ b/src/cli/etf-backtest/clients/learnings-manager.ts @@ -8,8 +8,12 @@ import { LEARNINGS_FILENAME, MAX_HISTORY_ITEMS, } from "../constants"; -import type { ExperimentResult, IterationRecord, Learnings } from "../schemas"; -import { LearningsSchema } from "../schemas"; +import type { + ExperimentResult, + IterationRecord, + Learnings, +} from "../types/schemas"; +import { LearningsSchema } from "../types/schemas"; import { computeScore } from "../utils/scoring"; export type LearningsManagerConfig = { diff --git a/src/cli/etf-backtest/main.ts b/src/cli/etf-backtest/main.ts index e11aca4..43d0e5f 100644 --- a/src/cli/etf-backtest/main.ts +++ b/src/cli/etf-backtest/main.ts @@ -30,8 +30,8 @@ import { 
TARGET_R2_NON_OVERLAPPING, ZERO, } from "./constants"; -import { AgentOutputSchema, CliArgsSchema } from "./schemas"; -import type { ExperimentResult, Learnings } from "./schemas"; +import { AgentOutputSchema, CliArgsSchema } from "./types/schemas"; +import type { ExperimentResult, Learnings } from "./types/schemas"; import { extractLastExperimentResult } from "./utils/experiment-extract"; import { printFinalResults } from "./utils/final-report"; import { formatFixed, formatPercent } from "./utils/formatters"; diff --git a/src/cli/etf-backtest/schemas.ts b/src/cli/etf-backtest/types/schemas.ts similarity index 99% rename from src/cli/etf-backtest/schemas.ts rename to src/cli/etf-backtest/types/schemas.ts index 7859d09..51c746e 100644 --- a/src/cli/etf-backtest/schemas.ts +++ b/src/cli/etf-backtest/types/schemas.ts @@ -6,7 +6,7 @@ import { DEFAULT_REFRESH, DEFAULT_SEED, DEFAULT_VERBOSE, -} from "./constants"; +} from "../constants"; // ISIN validation: 2 letter country code + 10 alphanumeric characters const IsinSchema = z diff --git a/src/cli/etf-backtest/utils/experiment-extract.ts b/src/cli/etf-backtest/utils/experiment-extract.ts index 903abf2..93fc1ec 100644 --- a/src/cli/etf-backtest/utils/experiment-extract.ts +++ b/src/cli/etf-backtest/utils/experiment-extract.ts @@ -1,6 +1,6 @@ import { INDEX_NOT_FOUND, JSON_SLICE_END_OFFSET, ZERO } from "../constants"; -import { ExperimentResultSchema } from "../schemas"; -import type { ExperimentResult } from "../schemas"; +import { ExperimentResultSchema } from "../types/schemas"; +import type { ExperimentResult } from "../types/schemas"; const extractJsonFromStdout = (stdout: string): unknown => { const startIdx = stdout.indexOf("{"); diff --git a/src/cli/etf-backtest/utils/final-report.ts b/src/cli/etf-backtest/utils/final-report.ts index 9cc76f5..9351a43 100644 --- a/src/cli/etf-backtest/utils/final-report.ts +++ b/src/cli/etf-backtest/utils/final-report.ts @@ -7,7 +7,7 @@ import { LINE_SEPARATOR, 
PREDICTION_HORIZON_MONTHS, } from "../constants"; -import type { ExperimentResult } from "../schemas"; +import type { ExperimentResult } from "../types/schemas"; import { formatFixed, formatPercent } from "./formatters"; export const printFinalResults = ( diff --git a/src/cli/etf-backtest/utils/learnings-formatter.ts b/src/cli/etf-backtest/utils/learnings-formatter.ts index a800fd1..27fd9b2 100644 --- a/src/cli/etf-backtest/utils/learnings-formatter.ts +++ b/src/cli/etf-backtest/utils/learnings-formatter.ts @@ -1,5 +1,5 @@ import { DECIMAL_PLACES, LEARNINGS_SUMMARY_TOP_N } from "../constants"; -import type { Learnings } from "../schemas"; +import type { Learnings } from "../types/schemas"; import { formatFixed, formatPercent } from "./formatters"; const FEATURE_PREVIEW_COUNT = 4; diff --git a/src/cli/etf-backtest/utils/scoring.ts b/src/cli/etf-backtest/utils/scoring.ts index 4dbd25c..4ad237f 100644 --- a/src/cli/etf-backtest/utils/scoring.ts +++ b/src/cli/etf-backtest/utils/scoring.ts @@ -4,7 +4,7 @@ import { SCORE_WEIGHTS, ZERO, } from "../constants"; -import type { ExperimentResult } from "../schemas"; +import type { ExperimentResult } from "../types/schemas"; export const computeScore = (metrics: ExperimentResult["metrics"]): number => { // Primary: prediction accuracy on non-overlapping samples (honest assessment) diff --git a/src/cli/guestbook/main.ts b/src/cli/guestbook/main.ts index a4013e6..5597d1d 100644 --- a/src/cli/guestbook/main.ts +++ b/src/cli/guestbook/main.ts @@ -6,18 +6,14 @@ import { AgentRunner } from "~clients/agent-runner"; import { Logger } from "~clients/logger"; import { createReadFileTool } from "~tools/read-file/read-file-tool"; import { createWriteFileTool } from "~tools/write-file/write-file-tool"; -import { z } from "zod"; import { question } from "zx"; +import { OutputSchema } from "./types/schemas"; + const logger = new Logger(); logger.info("Guestbook running..."); -const OutputSchema = z.object({ - success: z.boolean(), - message: 
z.string(), -}); - const agentRunner = new AgentRunner({ name: "GuestbookAgent", model: "gpt-5-mini", diff --git a/src/cli/guestbook/types/schemas.ts b/src/cli/guestbook/types/schemas.ts new file mode 100644 index 0000000..a6a633f --- /dev/null +++ b/src/cli/guestbook/types/schemas.ts @@ -0,0 +1,8 @@ +import { z } from "zod"; + +export const OutputSchema = z.object({ + success: z.boolean(), + message: z.string(), +}); + +export type Output = z.infer; diff --git a/src/cli/name-explorer/main.ts b/src/cli/name-explorer/main.ts index 39f8247..3827248 100644 --- a/src/cli/name-explorer/main.ts +++ b/src/cli/name-explorer/main.ts @@ -8,7 +8,6 @@ import { AgentRunner } from "~clients/agent-runner"; import { Logger } from "~clients/logger"; import { parseArgs } from "~utils/parse-args"; import { QuestionHandler } from "~utils/question-handler"; -import { z } from "zod"; import { NameSuggesterPipeline } from "./clients/pipeline"; import { StatsGenerator } from "./clients/stats-generator"; @@ -22,16 +21,14 @@ import { NameSuggesterOutputSchema, NameSuggesterOutputTypeSchema, } from "./types"; +import { CliArgsSchema } from "./types/schemas"; const logger = new Logger(); // --- Parse CLI arguments --- const { refetch: shouldRefetch, mode } = parseArgs({ logger, - schema: z.object({ - refetch: z.coerce.boolean().default(false), - mode: z.enum(["stats", "ai"]).default("ai"), - }), + schema: CliArgsSchema, }); // --- Initialize pipeline and database --- diff --git a/src/cli/name-explorer/types/schemas.ts b/src/cli/name-explorer/types/schemas.ts new file mode 100644 index 0000000..0ffd2e1 --- /dev/null +++ b/src/cli/name-explorer/types/schemas.ts @@ -0,0 +1,8 @@ +import { z } from "zod"; + +export const CliArgsSchema = z.object({ + refetch: z.coerce.boolean().default(false), + mode: z.enum(["stats", "ai"]).default("ai"), +}); + +export type CliArgs = z.infer; diff --git a/src/cli/scrape-publications/main.ts b/src/cli/scrape-publications/main.ts index a1630e0..aacb520 100644 --- 
a/src/cli/scrape-publications/main.ts +++ b/src/cli/scrape-publications/main.ts @@ -8,10 +8,10 @@ import path from "node:path"; import { Logger } from "~clients/logger"; import { parseArgs } from "~utils/parse-args"; import slug from "slug"; -import { z } from "zod"; import { question } from "zx"; import { PublicationPipeline } from "./clients/publication-pipeline"; +import { CliArgsSchema } from "./types/schemas"; const logger = new Logger({ level: "info", useColors: true }); @@ -24,11 +24,7 @@ const { filterUrl, } = parseArgs({ logger, - schema: z.object({ - url: z.url(), - refetch: z.coerce.boolean().default(false), - filterUrl: z.string().optional(), - }), + schema: CliArgsSchema, }); // 2. Create slugified directory path diff --git a/src/cli/scrape-publications/types/schemas.ts b/src/cli/scrape-publications/types/schemas.ts new file mode 100644 index 0000000..0797752 --- /dev/null +++ b/src/cli/scrape-publications/types/schemas.ts @@ -0,0 +1,9 @@ +import { z } from "zod"; + +export const CliArgsSchema = z.object({ + url: z.url(), + refetch: z.coerce.boolean().default(false), + filterUrl: z.string().optional(), +}); + +export type CliArgs = z.infer; diff --git a/src/clients/agent-runner.ts b/src/clients/agent-runner.ts index ff8350c..46b0553 100644 --- a/src/clients/agent-runner.ts +++ b/src/clients/agent-runner.ts @@ -11,7 +11,8 @@ export type AgentRunnerConfig = { name: string; model: "gpt-5-mini"; tools: Tool[]; - outputType: ZodType; + /** Zod schema for structured output. Omit for plain text responses. */ + outputType?: ZodType; instructions: string; // Logging config @@ -65,7 +66,7 @@ export class AgentRunner { name: config.name, model: config.model, tools: config.tools, - outputType: config.outputType, + ...(config.outputType ? 
{ outputType: config.outputType } : {}), instructions: config.instructions, }); diff --git a/src/tools/delete-file/delete-file-tool.test.ts b/src/tools/delete-file/delete-file-tool.test.ts new file mode 100644 index 0000000..80a7bb9 --- /dev/null +++ b/src/tools/delete-file/delete-file-tool.test.ts @@ -0,0 +1,92 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { TMP_ROOT } from "~tools/utils/fs"; +import { invokeTool, tryCreateSymlink } from "~tools/utils/test-utils"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +import { createDeleteFileTool } from "./delete-file-tool"; + +describe("createDeleteFileTool tmp path safety", () => { + let testDir = ""; + let relativeDir = ""; + // eslint-disable-next-line @typescript-eslint/no-empty-function + const mockLogger = { tool: () => {} } as never; + + beforeEach(async () => { + await fs.mkdir(TMP_ROOT, { recursive: true }); + testDir = await fs.mkdtemp(path.join(TMP_ROOT, "vitest-tools-")); + relativeDir = path.relative(TMP_ROOT, testDir); + }); + + afterEach(async () => { + if (testDir) { + await fs.rm(testDir, { recursive: true, force: true }); + } + testDir = ""; + relativeDir = ""; + }); + + it("deletes relative paths under tmp", async () => { + const relativePath = path.join(relativeDir, "to-delete.txt"); + const absolutePath = path.join(TMP_ROOT, relativePath); + await fs.writeFile(absolutePath, "delete me", "utf8"); + + const deleteFileTool = createDeleteFileTool({ logger: mockLogger }); + const result = await invokeTool(deleteFileTool, { + path: relativePath, + }); + + expect(result).toContain("Deleted"); + await expect(fs.access(absolutePath)).rejects.toThrow(); + }); + + it("deletes absolute paths under tmp", async () => { + const absolutePath = path.join(testDir, "absolute-delete.txt"); + await fs.writeFile(absolutePath, "delete me", "utf8"); + + const deleteFileTool = createDeleteFileTool({ logger: mockLogger }); + const result = await 
invokeTool(deleteFileTool, { + path: absolutePath, + }); + + expect(result).toContain("Deleted"); + await expect(fs.access(absolutePath)).rejects.toThrow(); + }); + + it("rejects path traversal attempts", async () => { + const deleteFileTool = createDeleteFileTool({ logger: mockLogger }); + const result = await invokeTool(deleteFileTool, { + path: "../outside.txt", + }); + expect(result).toContain("Path traversal is not allowed."); + }); + + it("rejects symlink paths", async () => { + const realDir = path.join(testDir, "real"); + await fs.mkdir(realDir, { recursive: true }); + const realFile = path.join(realDir, "file.txt"); + await fs.writeFile(realFile, "real content", "utf8"); + const linkDir = path.join(testDir, "link"); + + const symlinkCreated = await tryCreateSymlink(realDir, linkDir); + if (!symlinkCreated) { + return; + } + + const symlinkPath = path.join(relativeDir, "link", "file.txt"); + + const deleteFileTool = createDeleteFileTool({ logger: mockLogger }); + const result = await invokeTool(deleteFileTool, { + path: symlinkPath, + }); + expect(result).toContain("Symlink paths are not allowed."); + }); + + it("returns error for non-existent files", async () => { + const deleteFileTool = createDeleteFileTool({ logger: mockLogger }); + const result = await invokeTool(deleteFileTool, { + path: path.join(relativeDir, "nonexistent.txt"), + }); + expect(result).toContain("Path does not exist."); + }); +}); diff --git a/src/tools/delete-file/delete-file-tool.ts b/src/tools/delete-file/delete-file-tool.ts new file mode 100644 index 0000000..7c10156 --- /dev/null +++ b/src/tools/delete-file/delete-file-tool.ts @@ -0,0 +1,36 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { tool } from "@openai/agents"; +import type { Logger } from "~clients/logger"; +import { resolveTmpPathForDelete, TMP_ROOT } from "~tools/utils/fs"; + +export type DeleteFileToolOptions = { + logger: Logger; +}; + +export const createDeleteFileTool = ({ logger }: 
DeleteFileToolOptions) => + tool({ + name: "deleteFile", + description: + "Deletes a file under the repo tmp directory (path is relative to tmp).", + parameters: { + type: "object", + properties: { + path: { + type: "string", + description: "Relative path within the repo tmp directory", + }, + }, + required: ["path"], + additionalProperties: false, + }, + execute: async ({ path: filePath }: { path: string }) => { + logger.tool("Deleting file", { path: filePath }); + const targetPath = await resolveTmpPathForDelete(filePath); + await fs.unlink(targetPath); + const relativePath = path.relative(TMP_ROOT, targetPath); + const displayPath = path.join("tmp", relativePath); + logger.tool("Deleted file", { path: displayPath }); + return `Deleted ${displayPath}`; + }, + }); diff --git a/src/tools/list-files/list-files-tool.test.ts b/src/tools/list-files/list-files-tool.test.ts index 106a32d..15b65b8 100644 --- a/src/tools/list-files/list-files-tool.test.ts +++ b/src/tools/list-files/list-files-tool.test.ts @@ -52,9 +52,9 @@ describe("createListFilesTool tmp path safety", () => { expect(result).toContain("[file] absolute.txt"); }); - it("lists root of tmp when no path provided", async () => { + it("lists root of tmp when empty path provided", async () => { const listFilesTool = createListFilesTool({ logger: mockLogger }); - const result = await invokeTool(listFilesTool, {}); + const result = await invokeTool(listFilesTool, { path: "" }); expect(result).toContain("Contents of tmp:"); expect(result).toContain(path.basename(testDir)); diff --git a/src/tools/list-files/list-files-tool.ts b/src/tools/list-files/list-files-tool.ts index e11bddf..28ec137 100644 --- a/src/tools/list-files/list-files-tool.ts +++ b/src/tools/list-files/list-files-tool.ts @@ -12,22 +12,22 @@ export const createListFilesTool = ({ logger }: ListFilesToolOptions) => tool({ name: "listFiles", description: - "Lists files and directories under the repo tmp directory (path is relative to tmp). 
If no path provided, lists root of tmp.", + "Lists files and directories under the repo tmp directory (path is relative to tmp). Use an empty path to list the tmp root.", parameters: { type: "object", properties: { path: { type: "string", - description: - "Relative path within the repo tmp directory (optional, defaults to tmp root)", + description: "Relative path within the repo tmp directory.", }, }, - required: [], + required: ["path"], additionalProperties: false, }, - execute: async ({ path: dirPath }: { path?: string }) => { - logger.tool("Listing files", { path: dirPath ?? "tmp root" }); - const targetPath = await resolveTmpPathForList(dirPath); + execute: async ({ path: dirPath }: { path: string }) => { + const effectivePath = dirPath || undefined; + logger.tool("Listing files", { path: effectivePath ?? "tmp root" }); + const targetPath = await resolveTmpPathForList(effectivePath); const entries = await fs.readdir(targetPath, { withFileTypes: true }); const lines = entries.map((entry) => { diff --git a/src/tools/read-file/read-file-tool.ts b/src/tools/read-file/read-file-tool.ts index f122d48..885b6b3 100644 --- a/src/tools/read-file/read-file-tool.ts +++ b/src/tools/read-file/read-file-tool.ts @@ -1,7 +1,8 @@ import fs from "node:fs/promises"; +import path from "node:path"; import { tool } from "@openai/agents"; import type { Logger } from "~clients/logger"; -import { resolveTmpPathForRead } from "~tools/utils/fs"; +import { resolveTmpPathForRead, TMP_ROOT } from "~tools/utils/fs"; export type ReadFileToolOptions = { logger: Logger; @@ -26,7 +27,9 @@ export const createReadFileTool = ({ logger }: ReadFileToolOptions) => execute: async ({ path: filePath }: { path: string }) => { logger.tool("Reading file", { path: filePath }); const targetPath = await resolveTmpPathForRead(filePath); - logger.tool("Read file result", { targetPath }); + const relativePath = path.relative(TMP_ROOT, targetPath); + const displayPath = path.join("tmp", relativePath); + 
logger.tool("Read file result", { targetPath: displayPath }); return fs.readFile(targetPath, "utf8"); }, }); diff --git a/src/tools/utils/fs.ts b/src/tools/utils/fs.ts index c023437..8755033 100644 --- a/src/tools/utils/fs.ts +++ b/src/tools/utils/fs.ts @@ -149,6 +149,66 @@ export const resolveTmpPathForRead = async (userPath: string) => { return candidatePath; }; +export const resolveTmpPathForAccess = async (userPath: string) => { + const trimmed = userPath.trim(); + if (!trimmed) { + throw new Error("Path cannot be empty."); + } + if (PATH_TRAVERSAL.test(trimmed)) { + throw new Error("Path traversal is not allowed."); + } + + await ensureTmpRoot({ create: false }); + const candidatePath = resolveCandidatePath(trimmed); + + await assertNoSymlinkComponents(TMP_ROOT, candidatePath, { + allowMissing: true, + }); + + const tmpRootReal = await fs.realpath(TMP_ROOT); + const parentDir = path.dirname(candidatePath); + try { + const parentReal = await fs.realpath(parentDir); + if (!isPathInside(tmpRootReal, parentReal)) { + throw new Error("Resolved path escapes tmp directory."); + } + } catch (error) { + if (!isErrno(error, "ENOENT")) { + throw error; + } + } + + return candidatePath; +}; + +export const resolveTmpPathForDelete = async (userPath: string) => { + const trimmed = userPath.trim(); + if (!trimmed) { + throw new Error("Path cannot be empty."); + } + if (PATH_TRAVERSAL.test(trimmed)) { + throw new Error("Path traversal is not allowed."); + } + + await ensureTmpRoot({ create: false }); + const candidatePath = resolveCandidatePath(trimmed); + + await assertNoSymlinkComponents(TMP_ROOT, candidatePath); + + const tmpRootReal = await fs.realpath(TMP_ROOT); + const parentReal = await fs.realpath(path.dirname(candidatePath)); + if (!isPathInside(tmpRootReal, parentReal)) { + throw new Error("Resolved path escapes tmp directory."); + } + + const fileStat = await fs.lstat(candidatePath); + if (!fileStat.isFile()) { + throw new Error("Path must point to a file."); + } + 
+ return candidatePath; +}; + export const resolveTmpPathForList = async (userPath?: string) => { const trimmed = (userPath ?? "").trim(); diff --git a/src/utils/parse-args.test.ts b/src/utils/parse-args.test.ts new file mode 100644 index 0000000..b92137f --- /dev/null +++ b/src/utils/parse-args.test.ts @@ -0,0 +1,104 @@ +import { Logger } from "~clients/logger"; +import { parseArgs } from "~utils/parse-args"; +import { describe, expect, it } from "vitest"; + +import { CliArgsSchema } from "../cli/agent-evals/types/schemas"; + +describe("parseArgs", () => { + const logger = new Logger({ + level: "error", + useColors: false, + useTimestamps: false, + }); + + it("parses args after a standalone double-dash separator", () => { + const args = parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--", "--suite=example"], + }); + + expect(args.suite).toBe("example"); + expect(args.all).toBe(false); + expect(args.report).toBe("json"); + expect(args.out).toBe("agent-evals"); + expect(args.verbose).toBe(false); + }); + + it("parses --all even when preceded by a double-dash separator", () => { + const args = parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--", "--all"], + }); + + expect(args.suite).toBeUndefined(); + expect(args.all).toBe(true); + expect(args.report).toBe("json"); + expect(args.out).toBe("agent-evals"); + expect(args.verbose).toBe(false); + }); + + it("parses --report with valid enum values", () => { + const argsJson = parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--all", "--report=json"], + }); + expect(argsJson.report).toBe("json"); + + const argsMd = parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--all", "--report=md"], + }); + expect(argsMd.report).toBe("md"); + + const argsBoth = parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--all", "--report=both"], + }); + expect(argsBoth.report).toBe("both"); + }); + + it("parses --out with custom path", () => { + const args = parseArgs({ + logger, + schema: 
CliArgsSchema, + rawArgs: ["--all", "--out=custom/output/path"], + }); + + expect(args.out).toBe("custom/output/path"); + }); + + it("parses --verbose flag", () => { + const args = parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--all", "--verbose"], + }); + + expect(args.verbose).toBe(true); + }); + + it("throws on invalid --report value", () => { + expect(() => + parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: ["--all", "--report=invalid"], + }) + ).toThrow(); + }); + + it("throws when neither --suite nor --all is provided", () => { + expect(() => + parseArgs({ + logger, + schema: CliArgsSchema, + rawArgs: [], + }) + ).toThrow("Either --suite or --all is required"); + }); +}); diff --git a/src/utils/parse-args.ts b/src/utils/parse-args.ts index f97c2c6..91f3855 100644 --- a/src/utils/parse-args.ts +++ b/src/utils/parse-args.ts @@ -1,12 +1,17 @@ import type { Logger } from "~clients/logger"; import type { z } from "zod"; -import { argv } from "zx"; +import { parseArgv } from "zx"; export type ParseArgsOptions = { logger: Logger; schema: T; + rawArgs?: string[]; }; +// Strip standalone "--" so parseArgv doesn't treat it as a literal arg after end-of-options. +const sanitizeArgs = (rawArgs: string[]): string[] => + rawArgs.filter((arg) => arg !== "--"); + /** * Parses and validates CLI arguments using a Zod schema. * @param options - Logger and Zod schema for validation @@ -16,9 +21,11 @@ export type ParseArgsOptions = { export const parseArgs = ({ logger, schema, + rawArgs, }: ParseArgsOptions): z.infer => { logger.debug("Parsing CLI arguments..."); - const args = schema.parse(argv); + const parsedArgs = parseArgv(sanitizeArgs(rawArgs ?? process.argv.slice(2))); + const args = schema.parse(parsedArgs); logger.debug("Parsed args", { args }); return args; };