From 7ac2411a67ddfbd3d94485f7b77a889c526ac4ec Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 04:33:45 +0000 Subject: [PATCH 1/7] feat(ci): add marketplace, frontmatter, and eval validation to GitHub Actions - Add composite setup-bun action with caching (mirrors opencode pattern) - Add marketplace job: schema validation, sort check, sync check, frontmatter validation - Add evals job: installs agentv globally, validates eval dirs have eval files, runs agentv validate with glob patterns - Add glob pattern support to `agentv validate` command (uses fast-glob, already a dependency) - Sort marketplace.json plugins alphabetically and sync .github/plugin/ copy Co-Authored-By: Claude Opus 4.6 --- .claude-plugin/marketplace.json | 12 +- .github/actions/setup-bun/action.yml | 41 ++++ .github/plugin/marketplace.json | 12 +- .github/workflows/validate.yml | 35 +++ .../src/commands/validate/validate-files.ts | 49 ++-- scripts/marketplace/check-sorted.ts | 41 ++++ scripts/marketplace/sync.ts | 17 ++ scripts/marketplace/validate-frontmatter.ts | 219 ++++++++++++++++++ scripts/marketplace/validate-marketplace.ts | 94 ++++++++ scripts/validate-eval-dirs.ts | 74 ++++++ 10 files changed, 566 insertions(+), 28 deletions(-) create mode 100644 .github/actions/setup-bun/action.yml create mode 100644 scripts/marketplace/check-sorted.ts create mode 100644 scripts/marketplace/sync.ts create mode 100644 scripts/marketplace/validate-frontmatter.ts create mode 100644 scripts/marketplace/validate-marketplace.ts create mode 100644 scripts/validate-eval-dirs.ts diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 1d2e158cb..8283ecf40 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -8,9 +8,9 @@ }, "plugins": [ { - "name": "agentv-dev", - "description": "Development skills for building and optimizing AgentV evaluations", - "source": "./plugins/agentv-dev" + "name": "agentic-engineering", + "description": "Design and review AI agent systems — architecture patterns, workflow design, and plugin quality review", + "source": "./plugins/agentic-engineering" }, { "name": "agentv-claude-trace", @@ -18,9 +18,9 @@ "source": "./plugins/agentv-claude-trace" }, { - "name": "agentic-engineering", - "description": "Design and review AI agent systems — architecture patterns, workflow design, and plugin quality review", - "source": "./plugins/agentic-engineering" + "name": "agentv-dev", + "description": "Development skills for building and optimizing AgentV evaluations", + "source": "./plugins/agentv-dev" } ] } diff --git a/.github/actions/setup-bun/action.yml b/.github/actions/setup-bun/action.yml new file mode 100644 index 000000000..51efd282a --- /dev/null +++ b/.github/actions/setup-bun/action.yml @@ -0,0 +1,41 @@ +name: "Setup Bun" +description: "Setup Bun with caching and install dependencies" +runs: + using: "composite" + steps: + - name: Get baseline download URL + id: bun-url + shell: bash + run: | + if [ "$RUNNER_ARCH" = "X64" ]; then + V=$(node -p "require('./package.json').packageManager.split('@')[1]") + case "$RUNNER_OS" in + macOS) OS=darwin ;; + Linux) OS=linux ;; + Windows) OS=windows ;; + esac + echo "url=https://github.com/oven-sh/bun/releases/download/bun-v${V}/bun-${OS}-x64-baseline.zip" >> "$GITHUB_OUTPUT" + fi + + - name: Setup Bun + uses: oven-sh/setup-bun@v2 + with: + bun-version-file: ${{ !steps.bun-url.outputs.url && 'package.json' || '' }} + bun-download-url: ${{ steps.bun-url.outputs.url }} + + - name: Get cache directory + id: cache + shell: bash + run: echo "dir=$(bun pm cache)" >> "$GITHUB_OUTPUT" + + - name: Cache Bun dependencies + uses: actions/cache@v4 + with: + path: ${{ steps.cache.outputs.dir }} + key: ${{ runner.os }}-bun-${{ hashFiles('**/bun.lock') }} + restore-keys: | + ${{ runner.os }}-bun- + + - name: Install dependencies + run: bun install --frozen-lockfile + shell: bash diff --git a/.github/plugin/marketplace.json b/.github/plugin/marketplace.json index 1d2e158cb..8283ecf40 100644 --- a/.github/plugin/marketplace.json +++ b/.github/plugin/marketplace.json @@ -8,9 +8,9 @@ }, "plugins": [ { - "name": "agentv-dev", - "description": "Development skills for building and optimizing AgentV evaluations", - "source": "./plugins/agentv-dev" + "name": "agentic-engineering", + "description": "Design and review AI agent systems — architecture patterns, workflow design, and plugin quality review", + "source": "./plugins/agentic-engineering" }, { "name": "agentv-claude-trace", @@ -18,9 +18,9 @@ "source": "./plugins/agentv-claude-trace" }, { - "name": "agentic-engineering", - "description": "Design and review AI agent systems — architecture patterns, workflow design, and plugin quality review", - "source": "./plugins/agentic-engineering" + "name": "agentv-dev", + "description": "Development skills for building and optimizing AgentV evaluations", + "source": "./plugins/agentv-dev" } ] } diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 815846321..15036576a 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -21,3 +21,38 @@ jobs: --glob-ignore-case --root-dir . "**/*.md" + + marketplace: + name: Validate Marketplace + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/setup-bun + + - name: Validate marketplace.json (schema + sync) + run: bun scripts/marketplace/validate-marketplace.ts + + - name: Check marketplace sorted + run: bun scripts/marketplace/check-sorted.ts + + - name: Validate frontmatter + run: bun scripts/marketplace/validate-frontmatter.ts + + evals: + name: Validate Evals + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/setup-bun + + - name: Build + run: bun run build + + - name: Install agentv globally + run: bun install -g agentv + + - name: Check evals directories have eval files + run: bun scripts/validate-eval-dirs.ts + + - name: Validate eval schemas + run: agentv validate 'examples/features/**/evals/**/*.eval.yaml' 'examples/features/**/*.EVAL.yaml' diff --git a/apps/cli/src/commands/validate/validate-files.ts b/apps/cli/src/commands/validate/validate-files.ts index 91b569e34..b00c27b47 100644 --- a/apps/cli/src/commands/validate/validate-files.ts +++ b/apps/cli/src/commands/validate/validate-files.ts @@ -10,6 +10,7 @@ import { validateFileReferences, validateTargetsFile, } from '@agentv/core/evaluation/validation'; +import fg from 'fast-glob'; /** * Validate YAML files for AgentV schema compliance. @@ -67,34 +68,50 @@ async function validateSingleFile(filePath: string): Promise { } async function expandPaths(paths: readonly string[]): Promise { - const expanded: string[] = []; + const expanded = new Set(); for (const inputPath of paths) { const absolutePath = path.resolve(inputPath); - // Check if path exists + // Try as literal file or directory first try { await access(absolutePath, constants.F_OK); + const stats = await stat(absolutePath); + + if (stats.isFile()) { + if (isYamlFile(absolutePath)) expanded.add(absolutePath); + continue; + } + if (stats.isDirectory()) { + const yamlFiles = await findYamlFiles(absolutePath); + for (const f of yamlFiles) expanded.add(f); + continue; + } } catch { - console.warn(`Warning: Path not found: ${inputPath}`); - continue; + // Not a literal path — fall through to glob matching } - const stats = await stat(absolutePath); - - if (stats.isFile()) { - // Only include YAML files - if (isYamlFile(absolutePath)) { - expanded.push(absolutePath); - } - } else if (stats.isDirectory()) { - // Recursively find all YAML files in directory - const yamlFiles = await findYamlFiles(absolutePath); - expanded.push(...yamlFiles); + // Treat as glob pattern + const globPattern = inputPath.includes('\\') ? inputPath.replace(/\\/g, '/') : inputPath; + const matches = await fg(globPattern, { + cwd: process.cwd(), + absolute: true, + onlyFiles: true, + unique: true, + dot: false, + followSymbolicLinks: true, + }); + + const yamlMatches = matches.filter((f) => isYamlFile(f)); + if (yamlMatches.length === 0) { + console.warn(`Warning: No YAML files matched pattern: ${inputPath}`); } + for (const f of yamlMatches) expanded.add(path.normalize(f)); } - return expanded; + const sorted = Array.from(expanded); + sorted.sort(); + return sorted; } async function findYamlFiles(dirPath: string): Promise { diff --git a/scripts/marketplace/check-sorted.ts b/scripts/marketplace/check-sorted.ts new file mode 100644 index 000000000..8192911a8 --- /dev/null +++ b/scripts/marketplace/check-sorted.ts @@ -0,0 +1,41 @@ +#!/usr/bin/env bun +/** + * Checks that marketplace.json plugins are alphabetically sorted by name. + * + * Usage: + * bun scripts/marketplace/check-sorted.ts # check, exit 1 if unsorted + * bun scripts/marketplace/check-sorted.ts --fix # sort in place + */ + +import { readFileSync, writeFileSync } from 'node:fs'; +import { resolve } from 'node:path'; + +const root = resolve(import.meta.dirname, '../..'); +const MARKETPLACE = resolve(root, '.claude-plugin/marketplace.json'); + +type Plugin = { name: string; [k: string]: unknown }; +type Marketplace = { plugins: Plugin[]; [k: string]: unknown }; + +const raw = readFileSync(MARKETPLACE, 'utf8'); +const mp: Marketplace = JSON.parse(raw); + +const cmp = (a: Plugin, b: Plugin) => a.name.toLowerCase().localeCompare(b.name.toLowerCase()); + +if (process.argv.includes('--fix')) { + mp.plugins.sort(cmp); + writeFileSync(MARKETPLACE, `${JSON.stringify(mp, null, 2)}\n`); + console.log(`Sorted ${mp.plugins.length} plugins`); + process.exit(0); +} + +for (let i = 1; i < mp.plugins.length; i++) { + if (cmp(mp.plugins[i - 1], mp.plugins[i]) > 0) { + console.error( + `marketplace.json plugins are not sorted: '${mp.plugins[i - 1].name}' should come after '${mp.plugins[i].name}' (index ${i})`, + ); + console.error(' run: bun scripts/marketplace/check-sorted.ts --fix'); + process.exit(1); + } +} + +console.log(`OK: ${mp.plugins.length} plugins sorted`); diff --git a/scripts/marketplace/sync.ts b/scripts/marketplace/sync.ts new file mode 100644 index 000000000..68d621290 --- /dev/null +++ b/scripts/marketplace/sync.ts @@ -0,0 +1,17 @@ +#!/usr/bin/env bun +/** + * Syncs marketplace.json from .claude-plugin/ to .github/plugin/. + * + * Usage: + * bun scripts/marketplace/sync.ts + */ + +import { cp } from 'node:fs/promises'; +import { resolve } from 'node:path'; + +const root = resolve(import.meta.dirname, '../..'); +const src = resolve(root, '.claude-plugin/marketplace.json'); +const dest = resolve(root, '.github/plugin/marketplace.json'); + +await cp(src, dest); +console.log('Synced marketplace.json → .github/plugin/marketplace.json'); diff --git a/scripts/marketplace/validate-frontmatter.ts b/scripts/marketplace/validate-frontmatter.ts new file mode 100644 index 000000000..99c205e9a --- /dev/null +++ b/scripts/marketplace/validate-frontmatter.ts @@ -0,0 +1,219 @@ +#!/usr/bin/env bun +/** + * Validates YAML frontmatter in agent, skill, and command .md files. + * + * Usage: + * bun scripts/marketplace/validate-frontmatter.ts # scan plugins/ + * bun scripts/marketplace/validate-frontmatter.ts /path/to/dir # scan specific directory + * bun scripts/marketplace/validate-frontmatter.ts file1.md file2.md + */ + +import { readFile, readdir } from 'node:fs/promises'; +import { basename, join, relative, resolve } from 'node:path'; +import { parse as parseYaml } from 'yaml'; + +const YAML_SPECIAL_CHARS = /[{}[\]*&#!|>%@`]/; +const BOM = /^\uFEFF/; +const FRONTMATTER_REGEX = /^---\s*\n([\s\S]*?)---\s*\n?/; + +function quoteSpecialValues(text: string): string { + const lines = text.split('\n'); + const result: string[] = []; + + for (const line of lines) { + const match = line.match(/^([a-zA-Z_-]+):\s+(.+)$/); + if (match) { + const [, key, value] = match; + if (!key || !value) { + result.push(line); + continue; + } + if ( + (value.startsWith('"') && value.endsWith('"')) || + (value.startsWith("'") && value.endsWith("'")) + ) { + result.push(line); + continue; + } + // Skip YAML block scalar indicators (>, >-, |, |-, etc.) + if (/^[>|][+-]?$/.test(value.trim())) { + result.push(line); + continue; + } + if (YAML_SPECIAL_CHARS.test(value)) { + const escaped = value.replace(/\\/g, '\\\\').replace(/"/g, '\\"'); + result.push(`${key}: "${escaped}"`); + continue; + } + } + result.push(line); + } + + return result.join('\n'); +} + +interface ParseResult { + frontmatter: Record; + content: string; + error?: string; +} + +function parseFrontmatter(markdown: string): ParseResult { + const match = markdown.replace(BOM, '').match(FRONTMATTER_REGEX); + + if (!match) { + return { frontmatter: {}, content: markdown, error: 'No frontmatter found' }; + } + + const frontmatterText = quoteSpecialValues(match[1] || ''); + const content = markdown.slice(match[0].length); + + try { + const parsed = parseYaml(frontmatterText); + if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) { + return { frontmatter: parsed as Record, content }; + } + return { + frontmatter: {}, + content, + error: `YAML parsed but result is not an object (got ${typeof parsed}${Array.isArray(parsed) ? ' array' : ''})`, + }; + } catch (err) { + return { + frontmatter: {}, + content, + error: `YAML parse failed: ${err instanceof Error ? err.message : err}`, + }; + } +} + +type FileType = 'agent' | 'skill' | 'command'; + +interface ValidationIssue { + level: 'error' | 'warning'; + message: string; +} + +function validateAgent(fm: Record): ValidationIssue[] { + const issues: ValidationIssue[] = []; + if (!fm.name || typeof fm.name !== 'string') + issues.push({ level: 'error', message: 'Missing required "name" field' }); + if (!fm.description || typeof fm.description !== 'string') + issues.push({ level: 'error', message: 'Missing required "description" field' }); + return issues; +} + +function validateSkill(fm: Record): ValidationIssue[] { + const issues: ValidationIssue[] = []; + if (!fm.description && !fm.when_to_use) + issues.push({ level: 'error', message: 'Missing required "description" field' }); + return issues; +} + +function validateCommand(fm: Record): ValidationIssue[] { + const issues: ValidationIssue[] = []; + if (!fm.description || typeof fm.description !== 'string') + issues.push({ level: 'error', message: 'Missing required "description" field' }); + return issues; +} + +function detectFileType(filePath: string): FileType | null { + const normalized = filePath.replace(/\\/g, '/'); + const inSkillContent = /\/skills\/[^/]+\//.test(normalized); + if (normalized.includes('/agents/') && !inSkillContent && basename(filePath) !== 'README.md') + return 'agent'; + if (normalized.includes('/skills/') && basename(filePath) === 'SKILL.md') return 'skill'; + if (normalized.includes('/commands/') && !inSkillContent) return 'command'; + return null; +} + +async function findMdFiles(baseDir: string): Promise<{ path: string; type: FileType }[]> { + const results: { path: string; type: FileType }[] = []; + + async function walk(dir: string) { + const entries = await readdir(dir, { withFileTypes: true }); + for (const entry of entries) { + const fullPath = join(dir, entry.name); + if (entry.isDirectory()) { + await walk(fullPath); + } else if (entry.name.endsWith('.md')) { + const type = detectFileType(fullPath); + if (type) results.push({ path: fullPath, type }); + } + } + } + + await walk(baseDir); + return results; +} + +async function main() { + const args = process.argv.slice(2); + const root = resolve(import.meta.dirname, '../..'); + + let files: { path: string; type: FileType }[]; + let baseDir: string; + + if (args.length > 0 && args.every((a) => a.endsWith('.md'))) { + baseDir = process.cwd(); + files = []; + for (const arg of args) { + const fullPath = resolve(arg); + const type = detectFileType(fullPath); + if (type) files.push({ path: fullPath, type }); + } + } else { + baseDir = args[0] || resolve(root, 'plugins'); + files = await findMdFiles(baseDir); + } + + let totalErrors = 0; + let totalWarnings = 0; + + console.log(`Validating ${files.length} frontmatter files...\n`); + + for (const { path: filePath, type } of files) { + const rel = relative(baseDir, filePath); + const content = await readFile(filePath, 'utf-8'); + const result = parseFrontmatter(content); + + const issues: ValidationIssue[] = []; + + if (result.error) { + issues.push({ level: 'error', message: result.error }); + } else { + switch (type) { + case 'agent': + issues.push(...validateAgent(result.frontmatter)); + break; + case 'skill': + issues.push(...validateSkill(result.frontmatter)); + break; + case 'command': + issues.push(...validateCommand(result.frontmatter)); + break; + } + } + + if (issues.length > 0) { + console.log(`${rel} (${type})`); + for (const issue of issues) { + const prefix = issue.level === 'error' ? ' ERROR' : ' WARN '; + console.log(`${prefix}: ${issue.message}`); + if (issue.level === 'error') totalErrors++; + else totalWarnings++; + } + console.log(); + } + } + + console.log('---'); + console.log(`Validated ${files.length} files: ${totalErrors} errors, ${totalWarnings} warnings`); + + if (totalErrors > 0) process.exit(1); +} + +main().catch((err) => { + console.error('Fatal error:', err); + process.exit(2); +}); diff --git a/scripts/marketplace/validate-marketplace.ts b/scripts/marketplace/validate-marketplace.ts new file mode 100644 index 000000000..c8486e204 --- /dev/null +++ b/scripts/marketplace/validate-marketplace.ts @@ -0,0 +1,94 @@ +#!/usr/bin/env bun +/** + * Validates marketplace.json: well-formed JSON, plugins array present, + * each entry has required fields, no duplicates, and .github copy is in sync. + * + * Usage: + * bun scripts/marketplace/validate-marketplace.ts + */ + +import { readFile } from 'node:fs/promises'; +import { resolve } from 'node:path'; + +const root = resolve(import.meta.dirname, '../..'); +const src = resolve(root, '.claude-plugin/marketplace.json'); +const dest = resolve(root, '.github/plugin/marketplace.json'); + +// --- 1. JSON validation --- + +const content = await readFile(src, 'utf-8'); + +let parsed: unknown; +try { + parsed = JSON.parse(content); +} catch (err) { + console.error( + `[json] ERROR: .claude-plugin/marketplace.json is not valid JSON: ${err instanceof Error ? err.message : err}`, + ); + process.exit(1); +} + +if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + console.error('[json] ERROR: .claude-plugin/marketplace.json must be a JSON object'); + process.exit(1); +} + +const marketplace = parsed as Record; +if (!Array.isArray(marketplace.plugins)) { + console.error('[json] ERROR: .claude-plugin/marketplace.json missing "plugins" array'); + process.exit(1); +} + +// --- 2. Plugin entry validation --- + +const errors: string[] = []; +const seen = new Set(); +const required = ['name', 'description', 'source'] as const; + +marketplace.plugins.forEach((p: unknown, i: number) => { + if (!p || typeof p !== 'object') { + errors.push(`plugins[${i}]: must be an object`); + return; + } + const entry = p as Record; + for (const field of required) { + if (!entry[field]) { + errors.push(`plugins[${i}] (${entry.name ?? '?'}): missing required field "${field}"`); + } + } + if (typeof entry.name === 'string') { + if (seen.has(entry.name)) { + errors.push(`plugins[${i}]: duplicate plugin name "${entry.name}"`); + } + seen.add(entry.name); + } +}); + +if (errors.length) { + console.error( + `[schema] ${errors.length} validation error(s) in .claude-plugin/marketplace.json:`, + ); + for (const e of errors) console.error(` - ${e}`); + process.exit(1); +} + +// --- 3. Sync check (.claude-plugin → .github/plugin) --- + +let destContent: string; +try { + destContent = await readFile(dest, 'utf-8'); +} catch { + console.error('[sync] ERROR: .github/plugin/marketplace.json not found'); + console.error(' Run: bun scripts/marketplace/sync.ts'); + process.exit(1); +} + +if (content !== destContent) { + console.error( + '[sync] ERROR: .github/plugin/marketplace.json is out of sync with .claude-plugin/marketplace.json', + ); + console.error(' Run: bun scripts/marketplace/sync.ts'); + process.exit(1); +} + +console.log(`OK: ${marketplace.plugins.length} plugins validated, sync verified`); diff --git a/scripts/validate-eval-dirs.ts b/scripts/validate-eval-dirs.ts new file mode 100644 index 000000000..39c6e9896 --- /dev/null +++ b/scripts/validate-eval-dirs.ts @@ -0,0 +1,74 @@ +#!/usr/bin/env bun +/** + * Validates that each feature directory under examples/features/ that has an + * evals/ subdirectory contains at least one *.eval.yaml or *.EVAL.yaml file + * (either inside evals/ or at the feature root). + * + * Directories without an evals/ subdirectory are skipped — they may be SDK + * examples or other non-eval feature demos. + * + * Usage: + * bun scripts/validate-eval-dirs.ts + */ + +import { globSync, readdirSync, statSync } from 'node:fs'; +import { join, relative, resolve } from 'node:path'; + +const root = resolve(import.meta.dirname, '..'); +const featuresDir = resolve(root, 'examples/features'); + +// Feature dirs whose evals/ folder intentionally holds only support files +// (result JSONL, baselines) rather than eval definitions. Remove entries here +// once they gain proper eval YAML files. +const KNOWN_EXCEPTIONS = new Set([ + 'compare', // evals/ holds baseline/candidate result JSONL for agentv compare + 'trace-analysis', // evals/ holds pre-recorded trace results +]); + +const errors: string[] = []; +const entries = readdirSync(featuresDir, { withFileTypes: true }); + +for (const entry of entries) { + if (!entry.isDirectory() || entry.name.startsWith('.')) continue; + + const featureDir = join(featuresDir, entry.name); + const evalsDir = join(featureDir, 'evals'); + + // Only check features that have an evals/ subdirectory + try { + if (!statSync(evalsDir).isDirectory()) continue; + } catch { + continue; + } + + // Look for eval files in evals/ (recursive) and at feature root. + // Matches: *.eval.yaml, *.EVAL.yaml, eval.yaml, dataset*.yaml (config default patterns) + const evalPatterns = [ + '**/*.{eval.yaml,eval.yml,EVAL.yaml,EVAL.yml}', + '**/eval.{yaml,yml}', + '**/dataset*.{yaml,yml}', + ]; + const evalFilesInEvalsDir = evalPatterns.flatMap((p) => globSync(p, { cwd: evalsDir })); + const evalFilesAtRoot = evalPatterns.flatMap((p) => + globSync(p.replace('**/', ''), { cwd: featureDir }), + ); + + if (evalFilesInEvalsDir.length === 0 && evalFilesAtRoot.length === 0) { + if (KNOWN_EXCEPTIONS.has(entry.name)) { + console.warn(`WARN: ${relative(root, evalsDir)} has no eval files (known exception)`); + } else { + errors.push(relative(root, evalsDir)); + } + } +} + +if (errors.length > 0) { + console.error( + 'The following evals/ directories contain no eval files (*.eval.yaml or *.EVAL.yaml):', + ); + for (const e of errors) console.error(` - ${e}`); + process.exit(1); +} + +const checked = entries.filter((e) => e.isDirectory() && !e.name.startsWith('.')).length; +console.log(`OK: ${checked} feature directories checked`); From f5b4f802192fb10c811c5a9baacfeba2fab4636f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 04:35:19 +0000 Subject: [PATCH 2/7] fix(examples): add demo evals for compare and trace-analysis features Instead of hardcoding known exceptions in the eval-dirs validation script, add proper eval YAML files to the compare and trace-analysis example directories so they pass validation like all other features. Co-Authored-By: Claude Opus 4.6 --- .../features/compare/evals/dataset.eval.yaml | 39 +++++++++++++++++++ .../trace-analysis/evals/dataset.eval.yaml | 36 +++++++++++++++++ scripts/validate-eval-dirs.ts | 14 +------ 3 files changed, 76 insertions(+), 13 deletions(-) create mode 100644 examples/features/compare/evals/dataset.eval.yaml create mode 100644 examples/features/trace-analysis/evals/dataset.eval.yaml diff --git a/examples/features/compare/evals/dataset.eval.yaml b/examples/features/compare/evals/dataset.eval.yaml new file mode 100644 index 000000000..93adff30d --- /dev/null +++ b/examples/features/compare/evals/dataset.eval.yaml @@ -0,0 +1,39 @@ +$schema: agentv-eval-v2 + +# Demo eval for the compare example. +# Run against two targets to generate baseline and candidate result files: +# agentv eval evals/dataset.eval.yaml --target baseline +# agentv eval evals/dataset.eval.yaml --target candidate +# Then compare: +# agentv compare evals/baseline-results.jsonl evals/candidate-results.jsonl + +name: compare-demo +description: Demo eval for generating baseline and candidate results to compare + +tests: + - id: code-review-001 + input: Review the following code for bugs and suggest improvements. + criteria: Identifies at least one issue and suggests a fix + assertions: + - type: contains + value: bug + - type: contains + value: fix + + - id: code-review-002 + input: Explain what this function does and how it could be optimized. + criteria: Provides a clear explanation and at least one optimization suggestion + assertions: + - type: contains + value: function + - type: contains + value: optim + + - id: code-gen-001 + input: Write a function that checks if a string is a palindrome. + criteria: Returns working code that handles basic palindrome cases + assertions: + - type: contains + value: palindrome + - type: is-json + required: false diff --git a/examples/features/trace-analysis/evals/dataset.eval.yaml b/examples/features/trace-analysis/evals/dataset.eval.yaml new file mode 100644 index 000000000..d77ef444a --- /dev/null +++ b/examples/features/trace-analysis/evals/dataset.eval.yaml @@ -0,0 +1,36 @@ +$schema: agentv-eval-v2 + +# Demo eval for the trace-analysis example. +# Run this eval to generate result traces, then analyze with: +# agentv trace evals/multi-agent.eval.results.jsonl + +name: trace-analysis-demo +description: Demo eval for generating execution traces to analyze + +tests: + - id: research-question + input: What are the key differences between REST and GraphQL APIs? + criteria: Covers at least three differences including query flexibility, over-fetching, and type system + assertions: + - type: contains + value: REST + - type: contains + value: GraphQL + - type: regex + value: "type.?system|schema|typed" + + - id: code-review-task + input: Review this Python function for potential issues and suggest improvements. + criteria: Identifies at least one code quality issue + assertions: + - type: contains + value: suggest + - type: regex + value: "improv|fix|refactor|optim" + + - id: simple-qa + input: What is the capital of France? + criteria: Correctly answers Paris + assertions: + - type: contains + value: Paris diff --git a/scripts/validate-eval-dirs.ts b/scripts/validate-eval-dirs.ts index 39c6e9896..94e4c197d 100644 --- a/scripts/validate-eval-dirs.ts +++ b/scripts/validate-eval-dirs.ts @@ -17,14 +17,6 @@ import { join, relative, resolve } from 'node:path'; const root = resolve(import.meta.dirname, '..'); const featuresDir = resolve(root, 'examples/features'); -// Feature dirs whose evals/ folder intentionally holds only support files -// (result JSONL, baselines) rather than eval definitions. Remove entries here -// once they gain proper eval YAML files. -const KNOWN_EXCEPTIONS = new Set([ - 'compare', // evals/ holds baseline/candidate result JSONL for agentv compare - 'trace-analysis', // evals/ holds pre-recorded trace results -]); - const errors: string[] = []; const entries = readdirSync(featuresDir, { withFileTypes: true }); @@ -54,11 +46,7 @@ for (const entry of entries) { ); if (evalFilesInEvalsDir.length === 0 && evalFilesAtRoot.length === 0) { - if (KNOWN_EXCEPTIONS.has(entry.name)) { - console.warn(`WARN: ${relative(root, evalsDir)} has no eval files (known exception)`); - } else { - errors.push(relative(root, evalsDir)); - } + errors.push(relative(root, evalsDir)); } } From f153d0234876fa38ca842b696b97cbfc13534fb8 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 04:44:15 +0000 Subject: [PATCH 3/7] fix(ci): use local build for validate, fix import.meta CJS warning - Run agentv validate from built dist (bun apps/cli/dist/cli.js) instead of installing from npm, which lacks the new glob support - Replace import.meta.url with __dirname in pi-coding-agent.ts to eliminate the esbuild CJS warning about empty import.meta Co-Authored-By: Claude Opus 4.6 --- .github/workflows/validate.yml | 5 +---- .../core/src/evaluation/providers/pi-coding-agent.ts | 10 ++++------ 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 15036576a..8f62d5a8a 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -48,11 +48,8 @@ jobs: - name: Build run: bun run build - - name: Install agentv globally - run: bun install -g agentv - - name: Check evals directories have eval files run: bun scripts/validate-eval-dirs.ts - name: Validate eval schemas - run: agentv validate 'examples/features/**/evals/**/*.eval.yaml' 'examples/features/**/*.EVAL.yaml' + run: bun apps/cli/dist/cli.js validate 'examples/features/**/evals/**/*.eval.yaml' 'examples/features/**/*.EVAL.yaml' diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index 3e4691bd0..10284081f 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -15,7 +15,6 @@ import type { WriteStream } from 'node:fs'; import { mkdir } from 'node:fs/promises'; import path from 'node:path'; import { createInterface } from 'node:readline'; -import { fileURLToPath } from 'node:url'; import { recordPiLogEntry } from './pi-log-tracker.js'; import { extractPiTextContent, toFiniteNumber, toPiContentArray } from './pi-utils.js'; @@ -53,9 +52,9 @@ async function promptInstall(): Promise { /** Resolve agentv's own package root (where bun add should install peer deps). */ function findAgentvRoot(): string { - const thisFile = fileURLToPath(import.meta.url); - let dir = path.dirname(thisFile); - // Walk up until we find a package.json (covers both src and dist layouts) + // Walk up from this file's directory until we find a package.json. + // Works in both ESM (__dirname via Node/Bun polyfill) and CJS (__dirname native). + let dir = __dirname; for (let i = 0; i < 10; i++) { try { const pkg = path.join(dir, 'package.json'); @@ -68,8 +67,7 @@ function findAgentvRoot(): string { dir = parent; } } - // Fallback: current file's directory - return path.dirname(thisFile); + return __dirname; } async function doLoadSdkModules(): Promise { From 9aa3ff8b80cbaaad83bd65863f8ee2981312443a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 05:04:06 +0000 Subject: [PATCH 4/7] fix(core): use tsup shims for import.meta CJS compatibility Enable `shims: true` in tsup config instead of manual runtime guards. tsup injects proper CJS shims (pathToFileURL(__filename)) so import.meta.url works in CJS output without esbuild warnings, while ESM output uses native import.meta.url directly. Reverts pi-coding-agent.ts to its original idiomatic ESM code since the build tool now handles cross-format compatibility. Co-Authored-By: Claude Opus 4.6 --- .../core/src/evaluation/providers/pi-coding-agent.ts | 10 ++++++---- packages/core/tsup.config.ts | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index 10284081f..3e4691bd0 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -15,6 +15,7 @@ import type { WriteStream } from 'node:fs'; import { mkdir } from 'node:fs/promises'; import path from 'node:path'; import { createInterface } from 'node:readline'; +import { fileURLToPath } from 'node:url'; import { recordPiLogEntry } from './pi-log-tracker.js'; import { extractPiTextContent, toFiniteNumber, toPiContentArray } from './pi-utils.js'; @@ -52,9 +53,9 @@ async function promptInstall(): Promise { /** Resolve agentv's own package root (where bun add should install peer deps). */ function findAgentvRoot(): string { - // Walk up from this file's directory until we find a package.json. - // Works in both ESM (__dirname via Node/Bun polyfill) and CJS (__dirname native). - let dir = __dirname; + const thisFile = fileURLToPath(import.meta.url); + let dir = path.dirname(thisFile); + // Walk up until we find a package.json (covers both src and dist layouts) for (let i = 0; i < 10; i++) { try { const pkg = path.join(dir, 'package.json'); @@ -67,7 +68,8 @@ function findAgentvRoot(): string { dir = parent; } } - return __dirname; + // Fallback: current file's directory + return path.dirname(thisFile); } async function doLoadSdkModules(): Promise { diff --git a/packages/core/tsup.config.ts b/packages/core/tsup.config.ts index e4edee5a4..e85a45308 100644 --- a/packages/core/tsup.config.ts +++ b/packages/core/tsup.config.ts @@ -3,6 +3,7 @@ import { defineConfig } from 'tsup'; export default defineConfig({ entry: ['src/index.ts', 'src/evaluation/validation/index.ts'], format: ['esm', 'cjs'], + shims: true, sourcemap: true, clean: true, dts: { From a4ddc99de8a2dd56fd4addaa4aeeb2ffd98fb8cb Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 05:20:26 +0000 Subject: [PATCH 5/7] fix(examples): use static criteria in env-interpolation eval The criteria field was set to ${{ EVAL_CRITERIA }} which resolves to empty string when the env var isn't set, causing validation to fail in CI. Move interpolation demo to expected_output and input fields instead, keeping criteria as a static string. Co-Authored-By: Claude Opus 4.6 --- .../features/env-interpolation/evals/dataset.eval.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/features/env-interpolation/evals/dataset.eval.yaml b/examples/features/env-interpolation/evals/dataset.eval.yaml index b6d9980a1..608b843bd 100644 --- a/examples/features/env-interpolation/evals/dataset.eval.yaml +++ b/examples/features/env-interpolation/evals/dataset.eval.yaml @@ -4,12 +4,10 @@ # Missing variables resolve to empty string. # # Usage: -# export EVAL_CRITERIA="Responds with a friendly greeting" # export CUSTOM_SYSTEM_PROMPT="You are a helpful assistant who always greets warmly." # agentv eval examples/features/env-interpolation/evals/dataset.eval.yaml # # Or use a .env file in the project root: -# EVAL_CRITERIA=Responds with a friendly greeting # CUSTOM_SYSTEM_PROMPT=You are a helpful assistant who always greets warmly. description: Demonstrates ${{ VAR }} interpolation in eval fields @@ -20,13 +18,13 @@ execution: tests: # Full-value interpolation: entire field value from env var - id: full-value - criteria: "${{ EVAL_CRITERIA }}" + criteria: Responds with a friendly greeting input: "Hello!" - expected_output: "Hello! How can I help you today?" + expected_output: "${{ EXPECTED_GREETING }}" # Partial/inline interpolation: env var embedded in a larger string - id: partial-value - criteria: "Response uses the system prompt persona and ${{ EVAL_CRITERIA }}" + criteria: Response uses the system prompt persona input: - role: system content: "${{ CUSTOM_SYSTEM_PROMPT }}" From a23783806ad23dfd1e8aa8b1e5ac30c9f5013084 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 05:24:42 +0000 Subject: [PATCH 6/7] fix(core): remove name-requires-description validation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit name and description are optional metadata — no reason to couple them. name can be derived from the filename if not provided. Co-Authored-By: Claude Opus 4.6 --- .../src/evaluation/validation/eval-validator.ts | 10 ---------- .../validation/eval-validator.test.ts | 17 ----------------- 2 files changed, 27 deletions(-) diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts index 9db0d7f6a..133decd15 100644 --- a/packages/core/src/evaluation/validation/eval-validator.ts +++ b/packages/core/src/evaluation/validation/eval-validator.ts @@ -523,16 +523,6 @@ function validateMetadata(parsed: JsonObject, filePath: string, errors: Validati }); } } - - // Warn if name is present but description is missing - if (!('description' in parsed) || parsed.description === undefined) { - errors.push({ - severity: 'warning', - filePath, - location: 'name', - message: "When 'name' is present, 'description' should also be provided.", - }); - } } } diff --git a/packages/core/test/evaluation/validation/eval-validator.test.ts b/packages/core/test/evaluation/validation/eval-validator.test.ts index 65546754f..cd11bd48d 100644 --- a/packages/core/test/evaluation/validation/eval-validator.test.ts +++ b/packages/core/test/evaluation/validation/eval-validator.test.ts @@ -457,23 +457,6 @@ describe('validateEvalFile', () => { }); describe('metadata validation', () => { - it('warns when name is present without description', async () => { - const filePath = path.join(tempDir, 'meta-name-only.yaml'); - await writeFile( - filePath, - `name: my-eval -tests: - - id: test-1 - input: "Query" -`, - ); - - const result = await validateEvalFile(filePath); - - const warnings = result.errors.filter((e) => e.severity === 'warning'); - expect(warnings.some((e) => e.message.includes('description'))).toBe(true); - }); - it('warns when name has invalid format', async () => { const filePath = path.join(tempDir, 'meta-invalid-name.yaml'); await writeFile( From 4b4230271d923f476df3be5c6272419be79c7173 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 30 Mar 2026 05:46:56 +0000 Subject: [PATCH 7/7] fix(examples): align demo eval test IDs with fixtures, remove $schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Compare eval: match all 5 fixture test IDs (code-review-001/002/003, code-gen-001/002) so eval output is directly comparable - Remove $schema from eval YAML files — not needed - Verified: full eval run passes (5/5, 3/3), output is agentv-compare-compatible with matching test IDs Co-Authored-By: Claude Opus 4.6 --- .../features/compare/evals/dataset.eval.yaml | 20 +++++++++++++++---- .../trace-analysis/evals/dataset.eval.yaml | 2 -- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/examples/features/compare/evals/dataset.eval.yaml b/examples/features/compare/evals/dataset.eval.yaml index 93adff30d..158c70b0d 100644 --- a/examples/features/compare/evals/dataset.eval.yaml +++ b/examples/features/compare/evals/dataset.eval.yaml @@ -1,5 +1,3 @@ -$schema: agentv-eval-v2 - # Demo eval for the compare example. # Run against two targets to generate baseline and candidate result files: # agentv eval evals/dataset.eval.yaml --target baseline @@ -29,11 +27,25 @@ tests: - type: contains value: optim + - id: code-review-003 + input: Review this error handling code for edge cases and missing checks. + criteria: Identifies missing error handling or edge cases + assertions: + - type: contains + value: error + - type: regex + value: "edge.?case|missing|exception|null" + - id: code-gen-001 input: Write a function that checks if a string is a palindrome. criteria: Returns working code that handles basic palindrome cases assertions: - type: contains value: palindrome - - type: is-json - required: false + + - id: code-gen-002 + input: Write a function that finds the longest common subsequence of two strings. + criteria: Returns a correct implementation with reasonable time complexity + assertions: + - type: regex + value: "subsequence|lcs|LCS" diff --git a/examples/features/trace-analysis/evals/dataset.eval.yaml b/examples/features/trace-analysis/evals/dataset.eval.yaml index d77ef444a..a8f683aca 100644 --- a/examples/features/trace-analysis/evals/dataset.eval.yaml +++ b/examples/features/trace-analysis/evals/dataset.eval.yaml @@ -1,5 +1,3 @@ -$schema: agentv-eval-v2 - # Demo eval for the trace-analysis example. # Run this eval to generate result traces, then analyze with: # agentv trace evals/multi-agent.eval.results.jsonl