Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .claude-plugin/marketplace.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,19 @@
},
"plugins": [
{
"name": "agentv-dev",
"description": "Development skills for building and optimizing AgentV evaluations",
"source": "./plugins/agentv-dev"
"name": "agentic-engineering",
"description": "Design and review AI agent systems — architecture patterns, workflow design, and plugin quality review",
"source": "./plugins/agentic-engineering"
},
{
"name": "agentv-claude-trace",
"description": "Session tracing plugin — exports Claude Code session traces via OpenTelemetry",
"source": "./plugins/agentv-claude-trace"
},
{
"name": "agentic-engineering",
"description": "Design and review AI agent systems — architecture patterns, workflow design, and plugin quality review",
"source": "./plugins/agentic-engineering"
"name": "agentv-dev",
"description": "Development skills for building and optimizing AgentV evaluations",
"source": "./plugins/agentv-dev"
}
]
}
41 changes: 41 additions & 0 deletions .github/actions/setup-bun/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Composite action: installs Bun at the version named in package.json's
# `packageManager` field, restores the Bun package cache, and installs
# dependencies from the frozen lockfile.
name: "Setup Bun"
description: "Setup Bun with caching and install dependencies"
runs:
  using: "composite"
  steps:
    # On X64 runners, emit an explicit download URL for the x64 "baseline"
    # Bun release asset. On any other architecture (or an unrecognized OS)
    # no URL is emitted and the next step falls back to package.json.
    - name: Get baseline download URL
      id: bun-url
      shell: bash
      run: |
        if [ "$RUNNER_ARCH" = "X64" ]; then
          # Version is the part after "@" in packageManager, e.g. "bun@1.2.3".
          V=$(node -p "require('./package.json').packageManager.split('@')[1]")
          case "$RUNNER_OS" in
            macOS) OS=darwin ;;
            Linux) OS=linux ;;
            Windows) OS=windows ;;
            # Unknown runner OS: clear OS so an inherited environment value
            # cannot end up inside the URL.
            *) OS= ;;
          esac
          if [ -n "$OS" ]; then
            echo "url=https://github.com/oven-sh/bun/releases/download/bun-v${V}/bun-${OS}-x64-baseline.zip" >> "$GITHUB_OUTPUT"
          fi
        fi

    - name: Setup Bun
      uses: oven-sh/setup-bun@v2
      with:
        # Use the version file only when no explicit download URL was produced.
        bun-version-file: ${{ !steps.bun-url.outputs.url && 'package.json' || '' }}
        bun-download-url: ${{ steps.bun-url.outputs.url }}

    # Ask Bun for its cache directory so the cache step can target it.
    - name: Get cache directory
      id: cache
      shell: bash
      run: echo "dir=$(bun pm cache)" >> "$GITHUB_OUTPUT"

    # Cache key is derived from the runner OS and the hash of every bun.lock;
    # restore-keys allows a partial hit from an older lockfile on the same OS.
    - name: Cache Bun dependencies
      uses: actions/cache@v4
      with:
        path: ${{ steps.cache.outputs.dir }}
        key: ${{ runner.os }}-bun-${{ hashFiles('**/bun.lock') }}
        restore-keys: |
          ${{ runner.os }}-bun-

    # --frozen-lockfile makes the install fail instead of rewriting bun.lock.
    - name: Install dependencies
      run: bun install --frozen-lockfile
      shell: bash
12 changes: 6 additions & 6 deletions .github/plugin/marketplace.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,19 @@
},
"plugins": [
{
"name": "agentv-dev",
"description": "Development skills for building and optimizing AgentV evaluations",
"source": "./plugins/agentv-dev"
"name": "agentic-engineering",
"description": "Design and review AI agent systems — architecture patterns, workflow design, and plugin quality review",
"source": "./plugins/agentic-engineering"
},
{
"name": "agentv-claude-trace",
"description": "Session tracing plugin — exports Claude Code session traces via OpenTelemetry",
"source": "./plugins/agentv-claude-trace"
},
{
"name": "agentic-engineering",
"description": "Design and review AI agent systems — architecture patterns, workflow design, and plugin quality review",
"source": "./plugins/agentic-engineering"
"name": "agentv-dev",
"description": "Development skills for building and optimizing AgentV evaluations",
"source": "./plugins/agentv-dev"
}
]
}
32 changes: 32 additions & 0 deletions .github/workflows/validate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,35 @@ jobs:
--glob-ignore-case
--root-dir .
"**/*.md"

# Job: runs the repository's marketplace checks with Bun. Each check is a
# separate step so a failure points at the specific script that failed.
marketplace:
name: Validate Marketplace
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# Local composite action from .github/actions/setup-bun.
- uses: ./.github/actions/setup-bun

- name: Validate marketplace.json (schema + sync)
run: bun scripts/marketplace/validate-marketplace.ts

- name: Check marketplace sorted
run: bun scripts/marketplace/check-sorted.ts

- name: Validate frontmatter
run: bun scripts/marketplace/validate-frontmatter.ts

# Job: builds the project, then validates the example eval files using the
# freshly built CLI.
evals:
name: Validate Evals
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# Local composite action from .github/actions/setup-bun.
- uses: ./.github/actions/setup-bun

# The validate step below runs apps/cli/dist/cli.js, so the build must come first.
- name: Build
run: bun run build

- name: Check evals directories have eval files
run: bun scripts/validate-eval-dirs.ts

# Patterns are single-quoted so the shell passes them through unexpanded
# and the CLI receives the glob patterns themselves.
- name: Validate eval schemas
run: bun apps/cli/dist/cli.js validate 'examples/features/**/evals/**/*.eval.yaml' 'examples/features/**/*.EVAL.yaml'
49 changes: 33 additions & 16 deletions apps/cli/src/commands/validate/validate-files.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
validateFileReferences,
validateTargetsFile,
} from '@agentv/core/evaluation/validation';
import fg from 'fast-glob';

/**
* Validate YAML files for AgentV schema compliance.
Expand Down Expand Up @@ -67,34 +68,50 @@ async function validateSingleFile(filePath: string): Promise<ValidationResult> {
}

/**
 * Expand CLI path arguments into a de-duplicated, sorted list of YAML files.
 *
 * Each input is first probed as a literal path: a file is kept when
 * `isYamlFile` accepts it, and a directory is expanded via `findYamlFiles`.
 * An input that cannot be stat'ed (or is neither a file nor a directory) is
 * treated as a glob pattern resolved against the current working directory.
 *
 * @param paths - File paths, directory paths, or glob patterns.
 * @returns The matching YAML file paths, de-duplicated and sorted with the
 *   default `Array.prototype.sort` ordering.
 */
async function expandPaths(paths: readonly string[]): Promise<readonly string[]> {
  const expanded = new Set<string>();

  for (const inputPath of paths) {
    const absolutePath = path.resolve(inputPath);

    // Only the existence probe is allowed to fail silently. Errors raised
    // while walking a real directory must propagate instead of being
    // misreported as "no files matched pattern".
    const stats = await stat(absolutePath).catch(() => undefined);

    if (stats?.isFile()) {
      if (isYamlFile(absolutePath)) expanded.add(absolutePath);
      continue;
    }
    if (stats?.isDirectory()) {
      for (const file of await findYamlFiles(absolutePath)) expanded.add(file);
      continue;
    }

    // Not a literal file or directory — treat the input as a glob pattern.
    // Backslashes are converted to forward slashes before matching.
    const globPattern = inputPath.replace(/\\/g, '/');
    const matches = await fg(globPattern, {
      cwd: process.cwd(),
      absolute: true,
      onlyFiles: true,
      unique: true,
      dot: false,
      followSymbolicLinks: true,
    });

    const yamlMatches = matches.filter((file) => isYamlFile(file));
    if (yamlMatches.length === 0) {
      console.warn(`Warning: No YAML files matched pattern: ${inputPath}`);
    }
    // Normalize so glob results use the same separator style as the literal
    // paths above and de-duplicate against them.
    for (const file of yamlMatches) expanded.add(path.normalize(file));
  }

  return [...expanded].sort();
}

async function findYamlFiles(dirPath: string): Promise<readonly string[]> {
Expand Down
51 changes: 51 additions & 0 deletions examples/features/compare/evals/dataset.eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Demo eval for the compare example.
# Run against two targets to generate baseline and candidate result files:
#   agentv eval evals/dataset.eval.yaml --target baseline
#   agentv eval evals/dataset.eval.yaml --target candidate
# Then compare:
#   agentv compare evals/baseline-results.jsonl evals/candidate-results.jsonl

name: compare-demo
description: Demo eval for generating baseline and candidate results to compare

tests:
  - id: code-review-001
    input: Review the following code for bugs and suggest improvements.
    criteria: Identifies at least one issue and suggests a fix
    assertions:
      - type: contains
        value: bug
      - type: contains
        value: fix

  - id: code-review-002
    input: Explain what this function does and how it could be optimized.
    criteria: Provides a clear explanation and at least one optimization suggestion
    assertions:
      - type: contains
        value: function
      - type: contains
        value: optim

  - id: code-review-003
    input: Review this error handling code for edge cases and missing checks.
    criteria: Identifies missing error handling or edge cases
    assertions:
      - type: contains
        value: error
      - type: regex
        value: "edge.?case|missing|exception|null"

  - id: code-gen-001
    input: Write a function that checks if a string is a palindrome.
    criteria: Returns working code that handles basic palindrome cases
    assertions:
      - type: contains
        value: palindrome

  - id: code-gen-002
    input: Write a function that finds the longest common subsequence of two strings.
    criteria: Returns a correct implementation with reasonable time complexity
    assertions:
      - type: regex
        value: "subsequence|lcs|LCS"
8 changes: 3 additions & 5 deletions examples/features/env-interpolation/evals/dataset.eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,10 @@
# Missing variables resolve to empty string.
#
# Usage:
# export EVAL_CRITERIA="Responds with a friendly greeting"
# export CUSTOM_SYSTEM_PROMPT="You are a helpful assistant who always greets warmly."
# agentv eval examples/features/env-interpolation/evals/dataset.eval.yaml
#
# Or use a .env file in the project root:
# EVAL_CRITERIA=Responds with a friendly greeting
# CUSTOM_SYSTEM_PROMPT=You are a helpful assistant who always greets warmly.

description: Demonstrates ${{ VAR }} interpolation in eval fields
Expand All @@ -20,13 +18,13 @@ execution:
tests:
# Full-value interpolation: entire field value from env var
- id: full-value
criteria: "${{ EVAL_CRITERIA }}"
criteria: Responds with a friendly greeting
input: "Hello!"
expected_output: "Hello! How can I help you today?"
expected_output: "${{ EXPECTED_GREETING }}"

# Partial/inline interpolation: env var embedded in a larger string
- id: partial-value
criteria: "Response uses the system prompt persona and ${{ EVAL_CRITERIA }}"
criteria: Response uses the system prompt persona
input:
- role: system
content: "${{ CUSTOM_SYSTEM_PROMPT }}"
Expand Down
34 changes: 34 additions & 0 deletions examples/features/trace-analysis/evals/dataset.eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Demo eval for the trace-analysis example.
# Run this eval to generate result traces, then analyze with:
#   agentv trace evals/multi-agent.eval.results.jsonl

name: trace-analysis-demo
description: Demo eval for generating execution traces to analyze

tests:
  - id: research-question
    input: What are the key differences between REST and GraphQL APIs?
    criteria: Covers at least three differences including query flexibility, over-fetching, and type system
    assertions:
      - type: contains
        value: REST
      - type: contains
        value: GraphQL
      - type: regex
        value: "type.?system|schema|typed"

  - id: code-review-task
    input: Review this Python function for potential issues and suggest improvements.
    criteria: Identifies at least one code quality issue
    assertions:
      - type: contains
        value: suggest
      - type: regex
        value: "improv|fix|refactor|optim"

  - id: simple-qa
    input: What is the capital of France?
    criteria: Correctly answers Paris
    assertions:
      - type: contains
        value: Paris
10 changes: 0 additions & 10 deletions packages/core/src/evaluation/validation/eval-validator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -523,16 +523,6 @@ function validateMetadata(parsed: JsonObject, filePath: string, errors: Validati
});
}
}

// Warn if name is present but description is missing
if (!('description' in parsed) || parsed.description === undefined) {
errors.push({
severity: 'warning',
filePath,
location: 'name',
message: "When 'name' is present, 'description' should also be provided.",
});
}
}
}

Expand Down
17 changes: 0 additions & 17 deletions packages/core/test/evaluation/validation/eval-validator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -457,23 +457,6 @@ describe('validateEvalFile', () => {
});

describe('metadata validation', () => {
it('warns when name is present without description', async () => {
const filePath = path.join(tempDir, 'meta-name-only.yaml');
await writeFile(
filePath,
`name: my-eval
tests:
- id: test-1
input: "Query"
`,
);

const result = await validateEvalFile(filePath);

const warnings = result.errors.filter((e) => e.severity === 'warning');
expect(warnings.some((e) => e.message.includes('description'))).toBe(true);
});

it('warns when name has invalid format', async () => {
const filePath = path.join(tempDir, 'meta-invalid-name.yaml');
await writeFile(
Expand Down
1 change: 1 addition & 0 deletions packages/core/tsup.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { defineConfig } from 'tsup';
export default defineConfig({
entry: ['src/index.ts', 'src/evaluation/validation/index.ts'],
format: ['esm', 'cjs'],
shims: true,
sourcemap: true,
clean: true,
dts: {
Expand Down
Loading
Loading