From 2a2de55b47093d7f2cf871df26134862306b2c51 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sat, 28 Mar 2026 00:25:59 +1100 Subject: [PATCH 1/2] feat(pipeline): agent-mode artifacts align with CLI-mode schema - pipeline run/input: --out now optional, defaults to .agentv/results/runs/eval_ - pipeline bench: index.jsonl now includes scores[], execution_status, response_path to match CLI-mode dashboard schema - results validate: new command to check run dir naming, index.jsonl fields, artifact presence, and score bounds - skill: update agent-mode workflow docs to use default --out, add validate step, clarify llm_scores.json -> index.jsonl flow; user-stated mode overrides .env --- apps/cli/src/commands/pipeline/bench.ts | 20 +- apps/cli/src/commands/pipeline/input.ts | 10 +- apps/cli/src/commands/pipeline/run.ts | 7 +- apps/cli/src/commands/results/index.ts | 2 + apps/cli/src/commands/results/validate.ts | 261 ++++++++++++++++++ .../agentv-dev/skills/agentv-bench/SKILL.md | 53 ++-- 6 files changed, 323 insertions(+), 30 deletions(-) create mode 100644 apps/cli/src/commands/results/validate.ts diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index 5cd4cefc7..6aabd716f 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -12,6 +12,7 @@ * Stdin format (LLM scores): * { "": { "": { "score": 0.85, "assertions": [...] } } } */ +import { existsSync } from 'node:fs'; import { readFile, readdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; @@ -150,15 +151,32 @@ export const evalBenchCommand = command({ 'utf8', ); - // Build index entry + // Build index entry (match CLI-mode schema for dashboard compatibility) + const scores = evaluators.map((e) => ({ + name: e.name, + type: e.type, + score: e.score, + weight: e.weight, + verdict: e.score >= 0.5 ? 
'pass' : 'fail', + assertions: e.assertions.map((a) => ({ + text: a.text, + passed: a.passed, + evidence: a.evidence ?? '', + })), + })); + + const hasResponse = existsSync(join(testDir, 'response.md')); indexLines.push( JSON.stringify({ timestamp: manifest.timestamp, test_id: testId, score: Math.round(weightedScore * 1000) / 1000, target: targetName, + scores, + execution_status: 'ok', grading_path: `${testId}/grading.json`, timing_path: `${testId}/timing.json`, + response_path: hasResponse ? `${testId}/response.md` : undefined, }), ); } diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index 3b698c05e..f09760b74 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -23,8 +23,9 @@ import { dirname, join, resolve } from 'node:path'; import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core'; import { loadTestSuite } from '@agentv/core'; -import { command, option, positional, string } from 'cmd-ts'; +import { command, option, optional, positional, string } from 'cmd-ts'; +import { buildDefaultRunDir } from '../eval/result-layout.js'; import { findRepoRoot } from '../eval/shared.js'; import { selectTarget } from '../eval/targets.js'; @@ -38,14 +39,15 @@ export const evalInputCommand = command({ description: 'Path to eval YAML file', }), out: option({ - type: string, + type: optional(string), long: 'out', - description: 'Output directory for extracted inputs', + description: + 'Output directory for extracted inputs (default: .agentv/results/runs/eval_)', }), }, handler: async ({ evalPath, out }) => { const resolvedEvalPath = resolve(evalPath); - const outDir = resolve(out); + const outDir = resolve(out ?? 
buildDefaultRunDir(process.cwd())); const repoRoot = await findRepoRoot(dirname(resolvedEvalPath)); const evalDir = dirname(resolvedEvalPath); diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index ce2490995..c542d1b36 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -21,6 +21,7 @@ import { executeScript, loadTestSuite } from '@agentv/core'; import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core'; import { command, number, option, optional, positional, string } from 'cmd-ts'; +import { buildDefaultRunDir } from '../eval/result-layout.js'; import { findRepoRoot } from '../eval/shared.js'; import { selectTarget } from '../eval/targets.js'; @@ -57,9 +58,9 @@ export const evalRunCommand = command({ description: 'Path to eval YAML file', }), out: option({ - type: string, + type: optional(string), long: 'out', - description: 'Output directory for results', + description: 'Output directory for results (default: .agentv/results/runs/eval_)', }), workers: option({ type: optional(number), @@ -69,7 +70,7 @@ export const evalRunCommand = command({ }, handler: async ({ evalPath, out, workers }) => { const resolvedEvalPath = resolve(evalPath); - const outDir = resolve(out); + const outDir = resolve(out ?? 
buildDefaultRunDir(process.cwd())); const repoRoot = await findRepoRoot(dirname(resolvedEvalPath)); const evalDir = dirname(resolvedEvalPath); diff --git a/apps/cli/src/commands/results/index.ts b/apps/cli/src/commands/results/index.ts index f5dc929c5..c09d51d3e 100644 --- a/apps/cli/src/commands/results/index.ts +++ b/apps/cli/src/commands/results/index.ts @@ -4,6 +4,7 @@ import { resultsExportCommand } from './export.js'; import { resultsFailuresCommand } from './failures.js'; import { resultsShowCommand } from './show.js'; import { resultsSummaryCommand } from './summary.js'; +import { resultsValidateCommand } from './validate.js'; export const resultsCommand = subcommands({ name: 'results', @@ -13,5 +14,6 @@ export const resultsCommand = subcommands({ summary: resultsSummaryCommand, failures: resultsFailuresCommand, show: resultsShowCommand, + validate: resultsValidateCommand, }, }); diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts new file mode 100644 index 000000000..da56f0683 --- /dev/null +++ b/apps/cli/src/commands/results/validate.ts @@ -0,0 +1,261 @@ +/** + * `agentv results validate` — Validate that a run directory contains well-formed + * artifacts compatible with the AgentV dashboard and results commands. + * + * Checks: + * 1. Directory follows the `runs/eval_` naming convention + * 2. index.jsonl exists and each line has required fields + * 3. Per-test grading.json exists for every entry in the index + * 4. Per-test timing.json exists (warning if missing) + * 5. benchmark.json exists (warning if missing) + * 6. Scores are within [0, 1] + * 7. index.jsonl entries have `scores[]` array (warning if missing — dashboard needs it) + * + * Exit code 0 = valid, 1 = errors found. + * + * To extend: add new check functions to the `checks` array. 
+ */ +import { existsSync, readFileSync, statSync } from 'node:fs'; +import path from 'node:path'; + +import { command, positional, string } from 'cmd-ts'; + +// ── Types ──────────────────────────────────────────────────────────────── + +interface Diagnostic { + readonly severity: 'error' | 'warning'; + readonly message: string; +} + +interface IndexEntry { + readonly timestamp?: string; + readonly test_id?: string; + readonly score?: number; + readonly target?: string; + readonly scores?: unknown[]; + readonly execution_status?: string; + readonly grading_path?: string; + readonly timing_path?: string; + readonly [key: string]: unknown; +} + +// ── Checks ─────────────────────────────────────────────────────────────── + +function checkDirectoryNaming(runDir: string): Diagnostic[] { + const dirName = path.basename(runDir); + const parentName = path.basename(path.dirname(runDir)); + const diagnostics: Diagnostic[] = []; + + if (parentName !== 'runs') { + diagnostics.push({ + severity: 'warning', + message: `Directory is not under a 'runs/' parent (found '${parentName}/'). Expected: .agentv/results/runs/`, + }); + } + + if (!/^eval_\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName)) { + diagnostics.push({ + severity: 'warning', + message: `Directory name '${dirName}' does not match the expected pattern 'eval_'. 
Example: eval_2026-03-27T12-42-24-429Z`, + }); + } + + return diagnostics; +} + +function checkIndexJsonl(runDir: string): { diagnostics: Diagnostic[]; entries: IndexEntry[] } { + const indexPath = path.join(runDir, 'index.jsonl'); + const diagnostics: Diagnostic[] = []; + const entries: IndexEntry[] = []; + + if (!existsSync(indexPath)) { + diagnostics.push({ severity: 'error', message: 'index.jsonl is missing' }); + return { diagnostics, entries }; + } + + const content = readFileSync(indexPath, 'utf8'); + const lines = content.split('\n').filter((l) => l.trim().length > 0); + + if (lines.length === 0) { + diagnostics.push({ severity: 'error', message: 'index.jsonl is empty' }); + return { diagnostics, entries }; + } + + for (let i = 0; i < lines.length; i++) { + try { + const entry: IndexEntry = JSON.parse(lines[i]); + entries.push(entry); + + if (!entry.test_id) { + diagnostics.push({ + severity: 'error', + message: `index.jsonl line ${i + 1}: missing 'test_id'`, + }); + } + + if (entry.score === undefined || entry.score === null) { + diagnostics.push({ + severity: 'error', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): missing 'score'`, + }); + } else if (typeof entry.score !== 'number' || entry.score < 0 || entry.score > 1) { + diagnostics.push({ + severity: 'error', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): score ${entry.score} is outside [0, 1]`, + }); + } + + if (!entry.target) { + diagnostics.push({ + severity: 'error', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): missing 'target'`, + }); + } + + if (!entry.grading_path) { + diagnostics.push({ + severity: 'warning', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): missing 'grading_path'`, + }); + } + + if (!entry.scores || !Array.isArray(entry.scores) || entry.scores.length === 0) { + diagnostics.push({ + severity: 'warning', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? 
'?'}): missing 'scores[]' array — dashboard may not show per-evaluator breakdown`, + }); + } + + if (!entry.execution_status) { + diagnostics.push({ + severity: 'warning', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): missing 'execution_status'`, + }); + } + } catch { + diagnostics.push({ + severity: 'error', + message: `index.jsonl line ${i + 1}: invalid JSON`, + }); + } + } + + return { diagnostics, entries }; +} + +function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[] { + const diagnostics: Diagnostic[] = []; + + for (const entry of entries) { + const testId = entry.test_id ?? '?'; + + // Check grading.json + if (entry.grading_path) { + const gradingPath = path.join(runDir, entry.grading_path); + if (!existsSync(gradingPath)) { + diagnostics.push({ + severity: 'error', + message: `${testId}: grading.json not found at '${entry.grading_path}'`, + }); + } else { + try { + const grading = JSON.parse(readFileSync(gradingPath, 'utf8')); + if (!grading.assertions || !Array.isArray(grading.assertions)) { + diagnostics.push({ + severity: 'error', + message: `${testId}: grading.json missing 'assertions' array`, + }); + } + if (!grading.summary) { + diagnostics.push({ + severity: 'warning', + message: `${testId}: grading.json missing 'summary' object`, + }); + } + } catch { + diagnostics.push({ + severity: 'error', + message: `${testId}: grading.json is not valid JSON`, + }); + } + } + } + + // Check timing.json + if (entry.timing_path) { + const timingPath = path.join(runDir, entry.timing_path); + if (!existsSync(timingPath)) { + diagnostics.push({ + severity: 'warning', + message: `${testId}: timing.json not found at '${entry.timing_path}'`, + }); + } + } + } + + // Check benchmark.json + const benchmarkPath = path.join(runDir, 'benchmark.json'); + if (!existsSync(benchmarkPath)) { + diagnostics.push({ severity: 'warning', message: 'benchmark.json is missing' }); + } + + return diagnostics; +} + +// ── Command 
────────────────────────────────────────────────────────────── + +export const resultsValidateCommand = command({ + name: 'validate', + description: 'Validate that a run directory contains well-formed result artifacts', + args: { + runDir: positional({ + type: string, + displayName: 'run-dir', + description: 'Path to the run directory to validate', + }), + }, + handler: async ({ runDir }) => { + const resolvedDir = path.resolve(runDir); + + if (!existsSync(resolvedDir) || !statSync(resolvedDir).isDirectory()) { + console.error(`Error: '${runDir}' is not a directory`); + process.exit(1); + } + + const allDiagnostics: Diagnostic[] = []; + + // Run all checks + allDiagnostics.push(...checkDirectoryNaming(resolvedDir)); + + const { diagnostics: indexDiags, entries } = checkIndexJsonl(resolvedDir); + allDiagnostics.push(...indexDiags); + + if (entries.length > 0) { + allDiagnostics.push(...checkArtifactFiles(resolvedDir, entries)); + } + + // Report + const errors = allDiagnostics.filter((d) => d.severity === 'error'); + const warnings = allDiagnostics.filter((d) => d.severity === 'warning'); + + if (allDiagnostics.length === 0) { + console.log(`✓ Valid run directory: ${entries.length} test(s), no issues found`); + return; + } + + for (const d of errors) { + console.error(` ERROR: ${d.message}`); + } + for (const d of warnings) { + console.warn(` WARN: ${d.message}`); + } + + console.log( + `\n${entries.length} test(s), ${errors.length} error(s), ${warnings.length} warning(s)`, + ); + + if (errors.length > 0) { + process.exit(1); + } + }, +}); diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index 298111029..f1b27e619 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -49,13 +49,17 @@ These scripts break the eval pipeline into discrete steps. The agent runs them i ```bash # 1. 
Extract inputs, invoke CLI targets, run code graders (one command): -agentv pipeline run evals/repro.eval.yaml --out .agentv/results/export/run-1 +# --out is optional; defaults to .agentv/results/runs/eval_<timestamp> +agentv pipeline run evals/repro.eval.yaml # 2. Agent performs LLM grading (reads llm_graders/*.json, produces scores JSON) # ... agent reads prompts, grades responses, writes llm_scores.json ... -# 3. Merge all scores and produce final artifacts -agentv pipeline bench .agentv/results/export/run-1 --llm-scores llm_scores.json +# 3. Merge all scores and produce final artifacts (writes index.jsonl for dashboard) +agentv pipeline bench <run-dir> --llm-scores llm_scores.json + +# 4. Validate artifacts are dashboard-compatible +agentv results validate <run-dir> ``` ### Skill management scripts @@ -168,11 +172,13 @@ Put results in a workspace directory organized by iteration (`iteration-1/`, `it ### Choosing a run mode -Read the mode and CLI override from `.env` before doing anything: +**User instruction takes priority.** If the user says "run in agent mode", "use agent mode", or "use CLI mode", use that mode directly — do not check `.env`. + +Only read `.env` when the user has not specified a mode: ```bash -grep AGENT_EVAL_MODE .env 2>/dev/null || echo "AGENT_EVAL_MODE=agent" grep AGENTV_CLI .env 2>/dev/null || echo "AGENTV_CLI=(not set, using global agentv)" +grep AGENT_EVAL_MODE .env 2>/dev/null || echo "AGENT_EVAL_MODE=agent" ``` **`AGENTV_CLI` override:** If `AGENTV_CLI` is set in `.env`, use that value as the command prefix in place of `agentv` for every pipeline command. This lets you run from a local source checkout instead of the globally installed binary. @@ -198,7 +204,7 @@ The Python wrapper scripts (`scripts/run_tests.py`, etc.) pick up `AGENTV_CLI` a | `agent` (default) | **Agent mode** | Subagent-driven eval — parses eval.yaml, spawns executor + grader subagents. Zero CLI dependency. 
| | `cli` | **AgentV CLI** | `agentv eval ` — end-to-end, multi-provider | -Set `AGENT_EVAL_MODE` in `.env` at the project root. If absent, default to `agent`. +Set `AGENT_EVAL_MODE` in `.env` at the project root as the default when no mode is specified. If absent, default to `agent`. **User instruction always overrides this.** **`agent`** — Parses eval.yaml directly, spawns executor subagents to run each test case in the current workspace, then spawns grader subagents to evaluate all assertion types natively. No CLI or external API calls required. See "Agent mode: Running eval.yaml without CLI" below. @@ -294,44 +300,45 @@ When `AGENT_EVAL_MODE=agent` (default), use the pipeline CLI subcommands (`pipel **Recommended: Single command for CLI targets** -For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and code grading in one step: +For evals with CLI targets, `pipeline run` handles input extraction, target invocation, and code grading in one step. When `--out` is omitted, the output directory defaults to `.agentv/results/runs/eval_<timestamp>` (same convention as `agentv eval`): ```bash # Extract inputs, invoke all CLI targets in parallel, run code graders: -agentv pipeline run evals/repro.eval.yaml --out .agentv/results/export/run-1 +# Output goes to .agentv/results/runs/eval_<timestamp>/ by default +agentv pipeline run evals/repro.eval.yaml ``` -Then the agent performs LLM grading and merges scores: +The run directory is printed to stdout. Then the agent performs LLM grading and merges scores: ```bash -# bash: -agentv pipeline bench .agentv/results/export/run-1 --llm-scores llm_scores.json +agentv pipeline bench <run-dir> --llm-scores llm_scores.json -# PowerShell (if --llm-scores is unavailable, pipe via Get-Content): -# Get-Content llm_scores.json | agentv pipeline bench .agentv/results/export/run-1 +# Validate artifacts are dashboard-compatible: +agentv results validate <run-dir> ``` -That's the entire pipeline: **2 commands** plus the agent's LLM grading step. 
+That's the entire pipeline: **2 commands** + LLM grading + optional validation. **Alternative: Step-by-step (agent-as-target or fine-grained control)** Use individual commands when the agent IS the target or you need control over each step: ```bash -# Step 1: Extract inputs -agentv pipeline input evals/repro.eval.yaml --out .agentv/results/export/run-1 +# Step 1: Extract inputs (defaults to .agentv/results/runs/eval_<timestamp>) +agentv pipeline input evals/repro.eval.yaml # Step 2: Agent invokes each test (reads input.json, writes response.md) -# For CLI targets, you can also use the Python wrapper: -# python scripts/run_tests.py evals/repro.eval.yaml --out .agentv/results/export/run-1 # Step 3: Run code graders -agentv pipeline grade .agentv/results/export/run-1 +agentv pipeline grade <run-dir> # Step 4: Agent does LLM grading, writes llm_scores.json -# Step 5: Merge scores -agentv pipeline bench .agentv/results/export/run-1 --llm-scores llm_scores.json +# Step 5: Merge scores (writes index.jsonl with full scores[] for dashboard) +agentv pipeline bench <run-dir> --llm-scores llm_scores.json + +# Step 6: Validate +agentv results validate <run-dir> ``` This creates an export directory with per-test `input.json`, `invoke.json`, `criteria.md`, and grader configs (`code_graders/*.json`, `llm_graders/*.json`). @@ -353,7 +360,9 @@ The agent reads `llm_graders/.json` for each test, grades the response usi **Subagent environments (Claude Code):** Dispatch the `grader` subagent (read `agents/grader.md`) for this step. -**Non-subagent environments (VS Code Copilot, Codex, etc.):** Perform LLM grading inline. Read each `llm_graders/.json`, grade the response against the `prompt_content` criteria, score 0.0–1.0 with evidence, and write the result to `llm_scores.json` in the export directory. +**Non-subagent environments (VS Code Copilot, Codex, etc.):** Perform LLM grading inline. 
Read each `llm_graders/.json`, grade the response against the `prompt_content` criteria, score 0.0–1.0 with evidence, and write the result to `llm_scores.json` in the run directory. + +**Note:** `pipeline bench` merges LLM scores into `index.jsonl` with a full `scores[]` array per entry, matching the CLI-mode schema. The web dashboard (`agentv results serve`) reads this format directly — no separate conversion script is needed. Run `agentv results validate <run-dir>` to verify compatibility. **Note on Python wrapper scripts:** The `scripts/` directory contains Python wrappers (`run_tests.py`, `run_code_graders.py`, `bench.py`) that call the CLI commands. These are provided as an alternative but the direct CLI commands above are preferred — they work cross-platform without Python dependency issues. From b94ff8a019ce60781b7b0a8d9e863bf71be1d958 Mon Sep 17 00:00:00 2001 From: Christopher Date: Fri, 27 Mar 2026 13:45:39 +0000 Subject: [PATCH 2/2] fix: address code review issues for pipeline artifact alignment 1. execution_status: run.ts now writes status into timing.json ('ok' or 'execution_error'), bench.ts reads it back instead of hardcoding 'ok' 2. response_path: use null instead of undefined so the field is always present in index.jsonl 3. --workers concurrency: implement actual concurrency limiter using Promise.race instead of unbounded Promise.all 4. 
validate.ts: validate scores[] entry structure (name, type, score, verdict) and warn on unknown execution_status values Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/pipeline/bench.ts | 18 +++++++++++++-- apps/cli/src/commands/pipeline/run.ts | 17 +++++++++++--- apps/cli/src/commands/results/validate.ts | 27 +++++++++++++++++++++++ 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index 6aabd716f..1a96cea63 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -165,6 +165,20 @@ export const evalBenchCommand = command({ })), })); + // Read execution_status from timing.json (written by pipeline run) + let executionStatus = 'ok'; + const timingPath = join(testDir, 'timing.json'); + if (existsSync(timingPath)) { + try { + const timing = JSON.parse(await readFile(timingPath, 'utf8')); + if (typeof timing.execution_status === 'string') { + executionStatus = timing.execution_status; + } + } catch { + // Fall back to 'ok' if timing.json is unreadable + } + } + const hasResponse = existsSync(join(testDir, 'response.md')); indexLines.push( JSON.stringify({ @@ -173,10 +187,10 @@ export const evalBenchCommand = command({ score: Math.round(weightedScore * 1000) / 1000, target: targetName, scores, - execution_status: 'ok', + execution_status: executionStatus, grading_path: `${testId}/grading.json`, timing_path: `${testId}/timing.json`, - response_path: hasResponse ? `${testId}/response.md` : undefined, + response_path: hasResponse ? 
`${testId}/response.md` : null, }), ); } diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index c542d1b36..0ca236132 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -233,6 +233,7 @@ export const evalRunCommand = command({ await writeJson(join(testDir, 'timing.json'), { duration_ms: durationMs, total_duration_seconds: Math.round(durationMs / 10) / 100, + execution_status: 'ok', }); console.log(` ${testId}: OK (${durationMs}ms, ${response.length} chars)`); @@ -244,6 +245,7 @@ export const evalRunCommand = command({ await writeJson(join(testDir, 'timing.json'), { duration_ms: durationMs, total_duration_seconds: Math.round(durationMs / 10) / 100, + execution_status: 'execution_error', }); console.error(` ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}`); } finally { @@ -257,9 +259,18 @@ export const evalRunCommand = command({ } }; - // Run all targets in parallel - const allTasks = testIds.map((testId) => invokeTarget(testId)); - await Promise.all(allTasks); + // Run targets with concurrency limit + const pending = new Set>(); + for (const testId of testIds) { + const task = invokeTarget(testId).then(() => { + pending.delete(task); + }); + pending.add(task); + if (pending.size >= maxWorkers) { + await Promise.race(pending); + } + } + await Promise.all(pending); } else { console.log('Agent-as-target mode — skipping CLI invocation.'); } diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts index da56f0683..06274d85d 100644 --- a/apps/cli/src/commands/results/validate.ts +++ b/apps/cli/src/commands/results/validate.ts @@ -124,6 +124,28 @@ function checkIndexJsonl(runDir: string): { diagnostics: Diagnostic[]; entries: severity: 'warning', message: `index.jsonl line ${i + 1} (${entry.test_id ?? 
'?'}): missing 'scores[]' array — dashboard may not show per-evaluator breakdown`, }); + } else { + for (let j = 0; j < entry.scores.length; j++) { + const s = entry.scores[j] as Record | null; + if (!s || typeof s !== 'object') { + diagnostics.push({ + severity: 'error', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): scores[${j}] is not an object`, + }); + continue; + } + const missing: string[] = []; + if (typeof s.name !== 'string') missing.push('name'); + if (typeof s.type !== 'string') missing.push('type'); + if (typeof s.score !== 'number') missing.push('score'); + if (typeof s.verdict !== 'string') missing.push('verdict'); + if (missing.length > 0) { + diagnostics.push({ + severity: 'warning', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): scores[${j}] missing fields: ${missing.join(', ')}`, + }); + } + } } if (!entry.execution_status) { @@ -131,6 +153,11 @@ function checkIndexJsonl(runDir: string): { diagnostics: Diagnostic[]; entries: severity: 'warning', message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): missing 'execution_status'`, }); + } else if (!['ok', 'quality_failure', 'execution_error'].includes(entry.execution_status)) { + diagnostics.push({ + severity: 'warning', + message: `index.jsonl line ${i + 1} (${entry.test_id ?? '?'}): unknown execution_status '${entry.execution_status}' (expected: ok, quality_failure, execution_error)`, + }); } } catch { diagnostics.push({