diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 95ee3ac8..14f33f97 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -181,6 +181,16 @@ export const evalRunCommand = command({ description: 'Per-test score threshold (0-1, default 0.8). Exit 1 if any test scores below this value', }), + tag: multioption({ + type: array(string), + long: 'tag', + description: 'Only run eval files that have this tag (repeatable, AND logic)', + }), + excludeTag: multioption({ + type: array(string), + long: 'exclude-tag', + description: 'Skip eval files that have this tag (repeatable, file skipped if any match)', + }), }, handler: async (args) => { // Launch interactive wizard when no eval paths and stdin is a TTY @@ -224,6 +234,8 @@ export const evalRunCommand = command({ model: args.model, outputMessages: args.outputMessages, threshold: args.threshold, + tag: args.tag, + excludeTag: args.excludeTag, }; const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions }); if (result?.thresholdFailed) { diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index ff53b8b5..c13eb2f6 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -88,6 +88,8 @@ interface NormalizedOptions { readonly model?: string; readonly outputMessages: number | 'all'; readonly threshold?: number; + readonly tags: readonly string[]; + readonly excludeTags: readonly string[]; } function normalizeBoolean(value: unknown): boolean { @@ -140,6 +142,43 @@ function normalizeWorkspaceMode(value: unknown): 'pooled' | 'temp' | 'static' | return value === 'pooled' || value === 'temp' || value === 'static' ? 
/**
 * Normalize a repeatable string option (such as `--tag` / `--exclude-tag`)
 * into a clean list of tag names.
 *
 * - A single string is treated as a one-element list instead of being dropped.
 * - Non-string entries and blank / whitespace-only entries are discarded.
 * - Surrounding whitespace is trimmed, so the value that passes the blank
 *   check is the same value later compared against a file's tags.
 * - Duplicates are removed; first-occurrence order is kept.
 *
 * @param value Raw option value of unknown shape.
 * @returns The cleaned tag names, or an empty list when nothing usable was given.
 */
function normalizeStringArray(value: unknown): readonly string[] {
  const candidates: readonly unknown[] =
    typeof value === 'string' ? [value] : Array.isArray(value) ? value : [];

  // A Set keeps insertion order, which gives de-duplication for free.
  const cleaned = new Set<string>();
  for (const candidate of candidates) {
    if (typeof candidate !== 'string') continue;
    const trimmed = candidate.trim();
    if (trimmed.length > 0) cleaned.add(trimmed);
  }
  return [...cleaned];
}

/**
 * Check whether an eval file's tags satisfy the --tag / --exclude-tag filters.
 *
 * - `--tag X`: the file must carry X. With several `--tag` values, ALL of them
 *   must be present.
 * - `--exclude-tag X`: the file must not carry X. With several `--exclude-tag`
 *   values, ANY single match rejects the file.
 * - When both kinds are given, both conditions must hold.
 * - A file without tags is rejected whenever `--tag` is given, and accepted
 *   when only `--exclude-tag` is given.
 *
 * @param fileTags Tags declared by the eval file; `undefined` is treated as no tags.
 * @param includeTags Tags that must all be present (empty list = no requirement).
 * @param excludeTags Tags of which none may be present (empty list = no exclusion).
 * @returns `true` when the file should run, `false` when it should be skipped.
 */
export function matchesTagFilters(
  fileTags: readonly string[] | undefined,
  includeTags: readonly string[],
  excludeTags: readonly string[],
): boolean {
  const tags = new Set(fileTags ?? []);

  // `every` is vacuously true for an empty list, so no explicit length guard is needed.
  const hasAllRequired = includeTags.every((required) => tags.has(required));
  const hasAnyExcluded = excludeTags.some((excluded) => tags.has(excluded));

  return hasAllRequired && !hasAnyExcluded;
}
@@ -304,6 +343,8 @@ function normalizeOptions( model: normalizeString(rawOptions.model), outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)), threshold: normalizeOptionalNumber(rawOptions.threshold), + tags: normalizeStringArray(rawOptions.tag), + excludeTags: normalizeStringArray(rawOptions.excludeTag), } satisfies NormalizedOptions; } @@ -434,6 +475,7 @@ async function prepareFileMetadata(params: { readonly totalBudgetUsd?: number; readonly failOnError?: FailOnError; readonly threshold?: number; + readonly tags?: readonly string[]; }> { const { testFilePath, repoRoot, cwd, options } = params; @@ -524,6 +566,7 @@ async function prepareFileMetadata(params: { totalBudgetUsd: suite.totalBudgetUsd, failOnError: suite.failOnError, threshold: suite.threshold, + tags: suite.metadata?.tags, }; } @@ -970,6 +1013,7 @@ export async function runEvalCommand( readonly totalBudgetUsd?: number; readonly failOnError?: FailOnError; readonly threshold?: number; + readonly tags?: readonly string[]; } >(); // Separate TypeScript/JS eval files from YAML files. @@ -1006,6 +1050,27 @@ export async function runEvalCommand( fileMetadata.set(testFilePath, meta); } + // Apply --tag / --exclude-tag filtering at the eval-file level + const hasTagFilters = options.tags.length > 0 || options.excludeTags.length > 0; + if (hasTagFilters) { + const skippedFiles: string[] = []; + for (const [testFilePath, meta] of fileMetadata.entries()) { + if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) { + fileMetadata.delete(testFilePath); + skippedFiles.push(path.relative(cwd, testFilePath)); + } + } + if (skippedFiles.length > 0 && options.verbose) { + console.log( + `Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`, + ); + } + if (fileMetadata.size === 0) { + console.log('No eval files matched the tag filters. 
Nothing to run.'); + return; + } + } + // Resolve cache: combine CLI flags with YAML config // Use first file's YAML config for cache settings (consistent across a run) const firstMeta = fileMetadata.values().next().value; @@ -1116,8 +1181,11 @@ export async function runEvalCommand( } } + // Use only files that survived tag filtering (fileMetadata keys) + const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f)); + try { - await runWithLimit(resolvedTestFiles, fileConcurrency, async (testFilePath) => { + await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => { const targetPrep = fileMetadata.get(testFilePath); if (!targetPrep) { throw new Error(`Missing metadata for ${testFilePath}`); @@ -1208,7 +1276,7 @@ export async function runEvalCommand( } if (usesDefaultArtifactWorkspace) { - const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : ''; + const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : ''; const workspaceDir = path.dirname(outputPath); const { testArtifactDir, @@ -1230,7 +1298,7 @@ export async function runEvalCommand( // Write companion artifacts (grading, timing, benchmark) if requested if (options.artifacts) { const artifactsDir = path.resolve(options.artifacts); - const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : ''; + const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : ''; const { testArtifactDir, indexPath, @@ -1275,7 +1343,7 @@ export async function runEvalCommand( // Suggest retry-errors command when execution errors are detected if (summary.executionErrorCount > 0 && !options.retryErrors) { - const evalFileArgs = resolvedTestFiles.map((f) => path.relative(cwd, f)).join(' '); + const evalFileArgs = activeTestFiles.map((f) => path.relative(cwd, f)).join(' '); const targetFlag = options.target ? 
` --target ${options.target}` : ''; const relativeOutputPath = path.relative(cwd, outputPath); console.log( @@ -1287,7 +1355,7 @@ export async function runEvalCommand( return { executionErrorCount: summary.executionErrorCount, outputPath, - testFiles: resolvedTestFiles, + testFiles: activeTestFiles, target: options.target, thresholdFailed, }; diff --git a/apps/cli/test/commands/eval/tag-filtering.test.ts b/apps/cli/test/commands/eval/tag-filtering.test.ts new file mode 100644 index 00000000..88372de7 --- /dev/null +++ b/apps/cli/test/commands/eval/tag-filtering.test.ts @@ -0,0 +1,73 @@ +import { describe, expect, it } from 'bun:test'; + +import { matchesTagFilters } from '../../../src/commands/eval/run-eval.js'; + +describe('matchesTagFilters', () => { + describe('no filters', () => { + it('accepts files with tags', () => { + expect(matchesTagFilters(['agent', 'slow'], [], [])).toBe(true); + }); + + it('accepts files without tags', () => { + expect(matchesTagFilters(undefined, [], [])).toBe(true); + }); + + it('accepts files with empty tags', () => { + expect(matchesTagFilters([], [], [])).toBe(true); + }); + }); + + describe('--tag (include)', () => { + it('accepts file with matching tag', () => { + expect(matchesTagFilters(['agent', 'fast'], ['agent'], [])).toBe(true); + }); + + it('rejects file without matching tag', () => { + expect(matchesTagFilters(['slow', 'multi-provider'], ['agent'], [])).toBe(false); + }); + + it('requires all specified tags (AND logic)', () => { + expect(matchesTagFilters(['agent', 'fast'], ['agent', 'fast'], [])).toBe(true); + expect(matchesTagFilters(['agent'], ['agent', 'fast'], [])).toBe(false); + }); + + it('rejects files with no tags when --tag is specified', () => { + expect(matchesTagFilters(undefined, ['agent'], [])).toBe(false); + expect(matchesTagFilters([], ['agent'], [])).toBe(false); + }); + }); + + describe('--exclude-tag', () => { + it('accepts file without excluded tag', () => { + expect(matchesTagFilters(['agent', 
'fast'], [], ['slow'])).toBe(true); + }); + + it('rejects file with excluded tag', () => { + expect(matchesTagFilters(['agent', 'slow'], [], ['slow'])).toBe(false); + }); + + it('rejects file if any excluded tag is present (AND logic)', () => { + expect(matchesTagFilters(['agent', 'slow'], [], ['slow', 'flaky'])).toBe(false); + expect(matchesTagFilters(['agent', 'flaky'], [], ['slow', 'flaky'])).toBe(false); + }); + + it('accepts files with no tags when only --exclude-tag is specified', () => { + expect(matchesTagFilters(undefined, [], ['slow'])).toBe(true); + expect(matchesTagFilters([], [], ['slow'])).toBe(true); + }); + }); + + describe('combined --tag and --exclude-tag', () => { + it('accepts file matching include and not matching exclude', () => { + expect(matchesTagFilters(['agent', 'fast'], ['agent'], ['slow'])).toBe(true); + }); + + it('rejects file matching include but also matching exclude', () => { + expect(matchesTagFilters(['agent', 'slow'], ['agent'], ['slow'])).toBe(false); + }); + + it('rejects file not matching include even if not matching exclude', () => { + expect(matchesTagFilters(['fast'], ['agent'], ['slow'])).toBe(false); + }); + }); +});