From f7a83a4f17ffc36935b3d9adb12147de81c63215 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 22:03:40 +0000 Subject: [PATCH 1/5] refactor(core): remove borderline verdict from type system and scoring Simplify EvaluationVerdict to 'pass' | 'fail' | 'skip'. Scores below 0.8 are now 'fail' (previously 0.6-0.8 was 'borderline'). Remove borderline from EvalSummary, scoreToVerdict, negateScore, and composite evaluator. Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/evaluate.ts | 13 ++----------- .../core/src/evaluation/evaluators/composite.ts | 6 +++--- packages/core/src/evaluation/evaluators/scoring.ts | 5 +---- packages/core/src/evaluation/types.ts | 2 +- 4 files changed, 7 insertions(+), 19 deletions(-) diff --git a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts index 0cc14feb..2e2f9c05 100644 --- a/packages/core/src/evaluation/evaluate.ts +++ b/packages/core/src/evaluation/evaluate.ts @@ -167,10 +167,8 @@ export interface EvalSummary { readonly total: number; /** Number of passing test cases (score >= 0.8) */ readonly passed: number; - /** Number of failing test cases (score < 0.5) */ + /** Number of failing test cases (score < 0.8) */ readonly failed: number; - /** Number of borderline test cases (0.5 <= score < 0.8) */ - readonly borderline: number; /** Total duration in milliseconds */ readonly durationMs: number; /** Mean score across all cases */ @@ -373,26 +371,19 @@ function mapAssertionType(type: string): string { function computeSummary(results: readonly EvaluationResult[], durationMs: number): EvalSummary { const total = results.length; let passed = 0; - let failed = 0; - let borderline = 0; let scoreSum = 0; for (const r of results) { scoreSum += r.score; if (r.score >= 0.8) { passed++; - } else if (r.score < 0.5) { - failed++; - } else { - borderline++; } } return { total, passed, - failed, - borderline, + failed: total - passed, durationMs, meanScore: total > 0 ? 
scoreSum / total : 0, }; diff --git a/packages/core/src/evaluation/evaluators/composite.ts b/packages/core/src/evaluation/evaluators/composite.ts index 61dfcea3..604f00cb 100644 --- a/packages/core/src/evaluation/evaluators/composite.ts +++ b/packages/core/src/evaluation/evaluators/composite.ts @@ -34,7 +34,7 @@ const DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation res {{EVALUATOR_RESULTS_JSON}} Decide the final score and verdict based on all evaluator results. -Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`; +Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`; export interface CompositeEvaluatorOptions { readonly config: CompositeEvaluatorConfig; @@ -186,7 +186,7 @@ export class CompositeEvaluator implements Evaluator { } evaluatedCount++; - const isPassing = member.result.verdict === 'pass' || member.result.verdict === 'borderline'; + const isPassing = member.result.verdict === 'pass'; if (isPassing) { passingCount++; } @@ -275,7 +275,7 @@ export class CompositeEvaluator implements Evaluator { : []; const verdict = typeof parsed?.verdict === 'string' && - (parsed.verdict === 'pass' || parsed.verdict === 'fail' || parsed.verdict === 'borderline') + (parsed.verdict === 'pass' || parsed.verdict === 'fail') ? 
parsed.verdict : scoreToVerdict(score); diff --git a/packages/core/src/evaluation/evaluators/scoring.ts b/packages/core/src/evaluation/evaluators/scoring.ts index 019112a6..11c96ca6 100644 --- a/packages/core/src/evaluation/evaluators/scoring.ts +++ b/packages/core/src/evaluation/evaluators/scoring.ts @@ -5,9 +5,6 @@ export function scoreToVerdict(score: number): EvaluationVerdict { if (score >= 0.8) { return 'pass'; } - if (score >= 0.6) { - return 'borderline'; - } return 'fail'; } @@ -91,7 +88,7 @@ export function deepEqual(a: unknown, b: unknown): boolean { export function negateScore(score: EvaluationScore): EvaluationScore { const negatedScore = clampScore(1 - score.score); const negatedVerdict: EvaluationVerdict = - score.verdict === 'pass' ? 'fail' : score.verdict === 'fail' ? 'pass' : 'borderline'; + score.verdict === 'pass' ? 'fail' : score.verdict === 'fail' ? 'pass' : 'skip'; return { ...score, score: negatedScore, diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 586cf5c2..3d26498c 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -958,7 +958,7 @@ export interface EvaluationResult { readonly executionError?: ExecutionError; } -export type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip'; +export type EvaluationVerdict = 'pass' | 'fail' | 'skip'; export interface EvaluatorResult { readonly name: string; From 15a9f36848da4faab26e06bcc740c4a7e4af8bc0 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 22:11:03 +0000 Subject: [PATCH 2/5] test: update all test expectations for borderline verdict removal Change borderline expectations to fail (scores 0.6-0.8 are now fail). Remove borderline-specific tests in negation and composite-threshold. Update threshold aggregator tests since only pass verdicts count. 
Co-Authored-By: Claude Opus 4.6 --- .../core/test/evaluation/evaluators.test.ts | 8 +-- .../evaluators/composite-threshold.test.ts | 69 ++----------------- .../evaluators/execution-metrics.test.ts | 2 +- .../evaluation/evaluators/negation.test.ts | 19 ----- .../core/test/evaluation/orchestrator.test.ts | 4 +- packages/core/test/evaluation/trials.test.ts | 8 +-- 6 files changed, 16 insertions(+), 94 deletions(-) diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts index a7b92afa..00efcd61 100644 --- a/packages/core/test/evaluation/evaluators.test.ts +++ b/packages/core/test/evaluation/evaluators.test.ts @@ -149,7 +149,7 @@ describe('LlmGraderEvaluator (llm-grader)', () => { }); expect(result.score).toBeCloseTo(0.75); - expect(result.verdict).toBe('borderline'); + expect(result.verdict).toBe('fail'); expect(result.assertions.filter((a) => a.passed)).toHaveLength(2); expect(result.assertions.filter((a) => !a.passed)).toHaveLength(1); }); @@ -188,7 +188,7 @@ describe('LlmGraderEvaluator (llm-grader)', () => { }); expect(result.score).toBeCloseTo(0.65); - expect(result.verdict).toBe('borderline'); + expect(result.verdict).toBe('fail'); expect(result.assertions).toEqual([ { text: 'Addressed the core request', @@ -321,7 +321,7 @@ describe('LlmGraderEvaluator (llm-grader)', () => { }); expect(result.score).toBeCloseTo(0.7); - expect(result.verdict).toBe('borderline'); + expect(result.verdict).toBe('fail'); // Custom template goes in user prompt (question), system prompt only has output schema expect(graderProvider.lastRequest?.question).toContain(customPrompt); @@ -1065,7 +1065,7 @@ describe('FieldAccuracyEvaluator', () => { // Score should be (1.0 * 2.0 + 0.0 * 1.0) / (2.0 + 1.0) = 2/3 ≈ 0.667 expect(result.score).toBeCloseTo(0.667, 2); - expect(result.verdict).toBe('borderline'); + expect(result.verdict).toBe('fail'); }); it('supports all_or_nothing aggregation', () => { diff --git 
a/packages/core/test/evaluation/evaluators/composite-threshold.test.ts b/packages/core/test/evaluation/evaluators/composite-threshold.test.ts index 77098574..948d8e6b 100644 --- a/packages/core/test/evaluation/evaluators/composite-threshold.test.ts +++ b/packages/core/test/evaluation/evaluators/composite-threshold.test.ts @@ -46,16 +46,14 @@ function createContext(): EvaluationContext { }; } -function makeResult(verdict: 'pass' | 'fail' | 'borderline', score: number): EvaluationScore { +function makeResult(verdict: 'pass' | 'fail', score: number): EvaluationScore { return { score, verdict, assertions: verdict === 'pass' ? [{ text: 'passed', passed: true }] - : verdict === 'fail' - ? [{ text: 'failed', passed: false }] - : [], + : [{ text: 'failed', passed: false }], expectedAspectCount: 1, }; } @@ -157,10 +155,10 @@ describe('CompositeEvaluator threshold aggregation', () => { expect(result.verdict).toBe('fail'); }); - it('borderline child counts as passing (lenient)', async () => { + it('score 0.7 child is fail (not passing), only pass verdicts count', async () => { const factory = createMockFactory({ a: makeResult('pass', 1.0), - b: makeResult('borderline', 0.7), + b: makeResult('fail', 0.7), c: makeResult('fail', 0.3), d: makeResult('fail', 0.1), }); @@ -181,64 +179,7 @@ describe('CompositeEvaluator threshold aggregation', () => { }); const result = await evaluator.evaluate(createContext()); - expect(result.score).toBe(0.5); - expect(result.verdict).toBe('pass'); - }); - - it('warning includes borderline count when borderline contributes to pass', async () => { - const factory = createMockFactory({ - a: makeResult('pass', 1.0), - b: makeResult('borderline', 0.7), - c: makeResult('fail', 0.3), - d: makeResult('fail', 0.1), - }); - - const evaluator = new CompositeEvaluator({ - config: { - name: 'gate', - type: 'composite', - assertions: [ - { name: 'a', type: 'latency', threshold: 5000 }, - { name: 'b', type: 'latency', threshold: 5000 }, - { name: 'c', type: 
'latency', threshold: 5000 }, - { name: 'd', type: 'latency', threshold: 5000 }, - ], - aggregator: { type: 'threshold', threshold: 0.5 }, - }, - evaluatorFactory: factory, - }); - - const result = await evaluator.evaluate(createContext()); - // Borderline member counts as passing — verify the summary assertion - const summaryAssertion = result.assertions.find((a) => a.text.includes('evaluators passed')); - expect(summaryAssertion).toBeDefined(); - expect(summaryAssertion?.text).toContain('2/4 evaluators passed'); - }); - - it('no warning when borderline present but result fails', async () => { - const factory = createMockFactory({ - a: makeResult('borderline', 0.7), - b: makeResult('fail', 0.3), - c: makeResult('fail', 0.2), - d: makeResult('fail', 0.1), - }); - - const evaluator = new CompositeEvaluator({ - config: { - name: 'gate', - type: 'composite', - assertions: [ - { name: 'a', type: 'latency', threshold: 5000 }, - { name: 'b', type: 'latency', threshold: 5000 }, - { name: 'c', type: 'latency', threshold: 5000 }, - { name: 'd', type: 'latency', threshold: 5000 }, - ], - aggregator: { type: 'threshold', threshold: 0.5 }, - }, - evaluatorFactory: factory, - }); - - const result = await evaluator.evaluate(createContext()); + expect(result.score).toBe(0.25); expect(result.verdict).toBe('fail'); }); diff --git a/packages/core/test/evaluation/evaluators/execution-metrics.test.ts b/packages/core/test/evaluation/evaluators/execution-metrics.test.ts index 9cdeabcd..3f6ed8e7 100644 --- a/packages/core/test/evaluation/evaluators/execution-metrics.test.ts +++ b/packages/core/test/evaluation/evaluators/execution-metrics.test.ts @@ -436,7 +436,7 @@ describe('ExecutionMetricsEvaluator', () => { // 3 passed (tool_calls, tokens, duration), 2 failed (llm_calls, cost) expect(result.score).toBeCloseTo(0.6); // 3 / 5 - expect(result.verdict).toBe('borderline'); + expect(result.verdict).toBe('fail'); expect(result.assertions.filter((a) => a.passed)).toHaveLength(3); 
expect(result.assertions.filter((a) => !a.passed)).toHaveLength(2); }); diff --git a/packages/core/test/evaluation/evaluators/negation.test.ts b/packages/core/test/evaluation/evaluators/negation.test.ts index 0099f59c..41d1ef18 100644 --- a/packages/core/test/evaluation/evaluators/negation.test.ts +++ b/packages/core/test/evaluation/evaluators/negation.test.ts @@ -34,25 +34,6 @@ describe('negateScore', () => { expect(negated.assertions).toEqual([{ text: 'criterion not met', passed: true }]); }); - it('keeps borderline verdict as borderline', () => { - const original: EvaluationScore = { - score: 0.7, - verdict: 'borderline', - assertions: [ - { text: 'partial', passed: true }, - { text: 'incomplete', passed: false }, - ], - expectedAspectCount: 2, - }; - - const negated = negateScore(original); - - expect(negated.score).toBeCloseTo(0.3, 10); - expect(negated.verdict).toBe('borderline'); - expect(negated.assertions.filter((a) => a.passed).map((a) => a.text)).toEqual(['incomplete']); - expect(negated.assertions.filter((a) => !a.passed).map((a) => a.text)).toEqual(['partial']); - }); - it('flips passed on each assertion', () => { const original: EvaluationScore = { score: 1.0, diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index fb46f8fe..9031cbdc 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -1176,7 +1176,7 @@ describe('runEvaluation with trials', () => { callIndex += 1; return { score, - verdict: (score >= 0.8 ? 'pass' : score >= 0.6 ? 'borderline' : 'fail') as const, + verdict: (score >= 0.8 ? 'pass' : 'fail') as const, assertions: score >= 0.8 ? 
[{ text: 'passed', passed: true }] @@ -2002,7 +2002,7 @@ describe('required gates', () => { async evaluate() { return { score: 0.7, - verdict: 'borderline' as const, + verdict: 'fail' as const, assertions: [ { text: 'partial', passed: true }, { text: 'incomplete', passed: false }, diff --git a/packages/core/test/evaluation/trials.test.ts b/packages/core/test/evaluation/trials.test.ts index 47d41c4c..a5889c9e 100644 --- a/packages/core/test/evaluation/trials.test.ts +++ b/packages/core/test/evaluation/trials.test.ts @@ -52,7 +52,7 @@ describe('aggregateTrials', () => { describe('mean strategy', () => { it('averages scores correctly', () => { const trials: TrialResult[] = [ - { attempt: 0, score: 0.7, verdict: 'borderline' }, + { attempt: 0, score: 0.7, verdict: 'fail' }, { attempt: 1, score: 0.9, verdict: 'pass' }, { attempt: 2, score: 1.0, verdict: 'pass' }, ]; @@ -70,8 +70,8 @@ describe('aggregateTrials', () => { it('handles all same scores', () => { const trials: TrialResult[] = [ - { attempt: 0, score: 0.7, verdict: 'borderline' }, - { attempt: 1, score: 0.7, verdict: 'borderline' }, + { attempt: 0, score: 0.7, verdict: 'fail' }, + { attempt: 1, score: 0.7, verdict: 'fail' }, ]; const config: TrialsConfig = { count: 2, strategy: 'mean' }; @@ -84,7 +84,7 @@ describe('aggregateTrials', () => { describe('confidence_interval strategy', () => { it('computes CI bounds', () => { const trials: TrialResult[] = [ - { attempt: 0, score: 0.7, verdict: 'borderline' }, + { attempt: 0, score: 0.7, verdict: 'fail' }, { attempt: 1, score: 0.8, verdict: 'pass' }, { attempt: 2, score: 0.9, verdict: 'pass' }, ]; From afd294425ed5cdb63ffee399a4069948556eac46 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 22:11:09 +0000 Subject: [PATCH 3/5] docs: remove borderline references from examples, docs, and baselines Update example scripts, documentation, baseline JSONL fixtures, and skill references to reflect binary pass/fail verdict system. 
Co-Authored-By: Claude Opus 4.6 --- apps/web/src/content/docs/docs/evaluation/rubrics.mdx | 3 +-- apps/web/src/content/docs/docs/guides/human-review.mdx | 2 +- .../features/composite/scripts/safety-gate-aggregator.js | 2 -- .../evals/confusion-metrics.eval.baseline.jsonl | 4 ++-- .../evals/dataset.eval.baseline.jsonl | 2 +- .../cross-repo-sync/evals/ground-truth/cases-to-tests.diff | 4 ++-- .../evals/ground-truth/schema-field-rename.diff | 4 ++-- examples/showcase/evaluator-conformance/README.md | 2 +- .../showcase/evaluator-conformance/conformance-check.ts | 3 +-- .../offline-grader-benchmark/fixtures/setup-a.raw.jsonl | 4 ++-- .../offline-grader-benchmark/fixtures/setup-b.raw.jsonl | 2 +- .../scripts/score-grader-benchmark.ts | 7 ++----- .../psychotherapy/evals/routing.eval.baseline.jsonl | 2 +- .../skills/agentv-bench/references/eval-yaml-spec.md | 2 +- .../agentv-eval-writer/references/rubric-evaluator.md | 3 +-- 15 files changed, 19 insertions(+), 27 deletions(-) diff --git a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx index d17bec30..7b2ba132 100644 --- a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx +++ b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx @@ -112,8 +112,7 @@ score = sum(criterion_score / 10 * weight) / sum(total_weights) | Verdict | Score | |---------|-------| | `pass` | ≥ 0.8 | -| `borderline` | ≥ 0.6 | -| `fail` | < 0.6 | +| `fail` | < 0.8 | ## Authoring Rubrics diff --git a/apps/web/src/content/docs/docs/guides/human-review.mdx b/apps/web/src/content/docs/docs/guides/human-review.mdx index 24389fbb..ae737bd5 100644 --- a/apps/web/src/content/docs/docs/guides/human-review.mdx +++ b/apps/web/src/content/docs/docs/guides/human-review.mdx @@ -80,7 +80,7 @@ The `feedback.json` file is a structured annotation of a single eval run. 
It rec { "test_id": "test-feature-alpha", "verdict": "acceptable", - "notes": "Score is borderline (0.72) but behavior is correct — the grader penalized for different phrasing." + "notes": "Score is low (0.72) but behavior is correct — the grader penalized for different phrasing." }, { "test_id": "test-retrieval-basic", diff --git a/examples/features/composite/scripts/safety-gate-aggregator.js b/examples/features/composite/scripts/safety-gate-aggregator.js index 1d0f24b3..225fb974 100644 --- a/examples/features/composite/scripts/safety-gate-aggregator.js +++ b/examples/features/composite/scripts/safety-gate-aggregator.js @@ -53,8 +53,6 @@ try { if (finalScore >= 0.8) { verdict = 'pass'; - } else if (finalScore >= 0.6) { - verdict = 'borderline'; } else { verdict = 'fail'; } diff --git a/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl b/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl index 1b85d7ef..94c355d1 100644 --- a/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl +++ b/examples/features/document-extraction/evals/confusion-metrics.eval.baseline.jsonl @@ -1,5 +1,5 @@ 
{"timestamp":"2026-02-20T21:38:57.573Z","test_id":"metrics-001","dataset":"dataset-confusion-metrics","score":1,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":1,"weight":1,"verdict":"pass","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":6,"total_tn":0,"total_fp":0,"total_fn":0,"macro_precision":1,"macro_recall":1,"macro_f1":1}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=6 TN=0 FP=0 FN=0, macro-F1=1.000"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"supplier.name: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=6 TN=0 FP=0 FN=0, macro-F1=1.000"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"supplier.name: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true}]} 
-{"timestamp":"2026-02-20T21:38:57.582Z","test_id":"metrics-003","dataset":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"borderline","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0}},"summary":{"total_tp":4,"total_tn":0,"total_fp":2,"total_fn":2,"macro_precision":0.6666666666666666,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]} 
+{"timestamp":"2026-02-20T21:38:57.582Z","test_id":"metrics-003","dataset":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"fail","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0}},"summary":{"total_tp":4,"total_tn":0,"total_fp":2,"total_fn":2,"macro_precision":0.6666666666666666,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=2 FN=2, macro-F1=0.667"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FP+FN (wrong value)","passed":false}]} 
{"timestamp":"2026-02-20T21:38:57.588Z","test_id":"metrics-002","dataset":"dataset-confusion-metrics","score":0.8333333333333334,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.8333333333333334,"weight":1,"verdict":"pass","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":5,"total_tn":0,"total_fp":1,"total_fn":1,"macro_precision":0.8333333333333334,"macro_recall":0.8333333333333334,"macro_f1":0.8333333333333334}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=5 TN=0 FP=1 FN=1, macro-F1=0.833"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=5 TN=0 FP=1 FN=1, macro-F1=0.833"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]} 
-{"timestamp":"2026-02-20T21:38:57.641Z","test_id":"metrics-004","dataset":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"borderline","details":{"metrics":{"invoice_number":{"tp":0,"tn":0,"fp":0,"fn":1,"recall":0},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":4,"total_tn":0,"total_fp":1,"total_fn":2,"macro_precision":0.8,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]} 
+{"timestamp":"2026-02-20T21:38:57.641Z","test_id":"metrics-004","dataset":"dataset-confusion-metrics","score":0.6666666666666666,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.6666666666666666,"weight":1,"verdict":"fail","details":{"metrics":{"invoice_number":{"tp":0,"tn":0,"fp":0,"fn":1,"recall":0},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1}},"summary":{"total_tp":4,"total_tn":0,"total_fp":1,"total_fn":2,"macro_precision":0.8,"macro_recall":0.6666666666666666,"macro_f1":0.6666666666666666}},"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]}],"assertions":[{"text":"invoice_date: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=4 TN=0 FP=1 FN=2, macro-F1=0.667"},{"text":"currency: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"gross_total: TP (correct non-empty)","passed":true},{"text":"invoice_number: FN (missing)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false}]} 
{"timestamp":"2026-02-20T21:38:57.649Z","test_id":"metrics-005","dataset":"dataset-confusion-metrics","score":0.5,"target":"mock_extractor","scores":[{"name":"header_confusion","type":"code-grader","score":0.5,"weight":1,"verdict":"fail","details":{"metrics":{"invoice_number":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"invoice_date":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"currency":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"supplier.name":{"tp":0,"tn":0,"fp":1,"fn":1,"precision":0,"recall":0},"importer.name":{"tp":1,"tn":0,"fp":0,"fn":0,"precision":1,"recall":1,"f1":1},"gross_total":{"tp":0,"tn":0,"fp":0,"fn":1,"recall":0}},"summary":{"total_tp":3,"total_tn":0,"total_fp":2,"total_fn":3,"macro_precision":0.6,"macro_recall":0.5,"macro_f1":0.5}},"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"TP=3 TN=0 FP=2 FN=3, macro-F1=0.500"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"currency: FP+FN (wrong value)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FN (missing)","passed":false}]}],"assertions":[{"text":"invoice_number: TP (correct non-empty)","passed":true,"evidence":"header_confusion: TP=3 TN=0 FP=2 FN=3, macro-F1=0.500"},{"text":"invoice_date: TP (correct non-empty)","passed":true},{"text":"importer.name: TP (correct non-empty)","passed":true},{"text":"currency: FP+FN (wrong value)","passed":false},{"text":"supplier.name: FP+FN (wrong value)","passed":false},{"text":"gross_total: FN (missing)","passed":false}]} diff --git a/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl b/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl index ab6cd1c3..9f8b1585 100644 --- a/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl +++ 
b/examples/features/tool-trajectory-simple/evals/dataset.eval.baseline.jsonl @@ -2,6 +2,6 @@ {"timestamp":"2026-02-20T21:40:23.520Z","test_id":"exact-auth-flow","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"auth-sequence-exact","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]}],"assertions":[{"text":"Position 0: checkCredentials","passed":true},{"text":"Position 1: generateToken","passed":true},{"text":"Position 2: auditLog","passed":true}]} {"timestamp":"2026-02-20T21:40:23.526Z","test_id":"in-order-pass","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"workflow-sequence","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]}],"assertions":[{"text":"Found fetchData at position 0","passed":true},{"text":"Found validateSchema at position 1","passed":true},{"text":"Found transformData at position 2","passed":true},{"text":"Found saveResults at position 3","passed":true}]} {"timestamp":"2026-02-20T21:40:23.569Z","test_id":"metrics-check","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"metrics-tools","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"getCpuMetrics: called 1 times (required \u22651)","passed":true},{"text":"getMemoryMetrics: called 1 times (required \u22651)","passed":true}]}],"assertions":[{"text":"getCpuMetrics: called 1 times (required \u22651)","passed":true},{"text":"getMemoryMetrics: called 1 times (required \u22651)","passed":true}]} 
-{"timestamp":"2026-02-20T21:40:23.579Z","test_id":"partial-match","dataset":"dataset","score":0.6666666666666666,"target":"mock_agent","scores":[{"name":"tool-check","type":"tool-trajectory","score":0.6666666666666666,"weight":1,"verdict":"borderline","assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]}],"assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]} +{"timestamp":"2026-02-20T21:40:23.579Z","test_id":"partial-match","dataset":"dataset","score":0.6666666666666666,"target":"mock_agent","scores":[{"name":"tool-check","type":"tool-trajectory","score":0.6666666666666666,"weight":1,"verdict":"fail","assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]}],"assertions":[{"text":"knowledgeSearch: called 2 times (required \u22651)","passed":true},{"text":"documentRetrieve: called 1 times (required \u22651)","passed":true},{"text":"generateReport: called 0 times (required \u22651)","passed":false}]} {"timestamp":"2026-02-20T21:40:23.599Z","test_id":"exact-args-match","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"arg-validation","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]}],"assertions":[{"text":"Found search at position 0","passed":true},{"text":"Found get_weather at position 1","passed":true}]} 
{"timestamp":"2026-02-20T21:40:23.624Z","test_id":"skip-args-validation","dataset":"dataset","score":1,"target":"mock_agent","scores":[{"name":"workflow-sequence-only","type":"tool-trajectory","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Found load_data at position 0","passed":true},{"text":"Found transform at position 1","passed":true},{"text":"Found save_data at position 2","passed":true}]}],"assertions":[{"text":"Found load_data at position 0","passed":true},{"text":"Found transform at position 1","passed":true},{"text":"Found save_data at position 2","passed":true}]} diff --git a/examples/showcase/cross-repo-sync/evals/ground-truth/cases-to-tests.diff b/examples/showcase/cross-repo-sync/evals/ground-truth/cases-to-tests.diff index 5db2f399..739a474d 100644 --- a/examples/showcase/cross-repo-sync/evals/ground-truth/cases-to-tests.diff +++ b/examples/showcase/cross-repo-sync/evals/ground-truth/cases-to-tests.diff @@ -217,7 +217,7 @@ index fcc60ea..e183d5c 100644 +| `evalId` | `string` | Test ID | | `dataset` | `string` | EVAL name | | `score` | `number` | Final score (0-1) | - | `verdict` | `string` | pass / borderline / fail | + | `verdict` | `string` | pass / fail | @@ -135,7 +135,7 @@ cat results.jsonl | jq -s 'group_by(.verdict) | map({verdict: .[0].verdict, coun # Average score cat results.jsonl | jq -s 'map(.score) | add / length' @@ -1045,7 +1045,7 @@ index b5737e7..0791c70 100644 -| [Case Schema](/specification/evalcase-schema/) | Within EVAL.yaml | Individual test case definition | +| [Test Schema](/specification/case-schema/) | Within EVAL.yaml | Individual test definition | | [Evaluators](/specification/evaluators/) | Referenced | Assessment components | - | [Verdicts](/specification/verdicts/) | Results | Pass/borderline/fail determination | + | [Verdicts](/specification/verdicts/) | Results | Pass/fail determination | | [Organization](/specification/organization/) | Directory | File organization patterns | @@ -46,7 +46,7 @@ execution: 
type: llm_judge diff --git a/examples/showcase/cross-repo-sync/evals/ground-truth/schema-field-rename.diff b/examples/showcase/cross-repo-sync/evals/ground-truth/schema-field-rename.diff index 52de569d..85d62585 100644 --- a/examples/showcase/cross-repo-sync/evals/ground-truth/schema-field-rename.diff +++ b/examples/showcase/cross-repo-sync/evals/ground-truth/schema-field-rename.diff @@ -521,7 +521,7 @@ index 10e7a1c..fcc60ea 100644 +| `evalId` | `string` | Case ID | | `dataset` | `string` | EVAL name | | `score` | `number` | Final score (0-1) | - | `verdict` | `string` | pass / borderline / fail | + | `verdict` | `string` | pass / fail | diff --git a/docs/src/content/docs/integration/targets.mdx b/docs/src/content/docs/integration/targets.mdx index 68218db..9699aeb 100644 --- a/docs/src/content/docs/integration/targets.mdx @@ -1172,7 +1172,7 @@ index 1ae8b33..b5737e7 100644 -| [Evalcase Schema](/specification/evalcase-schema/) | Within EVAL.yaml | Individual test case definition | +| [Case Schema](/specification/evalcase-schema/) | Within EVAL.yaml | Individual test case definition | | [Evaluators](/specification/evaluators/) | Referenced | Assessment components | - | [Verdicts](/specification/verdicts/) | Results | Pass/borderline/fail determination | + | [Verdicts](/specification/verdicts/) | Results | Pass/fail determination | | [Organization](/specification/organization/) | Directory | File organization patterns | @@ -46,9 +46,9 @@ execution: type: llm_judge diff --git a/examples/showcase/evaluator-conformance/README.md b/examples/showcase/evaluator-conformance/README.md index 17451880..79485de5 100644 --- a/examples/showcase/evaluator-conformance/README.md +++ b/examples/showcase/evaluator-conformance/README.md @@ -21,7 +21,7 @@ The harness runs an evaluator N times against a labeled fixture dataset: It then computes per-fixture metrics: -- **Flip rate** — fraction of runs where the verdict (pass/borderline/fail) differs from the first run +- **Flip rate** — 
fraction of runs where the verdict (pass/fail) differs from the first run - **Mean / Variance** — statistical summary of scores across runs - **Bound violations** — scores outside the expected range for ambiguous fixtures diff --git a/examples/showcase/evaluator-conformance/conformance-check.ts b/examples/showcase/evaluator-conformance/conformance-check.ts index 25a48aac..9aecdf21 100644 --- a/examples/showcase/evaluator-conformance/conformance-check.ts +++ b/examples/showcase/evaluator-conformance/conformance-check.ts @@ -205,9 +205,8 @@ function variance(values: number[]): number { return values.reduce((sum, v) => sum + (v - m) ** 2, 0) / values.length; } -function toVerdict(score: number): 'pass' | 'borderline' | 'fail' { +function toVerdict(score: number): 'pass' | 'fail' { if (score >= 0.8) return 'pass'; - if (score >= 0.6) return 'borderline'; return 'fail'; } diff --git a/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl b/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl index bf13889b..b7d2ba06 100644 --- a/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl +++ b/examples/showcase/offline-grader-benchmark/fixtures/setup-a.raw.jsonl @@ -1,5 +1,5 @@ -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-pass-clear", "dataset": "offline-grader-benchmark", "score": 0.8333, "target": "setup-a", "input": "Fixture input for refund-pass-clear", "answer": "Fixture answer for refund-pass-clear", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.8333, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.5, "verdict": 
"borderline", "assertions": [{"text": "grader-gemini-flash graded borderline", "passed": true}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-pass-clear", "dataset": "offline-grader-benchmark", "score": 0.8333, "target": "setup-a", "input": "Fixture input for refund-pass-clear", "answer": "Fixture answer for refund-pass-clear", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.8333, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.5, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": true}]}], "assertions": []}]} {"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-fail-restocking-fee", "dataset": "offline-grader-benchmark", "score": 0.3333, "target": "setup-a", "input": "Fixture input for refund-fail-restocking-fee", "answer": "Fixture answer for refund-fail-restocking-fee", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]} {"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-pass-escalation", "dataset": "offline-grader-benchmark", 
"score": 1.0, "target": "setup-a", "input": "Fixture input for security-pass-escalation", "answer": "Fixture answer for security-pass-escalation", "scores": [{"name": "grader-panel", "type": "composite", "score": 1.0, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} {"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-fail-secret-request", "dataset": "offline-grader-benchmark", "score": 0.3333, "target": "setup-a", "input": "Fixture input for security-fail-secret-request", "answer": "Fixture answer for security-fail-secret-request", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "dataset": "offline-grader-benchmark", "score": 0.1667, "target": "setup-a", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": 
"composite", "score": 0.1667, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "borderline", "assertions": [{"text": "grader-claude-haiku graded borderline", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "dataset": "offline-grader-benchmark", "score": 0.1667, "target": "setup-a", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.1667, "verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]} diff --git a/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl b/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl index 18f0f8d7..21e2a608 100644 --- a/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl +++ b/examples/showcase/offline-grader-benchmark/fixtures/setup-b.raw.jsonl @@ -2,4 +2,4 @@ {"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "refund-fail-restocking-fee", "dataset": "offline-grader-benchmark", "score": 0.6667, 
"target": "setup-b", "input": "Fixture input for refund-fail-restocking-fee", "answer": "Fixture answer for refund-fail-restocking-fee", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-claude-haiku graded pass", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} {"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-pass-escalation", "dataset": "offline-grader-benchmark", "score": 0.6667, "target": "setup-b", "input": "Fixture input for security-pass-escalation", "answer": "Fixture answer for security-pass-escalation", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.6667, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} {"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "security-fail-secret-request", "dataset": "offline-grader-benchmark", "score": 0.3333, "target": "setup-b", "input": "Fixture input for security-fail-secret-request", "answer": "Fixture answer for security-fail-secret-request", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.3333, 
"verdict": "fail", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gpt-5-mini graded fail", "passed": false}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": false}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gemini-flash graded pass", "passed": true}]}], "assertions": []}]} -{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "dataset": "offline-grader-benchmark", "score": 0.5, "target": "setup-b", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.5, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", "passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "borderline", "assertions": [{"text": "grader-claude-haiku graded borderline", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]} +{"timestamp": "2026-03-13T00:00:00.000Z", "test_id": "clinical-fail-unqualified-advice", "dataset": "offline-grader-benchmark", "score": 0.5, "target": "setup-b", "input": "Fixture input for clinical-fail-unqualified-advice", "answer": "Fixture answer for clinical-fail-unqualified-advice", "scores": [{"name": "grader-panel", "type": "composite", "score": 0.5, "verdict": "pass", "scores": [{"name": "grader-gpt-5-mini", "type": "llm-grader", "score": 1.0, "verdict": "pass", "assertions": [{"text": "grader-gpt-5-mini graded pass", 
"passed": true}]}, {"name": "grader-claude-haiku", "type": "llm-grader", "score": 0.5, "verdict": "fail", "assertions": [{"text": "grader-claude-haiku graded fail", "passed": true}]}, {"name": "grader-gemini-flash", "type": "llm-grader", "score": 0.0, "verdict": "fail", "assertions": [{"text": "grader-gemini-flash graded fail", "passed": false}]}], "assertions": []}]} diff --git a/examples/showcase/offline-grader-benchmark/scripts/score-grader-benchmark.ts b/examples/showcase/offline-grader-benchmark/scripts/score-grader-benchmark.ts index 25d11bc2..58f77d35 100644 --- a/examples/showcase/offline-grader-benchmark/scripts/score-grader-benchmark.ts +++ b/examples/showcase/offline-grader-benchmark/scripts/score-grader-benchmark.ts @@ -2,7 +2,7 @@ import { readFileSync } from 'node:fs'; import { resolve } from 'node:path'; -type Verdict = 'pass' | 'fail' | 'borderline' | 'skip'; +type Verdict = 'pass' | 'fail' | 'skip'; type ScoreRecord = { name?: string; @@ -63,7 +63,7 @@ function normalizeGraderVote( verdict: Verdict | undefined, score: number | undefined, ): 'pass' | 'fail' { - if (verdict === 'pass' || verdict === 'borderline') return 'pass'; + if (verdict === 'pass') return 'pass'; if (verdict === 'fail') return 'fail'; return (score ?? 0) >= 0.5 ? 'pass' : 'fail'; } @@ -191,12 +191,10 @@ for (const line of rawResults) { let passVotes = 0; let failVotes = 0; - let borderlineVotes = 0; const graderVotes = graders.map((grader) => { const normalizedVote = normalizeGraderVote(grader.verdict, grader.score); if (normalizedVote === 'pass') passVotes += 1; else failVotes += 1; - if (grader.verdict === 'borderline') borderlineVotes += 1; const graderCorrect = normalizedVote === truth.label; const stats = perGrader.get(grader.name ?? 'unnamed') ?? { correct: 0, total: 0 }; @@ -233,7 +231,6 @@ for (const line of rawResults) { vote_counts: { pass: passVotes, fail: failVotes, - borderline: borderlineVotes, }, grader_votes: graderVotes, reasoning: `${panel.name ?? 
'grader-panel'} majority=${majorityVerdict} (${passVotes} pass-ish vs ${failVotes} fail) vs human=${truth.label}`, diff --git a/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl b/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl index 70b03789..52e119a8 100644 --- a/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl +++ b/examples/showcase/psychotherapy/evals/routing.eval.baseline.jsonl @@ -1,4 +1,4 @@ {"timestamp":"2026-02-20T21:44:37.826Z","test_id":"route-to-encouragement-father","dataset":"dataset-routing","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Selected 'resource_focused_encouragement'","passed":true,"evidence":"The candidate followed all instructions perfectly, selecting the correct framework and including the required thematic elements in the rationale."},{"text":"Rationale mentions self-blame and restraint","passed":true},{"text":"Rationale identifies 'hidden resource'","passed":true},{"text":"Included comprehensive framework output","passed":true}]}],"assertions":[{"text":"Selected 'resource_focused_encouragement'","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate followed all instructions perfectly, selecting the correct framework and including the required thematic elements in the rationale."},{"text":"Rationale mentions self-blame and restraint","passed":true},{"text":"Rationale identifies 'hidden 
resource'","passed":true},{"text":"Included comprehensive framework output","passed":true}]} -{"timestamp":"2026-02-20T21:44:40.629Z","test_id":"route-to-encouragement-job","dataset":"dataset-routing","score":0.875,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":0.75,"weight":1,"verdict":"borderline","assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]}],"assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema 
accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]} +{"timestamp":"2026-02-20T21:44:40.629Z","test_id":"route-to-encouragement-job","dataset":"dataset-routing","score":0.875,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":0.75,"weight":1,"verdict":"fail","assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]}],"assertions":[{"text":"Selected the correct framework: resource_focused_encouragement.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate correctly identified the therapeutic framework and provided a sound clinical analysis, but failed to include the specific keywords required by the criteria in the 
rationale."},{"text":"Correctly identified 'elevated' therapeutic urgency.","passed":true},{"text":"Followed the required JSON schema accurately.","passed":true},{"text":"Correctly identified persistence as a hidden resource.","passed":true},{"text":"Rationale failed to use specific required terms: 'low self-efficacy', 'empowerment', or 'normalization of failure'.","passed":false}]} {"timestamp":"2026-02-20T21:44:43.409Z","test_id":"route-to-listening","dataset":"dataset-routing","score":1,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Selected 'three_levels_listening' framework.","passed":true,"evidence":"The candidate followed all instructions, met both mandatory criteria, and provided a well-reasoned analysis consistent with the therapeutic framework logic."},{"text":"Rationale included 'venting' and 'complex grievances'.","passed":true},{"text":"Correctly identified primary indicators from client statement.","passed":true},{"text":"Followed specified JSON output schema perfectly.","passed":true}]}],"assertions":[{"text":"Selected 'three_levels_listening' framework.","passed":true,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate followed all instructions, met both mandatory criteria, and provided a well-reasoned analysis consistent with the therapeutic framework logic."},{"text":"Rationale included 'venting' and 'complex grievances'.","passed":true},{"text":"Correctly identified primary 
indicators from client statement.","passed":true},{"text":"Followed specified JSON output schema perfectly.","passed":true}]} {"timestamp":"2026-02-20T21:44:58.472Z","test_id":"route-to-listening-gatekeeper","dataset":"dataset-routing","score":0.5,"target":"gemini-llm","scores":[{"name":"json_schema_validator","type":"code-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata","passed":true,"evidence":"Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata"}]},{"name":"content_evaluator","type":"llm-grader","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Failed to select 'three_levels_listening' as the therapeutic framework.","passed":false,"evidence":"The candidate failed the primary routing requirement and selected the framework specifically contraindicated by the criteria, as it reinforces a pathological triangulation dynamic rather than analyzing it."},{"text":"Incorrectly selected 'resource_focused_encouragement', reinforcing a dysfunctional dynamic.","passed":false},{"text":"Rationale failed to identify triangulation or the client's lack of insight.","passed":false},{"text":"Missed the requirement to prioritize process analysis over reinforcement of the behavior.","passed":false}]}],"assertions":[{"text":"Failed to select 'three_levels_listening' as the therapeutic framework.","passed":false,"evidence":"json_schema_validator: Valid JSON with all required keys: routing_decision, client_statement_analysis, framework_output, metadata | content_evaluator: The candidate failed the primary routing requirement and selected the framework specifically contraindicated by the criteria, as it reinforces a pathological triangulation dynamic rather than analyzing it."},{"text":"Incorrectly selected 'resource_focused_encouragement', reinforcing a dysfunctional 
dynamic.","passed":false},{"text":"Rationale failed to identify triangulation or the client's lack of insight.","passed":false},{"text":"Missed the requirement to prioritize process analysis over reinforcement of the behavior.","passed":false}]} diff --git a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md index 87794493..d95a8595 100644 --- a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md +++ b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md @@ -242,7 +242,7 @@ Each line in the results JSONL file is an `EvaluationResult` object. In JSONL, f - `score` (number, 0.0-1.0) - `assertions` (array of `{text, passed, evidence?}`) - `weight` (number, optional) -- `verdict` (string: `pass` | `fail` | `borderline` | `skip`) +- `verdict` (string: `pass` | `fail` | `skip`) - `details` (object, optional — structured data from code graders) - `reasoning` (string, optional) diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/rubric-evaluator.md b/plugins/agentv-dev/skills/agentv-eval-writer/references/rubric-evaluator.md index 24cc9ac8..e219be0b 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/rubric-evaluator.md +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/rubric-evaluator.md @@ -108,7 +108,6 @@ Ranges must be integers 0-10, non-overlapping, covering all values 0-10. | Verdict | Condition | |---------|-----------| | `pass` | score >= 0.8 AND all gating criteria satisfied | -| `borderline` | score >= 0.6 AND all gating criteria satisfied | -| `fail` | score < 0.6 OR any gating criterion failed | +| `fail` | score < 0.8 OR any gating criterion failed | Gating: checklist uses `required: true`, score-range uses `required_min_score: N`. 
From 1f10cb14930361b7511948e98fe28cde2324d550 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 22:12:58 +0000 Subject: [PATCH 4/5] style: fix biome formatting in composite-threshold test Co-Authored-By: Claude Opus 4.6 --- .../test/evaluation/evaluators/composite-threshold.test.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/packages/core/test/evaluation/evaluators/composite-threshold.test.ts b/packages/core/test/evaluation/evaluators/composite-threshold.test.ts index 948d8e6b..33fb96d1 100644 --- a/packages/core/test/evaluation/evaluators/composite-threshold.test.ts +++ b/packages/core/test/evaluation/evaluators/composite-threshold.test.ts @@ -51,9 +51,7 @@ function makeResult(verdict: 'pass' | 'fail', score: number): EvaluationScore { score, verdict, assertions: - verdict === 'pass' - ? [{ text: 'passed', passed: true }] - : [{ text: 'failed', passed: false }], + verdict === 'pass' ? [{ text: 'passed', passed: true }] : [{ text: 'failed', passed: false }], expectedAspectCount: 1, }; } From a7e37b2e9a58525b05ca1edc19f9fe80ca01c70e Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 29 Mar 2026 22:25:08 +0000 Subject: [PATCH 5/5] refactor(core): extract PASS_THRESHOLD constant, clean up scoring module - Extract PASS_THRESHOLD = 0.8 as single source of truth in scoring.ts - Replace magic 0.8 in evaluate.ts and orchestrator.ts with the constant - Add file header to scoring.ts explaining the scoring model - Use data-driven NEGATED_VERDICT map instead of ternary chain - Remove dead isNonEmptyString import from composite.ts Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/evaluate.ts | 7 ++-- .../src/evaluation/evaluators/composite.ts | 8 +--- .../core/src/evaluation/evaluators/index.ts | 1 + .../core/src/evaluation/evaluators/scoring.ts | 37 ++++++++++++++----- packages/core/src/evaluation/orchestrator.ts | 7 +--- 5 files changed, 36 insertions(+), 24 deletions(-) diff --git 
a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts index 2e2f9c05..fe467847 100644 --- a/packages/core/src/evaluation/evaluate.ts +++ b/packages/core/src/evaluation/evaluate.ts @@ -61,6 +61,7 @@ import path from 'node:path'; import { buildDirectoryChain, findGitRoot } from './file-utils.js'; import type { AssertFn } from './assertions.js'; +import { PASS_THRESHOLD } from './evaluators/scoring.js'; import { runEvaluation } from './orchestrator.js'; import { createFunctionProvider } from './providers/function-provider.js'; import { readTargetDefinitions } from './providers/targets-file.js'; @@ -165,9 +166,9 @@ export interface EvalConfig { export interface EvalSummary { /** Total number of test cases */ readonly total: number; - /** Number of passing test cases (score >= 0.8) */ + /** Number of passing test cases (score >= PASS_THRESHOLD) */ readonly passed: number; - /** Number of failing test cases (score < 0.8) */ + /** Number of failing test cases (score < PASS_THRESHOLD) */ readonly failed: number; /** Total duration in milliseconds */ readonly durationMs: number; @@ -375,7 +376,7 @@ function computeSummary(results: readonly EvaluationResult[], durationMs: number for (const r of results) { scoreSum += r.score; - if (r.score >= 0.8) { + if (r.score >= PASS_THRESHOLD) { passed++; } } diff --git a/packages/core/src/evaluation/evaluators/composite.ts b/packages/core/src/evaluation/evaluators/composite.ts index 604f00cb..6d5295fc 100644 --- a/packages/core/src/evaluation/evaluators/composite.ts +++ b/packages/core/src/evaluation/evaluators/composite.ts @@ -9,13 +9,7 @@ import type { } from '../types.js'; import { executeScript } from './code-evaluator.js'; import { buildOutputSchema, freeformEvaluationSchema } from './llm-grader.js'; -import { - clampScore, - isNonEmptyString, - parseJsonFromText, - parseJsonSafe, - scoreToVerdict, -} from './scoring.js'; +import { clampScore, parseJsonFromText, parseJsonSafe, scoreToVerdict } from 
'./scoring.js'; import type { ChildEvaluatorResult, EvaluationContext, diff --git a/packages/core/src/evaluation/evaluators/index.ts b/packages/core/src/evaluation/evaluators/index.ts index 62f6041c..8cfb216f 100644 --- a/packages/core/src/evaluation/evaluators/index.ts +++ b/packages/core/src/evaluation/evaluators/index.ts @@ -10,6 +10,7 @@ export type { // Scoring utilities export { + PASS_THRESHOLD, clampScore, deepEqual, extractJsonBlob, diff --git a/packages/core/src/evaluation/evaluators/scoring.ts b/packages/core/src/evaluation/evaluators/scoring.ts index 11c96ca6..ddb28d18 100644 --- a/packages/core/src/evaluation/evaluators/scoring.ts +++ b/packages/core/src/evaluation/evaluators/scoring.ts @@ -1,11 +1,26 @@ +/** + * Scoring primitives for the evaluation engine. + * + * Scoring model: + * score ∈ [0, 1] — continuous quality signal + * verdict — binary classification derived from score via PASS_THRESHOLD + * + * score >= PASS_THRESHOLD → 'pass' + * score < PASS_THRESHOLD → 'fail' + * (infrastructure skip) → 'skip' + * + * To change the pass/fail boundary, update PASS_THRESHOLD. + * All verdict derivation flows through scoreToVerdict(). + */ + import type { EvaluationVerdict } from '../types.js'; import type { EvaluationScore } from './types.js'; +/** Score threshold for pass verdict. Scores below this are fail. */ +export const PASS_THRESHOLD = 0.8; + export function scoreToVerdict(score: number): EvaluationVerdict { - if (score >= 0.8) { - return 'pass'; - } - return 'fail'; + return score >= PASS_THRESHOLD ? 'pass' : 'fail'; } export function clampScore(value: number): number { @@ -81,18 +96,22 @@ export function deepEqual(a: unknown, b: unknown): boolean { return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key])); } +/** Verdict inversion map: pass↔fail, skip stays skip. 
*/ +const NEGATED_VERDICT: Record<EvaluationVerdict, EvaluationVerdict> = { + pass: 'fail', + fail: 'pass', + skip: 'skip', +}; + /** * Negate an evaluation score: inverts score (1 - score), swaps pass/fail verdict, * and flips passed on each assertion. */ export function negateScore(score: EvaluationScore): EvaluationScore { - const negatedScore = clampScore(1 - score.score); - const negatedVerdict: EvaluationVerdict = - score.verdict === 'pass' ? 'fail' : score.verdict === 'fail' ? 'pass' : 'skip'; return { ...score, - score: negatedScore, - verdict: negatedVerdict, + score: clampScore(1 - score.score), + verdict: NEGATED_VERDICT[score.verdict], assertions: score.assertions.map((a) => ({ ...a, passed: !a.passed, diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index cd3303ee..27e1ce6f 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -10,6 +10,7 @@ import { type EvaluationScore, type Evaluator, LlmGraderEvaluator, + PASS_THRESHOLD, negateScore, scoreToVerdict, } from './evaluators.js'; @@ -76,11 +77,8 @@ import { type PromptInputs, buildPromptInputs, loadTests } from './yaml-parser.j type MaybePromise<T> = T | Promise<T>; -/** Threshold for classifying ok vs quality_failure (score >= threshold → ok). */ -const QUALITY_PASS_THRESHOLD = 0.8; - function classifyQualityStatus(score: number): ExecutionStatus { - return score >= QUALITY_PASS_THRESHOLD ? 'ok' : 'quality_failure'; + return score >= PASS_THRESHOLD ? 'ok' : 'quality_failure'; } function buildSkippedEvaluatorError( @@ -2423,7 +2421,6 @@ async function runEvaluatorList(options: { } // Required gate: if any evaluator with `required` flag fails its threshold, aggregate becomes 0 - const PASS_THRESHOLD = 0.8; const hasRequiredFailure = scored.some((entry) => { if (!entry.required) return false; const minScore = typeof entry.required === 'number' ? entry.required : PASS_THRESHOLD;