diff --git a/CHANGELOG.md b/CHANGELOG.md index 92c9a909..12b6c0ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,18 @@ ## [Unreleased] +### Phase 7 — A/B eval: crew vs single-call on goldens (AI-046) (2026-06-16) + +The Phase 7 DoD gate that answers "does the four-agent crew actually earn its orchestration?" — it A/B's the full `FieldCrew` (researcher→drafter→critic→editor) against **one** single LLM call that writes the field directly, on the same brief+source, and gates on a meaningful quality **lift** AND a bounded **cost ratio**. Backend core only; the admin "Run A/B" button is a deliberate fast-follow. + +- **Honest baseline (A) — same brief contract, only orchestration differs.** New `Application/Agents/BaselineFieldAgent.cs` (`SingleCallAgent`, FeatureTag `crew.baseline`): one gateway call whose system prompt folds the brief's FULL contract (MinLength/MaxLength, BannedPhrases, StyleGuide, TargetLanguage) via the shared `BriefConstraints` the drafter/critic/editor already use — so A is held to the identical rubric the crew enforces; the user prompt is the raw source. New `Prompts/BaselineFieldPrompt.cs` + `record BaselineInput(ContentBrief Brief, string SourceMaterial)`. **A and B run the SAME generator model (nano)** — the eval isolates orchestration, not model. +- **Independent judge, no label leakage.** A stronger judge (gpt-4.1 via `Eval:JudgeModel`, the dedicated `openai-judge` keyed provider) scores each candidate **absolutely** on a 3-axis 1-5 prose rubric (grounding / tone / completeness) through the shared `RubricEvaluator`. A and B are judged in **separate** calls with the **same** rubric, and the judge prompt carries only the source + an anonymous candidate — it never learns "single-call" vs "crew". +- **Metrics + gate** (`Ai.EvalSuite/CrewAbEvalRunner.cs`): per fixture `judgeScoreA/judgeScoreB (0-5)`, `costA/costB`, `bWins`. Aggregate `avgA`, `avgB`, `liftPct = (avgB-avgA)/avgA`, `costRatio = sumCostB/sumCostA`, `winRate`. 
`Passed = liftPct >= 0.10 && costRatio <= 2.0` — **the cost gate is independent of lift** (better-but-too-expensive still fails). Div-by-zero guarded: `avgA==0 → lift 0`; `sumCostA==0 → costRatio +inf → fails`. **A's cost** = the single call's `LlmResponse.Usage.CostUsd`; **B's cost** = the crew run's TOTAL (sum of the 4 sub-agent usages) — surfaced honestly via a new additive `FieldResult.CostUsd` (populated from `CrewResult.Usage.CostUsdTotal`; no other caller churns). +- **Crew halt → B scores 0.** If the crew halts before the editor (null `EditedText`), B is judged 0 for that case (no prose to judge); `NeedsReview==true` does NOT zero B — quality A/B is separate from the review gate, so a flagged-but-present edit is still judged. +- **Golden set** (`Datasets/crew_ab.json`, auto-embedded via the existing `Datasets/*.json` glob): **N=10 edition/description** fixtures (realistic title+author+excerpt-style book source, varied — Frankenstein, Dracula, Moby-Dick, …) resolved to a `ContentBrief` via `SeoBriefs.For("edition","description","en")`. `// TODO grow to 50`. The gate runs on whatever N is present. New `CrewAbGolden` record + `CrewAbGoldenSet.Load()` (mirrors `CriticDefectGoldenSet`). +- **Reuses, no schema change**: persists a `crew_ab` `EvalRun` (Feature=`crew_ab`, Score=round(liftPct,3), JudgeModelId=judge model, BreakdownJson = `{avgA, avgB, liftPct, costRatio, winRate, n, perFixture[]}`). Endpoint `POST /admin/ai-quality/evals/crew-ab/run` mirrors `studybuddy`/`criticdefects`: resolve the gateway `ILlmService` (503 if no key), the keyed `openai-judge` + `Eval:JudgeModel`, build `BaselineFieldAgent` + resolve `FieldCrew` from scope, run + persist, return `{ avgA, avgB, liftPct, costRatio, winRate, n, passed, cases }`. Structurally mirrors `CriticDefectEvalRunner` (AI-044). 
+- **Tests** (`tests/TextStack.AiEvals/CrewAbEvalRunnerTests.cs`, deterministic, fake-gen + fake-judge, **NO key, CI**): a fake generator routed by FeatureTag (`crew.baseline`→A text+cost; `crew.drafter`/`critic`/`editor`/`researcher`→B sub-agent texts+cost) + a fake judge returning canned scores keyed on which candidate marker is in the prompt, run **through the real `RubricEvaluator`** + the reused `IAgentRunWriter`/`CapturingDb` fakes. Covers: `BFarBetter_Passes` (lift≈0.67 ∧ ratio≤2), `BNotBetter_Fails` (lift≈0), `BBetterButTooExpensive_Fails` (cost gate alone fails despite lift), `BWorse_NegativeLift_Fails`, `CrewHalted_NullEditedText_ScoresZero`, `Persist_WritesCrewAbEvalRun` (Feature/Score/BreakdownJson shape/N/JudgeModelId). No `ITool` introduced — the StudyBuddy set-equality test stays green. + ### Phase 7 — admin "view transcript" UI for crew/agent runs (AI-045) (2026-06-15) Makes the multi-agent reasoning chain inspectable. Every crew run (`crew.autopublish`/`crew.seo`) and single-agent run (`studybuddy`) already persists an `agent_run` row with a nested-step transcript (researcher→drafter→critic→editor); AI-045 surfaces it in the admin app. **No schema change, no new table** — read-only over the existing Phase 6 `agent_run`. 
diff --git a/apps/admin/src/api/client.ts b/apps/admin/src/api/client.ts index d43db026..34f8db5e 100644 --- a/apps/admin/src/api/client.ts +++ b/apps/admin/src/api/client.ts @@ -546,6 +546,16 @@ export interface CriticDefectEvalResult { parseFailed: boolean }[] } +export interface CrewAbEvalResult { + avgA: number + avgB: number + liftPct: number + costRatio: number + winRate: number + n: number + passed: boolean + cases?: unknown[] +} async function fetchJson(path: string, init?: RequestInit): Promise { const res = await fetch(`${API_BASE}${path}`, { @@ -1182,6 +1192,12 @@ export const adminApi = { }) }, + runCrewAbEval: async (): Promise => { + return fetchJson('/admin/ai-quality/evals/crew-ab/run', { + method: 'POST', + }) + }, + // Podcasts generatePodcast: async (editionId: string, lang?: string, force?: boolean): Promise => { return fetchJson('/admin/podcasts', { diff --git a/apps/admin/src/pages/AiQualityPage.tsx b/apps/admin/src/pages/AiQualityPage.tsx index de5a3ee5..91c2f2c2 100644 --- a/apps/admin/src/pages/AiQualityPage.tsx +++ b/apps/admin/src/pages/AiQualityPage.tsx @@ -10,6 +10,7 @@ import { AgentRunDetail, EvalRun, CriticDefectEvalResult, + CrewAbEvalResult, } from '../api/client' type Tab = 'summary' | 'traces' | 'transcripts' | 'evals' @@ -682,6 +683,8 @@ function EvalsTab() { const [running, setRunning] = useState(false) const [criticRunning, setCriticRunning] = useState(false) const [criticResult, setCriticResult] = useState(null) + const [crewAbRunning, setCrewAbRunning] = useState(false) + const [crewAbResult, setCrewAbResult] = useState(null) const load = () => adminApi @@ -732,6 +735,19 @@ function EvalsTab() { } } + const runCrewAb = async () => { + setError(null) + setCrewAbRunning(true) + try { + setCrewAbResult(await adminApi.runCrewAbEval()) + load() // persisted as a crew_ab eval_run → refresh history + } catch (e) { + setError(e instanceof Error ? 
e.message : 'Failed to run crew A/B eval') + } finally { + setCrewAbRunning(false) + } + } + const controls = (
@@ -754,6 +770,14 @@ function EvalsTab() { Injects known defects into clean drafts, runs the real nano critic (~23 calls, 20–30s), gate ≥ 0.80 catch-rate.
+
+ + + A/B-tests the crew vs a single-call baseline over the goldens (~1–2 min), gate lift ≥ 10% and cost ratio ≤ 2×. + +
{criticResult && (
@@ -773,6 +797,32 @@ function EvalsTab() {
)} + {crewAbResult && ( +
+
+ Crew A/B eval + + {crewAbResult.passed ? 'PASS' : 'FAIL'} + +
+
+ + 2 ? '#dc2626' : undefined} + /> + + + + +
+
+ )}
) diff --git a/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbEvalRunner.cs b/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbEvalRunner.cs new file mode 100644 index 00000000..f07bded1 --- /dev/null +++ b/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbEvalRunner.cs @@ -0,0 +1,242 @@ +using System.Globalization; +using System.Text; +using Application.Agents; +using Application.Common.Interfaces; +using Domain.Entities; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.AI.Evaluation; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using TextStack.Ai.Core; +using TextStack.Ai.Evals; +using TextStack.Ai.Llm; + +namespace TextStack.Ai.EvalSuite; + +/// +/// One fixture's A/B outcome (AI-046). JudgeScoreA / JudgeScoreB are the independent +/// judge's 0-5 means for the single-call baseline (A) and the full crew (B); a crew that halted before the +/// editor (null EditedText) scores B = 0. CostA / CostB are the honest USD costs +/// (A = the single call's usage; B = the crew run's total across its four sub-agents). BWins is +/// true when B's judge mean strictly beats A's. +/// +public sealed record CrewAbCase( + string Id, double JudgeScoreA, double JudgeScoreB, decimal CostA, decimal CostB, bool BWins); + +/// +/// Result of a crew-vs-single-call A/B run (AI-046, Phase 7 DoD gate). LiftPct is the headline +/// metric: the crew's gain in mean judge score relative to the baseline's, (AvgB - AvgA) / AvgA, as a fraction. CostRatio is the +/// crew's total spend over the baseline's. The gate demands BOTH a meaningful quality lift AND a bounded cost +/// multiple — a crew that is better but ruinously expensive does not earn its orchestration. 
+/// +public sealed record CrewAbEvalResult( + double AvgA, + double AvgB, + double LiftPct, + double CostRatio, + double WinRate, + int N, + bool Passed, + IReadOnlyList Cases); + +/// +/// Runs the Phase 7 crew-vs-single-call A/B eval (AI-046): for each golden, send the SAME brief + source +/// through (A) the single-call BaselineFieldAgent that writes the field directly and (B) the full +/// FieldCrew (researcher → drafter → critic → editor). Both arms use the SAME generator model +/// (nano) so only ORCHESTRATION differs, not the rubric. An independent, STRONGER judge (gpt-4.1 via +/// Eval:JudgeModel) scores each candidate absolutely on a 3-axis 1-5 rubric (grounding / tone / +/// completeness) — A and B in SEPARATE calls with the SAME rubric, and the judge NEVER learns which arm a +/// candidate came from (no "single-call" vs "crew" label leakage). Reports crew lift % and cost ratio and +/// gates on liftPct ≥ 0.10 AND costRatio ≤ 2.0. Persists a crew_ab EvalRun (Score = lift). +/// Mirrors CriticDefectEvalRunner: real generation per case, judged scoring, sync. +/// +public sealed class CrewAbEvalRunner(ILogger logger) +{ + private const string Feature = "crew_ab"; + + /// Minimum crew quality lift over the single-call baseline (fraction) the orchestration must earn. + public const double LiftGate = 0.10; + + /// Maximum crew cost as a multiple of the single-call baseline's — keeps the lift honest about spend. + public const double CostRatioGate = 2.0; + + // Absolute prose rubric. Axis labels (text before ':') become the RubricEvaluator metric suffixes the + // judge scores 1-5; the judge sees only an anonymous candidate + the source — never the arm it came from. 
+ private static readonly Rubric Rubric = new( + "grounding: every claim is supported by the source material; no invented facts.", + "tone: factual, encyclopedic, third-person prose with no subjective superlatives or marketing language.", + "completeness: covers the field within its length bounds, reads clearly, and reads well as a whole."); + + private static readonly ChatMessage[] JudgePlaceholderMessages = [new ChatMessage(ChatRole.User, string.Empty)]; + + public async Task RunAsync( + BaselineFieldAgent baseline, + FieldCrew crew, + ILlmService judge, + string judgeModelId, + bool persist, + IAppDbContext? db, + string? gitSha, + CancellationToken ct) + { + var goldens = CrewAbGoldenSet.Load(); + var chatConfig = new ChatConfiguration(new LlmServiceChatClient(judge, defaultFeatureTag: "eval.judge")); + + // The crew's CrewTasks.Of opens a child DI scope per stage (ToolDispatcher per-invocation rule), so its + // AgentContext.Services must expose IServiceScopeFactory. An empty container provides exactly that and + // resolves no tools — the crew specialists are single ILlmService calls. The baseline ignores Services. + using var crewServices = new ServiceCollection().BuildServiceProvider(); + + var cases = new List(); + var sumA = 0.0; + var sumB = 0.0; + var sumCostA = 0m; + var sumCostB = 0m; + var bWins = 0; + + foreach (var g in goldens) + { + ct.ThrowIfCancellationRequested(); + + var brief = SeoBriefs.For(g.EntityType, g.FieldName, g.TargetLanguage); + + // A — single LLM call. SingleCallAgent ignores ctx.Services (no tools); an empty provider suffices. + var ctxA = new AgentContext(UserId: null, EditionId: null, Guid.NewGuid(), EmptyServiceProvider.Instance); + var aResult = await baseline.RunAsync(new BaselineInput(brief, g.SourceMaterial), ctxA, ct); + var textA = aResult.Output.Text; + var costA = aResult.Usage.CostUsdTotal; + + // B — the full crew. Cost is the run total (sum of the 4 sub-agent usages), surfaced on FieldResult. 
+ var ctxB = new AgentContext(UserId: null, EditionId: null, Guid.NewGuid(), crewServices); + var bResult = await crew.RunFieldAsync( + "crew_ab", "edition.description", AutoPublishCrew.CostCapUsd, brief, g.SourceMaterial, ctxB, ct); + var textB = bResult.EditedText; // null when the crew halted before the editor + var costB = bResult.CostUsd; + + // Judge each candidate against the SAME source with the SAME rubric, in SEPARATE calls. The crew's + // null edited text (halted pre-edit) scores 0 — there is no prose to judge. + var scoreA = await JudgeAsync(brief, g.SourceMaterial, textA, chatConfig, ct); + var scoreB = textB is null ? 0.0 : await JudgeAsync(brief, g.SourceMaterial, textB, chatConfig, ct); + + var won = scoreB > scoreA; + cases.Add(new CrewAbCase(g.Id, scoreA, scoreB, costA, costB, won)); + sumA += scoreA; + sumB += scoreB; + sumCostA += costA; + sumCostB += costB; + if (won) + bWins++; + } + + var n = cases.Count; + var avgA = n > 0 ? sumA / n : 0.0; + var avgB = n > 0 ? sumB / n : 0.0; + var liftPct = avgA > 0 ? (avgB - avgA) / avgA : 0.0; // avgA == 0 → no lift basis → 0 (fails gate) + var costRatio = sumCostA > 0 ? (double)(sumCostB / sumCostA) : double.PositiveInfinity; // no A spend → +inf + var winRate = n > 0 ? 
(double)bWins / n : 0.0; + var passed = liftPct >= LiftGate && costRatio <= CostRatioGate; + + logger.LogInformation( + "Crew A/B eval: avgA={AvgA:0.00} avgB={AvgB:0.00} lift={Lift:0.000} costRatio={Ratio:0.00} winRate={Win:0.00} (N={N}) passed={Passed}", + avgA, avgB, liftPct, costRatio, winRate, n, passed); + + if (persist && db is not null) + { + db.EvalRuns.Add(new EvalRun + { + Id = Guid.NewGuid(), + Feature = Feature, + ModelId = "crew.baseline-vs-crew", // both arms share the nano generator; the headline is the comparison + JudgeModelId = judgeModelId, + Score = Math.Round((decimal)liftPct, 3), + N = n, + BreakdownJson = BuildBreakdown(avgA, avgB, liftPct, costRatio, winRate, n, cases), + GitSha = gitSha, + CreatedAt = DateTimeOffset.UtcNow, + }); + await db.SaveChangesAsync(ct); + } + + return new CrewAbEvalResult(avgA, avgB, liftPct, costRatio, winRate, n, passed, cases); + } + + /// + /// Score one anonymous candidate against the source on the shared prose rubric (mean of the 3 axes, 0-5). + /// The judge prompt carries only the source + the candidate — never "single-call" or "crew" — so there is + /// no label leakage that could bias the A/B comparison. 
+ /// + private static async Task JudgeAsync( + ContentBrief brief, string source, string candidate, ChatConfiguration chatConfig, CancellationToken ct) + { + var evidence = + $"Field: the {brief.FieldName} of a {brief.EntityType} ({brief.MinLength}-{brief.MaxLength} characters, " + + $"in {brief.TargetLanguage}).\n\nSource material:\n{source}\n\nCandidate text:\n{candidate}"; + + var evaluator = new RubricEvaluator(Feature, Rubric); + var result = await evaluator.EvaluateAsync( + JudgePlaceholderMessages, + new ChatResponse(new ChatMessage(ChatRole.Assistant, candidate)), + chatConfig, + [new RubricEvidenceContext(evidence)], + ct); + + var score = new JudgeScore( + ReadAxis(result, Rubric.Dim1), ReadAxis(result, Rubric.Dim2), ReadAxis(result, Rubric.Dim3), string.Empty); + return score.Mean; + } + + private static int ReadAxis(EvaluationResult result, string dim) => + (int)Math.Round(result.Get($"{Feature}.{dim.Split(':')[0].Trim()}").Value ?? 0); + + private static string BuildBreakdown( + double avgA, double avgB, double liftPct, double costRatio, double winRate, int n, + IReadOnlyList cases) + { + var sb = new StringBuilder(); + sb.Append('{'); + sb.Append("\"avgA\":").Append(Num(avgA)).Append(','); + sb.Append("\"avgB\":").Append(Num(avgB)).Append(','); + sb.Append("\"liftPct\":").Append(Num(liftPct)).Append(','); + sb.Append("\"costRatio\":").Append(CostRatioJson(costRatio)).Append(','); + sb.Append("\"winRate\":").Append(Num(winRate)).Append(','); + sb.Append("\"n\":").Append(n).Append(','); + + sb.Append("\"perFixture\":["); + for (var i = 0; i < cases.Count; i++) + { + var c = cases[i]; + if (i > 0) + sb.Append(','); + sb.Append("{\"id\":\"").Append(Escape(c.Id)).Append("\",") + .Append("\"scoreA\":").Append(Num(c.JudgeScoreA)).Append(',') + .Append("\"scoreB\":").Append(Num(c.JudgeScoreB)).Append(',') + .Append("\"costA\":").Append(Cost(c.CostA)).Append(',') + .Append("\"costB\":").Append(Cost(c.CostB)).Append(',') + .Append("\"bWins\":").Append(c.BWins 
? "true" : "false") + .Append('}'); + } + sb.Append(']'); + + sb.Append('}'); + return sb.ToString(); + } + + // +inf serializes as JSON null (JSON has no Infinity literal) so the breakdown stays parseable when A had no spend. + private static string CostRatioJson(double ratio) => + double.IsPositiveInfinity(ratio) ? "null" : Num(ratio); + + private static string Num(double value) => + Math.Round(value, 3).ToString("0.###", CultureInfo.InvariantCulture); + + private static string Cost(decimal value) => + Math.Round(value, 6).ToString("0.######", CultureInfo.InvariantCulture); + + private static string Escape(string s) => s.Replace("\\", "\\\\").Replace("\"", "\\\""); + + /// No-service provider for the baseline arm's AgentContext.Services — the single call resolves no tools; the crew arm gets its own empty container. + private sealed class EmptyServiceProvider : IServiceProvider + { + public static readonly EmptyServiceProvider Instance = new(); + public object? GetService(Type serviceType) => null; + } +} diff --git a/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbGolden.cs b/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbGolden.cs new file mode 100644 index 00000000..c4de7416 --- /dev/null +++ b/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbGolden.cs @@ -0,0 +1,27 @@ +namespace TextStack.Ai.EvalSuite; + +/// +/// One A/B golden case (AI-046): a piece of raw book source material that BOTH the single-call baseline +/// (crew.baseline) and the full FieldCrew write the same field from. +/// The (EntityType, FieldName) pair resolves the shared ContentBrief via +/// SeoBriefs.For, so both arms enforce the identical contract — the eval isolates ORCHESTRATION, not +/// the rubric. v1 is edition/description only (N=10, TODO grow to 50). +/// +/// Id: Stable case id for the admin UI / test diagnostics. +/// EntityType: The brief's entity (v1: edition). +/// FieldName: The brief's field (v1: description). +/// TargetLanguage: The output language passed to SeoBriefs.For (v1: en). +/// SourceMaterial: Title + author + excerpt-style facts both arms ground their prose in. 
+public record CrewAbGolden( + string Id, + string EntityType, + string FieldName, + string TargetLanguage, + string SourceMaterial); + +/// Loads the embedded A/B golden set (crew_ab.json). +public static class CrewAbGoldenSet +{ + public static IReadOnlyList Load() => + GoldenLoader.Load("crew_ab.json"); +} diff --git a/backend/src/Ai/TextStack.Ai.EvalSuite/Datasets/crew_ab.json b/backend/src/Ai/TextStack.Ai.EvalSuite/Datasets/crew_ab.json new file mode 100644 index 00000000..19fc43ed --- /dev/null +++ b/backend/src/Ai/TextStack.Ai.EvalSuite/Datasets/crew_ab.json @@ -0,0 +1,72 @@ +[ + { + "id": "ab_frankenstein", + "entityType": "edition", + "fieldName": "description", + "targetLanguage": "en", + "sourceMaterial": "Title: Frankenstein; or, The Modern Prometheus\nAuthor: Mary Shelley\nFirst published: London, 1818 (anonymous); revised edition 1831.\nForm: Epistolary frame narrative told through the letters of Arctic explorer Robert Walton, who encounters Victor Frankenstein during an expedition.\nPlot facts:\n- Victor Frankenstein, a Genevan student of natural philosophy, assembles and animates a creature from dead matter.\n- The creature is rejected by its creator and by the society that recoils from its appearance.\n- The creature turns against Frankenstein; the two become bound in a mutual pursuit that drives the plot.\nContext:\n- Widely treated as an early example of science fiction.\n- Themes: ambition, responsibility, isolation, and what an inventor owes to the life he brings into being.\nExcerpt: \"It was on a dreary night of November that I beheld the accomplishment of my toils... 
I saw the dull yellow eye of the creature open.\"" + }, + { + "id": "ab_dracula", + "entityType": "edition", + "fieldName": "description", + "targetLanguage": "en", + "sourceMaterial": "Title: Dracula\nAuthor: Bram Stoker (Irish author)\nFirst published: London, 1897.\nForm: Epistolary novel assembled from letters, diary entries, and newspaper clippings rather than a single continuous narration.\nPlot facts:\n- Count Dracula, a vampire from Transylvania, relocates to England.\n- The solicitor Jonathan Harker travels to the Count's castle on business as the novel opens.\n- A group led by Professor Abraham Van Helsing forms to oppose the vampire and protect those he targets.\nContext:\n- Credited with helping define the modern vampire genre.\n- Engages period anxieties about foreign invasion and the spread of disease.\n- The layered documentary structure lets several narrators assemble one account from fragments.\nExcerpt: \"I am all in a sea of wonders. I doubt; I fear; I think strange things, which I dare not confess to my own soul.\"" + }, + { + "id": "ab_mobydick", + "entityType": "edition", + "fieldName": "description", + "targetLanguage": "en", + "sourceMaterial": "Title: Moby-Dick; or, The Whale\nAuthor: Herman Melville\nFirst published: 1851.\nNarration: A sailor who introduces himself with the words \"Call me Ishmael.\"\nPlot facts:\n- The whaling ship Pequod sails under Captain Ahab.\n- Ahab is consumed by his pursuit of a white sperm whale named Moby Dick that had maimed him on an earlier voyage.\n- The novel interleaves the chase with extended digressions on the practice and science of whaling.\nContext:\n- Early reviews were mixed and the book sold modestly during Melville's lifetime.\n- Now regarded as a major work of American literature, frequently studied for its blend of adventure, encyclopedic detail, and philosophical inquiry.\nExcerpt: \"Whenever I find myself growing grim about the mouth... 
then I account it high time to get to sea as soon as I can.\"" + }, + { + "id": "ab_prideandprejudice", + "entityType": "edition", + "fieldName": "description", + "targetLanguage": "en", + "sourceMaterial": "Title: Pride and Prejudice\nAuthor: Jane Austen\nFirst published: 1813.\nSetting: Among the landed gentry of early nineteenth-century England.\nPlot facts:\n- Elizabeth Bennet is the second of five sisters whose family's circumstances make advantageous marriage a pressing question.\n- The narrative traces Elizabeth's changing relationship with Mr. Darcy, a wealthy and reserved gentleman.\n- The initial misjudgements each forms about the other are gradually corrected.\nContext:\n- Known for its irony and its commentary on the social conventions governing courtship and class.\n- Recurring concerns: marriage, economic security, reputation, first impressions, and family standing.\nExcerpt: \"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.\"" + }, + { + "id": "ab_janeeyre", + "entityType": "edition", + "fieldName": "description", + "targetLanguage": "en", + "sourceMaterial": "Title: Jane Eyre\nAuthor: Charlotte Bronte (originally published under the pen name Currer Bell)\nFirst published: 1847.\nForm: First-person bildungsroman.\nPlot facts:\n- Jane Eyre, an orphan, endures a harsh childhood at Gateshead and the austere Lowood school before taking a post as governess at Thornfield Hall.\n- She falls in love with her employer, Edward Rochester, whose house holds a hidden secret.\n- Jane leaves Thornfield to preserve her principles and later returns on changed terms.\nContext:\n- Notable for its assertion of the moral and spiritual independence of its narrator.\n- Themes: conscience, social class, religion, and the struggle for autonomy.\nExcerpt: \"I am no bird; and no net ensnares me: I am a free human being with an independent will.\"" + }, + { + "id": "ab_greatgatsby", + "entityType": "edition", 
+ "fieldName": "description", + "targetLanguage": "en", + "sourceMaterial": "Title: The Great Gatsby\nAuthor: F. Scott Fitzgerald\nFirst published: 1925.\nSetting: Long Island and New York City during the Jazz Age.\nNarration: Told by Nick Carraway, a Midwesterner who moves east and rents a cottage next to a mansion.\nPlot facts:\n- Jay Gatsby, a wealthy man of mysterious origin, throws lavish parties in pursuit of his lost love, Daisy Buchanan.\n- Nick observes the entanglement of Gatsby, Daisy, and her husband Tom, which moves toward tragedy.\nContext:\n- Often read as a portrait of the American Dream and its disillusionment.\n- Themes: wealth, class, longing, and the gap between aspiration and reality.\nExcerpt: \"So we beat on, boats against the current, borne back ceaselessly into the past.\"" + }, + { + "id": "ab_warandpeace", + "entityType": "edition", + "fieldName": "description", + "targetLanguage": "en", + "sourceMaterial": "Title: War and Peace\nAuthor: Leo Tolstoy\nFirst published: serialized 1865-1867, complete 1869.\nScope: A long historical novel set in Russia during the Napoleonic Wars.\nPlot facts:\n- Follows several aristocratic families, among them the Bezukhovs, Bolkonskys, and Rostovs, across war and domestic life.\n- Pierre Bezukhov, Prince Andrei Bolkonsky, and Natasha Rostova are central figures whose lives are reshaped by the 1812 French invasion.\n- The narrative alternates between battlefield events and the private concerns of its characters.\nContext:\n- Famous for its scale and for Tolstoy's reflections on history and the limits of individual agency.\n- Themes: history, fate, family, and the search for meaning.\nExcerpt: \"We can know only that we know nothing. 
And that is the highest degree of human wisdom.\"" + }, + { + "id": "ab_crimeandpunishment", + "entityType": "edition", + "fieldName": "description", + "targetLanguage": "en", + "sourceMaterial": "Title: Crime and Punishment\nAuthor: Fyodor Dostoevsky\nFirst published: serialized 1866.\nSetting: Saint Petersburg.\nPlot facts:\n- Rodion Raskolnikov, an impoverished former student, murders a pawnbroker, convinced he is exempt from ordinary moral law.\n- The act plunges him into guilt, paranoia, and illness as the investigator Porfiry Petrovich closes in.\n- His relationship with Sonia, a young woman driven to prostitution by poverty, moves him toward confession.\nContext:\n- A psychological novel examining conscience, suffering, and redemption.\n- Themes: guilt, free will, alienation, and moral responsibility.\nExcerpt: \"Pain and suffering are always inevitable for a large intelligence and a deep heart.\"" + }, + { + "id": "ab_huckfinn", + "entityType": "edition", + "fieldName": "description", + "targetLanguage": "en", + "sourceMaterial": "Title: Adventures of Huckleberry Finn\nAuthor: Mark Twain (pen name of Samuel Clemens)\nFirst published: 1884 (UK), 1885 (US).\nNarration: First-person, in Huck's vernacular voice.\nPlot facts:\n- Huckleberry Finn flees his abusive father and his \"civilizing\" guardians and travels down the Mississippi River on a raft.\n- He journeys with Jim, a man escaping enslavement, and the two form a bond as they encounter swindlers and danger along the river.\nContext:\n- Noted for its use of regional dialect and its satire of pre-Civil-War Southern society.\n- Themes: freedom, conscience, race, and the tension between social convention and individual morality.\nExcerpt: \"All right, then, I'll go to hell.\"" + }, + { + "id": "ab_wuthering", + "entityType": "edition", + "fieldName": "description", + "targetLanguage": "en", + "sourceMaterial": "Title: Wuthering Heights\nAuthor: Emily Bronte (originally published under the pen name Ellis 
Bell)\nFirst published: 1847.\nForm: Frame narrative related through the housekeeper Nelly Dean to the tenant Mr. Lockwood.\nSetting: The Yorkshire moors.\nPlot facts:\n- The intense, destructive bond between Catherine Earnshaw and the foundling Heathcliff shapes two generations of the Earnshaw and Linton families.\n- Heathcliff's thwarted love turns to a long campaign of revenge against both households.\nContext:\n- Distinctive for its nested narration and its bleak moorland atmosphere.\n- Themes: obsessive love, revenge, class, and the persistence of the past.\nExcerpt: \"Whatever our souls are made of, his and mine are the same.\"" + } +] diff --git a/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs b/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs index 6122a1f3..6f203197 100644 --- a/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs +++ b/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs @@ -33,6 +33,68 @@ public static void MapAdminAiQualityEndpoints(this WebApplication app) group.MapPost("/evals/toolcalls/run", RunToolCallEval); group.MapPost("/evals/studybuddy/run", RunStudyBuddyEval); group.MapPost("/evals/criticdefects/run", RunCriticDefectEval); + group.MapPost("/evals/crew-ab/run", RunCrewAbEval); + } + + // Phase 7 DoD gate (AI-046): A/B the single-call baseline vs the full FieldCrew on the same brief+source over + // the golden set, judged by an independent stronger judge (gpt-4.1). Reports crew lift % + cost ratio and + // gates on lift ≥ 0.10 AND costRatio ≤ 2.0. Generation goes through the gateway (same nano for both arms); + // the judge runs the dedicated openai-judge provider. ~50 gen (10 baseline + 10 × 4 crew stages) + 20 judge calls, run sync like the others. 
+ private static async Task RunCrewAbEval( + HttpContext httpContext, + IServiceProvider services, + IConfiguration config, + TextStack.Ai.EvalSuite.CrewAbEvalRunner runner, + FieldCrew crew, + IAppDbContext db, + CancellationToken ct) + { + ILlmService gateway; + try + { + gateway = services.GetRequiredService(); + } + catch (InvalidOperationException) + { + return Results.Problem("LLM gateway is not configured (no OpenAI key).", statusCode: 503); + } + + ILlmService judge; + try + { + judge = services.GetRequiredKeyedService("openai-judge"); + } + catch (InvalidOperationException) + { + return Results.Problem("Judge LLM is not configured.", statusCode: 503); + } + + var judgeModelId = config["Eval:JudgeModel"] ?? "gpt-4.1"; + var baseline = new BaselineFieldAgent(gateway); + var gitSha = Environment.GetEnvironmentVariable("GIT_SHA"); + + var result = await runner.RunAsync( + baseline, crew, judge, judgeModelId, persist: true, db, gitSha, ct); + + return Results.Ok(new + { + avgA = Math.Round(result.AvgA, 3), + avgB = Math.Round(result.AvgB, 3), + liftPct = Math.Round(result.LiftPct, 4), + costRatio = double.IsPositiveInfinity(result.CostRatio) ? (double?)null : Math.Round(result.CostRatio, 3), + winRate = Math.Round(result.WinRate, 3), + n = result.N, + passed = result.Passed, + cases = result.Cases.Select(c => new + { + c.Id, + scoreA = Math.Round(c.JudgeScoreA, 3), + scoreB = Math.Round(c.JudgeScoreB, 3), + c.CostA, + c.CostB, + c.BWins, + }), + }); } // Phase 7 DoD gate (AI-044): inject KNOWN defects into clean drafts, run the REAL AI-041 critic (nano) diff --git a/backend/src/Api/Program.cs b/backend/src/Api/Program.cs index d96236fe..f42f56d6 100644 --- a/backend/src/Api/Program.cs +++ b/backend/src/Api/Program.cs @@ -85,6 +85,7 @@ builder.Services.AddSingleton(); builder.Services.AddSingleton(); builder.Services.AddSingleton(); +builder.Services.AddSingleton(); // Tool catalogue (AI-029/030): scans Application for ITool impls; dispatch is schema-validated. 
builder.Services.AddAiTools(typeof(Application.Tools.GetChapterTool).Assembly); // Agent loop engine (Phase 6, AI-034). Concrete agents (StudyBuddy, AI-035) build on it. @@ -96,6 +97,9 @@ builder.Services.AddSingleton(); builder.Services.AddSingleton(); builder.Services.AddSingleton(); +// Single-call A/B baseline (Phase 7, AI-046): the "A" arm of the crew-vs-single-call eval — one ILlmService +// call that writes a field directly under the crew's full brief contract. Stateless singleton like the specialists. +builder.Services.AddSingleton(); // AutoPublish crew (Phase 7, AI-042): in-process admin path that runs the specialists over ILlmService to // generate SEO prose for an Edition. Scoped because it persists via the scoped IAgentRunWriter (per-request // DbContext). The legacy bash + Claude-CLI poller stays the default; this is the observable, traced alternative. diff --git a/backend/src/Application/Agents/BaselineFieldAgent.cs b/backend/src/Application/Agents/BaselineFieldAgent.cs new file mode 100644 index 00000000..28cce580 --- /dev/null +++ b/backend/src/Application/Agents/BaselineFieldAgent.cs @@ -0,0 +1,26 @@ +using Application.Agents.Prompts; +using TextStack.Ai.Core; + +namespace Application.Agents; + +/// +/// The single-call A/B baseline (AI-046): ONE gateway call that writes the requested +/// field directly from the source, with the brief's FULL contract folded into the system prompt. It is the "A" +/// arm of the crew-vs-single-call eval — the "B" arm is the full . Both run the SAME +/// generator model (nano) and are judged by the SAME rubric, so the eval measures only what orchestration buys. +/// +/// FeatureTag crew.baseline (distinct from the crew's crew.drafter/crew.editor tags so the +/// A and B calls route + trace separately). Token budget mirrors the drafter's (500): one prose field of the +/// same size. Parse is the same trim the drafter/editor use — fail-closed via . 
+/// +public sealed class BaselineFieldAgent(ILlmService llm) : SingleCallAgent(llm) +{ + protected override string FeatureTag => "crew.baseline"; + protected override int MaxOutputTokens => 500; + + protected override (string system, string user) BuildPrompt(BaselineInput input) => + (BaselineFieldPrompt.BuildSystemPrompt(input.Brief), + BaselineFieldPrompt.BuildUserPrompt(input.SourceMaterial)); + + protected override Draft Parse(string text, BaselineInput input) => new(text.Trim()); +} diff --git a/backend/src/Application/Agents/CrewAgentContracts.cs b/backend/src/Application/Agents/CrewAgentContracts.cs index 6b264885..3fa49f76 100644 --- a/backend/src/Application/Agents/CrewAgentContracts.cs +++ b/backend/src/Application/Agents/CrewAgentContracts.cs @@ -18,6 +18,13 @@ public record ContentBrief( /// Input to the researcher: the brief plus the raw source material to condense into neutral facts. public record ResearchInput(ContentBrief Brief, string SourceMaterial); +/// +/// Input to the single-call baseline (AI-046): the SAME brief the crew reads plus the raw source material. +/// One LLM call folds the brief's full contract into its system prompt and writes the field directly — so an +/// A/B eval isolates orchestration (crew vs single call), not the rubric the two are held to. +/// +public record BaselineInput(ContentBrief Brief, string SourceMaterial); + /// The researcher's output: bullet FACTS grounded entirely in the source, ready for the drafter. 
public record ResearchNotes(string Notes); diff --git a/backend/src/Application/Agents/FieldCrew.cs b/backend/src/Application/Agents/FieldCrew.cs index dd9be7ba..4cd04d7c 100644 --- a/backend/src/Application/Agents/FieldCrew.cs +++ b/backend/src/Application/Agents/FieldCrew.cs @@ -72,7 +72,8 @@ public async Task RunFieldAsync( result); await runWriter.WriteAsync(record, ct); - return new FieldResult(editedText, state.Critique, needsReview, result.Status, ctx.AgentRunId); + return new FieldResult( + editedText, state.Critique, needsReview, result.Status, ctx.AgentRunId, result.Usage.CostUsdTotal); } /// @@ -134,11 +135,14 @@ private CrewPlan BuildPlan(string crewName, decimal costCapUsd) /// Outcome of one content-crew field run (AI-043). is the editor's final prose (null /// if the crew never reached the edit stage). is the critic's verdict (null on halt /// before critique). is the fail-closed gate — true means "do not auto-apply". The -/// entity write and apply decision live in the caller, never here. +/// entity write and apply decision live in the caller, never here. is the crew run's +/// TOTAL cost (sum of the four sub-agent usages, honestly aggregated by the orchestrator) — surfaced so the +/// AI-046 A/B eval can compute crew-vs-baseline cost ratio without re-reading the persisted agent_run. /// public record FieldResult( string? EditedText, CritiqueResult? Critique, bool NeedsReview, string Status, - Guid RunId); + Guid RunId, + decimal CostUsd); diff --git a/backend/src/Application/Agents/Prompts/BaselineFieldPrompt.cs b/backend/src/Application/Agents/Prompts/BaselineFieldPrompt.cs new file mode 100644 index 00000000..26d3fc1e --- /dev/null +++ b/backend/src/Application/Agents/Prompts/BaselineFieldPrompt.cs @@ -0,0 +1,34 @@ +namespace Application.Agents.Prompts; + +/// +/// The single-call baseline's prompt (AI-046, FeatureTag crew.baseline): write the requested field +/// DIRECTLY from the raw source in ONE LLM call. 
The system prompt folds the brief's FULL contract — length +/// bounds, banned phrases, target language, style guide — the SAME constraints the crew enforces across its +/// four specialists. The point of the A/B eval is to isolate ORCHESTRATION: the baseline is held to the +/// identical contract, so any quality lift the crew shows comes from researcher→drafter→critic→editor, not +/// from a weaker rubric. Reuses so length + banned phrases read identically to +/// the drafter/critic/editor. Pure string building. +/// +public static class BaselineFieldPrompt +{ + public static string BuildSystemPrompt(ContentBrief brief) + { + var prompt = + $"You are a copywriter writing the {brief.FieldName} of a {brief.EntityType}. " + + "Use ONLY the facts in the source material provided — do not add any information that is not in it. " + + $"Write the text in {brief.TargetLanguage}. " + + $"The text must be {BriefConstraints.Length(brief)} long."; + + if (BriefConstraints.BannedPhrases(brief) is { } banned) + prompt += $" Do not use these phrases: {banned}."; + + if (!string.IsNullOrWhiteSpace(brief.StyleGuide)) + prompt += $" Style guide: {brief.StyleGuide.Trim()}."; + + prompt += " Output only the finished text — no markdown, no preface, no quotes around it."; + return prompt; + } + + public static string BuildUserPrompt(string sourceMaterial) => + $"Source material:\n{sourceMaterial}"; +} diff --git a/tests/TextStack.AiEvals/CrewAbEvalRunnerTests.cs b/tests/TextStack.AiEvals/CrewAbEvalRunnerTests.cs new file mode 100644 index 00000000..b08a87d0 --- /dev/null +++ b/tests/TextStack.AiEvals/CrewAbEvalRunnerTests.cs @@ -0,0 +1,276 @@ +using System.Text.Json; +using Application.Agents; +using Microsoft.Extensions.Logging.Abstractions; +using TextStack.Ai.Agents; +using TextStack.Ai.Core; +using TextStack.Ai.EvalSuite; + +namespace TextStack.AiEvals; + +/// +/// Deterministic coverage for (AI-046) — NO key, NO network. 
A fake generator +/// routes per FeatureTag: crew.baseline → the single-call (A) text + its cost; the crew tags +/// (crew.researcher/drafter/critic/editor) → B's sub-agent texts + their cost. A fake JUDGE runs through +/// the REAL and returns canned 1-5 scores keyed on WHICH +/// candidate text is in the prompt (A's marker vs B's editor marker), so the runner's lift / cost-ratio / gate +/// math is exercised end-to-end. Asserts the four gate scenarios, the crew-halt 0-score, and EvalRun persistence. +/// +public class CrewAbEvalRunnerTests +{ + private static readonly int GoldenN = CrewAbGoldenSet.Load().Count; + + // Distinct markers so the judge can tell the two arms apart purely from the candidate text in its prompt. + private const string AMarker = "BASELINE_A_CANDIDATE"; + private const string BMarker = "CREW_B_EDITED_CANDIDATE"; + + // A clean critic verdict so the crew completes through the editor (B's EditedText is non-null). + private const string CleanCritic = + """{"scores":{"factual_accuracy":5,"tone":5,"length":5,"banned_phrases":5},"issues":[]}"""; + + private static CrewAbEvalRunner Runner() => new(NullLogger.Instance); + + // ---- Fake generator: routes A vs the four crew specialists, with per-arm cost ------------------ + + /// + /// One fake driving BOTH arms: crew.baseline emits A's marker text at + /// ; each crew specialist emits its canned text, and the per-call crew cost is + /// (4 calls → crew total ≈ 4× that). The editor emits B's marker so the judge + /// can score B's edited candidate. 
+ /// + private sealed class FakeGenerator(decimal baselineCost, decimal crewCostEach) : ILlmService + { + public Task CompleteAsync(LlmRequest request, CancellationToken ct) + { + var (text, cost) = request.FeatureTag switch + { + "crew.baseline" => (AMarker, baselineCost), + "crew.researcher" => ("- a grounded fact\n- another fact", crewCostEach), + "crew.drafter" => ("A drafted candidate.", crewCostEach), + "crew.critic" => (CleanCritic, crewCostEach), + "crew.editor" => (BMarker, crewCostEach), + _ => throw new InvalidOperationException($"Unexpected feature tag: {request.FeatureTag}"), + }; + return Task.FromResult(new LlmResponse(text, [], new LlmUsage(40, 20, cost), "fake-gen", Guid.NewGuid())); + } + + public IAsyncEnumerable StreamAsync(LlmRequest request, CancellationToken ct) => + throw new NotSupportedException(); + } + + // ---- Fake judge (through the real RubricEvaluator): canned scores per candidate marker ----------- + + /// + /// Returns a fixed 3-axis verdict for A's candidate and another for B's, detected by the marker present in + /// the judge prompt. The runner judges in separate calls, so each call carries exactly one candidate. + /// + private sealed class MarkerJudge(int a1, int a2, int a3, int b1, int b2, int b3) : ILlmService + { + public Task CompleteAsync(LlmRequest request, CancellationToken ct) + { + var prompt = string.Join("\n", request.Messages.Select(m => m.Content)); + var isB = prompt.Contains(BMarker, StringComparison.Ordinal); + var (d1, d2, d3) = isB ? 
(b1, b2, b3) : (a1, a2, a3); + return Task.FromResult(new LlmResponse( + $"{{\"d1\": {d1}, \"d2\": {d2}, \"d3\": {d3}, \"rationale\": \"ok\"}}", + [], new LlmUsage(0, 0, 0m), "fake-judge", Guid.NewGuid())); + } + + public IAsyncEnumerable StreamAsync(LlmRequest request, CancellationToken ct) => + throw new NotSupportedException(); + } + + private sealed class RecordingAgentRunWriter : IAgentRunWriter + { + public Task WriteAsync(AgentRunRecord run, CancellationToken ct) => Task.CompletedTask; + } + + private static FieldCrew Crew(ILlmService gen) => + new(new CrewOrchestrator(), + new ResearcherAgent(gen), + new DrafterAgent(gen), + new CriticAgent(gen), + new EditorAgent(gen), + new RecordingAgentRunWriter()); + + private static BaselineFieldAgent Baseline(ILlmService gen) => new(gen); + + private static CancellationToken Ct => TestContext.Current.CancellationToken; + + // ---- 1. B far better, cost in budget → passes --------------------------------------------------- + + [Fact] + public async Task RunAsync_BFarBetter_Passes() + { + // B scores 5 across the board, A scores 3 → lift = (5-3)/3 ≈ 0.667. Crew cost (4 × 0.001 = 0.004) is + // exactly 2× the baseline (0.002) → ratio == 2.0 ≤ gate. Both gates clear. + var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: 0.001m); + var judge = new MarkerJudge(3, 3, 3, 5, 5, 5); + + var result = await Runner().RunAsync( + Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct); + + Assert.Equal(GoldenN, result.N); + Assert.Equal(3.0, result.AvgA, 6); + Assert.Equal(5.0, result.AvgB, 6); + Assert.Equal((5.0 - 3.0) / 3.0, result.LiftPct, 6); + Assert.Equal(2.0, result.CostRatio, 6); + Assert.Equal(1.0, result.WinRate, 6); + Assert.True(result.LiftPct >= CrewAbEvalRunner.LiftGate); + Assert.True(result.CostRatio <= CrewAbEvalRunner.CostRatioGate); + Assert.True(result.Passed); + } + + // ---- 2. 
B not better (A == B) → fails on lift --------------------------------------------------- + + [Fact] + public async Task RunAsync_BNotBetter_Fails() + { + var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: 0.0005m); // crew cheaper, cost gate fine + var judge = new MarkerJudge(4, 4, 4, 4, 4, 4); // identical → lift 0 + + var result = await Runner().RunAsync( + Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct); + + Assert.Equal(0.0, result.LiftPct, 6); + Assert.Equal(0.0, result.WinRate, 6); + Assert.True(result.LiftPct < CrewAbEvalRunner.LiftGate); + Assert.False(result.Passed); // cost is fine, but no lift → gate fails + } + + // ---- 3. B better but too expensive → fails on cost (independent of lift) ------------------------ + + [Fact] + public async Task RunAsync_BBetterButTooExpensive_Fails() + { + // B clearly better (lift would pass) but the crew costs > 2× the baseline → cost gate fails alone. + var gen = new FakeGenerator(baselineCost: 0.001m, crewCostEach: 0.001m); // crew total 0.004 = 4× baseline + var judge = new MarkerJudge(3, 3, 3, 5, 5, 5); + + var result = await Runner().RunAsync( + Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct); + + Assert.True(result.LiftPct >= CrewAbEvalRunner.LiftGate); // lift alone would pass + Assert.Equal(4.0, result.CostRatio, 6); + Assert.True(result.CostRatio > CrewAbEvalRunner.CostRatioGate); + Assert.False(result.Passed); // cost gate is independent — fails despite the lift + } + + // ---- 4. 
B worse → negative lift, fails ---------------------------------------------------------- + + [Fact] + public async Task RunAsync_BWorse_NegativeLift_Fails() + { + var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: 0.0005m); + var judge = new MarkerJudge(5, 5, 5, 2, 2, 2); // A beats B + + var result = await Runner().RunAsync( + Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct); + + Assert.True(result.LiftPct < 0); + Assert.Equal((2.0 - 5.0) / 5.0, result.LiftPct, 6); + Assert.Equal(0.0, result.WinRate, 6); + Assert.False(result.Passed); + } + + // ---- 5. Crew halted (null EditedText) → B scores 0 for that case -------------------------------- + + [Fact] + public async Task RunAsync_CrewHalted_NullEditedText_ScoresZero() + { + // Crew cost per call busts the per-field cap → the orchestrator halts after research, before the editor, + // so EditedText is null and the runner scores B = 0 WITHOUT calling the judge for B. A still scores 3. + var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: AutoPublishCrew.CostCapUsd + 0.01m); + var judge = new MarkerJudge(3, 3, 3, 5, 5, 5); // B markers never reach the judge (no edited text) + + var result = await Runner().RunAsync( + Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct); + + Assert.Equal(3.0, result.AvgA, 6); + Assert.Equal(0.0, result.AvgB, 6); // every B halted → 0 + Assert.All(result.Cases, c => Assert.Equal(0.0, c.JudgeScoreB, 6)); + Assert.True(result.LiftPct < 0); // (0-3)/3 = -1 + Assert.False(result.Passed); + } + + // ---- 5b. Zero baseline judge score → lift guarded to 0 (no div-by-zero), gate fails ------------- + + [Fact] + public async Task RunAsync_ZeroBaselineScore_LiftGuardedToZero_Fails() + { + // avgA == 0 (judge scores A all-zeros). liftPct must be guarded to 0, NOT NaN/+inf, and the gate must + // fail — a zero baseline is not a free pass for the crew. 
+ var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: 0.0005m); + var judge = new MarkerJudge(0, 0, 0, 5, 5, 5); + + var result = await Runner().RunAsync( + Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct); + + Assert.Equal(0.0, result.AvgA, 6); + Assert.Equal(5.0, result.AvgB, 6); + Assert.Equal(0.0, result.LiftPct, 6); // guarded, not (5-0)/0 + Assert.False(double.IsNaN(result.LiftPct)); + Assert.False(double.IsInfinity(result.LiftPct)); + Assert.False(result.Passed); + } + + // ---- 5c. Zero baseline spend → costRatio = +inf (no div-by-zero), gate fails, JSON null -------- + + [Fact] + public async Task RunAsync_ZeroBaselineCost_CostRatioInfinite_Fails() + { + // sumCostA == 0 (baseline reports zero cost). costRatio must be +inf (not NaN/0), the gate must FAIL + // (a free baseline makes any crew spend an infinite multiple), and the persisted JSON must stay + // parseable — +inf serializes to JSON null, never a bare `Infinity` literal. + var db = new CapturingDb(); + var gen = new FakeGenerator(baselineCost: 0m, crewCostEach: 0.001m); + var judge = new MarkerJudge(3, 3, 3, 5, 5, 5); // lift alone would pass + + var result = await Runner().RunAsync( + Baseline(gen), Crew(gen), judge, "judge-test", persist: true, db, gitSha: null, Ct); + + Assert.True(result.LiftPct >= CrewAbEvalRunner.LiftGate); // lift would pass on its own + Assert.True(double.IsPositiveInfinity(result.CostRatio)); + Assert.False(result.Passed); // cost gate fails on infinite ratio + + var run = Assert.Single(db.Added); + using var doc = JsonDocument.Parse(run.BreakdownJson!); // must not throw on +inf + Assert.Equal(JsonValueKind.Null, doc.RootElement.GetProperty("costRatio").ValueKind); + } + + // ---- 6. 
Persistence ---------------------------------------------------------------------------- + + [Fact] + public async Task RunAsync_Persist_WritesCrewAbEvalRun() + { + var db = new CapturingDb(); + var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: 0.001m); + var judge = new MarkerJudge(3, 3, 3, 5, 5, 5); + + var result = await Runner().RunAsync( + Baseline(gen), Crew(gen), judge, "judge-model-x", persist: true, db, gitSha: "abc123", Ct); + + var run = Assert.Single(db.Added); + Assert.Equal("crew_ab", run.Feature); + Assert.Equal("judge-model-x", run.JudgeModelId); + Assert.Equal("abc123", run.GitSha); + Assert.Equal(GoldenN, run.N); + Assert.Equal(Math.Round((decimal)result.LiftPct, 3), run.Score); + + using var doc = JsonDocument.Parse(run.BreakdownJson!); + var root = doc.RootElement; + Assert.True(root.TryGetProperty("avgA", out _)); + Assert.True(root.TryGetProperty("avgB", out _)); + Assert.True(root.TryGetProperty("liftPct", out _)); + Assert.True(root.TryGetProperty("costRatio", out _)); + Assert.True(root.TryGetProperty("winRate", out _)); + Assert.Equal(GoldenN, root.GetProperty("n").GetInt32()); + + var perFixture = root.GetProperty("perFixture"); + Assert.Equal(GoldenN, perFixture.GetArrayLength()); + var first = perFixture[0]; + Assert.True(first.TryGetProperty("id", out _)); + Assert.True(first.TryGetProperty("scoreA", out _)); + Assert.True(first.TryGetProperty("scoreB", out _)); + Assert.True(first.TryGetProperty("bWins", out _)); + } +}