diff --git a/CHANGELOG.md b/CHANGELOG.md
index 92c9a909..12b6c0ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,18 @@
 
 ## [Unreleased]
 
+### Phase 7 — A/B eval: crew vs single-call on goldens (AI-046) (2026-06-16)
+
+The Phase 7 DoD gate that answers "does the four-agent crew actually earn its orchestration?" — it A/B-tests the full `FieldCrew` (researcher→drafter→critic→editor) against **one** single LLM call that writes the field directly, on the same brief+source, and gates on a meaningful quality **lift** AND a bounded **cost ratio**. Ships the backend core plus the admin "Run crew A/B eval" button on the AI Quality → Evals tab.
+
+- **Honest baseline (A) — same brief contract, only orchestration differs.** New `Application/Agents/BaselineFieldAgent.cs` (`SingleCallAgent<BaselineInput, Draft>`, FeatureTag `crew.baseline`): one gateway call whose system prompt folds the brief's FULL contract (MinLength/MaxLength, BannedPhrases, StyleGuide, TargetLanguage) via the shared `BriefConstraints` the drafter/critic/editor already use — so A is held to the identical rubric the crew enforces; the user prompt is the raw source. New `Prompts/BaselineFieldPrompt.cs` + `record BaselineInput(ContentBrief Brief, string SourceMaterial)`. **A and B run the SAME generator model (nano)** — the eval isolates orchestration, not model.
+- **Independent judge, no label leakage.** A stronger judge (gpt-4.1 via `Eval:JudgeModel`, the dedicated `openai-judge` keyed provider) scores each candidate **absolutely** on a 3-axis 1-5 prose rubric (grounding / tone / completeness) through the shared `RubricEvaluator`. A and B are judged in **separate** calls with the **same** rubric, and the judge prompt carries only the source + an anonymous candidate — it never learns "single-call" vs "crew".
+- **Metrics + gate** (`Ai.EvalSuite/CrewAbEvalRunner.cs`): per fixture `judgeScoreA/judgeScoreB (0-5)`, `costA/costB`, `bWins`. Aggregate `avgA`, `avgB`, `liftPct = (avgB-avgA)/avgA`, `costRatio = sumCostB/sumCostA`, `winRate`. `Passed = liftPct >= 0.10 && costRatio <= 2.0` — **the cost gate is independent of lift** (better-but-too-expensive still fails). Div-by-zero guarded: `avgA==0 → lift 0`; `sumCostA==0 → costRatio +inf → fails`. **A's cost** = the single call's `LlmResponse.Usage.CostUsd`; **B's cost** = the crew run's TOTAL (sum of the 4 sub-agent usages) — surfaced honestly via a new additive `FieldResult.CostUsd` (populated from `CrewResult.Usage.CostUsdTotal`; no other caller churns).
+- **Crew halt → B scores 0.** If the crew halts before the editor (null `EditedText`), B is judged 0 for that case (no prose to judge); `NeedsReview==true` does NOT zero B — quality A/B is separate from the review gate, so a flagged-but-present edit is still judged.
+- **Golden set** (`Datasets/crew_ab.json`, auto-embedded via the existing `Datasets/*.json` glob): **N=10 edition/description** fixtures (realistic title+author+excerpt-style book source, varied — Frankenstein, Dracula, Moby-Dick, …) resolved to a `ContentBrief` via `SeoBriefs.For("edition","description","en")`. `// TODO grow to 50`. The gate runs on whatever N is present. New `CrewAbGolden` record + `CrewAbGoldenSet.Load()` (mirrors `CriticDefectGoldenSet`).
+- **Reuses, no schema change**: persists a `crew_ab` `EvalRun` (Feature=`crew_ab`, Score=round(liftPct,3), JudgeModelId=judge model, BreakdownJson = `{avgA, avgB, liftPct, costRatio, winRate, n, perFixture[]}`). Endpoint `POST /admin/ai-quality/evals/crew-ab/run` mirrors `studybuddy`/`criticdefects`: resolve the gateway `ILlmService` (503 if no key), the keyed `openai-judge` + `Eval:JudgeModel`, build `BaselineFieldAgent` + resolve `FieldCrew` from scope, run + persist, return `{ avgA, avgB, liftPct, costRatio, winRate, n, passed, cases }`. Structurally mirrors `CriticDefectEvalRunner` (AI-044).
+- **Tests** (`tests/TextStack.AiEvals/CrewAbEvalRunnerTests.cs`, deterministic, fake-gen + fake-judge, **NO key, CI**): a fake generator routed by FeatureTag (`crew.baseline`→A text+cost; `crew.drafter`/`critic`/`editor`/`researcher`→B sub-agent texts+cost) + a fake judge returning canned scores keyed on which candidate marker is in the prompt, run **through the real `RubricEvaluator`** + the reused `IAgentRunWriter`/`CapturingDb` fakes. Covers: `BFarBetter_Passes` (lift≈0.67 ∧ ratio≤2), `BNotBetter_Fails` (lift≈0), `BBetterButTooExpensive_Fails` (cost gate alone fails despite lift), `BWorse_NegativeLift_Fails`, `CrewHalted_NullEditedText_ScoresZero`, `Persist_WritesCrewAbEvalRun` (Feature/Score/BreakdownJson shape/N/JudgeModelId). No `ITool` introduced — the StudyBuddy set-equality test stays green.
+
 ### Phase 7 — admin "view transcript" UI for crew/agent runs (AI-045) (2026-06-15)
 
 Makes the multi-agent reasoning chain inspectable. Every crew run (`crew.autopublish`/`crew.seo`) and single-agent run (`studybuddy`) already persists an `agent_run` row with a nested-step transcript (researcher→drafter→critic→editor); AI-045 surfaces it in the admin app. **No schema change, no new table** — read-only over the existing Phase 6 `agent_run`.
diff --git a/apps/admin/src/api/client.ts b/apps/admin/src/api/client.ts
index d43db026..34f8db5e 100644
--- a/apps/admin/src/api/client.ts
+++ b/apps/admin/src/api/client.ts
@@ -546,6 +546,16 @@ export interface CriticDefectEvalResult {
     parseFailed: boolean
   }[]
 }
+export interface CrewAbEvalResult { // response body of POST /admin/ai-quality/evals/crew-ab/run
+  avgA: number // mean judge score of the single-call baseline (0–5)
+  avgB: number // mean judge score of the crew (0–5)
+  liftPct: number // (avgB − avgA) / avgA as a FRACTION (0.1 = 10%), despite the "Pct" name
+  costRatio: number // crew cost ÷ baseline cost. NOTE(review): backend uses +∞ when baseline cost is 0 — confirm what arrives over JSON
+  winRate: number // fraction of fixtures where the crew's score strictly beat the baseline's
+  n: number // number of golden fixtures evaluated
+  passed: boolean // backend gate: liftPct ≥ 0.10 AND costRatio ≤ 2.0
+  cases?: unknown[] // per-fixture rows; their shape is deliberately not modelled here
+}
 
 async function fetchJson<T>(path: string, init?: RequestInit): Promise<T> {
   const res = await fetch(`${API_BASE}${path}`, {
@@ -1182,6 +1192,12 @@ export const adminApi = {
     })
   },
 
+  runCrewAbEval: async (): Promise<CrewAbEvalResult> => { // triggers the crew-vs-single-call A/B eval
+    return fetchJson<CrewAbEvalResult>('/admin/ai-quality/evals/crew-ab/run', {
+      method: 'POST', // no request body: the call carries only the method
+    })
+  },
+
   // Podcasts
   generatePodcast: async (editionId: string, lang?: string, force?: boolean): Promise<PodcastStatusDto> => {
     return fetchJson<PodcastStatusDto>('/admin/podcasts', {
diff --git a/apps/admin/src/pages/AiQualityPage.tsx b/apps/admin/src/pages/AiQualityPage.tsx
index de5a3ee5..91c2f2c2 100644
--- a/apps/admin/src/pages/AiQualityPage.tsx
+++ b/apps/admin/src/pages/AiQualityPage.tsx
@@ -10,6 +10,7 @@ import {
   AgentRunDetail,
   EvalRun,
   CriticDefectEvalResult,
+  CrewAbEvalResult,
 } from '../api/client'
 
 type Tab = 'summary' | 'traces' | 'transcripts' | 'evals'
@@ -682,6 +683,8 @@ function EvalsTab() {
   const [running, setRunning] = useState(false)
   const [criticRunning, setCriticRunning] = useState(false)
   const [criticResult, setCriticResult] = useState<CriticDefectEvalResult | null>(null)
+  const [crewAbRunning, setCrewAbRunning] = useState(false)
+  const [crewAbResult, setCrewAbResult] = useState<CrewAbEvalResult | null>(null)
 
   const load = () =>
     adminApi
@@ -732,6 +735,19 @@ function EvalsTab() {
     }
   }
 
+  const runCrewAb = async () => { // click handler for the "Run crew A/B eval" button
+    setError(null) // drop any error left over from a previous action
+    setCrewAbRunning(true) // disables the button and switches its label to "Running…"
+    try {
+      setCrewAbResult(await adminApi.runCrewAbEval())
+      load() // persisted as a crew_ab eval_run → refresh history (not awaited)
+    } catch (e) {
+      setError(e instanceof Error ? e.message : 'Failed to run crew A/B eval')
+    } finally {
+      setCrewAbRunning(false) // re-enable the button whether the run succeeded or failed
+    }
+  }
+
   const controls = (
     <div style={{ display: 'flex', flexDirection: 'column', gap: 8, marginBottom: 16 }}>
       <div style={{ display: 'flex', gap: 8, alignItems: 'center', flexWrap: 'wrap' }}>
@@ -754,6 +770,14 @@ function EvalsTab() {
           Injects known defects into clean drafts, runs the real nano critic (~23 calls, 20–30s), gate ≥ 0.80 catch-rate.
         </span>
       </div>
+      <div style={{ display: 'flex', gap: 8, alignItems: 'center', flexWrap: 'wrap' }}>
+        <button onClick={runCrewAb} disabled={crewAbRunning} style={rangeBtn(false)}>
+          {crewAbRunning ? 'Running…' : 'Run crew A/B eval'}
+        </button>
+        <span style={{ fontSize: 12, color: '#6b7280' }}>
+          A/B-tests the crew vs a single-call baseline over the goldens (~1–2 min), gate lift ≥ 10% and cost ratio ≤ 2×.
+        </span>
+      </div>
       {criticResult && (
         <div style={card}>
           <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: 12 }}>
@@ -773,6 +797,32 @@ function EvalsTab() {
           </div>
         </div>
       )}
+      {crewAbResult && (
+        <div style={card}>
+          <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: 12 }}>
+            <span style={{ fontWeight: 600, fontSize: 15, color: '#111827' }}>Crew A/B eval</span>
+            <span style={{ fontWeight: 600, fontSize: 13, color: crewAbResult.passed ? '#059669' : '#dc2626' }}>
+              {crewAbResult.passed ? 'PASS' : 'FAIL'}
+            </span>
+          </div>
+          <div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fill, minmax(120px, 1fr))', gap: '8px 12px' }}>
+            <Metric
+              label="Lift" // coloured by its OWN gate (≥ 10%), so a cost-only failure does not paint a good lift red
+              value={`${(crewAbResult.liftPct * 100).toFixed(1)}%`}
+              color={crewAbResult.liftPct >= 0.1 ? '#059669' : '#dc2626'}
+            />
+            <Metric
+              label="Cost ratio" // a non-finite ratio (baseline spent nothing) renders as ∞ and counts as over the 2× gate
+              value={Number.isFinite(crewAbResult.costRatio) ? `${crewAbResult.costRatio.toFixed(1)}×` : '∞'}
+              color={Number.isFinite(crewAbResult.costRatio) && crewAbResult.costRatio <= 2 ? undefined : '#dc2626'}
+            />
+            <Metric label="Avg A (baseline)" value={crewAbResult.avgA.toFixed(2)} />
+            <Metric label="Avg B (crew)" value={crewAbResult.avgB.toFixed(2)} />
+            <Metric label="Win rate" value={`${(crewAbResult.winRate * 100).toFixed(1)}%`} />
+            <Metric label="N" value={String(crewAbResult.n)} />
+          </div>
+        </div>
+      )}
     </div>
   )
 
diff --git a/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbEvalRunner.cs b/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbEvalRunner.cs
new file mode 100644
index 00000000..f07bded1
--- /dev/null
+++ b/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbEvalRunner.cs
@@ -0,0 +1,242 @@
+using System.Globalization;
+using System.Text;
+using Application.Agents;
+using Application.Common.Interfaces;
+using Domain.Entities;
+using Microsoft.Extensions.AI;
+using Microsoft.Extensions.AI.Evaluation;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using TextStack.Ai.Core;
+using TextStack.Ai.Evals;
+using TextStack.Ai.Llm;
+
+namespace TextStack.Ai.EvalSuite;
+
+/// <summary>
+/// One fixture's A/B outcome (AI-046). <see cref="JudgeScoreA"/>/<see cref="JudgeScoreB"/> are the independent
+/// judge's 0-5 means for the single-call baseline (A) and the full crew (B); a crew that halted before the
+/// editor (null EditedText) scores B = 0. <see cref="CostA"/>/<see cref="CostB"/> are the honest USD costs
+/// (A = the single call's usage; B = the crew run's total across its four sub-agents). <see cref="BWins"/> is
+/// true only when B's judge mean strictly beats A's — a tie is not a win. <see cref="Id"/> is the golden's id.
+/// </summary>
+public sealed record CrewAbCase(
+    string Id, double JudgeScoreA, double JudgeScoreB, decimal CostA, decimal CostB, bool BWins);
+
+/// <summary>
+/// Result of a crew-vs-single-call A/B run (AI-046, Phase 7 DoD gate). <see cref="LiftPct"/> is the headline
+/// metric: the crew's relative gain in mean judge score, (AvgB - AvgA) / AvgA, as a fraction (0.10 = 10%).
+/// <see cref="CostRatio"/> is the crew's total spend over the baseline's (+infinity if the baseline spent nothing).
+/// The gate demands BOTH a meaningful lift AND a bounded cost multiple — better-but-ruinously-expensive fails.
+/// </summary>
+public sealed record CrewAbEvalResult(
+    double AvgA, // mean judge score of arm A (baseline); 0 when there are no cases
+    double AvgB, // mean judge score of arm B (crew); 0 when there are no cases
+    double LiftPct, // 0 when AvgA is 0 (no basis for a lift)
+    double CostRatio,
+    double WinRate, // share of cases whose BWins is true; 0 when there are no cases
+    int N, // number of cases evaluated
+    bool Passed, // LiftPct >= 0.10 AND CostRatio <= 2.0
+    IReadOnlyList<CrewAbCase> Cases);
+
+/// <summary>
+/// Runs the Phase 7 crew-vs-single-call A/B eval (AI-046): for each golden, send the SAME brief + source
+/// through (A) the single-call <see cref="BaselineFieldAgent"/> that writes the field directly and (B) the full
+/// <see cref="FieldCrew"/> (researcher → drafter → critic → editor). Both arms use the SAME generator model
+/// (nano) so only ORCHESTRATION differs, not the rubric. An independent, STRONGER judge (gpt-4.1 via
+/// <c>Eval:JudgeModel</c>) scores each candidate absolutely on a 3-axis 1-5 rubric (grounding / tone /
+/// completeness) — A and B in SEPARATE calls with the SAME rubric, and the judge NEVER learns which arm a
+/// candidate came from (no "single-call" vs "crew" label leakage). Reports crew lift % and cost ratio and
+/// gates on liftPct ≥ 0.10 AND costRatio ≤ 2.0. Persists a <c>crew_ab</c> <see cref="EvalRun"/> (Score = lift).
+/// Mirrors <see cref="CriticDefectEvalRunner"/>: real generation per case, judged scoring, sync.
+/// </summary>
+public sealed class CrewAbEvalRunner(ILogger<CrewAbEvalRunner> logger)
+{
+    private const string Feature = "crew_ab"; // EvalRun.Feature value and the RubricEvaluator metric-name prefix
+
+    /// <summary>Minimum crew quality lift over the single-call baseline (fraction) the orchestration must earn.</summary>
+    public const double LiftGate = 0.10;
+
+    /// <summary>Maximum crew cost as a multiple of the single-call baseline's — keeps the lift honest about spend.</summary>
+    public const double CostRatioGate = 2.0;
+
+    // Absolute prose rubric. Axis labels (text before ':') become the RubricEvaluator metric suffixes the
+    // judge scores 1-5; the judge sees only an anonymous candidate + the source — never the arm it came from.
+    private static readonly Rubric Rubric = new(
+        "grounding: every claim is supported by the source material; no invented facts.",
+        "tone: factual, encyclopedic, third-person prose with no subjective superlatives or marketing language.",
+        "completeness: covers the field within its length bounds, reads clearly, and reads well as a whole.");
+
+    private static readonly ChatMessage[] JudgePlaceholderMessages = [new ChatMessage(ChatRole.User, string.Empty)]; // only fills the messages argument; the judged content travels in RubricEvidenceContext
+
+    public async Task<CrewAbEvalResult> RunAsync(
+        BaselineFieldAgent baseline, // arm A: one run per golden
+        FieldCrew crew, // arm B: one full crew run per golden
+        ILlmService judge, // scores both arms' text through RubricEvaluator
+        string judgeModelId, // only recorded on the persisted EvalRun; it does not select the judge model here
+        bool persist, // an EvalRun row is written only when this is true AND db is non-null
+        IAppDbContext? db,
+        string? gitSha, // stored on the persisted EvalRun
+        CancellationToken ct) // checked before each golden and passed to every awaited call
+    {
+        var goldens = CrewAbGoldenSet.Load();
+        var chatConfig = new ChatConfiguration(new LlmServiceChatClient(judge, defaultFeatureTag: "eval.judge"));
+
+        // The crew's CrewTasks.Of opens a child DI scope per stage (ToolDispatcher per-invocation rule), so its
+        // AgentContext.Services must expose IServiceScopeFactory. An empty container provides exactly that and
+        // resolves no tools — the crew specialists are single ILlmService calls. The baseline ignores Services.
+        using var crewServices = new ServiceCollection().BuildServiceProvider();
+
+        var cases = new List<CrewAbCase>();
+        var sumA = 0.0;
+        var sumB = 0.0;
+        var sumCostA = 0m;
+        var sumCostB = 0m;
+        var bWins = 0;
+
+        foreach (var g in goldens)
+        {
+            ct.ThrowIfCancellationRequested();
+
+            var brief = SeoBriefs.For(g.EntityType, g.FieldName, g.TargetLanguage);
+
+            // A — single LLM call. SingleCallAgent ignores ctx.Services (no tools); an empty provider suffices.
+            var ctxA = new AgentContext(UserId: null, EditionId: null, Guid.NewGuid(), EmptyServiceProvider.Instance);
+            var aResult = await baseline.RunAsync(new BaselineInput(brief, g.SourceMaterial), ctxA, ct);
+            var textA = aResult.Output.Text;
+            var costA = aResult.Usage.CostUsdTotal;
+
+            // B — the full crew. Cost is the run total (sum of the 4 sub-agent usages), surfaced on FieldResult.
+            var ctxB = new AgentContext(UserId: null, EditionId: null, Guid.NewGuid(), crewServices);
+            var bResult = await crew.RunFieldAsync( // field key comes from the golden ("edition.description" for every v1 fixture), not a literal
+                "crew_ab", $"{g.EntityType}.{g.FieldName}", AutoPublishCrew.CostCapUsd, brief, g.SourceMaterial, ctxB, ct);
+            var textB = bResult.EditedText; // null when the crew halted before the editor
+            var costB = bResult.CostUsd;
+
+            // Judge each candidate against the SAME source with the SAME rubric, in SEPARATE calls. The crew's
+            // null edited text (halted pre-edit) scores 0 — there is no prose to judge.
+            var scoreA = await JudgeAsync(brief, g.SourceMaterial, textA, chatConfig, ct);
+            var scoreB = textB is null ? 0.0 : await JudgeAsync(brief, g.SourceMaterial, textB, chatConfig, ct);
+
+            var won = scoreB > scoreA;
+            cases.Add(new CrewAbCase(g.Id, scoreA, scoreB, costA, costB, won));
+            sumA += scoreA;
+            sumB += scoreB;
+            sumCostA += costA;
+            sumCostB += costB;
+            if (won)
+                bWins++;
+        }
+
+        var n = cases.Count;
+        var avgA = n > 0 ? sumA / n : 0.0;
+        var avgB = n > 0 ? sumB / n : 0.0;
+        var liftPct = avgA > 0 ? (avgB - avgA) / avgA : 0.0; // avgA == 0 → no lift basis → 0 (fails gate)
+        var costRatio = sumCostA > 0 ? (double)(sumCostB / sumCostA) : double.PositiveInfinity; // no A spend → +inf
+        var winRate = n > 0 ? (double)bWins / n : 0.0;
+        var passed = liftPct >= LiftGate && costRatio <= CostRatioGate;
+
+        logger.LogInformation(
+            "Crew A/B eval: avgA={AvgA:0.00} avgB={AvgB:0.00} lift={Lift:0.000} costRatio={Ratio:0.00} winRate={Win:0.00} (N={N}) passed={Passed}",
+            avgA, avgB, liftPct, costRatio, winRate, n, passed);
+
+        if (persist && db is not null)
+        {
+            db.EvalRuns.Add(new EvalRun
+            {
+                Id = Guid.NewGuid(),
+                Feature = Feature,
+                ModelId = "crew.baseline-vs-crew", // both arms share the nano generator; the headline is the comparison
+                JudgeModelId = judgeModelId,
+                Score = Math.Round((decimal)liftPct, 3),
+                N = n,
+                BreakdownJson = BuildBreakdown(avgA, avgB, liftPct, costRatio, winRate, n, cases),
+                GitSha = gitSha,
+                CreatedAt = DateTimeOffset.UtcNow,
+            });
+            await db.SaveChangesAsync(ct);
+        }
+
+        return new CrewAbEvalResult(avgA, avgB, liftPct, costRatio, winRate, n, passed, cases);
+    }
+
+    /// <summary>
+    /// Score one anonymous candidate against the source on the shared prose rubric (mean of the 3 axes, 0-5).
+    /// The judge prompt carries only the source + the candidate — never "single-call" or "crew" — so there is
+    /// no label leakage that could bias the A/B comparison.
+    /// </summary>
+    private static async Task<double> JudgeAsync(
+        ContentBrief brief, string source, string candidate, ChatConfiguration chatConfig, CancellationToken ct)
+    {
+        var evidence =
+            $"Field: the {brief.FieldName} of a {brief.EntityType} ({brief.MinLength}-{brief.MaxLength} characters, " +
+            $"in {brief.TargetLanguage}).\n\nSource material:\n{source}\n\nCandidate text:\n{candidate}";
+
+        var evaluator = new RubricEvaluator(Feature, Rubric);
+        var result = await evaluator.EvaluateAsync(
+            JudgePlaceholderMessages,
+            new ChatResponse(new ChatMessage(ChatRole.Assistant, candidate)),
+            chatConfig,
+            [new RubricEvidenceContext(evidence)],
+            ct);
+
+        var score = new JudgeScore(
+            ReadAxis(result, Rubric.Dim1), ReadAxis(result, Rubric.Dim2), ReadAxis(result, Rubric.Dim3), string.Empty);
+        return score.Mean;
+    }
+
+    private static int ReadAxis(EvaluationResult result, string dim) => // metric name = "<Feature>.<axis label before ':'>"; a null value reads as 0
+        (int)Math.Round(result.Get<NumericMetric>($"{Feature}.{dim.Split(':')[0].Trim()}").Value ?? 0);
+
+    private static string BuildBreakdown( // hand-assembles the EvalRun.BreakdownJson document: aggregates + one perFixture entry per case
+        double avgA, double avgB, double liftPct, double costRatio, double winRate, int n,
+        IReadOnlyList<CrewAbCase> cases)
+    {
+        var sb = new StringBuilder();
+        sb.Append('{');
+        sb.Append("\"avgA\":").Append(Num(avgA)).Append(',');
+        sb.Append("\"avgB\":").Append(Num(avgB)).Append(',');
+        sb.Append("\"liftPct\":").Append(Num(liftPct)).Append(',');
+        sb.Append("\"costRatio\":").Append(CostRatioJson(costRatio)).Append(',');
+        sb.Append("\"winRate\":").Append(Num(winRate)).Append(',');
+        sb.Append("\"n\":").Append(n).Append(',');
+
+        sb.Append("\"perFixture\":[");
+        for (var i = 0; i < cases.Count; i++)
+        {
+            var c = cases[i];
+            if (i > 0)
+                sb.Append(',');
+            sb.Append("{\"id\":\"").Append(Escape(c.Id)).Append("\",")
+              .Append("\"scoreA\":").Append(Num(c.JudgeScoreA)).Append(',')
+              .Append("\"scoreB\":").Append(Num(c.JudgeScoreB)).Append(',')
+              .Append("\"costA\":").Append(Cost(c.CostA)).Append(',')
+              .Append("\"costB\":").Append(Cost(c.CostB)).Append(',')
+              .Append("\"bWins\":").Append(c.BWins ? "true" : "false")
+              .Append('}');
+        }
+        sb.Append(']');
+
+        sb.Append('}');
+        return sb.ToString();
+    }
+
+    // +inf serializes as JSON null (no valid IEEE literal) so the breakdown stays parseable when A had no spend.
+    private static string CostRatioJson(double ratio) =>
+        double.IsPositiveInfinity(ratio) ? "null" : Num(ratio);
+
+    private static string Num(double value) => // at most 3 decimals, invariant culture ('.' separator)
+        Math.Round(value, 3).ToString("0.###", CultureInfo.InvariantCulture);
+
+    private static string Cost(decimal value) => // at most 6 decimals, invariant culture
+        Math.Round(value, 6).ToString("0.######", CultureInfo.InvariantCulture);
+
+    private static string Escape(string s) => System.Text.Json.JsonEncodedText.Encode(s).ToString(); // full JSON string escaping (quotes, backslashes AND control chars), without the surrounding quotes
+
+    /// <summary>No-service provider for the baseline arm's <see cref="AgentContext"/> only — the crew arm is given a real (empty) container instead.</summary>
+    private sealed class EmptyServiceProvider : IServiceProvider
+    {
+        public static readonly EmptyServiceProvider Instance = new();
+        public object? GetService(Type serviceType) => null;
+    }
+}
diff --git a/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbGolden.cs b/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbGolden.cs
new file mode 100644
index 00000000..c4de7416
--- /dev/null
+++ b/backend/src/Ai/TextStack.Ai.EvalSuite/CrewAbGolden.cs
@@ -0,0 +1,27 @@
+namespace TextStack.Ai.EvalSuite;
+
+/// <summary>
+/// One A/B golden case (AI-046): a piece of raw book source material that BOTH the single-call baseline
+/// (<c>crew.baseline</c>) and the full <see cref="Application.Agents.FieldCrew"/> write the same field from.
+/// The (<see cref="EntityType"/>, <see cref="FieldName"/>) pair resolves the shared <c>ContentBrief</c> via
+/// <c>SeoBriefs.For</c>, so both arms enforce the identical contract — the eval isolates ORCHESTRATION, not
+/// the rubric. v1 is edition/description only (N=10, TODO grow to 50).
+/// </summary>
+/// <param name="Id">Stable case id for the admin UI / test diagnostics; also the per-fixture <c>id</c> in the persisted breakdown.</param>
+/// <param name="EntityType">The brief's entity (v1: <c>edition</c>).</param>
+/// <param name="FieldName">The brief's field (v1: <c>description</c>).</param>
+/// <param name="TargetLanguage">The output language passed to <c>SeoBriefs.For</c> (v1: <c>en</c>).</param>
+/// <param name="SourceMaterial">Title + author + excerpt-style facts both arms ground their prose in; the same text is shown to the judge.</param>
+public record CrewAbGolden(
+    string Id,
+    string EntityType,
+    string FieldName,
+    string TargetLanguage,
+    string SourceMaterial);
+
+/// <summary>Loads the embedded A/B golden set (<c>crew_ab.json</c>) through the shared <c>GoldenLoader</c>.</summary>
+public static class CrewAbGoldenSet
+{
+    public static IReadOnlyList<CrewAbGolden> Load() => // delegates to GoldenLoader with the dataset's file name
+        GoldenLoader.Load<CrewAbGolden>("crew_ab.json");
+}
diff --git a/backend/src/Ai/TextStack.Ai.EvalSuite/Datasets/crew_ab.json b/backend/src/Ai/TextStack.Ai.EvalSuite/Datasets/crew_ab.json
new file mode 100644
index 00000000..19fc43ed
--- /dev/null
+++ b/backend/src/Ai/TextStack.Ai.EvalSuite/Datasets/crew_ab.json
@@ -0,0 +1,72 @@
+[
+  {
+    "id": "ab_frankenstein",
+    "entityType": "edition",
+    "fieldName": "description",
+    "targetLanguage": "en",
+    "sourceMaterial": "Title: Frankenstein; or, The Modern Prometheus\nAuthor: Mary Shelley\nFirst published: London, 1818 (anonymous); revised edition 1831.\nForm: Epistolary frame narrative told through the letters of Arctic explorer Robert Walton, who encounters Victor Frankenstein during an expedition.\nPlot facts:\n- Victor Frankenstein, a Genevan student of natural philosophy, assembles and animates a creature from dead matter.\n- The creature is rejected by its creator and by the society that recoils from its appearance.\n- The creature turns against Frankenstein; the two become bound in a mutual pursuit that drives the plot.\nContext:\n- Widely treated as an early example of science fiction.\n- Themes: ambition, responsibility, isolation, and what an inventor owes to the life he brings into being.\nExcerpt: \"It was on a dreary night of November that I beheld the accomplishment of my toils... I saw the dull yellow eye of the creature open.\""
+  },
+  {
+    "id": "ab_dracula",
+    "entityType": "edition",
+    "fieldName": "description",
+    "targetLanguage": "en",
+    "sourceMaterial": "Title: Dracula\nAuthor: Bram Stoker (Irish author)\nFirst published: London, 1897.\nForm: Epistolary novel assembled from letters, diary entries, and newspaper clippings rather than a single continuous narration.\nPlot facts:\n- Count Dracula, a vampire from Transylvania, relocates to England.\n- The solicitor Jonathan Harker travels to the Count's castle on business as the novel opens.\n- A group led by Professor Abraham Van Helsing forms to oppose the vampire and protect those he targets.\nContext:\n- Credited with helping define the modern vampire genre.\n- Engages period anxieties about foreign invasion and the spread of disease.\n- The layered documentary structure lets several narrators assemble one account from fragments.\nExcerpt: \"I am all in a sea of wonders. I doubt; I fear; I think strange things, which I dare not confess to my own soul.\""
+  },
+  {
+    "id": "ab_mobydick",
+    "entityType": "edition",
+    "fieldName": "description",
+    "targetLanguage": "en",
+    "sourceMaterial": "Title: Moby-Dick; or, The Whale\nAuthor: Herman Melville\nFirst published: 1851.\nNarration: A sailor who introduces himself with the words \"Call me Ishmael.\"\nPlot facts:\n- The whaling ship Pequod sails under Captain Ahab.\n- Ahab is consumed by his pursuit of a white sperm whale named Moby Dick that had maimed him on an earlier voyage.\n- The novel interleaves the chase with extended digressions on the practice and science of whaling.\nContext:\n- Early reviews were mixed and the book sold modestly during Melville's lifetime.\n- Now regarded as a major work of American literature, frequently studied for its blend of adventure, encyclopedic detail, and philosophical inquiry.\nExcerpt: \"Whenever I find myself growing grim about the mouth... then I account it high time to get to sea as soon as I can.\""
+  },
+  {
+    "id": "ab_prideandprejudice",
+    "entityType": "edition",
+    "fieldName": "description",
+    "targetLanguage": "en",
+    "sourceMaterial": "Title: Pride and Prejudice\nAuthor: Jane Austen\nFirst published: 1813.\nSetting: Among the landed gentry of early nineteenth-century England.\nPlot facts:\n- Elizabeth Bennet is the second of five sisters whose family's circumstances make advantageous marriage a pressing question.\n- The narrative traces Elizabeth's changing relationship with Mr. Darcy, a wealthy and reserved gentleman.\n- The initial misjudgements each forms about the other are gradually corrected.\nContext:\n- Known for its irony and its commentary on the social conventions governing courtship and class.\n- Recurring concerns: marriage, economic security, reputation, first impressions, and family standing.\nExcerpt: \"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.\""
+  },
+  {
+    "id": "ab_janeeyre",
+    "entityType": "edition",
+    "fieldName": "description",
+    "targetLanguage": "en",
+    "sourceMaterial": "Title: Jane Eyre\nAuthor: Charlotte Bronte (originally published under the pen name Currer Bell)\nFirst published: 1847.\nForm: First-person bildungsroman.\nPlot facts:\n- Jane Eyre, an orphan, endures a harsh childhood at Gateshead and the austere Lowood school before taking a post as governess at Thornfield Hall.\n- She falls in love with her employer, Edward Rochester, whose house holds a hidden secret.\n- Jane leaves Thornfield to preserve her principles and later returns on changed terms.\nContext:\n- Notable for its assertion of the moral and spiritual independence of its narrator.\n- Themes: conscience, social class, religion, and the struggle for autonomy.\nExcerpt: \"I am no bird; and no net ensnares me: I am a free human being with an independent will.\""
+  },
+  {
+    "id": "ab_greatgatsby",
+    "entityType": "edition",
+    "fieldName": "description",
+    "targetLanguage": "en",
+    "sourceMaterial": "Title: The Great Gatsby\nAuthor: F. Scott Fitzgerald\nFirst published: 1925.\nSetting: Long Island and New York City during the Jazz Age.\nNarration: Told by Nick Carraway, a Midwesterner who moves east and rents a cottage next to a mansion.\nPlot facts:\n- Jay Gatsby, a wealthy man of mysterious origin, throws lavish parties in pursuit of his lost love, Daisy Buchanan.\n- Nick observes the entanglement of Gatsby, Daisy, and her husband Tom, which moves toward tragedy.\nContext:\n- Often read as a portrait of the American Dream and its disillusionment.\n- Themes: wealth, class, longing, and the gap between aspiration and reality.\nExcerpt: \"So we beat on, boats against the current, borne back ceaselessly into the past.\""
+  },
+  {
+    "id": "ab_warandpeace",
+    "entityType": "edition",
+    "fieldName": "description",
+    "targetLanguage": "en",
+    "sourceMaterial": "Title: War and Peace\nAuthor: Leo Tolstoy\nFirst published: serialized 1865-1867, complete 1869.\nScope: A long historical novel set in Russia during the Napoleonic Wars.\nPlot facts:\n- Follows several aristocratic families, among them the Bezukhovs, Bolkonskys, and Rostovs, across war and domestic life.\n- Pierre Bezukhov, Prince Andrei Bolkonsky, and Natasha Rostova are central figures whose lives are reshaped by the 1812 French invasion.\n- The narrative alternates between battlefield events and the private concerns of its characters.\nContext:\n- Famous for its scale and for Tolstoy's reflections on history and the limits of individual agency.\n- Themes: history, fate, family, and the search for meaning.\nExcerpt: \"We can know only that we know nothing. And that is the highest degree of human wisdom.\""
+  },
+  {
+    "id": "ab_crimeandpunishment",
+    "entityType": "edition",
+    "fieldName": "description",
+    "targetLanguage": "en",
+    "sourceMaterial": "Title: Crime and Punishment\nAuthor: Fyodor Dostoevsky\nFirst published: serialized 1866.\nSetting: Saint Petersburg.\nPlot facts:\n- Rodion Raskolnikov, an impoverished former student, murders a pawnbroker, convinced he is exempt from ordinary moral law.\n- The act plunges him into guilt, paranoia, and illness as the investigator Porfiry Petrovich closes in.\n- His relationship with Sonia, a young woman driven to prostitution by poverty, moves him toward confession.\nContext:\n- A psychological novel examining conscience, suffering, and redemption.\n- Themes: guilt, free will, alienation, and moral responsibility.\nExcerpt: \"Pain and suffering are always inevitable for a large intelligence and a deep heart.\""
+  },
+  {
+    "id": "ab_huckfinn",
+    "entityType": "edition",
+    "fieldName": "description",
+    "targetLanguage": "en",
+    "sourceMaterial": "Title: Adventures of Huckleberry Finn\nAuthor: Mark Twain (pen name of Samuel Clemens)\nFirst published: 1884 (UK), 1885 (US).\nNarration: First-person, in Huck's vernacular voice.\nPlot facts:\n- Huckleberry Finn flees his abusive father and his \"civilizing\" guardians and travels down the Mississippi River on a raft.\n- He journeys with Jim, a man escaping enslavement, and the two form a bond as they encounter swindlers and danger along the river.\nContext:\n- Noted for its use of regional dialect and its satire of pre-Civil-War Southern society.\n- Themes: freedom, conscience, race, and the tension between social convention and individual morality.\nExcerpt: \"All right, then, I'll go to hell.\""
+  },
+  {
+    "id": "ab_wuthering",
+    "entityType": "edition",
+    "fieldName": "description",
+    "targetLanguage": "en",
+    "sourceMaterial": "Title: Wuthering Heights\nAuthor: Emily Bronte (originally published under the pen name Ellis Bell)\nFirst published: 1847.\nForm: Frame narrative related through the housekeeper Nelly Dean to the tenant Mr. Lockwood.\nSetting: The Yorkshire moors.\nPlot facts:\n- The intense, destructive bond between Catherine Earnshaw and the foundling Heathcliff shapes two generations of the Earnshaw and Linton families.\n- Heathcliff's thwarted love turns to a long campaign of revenge against both households.\nContext:\n- Distinctive for its nested narration and its bleak moorland atmosphere.\n- Themes: obsessive love, revenge, class, and the persistence of the past.\nExcerpt: \"Whatever our souls are made of, his and mine are the same.\""
+  }
+]
diff --git a/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs b/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs
index 6122a1f3..6f203197 100644
--- a/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs
+++ b/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs
@@ -33,6 +33,68 @@ public static void MapAdminAiQualityEndpoints(this WebApplication app)
         group.MapPost("/evals/toolcalls/run", RunToolCallEval);
         group.MapPost("/evals/studybuddy/run", RunStudyBuddyEval);
         group.MapPost("/evals/criticdefects/run", RunCriticDefectEval);
+        group.MapPost("/evals/crew-ab/run", RunCrewAbEval);
+    }
+
+    // Phase 7 DoD gate (AI-046): A/B the single-call baseline vs the full FieldCrew on the same brief+source over
+    // the golden set, judged by an independent stronger judge (gpt-4.1). Reports crew lift % + cost ratio and
+    // gates on lift ≥ 0.10 AND costRatio ≤ 2.0. Generation goes through the gateway (same nano for both arms);
+    // the judge runs the dedicated openai-judge provider. Each fixture costs about 5 generation calls (1 baseline + 4 crew specialists) plus up to 2 judge calls; run sync like the others.
+    private static async Task<IResult> RunCrewAbEval(
+        HttpContext httpContext,
+        IServiceProvider services,
+        IConfiguration config,
+        TextStack.Ai.EvalSuite.CrewAbEvalRunner runner,
+        FieldCrew crew,
+        IAppDbContext db,
+        CancellationToken ct)
+    {
+        ILlmService gateway;
+        try
+        {
+            gateway = services.GetRequiredService<ILlmService>();
+        }
+        catch (InvalidOperationException)
+        {
+            return Results.Problem("LLM gateway is not configured (no OpenAI key).", statusCode: 503);
+        }
+
+        ILlmService judge;
+        try
+        {
+            judge = services.GetRequiredKeyedService<ILlmService>("openai-judge");
+        }
+        catch (InvalidOperationException)
+        {
+            return Results.Problem("Judge LLM is not configured.", statusCode: 503);
+        }
+
+        var judgeModelId = config["Eval:JudgeModel"] ?? "gpt-4.1";
+        var baseline = new BaselineFieldAgent(gateway);
+        var gitSha = Environment.GetEnvironmentVariable("GIT_SHA");
+
+        var result = await runner.RunAsync(
+            baseline, crew, judge, judgeModelId, persist: true, db, gitSha, ct);
+
+        return Results.Ok(new
+        {
+            avgA = Math.Round(result.AvgA, 3),
+            avgB = Math.Round(result.AvgB, 3),
+            liftPct = Math.Round(result.LiftPct, 4),
+            costRatio = double.IsPositiveInfinity(result.CostRatio) ? (double?)null : Math.Round(result.CostRatio, 3),
+            winRate = Math.Round(result.WinRate, 3),
+            n = result.N,
+            passed = result.Passed,
+            cases = result.Cases.Select(c => new
+            {
+                c.Id,
+                scoreA = Math.Round(c.JudgeScoreA, 3),
+                scoreB = Math.Round(c.JudgeScoreB, 3),
+                c.CostA,
+                c.CostB,
+                c.BWins,
+            }),
+        });
     }
 
     // Phase 7 DoD gate (AI-044): inject KNOWN defects into clean drafts, run the REAL AI-041 critic (nano)
diff --git a/backend/src/Api/Program.cs b/backend/src/Api/Program.cs
index d96236fe..f42f56d6 100644
--- a/backend/src/Api/Program.cs
+++ b/backend/src/Api/Program.cs
@@ -85,6 +85,7 @@
 builder.Services.AddSingleton<TextStack.Ai.EvalSuite.ToolCallEvalRunner>();
 builder.Services.AddSingleton<TextStack.Ai.EvalSuite.StudyBuddyEvalRunner>();
 builder.Services.AddSingleton<TextStack.Ai.EvalSuite.CriticDefectEvalRunner>();
+builder.Services.AddSingleton<TextStack.Ai.EvalSuite.CrewAbEvalRunner>();
 // Tool catalogue (AI-029/030): scans Application for ITool impls; dispatch is schema-validated.
 builder.Services.AddAiTools(typeof(Application.Tools.GetChapterTool).Assembly);
 // Agent loop engine (Phase 6, AI-034). Concrete agents (StudyBuddy, AI-035) build on it.
@@ -96,6 +97,9 @@
 builder.Services.AddSingleton<Application.Agents.DrafterAgent>();
 builder.Services.AddSingleton<Application.Agents.CriticAgent>();
 builder.Services.AddSingleton<Application.Agents.EditorAgent>();
+// Single-call A/B baseline (Phase 7, AI-046): the "A" arm of the crew-vs-single-call eval — one ILlmService
+// call that writes a field directly under the crew's full brief contract. Stateless singleton like the specialists.
+builder.Services.AddSingleton<Application.Agents.BaselineFieldAgent>();
 // AutoPublish crew (Phase 7, AI-042): in-process admin path that runs the specialists over ILlmService to
 // generate SEO prose for an Edition. Scoped because it persists via the scoped IAgentRunWriter (per-request
 // DbContext). The legacy bash + Claude-CLI poller stays the default; this is the observable, traced alternative.
diff --git a/backend/src/Application/Agents/BaselineFieldAgent.cs b/backend/src/Application/Agents/BaselineFieldAgent.cs
new file mode 100644
index 00000000..28cce580
--- /dev/null
+++ b/backend/src/Application/Agents/BaselineFieldAgent.cs
@@ -0,0 +1,26 @@
+using Application.Agents.Prompts;
+using TextStack.Ai.Core;
+
+namespace Application.Agents;
+
+/// <summary>
+/// The single-call A/B baseline (AI-046): ONE <see cref="ILlmService"/> gateway call that writes the requested
+/// field directly from the source, with the brief's FULL contract folded into the system prompt. It is the "A"
+/// arm of the crew-vs-single-call eval — the "B" arm is the full <see cref="FieldCrew"/>. Both run the SAME
+/// generator model (nano) and are judged by the SAME rubric, so the eval measures only what orchestration buys.
+///
+/// FeatureTag <c>crew.baseline</c> (distinct from the crew's <c>crew.drafter</c>/<c>crew.editor</c> tags so the
+/// A and B calls route + trace separately). Token budget mirrors the drafter's (500): one prose field of the
+/// same size. Parse is the same trim the drafter/editor use — fail-closed via <see cref="Draft"/>.
+/// </summary>
+public sealed class BaselineFieldAgent(ILlmService llm) : SingleCallAgent<BaselineInput, Draft>(llm)
+{
+    protected override string FeatureTag => "crew.baseline";
+    protected override int MaxOutputTokens => 500;
+
+    protected override (string system, string user) BuildPrompt(BaselineInput input) =>
+        (BaselineFieldPrompt.BuildSystemPrompt(input.Brief),
+         BaselineFieldPrompt.BuildUserPrompt(input.SourceMaterial));
+
+    protected override Draft Parse(string text, BaselineInput input) => new(text.Trim());
+}
diff --git a/backend/src/Application/Agents/CrewAgentContracts.cs b/backend/src/Application/Agents/CrewAgentContracts.cs
index 6b264885..3fa49f76 100644
--- a/backend/src/Application/Agents/CrewAgentContracts.cs
+++ b/backend/src/Application/Agents/CrewAgentContracts.cs
@@ -18,6 +18,13 @@ public record ContentBrief(
 /// <summary>Input to the researcher: the brief plus the raw source material to condense into neutral facts.</summary>
 public record ResearchInput(ContentBrief Brief, string SourceMaterial);
 
+/// <summary>
+/// Input to the single-call baseline (AI-046): the SAME brief the crew reads plus the raw source material.
+/// One LLM call folds the brief's full contract into its system prompt and writes the field directly — so an
+/// A/B eval isolates orchestration (crew vs single call), not the rubric the two are held to.
+/// </summary>
+public record BaselineInput(ContentBrief Brief, string SourceMaterial);
+
 /// <summary>The researcher's output: bullet FACTS grounded entirely in the source, ready for the drafter.</summary>
 public record ResearchNotes(string Notes);
 
diff --git a/backend/src/Application/Agents/FieldCrew.cs b/backend/src/Application/Agents/FieldCrew.cs
index dd9be7ba..4cd04d7c 100644
--- a/backend/src/Application/Agents/FieldCrew.cs
+++ b/backend/src/Application/Agents/FieldCrew.cs
@@ -72,7 +72,8 @@ public async Task<FieldResult> RunFieldAsync(
             result);
         await runWriter.WriteAsync(record, ct);
 
-        return new FieldResult(editedText, state.Critique, needsReview, result.Status, ctx.AgentRunId);
+        return new FieldResult(
+            editedText, state.Critique, needsReview, result.Status, ctx.AgentRunId, result.Usage.CostUsdTotal);
     }
 
     /// <summary>
@@ -134,11 +135,14 @@ private CrewPlan<FieldCrewState> BuildPlan(string crewName, decimal costCapUsd)
 /// Outcome of one content-crew field run (AI-043). <see cref="EditedText"/> is the editor's final prose (null
 /// if the crew never reached the edit stage). <see cref="Critique"/> is the critic's verdict (null on halt
 /// before critique). <see cref="NeedsReview"/> is the fail-closed gate — true means "do not auto-apply". The
-/// entity write and apply decision live in the caller, never here.
+/// entity write and apply decision live in the caller, never here. <see cref="CostUsd"/> is the crew run's
+/// TOTAL cost (sum of the four sub-agent usages, honestly aggregated by the orchestrator) — surfaced so the
+/// AI-046 A/B eval can compute crew-vs-baseline cost ratio without re-reading the persisted agent_run.
 /// </summary>
 public record FieldResult(
     string? EditedText,
     CritiqueResult? Critique,
     bool NeedsReview,
     string Status,
-    Guid RunId);
+    Guid RunId,
+    decimal CostUsd);
diff --git a/backend/src/Application/Agents/Prompts/BaselineFieldPrompt.cs b/backend/src/Application/Agents/Prompts/BaselineFieldPrompt.cs
new file mode 100644
index 00000000..26d3fc1e
--- /dev/null
+++ b/backend/src/Application/Agents/Prompts/BaselineFieldPrompt.cs
@@ -0,0 +1,34 @@
+namespace Application.Agents.Prompts;
+
+/// <summary>
+/// The single-call baseline's prompt (AI-046, FeatureTag <c>crew.baseline</c>): write the requested field
+/// DIRECTLY from the raw source in ONE LLM call. The system prompt folds the brief's FULL contract — length
+/// bounds, banned phrases, target language, style guide — the SAME constraints the crew enforces across its
+/// four specialists. The point of the A/B eval is to isolate ORCHESTRATION: the baseline is held to the
+/// identical contract, so any quality lift the crew shows comes from researcher→drafter→critic→editor, not
+/// from a weaker rubric. Reuses <see cref="BriefConstraints"/> so length + banned phrases read identically to
+/// the drafter/critic/editor. Pure string building.
+/// </summary>
+public static class BaselineFieldPrompt
+{
+    public static string BuildSystemPrompt(ContentBrief brief)
+    {
+        var prompt =
+            $"You are a copywriter writing the {brief.FieldName} of a {brief.EntityType}. " +
+            "Use ONLY the facts in the source material provided — do not add any information that is not in it. " +
+            $"Write the text in {brief.TargetLanguage}. " +
+            $"The text must be {BriefConstraints.Length(brief)} long.";
+
+        if (BriefConstraints.BannedPhrases(brief) is { } banned)
+            prompt += $" Do not use these phrases: {banned}.";
+
+        if (!string.IsNullOrWhiteSpace(brief.StyleGuide))
+            prompt += $" Style guide: {brief.StyleGuide.Trim()}.";
+
+        prompt += " Output only the finished text — no markdown, no preface, no quotes around it.";
+        return prompt;
+    }
+
+    public static string BuildUserPrompt(string sourceMaterial) =>
+        $"Source material:\n{sourceMaterial}";
+}
diff --git a/tests/TextStack.AiEvals/CrewAbEvalRunnerTests.cs b/tests/TextStack.AiEvals/CrewAbEvalRunnerTests.cs
new file mode 100644
index 00000000..b08a87d0
--- /dev/null
+++ b/tests/TextStack.AiEvals/CrewAbEvalRunnerTests.cs
@@ -0,0 +1,276 @@
+using System.Text.Json;
+using Application.Agents;
+using Microsoft.Extensions.Logging.Abstractions;
+using TextStack.Ai.Agents;
+using TextStack.Ai.Core;
+using TextStack.Ai.EvalSuite;
+
+namespace TextStack.AiEvals;
+
+/// <summary>
+/// Deterministic coverage for <see cref="CrewAbEvalRunner"/> (AI-046) — NO key, NO network. A fake generator
+/// routes per FeatureTag: <c>crew.baseline</c> → the single-call (A) text + its cost; the crew tags
+/// (<c>crew.researcher/drafter/critic/editor</c>) → B's sub-agent texts + their cost. A fake JUDGE runs through
+/// the REAL <see cref="TextStack.Ai.Evals.RubricEvaluator"/> and returns canned 1-5 scores keyed on WHICH
+/// candidate text is in the prompt (A's marker vs B's editor marker), so the runner's lift / cost-ratio / gate
+/// math is exercised end-to-end. Asserts the four gate scenarios, the crew-halt 0-score, the two division-by-zero guards (zero baseline score, zero baseline cost), and EvalRun persistence.
+/// </summary>
+public class CrewAbEvalRunnerTests
+{
+    private static readonly int GoldenN = CrewAbGoldenSet.Load().Count;
+
+    // Distinct markers so the judge can tell the two arms apart purely from the candidate text in its prompt.
+    private const string AMarker = "BASELINE_A_CANDIDATE";
+    private const string BMarker = "CREW_B_EDITED_CANDIDATE";
+
+    // A clean critic verdict so the crew completes through the editor (B's EditedText is non-null).
+    private const string CleanCritic =
+        """{"scores":{"factual_accuracy":5,"tone":5,"length":5,"banned_phrases":5},"issues":[]}""";
+
+    private static CrewAbEvalRunner Runner() => new(NullLogger<CrewAbEvalRunner>.Instance);
+
+    // ---- Fake generator: routes A vs the four crew specialists, with per-arm cost ------------------
+
+    /// <summary>
+    /// One fake <see cref="ILlmService"/> driving BOTH arms: <c>crew.baseline</c> emits A's marker text at
+    /// <paramref name="baselineCost"/>; each crew specialist emits its canned text, and the per-call crew cost is
+    /// <paramref name="crewCostEach"/> (4 calls → crew total ≈ 4× that). The editor emits B's marker so the judge
+    /// can score B's edited candidate.
+    /// </summary>
+    private sealed class FakeGenerator(decimal baselineCost, decimal crewCostEach) : ILlmService
+    {
+        public Task<LlmResponse> CompleteAsync(LlmRequest request, CancellationToken ct)
+        {
+            var (text, cost) = request.FeatureTag switch
+            {
+                "crew.baseline" => (AMarker, baselineCost),
+                "crew.researcher" => ("- a grounded fact\n- another fact", crewCostEach),
+                "crew.drafter" => ("A drafted candidate.", crewCostEach),
+                "crew.critic" => (CleanCritic, crewCostEach),
+                "crew.editor" => (BMarker, crewCostEach),
+                _ => throw new InvalidOperationException($"Unexpected feature tag: {request.FeatureTag}"),
+            };
+            return Task.FromResult(new LlmResponse(text, [], new LlmUsage(40, 20, cost), "fake-gen", Guid.NewGuid()));
+        }
+
+        public IAsyncEnumerable<LlmDelta> StreamAsync(LlmRequest request, CancellationToken ct) =>
+            throw new NotSupportedException();
+    }
+
+    // ---- Fake judge (through the real RubricEvaluator): canned scores per candidate marker -----------
+
+    /// <summary>
+    /// Returns a fixed 3-axis verdict for A's candidate and another for B's. A prompt containing B's marker gets
+    /// B's scores; any other prompt gets A's. The runner judges in separate calls, so each call carries exactly one candidate.
+    /// </summary>
+    private sealed class MarkerJudge(int a1, int a2, int a3, int b1, int b2, int b3) : ILlmService
+    {
+        public Task<LlmResponse> CompleteAsync(LlmRequest request, CancellationToken ct)
+        {
+            var prompt = string.Join("\n", request.Messages.Select(m => m.Content));
+            var isB = prompt.Contains(BMarker, StringComparison.Ordinal);
+            var (d1, d2, d3) = isB ? (b1, b2, b3) : (a1, a2, a3);
+            return Task.FromResult(new LlmResponse(
+                $"{{\"d1\": {d1}, \"d2\": {d2}, \"d3\": {d3}, \"rationale\": \"ok\"}}",
+                [], new LlmUsage(0, 0, 0m), "fake-judge", Guid.NewGuid()));
+        }
+
+        public IAsyncEnumerable<LlmDelta> StreamAsync(LlmRequest request, CancellationToken ct) =>
+            throw new NotSupportedException();
+    }
+
+    private sealed class RecordingAgentRunWriter : IAgentRunWriter
+    {
+        public Task WriteAsync(AgentRunRecord run, CancellationToken ct) => Task.CompletedTask;
+    }
+
+    private static FieldCrew Crew(ILlmService gen) =>
+        new(new CrewOrchestrator(),
+            new ResearcherAgent(gen),
+            new DrafterAgent(gen),
+            new CriticAgent(gen),
+            new EditorAgent(gen),
+            new RecordingAgentRunWriter());
+
+    private static BaselineFieldAgent Baseline(ILlmService gen) => new(gen);
+
+    private static CancellationToken Ct => TestContext.Current.CancellationToken;
+
+    // ---- 1. B far better, cost in budget → passes ---------------------------------------------------
+
+    [Fact]
+    public async Task RunAsync_BFarBetter_Passes()
+    {
+        // B scores 5 across the board, A scores 3 → lift = (5-3)/3 ≈ 0.667. Crew cost (4 × 0.001 = 0.004) is
+        // exactly 2× the baseline (0.002) → ratio == 2.0 ≤ gate. Both gates clear.
+        var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: 0.001m);
+        var judge = new MarkerJudge(3, 3, 3, 5, 5, 5);
+
+        var result = await Runner().RunAsync(
+            Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct);
+
+        Assert.Equal(GoldenN, result.N);
+        Assert.Equal(3.0, result.AvgA, 6);
+        Assert.Equal(5.0, result.AvgB, 6);
+        Assert.Equal((5.0 - 3.0) / 3.0, result.LiftPct, 6);
+        Assert.Equal(2.0, result.CostRatio, 6);
+        Assert.Equal(1.0, result.WinRate, 6);
+        Assert.True(result.LiftPct >= CrewAbEvalRunner.LiftGate);
+        Assert.True(result.CostRatio <= CrewAbEvalRunner.CostRatioGate);
+        Assert.True(result.Passed);
+    }
+
+    // ---- 2. B not better (A == B) → fails on lift ---------------------------------------------------
+
+    [Fact]
+    public async Task RunAsync_BNotBetter_Fails()
+    {
+        var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: 0.0005m); // crew cheaper, cost gate fine
+        var judge = new MarkerJudge(4, 4, 4, 4, 4, 4); // identical → lift 0
+
+        var result = await Runner().RunAsync(
+            Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct);
+
+        Assert.Equal(0.0, result.LiftPct, 6);
+        Assert.Equal(0.0, result.WinRate, 6);
+        Assert.True(result.LiftPct < CrewAbEvalRunner.LiftGate);
+        Assert.False(result.Passed); // cost is fine, but no lift → gate fails
+    }
+
+    // ---- 3. B better but too expensive → fails on cost (independent of lift) ------------------------
+
+    [Fact]
+    public async Task RunAsync_BBetterButTooExpensive_Fails()
+    {
+        // B clearly better (lift would pass) but the crew costs > 2× the baseline → cost gate fails alone.
+        var gen = new FakeGenerator(baselineCost: 0.001m, crewCostEach: 0.001m); // crew total 0.004 = 4× baseline
+        var judge = new MarkerJudge(3, 3, 3, 5, 5, 5);
+
+        var result = await Runner().RunAsync(
+            Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct);
+
+        Assert.True(result.LiftPct >= CrewAbEvalRunner.LiftGate); // lift alone would pass
+        Assert.Equal(4.0, result.CostRatio, 6);
+        Assert.True(result.CostRatio > CrewAbEvalRunner.CostRatioGate);
+        Assert.False(result.Passed); // cost gate is independent — fails despite the lift
+    }
+
+    // ---- 4. B worse → negative lift, fails ----------------------------------------------------------
+
+    [Fact]
+    public async Task RunAsync_BWorse_NegativeLift_Fails()
+    {
+        var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: 0.0005m);
+        var judge = new MarkerJudge(5, 5, 5, 2, 2, 2); // A beats B
+
+        var result = await Runner().RunAsync(
+            Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct);
+
+        Assert.True(result.LiftPct < 0);
+        Assert.Equal((2.0 - 5.0) / 5.0, result.LiftPct, 6);
+        Assert.Equal(0.0, result.WinRate, 6);
+        Assert.False(result.Passed);
+    }
+
+    // ---- 5. Crew halted (null EditedText) → B scores 0 for that case --------------------------------
+
+    [Fact]
+    public async Task RunAsync_CrewHalted_NullEditedText_ScoresZero()
+    {
+        // Crew cost per call busts the per-field cap → the orchestrator halts after research, before the editor,
+        // so EditedText is null and the runner scores B = 0 WITHOUT calling the judge for B. A still scores 3.
+        var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: AutoPublishCrew.CostCapUsd + 0.01m);
+        var judge = new MarkerJudge(3, 3, 3, 5, 5, 5); // B markers never reach the judge (no edited text)
+
+        var result = await Runner().RunAsync(
+            Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct);
+
+        Assert.Equal(3.0, result.AvgA, 6);
+        Assert.Equal(0.0, result.AvgB, 6); // every B halted → 0
+        Assert.All(result.Cases, c => Assert.Equal(0.0, c.JudgeScoreB, 6));
+        Assert.True(result.LiftPct < 0); // (0-3)/3 = -1
+        Assert.False(result.Passed);
+    }
+
+    // ---- 5b. Zero baseline judge score → lift guarded to 0 (no div-by-zero), gate fails -------------
+
+    [Fact]
+    public async Task RunAsync_ZeroBaselineScore_LiftGuardedToZero_Fails()
+    {
+        // avgA == 0 (judge scores A all-zeros). liftPct must be guarded to 0, NOT NaN/+inf, and the gate must
+        // fail — a zero baseline is not a free pass for the crew.
+        var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: 0.0005m);
+        var judge = new MarkerJudge(0, 0, 0, 5, 5, 5);
+
+        var result = await Runner().RunAsync(
+            Baseline(gen), Crew(gen), judge, "judge-test", persist: false, db: null, gitSha: null, Ct);
+
+        Assert.Equal(0.0, result.AvgA, 6);
+        Assert.Equal(5.0, result.AvgB, 6);
+        Assert.Equal(0.0, result.LiftPct, 6); // guarded, not (5-0)/0
+        Assert.False(double.IsNaN(result.LiftPct));
+        Assert.False(double.IsInfinity(result.LiftPct));
+        Assert.False(result.Passed);
+    }
+
+    // ---- 5c. Zero baseline spend → costRatio = +inf (no div-by-zero), gate fails, JSON null --------
+
+    [Fact]
+    public async Task RunAsync_ZeroBaselineCost_CostRatioInfinite_Fails()
+    {
+        // sumCostA == 0 (baseline reports zero cost). costRatio must be +inf (not NaN/0), the gate must FAIL
+        // (a free baseline makes any crew spend an infinite multiple), and the persisted JSON must stay
+        // parseable — +inf serializes to JSON null, never a bare `Infinity` literal.
+        var db = new CapturingDb();
+        var gen = new FakeGenerator(baselineCost: 0m, crewCostEach: 0.001m);
+        var judge = new MarkerJudge(3, 3, 3, 5, 5, 5); // lift alone would pass
+
+        var result = await Runner().RunAsync(
+            Baseline(gen), Crew(gen), judge, "judge-test", persist: true, db, gitSha: null, Ct);
+
+        Assert.True(result.LiftPct >= CrewAbEvalRunner.LiftGate); // lift would pass on its own
+        Assert.True(double.IsPositiveInfinity(result.CostRatio));
+        Assert.False(result.Passed); // cost gate fails on infinite ratio
+
+        var run = Assert.Single(db.Added);
+        using var doc = JsonDocument.Parse(run.BreakdownJson!); // must not throw on +inf
+        Assert.Equal(JsonValueKind.Null, doc.RootElement.GetProperty("costRatio").ValueKind);
+    }
+
+    // ---- 6. Persistence ----------------------------------------------------------------------------
+
+    [Fact]
+    public async Task RunAsync_Persist_WritesCrewAbEvalRun()
+    {
+        var db = new CapturingDb();
+        var gen = new FakeGenerator(baselineCost: 0.002m, crewCostEach: 0.001m);
+        var judge = new MarkerJudge(3, 3, 3, 5, 5, 5);
+
+        var result = await Runner().RunAsync(
+            Baseline(gen), Crew(gen), judge, "judge-model-x", persist: true, db, gitSha: "abc123", Ct);
+
+        var run = Assert.Single(db.Added);
+        Assert.Equal("crew_ab", run.Feature);
+        Assert.Equal("judge-model-x", run.JudgeModelId);
+        Assert.Equal("abc123", run.GitSha);
+        Assert.Equal(GoldenN, run.N);
+        Assert.Equal(Math.Round((decimal)result.LiftPct, 3), run.Score);
+
+        using var doc = JsonDocument.Parse(run.BreakdownJson!);
+        var root = doc.RootElement;
+        Assert.True(root.TryGetProperty("avgA", out _));
+        Assert.True(root.TryGetProperty("avgB", out _));
+        Assert.True(root.TryGetProperty("liftPct", out _));
+        Assert.True(root.TryGetProperty("costRatio", out _));
+        Assert.True(root.TryGetProperty("winRate", out _));
+        Assert.Equal(GoldenN, root.GetProperty("n").GetInt32());
+
+        var perFixture = root.GetProperty("perFixture");
+        Assert.Equal(GoldenN, perFixture.GetArrayLength());
+        var first = perFixture[0];
+        Assert.True(first.TryGetProperty("id", out _));
+        Assert.True(first.TryGetProperty("scoreA", out _));
+        Assert.True(first.TryGetProperty("scoreB", out _));
+        Assert.True(first.TryGetProperty("bWins", out _));
+    }
+}