From 00f70a8657a0fb13b6da6a530532478bb11d5e0e Mon Sep 17 00:00:00 2001
From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com>
Date: Thu, 11 Jun 2026 17:07:07 +0200
Subject: [PATCH 1/7] Add shared/precomputed baseline reuse to skill-validator
 evaluate (#751)

Add --baseline-out and --baseline-from options to the evaluate command so the
no-skill/no-agent baseline arm can be computed once and reused as a shared
control group across multiple skill/agent evaluations. This eliminates redundant
baseline runs and removes baseline run-to-run variance from cross-config
comparisons.

- New BaselineStore + BaselineFile/BaselineScenarioEntry models, keyed per
  scenario on SHA-256(prompt) with the run count stored per entry and a header
  recording schema version, model, validator version and creation time. Load
  validates version + model; missing scenarios fail fast before evaluation.
- Register the new serializable types in SkillValidatorJsonContext (AOT
  source-gen).
- Wire two mutually-exclusive CLI options into ValidatorConfig; thread an
  optional BaselineStore through both execution paths.
- On reuse, skip the baseline agent run, its assertions/constraints/
  task-completion/judging, and attribute no extra pairwise tokens to the
  baseline; report the scenario with the baseline-reused session phase and a
  reused status. In write mode, record each scenario's averaged baseline and
  persist it after the run.
- Add unit tests for BaselineStore and document the feature in README and
  InvestigatingResults.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/Evaluate/BaselineStore.cs             | 145 +++++++++
 .../src/Evaluate/EvaluateCommand.cs           | 304 ++++++++++++++----
 eng/skill-validator/src/Evaluate/Models.cs    |   6 +
 eng/skill-validator/src/README.md             |  19 ++
 .../src/SkillValidatorJsonContext.cs          |   2 +
 .../src/docs/InvestigatingResults.md          |   2 +
 .../tests/Evaluate/BaselineStoreTests.cs      | 151 +++++++++
 7 files changed, 560 insertions(+), 69 deletions(-)
 create mode 100644 eng/skill-validator/src/Evaluate/BaselineStore.cs
 create mode 100644 eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs
new file mode 100644
index 0000000000..0e3e143065
--- /dev/null
+++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs
@@ -0,0 +1,145 @@
+using System.Collections.Concurrent;
+using System.Security.Cryptography;
+using System.Text;
+using System.Text.Json;
+
+namespace SkillValidator.Evaluate;
+
+/// <summary>
+/// One scenario's precomputed baseline.  <see cref="PromptSha"/> (SHA-256 of the prompt) is
+/// the lookup key; within the store <see cref="Name"/> only orders entries when saving.
+/// <see cref="Runs"/> is the run count reported when <see cref="Baseline"/> was recorded.
+/// </summary>
+public sealed record BaselineScenarioEntry(
+    string Name,
+    string PromptSha,
+    int Runs,
+    RunResult Baseline);
+
+/// <summary>
+/// On-disk format written by <c>--baseline-out</c> and read by <c>--baseline-from</c>.
+/// The baseline arm runs with no skill, agent or MCP server attached, so one file can be shared.
+/// <see cref="Version"/> and <see cref="Model"/> are validated on load; the other header fields are not.
+/// NOTE(review): entries match on prompt hash alone — confirm rubric/setup edits cannot reuse a stale entry.
+/// </summary>
+public sealed record BaselineFile(
+    int Version,
+    string Model,
+    string? ValidatorVersion,
+    string CreatedAt,
+    IReadOnlyList<BaselineScenarioEntry> Scenarios);
+
+/// <summary>
+/// Manages a precomputed, shared baseline across <c>evaluate</c> invocations.
+/// In write mode (<c>--baseline-out</c>) it only accumulates each scenario's averaged
+/// baseline for later persistence.  In reuse mode (<c>--baseline-from</c>) it serves
+/// cached baselines in place of freshly executed baseline runs.
+/// </summary>
+internal sealed class BaselineStore
+{
+    /// <summary>Current on-disk schema version.</summary>
+    public const int CurrentVersion = 1;
+
+    private readonly ConcurrentDictionary<string, BaselineScenarioEntry> _entries = new(StringComparer.Ordinal);
+    private readonly string _model;
+
+    /// <summary>True when serving cached baselines (<c>--baseline-from</c>).</summary>
+    public bool IsReuse { get; }
+
+    private BaselineStore(string model, bool isReuse)
+    {
+        _model = model;
+        IsReuse = isReuse;
+    }
+
+    /// <summary>Create a store that accumulates baselines for later persistence.</summary>
+    public static BaselineStore ForWrite(string model) => new(model, isReuse: false);
+
+    /// <summary>
+    /// Load a baseline file for reuse.  Throws <see cref="FileNotFoundException"/> when the file
+    /// is absent and <see cref="InvalidOperationException"/> when it is unreadable, not valid JSON,
+    /// or was written with a different schema version or model.  Entries without a baseline or
+    /// prompt hash are skipped; coverage is checked later via <see cref="FindMissingScenarios"/>.
+    /// </summary>
+    public static BaselineStore Load(string path, string expectedModel)
+    {
+        if (!File.Exists(path))
+            throw new FileNotFoundException($"Baseline file not found: {path}");
+
+        BaselineFile? file;
+        try
+        {
+            file = JsonSerializer.Deserialize(File.ReadAllText(path), SkillValidatorJsonContext.Default.BaselineFile);
+        }
+        catch (Exception ex) when (ex is JsonException or IOException or UnauthorizedAccessException)
+        {
+            throw new InvalidOperationException($"Baseline file '{path}' {(ex is JsonException ? "is not valid JSON" : "could not be read")}: {ex.Message}", ex);
+        }
+
+        if (file is null)
+            throw new InvalidOperationException($"Baseline file '{path}' is empty.");
+        if (file.Version != CurrentVersion)
+            throw new InvalidOperationException(
+                $"Baseline file '{path}' has unsupported version {file.Version} (expected {CurrentVersion}). Recompute it with --baseline-out.");
+        if (!string.Equals(file.Model, expectedModel, StringComparison.Ordinal))
+            throw new InvalidOperationException(
+                $"Baseline file '{path}' was computed for model '{file.Model}' but evaluation uses model '{expectedModel}'. " +
+                "Recompute the baseline with --baseline-out for the new model.");
+
+        var store = new BaselineStore(expectedModel, isReuse: true);
+        foreach (var entry in file.Scenarios ?? []) // a missing "scenarios" property deserializes to null
+        {
+            if (entry is { Baseline: not null, PromptSha: not null })
+                store._entries[entry.PromptSha] = entry;
+        }
+        return store;
+    }
+
+    /// <summary>SHA-256 (lower-case hex) of the scenario prompt — the per-scenario reuse key.</summary>
+    public static string ComputePromptSha(string prompt)
+    {
+        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(prompt));
+        return Convert.ToHexString(bytes).ToLowerInvariant();
+    }
+
+    /// <summary>
+    /// In reuse mode, return the names of scenarios that have no matching cached
+    /// baseline (keyed by prompt hash).  Empty when every scenario is covered.
+    /// </summary>
+    public IReadOnlyList<string> FindMissingScenarios(IEnumerable<EvalScenario> scenarios) =>
+        scenarios
+            .Where(s => !_entries.ContainsKey(ComputePromptSha(s.Prompt)))
+            .Select(s => s.Name)
+            .ToList();
+
+    /// <summary>Get the cached averaged baseline for a scenario; null when absent or when the store is not in reuse mode.</summary>
+    public RunResult? TryGetBaseline(EvalScenario scenario) =>
+        IsReuse && _entries.TryGetValue(ComputePromptSha(scenario.Prompt), out var entry) ? entry.Baseline : null;
+
+    /// <summary>Record a scenario's averaged baseline for later persistence (write mode).</summary>
+    public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline)
+    {
+        var sha = ComputePromptSha(scenario.Prompt);
+        _entries[sha] = new BaselineScenarioEntry(scenario.Name, sha, runs, averagedBaseline);
+    }
+
+    /// <summary>Serialize all recorded baselines to <paramref name="path"/>, ordered by name then prompt hash; creates the directory if needed.</summary>
+    public void Save(string path)
+    {
+        var file = new BaselineFile(
+            Version: CurrentVersion,
+            Model: _model,
+            ValidatorVersion: typeof(BaselineStore).Assembly.GetName().Version?.ToString(),
+            CreatedAt: DateTime.UtcNow.ToString("o"),
+            Scenarios: _entries.Values.OrderBy(e => e.Name, StringComparer.Ordinal).ThenBy(e => e.PromptSha, StringComparer.Ordinal).ToList());
+
+        var dir = Path.GetDirectoryName(Path.GetFullPath(path));
+        if (!string.IsNullOrEmpty(dir))
+            Directory.CreateDirectory(dir);
+
+        File.WriteAllText(path, JsonSerializer.Serialize(file, SkillValidatorJsonContext.Default.BaselineFile));
+    }
+
+    /// <summary>Number of baselines currently held.</summary>
+    public int Count => _entries.Count;
+}
diff --git a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
index f80987fa3c..d8f054dd9f 100644
--- a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
+++ b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
@@ -33,6 +33,8 @@ public static Command Create()
         var noiseSkillsDirOpt = new Option<string?>("--noise-skills-dir") { Description = "Directory containing skills to load as noise. Enables the noise test: re-runs scenarios with all noise skills loaded and measures degradation." };
         var noiseMaxDegradationOpt = new Option<double>("--noise-max-degradation") { Description = "Maximum acceptable average quality degradation (0-1) in noise test (only positive degradations count)", DefaultValueFactory = _ => 0.2 };
         var noiseMaxScenarioDegradationOpt = new Option<double>("--noise-max-scenario-degradation") { Description = "Maximum acceptable quality degradation (0-1) for any single noise-test scenario", DefaultValueFactory = _ => 0.4 };
+        var baselineOutOpt = new Option<string?>("--baseline-out") { Description = "After running, persist each scenario's averaged baseline (no-skill/no-agent reference) to this file for later reuse with --baseline-from." };
+        var baselineFromOpt = new Option<string?>("--baseline-from") { Description = "Reuse a precomputed baseline from this file instead of re-running the no-skill/no-agent baseline arm. Must match --model and the scenario prompts. Mutually exclusive with --baseline-out." };
 
         var command = new Command("evaluate", "Evaluate agent skills via LLM-based testing")
         {
@@ -59,6 +61,8 @@ public static Command Create()
             noiseSkillsDirOpt,
             noiseMaxDegradationOpt,
             noiseMaxScenarioDegradationOpt,
+            baselineOutOpt,
+            baselineFromOpt,
         };
 
         command.Add(RejudgeCommand.Create());
@@ -110,6 +114,8 @@ public static Command Create()
                 NoiseSkillsDir = parseResult.GetValue(noiseSkillsDirOpt),
                 NoiseDegradationLimit = parseResult.GetValue(noiseMaxDegradationOpt),
                 NoiseMaxScenarioDegradation = parseResult.GetValue(noiseMaxScenarioDegradationOpt),
+                BaselineOut = parseResult.GetValue(baselineOutOpt),
+                BaselineFrom = parseResult.GetValue(baselineFromOpt),
             };
 
             return await Run(config, cancellationToken);
@@ -129,6 +135,14 @@ public static Command Create()
 
     public static async Task<int> Run(ValidatorConfig config, CancellationToken cancellationToken = default)
     {
+        // --baseline-out and --baseline-from are mutually exclusive: one writes a
+        // shared baseline, the other consumes one.
+        if (config.BaselineOut is not null && config.BaselineFrom is not null)
+        {
+            Console.Error.WriteLine("--baseline-out and --baseline-from cannot be used together.");
+            return 1;
+        }
+
         // Validate model early
         try
         {
@@ -290,6 +304,42 @@ public static async Task<int> Run(ValidatorConfig config, CancellationToken canc
         bool usePairwise = config.JudgeMode is JudgeMode.Pairwise or JudgeMode.Both;
         bool effectiveKeepSessions = config.KeepSessions && config.ResultsDir is not null;
 
+        // Set up shared-baseline reuse/persistence.
+        BaselineStore? baselineStore = null;
+        if (config.BaselineFrom is not null)
+        {
+            try
+            {
+                baselineStore = BaselineStore.Load(config.BaselineFrom, config.Model);
+            }
+            catch (Exception ex) when (ex is FileNotFoundException or InvalidOperationException)
+            {
+                Console.Error.WriteLine($"{Ansi.Red}❌ Failed to load baseline from '{config.BaselineFrom}': {ex.Message}{Ansi.Reset}");
+                return 1;
+            }
+
+            // Fail fast if any scenario lacks a matching cached baseline so a stale or
+            // incomplete baseline can never silently skew results.
+            var allScenarios = allTargets
+                .Where(t => t.EvalConfig is not null)
+                .SelectMany(t => t.EvalConfig!.Scenarios)
+                .ToList();
+            var missing = baselineStore.FindMissingScenarios(allScenarios);
+            if (missing.Count > 0)
+            {
+                Console.Error.WriteLine(
+                    $"{Ansi.Red}❌ Baseline file '{config.BaselineFrom}' has no entry for scenario(s): {string.Join(", ", missing.Distinct())}. " +
+                    $"Recompute the baseline with --baseline-out for the current tests and model.{Ansi.Reset}");
+                return 1;
+            }
+            Console.WriteLine($"Reusing precomputed baseline from {config.BaselineFrom} ({baselineStore.Count} scenario(s)).");
+        }
+        else if (config.BaselineOut is not null)
+        {
+            baselineStore = BaselineStore.ForWrite(config.Model);
+            Console.WriteLine($"Baseline will be persisted to {config.BaselineOut} after the run.");
+        }
+
         string? sessionsDir = null;
         SessionDatabase? sessionDb = null;
         string? timestampedResultsDir = null;
@@ -314,7 +364,7 @@ public static async Task<int> Run(ValidatorConfig config, CancellationToken canc
         // Evaluate all targets (skills and agents)
         spinner.Start($"Evaluating {allTargets.Count} target(s)...");
         var skillTasks = allTargets.Select(target =>
-            skillLimit.RunAsync(() => EvaluateTarget(target, config, usePairwise, spinner, noiseEvalSkills, sessionsDir, sessionDb, cancellationToken), cancellationToken));
+            skillLimit.RunAsync(() => EvaluateTarget(target, config, usePairwise, spinner, noiseEvalSkills, sessionsDir, sessionDb, baselineStore, cancellationToken), cancellationToken));
         var settled = await Task.WhenAll(skillTasks.Select(async t =>
         {
             try { return (Result: await t, Error: (Exception?)null); }
@@ -353,6 +403,28 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
         await AgentRunner.CleanupWorkDirs(effectiveKeepSessions);
         sessionDb?.Dispose();
 
+        // Persist the shared baseline for later reuse with --baseline-from.
+        if (config.BaselineOut is not null && baselineStore is not null)
+        {
+            if (baselineStore.Count > 0)
+            {
+                try
+                {
+                    baselineStore.Save(config.BaselineOut);
+                    Console.WriteLine($"Baseline written to {config.BaselineOut} ({baselineStore.Count} scenario(s)).");
+                }
+                catch (Exception ex)
+                {
+                    Console.Error.WriteLine($"{Ansi.Red}❌ Failed to write baseline to '{config.BaselineOut}': {ex.Message}{Ansi.Reset}");
+                    return 1;
+                }
+            }
+            else
+            {
+                Console.Error.WriteLine($"{Ansi.Yellow}⚠  No baselines were produced; nothing written to {config.BaselineOut}.{Ansi.Reset}");
+            }
+        }
+
         // Always fail on execution errors, even in --verdict-warn-only mode
         if (rejectionMessages.Count > 0) return 1;
 
@@ -380,16 +452,17 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
         IReadOnlyList<EvalSkillInfo> noiseSkills,
         string? sessionsDir,
         SessionDatabase? sessionDb,
+        BaselineStore? baselineStore,
         CancellationToken cancellationToken)
     {
         if (target.Kind == EvalTargetKind.Skill && target.Skill is not null)
         {
             var evalSkill = new EvalSkillInfo(target.Skill, target.EvalPath, target.EvalConfig, target.McpServers);
-            return await EvaluateSkill(evalSkill, config, usePairwise, spinner, noiseSkills, sessionsDir, sessionDb, cancellationToken);
+            return await EvaluateSkill(evalSkill, config, usePairwise, spinner, noiseSkills, sessionsDir, sessionDb, baselineStore, cancellationToken);
         }
         else if (target.Kind == EvalTargetKind.Agent && target.Agent is not null)
         {
-            return await EvaluateAgent(target, config, usePairwise, spinner, sessionsDir, sessionDb, cancellationToken);
+            return await EvaluateAgent(target, config, usePairwise, spinner, sessionsDir, sessionDb, baselineStore, cancellationToken);
         }
         return null;
     }
@@ -405,6 +478,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
         Spinner spinner,
         string? sessionsDir,
         SessionDatabase? sessionDb,
+        BaselineStore? baselineStore,
         CancellationToken cancellationToken)
     {
         var agent = target.Agent!;
@@ -457,7 +531,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
             {
                 try
                 {
-                    return await ExecuteAgentScenario(scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, cancellationToken);
+                    return await ExecuteAgentScenario(scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, baselineStore, cancellationToken);
                 }
                 catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
                 {
@@ -516,6 +590,7 @@ private static async Task<ScenarioComparison> ExecuteAgentScenario(
         string? sessionsDir,
         SessionDatabase? sessionDb,
         string? targetSha,
+        BaselineStore? baselineStore,
         CancellationToken cancellationToken)
     {
         var agent = target.Agent!;
@@ -535,7 +610,7 @@ private static async Task<ScenarioComparison> ExecuteAgentScenario(
             {
                 try
                 {
-                    return (Result: await ExecuteAgentRun(i, scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, cancellationToken), Error: (Exception?)null);
+                    return (Result: await ExecuteAgentRun(i, scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, baselineStore, cancellationToken), Error: (Exception?)null);
                 }
                 catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
                 {
@@ -580,6 +655,10 @@ private static async Task<ScenarioComparison> ExecuteAgentScenario(
         var avgIsolated = AverageResults(isolatedRuns);
         var avgPlugin = AverageResults(pluginRuns);
 
+        // Persist the averaged baseline (skill/agent-independent) for shared reuse.
+        if (baselineStore is { IsReuse: false })
+            baselineStore.Record(scenario, runResults.Length, avgBaseline);
+
         int bestPairwiseIdx = -1;
         for (int i = 0; i < perRunPairwise.Count; i++)
         {
@@ -669,6 +748,7 @@ private static async Task<RunExecutionResult> ExecuteAgentRun(
         string? sessionsDir,
         SessionDatabase? sessionDb,
         string? targetSha,
+        BaselineStore? baselineStore,
         CancellationToken cancellationToken)
     {
         var agent = target.Agent!;
@@ -690,8 +770,12 @@ private static async Task<RunExecutionResult> ExecuteAgentRun(
         var pluginConfigDir = sessionsDir is not null ? Path.Combine("sessions", pluginSessionId) : null;
         var rubricJson = JsonSerializer.Serialize(scenario.Rubric?.ToArray() ?? [], SkillValidatorJsonContext.Default.StringArray);
 
+        // Reuse a precomputed shared baseline when available (--baseline-from). The
+        // baseline arm is agent-independent, so this skips a redundant agent run.
+        var reusedBaseline = baselineStore?.TryGetBaseline(scenario);
+
         sessionDb?.RegisterSession(baselineSessionId, agent.Name, agent.Path, scenario.Name, runIndex,
-            "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, targetSha, rubricJson);
+            reusedBaseline is not null ? "baseline-reused" : "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, targetSha, rubricJson);
         sessionDb?.RegisterSession(isolatedSessionId, agent.Name, agent.Path, scenario.Name, runIndex,
             "with-agent-isolated", config.Model, isolatedConfigDir, null, scenario.Prompt, targetSha, rubricJson);
         sessionDb?.RegisterSession(pluginSessionId, agent.Name, agent.Path, scenario.Name, runIndex,
@@ -706,25 +790,41 @@ private static async Task<RunExecutionResult> ExecuteAgentRun(
             additionalAgents = await ResolveAdditionalAgents(scenario.Setup.AdditionalRequiredAgents, pluginRoot);
         }
 
-        var agentTasks = await Task.WhenAll(
+        // 2. Agent-isolated: target agent only (+ scenario deps)
+        var isolatedTask = AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
+            PluginRoot: null, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir,
+            SessionId: isolatedSessionId, Agent: agent, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken);
+        // 3. Agent-plugin: full plugin context + agent selected
+        var pluginTask = AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
+            PluginRoot: pluginRoot, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir,
+            SessionId: pluginSessionId, Agent: agent), cancellationToken);
+
+        RunMetrics baselineMetrics;
+        RunMetrics isolatedMetrics;
+        RunMetrics pluginMetrics;
+        if (reusedBaseline is not null)
+        {
+            if (config.Verbose)
+                runLog("↩︎ reusing precomputed baseline");
+            baselineMetrics = reusedBaseline.Metrics;
+            var skilled = await Task.WhenAll(isolatedTask, pluginTask);
+            isolatedMetrics = skilled[0];
+            pluginMetrics = skilled[1];
+        }
+        else
+        {
             // 1. Baseline: no agent, no skills — vanilla
-            AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
-                PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken),
-            // 2. Agent-isolated: target agent only (+ scenario deps)
-            AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
-                PluginRoot: null, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir,
-                SessionId: isolatedSessionId, Agent: agent, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken),
-            // 3. Agent-plugin: full plugin context + agent selected
-            AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
-                PluginRoot: pluginRoot, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir,
-                SessionId: pluginSessionId, Agent: agent), cancellationToken));
-        var baselineMetrics = agentTasks[0];
-        var isolatedMetrics = agentTasks[1];
-        var pluginMetrics = agentTasks[2];
+            var baselineTask = AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
+                PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken);
+            var all = await Task.WhenAll(baselineTask, isolatedTask, pluginTask);
+            baselineMetrics = all[0];
+            isolatedMetrics = all[1];
+            pluginMetrics = all[2];
+        }
 
         if (sessionDb is not null)
         {
-            sessionDb.CompleteSession(baselineSessionId, baselineMetrics.TimedOut ? "timed_out" : "completed",
+            sessionDb.CompleteSession(baselineSessionId, reusedBaseline is not null ? "reused" : (baselineMetrics.TimedOut ? "timed_out" : "completed"),
                 JsonSerializer.Serialize(baselineMetrics, SkillValidatorJsonContext.Default.RunMetrics));
             sessionDb.CompleteSession(isolatedSessionId, isolatedMetrics.TimedOut ? "timed_out" : "completed",
                 JsonSerializer.Serialize(isolatedMetrics, SkillValidatorJsonContext.Default.RunMetrics));
@@ -732,43 +832,58 @@ private static async Task<RunExecutionResult> ExecuteAgentRun(
                 JsonSerializer.Serialize(pluginMetrics, SkillValidatorJsonContext.Default.RunMetrics));
         }
 
-        // Assertions, constraints, task completion, judging — same as skills
+        // Assertions, constraints, task completion, judging — same as skills.
+        // Baseline arm is skipped when reused (its results are cached).
         if (scenario.Assertions is { Count: > 0 })
         {
-            baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout);
+            if (reusedBaseline is null)
+                baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout);
             isolatedMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, isolatedMetrics.AgentOutput, isolatedMetrics.WorkDir, scenario.Timeout);
             pluginMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, pluginMetrics.AgentOutput, pluginMetrics.WorkDir, scenario.Timeout);
         }
 
-        var baselineConstraints = AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics);
+        var baselineConstraints = reusedBaseline is null ? AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics) : [];
         var isolatedConstraints = AssertionEvaluator.EvaluateConstraints(scenario, isolatedMetrics);
         var pluginConstraints = AssertionEvaluator.EvaluateConstraints(scenario, pluginMetrics);
-        baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints];
+        if (reusedBaseline is null)
+            baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints];
         isolatedMetrics.AssertionResults = [..isolatedMetrics.AssertionResults, ..isolatedConstraints];
         pluginMetrics.AssertionResults = [..pluginMetrics.AssertionResults, ..pluginConstraints];
 
-        if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0)
+        if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0 || isolatedConstraints.Count > 0 || pluginConstraints.Count > 0)
         {
-            baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed);
+            if (reusedBaseline is null)
+                baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed);
             isolatedMetrics.TaskCompleted = isolatedMetrics.AssertionResults.All(a => a.Passed);
             pluginMetrics.TaskCompleted = pluginMetrics.AssertionResults.All(a => a.Passed);
         }
         else
         {
-            baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0;
+            if (reusedBaseline is null)
+                baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0;
             isolatedMetrics.TaskCompleted = isolatedMetrics.ErrorCount == 0;
             pluginMetrics.TaskCompleted = pluginMetrics.ErrorCount == 0;
         }
 
-        var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, agent.Path);
+        var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, isolatedMetrics.WorkDir, agent.Path);
 
-        var (baselineJudge, baselineJudgeTokens) = await SafeJudge(Judge.JudgeRun(scenario, baselineMetrics, judgeOpts, runLog, cancellationToken), "baseline", runLog);
+        JudgeResult baselineJudge;
+        if (reusedBaseline is not null)
+        {
+            baselineJudge = reusedBaseline.JudgeResult;
+        }
+        else
+        {
+            var (judged, baselineJudgeTokens) = await SafeJudge(Judge.JudgeRun(
+                scenario, baselineMetrics, judgeOpts with { WorkDir = baselineMetrics.WorkDir }, runLog, cancellationToken), "baseline", runLog);
+            baselineJudge = judged;
+            AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens);
+        }
         var (isolatedJudge, isolatedJudgeTokens) = await SafeJudge(Judge.JudgeRun(
             scenario, isolatedMetrics, judgeOpts with { WorkDir = isolatedMetrics.WorkDir }, runLog, cancellationToken), "isolated", runLog);
         var (pluginJudge, pluginJudgeTokens) = await SafeJudge(Judge.JudgeRun(
             scenario, pluginMetrics, judgeOpts with { WorkDir = pluginMetrics.WorkDir }, runLog, cancellationToken), "plugin", runLog);
 
-        AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens);
         AccumulateJudgeTokens(isolatedMetrics, isolatedJudgeTokens);
         AccumulateJudgeTokens(pluginMetrics, pluginJudgeTokens);
 
@@ -785,12 +900,16 @@ private static async Task<RunExecutionResult> ExecuteAgentRun(
             var worseSkilled = pairwiseFromPlugin ? pluginMetrics : isolatedMetrics;
             try
             {
+                // Reused baseline work dir no longer exists; run the judge in the skilled
+                // run's work dir (judge reads only the provided metrics text).
+                var pairwiseWorkDir = reusedBaseline is not null ? worseSkilled.WorkDir : baselineMetrics.WorkDir;
                 var (pairwiseResult, pairwiseTokens) = await PairwiseJudge.Judge(
                     scenario, baselineMetrics, worseSkilled,
-                    new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, agent.Path, worseSkilled.WorkDir),
+                    new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, pairwiseWorkDir, agent.Path, worseSkilled.WorkDir),
                     runLog, cancellationToken);
                 pairwise = pairwiseResult;
-                AccumulateJudgeTokens(baselineMetrics, pairwiseTokens);
+                if (reusedBaseline is null)
+                    AccumulateJudgeTokens(baselineMetrics, pairwiseTokens);
                 AccumulateJudgeTokens(worseSkilled, pairwiseTokens);
             }
             catch (Exception error)
@@ -827,6 +946,7 @@ private static async Task<RunExecutionResult> ExecuteAgentRun(
         IReadOnlyList<EvalSkillInfo> noiseSkills,
         string? sessionsDir,
         SessionDatabase? sessionDb,
+        BaselineStore? baselineStore,
         CancellationToken cancellationToken)
     {
         var skill = evalSkill.Skill;
@@ -896,7 +1016,7 @@ private static async Task<RunExecutionResult> ExecuteAgentRun(
             {
                 try
                 {
-                    return await ExecuteScenario(scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, cancellationToken);
+                    return await ExecuteScenario(scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, baselineStore, cancellationToken);
                 }
                 catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
                 {
@@ -989,6 +1109,7 @@ private static async Task<ScenarioComparison> ExecuteScenario(
         string? sessionsDir,
         SessionDatabase? sessionDb,
         string? skillSha,
+        BaselineStore? baselineStore,
         CancellationToken cancellationToken)
     {
         var skill = evalSkill.Skill;
@@ -1008,7 +1129,7 @@ private static async Task<ScenarioComparison> ExecuteScenario(
             {
                 try
                 {
-                    return (Result: await ExecuteRun(i, scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, cancellationToken), Error: (Exception?)null);
+                    return (Result: await ExecuteRun(i, scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, baselineStore, cancellationToken), Error: (Exception?)null);
                 }
                 catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
                 {
@@ -1056,6 +1177,9 @@ private static async Task<ScenarioComparison> ExecuteScenario(
         var avgBaseline = AverageResults(baselineRuns);
         var avgIsolated = AverageResults(isolatedRuns);
         var avgPlugin = AverageResults(pluginRuns);
+        // Persist the averaged baseline (skill/agent-independent) for shared reuse.
+        if (baselineStore is { IsReuse: false })
+            baselineStore.Record(scenario, runResults.Length, avgBaseline);
         // Select the best pairwise result and track which run it came from
         int bestPairwiseIdx = -1;
         for (int i = 0; i < perRunPairwise.Count; i++)
@@ -1163,6 +1287,7 @@ private static async Task<RunExecutionResult> ExecuteRun(
         string? sessionsDir,
         SessionDatabase? sessionDb,
         string? skillSha,
+        BaselineStore? baselineStore,
         CancellationToken cancellationToken)
     {
         var skill = evalSkill.Skill;
@@ -1184,8 +1309,12 @@ private static async Task<RunExecutionResult> ExecuteRun(
         var pluginConfigDir = sessionsDir is not null ? Path.Combine("sessions", pluginSessionId) : null;
         var rubricJson = JsonSerializer.Serialize(scenario.Rubric?.ToArray() ?? [], SkillValidatorJsonContext.Default.StringArray);
 
+        // Reuse a precomputed shared baseline when available (--baseline-from). The
+        // baseline arm is skill-independent, so this skips a redundant agent run.
+        var reusedBaseline = baselineStore?.TryGetBaseline(scenario);
+
         sessionDb?.RegisterSession(baselineSessionId, skill.Name, skill.Path, scenario.Name, runIndex,
-            "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, skillSha, rubricJson);
+            reusedBaseline is not null ? "baseline-reused" : "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, skillSha, rubricJson);
         sessionDb?.RegisterSession(isolatedSessionId, skill.Name, skill.Path, scenario.Name, runIndex,
             "with-skill-isolated", config.Model, isolatedConfigDir, null, scenario.Prompt, skillSha, rubricJson);
         sessionDb?.RegisterSession(pluginSessionId, skill.Name, skill.Path, scenario.Name, runIndex,
@@ -1200,24 +1329,40 @@ private static async Task<RunExecutionResult> ExecuteRun(
             additionalAgents = await ResolveAdditionalAgents(scenario.Setup.AdditionalRequiredAgents, pluginRoot);
         }
 
-        var agentTasks = await Task.WhenAll(
+        // 2. Skilled-isolated: target skill + declared dependencies
+        var isolatedTask = AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose,
+            PluginRoot: null, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir,
+            SessionId: isolatedSessionId, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken);
+        // 3. Skilled-plugin: load entire plugin from plugin root directory
+        var pluginTask = AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose,
+            PluginRoot: pluginRoot, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir, SessionId: pluginSessionId), cancellationToken);
+
+        RunMetrics baselineMetrics;
+        RunMetrics isolatedMetrics;
+        RunMetrics pluginMetrics;
+        if (reusedBaseline is not null)
+        {
+            if (config.Verbose)
+                runLog("↩︎ reusing precomputed baseline");
+            baselineMetrics = reusedBaseline.Metrics;
+            var skilled = await Task.WhenAll(isolatedTask, pluginTask);
+            isolatedMetrics = skilled[0];
+            pluginMetrics = skilled[1];
+        }
+        else
+        {
             // 1. Baseline: no plugin, no skills — vanilla agent
-            AgentRunner.RunAgent(new RunOptions(scenario, null, evalSkill.EvalPath, config.Model, config.Verbose,
-                PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken),
-            // 2. Skilled-isolated: target skill + declared dependencies
-            AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose,
-                PluginRoot: null, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir,
-                SessionId: isolatedSessionId, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken),
-            // 3. Skilled-plugin: load entire plugin from plugin root directory
-            AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose,
-                PluginRoot: pluginRoot, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir, SessionId: pluginSessionId), cancellationToken));
-        var baselineMetrics = agentTasks[0];
-        var isolatedMetrics = agentTasks[1];
-        var pluginMetrics = agentTasks[2];
+            var baselineTask = AgentRunner.RunAgent(new RunOptions(scenario, null, evalSkill.EvalPath, config.Model, config.Verbose,
+                PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken);
+            var all = await Task.WhenAll(baselineTask, isolatedTask, pluginTask);
+            baselineMetrics = all[0];
+            isolatedMetrics = all[1];
+            pluginMetrics = all[2];
+        }
 
         if (sessionDb is not null)
         {
-            var baselineStatus = baselineMetrics.TimedOut ? "timed_out" : "completed";
+            var baselineStatus = reusedBaseline is not null ? "reused" : (baselineMetrics.TimedOut ? "timed_out" : "completed");
             var isolatedStatus = isolatedMetrics.TimedOut ? "timed_out" : "completed";
             var pluginStatus = pluginMetrics.TimedOut ? "timed_out" : "completed";
             sessionDb.CompleteSession(baselineSessionId, baselineStatus, JsonSerializer.Serialize(baselineMetrics, SkillValidatorJsonContext.Default.RunMetrics));
@@ -1225,57 +1370,72 @@ private static async Task<RunExecutionResult> ExecuteRun(
             sessionDb.CompleteSession(pluginSessionId, pluginStatus, JsonSerializer.Serialize(pluginMetrics, SkillValidatorJsonContext.Default.RunMetrics));
         }
 
-        // Evaluate assertions on all three runs
+        // Evaluate assertions on every freshly executed run (skipped for a reused baseline, whose assertion results are cached)
         if (scenario.Assertions is { Count: > 0 })
         {
-            baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout);
+            if (reusedBaseline is null)
+                baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout);
             isolatedMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, isolatedMetrics.AgentOutput, isolatedMetrics.WorkDir, scenario.Timeout);
             pluginMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, pluginMetrics.AgentOutput, pluginMetrics.WorkDir, scenario.Timeout);
         }
 
-        // Evaluate constraints on all three runs
-        var baselineConstraints = AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics);
+        // Evaluate constraints on every freshly executed run (skipped for a reused baseline, whose constraint results are cached)
+        var baselineConstraints = reusedBaseline is null ? AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics) : [];
         var isolatedConstraints = AssertionEvaluator.EvaluateConstraints(scenario, isolatedMetrics);
         var pluginConstraints = AssertionEvaluator.EvaluateConstraints(scenario, pluginMetrics);
-        baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints];
+        if (reusedBaseline is null)
+            baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints];
         isolatedMetrics.AssertionResults = [..isolatedMetrics.AssertionResults, ..isolatedConstraints];
         pluginMetrics.AssertionResults = [..pluginMetrics.AssertionResults, ..pluginConstraints];
 
-        // Task completion for all three
-        if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0)
+        // Task completion for every freshly executed run (a reused baseline keeps its cached completion flag)
+        if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0 || isolatedConstraints.Count > 0 || pluginConstraints.Count > 0)
         {
-            baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed);
+            if (reusedBaseline is null)
+                baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed);
             isolatedMetrics.TaskCompleted = isolatedMetrics.AssertionResults.All(a => a.Passed);
             pluginMetrics.TaskCompleted = pluginMetrics.AssertionResults.All(a => a.Passed);
         }
         else
         {
-            baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0;
+            if (reusedBaseline is null)
+                baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0;
             isolatedMetrics.TaskCompleted = isolatedMetrics.ErrorCount == 0;
             pluginMetrics.TaskCompleted = pluginMetrics.ErrorCount == 0;
         }
 
-        // Judge all three runs independently (failures are non-fatal)
-        var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path);
+        // Judge the skilled runs independently (failures are non-fatal). The baseline
+        // judge result is reused from the precomputed baseline when available.
+        var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, isolatedMetrics.WorkDir, skill.Path);
 
-        var baselineJudgeTask = Judge.JudgeRun(scenario, baselineMetrics, judgeOpts, runLog, cancellationToken);
         var isolatedJudgeTask = Judge.JudgeRun(
             scenario, isolatedMetrics, judgeOpts with { WorkDir = isolatedMetrics.WorkDir }, runLog, cancellationToken);
         var pluginJudgeTask = Judge.JudgeRun(
             scenario, pluginMetrics, judgeOpts with { WorkDir = pluginMetrics.WorkDir }, runLog, cancellationToken);
 
-        var (baselineJudge, baselineJudgeTokens) = await SafeJudge(baselineJudgeTask, "baseline", runLog);
+        JudgeResult baselineJudge;
+        if (reusedBaseline is not null)
+        {
+            baselineJudge = reusedBaseline.JudgeResult;
+        }
+        else
+        {
+            var (judged, baselineJudgeTokens) = await SafeJudge(
+                Judge.JudgeRun(scenario, baselineMetrics, judgeOpts with { WorkDir = baselineMetrics.WorkDir }, runLog, cancellationToken), "baseline", runLog);
+            baselineJudge = judged;
+            AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens);
+        }
         var (isolatedJudge, isolatedJudgeTokens) = await SafeJudge(isolatedJudgeTask, "isolated", runLog);
         var (pluginJudge, pluginJudgeTokens) = await SafeJudge(pluginJudgeTask, "plugin", runLog);
 
-        // Accumulate judge tokens into each run's metrics
-        AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens);
+        // Accumulate judge tokens into each skilled run's metrics
         AccumulateJudgeTokens(isolatedMetrics, isolatedJudgeTokens);
         AccumulateJudgeTokens(pluginMetrics, pluginJudgeTokens);
 
         if (sessionDb is not null)
         {
-            sessionDb.SaveJudgeResult(baselineSessionId, JsonSerializer.Serialize(baselineJudge, SkillValidatorJsonContext.Default.JudgeResult));
+            if (reusedBaseline is null)
+                sessionDb.SaveJudgeResult(baselineSessionId, JsonSerializer.Serialize(baselineJudge, SkillValidatorJsonContext.Default.JudgeResult));
             sessionDb.SaveJudgeResult(isolatedSessionId, JsonSerializer.Serialize(isolatedJudge, SkillValidatorJsonContext.Default.JudgeResult));
             sessionDb.SaveJudgeResult(pluginSessionId, JsonSerializer.Serialize(pluginJudge, SkillValidatorJsonContext.Default.JudgeResult));
         }
@@ -1295,13 +1455,19 @@ private static async Task<RunExecutionResult> ExecuteRun(
                 ? pluginMetrics : isolatedMetrics;
             try
             {
+                // When the baseline is reused its work dir no longer exists; run the
+                // judge session in the skilled run's work dir instead (the judge only
+                // reads the provided metrics text and is denied tool access).
+                var pairwiseWorkDir = reusedBaseline is not null ? worseSkilled.WorkDir : baselineMetrics.WorkDir;
                 var (pairwiseResult, pairwiseTokens) = await PairwiseJudge.Judge(
                     scenario, baselineMetrics, worseSkilled,
-                    new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path, worseSkilled.WorkDir),
+                    new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, pairwiseWorkDir, skill.Path, worseSkilled.WorkDir),
                     runLog, cancellationToken);
                 pairwise = pairwiseResult;
-                // Attribute pairwise judge tokens to both the baseline and the compared run
-                AccumulateJudgeTokens(baselineMetrics, pairwiseTokens);
+                // Attribute pairwise judge tokens to the compared run (and to the baseline
+                // only when it was freshly executed, to avoid double-counting reused cost).
+                if (reusedBaseline is null)
+                    AccumulateJudgeTokens(baselineMetrics, pairwiseTokens);
                 AccumulateJudgeTokens(worseSkilled, pairwiseTokens);
                 if (sessionDb is not null && pairwise is not null)
                 {
diff --git a/eng/skill-validator/src/Evaluate/Models.cs b/eng/skill-validator/src/Evaluate/Models.cs
index b5c055d650..00508a9cf8 100644
--- a/eng/skill-validator/src/Evaluate/Models.cs
+++ b/eng/skill-validator/src/Evaluate/Models.cs
@@ -427,6 +427,12 @@ public sealed record ValidatorConfig
     public string? NoiseSkillsDir { get; init; }
     public double NoiseDegradationLimit { get; init; } = 0.2;
     public double NoiseMaxScenarioDegradation { get; init; } = 0.4;
+
+    /// <summary>When set, persist each scenario's averaged baseline to this file after the run.</summary>
+    public string? BaselineOut { get; init; }
+
+    /// <summary>When set, reuse the precomputed baseline from this file instead of re-running the baseline arm.</summary>
+    public string? BaselineFrom { get; init; }
 }
 
 public static class DefaultWeights
diff --git a/eng/skill-validator/src/README.md b/eng/skill-validator/src/README.md
index c3f346b446..af54a4340d 100644
--- a/eng/skill-validator/src/README.md
+++ b/eng/skill-validator/src/README.md
@@ -73,6 +73,10 @@ skill-validator evaluate --model gpt-5.3-codex --judge-model claude-opus-4.6-fas
 # Multiple runs for stability
 skill-validator evaluate --runs 5 --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills
 
+# Compute a shared baseline once, then reuse it across multiple skills/agents
+skill-validator evaluate --baseline-out baseline.json --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills/skill-a
+skill-validator evaluate --baseline-from baseline.json --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills/skill-b
+
 # Override the default results directory (.skill-validator-results)
 skill-validator evaluate --results-dir ./my-results --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills
 
@@ -142,6 +146,8 @@ skill-validator check --json --plugin ./plugins/my-plugin
 | `--confidence-level <n>` | `0.95` | Confidence level for statistical intervals (0–1) |
 | `--judge-timeout <n>` | `300` | Judge LLM timeout in seconds |
 | `--require-completion` | `true` | Fail if skill regresses task completion |
+| `--baseline-out <path>` | *(none)* | After running, persist each scenario's averaged baseline (no-skill/no-agent reference) to this file for reuse. Mutually exclusive with `--baseline-from`. |
+| `--baseline-from <path>` | *(none)* | Reuse a precomputed baseline from this file instead of re-running the baseline arm. The file must have been produced with the same `--model` and must contain an entry for every scenario prompt; otherwise the run fails fast. Mutually exclusive with `--baseline-out`. |
 | `--verdict-warn-only` | `false` | Treat verdict failures as warnings (exit 0). Execution errors still fail. |
 | `--no-overfitting-check` | `false` | Disable the LLM-based overfitting analysis (on by default) |
 | `--overfitting-fix` | `false` | Generate `eval.fixed.yaml` with improved rubric items/assertions |
@@ -151,6 +157,19 @@ skill-validator check --json --plugin ./plugins/my-plugin
 
 Models are validated on startup — invalid model names fail fast with a list of available models.
 
+### Shared baseline reuse
+
+Every evaluation runs each scenario through a **baseline arm** (the agent with no skill / no agent loaded) to establish a reference the skill-enhanced run is compared against. When you evaluate many skills or agents against the same test scenarios, that baseline arm is re-run every time — redundant work that also introduces run-to-run variance into the comparison.
+
+`--baseline-out` and `--baseline-from` let you compute the baseline **once** and reuse it as a shared control group:
+
+1. **Produce** a baseline file with `--baseline-out baseline.json`. After the run, each scenario's averaged baseline result (honoring `--runs`) is written to the file.
+2. **Reuse** it with `--baseline-from baseline.json` on subsequent runs. The baseline arm is skipped entirely; the cached baseline is used for assertions, pairwise/independent judging, and metric deltas.
+
+The baseline file records the `--model` and a SHA-256 of each scenario prompt. On reuse the validator fails fast if the model differs or any scenario prompt is missing from the file, so a stale or mismatched baseline can never be silently applied. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status.
+
+The two options are mutually exclusive.
+
 ## Output
 
 Results are displayed in the console with color-coded scores and metric deltas. By default, `json` and `markdown` reporters are enabled and write to `.skill-validator-results/` (override with `--results-dir`). File reporters write to that directory:
diff --git a/eng/skill-validator/src/SkillValidatorJsonContext.cs b/eng/skill-validator/src/SkillValidatorJsonContext.cs
index 2aaae673bf..e11f7c3597 100644
--- a/eng/skill-validator/src/SkillValidatorJsonContext.cs
+++ b/eng/skill-validator/src/SkillValidatorJsonContext.cs
@@ -17,6 +17,8 @@ namespace SkillValidator;
 [JsonSerializable(typeof(ScenarioComparison))]
 [JsonSerializable(typeof(RunResult))]
 [JsonSerializable(typeof(RunMetrics))]
+[JsonSerializable(typeof(BaselineFile))]
+[JsonSerializable(typeof(BaselineScenarioEntry))]
 [JsonSerializable(typeof(JudgeResult))]
 [JsonSerializable(typeof(RubricScore))]
 [JsonSerializable(typeof(AssertionResult))]
diff --git a/eng/skill-validator/src/docs/InvestigatingResults.md b/eng/skill-validator/src/docs/InvestigatingResults.md
index 7616caf9c5..7ca0aba084 100644
--- a/eng/skill-validator/src/docs/InvestigatingResults.md
+++ b/eng/skill-validator/src/docs/InvestigatingResults.md
@@ -83,6 +83,8 @@ Each scenario includes two required runs (baseline + isolated). It may also incl
 
 > **Note:** Scenarios do not have a `passed` field. To determine pass/fail for an individual scenario, check whether `improvementScore >= 0`. This is the effective score: when no plugin run is present it equals `isolatedImprovementScore`; when a plugin run is present it is the min of isolated and plugin scores. The `passed` field exists only at the verdict level (per-skill).
 
+> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` and a SHA-256 of each scenario prompt; reuse fails fast if the model differs or any prompt is missing, so the baseline you compare against is always identity-matched. Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons.
+
 ### Breakdown fields
 
 The `isolatedBreakdown` and `pluginBreakdown` objects show how each metric contributed to the improvement score. Each field is a raw delta (not yet weighted). The final score is computed as a weighted sum:
diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
new file mode 100644
index 0000000000..254ec650a9
--- /dev/null
+++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
@@ -0,0 +1,151 @@
+using System.Text.Json;
+using SkillValidator;
+using SkillValidator.Evaluate;
+
+namespace SkillValidator.Tests;
+
+public class BaselineStoreTests
+{
+    private static RunResult MakeBaseline(double overallScore = 3, string output = "baseline output") =>
+        new(
+            new RunMetrics
+            {
+                TokenEstimate = 1000,
+                ToolCallCount = 4,
+                ToolCallBreakdown = new Dictionary<string, int> { ["bash"] = 4 },
+                AgentOutput = output,
+                TaskCompleted = true,
+                Events = [],
+            },
+            new JudgeResult([new RubricScore("Quality", overallScore, "ok")], overallScore, "fine"));
+
+    private static EvalScenario Scenario(string name, string prompt) => new(name, prompt);
+
+    private static string TempPath() =>
+        Path.Combine(Path.GetTempPath(), $"sv-baseline-test-{Guid.NewGuid():N}.json");
+
+    [Fact]
+    public void ComputePromptSha_IsDeterministicAndPromptSensitive()
+    {
+        var a = BaselineStore.ComputePromptSha("do the thing");
+        var b = BaselineStore.ComputePromptSha("do the thing");
+        var c = BaselineStore.ComputePromptSha("do something else");
+
+        Assert.Equal(a, b);
+        Assert.NotEqual(a, c);
+        Assert.Equal(64, a.Length); // SHA-256 hex
+    }
+
+    [Fact]
+    public void SaveThenLoad_RoundTripsBaselinePerScenario()
+    {
+        var path = TempPath();
+        try
+        {
+            var store = BaselineStore.ForWrite("model-x");
+            var s1 = Scenario("alpha", "prompt one");
+            var s2 = Scenario("beta", "prompt two");
+            store.Record(s1, runs: 5, MakeBaseline(overallScore: 4, output: "out-1"));
+            store.Record(s2, runs: 5, MakeBaseline(overallScore: 2, output: "out-2"));
+            store.Save(path);
+
+            Assert.True(File.Exists(path));
+
+            var loaded = BaselineStore.Load(path, "model-x");
+            Assert.True(loaded.IsReuse);
+            Assert.Equal(2, loaded.Count);
+
+            var b1 = loaded.TryGetBaseline(s1);
+            var b2 = loaded.TryGetBaseline(s2);
+            Assert.NotNull(b1);
+            Assert.NotNull(b2);
+            Assert.Equal("out-1", b1!.Metrics.AgentOutput);
+            Assert.Equal(4, b1.JudgeResult.OverallScore);
+            Assert.Equal("out-2", b2!.Metrics.AgentOutput);
+        }
+        finally
+        {
+            File.Delete(path);
+        }
+    }
+
+    [Fact]
+    public void Load_ThrowsOnModelMismatch()
+    {
+        var path = TempPath();
+        try
+        {
+            var store = BaselineStore.ForWrite("model-x");
+            store.Record(Scenario("alpha", "prompt one"), runs: 3, MakeBaseline());
+            store.Save(path);
+
+            var ex = Assert.Throws<InvalidOperationException>(() => BaselineStore.Load(path, "model-y"));
+            Assert.Contains("model-x", ex.Message);
+            Assert.Contains("model-y", ex.Message);
+        }
+        finally
+        {
+            File.Delete(path);
+        }
+    }
+
+    [Fact]
+    public void Load_ThrowsOnUnsupportedVersion()
+    {
+        var path = TempPath();
+        try
+        {
+            var file = new BaselineFile(
+                Version: BaselineStore.CurrentVersion + 1,
+                Model: "model-x",
+                ValidatorVersion: "9.9.9",
+                CreatedAt: DateTime.UtcNow.ToString("o"),
+                Scenarios: []);
+            File.WriteAllText(path, JsonSerializer.Serialize(file, SkillValidatorJsonContext.Default.BaselineFile));
+
+            var ex = Assert.Throws<InvalidOperationException>(() => BaselineStore.Load(path, "model-x"));
+            Assert.Contains("unsupported version", ex.Message);
+        }
+        finally
+        {
+            File.Delete(path);
+        }
+    }
+
+    [Fact]
+    public void Load_ThrowsWhenFileMissing()
+    {
+        Assert.Throws<FileNotFoundException>(() => BaselineStore.Load(TempPath(), "model-x"));
+    }
+
+    [Fact]
+    public void FindMissingScenarios_ReturnsScenariosWithoutCachedBaseline()
+    {
+        var path = TempPath();
+        try
+        {
+            var store = BaselineStore.ForWrite("model-x");
+            var present = Scenario("alpha", "prompt one");
+            store.Record(present, runs: 5, MakeBaseline());
+            store.Save(path);
+
+            var loaded = BaselineStore.Load(path, "model-x");
+            var missing = loaded.FindMissingScenarios([present, Scenario("beta", "prompt two")]);
+
+            Assert.Single(missing);
+            Assert.Equal("beta", missing[0]);
+        }
+        finally
+        {
+            File.Delete(path);
+        }
+    }
+
+    [Fact]
+    public void WriteStore_IsNotReuse()
+    {
+        var store = BaselineStore.ForWrite("model-x");
+        Assert.False(store.IsReuse);
+        Assert.Null(store.TryGetBaseline(Scenario("alpha", "prompt one")));
+    }
+}

From f571556f4dbe0ce6ab4490aefb77b0358c681a20 Mon Sep 17 00:00:00 2001
From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com>
Date: Thu, 11 Jun 2026 17:35:52 +0200
Subject: [PATCH 2/7] Bind reused baseline to fixture identity (targetSha), not
 just prompt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Align baseline reuse with the (prompt, model, targetSha) identity contract from
the upstream eval-harness design: previously the per-scenario reuse key was the
prompt SHA + model only, so two scenarios that share an identical prompt but
feed the agent different input artifacts (e.g. a different build.binlog) would
collide and silently reuse the wrong baseline.

- Add BaselineScenarioEntry.TargetSha: a SHA-256 over the scenario's materialized
  inputs — files auto-copied via copy_test_files, explicit setup files (inline
  content or copied sources), and the setup command recipe. The reuse key is now
  (promptSha, targetSha); both must match. Bump the on-disk schema to version 2.
- Memoize target hashing per process via a cheap, file-I/O-free setup signature
  to avoid re-hashing large fixtures across the N runs.
- Thread the originating eval.yaml path into Record/TryGetBaseline/
  FindMissingScenarios so inputs can be fingerprinted.
- Tests: target SHA is stable and content-sensitive; same-prompt/different-fixture
  scenarios do not reuse each other's baseline and are surfaced by
  FindMissingScenarios. Update README and InvestigatingResults.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/Evaluate/BaselineStore.cs             | 143 +++++++++++++++---
 .../src/Evaluate/EvaluateCommand.cs           |  10 +-
 eng/skill-validator/src/README.md             |   2 +-
 .../src/docs/InvestigatingResults.md          |   2 +-
 .../tests/Evaluate/BaselineStoreTests.cs      |  80 +++++++++-
 5 files changed, 212 insertions(+), 25 deletions(-)

diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs
index 0e3e143065..90877b0065 100644
--- a/eng/skill-validator/src/Evaluate/BaselineStore.cs
+++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs
@@ -6,13 +6,18 @@
 namespace SkillValidator.Evaluate;
 
 /// <summary>
-/// One scenario's precomputed baseline, keyed by the SHA-256 of its prompt.
+/// One scenario's precomputed baseline, keyed by the SHA-256 of its prompt
+/// (<see cref="PromptSha"/>) <em>and</em> the SHA-256 of its setup/fixture inputs
+/// (<see cref="TargetSha"/>).  Both must match for a baseline to be reused, so two
+/// scenarios that share a prompt but feed the agent different input artifacts
+/// (e.g. different <c>build.binlog</c> fixtures) never collide.
 /// <see cref="Runs"/> records how many baseline runs were averaged into
 /// <see cref="Baseline"/> so reuse can report the robustness of the reference.
 /// </summary>
 public sealed record BaselineScenarioEntry(
     string Name,
     string PromptSha,
+    string TargetSha,
     int Runs,
     RunResult Baseline);
 
@@ -38,9 +43,10 @@ public sealed record BaselineFile(
 internal sealed class BaselineStore
 {
     /// <summary>Current on-disk schema version.</summary>
-    public const int CurrentVersion = 1;
+    public const int CurrentVersion = 2;
 
     private readonly ConcurrentDictionary<string, BaselineScenarioEntry> _entries = new(StringComparer.Ordinal);
+    private static readonly ConcurrentDictionary<string, string> _targetShaCache = new(StringComparer.Ordinal);
     private readonly string _model;
 
     /// <summary>True when serving cached baselines (<c>--baseline-from</c>).</summary>
@@ -58,8 +64,8 @@ private BaselineStore(string model, bool isReuse)
     /// <summary>
     /// Load a baseline file for reuse.  Validates the schema version and that the model
     /// matches, throwing on mismatch so a stale or wrong baseline can never silently
-    /// skew results.  Per-scenario prompt identity is validated later via
-    /// <see cref="FindMissingScenarios"/>.
+    /// skew results.  Per-scenario identity (prompt + setup/fixture inputs) is validated
+    /// later via <see cref="FindMissingScenarios"/>.
     /// </summary>
     public static BaselineStore Load(string path, string expectedModel)
     {
@@ -90,37 +96,140 @@ public static BaselineStore Load(string path, string expectedModel)
         foreach (var entry in file.Scenarios)
         {
             if (entry.Baseline is not null)
-                store._entries[entry.PromptSha] = entry;
+                store._entries[MakeKey(entry.PromptSha, entry.TargetSha)] = entry;
         }
         return store;
     }
 
-    /// <summary>SHA-256 (lower-case hex) of the scenario prompt — the per-scenario reuse key.</summary>
-    public static string ComputePromptSha(string prompt)
+    /// <summary>SHA-256 (lower-case hex) of the scenario prompt.</summary>
+    public static string ComputePromptSha(string prompt) => Sha256Hex(Encoding.UTF8.GetBytes(prompt));
+
+    /// <summary>
+    /// SHA-256 (lower-case hex) identifying the scenario's input artifacts — the analog
+    /// of the issue's <c>targetSha</c>.  It folds in the contents of every file the agent
+    /// is given for the run: sibling files auto-copied via <c>copy_test_files</c>, explicit
+    /// setup files (inline content or copied sources), and the setup command recipe.  This
+    /// binds a cached baseline to the exact inputs it was measured against, so two scenarios
+    /// that share prompt text but differ in fixtures (e.g. a different <c>build.binlog</c>)
+    /// resolve to distinct keys and never reuse each other's baseline.
+    /// </summary>
+    public static string ComputeTargetSha(EvalScenario scenario, string? evalPath)
+    {
+        var cacheKey = BuildTargetCacheKey(scenario, evalPath);
+        return _targetShaCache.GetOrAdd(cacheKey, _ => ComputeTargetShaCore(scenario, evalPath));
+    }
+
+    /// <summary>
+    /// Cheap, file-I/O-free signature of a scenario's setup inputs, used only to memoize
+    /// the (expensive) content hashing in <see cref="ComputeTargetShaCore"/> within a
+    /// single process.  It must distinguish any two scenarios whose materialized inputs
+    /// could differ, so it folds in the eval directory, the copy flag, the explicit setup
+    /// file recipe, and the command list — but not the on-disk file contents themselves.
+    /// </summary>
+    private static string BuildTargetCacheKey(EvalScenario scenario, string? evalPath)
     {
-        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(prompt));
+        var setup = scenario.Setup;
+        var sb = new StringBuilder().Append(evalPath ?? "").Append('\0');
+        if (setup is null)
+            return sb.Append("none").ToString();
+        sb.Append("copy=").Append(setup.CopyTestFiles).Append('\0');
+        if (setup.Files is { } files)
+            foreach (var f in files)
+                sb.Append("f=").Append(f.Path).Append('|').Append(f.Source ?? "").Append('|').Append(f.Content ?? "").Append('\0');
+        if (setup.Commands is { } commands)
+            foreach (var c in commands)
+                sb.Append("c=").Append(c).Append('\0');
+        return sb.ToString();
+    }
+
+    private static string ComputeTargetShaCore(EvalScenario scenario, string? evalPath)
+    {
+        var setup = scenario.Setup;
+        if (setup is null)
+            return Sha256Hex(Encoding.UTF8.GetBytes("\0no-setup\0"));
+
+        var sb = new StringBuilder();
+
+        // 1. Sibling files auto-copied into the work dir (copy_test_files: true).
+        if (setup.CopyTestFiles && evalPath is not null)
+        {
+            var evalDir = Path.GetDirectoryName(evalPath);
+            if (!string.IsNullOrEmpty(evalDir) && Directory.Exists(evalDir))
+            {
+                var files = Directory.EnumerateFiles(evalDir, "*", SearchOption.AllDirectories)
+                    .Where(f => !string.Equals(Path.GetFileName(f), "eval.yaml", StringComparison.Ordinal))
+                    .Select(f => (Rel: Path.GetRelativePath(evalDir, f).Replace('\\', '/'), Full: f))
+                    .OrderBy(x => x.Rel, StringComparer.Ordinal);
+                foreach (var (rel, full) in files)
+                    sb.Append("F:").Append(rel).Append('=').Append(HashFile(full)).Append('\n');
+            }
+        }
+
+        // 2. Explicit setup files — inline content or a copied source.
+        if (setup.Files is { } setupFiles)
+        {
+            foreach (var f in setupFiles.OrderBy(f => f.Path, StringComparer.Ordinal))
+            {
+                sb.Append("E:").Append(f.Path.Replace('\\', '/')).Append('=');
+                if (f.Content is not null)
+                    sb.Append("c:").Append(Sha256Hex(Encoding.UTF8.GetBytes(f.Content)));
+                else if (f.Source is not null)
+                {
+                    var resolved = AgentRunner.ResolveSourcePath(f.Source, evalPath, skillPath: null);
+                    sb.Append("s:").Append(resolved is not null && File.Exists(resolved) ? HashFile(resolved) : "missing");
+                }
+                sb.Append('\n');
+            }
+        }
+
+        // 3. Setup commands define part of the input recipe (e.g. building a binlog).
+        if (setup.Commands is { } commands)
+        {
+            foreach (var c in commands)
+                sb.Append("C:").Append(c).Append('\n');
+        }
+
+        return Sha256Hex(Encoding.UTF8.GetBytes(sb.ToString()));
+    }
+
+    private static string HashFile(string path)
+    {
+        using var stream = File.OpenRead(path);
+        return Sha256Hex(SHA256.HashData(stream));
+    }
+
+    private static string Sha256Hex(byte[] data)
+    {
+        var bytes = data.Length == 32 ? data : SHA256.HashData(data);
         return Convert.ToHexString(bytes).ToLowerInvariant();
     }
 
+    private static string MakeKey(string promptSha, string targetSha) => string.Concat(promptSha, ":", targetSha);
+
     /// <summary>
     /// In reuse mode, return the names of scenarios that have no matching cached
-    /// baseline (keyed by prompt hash).  Empty when every scenario is covered.
+    /// baseline (keyed by prompt + setup/fixture identity).  Empty when every
+    /// scenario is covered.  Each scenario is paired with the eval.yaml path it
+    /// originates from so its input artifacts can be fingerprinted.
     /// </summary>
-    public IReadOnlyList<string> FindMissingScenarios(IEnumerable<EvalScenario> scenarios) =>
+    public IReadOnlyList<string> FindMissingScenarios(IEnumerable<(EvalScenario Scenario, string? EvalPath)> scenarios) =>
         scenarios
-            .Where(s => !_entries.ContainsKey(ComputePromptSha(s.Prompt)))
-            .Select(s => s.Name)
+            .Where(s => !_entries.ContainsKey(MakeKey(ComputePromptSha(s.Scenario.Prompt), ComputeTargetSha(s.Scenario, s.EvalPath))))
+            .Select(s => s.Scenario.Name)
             .ToList();
 
     /// <summary>Get the cached averaged baseline for a scenario, or null when absent.</summary>
-    public RunResult? TryGetBaseline(EvalScenario scenario) =>
-        _entries.TryGetValue(ComputePromptSha(scenario.Prompt), out var entry) ? entry.Baseline : null;
+    public RunResult? TryGetBaseline(EvalScenario scenario, string? evalPath = null) =>
+        _entries.TryGetValue(MakeKey(ComputePromptSha(scenario.Prompt), ComputeTargetSha(scenario, evalPath)), out var entry)
+            ? entry.Baseline
+            : null;
 
     /// <summary>Record a scenario's averaged baseline for later persistence (write mode).</summary>
-    public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline)
+    public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline, string? evalPath = null)
     {
-        var sha = ComputePromptSha(scenario.Prompt);
-        _entries[sha] = new BaselineScenarioEntry(scenario.Name, sha, runs, averagedBaseline);
+        var promptSha = ComputePromptSha(scenario.Prompt);
+        var targetSha = ComputeTargetSha(scenario, evalPath);
+        _entries[MakeKey(promptSha, targetSha)] = new BaselineScenarioEntry(scenario.Name, promptSha, targetSha, runs, averagedBaseline);
     }
 
     /// <summary>Serialize all recorded baselines to <paramref name="path"/>.</summary>
diff --git a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
index d8f054dd9f..ac805ea602 100644
--- a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
+++ b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
@@ -322,7 +322,7 @@ public static async Task<int> Run(ValidatorConfig config, CancellationToken canc
             // incomplete baseline can never silently skew results.
             var allScenarios = allTargets
                 .Where(t => t.EvalConfig is not null)
-                .SelectMany(t => t.EvalConfig!.Scenarios)
+                .SelectMany(t => t.EvalConfig!.Scenarios.Select(s => (Scenario: s, t.EvalPath)))
                 .ToList();
             var missing = baselineStore.FindMissingScenarios(allScenarios);
             if (missing.Count > 0)
@@ -657,7 +657,7 @@ private static async Task<ScenarioComparison> ExecuteAgentScenario(
 
         // Persist the averaged baseline (skill/agent-independent) for shared reuse.
         if (baselineStore is { IsReuse: false })
-            baselineStore.Record(scenario, runResults.Length, avgBaseline);
+            baselineStore.Record(scenario, runResults.Length, avgBaseline, target.EvalPath);
 
         int bestPairwiseIdx = -1;
         for (int i = 0; i < perRunPairwise.Count; i++)
@@ -772,7 +772,7 @@ private static async Task<RunExecutionResult> ExecuteAgentRun(
 
         // Reuse a precomputed shared baseline when available (--baseline-from). The
         // baseline arm is agent-independent, so this skips a redundant agent run.
-        var reusedBaseline = baselineStore?.TryGetBaseline(scenario);
+        var reusedBaseline = baselineStore?.TryGetBaseline(scenario, target.EvalPath);
 
         sessionDb?.RegisterSession(baselineSessionId, agent.Name, agent.Path, scenario.Name, runIndex,
             reusedBaseline is not null ? "baseline-reused" : "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, targetSha, rubricJson);
@@ -1179,7 +1179,7 @@ private static async Task<ScenarioComparison> ExecuteScenario(
         var avgPlugin = AverageResults(pluginRuns);
         // Persist the averaged baseline (skill/agent-independent) for shared reuse.
         if (baselineStore is { IsReuse: false })
-            baselineStore.Record(scenario, runResults.Length, avgBaseline);
+            baselineStore.Record(scenario, runResults.Length, avgBaseline, evalSkill.EvalPath);
         // Select the best pairwise result and track which run it came from
         int bestPairwiseIdx = -1;
         for (int i = 0; i < perRunPairwise.Count; i++)
@@ -1311,7 +1311,7 @@ private static async Task<RunExecutionResult> ExecuteRun(
 
         // Reuse a precomputed shared baseline when available (--baseline-from). The
         // baseline arm is skill-independent, so this skips a redundant agent run.
-        var reusedBaseline = baselineStore?.TryGetBaseline(scenario);
+        var reusedBaseline = baselineStore?.TryGetBaseline(scenario, evalSkill.EvalPath);
 
         sessionDb?.RegisterSession(baselineSessionId, skill.Name, skill.Path, scenario.Name, runIndex,
             reusedBaseline is not null ? "baseline-reused" : "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, skillSha, rubricJson);
diff --git a/eng/skill-validator/src/README.md b/eng/skill-validator/src/README.md
index af54a4340d..4c8c54b4cd 100644
--- a/eng/skill-validator/src/README.md
+++ b/eng/skill-validator/src/README.md
@@ -166,7 +166,7 @@ Every evaluation runs each scenario through a **baseline arm** (the agent with n
 1. **Produce** a baseline file with `--baseline-out baseline.json`. After the run, each scenario's averaged baseline result (honoring `--runs`) is written to the file.
 2. **Reuse** it with `--baseline-from baseline.json` on subsequent runs. The baseline arm is skipped entirely; the cached baseline is used for assertions, pairwise/independent judging, and metric deltas.
 
-The baseline file records the `--model` and a SHA-256 of each scenario prompt. On reuse the validator fails fast if the model differs or any scenario prompt is missing from the file, so a stale or mismatched baseline can never be silently applied. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status.
+The baseline file records the `--model` and, per scenario, a SHA-256 of the prompt **and** a SHA-256 of its setup inputs (the fixtures copied via `copy_test_files`, explicit setup files, and setup commands — the analog of a target/input SHA). On reuse the validator fails fast if the model differs or any scenario's prompt-plus-fixture identity is missing from the file, so a stale or mismatched baseline can never be silently applied — and two scenarios that share a prompt but feed the agent different fixtures (e.g. a different `build.binlog`) never reuse each other's baseline. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status.
 
 The two options are mutually exclusive.
 
diff --git a/eng/skill-validator/src/docs/InvestigatingResults.md b/eng/skill-validator/src/docs/InvestigatingResults.md
index 7ca0aba084..c278035799 100644
--- a/eng/skill-validator/src/docs/InvestigatingResults.md
+++ b/eng/skill-validator/src/docs/InvestigatingResults.md
@@ -83,7 +83,7 @@ Each scenario includes two required runs (baseline + isolated). It may also incl
 
 > **Note:** Scenarios do not have a `passed` field. To determine pass/fail for an individual scenario, check whether `improvementScore >= 0`. This is the effective score: when no plugin run is present it equals `isolatedImprovementScore`; when a plugin run is present it is the min of isolated and plugin scores. The `passed` field exists only at the verdict level (per-skill).
 
-> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` and a SHA-256 of each scenario prompt; reuse fails fast if the model differs or any prompt is missing, so the baseline you compare against is always identity-matched. Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons.
+> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` plus, per scenario, a SHA-256 of the prompt and a SHA-256 of its setup/fixture inputs (copied test files, explicit setup files, and setup commands); reuse fails fast if the model differs or any prompt-plus-fixture identity is missing, so the baseline you compare against is always identity-matched and a shared prompt across cases with different fixtures cannot cross-contaminate. Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons.
 
 ### Breakdown fields
 
diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
index 254ec650a9..09444d7cb5 100644
--- a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
+++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
@@ -130,7 +130,7 @@ public void FindMissingScenarios_ReturnsScenariosWithoutCachedBaseline()
             store.Save(path);
 
             var loaded = BaselineStore.Load(path, "model-x");
-            var missing = loaded.FindMissingScenarios([present, Scenario("beta", "prompt two")]);
+            var missing = loaded.FindMissingScenarios([(present, null), (Scenario("beta", "prompt two"), null)]);
 
             Assert.Single(missing);
             Assert.Equal("beta", missing[0]);
@@ -148,4 +148,82 @@ public void WriteStore_IsNotReuse()
         Assert.False(store.IsReuse);
         Assert.Null(store.TryGetBaseline(Scenario("alpha", "prompt one")));
     }
+
+    private static string MakeEvalDirWithFixture(string fixtureName, string fixtureContent)
+    {
+        var dir = Path.Combine(Path.GetTempPath(), $"sv-baseline-fixture-{Guid.NewGuid():N}");
+        Directory.CreateDirectory(dir);
+        File.WriteAllText(Path.Combine(dir, "eval.yaml"), "scenarios: []");
+        File.WriteAllText(Path.Combine(dir, fixtureName), fixtureContent);
+        return Path.Combine(dir, "eval.yaml");
+    }
+
+    private static EvalScenario FixtureScenario(string name, string prompt) =>
+        new(name, prompt, new SetupConfig(CopyTestFiles: true));
+
+    [Fact]
+    public void ComputeTargetSha_DiffersByFixtureContentAndIsStable()
+    {
+        var evalA = MakeEvalDirWithFixture("build.binlog", "AAAA");
+        var evalB = MakeEvalDirWithFixture("build.binlog", "BBBB");
+        try
+        {
+            var scenario = FixtureScenario("s", "investigate build.binlog");
+
+            var shaA1 = BaselineStore.ComputeTargetSha(scenario, evalA);
+            var shaA2 = BaselineStore.ComputeTargetSha(scenario, evalA);
+            var shaB = BaselineStore.ComputeTargetSha(scenario, evalB);
+
+            Assert.Equal(shaA1, shaA2);     // stable for identical inputs
+            Assert.NotEqual(shaA1, shaB);   // sensitive to fixture content
+            Assert.Equal(64, shaA1.Length);
+
+            // No setup → a stable, distinct constant.
+            var noSetup = BaselineStore.ComputeTargetSha(Scenario("s", "investigate build.binlog"), evalA);
+            Assert.NotEqual(shaA1, noSetup);
+        }
+        finally
+        {
+            Directory.Delete(Path.GetDirectoryName(evalA)!, recursive: true);
+            Directory.Delete(Path.GetDirectoryName(evalB)!, recursive: true);
+        }
+    }
+
+    [Fact]
+    public void SamePromptDifferentFixture_DoesNotReuseBaseline()
+    {
+        var path = TempPath();
+        var evalA = MakeEvalDirWithFixture("build.binlog", "case-A-binlog");
+        var evalB = MakeEvalDirWithFixture("build.binlog", "case-B-binlog");
+        try
+        {
+            // Two cases share an identical prompt but feed different fixtures.
+            const string sharedPrompt = "The binlog is at build.binlog. What went wrong?";
+            var scenarioA = FixtureScenario("case-A", sharedPrompt);
+            var scenarioB = FixtureScenario("case-B", sharedPrompt);
+
+            // Persist a baseline only for case A.
+            var store = BaselineStore.ForWrite("model-x");
+            store.Record(scenarioA, runs: 5, MakeBaseline(output: "A-baseline"), evalA);
+            store.Save(path);
+
+            var loaded = BaselineStore.Load(path, "model-x");
+
+            // Case A reuses its baseline; case B must NOT (different targetSha).
+            Assert.NotNull(loaded.TryGetBaseline(scenarioA, evalA));
+            Assert.Equal("A-baseline", loaded.TryGetBaseline(scenarioA, evalA)!.Metrics.AgentOutput);
+            Assert.Null(loaded.TryGetBaseline(scenarioB, evalB));
+
+            // FindMissingScenarios surfaces case B by name despite the shared prompt.
+            var missing = loaded.FindMissingScenarios([(scenarioA, evalA), (scenarioB, evalB)]);
+            Assert.Single(missing);
+            Assert.Equal("case-B", missing[0]);
+        }
+        finally
+        {
+            File.Delete(path);
+            Directory.Delete(Path.GetDirectoryName(evalA)!, recursive: true);
+            Directory.Delete(Path.GetDirectoryName(evalB)!, recursive: true);
+        }
+    }
 }

From 4f7a652b4179b7991a218cc43362d4677f0736e9 Mon Sep 17 00:00:00 2001
From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com>
Date: Thu, 11 Jun 2026 18:05:01 +0200
Subject: [PATCH 3/7] Harden baseline reuse identity after review

Address rubber-duck review findings on the shared-baseline feature:

- Fix Sha256Hex 32-byte bug: a 32-byte input was treated as an
  already-computed digest and hex-encoded without being hashed, so any
  32-byte payload yielded its own bytes as its "hash". Split into
  Sha256Hex (always hash) + HexDigest (encode existing digest).
- Broaden reuse identity: the cached baseline RunResult depends on the
  judge model and on per-scenario evaluation criteria (rubric,
  assertions, expect/reject tools, turn/token/timeout limits). Add
  JudgeModel to the baseline header (validated on load) and fold the
  criteria into the per-scenario targetSha so changing them invalidates
  reuse instead of silently serving a stale result.
- Mirror AgentRunner.SetupWorkDir exactly when hashing copied fixtures:
  exclude only the top-level eval.yaml (nested eval.yaml files are
  copied, so they must be hashed).
- Make the target-SHA cache instance-scoped (memoizing only the
  expensive fixture-input hashing) so it can't serve stale hashes or
  leak across evaluations/tests; hash inline file Content in the cache
  key instead of embedding it.
- Deterministic Save ordering (Name, PromptSha, TargetSha); guard Load
  against null Scenarios; enrich FindMissingScenarios output with the
  eval path.
- Document that setup commands are fingerprinted by their command text
  (the recipe), not by the artifacts they generate, so reuse assumes
  they are deterministic/hermetic.
- Tests + docs updated; add judge-model-mismatch and criteria-identity
  tests (562 pass).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/Evaluate/BaselineStore.cs             | 165 +++++++++++++-----
 .../src/Evaluate/EvaluateCommand.cs           |   6 +-
 eng/skill-validator/src/README.md             |   6 +-
 .../src/docs/InvestigatingResults.md          |   2 +-
 .../tests/Evaluate/BaselineStoreTests.cs      |  79 +++++++--
 5 files changed, 190 insertions(+), 68 deletions(-)

diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs
index 90877b0065..87a51a12d2 100644
--- a/eng/skill-validator/src/Evaluate/BaselineStore.cs
+++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs
@@ -25,11 +25,14 @@ public sealed record BaselineScenarioEntry(
 /// On-disk format written by <c>--baseline-out</c> and read by <c>--baseline-from</c>.
 /// The baseline arm of <c>evaluate</c> is plain-agent with no skill/MCP attached, so it
 /// is independent of the target under test and can be computed once and shared across
-/// many invocations.  The header records the identity needed to reject a stale reuse.
+/// many invocations.  The header records the identity needed to reject a stale reuse:
+/// the agent <see cref="Model"/> and the <see cref="JudgeModel"/> that produced the
+/// cached judge scores.
 /// </summary>
 public sealed record BaselineFile(
     int Version,
     string Model,
+    string JudgeModel,
     string? ValidatorVersion,
     string CreatedAt,
     IReadOnlyList<BaselineScenarioEntry> Scenarios);
@@ -46,28 +49,33 @@ internal sealed class BaselineStore
     public const int CurrentVersion = 2;
 
     private readonly ConcurrentDictionary<string, BaselineScenarioEntry> _entries = new(StringComparer.Ordinal);
-    private static readonly ConcurrentDictionary<string, string> _targetShaCache = new(StringComparer.Ordinal);
+    // Memoizes the (expensive, file-I/O-bound) hashing of materialized input artifacts.
+    // Instance-scoped — never shared across stores — so it can never serve a stale hash
+    // from a different evaluation or leak between tests.
+    private readonly ConcurrentDictionary<string, string> _inputsShaCache = new(StringComparer.Ordinal);
     private readonly string _model;
+    private readonly string _judgeModel;
 
     /// <summary>True when serving cached baselines (<c>--baseline-from</c>).</summary>
     public bool IsReuse { get; }
 
-    private BaselineStore(string model, bool isReuse)
+    private BaselineStore(string model, string judgeModel, bool isReuse)
     {
         _model = model;
+        _judgeModel = judgeModel;
         IsReuse = isReuse;
     }
 
     /// <summary>Create a store that accumulates baselines for later persistence.</summary>
-    public static BaselineStore ForWrite(string model) => new(model, isReuse: false);
+    public static BaselineStore ForWrite(string model, string judgeModel) => new(model, judgeModel, isReuse: false);
 
     /// <summary>
-    /// Load a baseline file for reuse.  Validates the schema version and that the model
-    /// matches, throwing on mismatch so a stale or wrong baseline can never silently
-    /// skew results.  Per-scenario identity (prompt + setup/fixture inputs) is validated
-    /// later via <see cref="FindMissingScenarios"/>.
+    /// Load a baseline file for reuse.  Validates the schema version and that both the
+    /// agent model and judge model match, throwing on mismatch so a stale or wrong
+    /// baseline can never silently skew results.  Per-scenario identity (prompt + setup
+    /// inputs + evaluation criteria) is validated later via <see cref="FindMissingScenarios"/>.
     /// </summary>
-    public static BaselineStore Load(string path, string expectedModel)
+    public static BaselineStore Load(string path, string expectedModel, string expectedJudgeModel)
     {
         if (!File.Exists(path))
             throw new FileNotFoundException($"Baseline file not found: {path}");
@@ -91,11 +99,15 @@ public static BaselineStore Load(string path, string expectedModel)
             throw new InvalidOperationException(
                 $"Baseline file '{path}' was computed for model '{file.Model}' but evaluation uses model '{expectedModel}'. " +
                 "Recompute the baseline with --baseline-out for the new model.");
+        if (!string.Equals(file.JudgeModel, expectedJudgeModel, StringComparison.Ordinal))
+            throw new InvalidOperationException(
+                $"Baseline file '{path}' was judged with model '{file.JudgeModel}' but evaluation uses judge model '{expectedJudgeModel}'. " +
+                "Recompute the baseline with --baseline-out for the new judge model.");
 
-        var store = new BaselineStore(expectedModel, isReuse: true);
-        foreach (var entry in file.Scenarios)
+        var store = new BaselineStore(expectedModel, expectedJudgeModel, isReuse: true);
+        foreach (var entry in file.Scenarios ?? [])
         {
-            if (entry.Baseline is not null)
+            if (entry?.Baseline is not null)
                 store._entries[MakeKey(entry.PromptSha, entry.TargetSha)] = entry;
         }
         return store;
@@ -105,28 +117,46 @@ public static BaselineStore Load(string path, string expectedModel)
     public static string ComputePromptSha(string prompt) => Sha256Hex(Encoding.UTF8.GetBytes(prompt));
 
     /// <summary>
-    /// SHA-256 (lower-case hex) identifying the scenario's input artifacts — the analog
-    /// of the issue's <c>targetSha</c>.  It folds in the contents of every file the agent
-    /// is given for the run: sibling files auto-copied via <c>copy_test_files</c>, explicit
-    /// setup files (inline content or copied sources), and the setup command recipe.  This
-    /// binds a cached baseline to the exact inputs it was measured against, so two scenarios
-    /// that share prompt text but differ in fixtures (e.g. a different <c>build.binlog</c>)
-    /// resolve to distinct keys and never reuse each other's baseline.
+    /// SHA-256 (lower-case hex) identifying everything (besides the prompt and model) that
+    /// determines a scenario's cached baseline <see cref="RunResult"/> — the analog of the
+    /// issue's <c>targetSha</c>.  It folds in:
+    /// <list type="bullet">
+    /// <item>the materialized input artifacts the agent is given (files auto-copied via
+    /// <c>copy_test_files</c>, explicit setup files' content/sources, and the setup command
+    /// recipe), and</item>
+    /// <item>the evaluation criteria that shape the stored result (rubric, assertions,
+    /// expect/reject tools, and the turn/token/timeout limits that bound the baseline run).</item>
+    /// </list>
+    /// This binds a cached baseline to the exact inputs <em>and</em> criteria it was measured
+    /// against, so two scenarios that share a prompt but differ in fixtures (e.g. a different
+    /// <c>build.binlog</c>) or in rubric/assertions resolve to distinct keys and never reuse
+    /// each other's baseline.
+    /// <para><b>Setup commands</b> are hashed by their text (the recipe), not the artifacts they
+    /// generate; reuse therefore assumes setup commands are deterministic/hermetic.</para>
     /// </summary>
-    public static string ComputeTargetSha(EvalScenario scenario, string? evalPath)
+    public static string ComputeTargetSha(EvalScenario scenario, string? evalPath) =>
+        CombineIdentity(ComputeInputsSha(scenario, evalPath), scenario);
+
+    // Instance variant: memoizes the expensive input hashing, then combines with the
+    // (cheap) per-scenario criteria so the result equals the static method exactly.
+    private string TargetShaFor(EvalScenario scenario, string? evalPath)
     {
-        var cacheKey = BuildTargetCacheKey(scenario, evalPath);
-        return _targetShaCache.GetOrAdd(cacheKey, _ => ComputeTargetShaCore(scenario, evalPath));
+        var inputsSha = _inputsShaCache.GetOrAdd(BuildInputsCacheKey(scenario, evalPath), _ => ComputeInputsSha(scenario, evalPath));
+        return CombineIdentity(inputsSha, scenario);
     }
 
+    private static string CombineIdentity(string inputsSha, EvalScenario scenario) =>
+        Sha256Hex(Encoding.UTF8.GetBytes(string.Concat(inputsSha, "\0criteria\0", CriteriaString(scenario))));
+
     /// <summary>
-    /// Cheap, file-I/O-free signature of a scenario's setup inputs, used only to memoize
-    /// the (expensive) content hashing in <see cref="ComputeTargetShaCore"/> within a
-    /// single process.  It must distinguish any two scenarios whose materialized inputs
-    /// could differ, so it folds in the eval directory, the copy flag, the explicit setup
-    /// file recipe, and the command list — but not the on-disk file contents themselves.
+    /// Cheap, file-I/O-free key memoizing the input-artifact hash within this store.  It must
+    /// distinguish any two scenarios whose materialized inputs could differ, so it folds in the
+    /// eval directory, the copy flag, the explicit setup file recipe, and the command list (but
+    /// not the auto-copied file contents — those are determined by the directory + copy flag).
+    /// Evaluation criteria are intentionally excluded here because they are combined after the
+    /// cache lookup in <see cref="TargetShaFor"/>.
     /// </summary>
-    private static string BuildTargetCacheKey(EvalScenario scenario, string? evalPath)
+    private static string BuildInputsCacheKey(EvalScenario scenario, string? evalPath)
     {
         var setup = scenario.Setup;
         var sb = new StringBuilder().Append(evalPath ?? "").Append('\0');
@@ -135,14 +165,15 @@ private static string BuildTargetCacheKey(EvalScenario scenario, string? evalPat
         sb.Append("copy=").Append(setup.CopyTestFiles).Append('\0');
         if (setup.Files is { } files)
             foreach (var f in files)
-                sb.Append("f=").Append(f.Path).Append('|').Append(f.Source ?? "").Append('|').Append(f.Content ?? "").Append('\0');
+                sb.Append("f=").Append(f.Path).Append('|').Append(f.Source ?? "").Append('|')
+                  .Append(f.Content is null ? "" : Sha256Hex(Encoding.UTF8.GetBytes(f.Content))).Append('\0');
         if (setup.Commands is { } commands)
             foreach (var c in commands)
                 sb.Append("c=").Append(c).Append('\0');
         return sb.ToString();
     }
 
-    private static string ComputeTargetShaCore(EvalScenario scenario, string? evalPath)
+    private static string ComputeInputsSha(EvalScenario scenario, string? evalPath)
     {
         var setup = scenario.Setup;
         if (setup is null)
@@ -150,15 +181,16 @@ private static string ComputeTargetShaCore(EvalScenario scenario, string? evalPa
 
         var sb = new StringBuilder();
 
-        // 1. Sibling files auto-copied into the work dir (copy_test_files: true).
+        // 1. Sibling files auto-copied into the work dir (copy_test_files: true).  Mirror
+        //    AgentRunner.SetupWorkDir, which excludes only the top-level eval.yaml.
         if (setup.CopyTestFiles && evalPath is not null)
         {
             var evalDir = Path.GetDirectoryName(evalPath);
             if (!string.IsNullOrEmpty(evalDir) && Directory.Exists(evalDir))
             {
                 var files = Directory.EnumerateFiles(evalDir, "*", SearchOption.AllDirectories)
-                    .Where(f => !string.Equals(Path.GetFileName(f), "eval.yaml", StringComparison.Ordinal))
                     .Select(f => (Rel: Path.GetRelativePath(evalDir, f).Replace('\\', '/'), Full: f))
+                    .Where(x => !string.Equals(x.Rel, "eval.yaml", StringComparison.Ordinal))
                     .OrderBy(x => x.Rel, StringComparer.Ordinal);
                 foreach (var (rel, full) in files)
                     sb.Append("F:").Append(rel).Append('=').Append(HashFile(full)).Append('\n');
@@ -192,35 +224,69 @@ private static string ComputeTargetShaCore(EvalScenario scenario, string? evalPa
         return Sha256Hex(Encoding.UTF8.GetBytes(sb.ToString()));
     }
 
-    private static string HashFile(string path)
+    /// <summary>
+    /// Deterministic textual signature of the evaluation criteria that influence a scenario's
+    /// stored baseline result: run-bounding limits, rubric, assertions, and expect/reject tools.
+    /// </summary>
+    private static string CriteriaString(EvalScenario scenario)
     {
-        using var stream = File.OpenRead(path);
-        return Sha256Hex(SHA256.HashData(stream));
+        var sb = new StringBuilder();
+        sb.Append("turns=").Append(scenario.MaxTurns?.ToString() ?? "").Append('\0');
+        sb.Append("tokens=").Append(scenario.MaxTokens?.ToString() ?? "").Append('\0');
+        sb.Append("timeout=").Append(scenario.Timeout).Append('\0');
+        if (scenario.Rubric is { } rubric)
+            foreach (var r in rubric)
+                sb.Append("R:").Append(r).Append('\n');
+        if (scenario.ExpectTools is { } expect)
+            foreach (var t in expect.OrderBy(x => x, StringComparer.Ordinal))
+                sb.Append("XT:").Append(t).Append('\0');
+        if (scenario.RejectTools is { } reject)
+            foreach (var t in reject.OrderBy(x => x, StringComparer.Ordinal))
+                sb.Append("RT:").Append(t).Append('\0');
+        if (scenario.Assertions is { } assertions)
+            foreach (var a in assertions)
+            {
+                sb.Append("A:").Append(a.Type).Append('|').Append(a.Path ?? "").Append('|')
+                  .Append(a.Value ?? "").Append('|').Append(a.Pattern ?? "").Append('|');
+                if (a.CommandArgs is { } ca)
+                    sb.Append(ca.CommandToRun).Append(';').Append(ca.CommandArguments ?? "").Append(';')
+                      .Append(ca.ExpectedExitCode?.ToString() ?? "").Append(';').Append(ca.ExpectedStdOutContains ?? "").Append(';')
+                      .Append(ca.ExpectedStdErrorContains ?? "").Append(';').Append(ca.ExpectedStdOutMatches ?? "").Append(';')
+                      .Append(ca.ExpectedStdErrorMatches ?? "").Append(';').Append(ca.Timeout?.ToString() ?? "");
+                sb.Append('\n');
+            }
+        return sb.ToString();
     }
 
-    private static string Sha256Hex(byte[] data)
+    private static string HashFile(string path)
     {
-        var bytes = data.Length == 32 ? data : SHA256.HashData(data);
-        return Convert.ToHexString(bytes).ToLowerInvariant();
+        using var stream = File.OpenRead(path);
+        return HexDigest(SHA256.HashData(stream));
     }
 
+    /// <summary>SHA-256 of <paramref name="data"/>, lower-case hex.</summary>
+    private static string Sha256Hex(byte[] data) => HexDigest(SHA256.HashData(data));
+
+    /// <summary>Lower-case hex encoding of an already-computed digest.</summary>
+    private static string HexDigest(byte[] digest) => Convert.ToHexString(digest).ToLowerInvariant();
+
     private static string MakeKey(string promptSha, string targetSha) => string.Concat(promptSha, ":", targetSha);
 
     /// <summary>
-    /// In reuse mode, return the names of scenarios that have no matching cached
-    /// baseline (keyed by prompt + setup/fixture identity).  Empty when every
-    /// scenario is covered.  Each scenario is paired with the eval.yaml path it
-    /// originates from so its input artifacts can be fingerprinted.
+    /// In reuse mode, return human-readable identifiers (name + eval path) of scenarios that
+    /// have no matching cached baseline (keyed by prompt + setup/criteria identity).  Empty
+    /// when every scenario is covered.  Each scenario is paired with the eval.yaml path it
+    /// originates from so its input artifacts can be fingerprinted and reported unambiguously.
     /// </summary>
     public IReadOnlyList<string> FindMissingScenarios(IEnumerable<(EvalScenario Scenario, string? EvalPath)> scenarios) =>
         scenarios
-            .Where(s => !_entries.ContainsKey(MakeKey(ComputePromptSha(s.Scenario.Prompt), ComputeTargetSha(s.Scenario, s.EvalPath))))
-            .Select(s => s.Scenario.Name)
+            .Where(s => !_entries.ContainsKey(MakeKey(ComputePromptSha(s.Scenario.Prompt), TargetShaFor(s.Scenario, s.EvalPath))))
+            .Select(s => s.EvalPath is null ? s.Scenario.Name : $"{s.Scenario.Name} ({s.EvalPath})")
             .ToList();
 
     /// <summary>Get the cached averaged baseline for a scenario, or null when absent.</summary>
     public RunResult? TryGetBaseline(EvalScenario scenario, string? evalPath = null) =>
-        _entries.TryGetValue(MakeKey(ComputePromptSha(scenario.Prompt), ComputeTargetSha(scenario, evalPath)), out var entry)
+        _entries.TryGetValue(MakeKey(ComputePromptSha(scenario.Prompt), TargetShaFor(scenario, evalPath)), out var entry)
             ? entry.Baseline
             : null;
 
@@ -228,7 +294,7 @@ public IReadOnlyList<string> FindMissingScenarios(IEnumerable<(EvalScenario Scen
     public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline, string? evalPath = null)
     {
         var promptSha = ComputePromptSha(scenario.Prompt);
-        var targetSha = ComputeTargetSha(scenario, evalPath);
+        var targetSha = TargetShaFor(scenario, evalPath);
         _entries[MakeKey(promptSha, targetSha)] = new BaselineScenarioEntry(scenario.Name, promptSha, targetSha, runs, averagedBaseline);
     }
 
@@ -238,9 +304,14 @@ public void Save(string path)
         var file = new BaselineFile(
             Version: CurrentVersion,
             Model: _model,
+            JudgeModel: _judgeModel,
             ValidatorVersion: typeof(BaselineStore).Assembly.GetName().Version?.ToString(),
             CreatedAt: DateTime.UtcNow.ToString("o"),
-            Scenarios: _entries.Values.OrderBy(e => e.Name, StringComparer.Ordinal).ToList());
+            Scenarios: _entries.Values
+                .OrderBy(e => e.Name, StringComparer.Ordinal)
+                .ThenBy(e => e.PromptSha, StringComparer.Ordinal)
+                .ThenBy(e => e.TargetSha, StringComparer.Ordinal)
+                .ToList());
 
         var dir = Path.GetDirectoryName(Path.GetFullPath(path));
         if (!string.IsNullOrEmpty(dir))
diff --git a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
index ac805ea602..bce797313b 100644
--- a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
+++ b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
@@ -34,7 +34,7 @@ public static Command Create()
         var noiseMaxDegradationOpt = new Option<double>("--noise-max-degradation") { Description = "Maximum acceptable average quality degradation (0-1) in noise test (only positive degradations count)", DefaultValueFactory = _ => 0.2 };
         var noiseMaxScenarioDegradationOpt = new Option<double>("--noise-max-scenario-degradation") { Description = "Maximum acceptable quality degradation (0-1) for any single noise-test scenario", DefaultValueFactory = _ => 0.4 };
         var baselineOutOpt = new Option<string?>("--baseline-out") { Description = "After running, persist each scenario's averaged baseline (no-skill/no-agent reference) to this file for later reuse with --baseline-from." };
-        var baselineFromOpt = new Option<string?>("--baseline-from") { Description = "Reuse a precomputed baseline from this file instead of re-running the no-skill/no-agent baseline arm. Must match --model and the scenario prompts. Mutually exclusive with --baseline-out." };
+        var baselineFromOpt = new Option<string?>("--baseline-from") { Description = "Reuse a precomputed baseline from this file instead of re-running the no-skill/no-agent baseline arm. Must match --model, --judge-model, and each scenario's prompt, setup inputs, and evaluation criteria. Mutually exclusive with --baseline-out." };
 
         var command = new Command("evaluate", "Evaluate agent skills via LLM-based testing")
         {
@@ -310,7 +310,7 @@ public static async Task<int> Run(ValidatorConfig config, CancellationToken canc
         {
             try
             {
-                baselineStore = BaselineStore.Load(config.BaselineFrom, config.Model);
+                baselineStore = BaselineStore.Load(config.BaselineFrom, config.Model, config.JudgeModel);
             }
             catch (Exception ex) when (ex is FileNotFoundException or InvalidOperationException)
             {
@@ -336,7 +336,7 @@ public static async Task<int> Run(ValidatorConfig config, CancellationToken canc
         }
         else if (config.BaselineOut is not null)
         {
-            baselineStore = BaselineStore.ForWrite(config.Model);
+            baselineStore = BaselineStore.ForWrite(config.Model, config.JudgeModel);
             Console.WriteLine($"Baseline will be persisted to {config.BaselineOut} after the run.");
         }
 
diff --git a/eng/skill-validator/src/README.md b/eng/skill-validator/src/README.md
index 4c8c54b4cd..c2b2d03fbf 100644
--- a/eng/skill-validator/src/README.md
+++ b/eng/skill-validator/src/README.md
@@ -147,7 +147,7 @@ skill-validator check --json --plugin ./plugins/my-plugin
 | `--judge-timeout <n>` | `300` | Judge LLM timeout in seconds |
 | `--require-completion` | `true` | Fail if skill regresses task completion |
 | `--baseline-out <path>` | *(none)* | After running, persist each scenario's averaged baseline (no-skill/no-agent reference) to this file for reuse. Mutually exclusive with `--baseline-from`. |
-| `--baseline-from <path>` | *(none)* | Reuse a precomputed baseline from this file instead of re-running the baseline arm. Must match `--model` and every scenario prompt. Mutually exclusive with `--baseline-out`. |
+| `--baseline-from <path>` | *(none)* | Reuse a precomputed baseline from this file instead of re-running the baseline arm. Must match `--model`, `--judge-model`, and every scenario's prompt, setup inputs, and evaluation criteria. Mutually exclusive with `--baseline-out`. |
 | `--verdict-warn-only` | `false` | Treat verdict failures as warnings (exit 0). Execution errors still fail. |
 | `--no-overfitting-check` | `false` | Disable the LLM-based overfitting analysis (on by default) |
 | `--overfitting-fix` | `false` | Generate `eval.fixed.yaml` with improved rubric items/assertions |
@@ -166,7 +166,9 @@ Every evaluation runs each scenario through a **baseline arm** (the agent with n
 1. **Produce** a baseline file with `--baseline-out baseline.json`. After the run, each scenario's averaged baseline result (honoring `--runs`) is written to the file.
 2. **Reuse** it with `--baseline-from baseline.json` on subsequent runs. The baseline arm is skipped entirely; the cached baseline is used for assertions, pairwise/independent judging, and metric deltas.
 
-The baseline file records the `--model` and, per scenario, a SHA-256 of the prompt **and** a SHA-256 of its setup inputs (the fixtures copied via `copy_test_files`, explicit setup files, and setup commands — the analog of a target/input SHA). On reuse the validator fails fast if the model differs or any scenario's prompt-plus-fixture identity is missing from the file, so a stale or mismatched baseline can never be silently applied — and two scenarios that share a prompt but feed the agent different fixtures (e.g. a different `build.binlog`) never reuse each other's baseline. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status.
+The baseline file records the `--model` **and** `--judge-model`, and per scenario a SHA-256 of the prompt plus a composite SHA-256 over (a) its setup inputs — the fixtures copied via `copy_test_files`, explicit setup files, and setup commands — and (b) the evaluation criteria that shape the stored result (rubric, assertions, expect/reject tools, and the turn/token/timeout limits). This is the analog of a target/input SHA. On reuse the validator fails fast if the agent model or the judge model differs, or if any scenario's prompt-plus-setup-plus-criteria identity is missing from the file, so a stale or mismatched baseline can never be silently applied — and two scenarios that share a prompt but feed the agent different fixtures (e.g. a different `build.binlog`) or use different rubrics never reuse each other's baseline. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status.
+
+> **Note:** Setup `commands` are fingerprinted by their text (the recipe), not the artifacts they produce, so baseline reuse assumes setup commands are deterministic/hermetic — a command whose output changes between runs (e.g. fetching `latest`) will not invalidate a cached baseline.
 
 The two options are mutually exclusive.
 
diff --git a/eng/skill-validator/src/docs/InvestigatingResults.md b/eng/skill-validator/src/docs/InvestigatingResults.md
index c278035799..e2c0a7ecd4 100644
--- a/eng/skill-validator/src/docs/InvestigatingResults.md
+++ b/eng/skill-validator/src/docs/InvestigatingResults.md
@@ -83,7 +83,7 @@ Each scenario includes two required runs (baseline + isolated). It may also incl
 
 > **Note:** Scenarios do not have a `passed` field. To determine pass/fail for an individual scenario, check whether `improvementScore >= 0`. This is the effective score: when no plugin run is present it equals `isolatedImprovementScore`; when a plugin run is present it is the min of isolated and plugin scores. The `passed` field exists only at the verdict level (per-skill).
 
-> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` plus, per scenario, a SHA-256 of the prompt and a SHA-256 of its setup/fixture inputs (copied test files, explicit setup files, and setup commands); reuse fails fast if the model differs or any prompt-plus-fixture identity is missing, so the baseline you compare against is always identity-matched and a shared prompt across cases with different fixtures cannot cross-contaminate. Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons.
+> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` and `--judge-model` plus, per scenario, a SHA-256 of the prompt and a composite SHA-256 over its setup inputs (copied test files, explicit setup files, and setup commands) and its evaluation criteria (rubric, assertions, expect/reject tools, and turn/token/timeout limits); reuse fails fast if the agent model or judge model differs, or if any prompt-plus-setup-plus-criteria identity is missing, so the baseline you compare against is always identity-matched and a shared prompt across cases with different fixtures or rubrics cannot cross-contaminate. Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons.
 
 ### Breakdown fields
 
diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
index 09444d7cb5..d6d41c3d02 100644
--- a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
+++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
@@ -6,6 +6,9 @@ namespace SkillValidator.Tests;
 
 public class BaselineStoreTests
 {
+    private const string Model = "model-x";
+    private const string Judge = "judge-x";
+
     private static RunResult MakeBaseline(double overallScore = 3, string output = "baseline output") =>
         new(
             new RunMetrics
@@ -42,7 +45,7 @@ public void SaveThenLoad_RoundTripsBaselinePerScenario()
         var path = TempPath();
         try
         {
-            var store = BaselineStore.ForWrite("model-x");
+            var store = BaselineStore.ForWrite(Model, Judge);
             var s1 = Scenario("alpha", "prompt one");
             var s2 = Scenario("beta", "prompt two");
             store.Record(s1, runs: 5, MakeBaseline(overallScore: 4, output: "out-1"));
@@ -51,7 +54,7 @@ public void SaveThenLoad_RoundTripsBaselinePerScenario()
 
             Assert.True(File.Exists(path));
 
-            var loaded = BaselineStore.Load(path, "model-x");
+            var loaded = BaselineStore.Load(path, Model, Judge);
             Assert.True(loaded.IsReuse);
             Assert.Equal(2, loaded.Count);
 
@@ -75,12 +78,12 @@ public void Load_ThrowsOnModelMismatch()
         var path = TempPath();
         try
         {
-            var store = BaselineStore.ForWrite("model-x");
+            var store = BaselineStore.ForWrite(Model, Judge);
             store.Record(Scenario("alpha", "prompt one"), runs: 3, MakeBaseline());
             store.Save(path);
 
-            var ex = Assert.Throws<InvalidOperationException>(() => BaselineStore.Load(path, "model-y"));
-            Assert.Contains("model-x", ex.Message);
+            var ex = Assert.Throws<InvalidOperationException>(() => BaselineStore.Load(path, "model-y", Judge));
+            Assert.Contains(Model, ex.Message);
             Assert.Contains("model-y", ex.Message);
         }
         finally
@@ -89,6 +92,26 @@ public void Load_ThrowsOnModelMismatch()
         }
     }
 
+    [Fact]
+    public void Load_ThrowsOnJudgeModelMismatch()
+    {
+        var path = TempPath();
+        try
+        {
+            var store = BaselineStore.ForWrite(Model, Judge);
+            store.Record(Scenario("alpha", "prompt one"), runs: 3, MakeBaseline());
+            store.Save(path);
+
+            var ex = Assert.Throws<InvalidOperationException>(() => BaselineStore.Load(path, Model, "judge-y"));
+            Assert.Contains(Judge, ex.Message);
+            Assert.Contains("judge-y", ex.Message);
+        }
+        finally
+        {
+            File.Delete(path);
+        }
+    }
+
     [Fact]
     public void Load_ThrowsOnUnsupportedVersion()
     {
@@ -97,13 +120,14 @@ public void Load_ThrowsOnUnsupportedVersion()
         {
             var file = new BaselineFile(
                 Version: BaselineStore.CurrentVersion + 1,
-                Model: "model-x",
+                Model: Model,
+                JudgeModel: Judge,
                 ValidatorVersion: "9.9.9",
                 CreatedAt: DateTime.UtcNow.ToString("o"),
                 Scenarios: []);
             File.WriteAllText(path, JsonSerializer.Serialize(file, SkillValidatorJsonContext.Default.BaselineFile));
 
-            var ex = Assert.Throws<InvalidOperationException>(() => BaselineStore.Load(path, "model-x"));
+            var ex = Assert.Throws<InvalidOperationException>(() => BaselineStore.Load(path, Model, Judge));
             Assert.Contains("unsupported version", ex.Message);
         }
         finally
@@ -115,7 +139,7 @@ public void Load_ThrowsOnUnsupportedVersion()
     [Fact]
     public void Load_ThrowsWhenFileMissing()
     {
-        Assert.Throws<FileNotFoundException>(() => BaselineStore.Load(TempPath(), "model-x"));
+        Assert.Throws<FileNotFoundException>(() => BaselineStore.Load(TempPath(), Model, Judge));
     }
 
     [Fact]
@@ -124,12 +148,12 @@ public void FindMissingScenarios_ReturnsScenariosWithoutCachedBaseline()
         var path = TempPath();
         try
         {
-            var store = BaselineStore.ForWrite("model-x");
+            var store = BaselineStore.ForWrite(Model, Judge);
             var present = Scenario("alpha", "prompt one");
             store.Record(present, runs: 5, MakeBaseline());
             store.Save(path);
 
-            var loaded = BaselineStore.Load(path, "model-x");
+            var loaded = BaselineStore.Load(path, Model, Judge);
             var missing = loaded.FindMissingScenarios([(present, null), (Scenario("beta", "prompt two"), null)]);
 
             Assert.Single(missing);
@@ -144,7 +168,7 @@ public void FindMissingScenarios_ReturnsScenariosWithoutCachedBaseline()
     [Fact]
     public void WriteStore_IsNotReuse()
     {
-        var store = BaselineStore.ForWrite("model-x");
+        var store = BaselineStore.ForWrite(Model, Judge);
         Assert.False(store.IsReuse);
         Assert.Null(store.TryGetBaseline(Scenario("alpha", "prompt one")));
     }
@@ -189,6 +213,30 @@ public void ComputeTargetSha_DiffersByFixtureContentAndIsStable()
         }
     }
 
+    [Fact]
+    public void ComputeTargetSha_DiffersByEvaluationCriteria()
+    {
+        const string prompt = "investigate the failure";
+        var baseScenario = Scenario("s", prompt);
+        var withRubric = baseScenario with { Rubric = ["Did it find the root cause?"] };
+        var withAssertion = baseScenario with { Assertions = [new Assertion(AssertionType.OutputContains, Value: "error")] };
+        var withTurns = baseScenario with { MaxTurns = 5 };
+        var withExpectTools = baseScenario with { ExpectTools = ["bash"] };
+
+        var shaBase = BaselineStore.ComputeTargetSha(baseScenario, null);
+
+        // Each criterion that shapes the cached result must change the identity.
+        Assert.NotEqual(shaBase, BaselineStore.ComputeTargetSha(withRubric, null));
+        Assert.NotEqual(shaBase, BaselineStore.ComputeTargetSha(withAssertion, null));
+        Assert.NotEqual(shaBase, BaselineStore.ComputeTargetSha(withTurns, null));
+        Assert.NotEqual(shaBase, BaselineStore.ComputeTargetSha(withExpectTools, null));
+
+        // Same criteria → stable identity.
+        Assert.Equal(
+            BaselineStore.ComputeTargetSha(withRubric, null),
+            BaselineStore.ComputeTargetSha(baseScenario with { Rubric = ["Did it find the root cause?"] }, null));
+    }
+
     [Fact]
     public void SamePromptDifferentFixture_DoesNotReuseBaseline()
     {
@@ -203,21 +251,22 @@ public void SamePromptDifferentFixture_DoesNotReuseBaseline()
             var scenarioB = FixtureScenario("case-B", sharedPrompt);
 
             // Persist a baseline only for case A.
-            var store = BaselineStore.ForWrite("model-x");
+            var store = BaselineStore.ForWrite(Model, Judge);
             store.Record(scenarioA, runs: 5, MakeBaseline(output: "A-baseline"), evalA);
             store.Save(path);
 
-            var loaded = BaselineStore.Load(path, "model-x");
+            var loaded = BaselineStore.Load(path, Model, Judge);
 
             // Case A reuses its baseline; case B must NOT (different targetSha).
             Assert.NotNull(loaded.TryGetBaseline(scenarioA, evalA));
             Assert.Equal("A-baseline", loaded.TryGetBaseline(scenarioA, evalA)!.Metrics.AgentOutput);
             Assert.Null(loaded.TryGetBaseline(scenarioB, evalB));
 
-            // FindMissingScenarios surfaces case B by name despite the shared prompt.
+            // FindMissingScenarios surfaces case B (with its eval path) despite the shared prompt.
             var missing = loaded.FindMissingScenarios([(scenarioA, evalA), (scenarioB, evalB)]);
             Assert.Single(missing);
-            Assert.Equal("case-B", missing[0]);
+            Assert.StartsWith("case-B", missing[0]);
+            Assert.Contains(evalB, missing[0]);
         }
         finally
         {

From b5be0e3fb87ea3d04ac0ba0204976ff973744ba0 Mon Sep 17 00:00:00 2001
From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com>
Date: Thu, 11 Jun 2026 18:10:31 +0200
Subject: [PATCH 4/7] Address PR review comments on baseline reuse

- Mirror AgentRunner.SetupWorkDir exactly when hashing copied fixtures:
  enumerate only the files actually copied (top-level siblings except
  eval.yaml, recursing into directories) and skip reparse points and
  out-of-root junctions, instead of blindly hashing every file under the
  eval directory. This keeps the fixture identity restricted to the
  intentionally-copied set so stray output/log files can't poison reuse.
- Stream baseline JSON to/from disk (File.OpenRead/File.Create with
  JsonSerializer) so large baselines never materialize as one giant
  in-memory string.
- Enrich the fail-fast 'missing scenario' output with the eval path and
  short prompt/target SHA prefixes so it is actionable when scenario
  names collide across eval files.
- Add a test locking in recursive (nested-directory) fixture hashing.

563 tests pass.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/Evaluate/BaselineStore.cs             | 87 ++++++++++++++++---
 .../tests/Evaluate/BaselineStoreTests.cs      | 29 ++++++-
 2 files changed, 101 insertions(+), 15 deletions(-)

diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs
index 87a51a12d2..3a060e13e4 100644
--- a/eng/skill-validator/src/Evaluate/BaselineStore.cs
+++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs
@@ -83,7 +83,8 @@ public static BaselineStore Load(string path, string expectedModel, string expec
         BaselineFile? file;
         try
         {
-            file = JsonSerializer.Deserialize(File.ReadAllText(path), SkillValidatorJsonContext.Default.BaselineFile);
+            using var stream = File.OpenRead(path);
+            file = JsonSerializer.Deserialize(stream, SkillValidatorJsonContext.Default.BaselineFile);
         }
         catch (JsonException ex)
         {
@@ -182,17 +183,15 @@ private static string ComputeInputsSha(EvalScenario scenario, string? evalPath)
         var sb = new StringBuilder();
 
         // 1. Sibling files auto-copied into the work dir (copy_test_files: true).  Mirror
-        //    AgentRunner.SetupWorkDir, which excludes only the top-level eval.yaml.
+        //    AgentRunner.SetupWorkDir/CopyDirectory exactly so the hash reflects precisely
+        //    the files the agent is given — no more (e.g. reparse points are skipped) and
+        //    no fewer (nested files are included).
         if (setup.CopyTestFiles && evalPath is not null)
         {
             var evalDir = Path.GetDirectoryName(evalPath);
             if (!string.IsNullOrEmpty(evalDir) && Directory.Exists(evalDir))
             {
-                var files = Directory.EnumerateFiles(evalDir, "*", SearchOption.AllDirectories)
-                    .Select(f => (Rel: Path.GetRelativePath(evalDir, f).Replace('\\', '/'), Full: f))
-                    .Where(x => !string.Equals(x.Rel, "eval.yaml", StringComparison.Ordinal))
-                    .OrderBy(x => x.Rel, StringComparer.Ordinal);
-                foreach (var (rel, full) in files)
+                foreach (var (rel, full) in EnumerateCopiedFixtures(evalDir).OrderBy(x => x.Rel, StringComparer.Ordinal))
                     sb.Append("F:").Append(rel).Append('=').Append(HashFile(full)).Append('\n');
             }
         }
@@ -224,6 +223,54 @@ private static string ComputeInputsSha(EvalScenario scenario, string? evalPath)
         return Sha256Hex(Encoding.UTF8.GetBytes(sb.ToString()));
     }
 
+    private static readonly StringComparison PathComparison =
+        OperatingSystem.IsWindows() ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal;
+
+    /// <summary>
+    /// Yields the exact set of files <see cref="AgentRunner.SetupWorkDir"/> copies into the
+    /// work dir under <c>copy_test_files</c>: every top-level sibling except <c>eval.yaml</c>,
+    /// recursing into directories.  Inside those directories, reparse points (symlinks/junctions)
+    /// and subdirectories that resolve outside their top-level fixture directory are skipped,
+    /// mirroring <c>CopyDirectory</c>, so the hash only ever covers files genuinely materialized
+    /// for the run rather than whatever else happens to live under the eval directory.
+    /// </summary>
+    private static IEnumerable<(string Rel, string Full)> EnumerateCopiedFixtures(string evalDir)
+    {
+        foreach (var entry in new DirectoryInfo(evalDir).EnumerateFileSystemInfos())
+        {
+            if (string.Equals(entry.Name, "eval.yaml", StringComparison.Ordinal))
+                continue;
+            if (entry is FileInfo file)
+                yield return (file.Name, file.FullName);
+            else if (entry is DirectoryInfo dir)
+            {
+                var root = Path.TrimEndingDirectorySeparator(Path.GetFullPath(dir.FullName));
+                foreach (var nested in EnumerateDirFixtures(dir.FullName, dir.Name, root))
+                    yield return nested;
+            }
+        }
+    }
+
+    private static IEnumerable<(string Rel, string Full)> EnumerateDirFixtures(string dir, string relBase, string sourceRoot)
+    {
+        foreach (var entry in new DirectoryInfo(dir).EnumerateFileSystemInfos())
+        {
+            if ((entry.Attributes & FileAttributes.ReparsePoint) != 0)
+                continue;
+            var rel = string.Concat(relBase, "/", entry.Name);
+            if (entry is DirectoryInfo sub)
+            {
+                var subFull = Path.TrimEndingDirectorySeparator(Path.GetFullPath(sub.FullName));
+                if (!subFull.StartsWith(sourceRoot + Path.DirectorySeparatorChar, PathComparison))
+                    continue;
+                foreach (var nested in EnumerateDirFixtures(sub.FullName, rel, sourceRoot))
+                    yield return nested;
+            }
+            else
+                yield return (rel, entry.FullName);
+        }
+    }
+
     /// <summary>
     /// Deterministic textual signature of the evaluation criteria that influence a scenario's
     /// stored baseline result: run-bounding limits, rubric, assertions, and expect/reject tools.
@@ -273,15 +320,24 @@ private static string HashFile(string path)
     private static string MakeKey(string promptSha, string targetSha) => string.Concat(promptSha, ":", targetSha);
 
     /// <summary>
-    /// In reuse mode, return human-readable identifiers (name + eval path) of scenarios that
-    /// have no matching cached baseline (keyed by prompt + setup/criteria identity).  Empty
-    /// when every scenario is covered.  Each scenario is paired with the eval.yaml path it
-    /// originates from so its input artifacts can be fingerprinted and reported unambiguously.
+    /// In reuse mode, return human-readable identifiers of scenarios that have no matching
+    /// cached baseline (keyed by prompt + setup/criteria identity).  Empty when every scenario
+    /// is covered.  Each entry carries the originating eval path plus short prompt/target SHA
+    /// prefixes so a missing scenario is actionable even when names collide across eval files.
     /// </summary>
     public IReadOnlyList<string> FindMissingScenarios(IEnumerable<(EvalScenario Scenario, string? EvalPath)> scenarios) =>
         scenarios
-            .Where(s => !_entries.ContainsKey(MakeKey(ComputePromptSha(s.Scenario.Prompt), TargetShaFor(s.Scenario, s.EvalPath))))
-            .Select(s => s.EvalPath is null ? s.Scenario.Name : $"{s.Scenario.Name} ({s.EvalPath})")
+            .Select(s => (
+                s.Scenario,
+                s.EvalPath,
+                PromptSha: ComputePromptSha(s.Scenario.Prompt),
+                TargetSha: TargetShaFor(s.Scenario, s.EvalPath)))
+            .Where(x => !_entries.ContainsKey(MakeKey(x.PromptSha, x.TargetSha)))
+            .Select(x =>
+            {
+                var where = x.EvalPath is null ? "" : $" in {x.EvalPath}";
+                return $"{x.Scenario.Name}{where} [prompt {x.PromptSha[..8]}, target {x.TargetSha[..8]}]";
+            })
             .ToList();
 
     /// <summary>Get the cached averaged baseline for a scenario, or null when absent.</summary>
@@ -317,7 +373,10 @@ public void Save(string path)
         if (!string.IsNullOrEmpty(dir))
             Directory.CreateDirectory(dir);
 
-        File.WriteAllText(path, JsonSerializer.Serialize(file, SkillValidatorJsonContext.Default.BaselineFile));
+        // Stream directly to disk so large baselines (many scenarios with full
+        // RunMetrics/AgentOutput) never materialize as one giant in-memory string.
+        using var stream = File.Create(path);
+        JsonSerializer.Serialize(stream, file, SkillValidatorJsonContext.Default.BaselineFile);
     }
 
     /// <summary>Number of baselines currently held.</summary>
diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
index d6d41c3d02..7dbd5e249f 100644
--- a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
+++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
@@ -157,7 +157,7 @@ public void FindMissingScenarios_ReturnsScenariosWithoutCachedBaseline()
             var missing = loaded.FindMissingScenarios([(present, null), (Scenario("beta", "prompt two"), null)]);
 
             Assert.Single(missing);
-            Assert.Equal("beta", missing[0]);
+            Assert.StartsWith("beta", missing[0]);
         }
         finally
         {
@@ -237,6 +237,33 @@ public void ComputeTargetSha_DiffersByEvaluationCriteria()
             BaselineStore.ComputeTargetSha(baseScenario with { Rubric = ["Did it find the root cause?"] }, null));
     }
 
+    [Fact]
+    public void ComputeTargetSha_IncludesNestedFixtureFiles()
+    {
+        // copy_test_files copies subdirectories recursively, so nested fixture content
+        // must participate in the target identity (mirrors AgentRunner.CopyDirectory).
+        var evalPath = MakeEvalDirWithFixture("top.txt", "top");
+        var evalDir = Path.GetDirectoryName(evalPath)!;
+        var nestedDir = Path.Combine(evalDir, "sub");
+        Directory.CreateDirectory(nestedDir);
+        var nestedFile = Path.Combine(nestedDir, "data.bin");
+        File.WriteAllText(nestedFile, "v1");
+        try
+        {
+            var scenario = FixtureScenario("s", "investigate");
+            var before = BaselineStore.ComputeTargetSha(scenario, evalPath);
+
+            File.WriteAllText(nestedFile, "v2");
+            var after = BaselineStore.ComputeTargetSha(scenario, evalPath);
+
+            Assert.NotEqual(before, after); // nested file change invalidates reuse
+        }
+        finally
+        {
+            Directory.Delete(evalDir, recursive: true);
+        }
+    }
+
     [Fact]
     public void SamePromptDifferentFixture_DoesNotReuseBaseline()
     {

From c9f0f44e9eeea00f08be73f971b690e704d3188e Mon Sep 17 00:00:00 2001
From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com>
Date: Thu, 11 Jun 2026 20:08:39 +0200
Subject: [PATCH 5/7] Address second round of PR review comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Skip top-level reparse points in EnumerateCopiedFixtures (not just
  nested ones) so a top-level symlink/junction can't cause hashing of
  data outside the eval directory; code now matches the docstring.
- Record uses first-writer-wins (TryAdd) instead of overwriting, so once a
  scenario identity has been recorded, a later record from another parallel
  target can no longer clobber the value persisted to --baseline-out.
- Persist the baseline judge result to the session DB even when the
  baseline is reused, so the registered 'baseline-reused' session record
  is complete for downstream investigation tooling (pairwise was already
  saved; the judge result was incorrectly gated on a fresh run).
- Add first-writer-wins test (564 pass).

Note: BaselineStore stays internal — the test project already has
InternalsVisibleTo, so it compiles.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/Evaluate/BaselineStore.cs             | 20 +++++++++-----
 .../src/Evaluate/EvaluateCommand.cs           |  6 +++--
 .../tests/Evaluate/BaselineStoreTests.cs      | 26 +++++++++++++++++++
 3 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs
index 3a060e13e4..1a93250087 100644
--- a/eng/skill-validator/src/Evaluate/BaselineStore.cs
+++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs
@@ -229,10 +229,10 @@ private static string ComputeInputsSha(EvalScenario scenario, string? evalPath)
     /// <summary>
     /// Yields the exact set of files <see cref="AgentRunner.SetupWorkDir"/> copies into the
     /// work dir under <c>copy_test_files</c>: every top-level sibling except <c>eval.yaml</c>,
-    /// recursing into directories.  Reparse points (symlinks/junctions) and junctions that
-    /// resolve outside their top-level fixture directory are skipped, mirroring
-    /// <c>CopyDirectory</c>, so the hash only ever covers files genuinely materialized for the
-    /// run rather than whatever else happens to live under the eval directory.
+    /// recursing into directories.  Reparse points (symlinks/junctions) — at the top level and
+    /// nested — and junctions that resolve outside their top-level fixture directory are
+    /// skipped, so the hash only ever covers files genuinely materialized for the run rather
+    /// than data linked from outside the eval directory.
     /// </summary>
     private static IEnumerable<(string Rel, string Full)> EnumerateCopiedFixtures(string evalDir)
     {
@@ -240,6 +240,8 @@ private static string ComputeInputsSha(EvalScenario scenario, string? evalPath)
         {
             if (string.Equals(entry.Name, "eval.yaml", StringComparison.Ordinal))
                 continue;
+            if ((entry.Attributes & FileAttributes.ReparsePoint) != 0)
+                continue;
             if (entry is FileInfo file)
                 yield return (file.Name, file.FullName);
             else if (entry is DirectoryInfo dir)
@@ -346,12 +348,18 @@ public IReadOnlyList<string> FindMissingScenarios(IEnumerable<(EvalScenario Scen
             ? entry.Baseline
             : null;
 
-    /// <summary>Record a scenario's averaged baseline for later persistence (write mode).</summary>
+    /// <summary>
+    /// Record a scenario's averaged baseline for later persistence (write mode).  The
+    /// baseline arm is target-independent, so when several targets evaluated in parallel
+    /// share the same scenario identity they produce the same key; a <b>first-writer-wins</b>
+    /// strategy keeps the persisted file deterministic regardless of completion order
+    /// (later identical-key records — differing only by run-to-run noise — are ignored).
+    /// </summary>
     public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline, string? evalPath = null)
     {
         var promptSha = ComputePromptSha(scenario.Prompt);
         var targetSha = TargetShaFor(scenario, evalPath);
-        _entries[MakeKey(promptSha, targetSha)] = new BaselineScenarioEntry(scenario.Name, promptSha, targetSha, runs, averagedBaseline);
+        _entries.TryAdd(MakeKey(promptSha, targetSha), new BaselineScenarioEntry(scenario.Name, promptSha, targetSha, runs, averagedBaseline));
     }
 
     /// <summary>Serialize all recorded baselines to <paramref name="path"/>.</summary>
diff --git a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
index bce797313b..1e8348b3d3 100644
--- a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
+++ b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
@@ -1434,8 +1434,10 @@ private static async Task<RunExecutionResult> ExecuteRun(
 
         if (sessionDb is not null)
         {
-            if (reusedBaseline is null)
-                sessionDb.SaveJudgeResult(baselineSessionId, JsonSerializer.Serialize(baselineJudge, SkillValidatorJsonContext.Default.JudgeResult));
+            // Persist the baseline judge result even when reused so the baseline session
+            // record (registered with the "baseline-reused" phase) is complete for
+            // downstream investigation tooling — baselineJudge is valid in both cases.
+            sessionDb.SaveJudgeResult(baselineSessionId, JsonSerializer.Serialize(baselineJudge, SkillValidatorJsonContext.Default.JudgeResult));
             sessionDb.SaveJudgeResult(isolatedSessionId, JsonSerializer.Serialize(isolatedJudge, SkillValidatorJsonContext.Default.JudgeResult));
             sessionDb.SaveJudgeResult(pluginSessionId, JsonSerializer.Serialize(pluginJudge, SkillValidatorJsonContext.Default.JudgeResult));
         }
diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
index 7dbd5e249f..57d295a880 100644
--- a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
+++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
@@ -237,6 +237,32 @@ public void ComputeTargetSha_DiffersByEvaluationCriteria()
             BaselineStore.ComputeTargetSha(baseScenario with { Rubric = ["Did it find the root cause?"] }, null));
     }
 
+    [Fact]
+    public void Record_IsFirstWriterWins_ForSameScenarioIdentity()
+    {
+        var path = TempPath();
+        try
+        {
+            var store = BaselineStore.ForWrite(Model, Judge);
+            var scenario = Scenario("alpha", "prompt one");
+
+            // Same identity recorded twice (e.g. two parallel targets sharing a scenario)
+            // with differing run-to-run results: the first record must win so a later
+            // record can never clobber the value persisted to --baseline-out.
+            store.Record(scenario, runs: 5, MakeBaseline(output: "first"));
+            store.Record(scenario, runs: 5, MakeBaseline(output: "second"));
+
+            Assert.Equal(1, store.Count);
+            store.Save(path);
+            var loaded = BaselineStore.Load(path, Model, Judge);
+            Assert.Equal("first", loaded.TryGetBaseline(scenario)!.Metrics.AgentOutput);
+        }
+        finally
+        {
+            File.Delete(path);
+        }
+    }
+
     [Fact]
     public void ComputeTargetSha_IncludesNestedFixtureFiles()
     {

From 7b76143d0e60a4d4a7c4fd7a42cab72fbbda437b Mon Sep 17 00:00:00 2001
From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com>
Date: Fri, 12 Jun 2026 12:11:47 +0200
Subject: [PATCH 6/7] Pin MessagePack to 2.5.301 to fix NU1903 vulnerability

The transitive MessagePack 2.5.198 (via GitHub.Copilot.SDK -> StreamJsonRpc)
has a high-severity vulnerability (GHSA-hv8m-jj95-wg3x) that fails the build
under TreatWarningsAsErrors. Pin a direct reference to the patched 2.5.301.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 eng/skill-validator/src/SkillValidator.csproj | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/eng/skill-validator/src/SkillValidator.csproj b/eng/skill-validator/src/SkillValidator.csproj
index 77b51e897e..ca7037ad99 100644
--- a/eng/skill-validator/src/SkillValidator.csproj
+++ b/eng/skill-validator/src/SkillValidator.csproj
@@ -52,6 +52,14 @@
       Fixed in 1.1.62.  Drop this once the upstream chain bumps past 1.1.62.
     -->
     <PackageReference Include="Nerdbank.MessagePack" Version="1.2.4" />
+
+    <!--
+      Transitive pin: GitHub.Copilot.SDK -> StreamJsonRpc 2.24.84 drags in
+      MessagePack 2.5.198 which has a known high-severity vulnerability
+      (GHSA-hv8m-jj95-wg3x, out-of-bounds read in LZ4 decompression). Patched in
+      the v2 line as of 2.5.301.  Drop this once the upstream chain bumps past it.
+    -->
+    <PackageReference Include="MessagePack" Version="2.5.301" />
   </ItemGroup>
 
 </Project>

From 79ffa9c33a8b48b07e9a9bd7a11a932112c6092e Mon Sep 17 00:00:00 2001
From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com>
Date: Fri, 12 Jun 2026 13:19:58 +0200
Subject: [PATCH 7/7] Address review: bare-filename fixture hashing, clone
 reused baseline, consistent token attribution

- ComputeInputsSha: normalize evalPath via Path.GetFullPath so a bare filename
  still hashes sibling fixtures (avoids TargetSha collisions / unsafe reuse).
- RunMetrics.Clone(): per-run copy with fresh collections; reuse paths now clone
  the cached baseline so concurrent evaluations never share a mutable instance.
- Pairwise judge tokens attributed to both compared runs in every mode (the
  baseline clone makes this safe), keeping token deltas comparable across
  --baseline-from modes.
- Reword Record first-writer-wins doc to describe the within-run stabilization
  guarantee rather than order-independence.
- Add tests for bare-filename fixture hashing and clone isolation (566 pass).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/Evaluate/BaselineStore.cs             | 17 +++++--
 .../src/Evaluate/EvaluateCommand.cs           | 20 ++++----
 eng/skill-validator/src/Evaluate/Models.cs    | 31 ++++++++++++
 .../tests/Evaluate/BaselineStoreTests.cs      | 48 +++++++++++++++++++
 4 files changed, 104 insertions(+), 12 deletions(-)

diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs
index 1a93250087..bca17bbe1f 100644
--- a/eng/skill-validator/src/Evaluate/BaselineStore.cs
+++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs
@@ -188,7 +188,12 @@ private static string ComputeInputsSha(EvalScenario scenario, string? evalPath)
         //    no fewer (nested files are included).
         if (setup.CopyTestFiles && evalPath is not null)
         {
-            var evalDir = Path.GetDirectoryName(evalPath);
+            // Normalize first: Path.GetDirectoryName returns "" for a bare filename
+            // (e.g. "eval.yaml" in the cwd), which would silently skip fixture hashing
+            // even though copy_test_files still copies the sibling files — risking
+            // TargetSha collisions and unsafe baseline reuse.  GetFullPath resolves the
+            // bare name against the current directory so its real parent is hashed.
+            var evalDir = Path.GetDirectoryName(Path.GetFullPath(evalPath));
             if (!string.IsNullOrEmpty(evalDir) && Directory.Exists(evalDir))
             {
                 foreach (var (rel, full) in EnumerateCopiedFixtures(evalDir).OrderBy(x => x.Rel, StringComparer.Ordinal))
@@ -351,9 +356,13 @@ public IReadOnlyList<string> FindMissingScenarios(IEnumerable<(EvalScenario Scen
     /// <summary>
     /// Record a scenario's averaged baseline for later persistence (write mode).  The
     /// baseline arm is target-independent, so when several targets evaluated in parallel
-    /// share the same scenario identity they produce the same key; a <b>first-writer-wins</b>
-    /// strategy keeps the persisted file deterministic regardless of completion order
-    /// (later identical-key records — differing only by run-to-run noise — are ignored).
+    /// share the same scenario identity they produce the same key.  A <b>first-writer-wins</b>
+    /// strategy stabilizes the baseline chosen <i>within a single run</i>: once a value is
+    /// recorded for a key the first writer's value is kept and later records for that key
+    /// are ignored, preventing non-deterministic late overwrites under parallelism.  The
+    /// competing records differ only by run-to-run noise, so which writer wins the race is
+    /// immaterial — the guarantee is that the persisted value is not clobbered afterward,
+    /// not that it is independent of thread scheduling.
     /// </summary>
     public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline, string? evalPath = null)
     {
diff --git a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
index 1e8348b3d3..cada00d47e 100644
--- a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
+++ b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
@@ -806,7 +806,7 @@ private static async Task<RunExecutionResult> ExecuteAgentRun(
         {
             if (config.Verbose)
                 runLog("↩︎ reusing precomputed baseline");
-            baselineMetrics = reusedBaseline.Metrics;
+            baselineMetrics = reusedBaseline.Metrics.Clone();
             var skilled = await Task.WhenAll(isolatedTask, pluginTask);
             isolatedMetrics = skilled[0];
             pluginMetrics = skilled[1];
@@ -908,8 +908,11 @@ private static async Task<RunExecutionResult> ExecuteAgentRun(
                     new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, pairwiseWorkDir, agent.Path, worseSkilled.WorkDir),
                     runLog, cancellationToken);
                 pairwise = pairwiseResult;
-                if (reusedBaseline is null)
-                    AccumulateJudgeTokens(baselineMetrics, pairwiseTokens);
+                // Attribute pairwise judge tokens consistently to both compared runs in
+                // every mode so token deltas stay comparable regardless of --baseline-from.
+                // baselineMetrics is a per-run clone when reused, so this never mutates the
+                // shared cached baseline.
+                AccumulateJudgeTokens(baselineMetrics, pairwiseTokens);
                 AccumulateJudgeTokens(worseSkilled, pairwiseTokens);
             }
             catch (Exception error)
@@ -1344,7 +1347,7 @@ private static async Task<RunExecutionResult> ExecuteRun(
         {
             if (config.Verbose)
                 runLog("↩︎ reusing precomputed baseline");
-            baselineMetrics = reusedBaseline.Metrics;
+            baselineMetrics = reusedBaseline.Metrics.Clone();
             var skilled = await Task.WhenAll(isolatedTask, pluginTask);
             isolatedMetrics = skilled[0];
             pluginMetrics = skilled[1];
@@ -1466,10 +1469,11 @@ private static async Task<RunExecutionResult> ExecuteRun(
                     new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, pairwiseWorkDir, skill.Path, worseSkilled.WorkDir),
                     runLog, cancellationToken);
                 pairwise = pairwiseResult;
-                // Attribute pairwise judge tokens to the compared run (and to the baseline
-                // only when it was freshly executed, to avoid double-counting reused cost).
-                if (reusedBaseline is null)
-                    AccumulateJudgeTokens(baselineMetrics, pairwiseTokens);
+                // Attribute pairwise judge tokens consistently to both compared runs in
+                // every mode so token deltas stay comparable regardless of --baseline-from.
+                // baselineMetrics is a per-run clone when reused, so this never mutates the
+                // shared cached baseline.
+                AccumulateJudgeTokens(baselineMetrics, pairwiseTokens);
                 AccumulateJudgeTokens(worseSkilled, pairwiseTokens);
                 if (sessionDb is not null && pairwise is not null)
                 {
diff --git a/eng/skill-validator/src/Evaluate/Models.cs b/eng/skill-validator/src/Evaluate/Models.cs
index 00508a9cf8..da6fe75a85 100644
--- a/eng/skill-validator/src/Evaluate/Models.cs
+++ b/eng/skill-validator/src/Evaluate/Models.cs
@@ -186,6 +186,37 @@ public sealed class RunMetrics
     public string AgentOutput { get; set; } = "";
     public List<AgentEvent> Events { get; set; } = [];
     public string WorkDir { get; set; } = "";
+
+    /// <summary>
+    /// Creates a per-run copy.  Scalar fields are copied by value and the mutable
+    /// collections are re-wrapped in fresh instances so mutating the clone (e.g.
+    /// accumulating judge tokens) never affects the source.  This is essential when a
+    /// cached baseline is reused concurrently across parallel target evaluations: each
+    /// evaluation works on its own copy instead of sharing one mutable instance.
+    /// </summary>
+    public RunMetrics Clone() => new()
+    {
+        TokenEstimate = TokenEstimate,
+        InputTokens = InputTokens,
+        OutputTokens = OutputTokens,
+        CacheReadTokens = CacheReadTokens,
+        CacheWriteTokens = CacheWriteTokens,
+        JudgeInputTokens = JudgeInputTokens,
+        JudgeOutputTokens = JudgeOutputTokens,
+        JudgeCacheReadTokens = JudgeCacheReadTokens,
+        JudgeCacheWriteTokens = JudgeCacheWriteTokens,
+        ToolCallCount = ToolCallCount,
+        ToolCallBreakdown = new Dictionary<string, int>(ToolCallBreakdown),
+        TurnCount = TurnCount,
+        WallTimeMs = WallTimeMs,
+        ErrorCount = ErrorCount,
+        TimedOut = TimedOut,
+        AssertionResults = [.. AssertionResults],
+        TaskCompleted = TaskCompleted,
+        AgentOutput = AgentOutput,
+        Events = [.. Events],
+        WorkDir = WorkDir,
+    };
 }
 
 public sealed record RunResult(
diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
index 57d295a880..e364ab7c07 100644
--- a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
+++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
@@ -290,6 +290,54 @@ public void ComputeTargetSha_IncludesNestedFixtureFiles()
         }
     }
 
+    [Fact]
+    public void ComputeTargetSha_HashesFixtures_WhenEvalPathIsBareFilename()
+    {
+        // A bare filename (no directory component) must still hash sibling fixtures:
+        // Path.GetDirectoryName returns "" for "eval.yaml", so without normalization
+        // fixture hashing is silently skipped and distinct fixtures collide.
+        var evalPath = MakeEvalDirWithFixture("build.binlog", "AAAA");
+        var evalDir = Path.GetDirectoryName(evalPath)!;
+        var originalCwd = Directory.GetCurrentDirectory();
+        try
+        {
+            Directory.SetCurrentDirectory(evalDir);
+            var scenario = FixtureScenario("s", "investigate build.binlog");
+
+            var shaA = BaselineStore.ComputeTargetSha(scenario, "eval.yaml");
+            File.WriteAllText(Path.Combine(evalDir, "build.binlog"), "BBBB");
+            var shaB = BaselineStore.ComputeTargetSha(scenario, "eval.yaml");
+
+            Assert.NotEqual(shaA, shaB); // fixture content participates in identity
+        }
+        finally
+        {
+            Directory.SetCurrentDirectory(originalCwd);
+            Directory.Delete(evalDir, recursive: true);
+        }
+    }
+
+    [Fact]
+    public void Clone_ProducesIndependentCopy()
+    {
+        var source = MakeBaseline(output: "src").Metrics;
+        source.JudgeInputTokens = 10;
+        source.ToolCallBreakdown["bash"] = 4;
+
+        var clone = source.Clone();
+        clone.JudgeInputTokens = 99;
+        clone.ToolCallBreakdown["bash"] = 1;
+        clone.AssertionResults.Add(new AssertionResult(new Assertion(AssertionType.OutputContains, Value: "x"), true, ""));
+
+        // Mutating the clone must not leak back into the source — the cached baseline
+        // can be reused concurrently across parallel target evaluations.
+        Assert.Equal(10, source.JudgeInputTokens);
+        Assert.Equal(4, source.ToolCallBreakdown["bash"]);
+        Assert.Empty(source.AssertionResults);
+        Assert.NotSame(source.ToolCallBreakdown, clone.ToolCallBreakdown);
+        Assert.NotSame(source.AssertionResults, clone.AssertionResults);
+    }
+
     [Fact]
     public void SamePromptDifferentFixture_DoesNotReuseBaseline()
     {