From 00f70a8657a0fb13b6da6a530532478bb11d5e0e Mon Sep 17 00:00:00 2001
From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com>
Date: Thu, 11 Jun 2026 17:07:07 +0200
Subject: [PATCH 1/7] Add shared/precomputed baseline reuse to skill-validator evaluate (#751)

Add --baseline-out and --baseline-from options to the evaluate command so
the no-skill/no-agent baseline arm can be computed once and reused as a
shared control group across multiple skill/agent evaluations. This
eliminates redundant baseline runs and removes baseline run-to-run
variance from cross-config comparisons.

- New BaselineStore + BaselineFile/BaselineScenarioEntry models, keyed
  per scenario on SHA-256(prompt) with a header recording version, model,
  validator version and runs. Load validates version + model and fails
  fast on mismatch or missing scenarios.
- Register the new serializable types in SkillValidatorJsonContext
  (AOT source-gen).
- Wire two mutually-exclusive CLI options into ValidatorConfig; thread an
  optional BaselineStore through both execution paths.
- On reuse, skip the baseline agent run, its assertions/constraints/
  task-completion/judging, and attribute no extra pairwise tokens to the
  baseline; report the scenario with the baseline-reused session phase
  and a reused status. In write mode, record each scenario's averaged
  baseline and persist it after the run.
- Add unit tests for BaselineStore and document the feature in README and
  InvestigatingResults.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../src/Evaluate/BaselineStore.cs             | 145 +++++++++
 .../src/Evaluate/EvaluateCommand.cs           | 304 ++++++++++++++----
 eng/skill-validator/src/Evaluate/Models.cs    |   6 +
 eng/skill-validator/src/README.md             |  19 ++
 .../src/SkillValidatorJsonContext.cs          |   2 +
 .../src/docs/InvestigatingResults.md          |   2 +
 .../tests/Evaluate/BaselineStoreTests.cs      | 151 +++++++++
 7 files changed, 560 insertions(+), 69 deletions(-)
 create mode 100644 eng/skill-validator/src/Evaluate/BaselineStore.cs
 create mode 100644 eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs

diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs
new file mode 100644
index 0000000000..0e3e143065
--- /dev/null
+++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs
@@ -0,0 +1,145 @@
+using System.Collections.Concurrent;
+using System.Security.Cryptography;
+using System.Text;
+using System.Text.Json;
+
+namespace SkillValidator.Evaluate;
+
+/// <summary>
+/// One scenario's precomputed baseline, keyed by the SHA-256 of its prompt.
+/// <see cref="Runs"/> records how many baseline runs were averaged into
+/// <see cref="Baseline"/> so reuse can report the robustness of the reference.
+/// </summary>
+public sealed record BaselineScenarioEntry(
+    string Name,
+    string PromptSha,
+    int Runs,
+    RunResult Baseline);
+
+/// <summary>
+/// On-disk format written by --baseline-out and read by --baseline-from.
+/// The baseline arm of evaluate is plain-agent with no skill/MCP attached, so it
+/// is independent of the target under test and can be computed once and shared across
+/// many invocations. The header records the identity needed to reject a stale reuse.
+/// </summary>
+public sealed record BaselineFile(
+    int Version,
+    string Model,
+    string? ValidatorVersion,
+    string CreatedAt,
+    IReadOnlyList<BaselineScenarioEntry> Scenarios);
+
+/// <summary>
+/// Manages a precomputed, shared baseline across evaluate invocations.
+/// In write mode (--baseline-out) it accumulates each scenario's averaged
+/// baseline for later persistence. In reuse mode (--baseline-from) it serves
+/// cached baselines in place of freshly executed baseline runs.
+/// </summary>
+internal sealed class BaselineStore
+{
+    /// <summary>Current on-disk schema version.</summary>
+    public const int CurrentVersion = 1;
+
+    private readonly ConcurrentDictionary<string, BaselineScenarioEntry> _entries = new(StringComparer.Ordinal);
+    private readonly string _model;
+
+    /// <summary>True when serving cached baselines (--baseline-from).</summary>
+    public bool IsReuse { get; }
+
+    private BaselineStore(string model, bool isReuse)
+    {
+        _model = model;
+        IsReuse = isReuse;
+    }
+
+    /// <summary>Create a store that accumulates baselines for later persistence.</summary>
+    public static BaselineStore ForWrite(string model) => new(model, isReuse: false);
+
+    /// <summary>
+    /// Load a baseline file for reuse. Validates the schema version and that the model
+    /// matches, throwing on mismatch so a stale or wrong baseline can never silently
+    /// skew results. Per-scenario prompt identity is validated later via
+    /// <see cref="FindMissingScenarios"/>; entries lacking a prompt hash or baseline are skipped.
+    /// </summary>
+    public static BaselineStore Load(string path, string expectedModel)
+    {
+        if (!File.Exists(path))
+            throw new FileNotFoundException($"Baseline file not found: {path}");
+
+        BaselineFile? file;
+        try
+        {
+            file = JsonSerializer.Deserialize(File.ReadAllText(path), SkillValidatorJsonContext.Default.BaselineFile);
+        }
+        catch (JsonException ex)
+        {
+            throw new InvalidOperationException($"Baseline file '{path}' is not valid JSON: {ex.Message}", ex);
+        }
+
+        if (file is null)
+            throw new InvalidOperationException($"Baseline file '{path}' is empty.");
+        if (file.Version != CurrentVersion)
+            throw new InvalidOperationException(
+                $"Baseline file '{path}' has unsupported version {file.Version} (expected {CurrentVersion}). Recompute it with --baseline-out.");
+        if (!string.Equals(file.Model, expectedModel, StringComparison.Ordinal))
+            throw new InvalidOperationException(
+                $"Baseline file '{path}' was computed for model '{file.Model}' but evaluation uses model '{expectedModel}'. " +
+                "Recompute the baseline with --baseline-out for the new model.");
+
+        var store = new BaselineStore(expectedModel, isReuse: true);
+        foreach (var entry in file.Scenarios ?? []) // "scenarios" may be absent/null in hand-edited or truncated JSON
+        {
+            if (entry is { PromptSha: not null, Baseline: not null }) // a null key would throw; skip unusable entries
+                store._entries[entry.PromptSha] = entry;
+        }
+        return store;
+    }
+
+    /// <summary>SHA-256 (lower-case hex) of the scenario prompt — the per-scenario reuse key.</summary>
+    public static string ComputePromptSha(string prompt)
+    {
+        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(prompt));
+        return Convert.ToHexString(bytes).ToLowerInvariant();
+    }
+
+    /// <summary>
+    /// In reuse mode, return the names of scenarios that have no matching cached
+    /// baseline (keyed by prompt hash). Empty when every scenario is covered.
+    /// </summary>
+    public IReadOnlyList<string> FindMissingScenarios(IEnumerable<EvalScenario> scenarios) =>
+        scenarios
+            .Where(s => !_entries.ContainsKey(ComputePromptSha(s.Prompt)))
+            .Select(s => s.Name)
+            .ToList();
+
+    /// <summary>Get the cached averaged baseline for a scenario, or null when absent.</summary>
+    public RunResult? TryGetBaseline(EvalScenario scenario) =>
+        _entries.TryGetValue(ComputePromptSha(scenario.Prompt), out var entry) ? entry.Baseline : null;
+
+    /// <summary>Record a scenario's averaged baseline for later persistence (write mode).</summary>
+    public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline)
+    {
+        var sha = ComputePromptSha(scenario.Prompt);
+        _entries[sha] = new BaselineScenarioEntry(scenario.Name, sha, runs, averagedBaseline);
+    }
+
+    /// <summary>Serialize all recorded baselines to <paramref name="path"/>.</summary>
+ public void Save(string path) + { + var file = new BaselineFile( + Version: CurrentVersion, + Model: _model, + ValidatorVersion: typeof(BaselineStore).Assembly.GetName().Version?.ToString(), + CreatedAt: DateTime.UtcNow.ToString("o"), + Scenarios: _entries.Values.OrderBy(e => e.Name, StringComparer.Ordinal).ToList()); + + var dir = Path.GetDirectoryName(Path.GetFullPath(path)); + if (!string.IsNullOrEmpty(dir)) + Directory.CreateDirectory(dir); + + File.WriteAllText(path, JsonSerializer.Serialize(file, SkillValidatorJsonContext.Default.BaselineFile)); + } + + /// Number of baselines currently held. + public int Count => _entries.Count; +} diff --git a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs index f80987fa3c..d8f054dd9f 100644 --- a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs +++ b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs @@ -33,6 +33,8 @@ public static Command Create() var noiseSkillsDirOpt = new Option("--noise-skills-dir") { Description = "Directory containing skills to load as noise. Enables the noise test: re-runs scenarios with all noise skills loaded and measures degradation." }; var noiseMaxDegradationOpt = new Option("--noise-max-degradation") { Description = "Maximum acceptable average quality degradation (0-1) in noise test (only positive degradations count)", DefaultValueFactory = _ => 0.2 }; var noiseMaxScenarioDegradationOpt = new Option("--noise-max-scenario-degradation") { Description = "Maximum acceptable quality degradation (0-1) for any single noise-test scenario", DefaultValueFactory = _ => 0.4 }; + var baselineOutOpt = new Option("--baseline-out") { Description = "After running, persist each scenario's averaged baseline (no-skill/no-agent reference) to this file for later reuse with --baseline-from." 
}; + var baselineFromOpt = new Option("--baseline-from") { Description = "Reuse a precomputed baseline from this file instead of re-running the no-skill/no-agent baseline arm. Must match --model and the scenario prompts. Mutually exclusive with --baseline-out." }; var command = new Command("evaluate", "Evaluate agent skills via LLM-based testing") { @@ -59,6 +61,8 @@ public static Command Create() noiseSkillsDirOpt, noiseMaxDegradationOpt, noiseMaxScenarioDegradationOpt, + baselineOutOpt, + baselineFromOpt, }; command.Add(RejudgeCommand.Create()); @@ -110,6 +114,8 @@ public static Command Create() NoiseSkillsDir = parseResult.GetValue(noiseSkillsDirOpt), NoiseDegradationLimit = parseResult.GetValue(noiseMaxDegradationOpt), NoiseMaxScenarioDegradation = parseResult.GetValue(noiseMaxScenarioDegradationOpt), + BaselineOut = parseResult.GetValue(baselineOutOpt), + BaselineFrom = parseResult.GetValue(baselineFromOpt), }; return await Run(config, cancellationToken); @@ -129,6 +135,14 @@ public static Command Create() public static async Task Run(ValidatorConfig config, CancellationToken cancellationToken = default) { + // --baseline-out and --baseline-from are mutually exclusive: one writes a + // shared baseline, the other consumes one. + if (config.BaselineOut is not null && config.BaselineFrom is not null) + { + Console.Error.WriteLine("--baseline-out and --baseline-from cannot be used together."); + return 1; + } + // Validate model early try { @@ -290,6 +304,42 @@ public static async Task Run(ValidatorConfig config, CancellationToken canc bool usePairwise = config.JudgeMode is JudgeMode.Pairwise or JudgeMode.Both; bool effectiveKeepSessions = config.KeepSessions && config.ResultsDir is not null; + // Set up shared-baseline reuse/persistence. + BaselineStore? 
baselineStore = null; + if (config.BaselineFrom is not null) + { + try + { + baselineStore = BaselineStore.Load(config.BaselineFrom, config.Model); + } + catch (Exception ex) when (ex is FileNotFoundException or InvalidOperationException) + { + Console.Error.WriteLine($"{Ansi.Red}❌ Failed to load baseline from '{config.BaselineFrom}': {ex.Message}{Ansi.Reset}"); + return 1; + } + + // Fail fast if any scenario lacks a matching cached baseline so a stale or + // incomplete baseline can never silently skew results. + var allScenarios = allTargets + .Where(t => t.EvalConfig is not null) + .SelectMany(t => t.EvalConfig!.Scenarios) + .ToList(); + var missing = baselineStore.FindMissingScenarios(allScenarios); + if (missing.Count > 0) + { + Console.Error.WriteLine( + $"{Ansi.Red}❌ Baseline file '{config.BaselineFrom}' has no entry for scenario(s): {string.Join(", ", missing.Distinct())}. " + + $"Recompute the baseline with --baseline-out for the current tests and model.{Ansi.Reset}"); + return 1; + } + Console.WriteLine($"Reusing precomputed baseline from {config.BaselineFrom} ({baselineStore.Count} scenario(s))."); + } + else if (config.BaselineOut is not null) + { + baselineStore = BaselineStore.ForWrite(config.Model); + Console.WriteLine($"Baseline will be persisted to {config.BaselineOut} after the run."); + } + string? sessionsDir = null; SessionDatabase? sessionDb = null; string? 
timestampedResultsDir = null; @@ -314,7 +364,7 @@ public static async Task Run(ValidatorConfig config, CancellationToken canc // Evaluate all targets (skills and agents) spinner.Start($"Evaluating {allTargets.Count} target(s)..."); var skillTasks = allTargets.Select(target => - skillLimit.RunAsync(() => EvaluateTarget(target, config, usePairwise, spinner, noiseEvalSkills, sessionsDir, sessionDb, cancellationToken), cancellationToken)); + skillLimit.RunAsync(() => EvaluateTarget(target, config, usePairwise, spinner, noiseEvalSkills, sessionsDir, sessionDb, baselineStore, cancellationToken), cancellationToken)); var settled = await Task.WhenAll(skillTasks.Select(async t => { try { return (Result: await t, Error: (Exception?)null); } @@ -353,6 +403,28 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, await AgentRunner.CleanupWorkDirs(effectiveKeepSessions); sessionDb?.Dispose(); + // Persist the shared baseline for later reuse with --baseline-from. + if (config.BaselineOut is not null && baselineStore is not null) + { + if (baselineStore.Count > 0) + { + try + { + baselineStore.Save(config.BaselineOut); + Console.WriteLine($"Baseline written to {config.BaselineOut} ({baselineStore.Count} scenario(s))."); + } + catch (Exception ex) + { + Console.Error.WriteLine($"{Ansi.Red}❌ Failed to write baseline to '{config.BaselineOut}': {ex.Message}{Ansi.Reset}"); + return 1; + } + } + else + { + Console.Error.WriteLine($"{Ansi.Yellow}⚠ No baselines were produced; nothing written to {config.BaselineOut}.{Ansi.Reset}"); + } + } + // Always fail on execution errors, even in --verdict-warn-only mode if (rejectionMessages.Count > 0) return 1; @@ -380,16 +452,17 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, IReadOnlyList noiseSkills, string? sessionsDir, SessionDatabase? sessionDb, + BaselineStore? 
baselineStore, CancellationToken cancellationToken) { if (target.Kind == EvalTargetKind.Skill && target.Skill is not null) { var evalSkill = new EvalSkillInfo(target.Skill, target.EvalPath, target.EvalConfig, target.McpServers); - return await EvaluateSkill(evalSkill, config, usePairwise, spinner, noiseSkills, sessionsDir, sessionDb, cancellationToken); + return await EvaluateSkill(evalSkill, config, usePairwise, spinner, noiseSkills, sessionsDir, sessionDb, baselineStore, cancellationToken); } else if (target.Kind == EvalTargetKind.Agent && target.Agent is not null) { - return await EvaluateAgent(target, config, usePairwise, spinner, sessionsDir, sessionDb, cancellationToken); + return await EvaluateAgent(target, config, usePairwise, spinner, sessionsDir, sessionDb, baselineStore, cancellationToken); } return null; } @@ -405,6 +478,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, Spinner spinner, string? sessionsDir, SessionDatabase? sessionDb, + BaselineStore? baselineStore, CancellationToken cancellationToken) { var agent = target.Agent!; @@ -457,7 +531,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, { try { - return await ExecuteAgentScenario(scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, cancellationToken); + return await ExecuteAgentScenario(scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, baselineStore, cancellationToken); } catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested) { @@ -516,6 +590,7 @@ private static async Task ExecuteAgentScenario( string? sessionsDir, SessionDatabase? sessionDb, string? targetSha, + BaselineStore? 
baselineStore, CancellationToken cancellationToken) { var agent = target.Agent!; @@ -535,7 +610,7 @@ private static async Task ExecuteAgentScenario( { try { - return (Result: await ExecuteAgentRun(i, scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, cancellationToken), Error: (Exception?)null); + return (Result: await ExecuteAgentRun(i, scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, baselineStore, cancellationToken), Error: (Exception?)null); } catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested) { @@ -580,6 +655,10 @@ private static async Task ExecuteAgentScenario( var avgIsolated = AverageResults(isolatedRuns); var avgPlugin = AverageResults(pluginRuns); + // Persist the averaged baseline (skill/agent-independent) for shared reuse. + if (baselineStore is { IsReuse: false }) + baselineStore.Record(scenario, runResults.Length, avgBaseline); + int bestPairwiseIdx = -1; for (int i = 0; i < perRunPairwise.Count; i++) { @@ -669,6 +748,7 @@ private static async Task ExecuteAgentRun( string? sessionsDir, SessionDatabase? sessionDb, string? targetSha, + BaselineStore? baselineStore, CancellationToken cancellationToken) { var agent = target.Agent!; @@ -690,8 +770,12 @@ private static async Task ExecuteAgentRun( var pluginConfigDir = sessionsDir is not null ? Path.Combine("sessions", pluginSessionId) : null; var rubricJson = JsonSerializer.Serialize(scenario.Rubric?.ToArray() ?? [], SkillValidatorJsonContext.Default.StringArray); + // Reuse a precomputed shared baseline when available (--baseline-from). The + // baseline arm is agent-independent, so this skips a redundant agent run. 
+ var reusedBaseline = baselineStore?.TryGetBaseline(scenario); + sessionDb?.RegisterSession(baselineSessionId, agent.Name, agent.Path, scenario.Name, runIndex, - "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, targetSha, rubricJson); + reusedBaseline is not null ? "baseline-reused" : "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, targetSha, rubricJson); sessionDb?.RegisterSession(isolatedSessionId, agent.Name, agent.Path, scenario.Name, runIndex, "with-agent-isolated", config.Model, isolatedConfigDir, null, scenario.Prompt, targetSha, rubricJson); sessionDb?.RegisterSession(pluginSessionId, agent.Name, agent.Path, scenario.Name, runIndex, @@ -706,25 +790,41 @@ private static async Task ExecuteAgentRun( additionalAgents = await ResolveAdditionalAgents(scenario.Setup.AdditionalRequiredAgents, pluginRoot); } - var agentTasks = await Task.WhenAll( + // 2. Agent-isolated: target agent only (+ scenario deps) + var isolatedTask = AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose, + PluginRoot: null, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir, + SessionId: isolatedSessionId, Agent: agent, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken); + // 3. 
Agent-plugin: full plugin context + agent selected + var pluginTask = AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose, + PluginRoot: pluginRoot, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir, + SessionId: pluginSessionId, Agent: agent), cancellationToken); + + RunMetrics baselineMetrics; + RunMetrics isolatedMetrics; + RunMetrics pluginMetrics; + if (reusedBaseline is not null) + { + if (config.Verbose) + runLog("↩︎ reusing precomputed baseline"); + baselineMetrics = reusedBaseline.Metrics; + var skilled = await Task.WhenAll(isolatedTask, pluginTask); + isolatedMetrics = skilled[0]; + pluginMetrics = skilled[1]; + } + else + { // 1. Baseline: no agent, no skills — vanilla - AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose, - PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken), - // 2. Agent-isolated: target agent only (+ scenario deps) - AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose, - PluginRoot: null, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir, - SessionId: isolatedSessionId, Agent: agent, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken), - // 3. 
Agent-plugin: full plugin context + agent selected - AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose, - PluginRoot: pluginRoot, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir, - SessionId: pluginSessionId, Agent: agent), cancellationToken)); - var baselineMetrics = agentTasks[0]; - var isolatedMetrics = agentTasks[1]; - var pluginMetrics = agentTasks[2]; + var baselineTask = AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose, + PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken); + var all = await Task.WhenAll(baselineTask, isolatedTask, pluginTask); + baselineMetrics = all[0]; + isolatedMetrics = all[1]; + pluginMetrics = all[2]; + } if (sessionDb is not null) { - sessionDb.CompleteSession(baselineSessionId, baselineMetrics.TimedOut ? "timed_out" : "completed", + sessionDb.CompleteSession(baselineSessionId, reusedBaseline is not null ? "reused" : (baselineMetrics.TimedOut ? "timed_out" : "completed"), JsonSerializer.Serialize(baselineMetrics, SkillValidatorJsonContext.Default.RunMetrics)); sessionDb.CompleteSession(isolatedSessionId, isolatedMetrics.TimedOut ? "timed_out" : "completed", JsonSerializer.Serialize(isolatedMetrics, SkillValidatorJsonContext.Default.RunMetrics)); @@ -732,43 +832,58 @@ private static async Task ExecuteAgentRun( JsonSerializer.Serialize(pluginMetrics, SkillValidatorJsonContext.Default.RunMetrics)); } - // Assertions, constraints, task completion, judging — same as skills + // Assertions, constraints, task completion, judging — same as skills. + // Baseline arm is skipped when reused (its results are cached). 
if (scenario.Assertions is { Count: > 0 }) { - baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout); + if (reusedBaseline is null) + baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout); isolatedMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, isolatedMetrics.AgentOutput, isolatedMetrics.WorkDir, scenario.Timeout); pluginMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, pluginMetrics.AgentOutput, pluginMetrics.WorkDir, scenario.Timeout); } - var baselineConstraints = AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics); + var baselineConstraints = reusedBaseline is null ? AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics) : []; var isolatedConstraints = AssertionEvaluator.EvaluateConstraints(scenario, isolatedMetrics); var pluginConstraints = AssertionEvaluator.EvaluateConstraints(scenario, pluginMetrics); - baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints]; + if (reusedBaseline is null) + baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints]; isolatedMetrics.AssertionResults = [..isolatedMetrics.AssertionResults, ..isolatedConstraints]; pluginMetrics.AssertionResults = [..pluginMetrics.AssertionResults, ..pluginConstraints]; - if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0) + if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0 || isolatedConstraints.Count > 0 || pluginConstraints.Count > 0) { - baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed); + if (reusedBaseline is null) + baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => 
a.Passed); isolatedMetrics.TaskCompleted = isolatedMetrics.AssertionResults.All(a => a.Passed); pluginMetrics.TaskCompleted = pluginMetrics.AssertionResults.All(a => a.Passed); } else { - baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0; + if (reusedBaseline is null) + baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0; isolatedMetrics.TaskCompleted = isolatedMetrics.ErrorCount == 0; pluginMetrics.TaskCompleted = pluginMetrics.ErrorCount == 0; } - var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, agent.Path); + var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, isolatedMetrics.WorkDir, agent.Path); - var (baselineJudge, baselineJudgeTokens) = await SafeJudge(Judge.JudgeRun(scenario, baselineMetrics, judgeOpts, runLog, cancellationToken), "baseline", runLog); + JudgeResult baselineJudge; + if (reusedBaseline is not null) + { + baselineJudge = reusedBaseline.JudgeResult; + } + else + { + var (judged, baselineJudgeTokens) = await SafeJudge(Judge.JudgeRun( + scenario, baselineMetrics, judgeOpts with { WorkDir = baselineMetrics.WorkDir }, runLog, cancellationToken), "baseline", runLog); + baselineJudge = judged; + AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens); + } var (isolatedJudge, isolatedJudgeTokens) = await SafeJudge(Judge.JudgeRun( scenario, isolatedMetrics, judgeOpts with { WorkDir = isolatedMetrics.WorkDir }, runLog, cancellationToken), "isolated", runLog); var (pluginJudge, pluginJudgeTokens) = await SafeJudge(Judge.JudgeRun( scenario, pluginMetrics, judgeOpts with { WorkDir = pluginMetrics.WorkDir }, runLog, cancellationToken), "plugin", runLog); - AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens); AccumulateJudgeTokens(isolatedMetrics, isolatedJudgeTokens); AccumulateJudgeTokens(pluginMetrics, pluginJudgeTokens); @@ -785,12 +900,16 @@ private static async Task ExecuteAgentRun( var worseSkilled = 
pairwiseFromPlugin ? pluginMetrics : isolatedMetrics; try { + // Reused baseline work dir no longer exists; run the judge in the skilled + // run's work dir (judge reads only the provided metrics text). + var pairwiseWorkDir = reusedBaseline is not null ? worseSkilled.WorkDir : baselineMetrics.WorkDir; var (pairwiseResult, pairwiseTokens) = await PairwiseJudge.Judge( scenario, baselineMetrics, worseSkilled, - new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, agent.Path, worseSkilled.WorkDir), + new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, pairwiseWorkDir, agent.Path, worseSkilled.WorkDir), runLog, cancellationToken); pairwise = pairwiseResult; - AccumulateJudgeTokens(baselineMetrics, pairwiseTokens); + if (reusedBaseline is null) + AccumulateJudgeTokens(baselineMetrics, pairwiseTokens); AccumulateJudgeTokens(worseSkilled, pairwiseTokens); } catch (Exception error) @@ -827,6 +946,7 @@ private static async Task ExecuteAgentRun( IReadOnlyList noiseSkills, string? sessionsDir, SessionDatabase? sessionDb, + BaselineStore? baselineStore, CancellationToken cancellationToken) { var skill = evalSkill.Skill; @@ -896,7 +1016,7 @@ private static async Task ExecuteAgentRun( { try { - return await ExecuteScenario(scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, cancellationToken); + return await ExecuteScenario(scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, baselineStore, cancellationToken); } catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested) { @@ -989,6 +1109,7 @@ private static async Task ExecuteScenario( string? sessionsDir, SessionDatabase? sessionDb, string? skillSha, + BaselineStore? 
baselineStore, CancellationToken cancellationToken) { var skill = evalSkill.Skill; @@ -1008,7 +1129,7 @@ private static async Task ExecuteScenario( { try { - return (Result: await ExecuteRun(i, scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, cancellationToken), Error: (Exception?)null); + return (Result: await ExecuteRun(i, scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, baselineStore, cancellationToken), Error: (Exception?)null); } catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested) { @@ -1056,6 +1177,9 @@ private static async Task ExecuteScenario( var avgBaseline = AverageResults(baselineRuns); var avgIsolated = AverageResults(isolatedRuns); var avgPlugin = AverageResults(pluginRuns); + // Persist the averaged baseline (skill/agent-independent) for shared reuse. + if (baselineStore is { IsReuse: false }) + baselineStore.Record(scenario, runResults.Length, avgBaseline); // Select the best pairwise result and track which run it came from int bestPairwiseIdx = -1; for (int i = 0; i < perRunPairwise.Count; i++) @@ -1163,6 +1287,7 @@ private static async Task ExecuteRun( string? sessionsDir, SessionDatabase? sessionDb, string? skillSha, + BaselineStore? baselineStore, CancellationToken cancellationToken) { var skill = evalSkill.Skill; @@ -1184,8 +1309,12 @@ private static async Task ExecuteRun( var pluginConfigDir = sessionsDir is not null ? Path.Combine("sessions", pluginSessionId) : null; var rubricJson = JsonSerializer.Serialize(scenario.Rubric?.ToArray() ?? [], SkillValidatorJsonContext.Default.StringArray); + // Reuse a precomputed shared baseline when available (--baseline-from). The + // baseline arm is skill-independent, so this skips a redundant agent run. 
+ var reusedBaseline = baselineStore?.TryGetBaseline(scenario); + sessionDb?.RegisterSession(baselineSessionId, skill.Name, skill.Path, scenario.Name, runIndex, - "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, skillSha, rubricJson); + reusedBaseline is not null ? "baseline-reused" : "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, skillSha, rubricJson); sessionDb?.RegisterSession(isolatedSessionId, skill.Name, skill.Path, scenario.Name, runIndex, "with-skill-isolated", config.Model, isolatedConfigDir, null, scenario.Prompt, skillSha, rubricJson); sessionDb?.RegisterSession(pluginSessionId, skill.Name, skill.Path, scenario.Name, runIndex, @@ -1200,24 +1329,40 @@ private static async Task ExecuteRun( additionalAgents = await ResolveAdditionalAgents(scenario.Setup.AdditionalRequiredAgents, pluginRoot); } - var agentTasks = await Task.WhenAll( + // 2. Skilled-isolated: target skill + declared dependencies + var isolatedTask = AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose, + PluginRoot: null, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir, + SessionId: isolatedSessionId, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken); + // 3. 
Skilled-plugin: load entire plugin from plugin root directory + var pluginTask = AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose, + PluginRoot: pluginRoot, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir, SessionId: pluginSessionId), cancellationToken); + + RunMetrics baselineMetrics; + RunMetrics isolatedMetrics; + RunMetrics pluginMetrics; + if (reusedBaseline is not null) + { + if (config.Verbose) + runLog("↩︎ reusing precomputed baseline"); + baselineMetrics = reusedBaseline.Metrics; + var skilled = await Task.WhenAll(isolatedTask, pluginTask); + isolatedMetrics = skilled[0]; + pluginMetrics = skilled[1]; + } + else + { // 1. Baseline: no plugin, no skills — vanilla agent - AgentRunner.RunAgent(new RunOptions(scenario, null, evalSkill.EvalPath, config.Model, config.Verbose, - PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken), - // 2. Skilled-isolated: target skill + declared dependencies - AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose, - PluginRoot: null, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir, - SessionId: isolatedSessionId, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken), - // 3. 
Skilled-plugin: load entire plugin from plugin root directory - AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose, - PluginRoot: pluginRoot, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir, SessionId: pluginSessionId), cancellationToken)); - var baselineMetrics = agentTasks[0]; - var isolatedMetrics = agentTasks[1]; - var pluginMetrics = agentTasks[2]; + var baselineTask = AgentRunner.RunAgent(new RunOptions(scenario, null, evalSkill.EvalPath, config.Model, config.Verbose, + PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken); + var all = await Task.WhenAll(baselineTask, isolatedTask, pluginTask); + baselineMetrics = all[0]; + isolatedMetrics = all[1]; + pluginMetrics = all[2]; + } if (sessionDb is not null) { - var baselineStatus = baselineMetrics.TimedOut ? "timed_out" : "completed"; + var baselineStatus = reusedBaseline is not null ? "reused" : (baselineMetrics.TimedOut ? "timed_out" : "completed"); var isolatedStatus = isolatedMetrics.TimedOut ? "timed_out" : "completed"; var pluginStatus = pluginMetrics.TimedOut ? 
"timed_out" : "completed"; sessionDb.CompleteSession(baselineSessionId, baselineStatus, JsonSerializer.Serialize(baselineMetrics, SkillValidatorJsonContext.Default.RunMetrics)); @@ -1225,57 +1370,72 @@ private static async Task ExecuteRun( sessionDb.CompleteSession(pluginSessionId, pluginStatus, JsonSerializer.Serialize(pluginMetrics, SkillValidatorJsonContext.Default.RunMetrics)); } - // Evaluate assertions on all three runs + // Evaluate assertions on the skilled runs (baseline assertions are cached when reused) if (scenario.Assertions is { Count: > 0 }) { - baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout); + if (reusedBaseline is null) + baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout); isolatedMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, isolatedMetrics.AgentOutput, isolatedMetrics.WorkDir, scenario.Timeout); pluginMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, pluginMetrics.AgentOutput, pluginMetrics.WorkDir, scenario.Timeout); } - // Evaluate constraints on all three runs - var baselineConstraints = AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics); + // Evaluate constraints on the skilled runs (baseline constraints are cached when reused) + var baselineConstraints = reusedBaseline is null ? 
AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics) : []; var isolatedConstraints = AssertionEvaluator.EvaluateConstraints(scenario, isolatedMetrics); var pluginConstraints = AssertionEvaluator.EvaluateConstraints(scenario, pluginMetrics); - baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints]; + if (reusedBaseline is null) + baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints]; isolatedMetrics.AssertionResults = [..isolatedMetrics.AssertionResults, ..isolatedConstraints]; pluginMetrics.AssertionResults = [..pluginMetrics.AssertionResults, ..pluginConstraints]; - // Task completion for all three - if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0) + // Task completion for the skilled runs (baseline completion is cached when reused) + if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0 || isolatedConstraints.Count > 0 || pluginConstraints.Count > 0) { - baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed); + if (reusedBaseline is null) + baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed); isolatedMetrics.TaskCompleted = isolatedMetrics.AssertionResults.All(a => a.Passed); pluginMetrics.TaskCompleted = pluginMetrics.AssertionResults.All(a => a.Passed); } else { - baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0; + if (reusedBaseline is null) + baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0; isolatedMetrics.TaskCompleted = isolatedMetrics.ErrorCount == 0; pluginMetrics.TaskCompleted = pluginMetrics.ErrorCount == 0; } - // Judge all three runs independently (failures are non-fatal) - var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path); + // Judge the skilled runs independently (failures are non-fatal). 
The baseline + // judge result is reused from the precomputed baseline when available. + var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, isolatedMetrics.WorkDir, skill.Path); - var baselineJudgeTask = Judge.JudgeRun(scenario, baselineMetrics, judgeOpts, runLog, cancellationToken); var isolatedJudgeTask = Judge.JudgeRun( scenario, isolatedMetrics, judgeOpts with { WorkDir = isolatedMetrics.WorkDir }, runLog, cancellationToken); var pluginJudgeTask = Judge.JudgeRun( scenario, pluginMetrics, judgeOpts with { WorkDir = pluginMetrics.WorkDir }, runLog, cancellationToken); - var (baselineJudge, baselineJudgeTokens) = await SafeJudge(baselineJudgeTask, "baseline", runLog); + JudgeResult baselineJudge; + if (reusedBaseline is not null) + { + baselineJudge = reusedBaseline.JudgeResult; + } + else + { + var (judged, baselineJudgeTokens) = await SafeJudge( + Judge.JudgeRun(scenario, baselineMetrics, judgeOpts with { WorkDir = baselineMetrics.WorkDir }, runLog, cancellationToken), "baseline", runLog); + baselineJudge = judged; + AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens); + } var (isolatedJudge, isolatedJudgeTokens) = await SafeJudge(isolatedJudgeTask, "isolated", runLog); var (pluginJudge, pluginJudgeTokens) = await SafeJudge(pluginJudgeTask, "plugin", runLog); - // Accumulate judge tokens into each run's metrics - AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens); + // Accumulate judge tokens into each skilled run's metrics AccumulateJudgeTokens(isolatedMetrics, isolatedJudgeTokens); AccumulateJudgeTokens(pluginMetrics, pluginJudgeTokens); if (sessionDb is not null) { - sessionDb.SaveJudgeResult(baselineSessionId, JsonSerializer.Serialize(baselineJudge, SkillValidatorJsonContext.Default.JudgeResult)); + if (reusedBaseline is null) + sessionDb.SaveJudgeResult(baselineSessionId, JsonSerializer.Serialize(baselineJudge, SkillValidatorJsonContext.Default.JudgeResult)); sessionDb.SaveJudgeResult(isolatedSessionId, 
JsonSerializer.Serialize(isolatedJudge, SkillValidatorJsonContext.Default.JudgeResult)); sessionDb.SaveJudgeResult(pluginSessionId, JsonSerializer.Serialize(pluginJudge, SkillValidatorJsonContext.Default.JudgeResult)); } @@ -1295,13 +1455,19 @@ private static async Task ExecuteRun( ? pluginMetrics : isolatedMetrics; try { + // When the baseline is reused its work dir no longer exists; run the + // judge session in the skilled run's work dir instead (the judge only + // reads the provided metrics text and is denied tool access). + var pairwiseWorkDir = reusedBaseline is not null ? worseSkilled.WorkDir : baselineMetrics.WorkDir; var (pairwiseResult, pairwiseTokens) = await PairwiseJudge.Judge( scenario, baselineMetrics, worseSkilled, - new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path, worseSkilled.WorkDir), + new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, pairwiseWorkDir, skill.Path, worseSkilled.WorkDir), runLog, cancellationToken); pairwise = pairwiseResult; - // Attribute pairwise judge tokens to both the baseline and the compared run - AccumulateJudgeTokens(baselineMetrics, pairwiseTokens); + // Attribute pairwise judge tokens to the compared run (and to the baseline + // only when it was freshly executed, to avoid double-counting reused cost). + if (reusedBaseline is null) + AccumulateJudgeTokens(baselineMetrics, pairwiseTokens); AccumulateJudgeTokens(worseSkilled, pairwiseTokens); if (sessionDb is not null && pairwise is not null) { diff --git a/eng/skill-validator/src/Evaluate/Models.cs b/eng/skill-validator/src/Evaluate/Models.cs index b5c055d650..00508a9cf8 100644 --- a/eng/skill-validator/src/Evaluate/Models.cs +++ b/eng/skill-validator/src/Evaluate/Models.cs @@ -427,6 +427,12 @@ public sealed record ValidatorConfig public string? 
NoiseSkillsDir { get; init; } public double NoiseDegradationLimit { get; init; } = 0.2; public double NoiseMaxScenarioDegradation { get; init; } = 0.4; + + /// When set, persist each scenario's averaged baseline to this file after the run. + public string? BaselineOut { get; init; } + + /// When set, reuse the precomputed baseline from this file instead of re-running the baseline arm. + public string? BaselineFrom { get; init; } } public static class DefaultWeights diff --git a/eng/skill-validator/src/README.md b/eng/skill-validator/src/README.md index c3f346b446..af54a4340d 100644 --- a/eng/skill-validator/src/README.md +++ b/eng/skill-validator/src/README.md @@ -73,6 +73,10 @@ skill-validator evaluate --model gpt-5.3-codex --judge-model claude-opus-4.6-fas # Multiple runs for stability skill-validator evaluate --runs 5 --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills +# Compute a shared baseline once, then reuse it across multiple skills/agents +skill-validator evaluate --baseline-out baseline.json --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills/skill-a +skill-validator evaluate --baseline-from baseline.json --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills/skill-b + # Override the default results directory (.skill-validator-results) skill-validator evaluate --results-dir ./my-results --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills @@ -142,6 +146,8 @@ skill-validator check --json --plugin ./plugins/my-plugin | `--confidence-level ` | `0.95` | Confidence level for statistical intervals (0–1) | | `--judge-timeout ` | `300` | Judge LLM timeout in seconds | | `--require-completion` | `true` | Fail if skill regresses task completion | +| `--baseline-out ` | *(none)* | After running, persist each scenario's averaged baseline (no-skill/no-agent reference) to this file for reuse. Mutually exclusive with `--baseline-from`. 
| +| `--baseline-from ` | *(none)* | Reuse a precomputed baseline from this file instead of re-running the baseline arm. Must match `--model` and every scenario prompt. Mutually exclusive with `--baseline-out`. | | `--verdict-warn-only` | `false` | Treat verdict failures as warnings (exit 0). Execution errors still fail. | | `--no-overfitting-check` | `false` | Disable the LLM-based overfitting analysis (on by default) | | `--overfitting-fix` | `false` | Generate `eval.fixed.yaml` with improved rubric items/assertions | @@ -151,6 +157,19 @@ skill-validator check --json --plugin ./plugins/my-plugin Models are validated on startup — invalid model names fail fast with a list of available models. +### Shared baseline reuse + +Every evaluation runs each scenario through a **baseline arm** (the agent with no skill / no agent loaded) to establish a reference the skill-enhanced run is compared against. When you evaluate many skills or agents against the same test scenarios, that baseline arm is re-run every time — redundant work that also introduces run-to-run variance into the comparison. + +`--baseline-out` and `--baseline-from` let you compute the baseline **once** and reuse it as a shared control group: + +1. **Produce** a baseline file with `--baseline-out baseline.json`. After the run, each scenario's averaged baseline result (honoring `--runs`) is written to the file. +2. **Reuse** it with `--baseline-from baseline.json` on subsequent runs. The baseline arm is skipped entirely; the cached baseline is used for assertions, pairwise/independent judging, and metric deltas. + +The baseline file records the `--model` and a SHA-256 of each scenario prompt. On reuse the validator fails fast if the model differs or any scenario prompt is missing from the file, so a stale or mismatched baseline can never be silently applied. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status. 
+ +The two options are mutually exclusive. + ## Output Results are displayed in the console with color-coded scores and metric deltas. By default, `json` and `markdown` reporters are enabled and write to `.skill-validator-results/` (override with `--results-dir`). File reporters write to that directory: diff --git a/eng/skill-validator/src/SkillValidatorJsonContext.cs b/eng/skill-validator/src/SkillValidatorJsonContext.cs index 2aaae673bf..e11f7c3597 100644 --- a/eng/skill-validator/src/SkillValidatorJsonContext.cs +++ b/eng/skill-validator/src/SkillValidatorJsonContext.cs @@ -17,6 +17,8 @@ namespace SkillValidator; [JsonSerializable(typeof(ScenarioComparison))] [JsonSerializable(typeof(RunResult))] [JsonSerializable(typeof(RunMetrics))] +[JsonSerializable(typeof(BaselineFile))] +[JsonSerializable(typeof(BaselineScenarioEntry))] [JsonSerializable(typeof(JudgeResult))] [JsonSerializable(typeof(RubricScore))] [JsonSerializable(typeof(AssertionResult))] diff --git a/eng/skill-validator/src/docs/InvestigatingResults.md b/eng/skill-validator/src/docs/InvestigatingResults.md index 7616caf9c5..7ca0aba084 100644 --- a/eng/skill-validator/src/docs/InvestigatingResults.md +++ b/eng/skill-validator/src/docs/InvestigatingResults.md @@ -83,6 +83,8 @@ Each scenario includes two required runs (baseline + isolated). It may also incl > **Note:** Scenarios do not have a `passed` field. To determine pass/fail for an individual scenario, check whether `improvementScore >= 0`. This is the effective score: when no plugin run is present it equals `isolatedImprovementScore`; when a plugin run is present it is the min of isolated and plugin scores. The `passed` field exists only at the verdict level (per-skill). +> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). 
Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` and a SHA-256 of each scenario prompt; reuse fails fast if the model differs or any prompt is missing, so the baseline you compare against is always identity-matched. Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons. + ### Breakdown fields The `isolatedBreakdown` and `pluginBreakdown` objects show how each metric contributed to the improvement score. Each field is a raw delta (not yet weighted). The final score is computed as a weighted sum: diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs new file mode 100644 index 0000000000..254ec650a9 --- /dev/null +++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs @@ -0,0 +1,151 @@ +using System.Text.Json; +using SkillValidator; +using SkillValidator.Evaluate; + +namespace SkillValidator.Tests; + +public class BaselineStoreTests +{ + private static RunResult MakeBaseline(double overallScore = 3, string output = "baseline output") => + new( + new RunMetrics + { + TokenEstimate = 1000, + ToolCallCount = 4, + ToolCallBreakdown = new Dictionary { ["bash"] = 4 }, + AgentOutput = output, + TaskCompleted = true, + Events = [], + }, + new JudgeResult([new RubricScore("Quality", overallScore, "ok")], overallScore, "fine")); + + private static EvalScenario Scenario(string name, string prompt) => new(name, prompt); + + private static string TempPath() => + Path.Combine(Path.GetTempPath(), $"sv-baseline-test-{Guid.NewGuid():N}.json"); + + [Fact] + public void ComputePromptSha_IsDeterministicAndPromptSensitive() + { + var a = BaselineStore.ComputePromptSha("do the thing"); + var b = BaselineStore.ComputePromptSha("do the thing"); + var c = 
BaselineStore.ComputePromptSha("do something else"); + + Assert.Equal(a, b); + Assert.NotEqual(a, c); + Assert.Equal(64, a.Length); // SHA-256 hex + } + + [Fact] + public void SaveThenLoad_RoundTripsBaselinePerScenario() + { + var path = TempPath(); + try + { + var store = BaselineStore.ForWrite("model-x"); + var s1 = Scenario("alpha", "prompt one"); + var s2 = Scenario("beta", "prompt two"); + store.Record(s1, runs: 5, MakeBaseline(overallScore: 4, output: "out-1")); + store.Record(s2, runs: 5, MakeBaseline(overallScore: 2, output: "out-2")); + store.Save(path); + + Assert.True(File.Exists(path)); + + var loaded = BaselineStore.Load(path, "model-x"); + Assert.True(loaded.IsReuse); + Assert.Equal(2, loaded.Count); + + var b1 = loaded.TryGetBaseline(s1); + var b2 = loaded.TryGetBaseline(s2); + Assert.NotNull(b1); + Assert.NotNull(b2); + Assert.Equal("out-1", b1!.Metrics.AgentOutput); + Assert.Equal(4, b1.JudgeResult.OverallScore); + Assert.Equal("out-2", b2!.Metrics.AgentOutput); + } + finally + { + File.Delete(path); + } + } + + [Fact] + public void Load_ThrowsOnModelMismatch() + { + var path = TempPath(); + try + { + var store = BaselineStore.ForWrite("model-x"); + store.Record(Scenario("alpha", "prompt one"), runs: 3, MakeBaseline()); + store.Save(path); + + var ex = Assert.Throws(() => BaselineStore.Load(path, "model-y")); + Assert.Contains("model-x", ex.Message); + Assert.Contains("model-y", ex.Message); + } + finally + { + File.Delete(path); + } + } + + [Fact] + public void Load_ThrowsOnUnsupportedVersion() + { + var path = TempPath(); + try + { + var file = new BaselineFile( + Version: BaselineStore.CurrentVersion + 1, + Model: "model-x", + ValidatorVersion: "9.9.9", + CreatedAt: DateTime.UtcNow.ToString("o"), + Scenarios: []); + File.WriteAllText(path, JsonSerializer.Serialize(file, SkillValidatorJsonContext.Default.BaselineFile)); + + var ex = Assert.Throws(() => BaselineStore.Load(path, "model-x")); + Assert.Contains("unsupported version", ex.Message); + } 
+ finally + { + File.Delete(path); + } + } + + [Fact] + public void Load_ThrowsWhenFileMissing() + { + Assert.Throws(() => BaselineStore.Load(TempPath(), "model-x")); + } + + [Fact] + public void FindMissingScenarios_ReturnsScenariosWithoutCachedBaseline() + { + var path = TempPath(); + try + { + var store = BaselineStore.ForWrite("model-x"); + var present = Scenario("alpha", "prompt one"); + store.Record(present, runs: 5, MakeBaseline()); + store.Save(path); + + var loaded = BaselineStore.Load(path, "model-x"); + var missing = loaded.FindMissingScenarios([present, Scenario("beta", "prompt two")]); + + Assert.Single(missing); + Assert.Equal("beta", missing[0]); + } + finally + { + File.Delete(path); + } + } + + [Fact] + public void WriteStore_IsNotReuse() + { + var store = BaselineStore.ForWrite("model-x"); + Assert.False(store.IsReuse); + Assert.Null(store.TryGetBaseline(Scenario("alpha", "prompt one"))); + } +} From f571556f4dbe0ce6ab4490aefb77b0358c681a20 Mon Sep 17 00:00:00 2001 From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com> Date: Thu, 11 Jun 2026 17:35:52 +0200 Subject: [PATCH 2/7] Bind reused baseline to fixture identity (targetSha), not just prompt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Align baseline reuse with the (prompt, model, targetSha) identity contract from the upstream eval-harness design: previously the per-scenario reuse key was the prompt SHA + model only, so two scenarios that share an identical prompt but feed the agent different input artifacts (e.g. a different build.binlog) would collide and silently reuse the wrong baseline. - Add BaselineScenarioEntry.TargetSha: a SHA-256 over the scenario's materialized inputs — files auto-copied via copy_test_files, explicit setup files (inline content or copied sources), and the setup command recipe. The reuse key is now (promptSha, targetSha); both must match. Bump the on-disk schema to version 2. 
- Memoize target hashing per process via a cheap, file-I/O-free setup signature to avoid re-hashing large fixtures across the N runs. - Thread the originating eval.yaml path into Record/TryGetBaseline/ FindMissingScenarios so inputs can be fingerprinted. - Tests: target SHA is stable and content-sensitive; same-prompt/different-fixture scenarios do not reuse each other's baseline and are surfaced by FindMissingScenarios. Update README and InvestigatingResults. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/Evaluate/BaselineStore.cs | 143 +++++++++++++++--- .../src/Evaluate/EvaluateCommand.cs | 10 +- eng/skill-validator/src/README.md | 2 +- .../src/docs/InvestigatingResults.md | 2 +- .../tests/Evaluate/BaselineStoreTests.cs | 80 +++++++++- 5 files changed, 212 insertions(+), 25 deletions(-) diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs index 0e3e143065..90877b0065 100644 --- a/eng/skill-validator/src/Evaluate/BaselineStore.cs +++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs @@ -6,13 +6,18 @@ namespace SkillValidator.Evaluate; /// -/// One scenario's precomputed baseline, keyed by the SHA-256 of its prompt. +/// One scenario's precomputed baseline, keyed by the SHA-256 of its prompt +/// () and the SHA-256 of its setup/fixture inputs +/// (). Both must match for a baseline to be reused, so two +/// scenarios that share a prompt but feed the agent different input artifacts +/// (e.g. different build.binlog fixtures) never collide. /// records how many baseline runs were averaged into /// so reuse can report the robustness of the reference. /// public sealed record BaselineScenarioEntry( string Name, string PromptSha, + string TargetSha, int Runs, RunResult Baseline); @@ -38,9 +43,10 @@ public sealed record BaselineFile( internal sealed class BaselineStore { /// Current on-disk schema version. 
- public const int CurrentVersion = 1; + public const int CurrentVersion = 2; private readonly ConcurrentDictionary _entries = new(StringComparer.Ordinal); + private static readonly ConcurrentDictionary _targetShaCache = new(StringComparer.Ordinal); private readonly string _model; /// True when serving cached baselines (--baseline-from). @@ -58,8 +64,8 @@ private BaselineStore(string model, bool isReuse) /// /// Load a baseline file for reuse. Validates the schema version and that the model /// matches, throwing on mismatch so a stale or wrong baseline can never silently - /// skew results. Per-scenario prompt identity is validated later via - /// . + /// skew results. Per-scenario identity (prompt + setup/fixture inputs) is validated + /// later via . /// public static BaselineStore Load(string path, string expectedModel) { @@ -90,37 +96,140 @@ public static BaselineStore Load(string path, string expectedModel) foreach (var entry in file.Scenarios) { if (entry.Baseline is not null) - store._entries[entry.PromptSha] = entry; + store._entries[MakeKey(entry.PromptSha, entry.TargetSha)] = entry; } return store; } - /// SHA-256 (lower-case hex) of the scenario prompt — the per-scenario reuse key. - public static string ComputePromptSha(string prompt) + /// SHA-256 (lower-case hex) of the scenario prompt. + public static string ComputePromptSha(string prompt) => Sha256Hex(Encoding.UTF8.GetBytes(prompt)); + + /// + /// SHA-256 (lower-case hex) identifying the scenario's input artifacts — the analog + /// of the issue's targetSha. It folds in the contents of every file the agent + /// is given for the run: sibling files auto-copied via copy_test_files, explicit + /// setup files (inline content or copied sources), and the setup command recipe. This + /// binds a cached baseline to the exact inputs it was measured against, so two scenarios + /// that share prompt text but differ in fixtures (e.g. 
a different build.binlog) + /// resolve to distinct keys and never reuse each other's baseline. + /// + public static string ComputeTargetSha(EvalScenario scenario, string? evalPath) + { + var cacheKey = BuildTargetCacheKey(scenario, evalPath); + return _targetShaCache.GetOrAdd(cacheKey, _ => ComputeTargetShaCore(scenario, evalPath)); + } + + /// + /// Cheap, file-I/O-free signature of a scenario's setup inputs, used only to memoize + /// the (expensive) content hashing in within a + /// single process. It must distinguish any two scenarios whose materialized inputs + /// could differ, so it folds in the eval directory, the copy flag, the explicit setup + /// file recipe, and the command list — but not the on-disk file contents themselves. + /// + private static string BuildTargetCacheKey(EvalScenario scenario, string? evalPath) { - var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(prompt)); + var setup = scenario.Setup; + var sb = new StringBuilder().Append(evalPath ?? "").Append('\0'); + if (setup is null) + return sb.Append("none").ToString(); + sb.Append("copy=").Append(setup.CopyTestFiles).Append('\0'); + if (setup.Files is { } files) + foreach (var f in files) + sb.Append("f=").Append(f.Path).Append('|').Append(f.Source ?? "").Append('|').Append(f.Content ?? "").Append('\0'); + if (setup.Commands is { } commands) + foreach (var c in commands) + sb.Append("c=").Append(c).Append('\0'); + return sb.ToString(); + } + + private static string ComputeTargetShaCore(EvalScenario scenario, string? evalPath) + { + var setup = scenario.Setup; + if (setup is null) + return Sha256Hex(Encoding.UTF8.GetBytes("\0no-setup\0")); + + var sb = new StringBuilder(); + + // 1. Sibling files auto-copied into the work dir (copy_test_files: true). 
+ if (setup.CopyTestFiles && evalPath is not null) + { + var evalDir = Path.GetDirectoryName(evalPath); + if (!string.IsNullOrEmpty(evalDir) && Directory.Exists(evalDir)) + { + var files = Directory.EnumerateFiles(evalDir, "*", SearchOption.AllDirectories) + .Where(f => !string.Equals(Path.GetFileName(f), "eval.yaml", StringComparison.Ordinal)) + .Select(f => (Rel: Path.GetRelativePath(evalDir, f).Replace('\\', '/'), Full: f)) + .OrderBy(x => x.Rel, StringComparer.Ordinal); + foreach (var (rel, full) in files) + sb.Append("F:").Append(rel).Append('=').Append(HashFile(full)).Append('\n'); + } + } + + // 2. Explicit setup files — inline content or a copied source. + if (setup.Files is { } setupFiles) + { + foreach (var f in setupFiles.OrderBy(f => f.Path, StringComparer.Ordinal)) + { + sb.Append("E:").Append(f.Path.Replace('\\', '/')).Append('='); + if (f.Content is not null) + sb.Append("c:").Append(Sha256Hex(Encoding.UTF8.GetBytes(f.Content))); + else if (f.Source is not null) + { + var resolved = AgentRunner.ResolveSourcePath(f.Source, evalPath, skillPath: null); + sb.Append("s:").Append(resolved is not null && File.Exists(resolved) ? HashFile(resolved) : "missing"); + } + sb.Append('\n'); + } + } + + // 3. Setup commands define part of the input recipe (e.g. building a binlog). + if (setup.Commands is { } commands) + { + foreach (var c in commands) + sb.Append("C:").Append(c).Append('\n'); + } + + return Sha256Hex(Encoding.UTF8.GetBytes(sb.ToString())); + } + + private static string HashFile(string path) + { + using var stream = File.OpenRead(path); + return Sha256Hex(SHA256.HashData(stream)); + } + + private static string Sha256Hex(byte[] data) + { + var bytes = data.Length == 32 ? 
data : SHA256.HashData(data); return Convert.ToHexString(bytes).ToLowerInvariant(); } + private static string MakeKey(string promptSha, string targetSha) => string.Concat(promptSha, ":", targetSha); + /// /// In reuse mode, return the names of scenarios that have no matching cached - /// baseline (keyed by prompt hash). Empty when every scenario is covered. + /// baseline (keyed by prompt + setup/fixture identity). Empty when every + /// scenario is covered. Each scenario is paired with the eval.yaml path it + /// originates from so its input artifacts can be fingerprinted. /// - public IReadOnlyList FindMissingScenarios(IEnumerable scenarios) => + public IReadOnlyList FindMissingScenarios(IEnumerable<(EvalScenario Scenario, string? EvalPath)> scenarios) => scenarios - .Where(s => !_entries.ContainsKey(ComputePromptSha(s.Prompt))) - .Select(s => s.Name) + .Where(s => !_entries.ContainsKey(MakeKey(ComputePromptSha(s.Scenario.Prompt), ComputeTargetSha(s.Scenario, s.EvalPath)))) + .Select(s => s.Scenario.Name) .ToList(); /// Get the cached averaged baseline for a scenario, or null when absent. - public RunResult? TryGetBaseline(EvalScenario scenario) => - _entries.TryGetValue(ComputePromptSha(scenario.Prompt), out var entry) ? entry.Baseline : null; + public RunResult? TryGetBaseline(EvalScenario scenario, string? evalPath = null) => + _entries.TryGetValue(MakeKey(ComputePromptSha(scenario.Prompt), ComputeTargetSha(scenario, evalPath)), out var entry) + ? entry.Baseline + : null; /// Record a scenario's averaged baseline for later persistence (write mode). - public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline) + public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline, string? 
evalPath = null) { - var sha = ComputePromptSha(scenario.Prompt); - _entries[sha] = new BaselineScenarioEntry(scenario.Name, sha, runs, averagedBaseline); + var promptSha = ComputePromptSha(scenario.Prompt); + var targetSha = ComputeTargetSha(scenario, evalPath); + _entries[MakeKey(promptSha, targetSha)] = new BaselineScenarioEntry(scenario.Name, promptSha, targetSha, runs, averagedBaseline); } /// Serialize all recorded baselines to . diff --git a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs index d8f054dd9f..ac805ea602 100644 --- a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs +++ b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs @@ -322,7 +322,7 @@ public static async Task Run(ValidatorConfig config, CancellationToken canc // incomplete baseline can never silently skew results. var allScenarios = allTargets .Where(t => t.EvalConfig is not null) - .SelectMany(t => t.EvalConfig!.Scenarios) + .SelectMany(t => t.EvalConfig!.Scenarios.Select(s => (Scenario: s, t.EvalPath))) .ToList(); var missing = baselineStore.FindMissingScenarios(allScenarios); if (missing.Count > 0) @@ -657,7 +657,7 @@ private static async Task ExecuteAgentScenario( // Persist the averaged baseline (skill/agent-independent) for shared reuse. if (baselineStore is { IsReuse: false }) - baselineStore.Record(scenario, runResults.Length, avgBaseline); + baselineStore.Record(scenario, runResults.Length, avgBaseline, target.EvalPath); int bestPairwiseIdx = -1; for (int i = 0; i < perRunPairwise.Count; i++) @@ -772,7 +772,7 @@ private static async Task ExecuteAgentRun( // Reuse a precomputed shared baseline when available (--baseline-from). The // baseline arm is agent-independent, so this skips a redundant agent run. 
- var reusedBaseline = baselineStore?.TryGetBaseline(scenario); + var reusedBaseline = baselineStore?.TryGetBaseline(scenario, target.EvalPath); sessionDb?.RegisterSession(baselineSessionId, agent.Name, agent.Path, scenario.Name, runIndex, reusedBaseline is not null ? "baseline-reused" : "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, targetSha, rubricJson); @@ -1179,7 +1179,7 @@ private static async Task ExecuteScenario( var avgPlugin = AverageResults(pluginRuns); // Persist the averaged baseline (skill/agent-independent) for shared reuse. if (baselineStore is { IsReuse: false }) - baselineStore.Record(scenario, runResults.Length, avgBaseline); + baselineStore.Record(scenario, runResults.Length, avgBaseline, evalSkill.EvalPath); // Select the best pairwise result and track which run it came from int bestPairwiseIdx = -1; for (int i = 0; i < perRunPairwise.Count; i++) @@ -1311,7 +1311,7 @@ private static async Task ExecuteRun( // Reuse a precomputed shared baseline when available (--baseline-from). The // baseline arm is skill-independent, so this skips a redundant agent run. - var reusedBaseline = baselineStore?.TryGetBaseline(scenario); + var reusedBaseline = baselineStore?.TryGetBaseline(scenario, evalSkill.EvalPath); sessionDb?.RegisterSession(baselineSessionId, skill.Name, skill.Path, scenario.Name, runIndex, reusedBaseline is not null ? "baseline-reused" : "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, skillSha, rubricJson); diff --git a/eng/skill-validator/src/README.md b/eng/skill-validator/src/README.md index af54a4340d..4c8c54b4cd 100644 --- a/eng/skill-validator/src/README.md +++ b/eng/skill-validator/src/README.md @@ -166,7 +166,7 @@ Every evaluation runs each scenario through a **baseline arm** (the agent with n 1. **Produce** a baseline file with `--baseline-out baseline.json`. After the run, each scenario's averaged baseline result (honoring `--runs`) is written to the file. 2. 
**Reuse** it with `--baseline-from baseline.json` on subsequent runs. The baseline arm is skipped entirely; the cached baseline is used for assertions, pairwise/independent judging, and metric deltas. -The baseline file records the `--model` and a SHA-256 of each scenario prompt. On reuse the validator fails fast if the model differs or any scenario prompt is missing from the file, so a stale or mismatched baseline can never be silently applied. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status. +The baseline file records the `--model` and, per scenario, a SHA-256 of the prompt **and** a SHA-256 of its setup inputs (the fixtures copied via `copy_test_files`, explicit setup files, and setup commands — the analog of a target/input SHA). On reuse the validator fails fast if the model differs or any scenario's prompt-plus-fixture identity is missing from the file, so a stale or mismatched baseline can never be silently applied — and two scenarios that share a prompt but feed the agent different fixtures (e.g. a different `build.binlog`) never reuse each other's baseline. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status. The two options are mutually exclusive. diff --git a/eng/skill-validator/src/docs/InvestigatingResults.md b/eng/skill-validator/src/docs/InvestigatingResults.md index 7ca0aba084..c278035799 100644 --- a/eng/skill-validator/src/docs/InvestigatingResults.md +++ b/eng/skill-validator/src/docs/InvestigatingResults.md @@ -83,7 +83,7 @@ Each scenario includes two required runs (baseline + isolated). It may also incl > **Note:** Scenarios do not have a `passed` field. To determine pass/fail for an individual scenario, check whether `improvementScore >= 0`. This is the effective score: when no plugin run is present it equals `isolatedImprovementScore`; when a plugin run is present it is the min of isolated and plugin scores. 
The `passed` field exists only at the verdict level (per-skill). -> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` and a SHA-256 of each scenario prompt; reuse fails fast if the model differs or any prompt is missing, so the baseline you compare against is always identity-matched. Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons. +> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` plus, per scenario, a SHA-256 of the prompt and a SHA-256 of its setup/fixture inputs (copied test files, explicit setup files, and setup commands); reuse fails fast if the model differs or any prompt-plus-fixture identity is missing, so the baseline you compare against is always identity-matched and a shared prompt across cases with different fixtures cannot cross-contaminate. Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons. 
### Breakdown fields diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs index 254ec650a9..09444d7cb5 100644 --- a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs +++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs @@ -130,7 +130,7 @@ public void FindMissingScenarios_ReturnsScenariosWithoutCachedBaseline() store.Save(path); var loaded = BaselineStore.Load(path, "model-x"); - var missing = loaded.FindMissingScenarios([present, Scenario("beta", "prompt two")]); + var missing = loaded.FindMissingScenarios([(present, null), (Scenario("beta", "prompt two"), null)]); Assert.Single(missing); Assert.Equal("beta", missing[0]); @@ -148,4 +148,82 @@ public void WriteStore_IsNotReuse() Assert.False(store.IsReuse); Assert.Null(store.TryGetBaseline(Scenario("alpha", "prompt one"))); } + + private static string MakeEvalDirWithFixture(string fixtureName, string fixtureContent) + { + var dir = Path.Combine(Path.GetTempPath(), $"sv-baseline-fixture-{Guid.NewGuid():N}"); + Directory.CreateDirectory(dir); + File.WriteAllText(Path.Combine(dir, "eval.yaml"), "scenarios: []"); + File.WriteAllText(Path.Combine(dir, fixtureName), fixtureContent); + return Path.Combine(dir, "eval.yaml"); + } + + private static EvalScenario FixtureScenario(string name, string prompt) => + new(name, prompt, new SetupConfig(CopyTestFiles: true)); + + [Fact] + public void ComputeTargetSha_DiffersByFixtureContentAndIsStable() + { + var evalA = MakeEvalDirWithFixture("build.binlog", "AAAA"); + var evalB = MakeEvalDirWithFixture("build.binlog", "BBBB"); + try + { + var scenario = FixtureScenario("s", "investigate build.binlog"); + + var shaA1 = BaselineStore.ComputeTargetSha(scenario, evalA); + var shaA2 = BaselineStore.ComputeTargetSha(scenario, evalA); + var shaB = BaselineStore.ComputeTargetSha(scenario, evalB); + + Assert.Equal(shaA1, shaA2); // stable for identical inputs + Assert.NotEqual(shaA1, shaB); // 
sensitive to fixture content + Assert.Equal(64, shaA1.Length); + + // No setup → a stable, distinct constant. + var noSetup = BaselineStore.ComputeTargetSha(Scenario("s", "investigate build.binlog"), evalA); + Assert.NotEqual(shaA1, noSetup); + } + finally + { + Directory.Delete(Path.GetDirectoryName(evalA)!, recursive: true); + Directory.Delete(Path.GetDirectoryName(evalB)!, recursive: true); + } + } + + [Fact] + public void SamePromptDifferentFixture_DoesNotReuseBaseline() + { + var path = TempPath(); + var evalA = MakeEvalDirWithFixture("build.binlog", "case-A-binlog"); + var evalB = MakeEvalDirWithFixture("build.binlog", "case-B-binlog"); + try + { + // Two cases share an identical prompt but feed different fixtures. + const string sharedPrompt = "The binlog is at build.binlog. What went wrong?"; + var scenarioA = FixtureScenario("case-A", sharedPrompt); + var scenarioB = FixtureScenario("case-B", sharedPrompt); + + // Persist a baseline only for case A. + var store = BaselineStore.ForWrite("model-x"); + store.Record(scenarioA, runs: 5, MakeBaseline(output: "A-baseline"), evalA); + store.Save(path); + + var loaded = BaselineStore.Load(path, "model-x"); + + // Case A reuses its baseline; case B must NOT (different targetSha). + Assert.NotNull(loaded.TryGetBaseline(scenarioA, evalA)); + Assert.Equal("A-baseline", loaded.TryGetBaseline(scenarioA, evalA)!.Metrics.AgentOutput); + Assert.Null(loaded.TryGetBaseline(scenarioB, evalB)); + + // FindMissingScenarios surfaces case B by name despite the shared prompt. 
+ var missing = loaded.FindMissingScenarios([(scenarioA, evalA), (scenarioB, evalB)]); + Assert.Single(missing); + Assert.Equal("case-B", missing[0]); + } + finally + { + File.Delete(path); + Directory.Delete(Path.GetDirectoryName(evalA)!, recursive: true); + Directory.Delete(Path.GetDirectoryName(evalB)!, recursive: true); + } + } } From 4f7a652b4179b7991a218cc43362d4677f0736e9 Mon Sep 17 00:00:00 2001 From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com> Date: Thu, 11 Jun 2026 18:05:01 +0200 Subject: [PATCH 3/7] Harden baseline reuse identity after review Address rubber-duck review findings on the shared-baseline feature: - Fix Sha256Hex 32-byte bug: a 32-byte input was treated as an already-computed digest and not hashed. Split into Sha256Hex (always hash) + HexDigest (encode existing digest). - Broaden reuse identity: the cached baseline RunResult depends on the judge model and on per-scenario evaluation criteria (rubric, assertions, expect/reject tools, turn/token/timeout limits). Add JudgeModel to the baseline header (validated on load) and fold the criteria into the per-scenario targetSha so changing them invalidates reuse instead of silently serving a stale result. - Mirror AgentRunner.SetupWorkDir exactly when hashing copied fixtures: exclude only the top-level eval.yaml (nested eval.yaml files are copied, so they must be hashed). - Make the target-SHA cache instance-scoped (memoizing only the expensive fixture-input hashing) so it can't serve stale hashes or leak across evaluations/tests; hash inline file Content in the cache key instead of embedding it. - Deterministic Save ordering (Name, PromptSha, TargetSha); guard Load against null Scenarios; enrich FindMissingScenarios output with the eval path. - Document that setup commands are fingerprinted by recipe, so reuse assumes they are deterministic/hermetic. - Tests + docs updated; add judge-model-mismatch and criteria-identity tests (562 pass). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/Evaluate/BaselineStore.cs | 165 +++++++++++++----- .../src/Evaluate/EvaluateCommand.cs | 6 +- eng/skill-validator/src/README.md | 6 +- .../src/docs/InvestigatingResults.md | 2 +- .../tests/Evaluate/BaselineStoreTests.cs | 79 +++++++-- 5 files changed, 190 insertions(+), 68 deletions(-) diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs index 90877b0065..87a51a12d2 100644 --- a/eng/skill-validator/src/Evaluate/BaselineStore.cs +++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs @@ -25,11 +25,14 @@ public sealed record BaselineScenarioEntry( /// On-disk format written by --baseline-out and read by --baseline-from. /// The baseline arm of evaluate is plain-agent with no skill/MCP attached, so it /// is independent of the target under test and can be computed once and shared across -/// many invocations. The header records the identity needed to reject a stale reuse. +/// many invocations. The header records the identity needed to reject a stale reuse: +/// the agent and the that produced the +/// cached judge scores. /// public sealed record BaselineFile( int Version, string Model, + string JudgeModel, string? ValidatorVersion, string CreatedAt, IReadOnlyList Scenarios); @@ -46,28 +49,33 @@ internal sealed class BaselineStore public const int CurrentVersion = 2; private readonly ConcurrentDictionary _entries = new(StringComparer.Ordinal); - private static readonly ConcurrentDictionary _targetShaCache = new(StringComparer.Ordinal); + // Memoizes the (expensive, file-I/O-bound) hashing of materialized input artifacts. + // Instance-scoped — never shared across stores — so it can never serve a stale hash + // from a different evaluation or leak between tests. 
+ private readonly ConcurrentDictionary _inputsShaCache = new(StringComparer.Ordinal); private readonly string _model; + private readonly string _judgeModel; /// True when serving cached baselines (--baseline-from). public bool IsReuse { get; } - private BaselineStore(string model, bool isReuse) + private BaselineStore(string model, string judgeModel, bool isReuse) { _model = model; + _judgeModel = judgeModel; IsReuse = isReuse; } /// Create a store that accumulates baselines for later persistence. - public static BaselineStore ForWrite(string model) => new(model, isReuse: false); + public static BaselineStore ForWrite(string model, string judgeModel) => new(model, judgeModel, isReuse: false); /// - /// Load a baseline file for reuse. Validates the schema version and that the model - /// matches, throwing on mismatch so a stale or wrong baseline can never silently - /// skew results. Per-scenario identity (prompt + setup/fixture inputs) is validated - /// later via . + /// Load a baseline file for reuse. Validates the schema version and that both the + /// agent model and judge model match, throwing on mismatch so a stale or wrong + /// baseline can never silently skew results. Per-scenario identity (prompt + setup + /// inputs + evaluation criteria) is validated later via . /// - public static BaselineStore Load(string path, string expectedModel) + public static BaselineStore Load(string path, string expectedModel, string expectedJudgeModel) { if (!File.Exists(path)) throw new FileNotFoundException($"Baseline file not found: {path}"); @@ -91,11 +99,15 @@ public static BaselineStore Load(string path, string expectedModel) throw new InvalidOperationException( $"Baseline file '{path}' was computed for model '{file.Model}' but evaluation uses model '{expectedModel}'. 
" + "Recompute the baseline with --baseline-out for the new model."); + if (!string.Equals(file.JudgeModel, expectedJudgeModel, StringComparison.Ordinal)) + throw new InvalidOperationException( + $"Baseline file '{path}' was judged with model '{file.JudgeModel}' but evaluation uses judge model '{expectedJudgeModel}'. " + + "Recompute the baseline with --baseline-out for the new judge model."); - var store = new BaselineStore(expectedModel, isReuse: true); - foreach (var entry in file.Scenarios) + var store = new BaselineStore(expectedModel, expectedJudgeModel, isReuse: true); + foreach (var entry in file.Scenarios ?? []) { - if (entry.Baseline is not null) + if (entry?.Baseline is not null) store._entries[MakeKey(entry.PromptSha, entry.TargetSha)] = entry; } return store; @@ -105,28 +117,46 @@ public static BaselineStore Load(string path, string expectedModel) public static string ComputePromptSha(string prompt) => Sha256Hex(Encoding.UTF8.GetBytes(prompt)); /// - /// SHA-256 (lower-case hex) identifying the scenario's input artifacts — the analog - /// of the issue's targetSha. It folds in the contents of every file the agent - /// is given for the run: sibling files auto-copied via copy_test_files, explicit - /// setup files (inline content or copied sources), and the setup command recipe. This - /// binds a cached baseline to the exact inputs it was measured against, so two scenarios - /// that share prompt text but differ in fixtures (e.g. a different build.binlog) - /// resolve to distinct keys and never reuse each other's baseline. + /// SHA-256 (lower-case hex) identifying everything (besides the prompt and model) that + /// determines a scenario's cached baseline — the analog of the + /// issue's targetSha. 
It folds in: + /// + /// the materialized input artifacts the agent is given (files auto-copied via + /// copy_test_files, explicit setup files' content/sources, and the setup command + /// recipe), and + /// the evaluation criteria that shape the stored result (rubric, assertions, + /// expect/reject tools, and the turn/token/timeout limits that bound the baseline run). + /// + /// This binds a cached baseline to the exact inputs and criteria it was measured + /// against, so two scenarios that share a prompt but differ in fixtures (e.g. a different + /// build.binlog) or in rubric/assertions resolve to distinct keys and never reuse + /// each other's baseline. + /// Setup commands are hashed by their text (the recipe), not the artifacts they + /// generate; reuse therefore assumes setup commands are deterministic/hermetic. /// - public static string ComputeTargetSha(EvalScenario scenario, string? evalPath) + public static string ComputeTargetSha(EvalScenario scenario, string? evalPath) => + CombineIdentity(ComputeInputsSha(scenario, evalPath), scenario); + + // Instance variant: memoizes the expensive input hashing, then combines with the + // (cheap) per-scenario criteria so the result equals the static method exactly. + private string TargetShaFor(EvalScenario scenario, string? evalPath) { - var cacheKey = BuildTargetCacheKey(scenario, evalPath); - return _targetShaCache.GetOrAdd(cacheKey, _ => ComputeTargetShaCore(scenario, evalPath)); + var inputsSha = _inputsShaCache.GetOrAdd(BuildInputsCacheKey(scenario, evalPath), _ => ComputeInputsSha(scenario, evalPath)); + return CombineIdentity(inputsSha, scenario); } + private static string CombineIdentity(string inputsSha, EvalScenario scenario) => + Sha256Hex(Encoding.UTF8.GetBytes(string.Concat(inputsSha, "\0criteria\0", CriteriaString(scenario)))); + /// - /// Cheap, file-I/O-free signature of a scenario's setup inputs, used only to memoize - /// the (expensive) content hashing in within a - /// single process. 
It must distinguish any two scenarios whose materialized inputs - /// could differ, so it folds in the eval directory, the copy flag, the explicit setup - /// file recipe, and the command list — but not the on-disk file contents themselves. + /// Cheap, file-I/O-free key memoizing the input-artifact hash within this store. It must + /// distinguish any two scenarios whose materialized inputs could differ, so it folds in the + /// eval directory, the copy flag, the explicit setup file recipe, and the command list (but + /// not the auto-copied file contents — those are determined by the directory + copy flag). + /// Evaluation criteria are intentionally excluded here because they are combined after the + /// cache lookup in . /// - private static string BuildTargetCacheKey(EvalScenario scenario, string? evalPath) + private static string BuildInputsCacheKey(EvalScenario scenario, string? evalPath) { var setup = scenario.Setup; var sb = new StringBuilder().Append(evalPath ?? "").Append('\0'); @@ -135,14 +165,15 @@ private static string BuildTargetCacheKey(EvalScenario scenario, string? evalPat sb.Append("copy=").Append(setup.CopyTestFiles).Append('\0'); if (setup.Files is { } files) foreach (var f in files) - sb.Append("f=").Append(f.Path).Append('|').Append(f.Source ?? "").Append('|').Append(f.Content ?? "").Append('\0'); + sb.Append("f=").Append(f.Path).Append('|').Append(f.Source ?? "").Append('|') + .Append(f.Content is null ? "" : Sha256Hex(Encoding.UTF8.GetBytes(f.Content))).Append('\0'); if (setup.Commands is { } commands) foreach (var c in commands) sb.Append("c=").Append(c).Append('\0'); return sb.ToString(); } - private static string ComputeTargetShaCore(EvalScenario scenario, string? evalPath) + private static string ComputeInputsSha(EvalScenario scenario, string? evalPath) { var setup = scenario.Setup; if (setup is null) @@ -150,15 +181,16 @@ private static string ComputeTargetShaCore(EvalScenario scenario, string? 
evalPa var sb = new StringBuilder(); - // 1. Sibling files auto-copied into the work dir (copy_test_files: true). + // 1. Sibling files auto-copied into the work dir (copy_test_files: true). Mirror + // AgentRunner.SetupWorkDir, which excludes only the top-level eval.yaml. if (setup.CopyTestFiles && evalPath is not null) { var evalDir = Path.GetDirectoryName(evalPath); if (!string.IsNullOrEmpty(evalDir) && Directory.Exists(evalDir)) { var files = Directory.EnumerateFiles(evalDir, "*", SearchOption.AllDirectories) - .Where(f => !string.Equals(Path.GetFileName(f), "eval.yaml", StringComparison.Ordinal)) .Select(f => (Rel: Path.GetRelativePath(evalDir, f).Replace('\\', '/'), Full: f)) + .Where(x => !string.Equals(x.Rel, "eval.yaml", StringComparison.Ordinal)) .OrderBy(x => x.Rel, StringComparer.Ordinal); foreach (var (rel, full) in files) sb.Append("F:").Append(rel).Append('=').Append(HashFile(full)).Append('\n'); @@ -192,35 +224,69 @@ private static string ComputeTargetShaCore(EvalScenario scenario, string? evalPa return Sha256Hex(Encoding.UTF8.GetBytes(sb.ToString())); } - private static string HashFile(string path) + /// + /// Deterministic textual signature of the evaluation criteria that influence a scenario's + /// stored baseline result: run-bounding limits, rubric, assertions, and expect/reject tools. + /// + private static string CriteriaString(EvalScenario scenario) { - using var stream = File.OpenRead(path); - return Sha256Hex(SHA256.HashData(stream)); + var sb = new StringBuilder(); + sb.Append("turns=").Append(scenario.MaxTurns?.ToString() ?? "").Append('\0'); + sb.Append("tokens=").Append(scenario.MaxTokens?.ToString() ?? 
"").Append('\0'); + sb.Append("timeout=").Append(scenario.Timeout).Append('\0'); + if (scenario.Rubric is { } rubric) + foreach (var r in rubric) + sb.Append("R:").Append(r).Append('\n'); + if (scenario.ExpectTools is { } expect) + foreach (var t in expect.OrderBy(x => x, StringComparer.Ordinal)) + sb.Append("XT:").Append(t).Append('\0'); + if (scenario.RejectTools is { } reject) + foreach (var t in reject.OrderBy(x => x, StringComparer.Ordinal)) + sb.Append("RT:").Append(t).Append('\0'); + if (scenario.Assertions is { } assertions) + foreach (var a in assertions) + { + sb.Append("A:").Append(a.Type).Append('|').Append(a.Path ?? "").Append('|') + .Append(a.Value ?? "").Append('|').Append(a.Pattern ?? "").Append('|'); + if (a.CommandArgs is { } ca) + sb.Append(ca.CommandToRun).Append(';').Append(ca.CommandArguments ?? "").Append(';') + .Append(ca.ExpectedExitCode?.ToString() ?? "").Append(';').Append(ca.ExpectedStdOutContains ?? "").Append(';') + .Append(ca.ExpectedStdErrorContains ?? "").Append(';').Append(ca.ExpectedStdOutMatches ?? "").Append(';') + .Append(ca.ExpectedStdErrorMatches ?? "").Append(';').Append(ca.Timeout?.ToString() ?? ""); + sb.Append('\n'); + } + return sb.ToString(); } - private static string Sha256Hex(byte[] data) + private static string HashFile(string path) { - var bytes = data.Length == 32 ? data : SHA256.HashData(data); - return Convert.ToHexString(bytes).ToLowerInvariant(); + using var stream = File.OpenRead(path); + return HexDigest(SHA256.HashData(stream)); } + /// SHA-256 of , lower-case hex. + private static string Sha256Hex(byte[] data) => HexDigest(SHA256.HashData(data)); + + /// Lower-case hex encoding of an already-computed digest. 
+ private static string HexDigest(byte[] digest) => Convert.ToHexString(digest).ToLowerInvariant(); + private static string MakeKey(string promptSha, string targetSha) => string.Concat(promptSha, ":", targetSha); /// - /// In reuse mode, return the names of scenarios that have no matching cached - /// baseline (keyed by prompt + setup/fixture identity). Empty when every - /// scenario is covered. Each scenario is paired with the eval.yaml path it - /// originates from so its input artifacts can be fingerprinted. + /// In reuse mode, return human-readable identifiers (name + eval path) of scenarios that + /// have no matching cached baseline (keyed by prompt + setup/criteria identity). Empty + /// when every scenario is covered. Each scenario is paired with the eval.yaml path it + /// originates from so its input artifacts can be fingerprinted and reported unambiguously. /// public IReadOnlyList FindMissingScenarios(IEnumerable<(EvalScenario Scenario, string? EvalPath)> scenarios) => scenarios - .Where(s => !_entries.ContainsKey(MakeKey(ComputePromptSha(s.Scenario.Prompt), ComputeTargetSha(s.Scenario, s.EvalPath)))) - .Select(s => s.Scenario.Name) + .Where(s => !_entries.ContainsKey(MakeKey(ComputePromptSha(s.Scenario.Prompt), TargetShaFor(s.Scenario, s.EvalPath)))) + .Select(s => s.EvalPath is null ? s.Scenario.Name : $"{s.Scenario.Name} ({s.EvalPath})") .ToList(); /// Get the cached averaged baseline for a scenario, or null when absent. public RunResult? TryGetBaseline(EvalScenario scenario, string? evalPath = null) => - _entries.TryGetValue(MakeKey(ComputePromptSha(scenario.Prompt), ComputeTargetSha(scenario, evalPath)), out var entry) + _entries.TryGetValue(MakeKey(ComputePromptSha(scenario.Prompt), TargetShaFor(scenario, evalPath)), out var entry) ? entry.Baseline : null; @@ -228,7 +294,7 @@ public IReadOnlyList FindMissingScenarios(IEnumerable<(EvalScenario Scen public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline, string? 
evalPath = null) { var promptSha = ComputePromptSha(scenario.Prompt); - var targetSha = ComputeTargetSha(scenario, evalPath); + var targetSha = TargetShaFor(scenario, evalPath); _entries[MakeKey(promptSha, targetSha)] = new BaselineScenarioEntry(scenario.Name, promptSha, targetSha, runs, averagedBaseline); } @@ -238,9 +304,14 @@ public void Save(string path) var file = new BaselineFile( Version: CurrentVersion, Model: _model, + JudgeModel: _judgeModel, ValidatorVersion: typeof(BaselineStore).Assembly.GetName().Version?.ToString(), CreatedAt: DateTime.UtcNow.ToString("o"), - Scenarios: _entries.Values.OrderBy(e => e.Name, StringComparer.Ordinal).ToList()); + Scenarios: _entries.Values + .OrderBy(e => e.Name, StringComparer.Ordinal) + .ThenBy(e => e.PromptSha, StringComparer.Ordinal) + .ThenBy(e => e.TargetSha, StringComparer.Ordinal) + .ToList()); var dir = Path.GetDirectoryName(Path.GetFullPath(path)); if (!string.IsNullOrEmpty(dir)) diff --git a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs index ac805ea602..bce797313b 100644 --- a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs +++ b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs @@ -34,7 +34,7 @@ public static Command Create() var noiseMaxDegradationOpt = new Option("--noise-max-degradation") { Description = "Maximum acceptable average quality degradation (0-1) in noise test (only positive degradations count)", DefaultValueFactory = _ => 0.2 }; var noiseMaxScenarioDegradationOpt = new Option("--noise-max-scenario-degradation") { Description = "Maximum acceptable quality degradation (0-1) for any single noise-test scenario", DefaultValueFactory = _ => 0.4 }; var baselineOutOpt = new Option("--baseline-out") { Description = "After running, persist each scenario's averaged baseline (no-skill/no-agent reference) to this file for later reuse with --baseline-from." 
}; - var baselineFromOpt = new Option("--baseline-from") { Description = "Reuse a precomputed baseline from this file instead of re-running the no-skill/no-agent baseline arm. Must match --model and the scenario prompts. Mutually exclusive with --baseline-out." }; + var baselineFromOpt = new Option("--baseline-from") { Description = "Reuse a precomputed baseline from this file instead of re-running the no-skill/no-agent baseline arm. Must match --model, --judge-model, and each scenario's prompt, setup inputs, and evaluation criteria. Mutually exclusive with --baseline-out." }; var command = new Command("evaluate", "Evaluate agent skills via LLM-based testing") { @@ -310,7 +310,7 @@ public static async Task Run(ValidatorConfig config, CancellationToken canc { try { - baselineStore = BaselineStore.Load(config.BaselineFrom, config.Model); + baselineStore = BaselineStore.Load(config.BaselineFrom, config.Model, config.JudgeModel); } catch (Exception ex) when (ex is FileNotFoundException or InvalidOperationException) { @@ -336,7 +336,7 @@ public static async Task Run(ValidatorConfig config, CancellationToken canc } else if (config.BaselineOut is not null) { - baselineStore = BaselineStore.ForWrite(config.Model); + baselineStore = BaselineStore.ForWrite(config.Model, config.JudgeModel); Console.WriteLine($"Baseline will be persisted to {config.BaselineOut} after the run."); } diff --git a/eng/skill-validator/src/README.md b/eng/skill-validator/src/README.md index 4c8c54b4cd..c2b2d03fbf 100644 --- a/eng/skill-validator/src/README.md +++ b/eng/skill-validator/src/README.md @@ -147,7 +147,7 @@ skill-validator check --json --plugin ./plugins/my-plugin | `--judge-timeout ` | `300` | Judge LLM timeout in seconds | | `--require-completion` | `true` | Fail if skill regresses task completion | | `--baseline-out ` | *(none)* | After running, persist each scenario's averaged baseline (no-skill/no-agent reference) to this file for reuse. Mutually exclusive with `--baseline-from`. 
| -| `--baseline-from ` | *(none)* | Reuse a precomputed baseline from this file instead of re-running the baseline arm. Must match `--model` and every scenario prompt. Mutually exclusive with `--baseline-out`. | +| `--baseline-from ` | *(none)* | Reuse a precomputed baseline from this file instead of re-running the baseline arm. Must match `--model`, `--judge-model`, and every scenario's prompt, setup inputs, and evaluation criteria. Mutually exclusive with `--baseline-out`. | | `--verdict-warn-only` | `false` | Treat verdict failures as warnings (exit 0). Execution errors still fail. | | `--no-overfitting-check` | `false` | Disable the LLM-based overfitting analysis (on by default) | | `--overfitting-fix` | `false` | Generate `eval.fixed.yaml` with improved rubric items/assertions | @@ -166,7 +166,9 @@ Every evaluation runs each scenario through a **baseline arm** (the agent with n 1. **Produce** a baseline file with `--baseline-out baseline.json`. After the run, each scenario's averaged baseline result (honoring `--runs`) is written to the file. 2. **Reuse** it with `--baseline-from baseline.json` on subsequent runs. The baseline arm is skipped entirely; the cached baseline is used for assertions, pairwise/independent judging, and metric deltas. -The baseline file records the `--model` and, per scenario, a SHA-256 of the prompt **and** a SHA-256 of its setup inputs (the fixtures copied via `copy_test_files`, explicit setup files, and setup commands — the analog of a target/input SHA). On reuse the validator fails fast if the model differs or any scenario's prompt-plus-fixture identity is missing from the file, so a stale or mismatched baseline can never be silently applied — and two scenarios that share a prompt but feed the agent different fixtures (e.g. a different `build.binlog`) never reuse each other's baseline. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status. 
+The baseline file records the `--model` **and** `--judge-model`, and per scenario a SHA-256 of the prompt plus a composite SHA-256 over (a) its setup inputs — the fixtures copied via `copy_test_files`, explicit setup files, and setup commands — and (b) the evaluation criteria that shape the stored result (rubric, assertions, expect/reject tools, and the turn/token/timeout limits). This is the analog of a target/input SHA. On reuse the validator fails fast if the agent model or the judge model differs, or if any scenario's prompt-plus-setup-plus-criteria identity is missing from the file, so a stale or mismatched baseline can never be silently applied — and two scenarios that share a prompt but feed the agent different fixtures (e.g. a different `build.binlog`) or use different rubrics never reuse each other's baseline. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status. + +> **Note:** Setup `commands` are fingerprinted by their text (the recipe), not the artifacts they produce, so baseline reuse assumes setup commands are deterministic/hermetic — a command whose output changes between runs (e.g. fetching `latest`) will not invalidate a cached baseline. The two options are mutually exclusive. diff --git a/eng/skill-validator/src/docs/InvestigatingResults.md b/eng/skill-validator/src/docs/InvestigatingResults.md index c278035799..e2c0a7ecd4 100644 --- a/eng/skill-validator/src/docs/InvestigatingResults.md +++ b/eng/skill-validator/src/docs/InvestigatingResults.md @@ -83,7 +83,7 @@ Each scenario includes two required runs (baseline + isolated). It may also incl > **Note:** Scenarios do not have a `passed` field. To determine pass/fail for an individual scenario, check whether `improvementScore >= 0`. This is the effective score: when no plugin run is present it equals `isolatedImprovementScore`; when a plugin run is present it is the min of isolated and plugin scores.
The `passed` field exists only at the verdict level (per-skill). -> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` plus, per scenario, a SHA-256 of the prompt and a SHA-256 of its setup/fixture inputs (copied test files, explicit setup files, and setup commands); reuse fails fast if the model differs or any prompt-plus-fixture identity is missing, so the baseline you compare against is always identity-matched and a shared prompt across cases with different fixtures cannot cross-contaminate. Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons. +> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` and `--judge-model` plus, per scenario, a SHA-256 of the prompt and a composite SHA-256 over its setup inputs (copied test files, explicit setup files, and setup commands) and its evaluation criteria (rubric, assertions, expect/reject tools, and turn/token/timeout limits); reuse fails fast if the agent model or judge model differs, or if any prompt-plus-setup-plus-criteria identity is missing, so the baseline you compare against is always identity-matched and a shared prompt across cases with different fixtures or rubrics cannot cross-contaminate.
Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons. ### Breakdown fields diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs index 09444d7cb5..d6d41c3d02 100644 --- a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs +++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs @@ -6,6 +6,9 @@ namespace SkillValidator.Tests; public class BaselineStoreTests { + private const string Model = "model-x"; + private const string Judge = "judge-x"; + private static RunResult MakeBaseline(double overallScore = 3, string output = "baseline output") => new( new RunMetrics @@ -42,7 +45,7 @@ public void SaveThenLoad_RoundTripsBaselinePerScenario() var path = TempPath(); try { - var store = BaselineStore.ForWrite("model-x"); + var store = BaselineStore.ForWrite(Model, Judge); var s1 = Scenario("alpha", "prompt one"); var s2 = Scenario("beta", "prompt two"); store.Record(s1, runs: 5, MakeBaseline(overallScore: 4, output: "out-1")); @@ -51,7 +54,7 @@ public void SaveThenLoad_RoundTripsBaselinePerScenario() Assert.True(File.Exists(path)); - var loaded = BaselineStore.Load(path, "model-x"); + var loaded = BaselineStore.Load(path, Model, Judge); Assert.True(loaded.IsReuse); Assert.Equal(2, loaded.Count); @@ -75,12 +78,12 @@ public void Load_ThrowsOnModelMismatch() var path = TempPath(); try { - var store = BaselineStore.ForWrite("model-x"); + var store = BaselineStore.ForWrite(Model, Judge); store.Record(Scenario("alpha", "prompt one"), runs: 3, MakeBaseline()); store.Save(path); - var ex = Assert.Throws(() => BaselineStore.Load(path, "model-y")); - Assert.Contains("model-x", ex.Message); + var ex = Assert.Throws(() => BaselineStore.Load(path, "model-y", Judge)); + Assert.Contains(Model, ex.Message); Assert.Contains("model-y", ex.Message); } finally @@ 
-89,6 +92,26 @@ public void Load_ThrowsOnModelMismatch() } } + [Fact] + public void Load_ThrowsOnJudgeModelMismatch() + { + var path = TempPath(); + try + { + var store = BaselineStore.ForWrite(Model, Judge); + store.Record(Scenario("alpha", "prompt one"), runs: 3, MakeBaseline()); + store.Save(path); + + var ex = Assert.Throws(() => BaselineStore.Load(path, Model, "judge-y")); + Assert.Contains(Judge, ex.Message); + Assert.Contains("judge-y", ex.Message); + } + finally + { + File.Delete(path); + } + } + [Fact] public void Load_ThrowsOnUnsupportedVersion() { @@ -97,13 +120,14 @@ public void Load_ThrowsOnUnsupportedVersion() { var file = new BaselineFile( Version: BaselineStore.CurrentVersion + 1, - Model: "model-x", + Model: Model, + JudgeModel: Judge, ValidatorVersion: "9.9.9", CreatedAt: DateTime.UtcNow.ToString("o"), Scenarios: []); File.WriteAllText(path, JsonSerializer.Serialize(file, SkillValidatorJsonContext.Default.BaselineFile)); - var ex = Assert.Throws(() => BaselineStore.Load(path, "model-x")); + var ex = Assert.Throws(() => BaselineStore.Load(path, Model, Judge)); Assert.Contains("unsupported version", ex.Message); } finally @@ -115,7 +139,7 @@ public void Load_ThrowsOnUnsupportedVersion() [Fact] public void Load_ThrowsWhenFileMissing() { - Assert.Throws(() => BaselineStore.Load(TempPath(), "model-x")); + Assert.Throws(() => BaselineStore.Load(TempPath(), Model, Judge)); } [Fact] @@ -124,12 +148,12 @@ public void FindMissingScenarios_ReturnsScenariosWithoutCachedBaseline() var path = TempPath(); try { - var store = BaselineStore.ForWrite("model-x"); + var store = BaselineStore.ForWrite(Model, Judge); var present = Scenario("alpha", "prompt one"); store.Record(present, runs: 5, MakeBaseline()); store.Save(path); - var loaded = BaselineStore.Load(path, "model-x"); + var loaded = BaselineStore.Load(path, Model, Judge); var missing = loaded.FindMissingScenarios([(present, null), (Scenario("beta", "prompt two"), null)]); Assert.Single(missing); @@ -144,7 
+168,7 @@ public void FindMissingScenarios_ReturnsScenariosWithoutCachedBaseline() [Fact] public void WriteStore_IsNotReuse() { - var store = BaselineStore.ForWrite("model-x"); + var store = BaselineStore.ForWrite(Model, Judge); Assert.False(store.IsReuse); Assert.Null(store.TryGetBaseline(Scenario("alpha", "prompt one"))); } @@ -189,6 +213,30 @@ public void ComputeTargetSha_DiffersByFixtureContentAndIsStable() } } + [Fact] + public void ComputeTargetSha_DiffersByEvaluationCriteria() + { + const string prompt = "investigate the failure"; + var baseScenario = Scenario("s", prompt); + var withRubric = baseScenario with { Rubric = ["Did it find the root cause?"] }; + var withAssertion = baseScenario with { Assertions = [new Assertion(AssertionType.OutputContains, Value: "error")] }; + var withTurns = baseScenario with { MaxTurns = 5 }; + var withExpectTools = baseScenario with { ExpectTools = ["bash"] }; + + var shaBase = BaselineStore.ComputeTargetSha(baseScenario, null); + + // Each criterion that shapes the cached result must change the identity. + Assert.NotEqual(shaBase, BaselineStore.ComputeTargetSha(withRubric, null)); + Assert.NotEqual(shaBase, BaselineStore.ComputeTargetSha(withAssertion, null)); + Assert.NotEqual(shaBase, BaselineStore.ComputeTargetSha(withTurns, null)); + Assert.NotEqual(shaBase, BaselineStore.ComputeTargetSha(withExpectTools, null)); + + // Same criteria → stable identity. + Assert.Equal( + BaselineStore.ComputeTargetSha(withRubric, null), + BaselineStore.ComputeTargetSha(baseScenario with { Rubric = ["Did it find the root cause?"] }, null)); + } + [Fact] public void SamePromptDifferentFixture_DoesNotReuseBaseline() { @@ -203,21 +251,22 @@ public void SamePromptDifferentFixture_DoesNotReuseBaseline() var scenarioB = FixtureScenario("case-B", sharedPrompt); // Persist a baseline only for case A. 
- var store = BaselineStore.ForWrite("model-x"); + var store = BaselineStore.ForWrite(Model, Judge); store.Record(scenarioA, runs: 5, MakeBaseline(output: "A-baseline"), evalA); store.Save(path); - var loaded = BaselineStore.Load(path, "model-x"); + var loaded = BaselineStore.Load(path, Model, Judge); // Case A reuses its baseline; case B must NOT (different targetSha). Assert.NotNull(loaded.TryGetBaseline(scenarioA, evalA)); Assert.Equal("A-baseline", loaded.TryGetBaseline(scenarioA, evalA)!.Metrics.AgentOutput); Assert.Null(loaded.TryGetBaseline(scenarioB, evalB)); - // FindMissingScenarios surfaces case B by name despite the shared prompt. + // FindMissingScenarios surfaces case B (with its eval path) despite the shared prompt. var missing = loaded.FindMissingScenarios([(scenarioA, evalA), (scenarioB, evalB)]); Assert.Single(missing); - Assert.Equal("case-B", missing[0]); + Assert.StartsWith("case-B", missing[0]); + Assert.Contains(evalB, missing[0]); } finally { From b5be0e3fb87ea3d04ac0ba0204976ff973744ba0 Mon Sep 17 00:00:00 2001 From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com> Date: Thu, 11 Jun 2026 18:10:31 +0200 Subject: [PATCH 4/7] Address PR review comments on baseline reuse - Mirror AgentRunner.SetupWorkDir exactly when hashing copied fixtures: enumerate only the files actually copied (top-level siblings except eval.yaml, recursing into directories) and skip reparse points and out-of-root junctions, instead of blindly hashing every file under the eval directory. This keeps the fixture identity restricted to the intentionally-copied set so stray output/log files can't poison reuse. - Stream baseline JSON to/from disk (File.OpenRead/File.Create with JsonSerializer) so large baselines never materialize as one giant in-memory string. - Enrich the fail-fast 'missing scenario' output with the eval path and short prompt/target SHA prefixes so it is actionable when scenario names collide across eval files. 
- Add a test locking in recursive (nested-directory) fixture hashing. 563 tests pass. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/Evaluate/BaselineStore.cs | 87 ++++++++++++++++--- .../tests/Evaluate/BaselineStoreTests.cs | 29 ++++++- 2 files changed, 101 insertions(+), 15 deletions(-) diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs index 87a51a12d2..3a060e13e4 100644 --- a/eng/skill-validator/src/Evaluate/BaselineStore.cs +++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs @@ -83,7 +83,8 @@ public static BaselineStore Load(string path, string expectedModel, string expec BaselineFile? file; try { - file = JsonSerializer.Deserialize(File.ReadAllText(path), SkillValidatorJsonContext.Default.BaselineFile); + using var stream = File.OpenRead(path); + file = JsonSerializer.Deserialize(stream, SkillValidatorJsonContext.Default.BaselineFile); } catch (JsonException ex) { @@ -182,17 +183,15 @@ private static string ComputeInputsSha(EvalScenario scenario, string? evalPath) var sb = new StringBuilder(); // 1. Sibling files auto-copied into the work dir (copy_test_files: true). Mirror - // AgentRunner.SetupWorkDir, which excludes only the top-level eval.yaml. + // AgentRunner.SetupWorkDir/CopyDirectory exactly so the hash reflects precisely + // the files the agent is given — no more (e.g. reparse points are skipped) and + // no fewer (nested files are included). 
if (setup.CopyTestFiles && evalPath is not null) { var evalDir = Path.GetDirectoryName(evalPath); if (!string.IsNullOrEmpty(evalDir) && Directory.Exists(evalDir)) { - var files = Directory.EnumerateFiles(evalDir, "*", SearchOption.AllDirectories) - .Select(f => (Rel: Path.GetRelativePath(evalDir, f).Replace('\\', '/'), Full: f)) - .Where(x => !string.Equals(x.Rel, "eval.yaml", StringComparison.Ordinal)) - .OrderBy(x => x.Rel, StringComparer.Ordinal); - foreach (var (rel, full) in files) + foreach (var (rel, full) in EnumerateCopiedFixtures(evalDir).OrderBy(x => x.Rel, StringComparer.Ordinal)) sb.Append("F:").Append(rel).Append('=').Append(HashFile(full)).Append('\n'); } } @@ -224,6 +223,54 @@ private static string ComputeInputsSha(EvalScenario scenario, string? evalPath) return Sha256Hex(Encoding.UTF8.GetBytes(sb.ToString())); } + private static readonly StringComparison PathComparison = + OperatingSystem.IsWindows() ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal; + + /// + /// Yields the exact set of files copies into the + /// work dir under copy_test_files: every top-level sibling except eval.yaml, + /// recursing into directories. Reparse points (symlinks/junctions) and junctions that + /// resolve outside their top-level fixture directory are skipped, mirroring + /// CopyDirectory, so the hash only ever covers files genuinely materialized for the + /// run rather than whatever else happens to live under the eval directory. 
+ /// + private static IEnumerable<(string Rel, string Full)> EnumerateCopiedFixtures(string evalDir) + { + foreach (var entry in new DirectoryInfo(evalDir).EnumerateFileSystemInfos()) + { + if (string.Equals(entry.Name, "eval.yaml", StringComparison.Ordinal)) + continue; + if (entry is FileInfo file) + yield return (file.Name, file.FullName); + else if (entry is DirectoryInfo dir) + { + var root = Path.TrimEndingDirectorySeparator(Path.GetFullPath(dir.FullName)); + foreach (var nested in EnumerateDirFixtures(dir.FullName, dir.Name, root)) + yield return nested; + } + } + } + + private static IEnumerable<(string Rel, string Full)> EnumerateDirFixtures(string dir, string relBase, string sourceRoot) + { + foreach (var entry in new DirectoryInfo(dir).EnumerateFileSystemInfos()) + { + if ((entry.Attributes & FileAttributes.ReparsePoint) != 0) + continue; + var rel = string.Concat(relBase, "/", entry.Name); + if (entry is DirectoryInfo sub) + { + var subFull = Path.TrimEndingDirectorySeparator(Path.GetFullPath(sub.FullName)); + if (!subFull.StartsWith(sourceRoot + Path.DirectorySeparatorChar, PathComparison)) + continue; + foreach (var nested in EnumerateDirFixtures(sub.FullName, rel, sourceRoot)) + yield return nested; + } + else + yield return (rel, entry.FullName); + } + } + /// /// Deterministic textual signature of the evaluation criteria that influence a scenario's /// stored baseline result: run-bounding limits, rubric, assertions, and expect/reject tools. @@ -273,15 +320,24 @@ private static string HashFile(string path) private static string MakeKey(string promptSha, string targetSha) => string.Concat(promptSha, ":", targetSha); /// - /// In reuse mode, return human-readable identifiers (name + eval path) of scenarios that - /// have no matching cached baseline (keyed by prompt + setup/criteria identity). Empty - /// when every scenario is covered. 
Each scenario is paired with the eval.yaml path it - /// originates from so its input artifacts can be fingerprinted and reported unambiguously. + /// In reuse mode, return human-readable identifiers of scenarios that have no matching + /// cached baseline (keyed by prompt + setup/criteria identity). Empty when every scenario + /// is covered. Each entry carries the originating eval path plus short prompt/target SHA + /// prefixes so a missing scenario is actionable even when names collide across eval files. /// public IReadOnlyList FindMissingScenarios(IEnumerable<(EvalScenario Scenario, string? EvalPath)> scenarios) => scenarios - .Where(s => !_entries.ContainsKey(MakeKey(ComputePromptSha(s.Scenario.Prompt), TargetShaFor(s.Scenario, s.EvalPath)))) - .Select(s => s.EvalPath is null ? s.Scenario.Name : $"{s.Scenario.Name} ({s.EvalPath})") + .Select(s => ( + s.Scenario, + s.EvalPath, + PromptSha: ComputePromptSha(s.Scenario.Prompt), + TargetSha: TargetShaFor(s.Scenario, s.EvalPath))) + .Where(x => !_entries.ContainsKey(MakeKey(x.PromptSha, x.TargetSha))) + .Select(x => + { + var where = x.EvalPath is null ? "" : $" in {x.EvalPath}"; + return $"{x.Scenario.Name}{where} [prompt {x.PromptSha[..8]}, target {x.TargetSha[..8]}]"; + }) .ToList(); /// Get the cached averaged baseline for a scenario, or null when absent. @@ -317,7 +373,10 @@ public void Save(string path) if (!string.IsNullOrEmpty(dir)) Directory.CreateDirectory(dir); - File.WriteAllText(path, JsonSerializer.Serialize(file, SkillValidatorJsonContext.Default.BaselineFile)); + // Stream directly to disk so large baselines (many scenarios with full + // RunMetrics/AgentOutput) never materialize as one giant in-memory string. + using var stream = File.Create(path); + JsonSerializer.Serialize(stream, file, SkillValidatorJsonContext.Default.BaselineFile); } /// Number of baselines currently held. 
diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs index d6d41c3d02..7dbd5e249f 100644 --- a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs +++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs @@ -157,7 +157,7 @@ public void FindMissingScenarios_ReturnsScenariosWithoutCachedBaseline() var missing = loaded.FindMissingScenarios([(present, null), (Scenario("beta", "prompt two"), null)]); Assert.Single(missing); - Assert.Equal("beta", missing[0]); + Assert.StartsWith("beta", missing[0]); } finally { @@ -237,6 +237,33 @@ public void ComputeTargetSha_DiffersByEvaluationCriteria() BaselineStore.ComputeTargetSha(baseScenario with { Rubric = ["Did it find the root cause?"] }, null)); } + [Fact] + public void ComputeTargetSha_IncludesNestedFixtureFiles() + { + // copy_test_files copies subdirectories recursively, so nested fixture content + // must participate in the target identity (mirrors AgentRunner.CopyDirectory). 
+ var evalPath = MakeEvalDirWithFixture("top.txt", "top"); + var evalDir = Path.GetDirectoryName(evalPath)!; + var nestedDir = Path.Combine(evalDir, "sub"); + Directory.CreateDirectory(nestedDir); + var nestedFile = Path.Combine(nestedDir, "data.bin"); + File.WriteAllText(nestedFile, "v1"); + try + { + var scenario = FixtureScenario("s", "investigate"); + var before = BaselineStore.ComputeTargetSha(scenario, evalPath); + + File.WriteAllText(nestedFile, "v2"); + var after = BaselineStore.ComputeTargetSha(scenario, evalPath); + + Assert.NotEqual(before, after); // nested file change invalidates reuse + } + finally + { + Directory.Delete(evalDir, recursive: true); + } + } + [Fact] public void SamePromptDifferentFixture_DoesNotReuseBaseline() { From c9f0f44e9eeea00f08be73f971b690e704d3188e Mon Sep 17 00:00:00 2001 From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:08:39 +0200 Subject: [PATCH 5/7] Address second round of PR review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Skip top-level reparse points in EnumerateCopiedFixtures (not just nested ones) so a top-level symlink/junction can't cause hashing of data outside the eval directory; code now matches the docstring. - Record uses first-writer-wins (TryAdd) instead of overwriting, so a scenario identity recorded by multiple parallel targets yields a deterministic --baseline-out regardless of completion order. - Persist the baseline judge result to the session DB even when the baseline is reused, so the registered 'baseline-reused' session record is complete for downstream investigation tooling (pairwise was already saved; the judge result was incorrectly gated on a fresh run). - Add first-writer-wins test (564 pass). Note: BaselineStore stays internal — the test project already has InternalsVisibleTo, so it compiles. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/Evaluate/BaselineStore.cs | 20 +++++++++----- .../src/Evaluate/EvaluateCommand.cs | 6 +++-- .../tests/Evaluate/BaselineStoreTests.cs | 26 +++++++++++++++++++ 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs index 3a060e13e4..1a93250087 100644 --- a/eng/skill-validator/src/Evaluate/BaselineStore.cs +++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs @@ -229,10 +229,10 @@ private static string ComputeInputsSha(EvalScenario scenario, string? evalPath) /// /// Yields the exact set of files copies into the /// work dir under copy_test_files: every top-level sibling except eval.yaml, - /// recursing into directories. Reparse points (symlinks/junctions) and junctions that - /// resolve outside their top-level fixture directory are skipped, mirroring - /// CopyDirectory, so the hash only ever covers files genuinely materialized for the - /// run rather than whatever else happens to live under the eval directory. + /// recursing into directories. Reparse points (symlinks/junctions) — at the top level and + /// nested — and junctions that resolve outside their top-level fixture directory are + /// skipped, so the hash only ever covers files genuinely materialized for the run rather + /// than data linked from outside the eval directory. /// private static IEnumerable<(string Rel, string Full)> EnumerateCopiedFixtures(string evalDir) { @@ -240,6 +240,8 @@ private static string ComputeInputsSha(EvalScenario scenario, string? 
evalPath) { if (string.Equals(entry.Name, "eval.yaml", StringComparison.Ordinal)) continue; + if ((entry.Attributes & FileAttributes.ReparsePoint) != 0) + continue; if (entry is FileInfo file) yield return (file.Name, file.FullName); else if (entry is DirectoryInfo dir) @@ -346,12 +348,18 @@ public IReadOnlyList FindMissingScenarios(IEnumerable<(EvalScenario Scen ? entry.Baseline : null; - /// Record a scenario's averaged baseline for later persistence (write mode). + /// + /// Record a scenario's averaged baseline for later persistence (write mode). The + /// baseline arm is target-independent, so when several targets evaluated in parallel + /// share the same scenario identity they produce the same key; a first-writer-wins + /// strategy keeps the persisted file deterministic regardless of completion order + /// (later identical-key records — differing only by run-to-run noise — are ignored). + /// public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline, string? evalPath = null) { var promptSha = ComputePromptSha(scenario.Prompt); var targetSha = TargetShaFor(scenario, evalPath); - _entries[MakeKey(promptSha, targetSha)] = new BaselineScenarioEntry(scenario.Name, promptSha, targetSha, runs, averagedBaseline); + _entries.TryAdd(MakeKey(promptSha, targetSha), new BaselineScenarioEntry(scenario.Name, promptSha, targetSha, runs, averagedBaseline)); } /// Serialize all recorded baselines to . 
diff --git a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs index bce797313b..1e8348b3d3 100644 --- a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs +++ b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs @@ -1434,8 +1434,10 @@ private static async Task ExecuteRun( if (sessionDb is not null) { - if (reusedBaseline is null) - sessionDb.SaveJudgeResult(baselineSessionId, JsonSerializer.Serialize(baselineJudge, SkillValidatorJsonContext.Default.JudgeResult)); + // Persist the baseline judge result even when reused so the baseline session + // record (registered with the "baseline-reused" phase) is complete for + // downstream investigation tooling — baselineJudge is valid in both cases. + sessionDb.SaveJudgeResult(baselineSessionId, JsonSerializer.Serialize(baselineJudge, SkillValidatorJsonContext.Default.JudgeResult)); sessionDb.SaveJudgeResult(isolatedSessionId, JsonSerializer.Serialize(isolatedJudge, SkillValidatorJsonContext.Default.JudgeResult)); sessionDb.SaveJudgeResult(pluginSessionId, JsonSerializer.Serialize(pluginJudge, SkillValidatorJsonContext.Default.JudgeResult)); } diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs index 7dbd5e249f..57d295a880 100644 --- a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs +++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs @@ -237,6 +237,32 @@ public void ComputeTargetSha_DiffersByEvaluationCriteria() BaselineStore.ComputeTargetSha(baseScenario with { Rubric = ["Did it find the root cause?"] }, null)); } + [Fact] + public void Record_IsFirstWriterWins_ForSameScenarioIdentity() + { + var path = TempPath(); + try + { + var store = BaselineStore.ForWrite(Model, Judge); + var scenario = Scenario("alpha", "prompt one"); + + // Same identity recorded twice (e.g. 
two parallel targets sharing a scenario) + // with differing run-to-run results: the first record must win so --baseline-out + // is deterministic regardless of completion order. + store.Record(scenario, runs: 5, MakeBaseline(output: "first")); + store.Record(scenario, runs: 5, MakeBaseline(output: "second")); + + Assert.Equal(1, store.Count); + store.Save(path); + var loaded = BaselineStore.Load(path, Model, Judge); + Assert.Equal("first", loaded.TryGetBaseline(scenario)!.Metrics.AgentOutput); + } + finally + { + File.Delete(path); + } + } + [Fact] public void ComputeTargetSha_IncludesNestedFixtureFiles() { From 7b76143d0e60a4d4a7c4fd7a42cab72fbbda437b Mon Sep 17 00:00:00 2001 From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com> Date: Fri, 12 Jun 2026 12:11:47 +0200 Subject: [PATCH 6/7] Pin MessagePack to 2.5.301 to fix NU1903 vulnerability The transitive MessagePack 2.5.198 (via GitHub.Copilot.SDK -> StreamJsonRpc) has a high-severity vulnerability (GHSA-hv8m-jj95-wg3x) that fails the build under TreatWarningsAsErrors. Pin a direct reference to the patched 2.5.301. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- eng/skill-validator/src/SkillValidator.csproj | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/eng/skill-validator/src/SkillValidator.csproj b/eng/skill-validator/src/SkillValidator.csproj index 77b51e897e..ca7037ad99 100644 --- a/eng/skill-validator/src/SkillValidator.csproj +++ b/eng/skill-validator/src/SkillValidator.csproj @@ -52,6 +52,14 @@ Fixed in 1.1.62. Drop this once the upstream chain bumps past 1.1.62. 
--> + + + From 79ffa9c33a8b48b07e9a9bd7a11a932112c6092e Mon Sep 17 00:00:00 2001 From: YuliiaKovalova <95473390+YuliiaKovalova@users.noreply.github.com> Date: Fri, 12 Jun 2026 13:19:58 +0200 Subject: [PATCH 7/7] Address review: bare-filename fixture hashing, clone reused baseline, consistent token attribution - ComputeInputsSha: normalize evalPath via Path.GetFullPath so a bare filename still hashes sibling fixtures (avoids TargetSha collisions / unsafe reuse). - RunMetrics.Clone(): per-run copy with fresh collections; reuse paths now clone the cached baseline so concurrent evaluations never share a mutable instance. - Pairwise judge tokens attributed to both compared runs in every mode (the baseline clone makes this safe), keeping token deltas comparable across --baseline-from modes. - Reword Record first-writer-wins doc to describe the within-run stabilization guarantee rather than order-independence. - Add tests for bare-filename fixture hashing and clone isolation (566 pass). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/Evaluate/BaselineStore.cs | 17 +++++-- .../src/Evaluate/EvaluateCommand.cs | 20 ++++---- eng/skill-validator/src/Evaluate/Models.cs | 31 ++++++++++++ .../tests/Evaluate/BaselineStoreTests.cs | 48 +++++++++++++++++++ 4 files changed, 104 insertions(+), 12 deletions(-) diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs index 1a93250087..bca17bbe1f 100644 --- a/eng/skill-validator/src/Evaluate/BaselineStore.cs +++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs @@ -188,7 +188,12 @@ private static string ComputeInputsSha(EvalScenario scenario, string? evalPath) // no fewer (nested files are included). if (setup.CopyTestFiles && evalPath is not null) { - var evalDir = Path.GetDirectoryName(evalPath); + // Normalize first: Path.GetDirectoryName returns "" for a bare filename + // (e.g. 
"eval.yaml" in the cwd), which would silently skip fixture hashing + // even though copy_test_files still copies the sibling files — risking + // TargetSha collisions and unsafe baseline reuse. GetFullPath resolves the + // bare name against the current directory so its real parent is hashed. + var evalDir = Path.GetDirectoryName(Path.GetFullPath(evalPath)); if (!string.IsNullOrEmpty(evalDir) && Directory.Exists(evalDir)) { foreach (var (rel, full) in EnumerateCopiedFixtures(evalDir).OrderBy(x => x.Rel, StringComparer.Ordinal)) @@ -351,9 +356,13 @@ public IReadOnlyList FindMissingScenarios(IEnumerable<(EvalScenario Scen /// /// Record a scenario's averaged baseline for later persistence (write mode). The /// baseline arm is target-independent, so when several targets evaluated in parallel - /// share the same scenario identity they produce the same key; a first-writer-wins - /// strategy keeps the persisted file deterministic regardless of completion order - /// (later identical-key records — differing only by run-to-run noise — are ignored). + /// share the same scenario identity they produce the same key. A first-writer-wins + /// strategy stabilizes the baseline chosen within a single run: once a value is + /// recorded for a key the first writer's value is kept and later records for that key + /// are ignored, preventing non-deterministic late overwrites under parallelism. The + /// competing records differ only by run-to-run noise, so which writer wins the race is + /// immaterial — the guarantee is that the persisted value is not clobbered afterward, + /// not that it is independent of thread scheduling. /// public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline, string? 
evalPath = null) { diff --git a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs index 1e8348b3d3..cada00d47e 100644 --- a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs +++ b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs @@ -806,7 +806,7 @@ private static async Task ExecuteAgentRun( { if (config.Verbose) runLog("↩︎ reusing precomputed baseline"); - baselineMetrics = reusedBaseline.Metrics; + baselineMetrics = reusedBaseline.Metrics.Clone(); var skilled = await Task.WhenAll(isolatedTask, pluginTask); isolatedMetrics = skilled[0]; pluginMetrics = skilled[1]; @@ -908,8 +908,11 @@ private static async Task ExecuteAgentRun( new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, pairwiseWorkDir, agent.Path, worseSkilled.WorkDir), runLog, cancellationToken); pairwise = pairwiseResult; - if (reusedBaseline is null) - AccumulateJudgeTokens(baselineMetrics, pairwiseTokens); + // Attribute pairwise judge tokens consistently to both compared runs in + // every mode so token deltas stay comparable regardless of --baseline-from. + // baselineMetrics is a per-run clone when reused, so this never mutates the + // shared cached baseline. 
+ AccumulateJudgeTokens(baselineMetrics, pairwiseTokens); AccumulateJudgeTokens(worseSkilled, pairwiseTokens); } catch (Exception error) @@ -1344,7 +1347,7 @@ private static async Task ExecuteRun( { if (config.Verbose) runLog("↩︎ reusing precomputed baseline"); - baselineMetrics = reusedBaseline.Metrics; + baselineMetrics = reusedBaseline.Metrics.Clone(); var skilled = await Task.WhenAll(isolatedTask, pluginTask); isolatedMetrics = skilled[0]; pluginMetrics = skilled[1]; @@ -1466,10 +1469,11 @@ private static async Task ExecuteRun( new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, pairwiseWorkDir, skill.Path, worseSkilled.WorkDir), runLog, cancellationToken); pairwise = pairwiseResult; - // Attribute pairwise judge tokens to the compared run (and to the baseline - // only when it was freshly executed, to avoid double-counting reused cost). - if (reusedBaseline is null) - AccumulateJudgeTokens(baselineMetrics, pairwiseTokens); + // Attribute pairwise judge tokens consistently to both compared runs in + // every mode so token deltas stay comparable regardless of --baseline-from. + // baselineMetrics is a per-run clone when reused, so this never mutates the + // shared cached baseline. + AccumulateJudgeTokens(baselineMetrics, pairwiseTokens); AccumulateJudgeTokens(worseSkilled, pairwiseTokens); if (sessionDb is not null && pairwise is not null) { diff --git a/eng/skill-validator/src/Evaluate/Models.cs b/eng/skill-validator/src/Evaluate/Models.cs index 00508a9cf8..da6fe75a85 100644 --- a/eng/skill-validator/src/Evaluate/Models.cs +++ b/eng/skill-validator/src/Evaluate/Models.cs @@ -186,6 +186,37 @@ public sealed class RunMetrics public string AgentOutput { get; set; } = ""; public List Events { get; set; } = []; public string WorkDir { get; set; } = ""; + + /// + /// Creates a per-run copy. Scalar fields are copied by value and the mutable + /// collections are re-wrapped in fresh instances so mutating the clone (e.g. 
+ /// accumulating judge tokens) never affects the source. This is essential when a + /// cached baseline is reused concurrently across parallel target evaluations: each + /// evaluation works on its own copy instead of sharing one mutable instance. + /// + public RunMetrics Clone() => new() + { + TokenEstimate = TokenEstimate, + InputTokens = InputTokens, + OutputTokens = OutputTokens, + CacheReadTokens = CacheReadTokens, + CacheWriteTokens = CacheWriteTokens, + JudgeInputTokens = JudgeInputTokens, + JudgeOutputTokens = JudgeOutputTokens, + JudgeCacheReadTokens = JudgeCacheReadTokens, + JudgeCacheWriteTokens = JudgeCacheWriteTokens, + ToolCallCount = ToolCallCount, + ToolCallBreakdown = new Dictionary(ToolCallBreakdown), + TurnCount = TurnCount, + WallTimeMs = WallTimeMs, + ErrorCount = ErrorCount, + TimedOut = TimedOut, + AssertionResults = [.. AssertionResults], + TaskCompleted = TaskCompleted, + AgentOutput = AgentOutput, + Events = [.. Events], + WorkDir = WorkDir, + }; } public sealed record RunResult( diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs index 57d295a880..e364ab7c07 100644 --- a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs +++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs @@ -290,6 +290,54 @@ public void ComputeTargetSha_IncludesNestedFixtureFiles() } } + [Fact] + public void ComputeTargetSha_HashesFixtures_WhenEvalPathIsBareFilename() + { + // A bare filename (no directory component) must still hash sibling fixtures: + // Path.GetDirectoryName returns "" for "eval.yaml", so without normalization + // fixture hashing is silently skipped and distinct fixtures collide. 
+ var evalPath = MakeEvalDirWithFixture("build.binlog", "AAAA"); + var evalDir = Path.GetDirectoryName(evalPath)!; + var originalCwd = Directory.GetCurrentDirectory(); + try + { + Directory.SetCurrentDirectory(evalDir); + var scenario = FixtureScenario("s", "investigate build.binlog"); + + var shaA = BaselineStore.ComputeTargetSha(scenario, "eval.yaml"); + File.WriteAllText(Path.Combine(evalDir, "build.binlog"), "BBBB"); + var shaB = BaselineStore.ComputeTargetSha(scenario, "eval.yaml"); + + Assert.NotEqual(shaA, shaB); // fixture content participates in identity + } + finally + { + Directory.SetCurrentDirectory(originalCwd); + Directory.Delete(evalDir, recursive: true); + } + } + + [Fact] + public void Clone_ProducesIndependentCopy() + { + var source = MakeBaseline(output: "src").Metrics; + source.JudgeInputTokens = 10; + source.ToolCallBreakdown["bash"] = 4; + + var clone = source.Clone(); + clone.JudgeInputTokens = 99; + clone.ToolCallBreakdown["bash"] = 1; + clone.AssertionResults.Add(new AssertionResult(new Assertion(AssertionType.OutputContains, Value: "x"), true, "")); + + // Mutating the clone must not leak back into the source — the cached baseline + // can be reused concurrently across parallel target evaluations. + Assert.Equal(10, source.JudgeInputTokens); + Assert.Equal(4, source.ToolCallBreakdown["bash"]); + Assert.Empty(source.AssertionResults); + Assert.NotSame(source.ToolCallBreakdown, clone.ToolCallBreakdown); + Assert.NotSame(source.AssertionResults, clone.AssertionResults); + } + [Fact] public void SamePromptDifferentFixture_DoesNotReuseBaseline() {