diff --git a/eng/skill-validator/src/Evaluate/BaselineStore.cs b/eng/skill-validator/src/Evaluate/BaselineStore.cs
new file mode 100644
index 0000000000..bca17bbe1f
--- /dev/null
+++ b/eng/skill-validator/src/Evaluate/BaselineStore.cs
@@ -0,0 +1,401 @@
+using System.Collections.Concurrent;
+using System.Security.Cryptography;
+using System.Text;
+using System.Text.Json;
+
+namespace SkillValidator.Evaluate;
+
+/// <summary>
+/// One scenario's precomputed baseline, keyed by the SHA-256 of its prompt
+/// (<see cref="PromptSha"/>) and the SHA-256 of its setup/fixture inputs and criteria
+/// (<see cref="TargetSha"/>). Both must match for a baseline to be reused, so two
+/// scenarios that share a prompt but feed the agent different input artifacts
+/// (e.g. different <c>build.binlog</c> fixtures) never collide.
+/// <see cref="Runs"/> records how many baseline runs were averaged into
+/// <see cref="Baseline"/> so reuse can report the robustness of the reference.
+/// </summary>
+public sealed record BaselineScenarioEntry(
+ string Name,
+ string PromptSha,
+ string TargetSha,
+ int Runs,
+ RunResult Baseline);
+
+/// <summary>
+/// On-disk format written by <c>--baseline-out</c> and read by <c>--baseline-from</c>.
+/// The baseline arm of <c>evaluate</c> is plain-agent with no skill/MCP attached, so it
+/// is independent of the target under test and can be computed once and shared across
+/// many invocations. The header records the identity needed to reject a stale reuse:
+/// the agent <see cref="Model"/> and the <see cref="JudgeModel"/> that produced the
+/// cached judge scores.
+/// </summary>
+public sealed record BaselineFile(
+    int Version,
+    string Model,
+    string JudgeModel,
+    string? ValidatorVersion,
+    string CreatedAt,
+    IReadOnlyList<BaselineScenarioEntry> Scenarios);
+
+/// <summary>
+/// Manages a precomputed, shared baseline across <c>evaluate</c> invocations.
+/// In write mode (<c>--baseline-out</c>) it accumulates each scenario's averaged
+/// baseline for later persistence. In reuse mode (<c>--baseline-from</c>) it serves
+/// cached baselines in place of freshly executed baseline runs.
+/// </summary>
+internal sealed class BaselineStore
+{
+ /// <summary>Current on-disk schema version.</summary>
+ public const int CurrentVersion = 2;
+
+ // Baselines keyed by MakeKey(promptSha, targetSha); ordinal comparison keeps the hex keys exact.
+ private readonly ConcurrentDictionary<string, BaselineScenarioEntry> _entries = new(StringComparer.Ordinal);
+ // Memoizes the (expensive, file-I/O-bound) hashing of materialized input artifacts; instance-scoped,
+ // so it can never serve a stale hash from a different evaluation or leak between tests.
+ private readonly ConcurrentDictionary<string, string> _inputsShaCache = new(StringComparer.Ordinal);
+ private readonly string _model;
+ private readonly string _judgeModel;
+
+ /// <summary>True when serving cached baselines (<c>--baseline-from</c>).</summary>
+ public bool IsReuse { get; }
+
+ private BaselineStore(string model, string judgeModel, bool isReuse)
+ {
+     _model = model;
+     _judgeModel = judgeModel;
+     IsReuse = isReuse;
+ }
+
+ /// <summary>Create a store that accumulates baselines for later persistence.</summary>
+ public static BaselineStore ForWrite(string model, string judgeModel) => new(model, judgeModel, isReuse: false);
+
+ /// <summary>
+ /// Load a baseline file for reuse. Validates the schema version and that both the
+ /// agent model and judge model match, throwing on mismatch so a stale or wrong
+ /// baseline can never silently skew results. Per-scenario identity (prompt + setup
+ /// inputs + evaluation criteria) is validated later via <see cref="FindMissingScenarios"/>.
+ /// </summary>
+ /// <exception cref="FileNotFoundException"><paramref name="path"/> does not exist.</exception>
+ /// <exception cref="InvalidOperationException">The file is unreadable, not valid JSON, empty, or was produced for another version, model, or judge model.</exception>
+ public static BaselineStore Load(string path, string expectedModel, string expectedJudgeModel)
+ {
+     if (!File.Exists(path))
+         throw new FileNotFoundException($"Baseline file not found: {path}");
+     BaselineFile? file;
+     try
+     {
+         using var stream = File.OpenRead(path);
+         file = JsonSerializer.Deserialize(stream, SkillValidatorJsonContext.Default.BaselineFile);
+     }
+     catch (Exception ex) when (ex is JsonException or UnauthorizedAccessException or (IOException and not FileNotFoundException))
+     {
+         // Access/sharing failures are reported like malformed JSON so callers need no extra catch clause.
+         var problem = ex is JsonException ? "is not valid JSON" : "could not be read";
+         throw new InvalidOperationException($"Baseline file '{path}' {problem}: {ex.Message}", ex);
+     }
+
+     if (file is null)
+         throw new InvalidOperationException($"Baseline file '{path}' is empty.");
+     if (file.Version != CurrentVersion)
+         throw new InvalidOperationException(
+             $"Baseline file '{path}' has unsupported version {file.Version} (expected {CurrentVersion}). Recompute it with --baseline-out.");
+     if (!string.Equals(file.Model, expectedModel, StringComparison.Ordinal))
+         throw new InvalidOperationException(
+             $"Baseline file '{path}' was computed for model '{file.Model}' but evaluation uses model '{expectedModel}'. " +
+             "Recompute the baseline with --baseline-out for the new model.");
+     if (!string.Equals(file.JudgeModel, expectedJudgeModel, StringComparison.Ordinal))
+         throw new InvalidOperationException(
+             $"Baseline file '{path}' was judged with model '{file.JudgeModel}' but evaluation uses judge model '{expectedJudgeModel}'. " +
+             "Recompute the baseline with --baseline-out for the new judge model.");
+
+     var store = new BaselineStore(expectedModel, expectedJudgeModel, isReuse: true);
+     foreach (var entry in file.Scenarios ?? [])
+         if (entry?.Baseline is not null) store._entries[MakeKey(entry.PromptSha, entry.TargetSha)] = entry;
+     return store;
+ }
+
+ /// <summary>Lower-case hexadecimal SHA-256 digest of the scenario prompt's UTF-8 bytes.</summary>
+ public static string ComputePromptSha(string prompt) => HexDigest(SHA256.HashData(Encoding.UTF8.GetBytes(prompt)));
+
+ /// <summary>
+ /// SHA-256 (lower-case hex) identifying everything (besides the prompt and model) that
+ /// determines a scenario's cached baseline — the analog of the issue's <c>targetSha</c>.
+ /// It folds in:
+ /// <list type="bullet">
+ /// <item>the materialized input artifacts the agent is given (files auto-copied via
+ /// <c>copy_test_files</c>, explicit setup files' content/sources, and the setup command recipe);</item>
+ /// <item>the evaluation criteria that shape the stored result (rubric, assertions,
+ /// expect/reject tools, and the turn/token/timeout limits that bound the baseline run).</item>
+ /// </list>
+ /// Two scenarios that share a prompt but differ in fixtures (e.g. a different
+ /// <c>build.binlog</c>) or in rubric/assertions therefore resolve to distinct keys.
+ /// Setup commands are hashed by their text (the recipe), not the artifacts they
+ /// generate; reuse therefore assumes setup commands are deterministic/hermetic.
+ /// </summary>
+ public static string ComputeTargetSha(EvalScenario scenario, string? evalPath)
+ {
+     var inputsSha = ComputeInputsSha(scenario, evalPath);
+     return CombineIdentity(inputsSha, scenario);
+ }
+
+ // Memoizing counterpart of ComputeTargetSha: the costly input hash is cached per store, while the
+ // cheap criteria are re-combined on every call, so both methods return the same value.
+ private string TargetShaFor(EvalScenario scenario, string? evalPath)
+ {
+     var cacheKey = BuildInputsCacheKey(scenario, evalPath);
+     return CombineIdentity(_inputsShaCache.GetOrAdd(cacheKey, _ => ComputeInputsSha(scenario, evalPath)), scenario);
+ }
+
+ private static string CombineIdentity(string inputsSha, EvalScenario scenario) => // inputs hash, NUL-delimited tag, criteria text
+     Sha256Hex(Encoding.UTF8.GetBytes(inputsSha + "\0criteria\0" + CriteriaString(scenario)));
+
+ /// <summary>
+ /// Cheap, file-I/O-free key used to memoize the input-artifact hash within this store. It folds in
+ /// the eval path, the copy flag, each explicit setup file (path, source, inline-content hash) and the
+ /// command list; auto-copied file contents are left out because the directory + copy flag determine
+ /// them. Evaluation criteria are excluded because <see cref="TargetShaFor"/> combines them after the lookup.
+ /// </summary>
+ private static string BuildInputsCacheKey(EvalScenario scenario, string? evalPath)
+ {
+     var key = new StringBuilder();
+     key.Append(evalPath ?? "").Append('\0');
+     if (scenario.Setup is not { } setup)
+         return key.Append("none").ToString();
+     key.Append("copy=").Append(setup.CopyTestFiles).Append('\0');
+     if (setup.Files is { } setupFiles)
+         foreach (var file in setupFiles)
+         {
+             var contentSha = file.Content is null ? "" : Sha256Hex(Encoding.UTF8.GetBytes(file.Content));
+             key.Append("f=").Append(file.Path).Append('|').Append(file.Source ?? "").Append('|').Append(contentSha).Append('\0');
+         }
+     if (setup.Commands is { } setupCommands)
+         foreach (var command in setupCommands)
+             key.Append("c=").Append(command).Append('\0');
+     return key.ToString();
+ }
+
+ private static string ComputeInputsSha(EvalScenario scenario, string? evalPath)
+ {
+ var setup = scenario.Setup;
+ if (setup is null)
+ return Sha256Hex(Encoding.UTF8.GetBytes("\0no-setup\0"));
+ // Builds one tagged line per input (F: copied fixture, E: explicit setup file, C: setup command), then hashes the text.
+ var sb = new StringBuilder();
+
+ // 1. Sibling files auto-copied into the work dir (copy_test_files: true). Mirror
+ // AgentRunner.SetupWorkDir/CopyDirectory exactly so the hash reflects precisely
+ // the files the agent is given — no more (e.g. reparse points are skipped) and
+ // no fewer (nested files are included).
+ if (setup.CopyTestFiles && evalPath is not null)
+ {
+ // Normalize first: Path.GetDirectoryName returns "" for a bare filename
+ // (e.g. "eval.yaml" in the cwd), which would silently skip fixture hashing
+ // even though copy_test_files still copies the sibling files — risking
+ // TargetSha collisions and unsafe baseline reuse. GetFullPath resolves the
+ // bare name against the current directory so its real parent is hashed.
+ var evalDir = Path.GetDirectoryName(Path.GetFullPath(evalPath));
+ if (!string.IsNullOrEmpty(evalDir) && Directory.Exists(evalDir))
+ {
+ foreach (var (rel, full) in EnumerateCopiedFixtures(evalDir).OrderBy(x => x.Rel, StringComparer.Ordinal))
+ sb.Append("F:").Append(rel).Append('=').Append(HashFile(full)).Append('\n');
+ }
+ }
+
+ // 2. Explicit setup files — inline content or a copied source, ordered by path so declaration order is irrelevant.
+ if (setup.Files is { } setupFiles)
+ {
+ foreach (var f in setupFiles.OrderBy(f => f.Path, StringComparer.Ordinal))
+ {
+ sb.Append("E:").Append(f.Path.Replace('\\', '/')).Append('=');
+ if (f.Content is not null)
+ sb.Append("c:").Append(Sha256Hex(Encoding.UTF8.GetBytes(f.Content)));
+ else if (f.Source is not null)
+ {
+ var resolved = AgentRunner.ResolveSourcePath(f.Source, evalPath, skillPath: null);
+ sb.Append("s:").Append(resolved is not null && File.Exists(resolved) ? HashFile(resolved) : "missing"); // unresolved or absent sources hash as the literal "missing"
+ }
+ sb.Append('\n');
+ }
+ }
+
+ // 3. Setup commands define part of the input recipe (e.g. building a binlog); kept in declared order.
+ if (setup.Commands is { } commands)
+ {
+ foreach (var c in commands)
+ sb.Append("C:").Append(c).Append('\n');
+ }
+
+ return Sha256Hex(Encoding.UTF8.GetBytes(sb.ToString()));
+ }
+
+ private static readonly StringComparison PathComparison = // path prefix checks ignore case on Windows only
+     !OperatingSystem.IsWindows() ? StringComparison.Ordinal : StringComparison.OrdinalIgnoreCase;
+
+ /// <summary>
+ /// Yields the exact set of files <c>AgentRunner.SetupWorkDir</c> copies into the
+ /// work dir under <c>copy_test_files</c>: every top-level sibling except <c>eval.yaml</c>,
+ /// recursing into directories. Reparse points (symlinks/junctions) — at the top level and
+ /// nested — and junctions that resolve outside their top-level fixture directory are
+ /// skipped, so the hash only ever covers files genuinely materialized for the run rather
+ /// than data linked from outside the eval directory.
+ /// </summary>
+ private static IEnumerable<(string Rel, string Full)> EnumerateCopiedFixtures(string evalDir)
+ {
+ foreach (var entry in new DirectoryInfo(evalDir).EnumerateFileSystemInfos())
+ {
+ if (string.Equals(entry.Name, "eval.yaml", StringComparison.Ordinal))
+ continue;
+ if ((entry.Attributes & FileAttributes.ReparsePoint) != 0)
+ continue;
+ if (entry is FileInfo file)
+ yield return (file.Name, file.FullName);
+ else if (entry is DirectoryInfo dir)
+ {
+ var root = Path.TrimEndingDirectorySeparator(Path.GetFullPath(dir.FullName)); // containment root for everything nested under this fixture directory
+ foreach (var nested in EnumerateDirFixtures(dir.FullName, dir.Name, root))
+ yield return nested;
+ }
+ }
+ }
+
+ // Recursive helper for EnumerateCopiedFixtures: yields (relative path joined with '/', full path) for every
+ // non-directory entry under dir, skipping reparse points and subdirectories not located under sourceRoot.
+ private static IEnumerable<(string Rel, string Full)> EnumerateDirFixtures(string dir, string relBase, string sourceRoot)
+ {
+     foreach (var child in new DirectoryInfo(dir).EnumerateFileSystemInfos())
+     {
+         if ((child.Attributes & FileAttributes.ReparsePoint) == FileAttributes.ReparsePoint) continue;
+         var childRel = $"{relBase}/{child.Name}";
+         if (child is not DirectoryInfo subDir)
+         {
+             yield return (childRel, child.FullName);
+             continue;
+         }
+         var resolved = Path.TrimEndingDirectorySeparator(Path.GetFullPath(subDir.FullName));
+         if (!resolved.StartsWith(sourceRoot + Path.DirectorySeparatorChar, PathComparison)) continue;
+         foreach (var descendant in EnumerateDirFixtures(subDir.FullName, childRel, sourceRoot))
+             yield return descendant;
+     }
+ }
+
+ /// <summary>
+ /// Deterministic textual signature of the evaluation criteria that influence a scenario's
+ /// stored baseline result: run-bounding limits, rubric, assertions, and expect/reject tools. Rubric and assertion order is kept; tools are sorted first.
+ /// </summary>
+ private static string CriteriaString(EvalScenario scenario)
+ {
+ var sb = new StringBuilder();
+ sb.Append("turns=").Append(scenario.MaxTurns?.ToString() ?? "").Append('\0');
+ sb.Append("tokens=").Append(scenario.MaxTokens?.ToString() ?? "").Append('\0');
+ sb.Append("timeout=").Append(scenario.Timeout).Append('\0'); // NOTE(review): numeric Append/ToString format with the current culture — confirm these limits are integral
+ if (scenario.Rubric is { } rubric)
+ foreach (var r in rubric)
+ sb.Append("R:").Append(r).Append('\n');
+ if (scenario.ExpectTools is { } expect)
+ foreach (var t in expect.OrderBy(x => x, StringComparer.Ordinal))
+ sb.Append("XT:").Append(t).Append('\0');
+ if (scenario.RejectTools is { } reject)
+ foreach (var t in reject.OrderBy(x => x, StringComparer.Ordinal))
+ sb.Append("RT:").Append(t).Append('\0');
+ if (scenario.Assertions is { } assertions)
+ foreach (var a in assertions)
+ {
+ sb.Append("A:").Append(a.Type).Append('|').Append(a.Path ?? "").Append('|')
+ .Append(a.Value ?? "").Append('|').Append(a.Pattern ?? "").Append('|');
+ if (a.CommandArgs is { } ca)
+ sb.Append(ca.CommandToRun).Append(';').Append(ca.CommandArguments ?? "").Append(';')
+ .Append(ca.ExpectedExitCode?.ToString() ?? "").Append(';').Append(ca.ExpectedStdOutContains ?? "").Append(';')
+ .Append(ca.ExpectedStdErrorContains ?? "").Append(';').Append(ca.ExpectedStdOutMatches ?? "").Append(';')
+ .Append(ca.ExpectedStdErrorMatches ?? "").Append(';').Append(ca.Timeout?.ToString() ?? "");
+ sb.Append('\n');
+ }
+ return sb.ToString();
+ }
+
+ private static string HashFile(string path)
+ {
+     // Hash straight from the open stream; the using block disposes it on the way out.
+     using (var input = File.OpenRead(path)) { return HexDigest(SHA256.HashData(input)); }
+ }
+
+ /// <summary>SHA-256 of <paramref name="data"/>, rendered as lower-case hex.</summary>
+ private static string Sha256Hex(byte[] data) { var digest = SHA256.HashData(data); return HexDigest(digest); }
+
+ /// <summary>Renders an already-computed digest as lower-case hex, two characters per byte.</summary>
+ private static string HexDigest(byte[] digest) => string.Concat(Array.ConvertAll(digest, b => b.ToString("x2")));
+
+ private static string MakeKey(string promptSha, string targetSha) => $"{promptSha}:{targetSha}"; // dictionary key: "<promptSha>:<targetSha>"
+
+ /// <summary>
+ /// In reuse mode, return human-readable identifiers of scenarios that have no matching
+ /// cached baseline (keyed by prompt + setup/criteria identity). Empty when every scenario
+ /// is covered. Each entry carries the originating eval path plus short prompt/target SHA
+ /// prefixes so a missing scenario is actionable even when names collide across eval files.
+ /// </summary>
+ public IReadOnlyList<string> FindMissingScenarios(IEnumerable<(EvalScenario Scenario, string? EvalPath)> scenarios) =>
+     scenarios
+         .Select(s => (
+             s.Scenario,
+             s.EvalPath,
+             PromptSha: ComputePromptSha(s.Scenario.Prompt),
+             TargetSha: TargetShaFor(s.Scenario, s.EvalPath)))
+         .Where(x => !_entries.ContainsKey(MakeKey(x.PromptSha, x.TargetSha)))
+         .Select(x =>
+         {
+             var where = x.EvalPath is null ? "" : $" in {x.EvalPath}";
+             return $"{x.Scenario.Name}{where} [prompt {x.PromptSha[..8]}, target {x.TargetSha[..8]}]";
+         })
+         .ToList();
+
+ /// <summary>Cached averaged baseline for a scenario, or <see langword="null"/> when absent. NOTE(review): not gated on <see cref="IsReuse"/>, so a write-mode store also returns entries added by <see cref="Record"/> — confirm callers that only want <c>--baseline-from</c> reuse check it first.</summary>
+ public RunResult? TryGetBaseline(EvalScenario scenario, string? evalPath = null)
+ {
+     return _entries.TryGetValue(MakeKey(ComputePromptSha(scenario.Prompt), TargetShaFor(scenario, evalPath)), out var cached) ? cached.Baseline : null;
+ }
+
+ /// <summary>
+ /// Record a scenario's averaged baseline for later persistence (write mode). The
+ /// baseline arm is target-independent, so several targets evaluated in parallel that
+ /// share the same scenario identity produce the same key. The first record for a key
+ /// is kept and every later record for that key is ignored (<c>TryAdd</c>), which
+ /// prevents non-deterministic late overwrites under parallelism. Competing records
+ /// differ only by run-to-run noise, so which writer wins the race is immaterial — the
+ /// guarantee is that a stored value is never clobbered afterward, not that the stored
+ /// value is independent of thread scheduling.
+ /// </summary>
+ /// <param name="runs">Number of baseline runs averaged into <paramref name="averagedBaseline"/>.</param>
+ public void Record(EvalScenario scenario, int runs, RunResult averagedBaseline, string? evalPath = null)
+ {
+     var promptHash = ComputePromptSha(scenario.Prompt);
+     var entry = new BaselineScenarioEntry(scenario.Name, promptHash, TargetShaFor(scenario, evalPath), runs, averagedBaseline);
+     _entries.TryAdd(MakeKey(entry.PromptSha, entry.TargetSha), entry);
+ }
+
+ /// <summary>Serialize all recorded baselines to <paramref name="path"/>, creating its parent directory when needed.</summary>
+ public void Save(string path)
+ {
+     var validatorVersion = typeof(BaselineStore).Assembly.GetName().Version?.ToString();
+     var createdAt = DateTime.UtcNow.ToString("o");
+     // Sort by name, then by both hashes, so the written order does not depend on dictionary enumeration order.
+     var ordered = _entries.Values
+         .OrderBy(e => e.Name, StringComparer.Ordinal)
+         .ThenBy(e => e.PromptSha, StringComparer.Ordinal)
+         .ThenBy(e => e.TargetSha, StringComparer.Ordinal)
+         .ToList();
+     var payload = new BaselineFile(
+         Version: CurrentVersion, Model: _model, JudgeModel: _judgeModel,
+         ValidatorVersion: validatorVersion, CreatedAt: createdAt, Scenarios: ordered);
+
+     var parent = Path.GetDirectoryName(Path.GetFullPath(path));
+     if (!string.IsNullOrEmpty(parent))
+         Directory.CreateDirectory(parent);
+
+     // Stream directly to disk so large baselines (many scenarios with full
+     // RunMetrics/AgentOutput) never materialize as one giant in-memory string.
+     using var output = File.Create(path);
+     JsonSerializer.Serialize(output, payload, SkillValidatorJsonContext.Default.BaselineFile);
+ }
+
+ /// <summary>Number of baselines currently held.</summary>
+ public int Count { get { return _entries.Count; } }
+}
diff --git a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
index f80987fa3c..cada00d47e 100644
--- a/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
+++ b/eng/skill-validator/src/Evaluate/EvaluateCommand.cs
@@ -33,6 +33,8 @@ public static Command Create()
var noiseSkillsDirOpt = new Option("--noise-skills-dir") { Description = "Directory containing skills to load as noise. Enables the noise test: re-runs scenarios with all noise skills loaded and measures degradation." };
var noiseMaxDegradationOpt = new Option("--noise-max-degradation") { Description = "Maximum acceptable average quality degradation (0-1) in noise test (only positive degradations count)", DefaultValueFactory = _ => 0.2 };
var noiseMaxScenarioDegradationOpt = new Option("--noise-max-scenario-degradation") { Description = "Maximum acceptable quality degradation (0-1) for any single noise-test scenario", DefaultValueFactory = _ => 0.4 };
+ var baselineOutOpt = new Option("--baseline-out") { Description = "After running, persist each scenario's averaged baseline (no-skill/no-agent reference) to this file for later reuse with --baseline-from." };
+ var baselineFromOpt = new Option("--baseline-from") { Description = "Reuse a precomputed baseline from this file instead of re-running the no-skill/no-agent baseline arm. Must match --model, --judge-model, and each scenario's prompt, setup inputs, and evaluation criteria. Mutually exclusive with --baseline-out." };
var command = new Command("evaluate", "Evaluate agent skills via LLM-based testing")
{
@@ -59,6 +61,8 @@ public static Command Create()
noiseSkillsDirOpt,
noiseMaxDegradationOpt,
noiseMaxScenarioDegradationOpt,
+ baselineOutOpt,
+ baselineFromOpt,
};
command.Add(RejudgeCommand.Create());
@@ -110,6 +114,8 @@ public static Command Create()
NoiseSkillsDir = parseResult.GetValue(noiseSkillsDirOpt),
NoiseDegradationLimit = parseResult.GetValue(noiseMaxDegradationOpt),
NoiseMaxScenarioDegradation = parseResult.GetValue(noiseMaxScenarioDegradationOpt),
+ BaselineOut = parseResult.GetValue(baselineOutOpt),
+ BaselineFrom = parseResult.GetValue(baselineFromOpt),
};
return await Run(config, cancellationToken);
@@ -129,6 +135,14 @@ public static Command Create()
public static async Task Run(ValidatorConfig config, CancellationToken cancellationToken = default)
{
+ // --baseline-out and --baseline-from are mutually exclusive: one writes a
+ // shared baseline, the other consumes one.
+ if (config.BaselineOut is not null && config.BaselineFrom is not null)
+ {
+ Console.Error.WriteLine("--baseline-out and --baseline-from cannot be used together.");
+ return 1;
+ }
+
// Validate model early
try
{
@@ -290,6 +304,42 @@ public static async Task Run(ValidatorConfig config, CancellationToken canc
bool usePairwise = config.JudgeMode is JudgeMode.Pairwise or JudgeMode.Both;
bool effectiveKeepSessions = config.KeepSessions && config.ResultsDir is not null;
+ // Set up shared-baseline reuse/persistence.
+ BaselineStore? baselineStore = null;
+ if (config.BaselineFrom is not null)
+ {
+ try
+ {
+ baselineStore = BaselineStore.Load(config.BaselineFrom, config.Model, config.JudgeModel);
+ }
+ catch (Exception ex) when (ex is FileNotFoundException or InvalidOperationException)
+ {
+ Console.Error.WriteLine($"{Ansi.Red}❌ Failed to load baseline from '{config.BaselineFrom}': {ex.Message}{Ansi.Reset}");
+ return 1;
+ }
+
+ // Fail fast if any scenario lacks a matching cached baseline so a stale or
+ // incomplete baseline can never silently skew results.
+ var allScenarios = allTargets
+ .Where(t => t.EvalConfig is not null)
+ .SelectMany(t => t.EvalConfig!.Scenarios.Select(s => (Scenario: s, t.EvalPath)))
+ .ToList();
+ var missing = baselineStore.FindMissingScenarios(allScenarios);
+ if (missing.Count > 0)
+ {
+ Console.Error.WriteLine(
+ $"{Ansi.Red}❌ Baseline file '{config.BaselineFrom}' has no entry for scenario(s): {string.Join(", ", missing.Distinct())}. " +
+ $"Recompute the baseline with --baseline-out for the current tests and model.{Ansi.Reset}");
+ return 1;
+ }
+ Console.WriteLine($"Reusing precomputed baseline from {config.BaselineFrom} ({baselineStore.Count} scenario(s)).");
+ }
+ else if (config.BaselineOut is not null)
+ {
+ baselineStore = BaselineStore.ForWrite(config.Model, config.JudgeModel);
+ Console.WriteLine($"Baseline will be persisted to {config.BaselineOut} after the run.");
+ }
+
string? sessionsDir = null;
SessionDatabase? sessionDb = null;
string? timestampedResultsDir = null;
@@ -314,7 +364,7 @@ public static async Task Run(ValidatorConfig config, CancellationToken canc
// Evaluate all targets (skills and agents)
spinner.Start($"Evaluating {allTargets.Count} target(s)...");
var skillTasks = allTargets.Select(target =>
- skillLimit.RunAsync(() => EvaluateTarget(target, config, usePairwise, spinner, noiseEvalSkills, sessionsDir, sessionDb, cancellationToken), cancellationToken));
+ skillLimit.RunAsync(() => EvaluateTarget(target, config, usePairwise, spinner, noiseEvalSkills, sessionsDir, sessionDb, baselineStore, cancellationToken), cancellationToken));
var settled = await Task.WhenAll(skillTasks.Select(async t =>
{
try { return (Result: await t, Error: (Exception?)null); }
@@ -353,6 +403,28 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
await AgentRunner.CleanupWorkDirs(effectiveKeepSessions);
sessionDb?.Dispose();
+ // Persist the shared baseline for later reuse with --baseline-from.
+ if (config.BaselineOut is not null && baselineStore is not null)
+ {
+ if (baselineStore.Count > 0)
+ {
+ try
+ {
+ baselineStore.Save(config.BaselineOut);
+ Console.WriteLine($"Baseline written to {config.BaselineOut} ({baselineStore.Count} scenario(s)).");
+ }
+ catch (Exception ex)
+ {
+ Console.Error.WriteLine($"{Ansi.Red}❌ Failed to write baseline to '{config.BaselineOut}': {ex.Message}{Ansi.Reset}");
+ return 1;
+ }
+ }
+ else
+ {
+ Console.Error.WriteLine($"{Ansi.Yellow}⚠ No baselines were produced; nothing written to {config.BaselineOut}.{Ansi.Reset}");
+ }
+ }
+
// Always fail on execution errors, even in --verdict-warn-only mode
if (rejectionMessages.Count > 0) return 1;
@@ -380,16 +452,17 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
IReadOnlyList noiseSkills,
string? sessionsDir,
SessionDatabase? sessionDb,
+ BaselineStore? baselineStore,
CancellationToken cancellationToken)
{
if (target.Kind == EvalTargetKind.Skill && target.Skill is not null)
{
var evalSkill = new EvalSkillInfo(target.Skill, target.EvalPath, target.EvalConfig, target.McpServers);
- return await EvaluateSkill(evalSkill, config, usePairwise, spinner, noiseSkills, sessionsDir, sessionDb, cancellationToken);
+ return await EvaluateSkill(evalSkill, config, usePairwise, spinner, noiseSkills, sessionsDir, sessionDb, baselineStore, cancellationToken);
}
else if (target.Kind == EvalTargetKind.Agent && target.Agent is not null)
{
- return await EvaluateAgent(target, config, usePairwise, spinner, sessionsDir, sessionDb, cancellationToken);
+ return await EvaluateAgent(target, config, usePairwise, spinner, sessionsDir, sessionDb, baselineStore, cancellationToken);
}
return null;
}
@@ -405,6 +478,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
Spinner spinner,
string? sessionsDir,
SessionDatabase? sessionDb,
+ BaselineStore? baselineStore,
CancellationToken cancellationToken)
{
var agent = target.Agent!;
@@ -457,7 +531,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
{
try
{
- return await ExecuteAgentScenario(scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, cancellationToken);
+ return await ExecuteAgentScenario(scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, baselineStore, cancellationToken);
}
catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
{
@@ -516,6 +590,7 @@ private static async Task ExecuteAgentScenario(
string? sessionsDir,
SessionDatabase? sessionDb,
string? targetSha,
+ BaselineStore? baselineStore,
CancellationToken cancellationToken)
{
var agent = target.Agent!;
@@ -535,7 +610,7 @@ private static async Task ExecuteAgentScenario(
{
try
{
- return (Result: await ExecuteAgentRun(i, scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, cancellationToken), Error: (Exception?)null);
+ return (Result: await ExecuteAgentRun(i, scenario, target, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, targetSha, baselineStore, cancellationToken), Error: (Exception?)null);
}
catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
{
@@ -580,6 +655,10 @@ private static async Task ExecuteAgentScenario(
var avgIsolated = AverageResults(isolatedRuns);
var avgPlugin = AverageResults(pluginRuns);
+ // Persist the averaged baseline (skill/agent-independent) for shared reuse.
+ if (baselineStore is { IsReuse: false })
+ baselineStore.Record(scenario, runResults.Length, avgBaseline, target.EvalPath);
+
int bestPairwiseIdx = -1;
for (int i = 0; i < perRunPairwise.Count; i++)
{
@@ -669,6 +748,7 @@ private static async Task ExecuteAgentRun(
string? sessionsDir,
SessionDatabase? sessionDb,
string? targetSha,
+ BaselineStore? baselineStore,
CancellationToken cancellationToken)
{
var agent = target.Agent!;
@@ -690,8 +770,12 @@ private static async Task ExecuteAgentRun(
var pluginConfigDir = sessionsDir is not null ? Path.Combine("sessions", pluginSessionId) : null;
var rubricJson = JsonSerializer.Serialize(scenario.Rubric?.ToArray() ?? [], SkillValidatorJsonContext.Default.StringArray);
+ // Reuse a precomputed shared baseline when available (--baseline-from). The
+ // baseline arm is agent-independent, so this skips a redundant agent run.
+ var reusedBaseline = baselineStore?.TryGetBaseline(scenario, target.EvalPath);
+
sessionDb?.RegisterSession(baselineSessionId, agent.Name, agent.Path, scenario.Name, runIndex,
- "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, targetSha, rubricJson);
+ reusedBaseline is not null ? "baseline-reused" : "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, targetSha, rubricJson);
sessionDb?.RegisterSession(isolatedSessionId, agent.Name, agent.Path, scenario.Name, runIndex,
"with-agent-isolated", config.Model, isolatedConfigDir, null, scenario.Prompt, targetSha, rubricJson);
sessionDb?.RegisterSession(pluginSessionId, agent.Name, agent.Path, scenario.Name, runIndex,
@@ -706,25 +790,41 @@ private static async Task ExecuteAgentRun(
additionalAgents = await ResolveAdditionalAgents(scenario.Setup.AdditionalRequiredAgents, pluginRoot);
}
- var agentTasks = await Task.WhenAll(
+ // 2. Agent-isolated: target agent only (+ scenario deps)
+ var isolatedTask = AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
+ PluginRoot: null, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir,
+ SessionId: isolatedSessionId, Agent: agent, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken);
+ // 3. Agent-plugin: full plugin context + agent selected
+ var pluginTask = AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
+ PluginRoot: pluginRoot, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir,
+ SessionId: pluginSessionId, Agent: agent), cancellationToken);
+
+ RunMetrics baselineMetrics;
+ RunMetrics isolatedMetrics;
+ RunMetrics pluginMetrics;
+ if (reusedBaseline is not null)
+ {
+ if (config.Verbose)
+ runLog("↩︎ reusing precomputed baseline");
+ baselineMetrics = reusedBaseline.Metrics.Clone();
+ var skilled = await Task.WhenAll(isolatedTask, pluginTask);
+ isolatedMetrics = skilled[0];
+ pluginMetrics = skilled[1];
+ }
+ else
+ {
// 1. Baseline: no agent, no skills — vanilla
- AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
- PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken),
- // 2. Agent-isolated: target agent only (+ scenario deps)
- AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
- PluginRoot: null, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir,
- SessionId: isolatedSessionId, Agent: agent, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken),
- // 3. Agent-plugin: full plugin context + agent selected
- AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
- PluginRoot: pluginRoot, Log: runLog, McpServers: target.McpServers, SessionsDir: sessionsDir,
- SessionId: pluginSessionId, Agent: agent), cancellationToken));
- var baselineMetrics = agentTasks[0];
- var isolatedMetrics = agentTasks[1];
- var pluginMetrics = agentTasks[2];
+ var baselineTask = AgentRunner.RunAgent(new RunOptions(scenario, null, target.EvalPath, config.Model, config.Verbose,
+ PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken);
+ var all = await Task.WhenAll(baselineTask, isolatedTask, pluginTask);
+ baselineMetrics = all[0];
+ isolatedMetrics = all[1];
+ pluginMetrics = all[2];
+ }
if (sessionDb is not null)
{
- sessionDb.CompleteSession(baselineSessionId, baselineMetrics.TimedOut ? "timed_out" : "completed",
+ sessionDb.CompleteSession(baselineSessionId, reusedBaseline is not null ? "reused" : (baselineMetrics.TimedOut ? "timed_out" : "completed"),
JsonSerializer.Serialize(baselineMetrics, SkillValidatorJsonContext.Default.RunMetrics));
sessionDb.CompleteSession(isolatedSessionId, isolatedMetrics.TimedOut ? "timed_out" : "completed",
JsonSerializer.Serialize(isolatedMetrics, SkillValidatorJsonContext.Default.RunMetrics));
@@ -732,43 +832,58 @@ private static async Task ExecuteAgentRun(
JsonSerializer.Serialize(pluginMetrics, SkillValidatorJsonContext.Default.RunMetrics));
}
- // Assertions, constraints, task completion, judging — same as skills
+ // Assertions, constraints, task completion, judging — same as skills.
+ // Baseline arm is skipped when reused (its results are cached).
if (scenario.Assertions is { Count: > 0 })
{
- baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout);
+ if (reusedBaseline is null)
+ baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout);
isolatedMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, isolatedMetrics.AgentOutput, isolatedMetrics.WorkDir, scenario.Timeout);
pluginMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, pluginMetrics.AgentOutput, pluginMetrics.WorkDir, scenario.Timeout);
}
- var baselineConstraints = AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics);
+ var baselineConstraints = reusedBaseline is null ? AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics) : [];
var isolatedConstraints = AssertionEvaluator.EvaluateConstraints(scenario, isolatedMetrics);
var pluginConstraints = AssertionEvaluator.EvaluateConstraints(scenario, pluginMetrics);
- baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints];
+ if (reusedBaseline is null)
+ baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints];
isolatedMetrics.AssertionResults = [..isolatedMetrics.AssertionResults, ..isolatedConstraints];
pluginMetrics.AssertionResults = [..pluginMetrics.AssertionResults, ..pluginConstraints];
- if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0)
+ if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0 || isolatedConstraints.Count > 0 || pluginConstraints.Count > 0)
{
- baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed);
+ if (reusedBaseline is null)
+ baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed);
isolatedMetrics.TaskCompleted = isolatedMetrics.AssertionResults.All(a => a.Passed);
pluginMetrics.TaskCompleted = pluginMetrics.AssertionResults.All(a => a.Passed);
}
else
{
- baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0;
+ if (reusedBaseline is null)
+ baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0;
isolatedMetrics.TaskCompleted = isolatedMetrics.ErrorCount == 0;
pluginMetrics.TaskCompleted = pluginMetrics.ErrorCount == 0;
}
- var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, agent.Path);
+ var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, isolatedMetrics.WorkDir, agent.Path);
- var (baselineJudge, baselineJudgeTokens) = await SafeJudge(Judge.JudgeRun(scenario, baselineMetrics, judgeOpts, runLog, cancellationToken), "baseline", runLog);
+ JudgeResult baselineJudge;
+ if (reusedBaseline is not null)
+ {
+ baselineJudge = reusedBaseline.JudgeResult;
+ }
+ else
+ {
+ var (judged, baselineJudgeTokens) = await SafeJudge(Judge.JudgeRun(
+ scenario, baselineMetrics, judgeOpts with { WorkDir = baselineMetrics.WorkDir }, runLog, cancellationToken), "baseline", runLog);
+ baselineJudge = judged;
+ AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens);
+ }
var (isolatedJudge, isolatedJudgeTokens) = await SafeJudge(Judge.JudgeRun(
scenario, isolatedMetrics, judgeOpts with { WorkDir = isolatedMetrics.WorkDir }, runLog, cancellationToken), "isolated", runLog);
var (pluginJudge, pluginJudgeTokens) = await SafeJudge(Judge.JudgeRun(
scenario, pluginMetrics, judgeOpts with { WorkDir = pluginMetrics.WorkDir }, runLog, cancellationToken), "plugin", runLog);
- AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens);
AccumulateJudgeTokens(isolatedMetrics, isolatedJudgeTokens);
AccumulateJudgeTokens(pluginMetrics, pluginJudgeTokens);
@@ -785,11 +900,18 @@ private static async Task ExecuteAgentRun(
var worseSkilled = pairwiseFromPlugin ? pluginMetrics : isolatedMetrics;
try
{
+ // Reused baseline work dir no longer exists; run the judge in the skilled
+ // run's work dir (judge reads only the provided metrics text).
+ var pairwiseWorkDir = reusedBaseline is not null ? worseSkilled.WorkDir : baselineMetrics.WorkDir;
var (pairwiseResult, pairwiseTokens) = await PairwiseJudge.Judge(
scenario, baselineMetrics, worseSkilled,
- new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, agent.Path, worseSkilled.WorkDir),
+ new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, pairwiseWorkDir, agent.Path, worseSkilled.WorkDir),
runLog, cancellationToken);
pairwise = pairwiseResult;
+ // Attribute pairwise judge tokens consistently to both compared runs in
+ // every mode so token deltas stay comparable regardless of --baseline-from.
+ // baselineMetrics is a per-run clone when reused, so this never mutates the
+ // shared cached baseline.
AccumulateJudgeTokens(baselineMetrics, pairwiseTokens);
AccumulateJudgeTokens(worseSkilled, pairwiseTokens);
}
@@ -827,6 +949,7 @@ private static async Task ExecuteAgentRun(
IReadOnlyList noiseSkills,
string? sessionsDir,
SessionDatabase? sessionDb,
+ BaselineStore? baselineStore,
CancellationToken cancellationToken)
{
var skill = evalSkill.Skill;
@@ -896,7 +1019,7 @@ private static async Task ExecuteAgentRun(
{
try
{
- return await ExecuteScenario(scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, cancellationToken);
+ return await ExecuteScenario(scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, baselineStore, cancellationToken);
}
catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
{
@@ -989,6 +1112,7 @@ private static async Task ExecuteScenario(
string? sessionsDir,
SessionDatabase? sessionDb,
string? skillSha,
+ BaselineStore? baselineStore,
CancellationToken cancellationToken)
{
var skill = evalSkill.Skill;
@@ -1008,7 +1132,7 @@ private static async Task ExecuteScenario(
{
try
{
- return (Result: await ExecuteRun(i, scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, cancellationToken), Error: (Exception?)null);
+ return (Result: await ExecuteRun(i, scenario, evalSkill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb, skillSha, baselineStore, cancellationToken), Error: (Exception?)null);
}
catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
{
@@ -1056,6 +1180,9 @@ private static async Task ExecuteScenario(
var avgBaseline = AverageResults(baselineRuns);
var avgIsolated = AverageResults(isolatedRuns);
var avgPlugin = AverageResults(pluginRuns);
+ // Persist the averaged baseline (skill/agent-independent) for shared reuse.
+ if (baselineStore is { IsReuse: false })
+ baselineStore.Record(scenario, runResults.Length, avgBaseline, evalSkill.EvalPath);
// Select the best pairwise result and track which run it came from
int bestPairwiseIdx = -1;
for (int i = 0; i < perRunPairwise.Count; i++)
@@ -1163,6 +1290,7 @@ private static async Task ExecuteRun(
string? sessionsDir,
SessionDatabase? sessionDb,
string? skillSha,
+ BaselineStore? baselineStore,
CancellationToken cancellationToken)
{
var skill = evalSkill.Skill;
@@ -1184,8 +1312,12 @@ private static async Task ExecuteRun(
var pluginConfigDir = sessionsDir is not null ? Path.Combine("sessions", pluginSessionId) : null;
var rubricJson = JsonSerializer.Serialize(scenario.Rubric?.ToArray() ?? [], SkillValidatorJsonContext.Default.StringArray);
+ // Reuse a precomputed shared baseline when available (--baseline-from). The
+ // baseline arm is skill-independent, so this skips a redundant agent run.
+ var reusedBaseline = baselineStore?.TryGetBaseline(scenario, evalSkill.EvalPath);
+
sessionDb?.RegisterSession(baselineSessionId, skill.Name, skill.Path, scenario.Name, runIndex,
- "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, skillSha, rubricJson);
+ reusedBaseline is not null ? "baseline-reused" : "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, skillSha, rubricJson);
sessionDb?.RegisterSession(isolatedSessionId, skill.Name, skill.Path, scenario.Name, runIndex,
"with-skill-isolated", config.Model, isolatedConfigDir, null, scenario.Prompt, skillSha, rubricJson);
sessionDb?.RegisterSession(pluginSessionId, skill.Name, skill.Path, scenario.Name, runIndex,
@@ -1200,24 +1332,40 @@ private static async Task ExecuteRun(
additionalAgents = await ResolveAdditionalAgents(scenario.Setup.AdditionalRequiredAgents, pluginRoot);
}
- var agentTasks = await Task.WhenAll(
+ // 2. Skilled-isolated: target skill + declared dependencies
+ var isolatedTask = AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose,
+ PluginRoot: null, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir,
+ SessionId: isolatedSessionId, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken);
+ // 3. Skilled-plugin: load entire plugin from plugin root directory
+ var pluginTask = AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose,
+ PluginRoot: pluginRoot, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir, SessionId: pluginSessionId), cancellationToken);
+
+ RunMetrics baselineMetrics;
+ RunMetrics isolatedMetrics;
+ RunMetrics pluginMetrics;
+ if (reusedBaseline is not null)
+ {
+ if (config.Verbose)
+ runLog("↩︎ reusing precomputed baseline");
+ baselineMetrics = reusedBaseline.Metrics.Clone();
+ var skilled = await Task.WhenAll(isolatedTask, pluginTask);
+ isolatedMetrics = skilled[0];
+ pluginMetrics = skilled[1];
+ }
+ else
+ {
// 1. Baseline: no plugin, no skills — vanilla agent
- AgentRunner.RunAgent(new RunOptions(scenario, null, evalSkill.EvalPath, config.Model, config.Verbose,
- PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken),
- // 2. Skilled-isolated: target skill + declared dependencies
- AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose,
- PluginRoot: null, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir,
- SessionId: isolatedSessionId, AdditionalSkills: additionalSkills, AdditionalAgents: additionalAgents), cancellationToken),
- // 3. Skilled-plugin: load entire plugin from plugin root directory
- AgentRunner.RunAgent(new RunOptions(scenario, skill, evalSkill.EvalPath, config.Model, config.Verbose,
- PluginRoot: pluginRoot, Log: runLog, McpServers: evalSkill.McpServers, SessionsDir: sessionsDir, SessionId: pluginSessionId), cancellationToken));
- var baselineMetrics = agentTasks[0];
- var isolatedMetrics = agentTasks[1];
- var pluginMetrics = agentTasks[2];
+ var baselineTask = AgentRunner.RunAgent(new RunOptions(scenario, null, evalSkill.EvalPath, config.Model, config.Verbose,
+ PluginRoot: null, Log: runLog, SessionsDir: sessionsDir, SessionId: baselineSessionId), cancellationToken);
+ var all = await Task.WhenAll(baselineTask, isolatedTask, pluginTask);
+ baselineMetrics = all[0];
+ isolatedMetrics = all[1];
+ pluginMetrics = all[2];
+ }
if (sessionDb is not null)
{
- var baselineStatus = baselineMetrics.TimedOut ? "timed_out" : "completed";
+ var baselineStatus = reusedBaseline is not null ? "reused" : (baselineMetrics.TimedOut ? "timed_out" : "completed");
var isolatedStatus = isolatedMetrics.TimedOut ? "timed_out" : "completed";
var pluginStatus = pluginMetrics.TimedOut ? "timed_out" : "completed";
sessionDb.CompleteSession(baselineSessionId, baselineStatus, JsonSerializer.Serialize(baselineMetrics, SkillValidatorJsonContext.Default.RunMetrics));
@@ -1225,56 +1373,73 @@ private static async Task ExecuteRun(
sessionDb.CompleteSession(pluginSessionId, pluginStatus, JsonSerializer.Serialize(pluginMetrics, SkillValidatorJsonContext.Default.RunMetrics));
}
- // Evaluate assertions on all three runs
+ // Evaluate assertions on the skilled runs (baseline assertions are cached when reused)
if (scenario.Assertions is { Count: > 0 })
{
- baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout);
+ if (reusedBaseline is null)
+ baselineMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, baselineMetrics.AgentOutput, baselineMetrics.WorkDir, scenario.Timeout);
isolatedMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, isolatedMetrics.AgentOutput, isolatedMetrics.WorkDir, scenario.Timeout);
pluginMetrics.AssertionResults = await AssertionEvaluator.EvaluateAssertions(scenario.Assertions, pluginMetrics.AgentOutput, pluginMetrics.WorkDir, scenario.Timeout);
}
- // Evaluate constraints on all three runs
- var baselineConstraints = AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics);
+ // Evaluate constraints on the skilled runs (baseline constraints are cached when reused)
+ var baselineConstraints = reusedBaseline is null ? AssertionEvaluator.EvaluateConstraints(scenario, baselineMetrics) : [];
var isolatedConstraints = AssertionEvaluator.EvaluateConstraints(scenario, isolatedMetrics);
var pluginConstraints = AssertionEvaluator.EvaluateConstraints(scenario, pluginMetrics);
- baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints];
+ if (reusedBaseline is null)
+ baselineMetrics.AssertionResults = [..baselineMetrics.AssertionResults, ..baselineConstraints];
isolatedMetrics.AssertionResults = [..isolatedMetrics.AssertionResults, ..isolatedConstraints];
pluginMetrics.AssertionResults = [..pluginMetrics.AssertionResults, ..pluginConstraints];
- // Task completion for all three
- if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0)
+ // Task completion for the skilled runs (baseline completion is cached when reused)
+ if (scenario.Assertions is { Count: > 0 } || baselineConstraints.Count > 0 || isolatedConstraints.Count > 0 || pluginConstraints.Count > 0)
{
- baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed);
+ if (reusedBaseline is null)
+ baselineMetrics.TaskCompleted = baselineMetrics.AssertionResults.All(a => a.Passed);
isolatedMetrics.TaskCompleted = isolatedMetrics.AssertionResults.All(a => a.Passed);
pluginMetrics.TaskCompleted = pluginMetrics.AssertionResults.All(a => a.Passed);
}
else
{
- baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0;
+ if (reusedBaseline is null)
+ baselineMetrics.TaskCompleted = baselineMetrics.ErrorCount == 0;
isolatedMetrics.TaskCompleted = isolatedMetrics.ErrorCount == 0;
pluginMetrics.TaskCompleted = pluginMetrics.ErrorCount == 0;
}
- // Judge all three runs independently (failures are non-fatal)
- var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path);
+ // Judge the skilled runs independently (failures are non-fatal). The baseline
+ // judge result is reused from the precomputed baseline when available.
+ var judgeOpts = new JudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, isolatedMetrics.WorkDir, skill.Path);
- var baselineJudgeTask = Judge.JudgeRun(scenario, baselineMetrics, judgeOpts, runLog, cancellationToken);
var isolatedJudgeTask = Judge.JudgeRun(
scenario, isolatedMetrics, judgeOpts with { WorkDir = isolatedMetrics.WorkDir }, runLog, cancellationToken);
var pluginJudgeTask = Judge.JudgeRun(
scenario, pluginMetrics, judgeOpts with { WorkDir = pluginMetrics.WorkDir }, runLog, cancellationToken);
- var (baselineJudge, baselineJudgeTokens) = await SafeJudge(baselineJudgeTask, "baseline", runLog);
+ JudgeResult baselineJudge;
+ if (reusedBaseline is not null)
+ {
+ baselineJudge = reusedBaseline.JudgeResult;
+ }
+ else
+ {
+ var (judged, baselineJudgeTokens) = await SafeJudge(
+ Judge.JudgeRun(scenario, baselineMetrics, judgeOpts with { WorkDir = baselineMetrics.WorkDir }, runLog, cancellationToken), "baseline", runLog);
+ baselineJudge = judged;
+ AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens);
+ }
var (isolatedJudge, isolatedJudgeTokens) = await SafeJudge(isolatedJudgeTask, "isolated", runLog);
var (pluginJudge, pluginJudgeTokens) = await SafeJudge(pluginJudgeTask, "plugin", runLog);
- // Accumulate judge tokens into each run's metrics
- AccumulateJudgeTokens(baselineMetrics, baselineJudgeTokens);
+ // Accumulate judge tokens into each skilled run's metrics
AccumulateJudgeTokens(isolatedMetrics, isolatedJudgeTokens);
AccumulateJudgeTokens(pluginMetrics, pluginJudgeTokens);
if (sessionDb is not null)
{
+ // Persist the baseline judge result even when reused so the baseline session
+ // record (registered with the "baseline-reused" phase) is complete for
+ // downstream investigation tooling — baselineJudge is valid in both cases.
sessionDb.SaveJudgeResult(baselineSessionId, JsonSerializer.Serialize(baselineJudge, SkillValidatorJsonContext.Default.JudgeResult));
sessionDb.SaveJudgeResult(isolatedSessionId, JsonSerializer.Serialize(isolatedJudge, SkillValidatorJsonContext.Default.JudgeResult));
sessionDb.SaveJudgeResult(pluginSessionId, JsonSerializer.Serialize(pluginJudge, SkillValidatorJsonContext.Default.JudgeResult));
@@ -1295,12 +1460,19 @@ private static async Task ExecuteRun(
? pluginMetrics : isolatedMetrics;
try
{
+ // When the baseline is reused its work dir no longer exists; run the
+ // judge session in the skilled run's work dir instead (the judge only
+ // reads the provided metrics text and is denied tool access).
+ var pairwiseWorkDir = reusedBaseline is not null ? worseSkilled.WorkDir : baselineMetrics.WorkDir;
var (pairwiseResult, pairwiseTokens) = await PairwiseJudge.Judge(
scenario, baselineMetrics, worseSkilled,
- new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, baselineMetrics.WorkDir, skill.Path, worseSkilled.WorkDir),
+ new PairwiseJudgeOptions(config.JudgeModel, config.Verbose, config.JudgeTimeout, pairwiseWorkDir, skill.Path, worseSkilled.WorkDir),
runLog, cancellationToken);
pairwise = pairwiseResult;
- // Attribute pairwise judge tokens to both the baseline and the compared run
+ // Attribute pairwise judge tokens consistently to both compared runs in
+ // every mode so token deltas stay comparable regardless of --baseline-from.
+ // baselineMetrics is a per-run clone when reused, so this never mutates the
+ // shared cached baseline.
AccumulateJudgeTokens(baselineMetrics, pairwiseTokens);
AccumulateJudgeTokens(worseSkilled, pairwiseTokens);
if (sessionDb is not null && pairwise is not null)
diff --git a/eng/skill-validator/src/Evaluate/Models.cs b/eng/skill-validator/src/Evaluate/Models.cs
index b5c055d650..da6fe75a85 100644
--- a/eng/skill-validator/src/Evaluate/Models.cs
+++ b/eng/skill-validator/src/Evaluate/Models.cs
@@ -186,6 +186,37 @@ public sealed class RunMetrics
public string AgentOutput { get; set; } = "";
public List Events { get; set; } = [];
public string WorkDir { get; set; } = "";
+
+ /// <summary>
+ /// Creates a per-run copy. Scalar fields are copied by value and the mutable
+ /// collections are copied into fresh collection instances (a shallow copy: the
+ /// elements themselves are not deep-cloned) so mutating the clone (e.g. accumulating
+ /// judge tokens or replacing a collection) never affects the source. This is essential when a
+ /// cached baseline is reused concurrently across parallel target evaluations: each evaluation works on its own copy instead of sharing one mutable instance.
+ /// </summary>
+ public RunMetrics Clone() => new()
+ {
+ TokenEstimate = TokenEstimate,
+ InputTokens = InputTokens,
+ OutputTokens = OutputTokens,
+ CacheReadTokens = CacheReadTokens,
+ CacheWriteTokens = CacheWriteTokens,
+ JudgeInputTokens = JudgeInputTokens,
+ JudgeOutputTokens = JudgeOutputTokens,
+ JudgeCacheReadTokens = JudgeCacheReadTokens,
+ JudgeCacheWriteTokens = JudgeCacheWriteTokens,
+ ToolCallCount = ToolCallCount,
+ ToolCallBreakdown = new Dictionary(ToolCallBreakdown),
+ TurnCount = TurnCount,
+ WallTimeMs = WallTimeMs,
+ ErrorCount = ErrorCount,
+ TimedOut = TimedOut,
+ AssertionResults = [.. AssertionResults],
+ TaskCompleted = TaskCompleted,
+ AgentOutput = AgentOutput,
+ Events = [.. Events],
+ WorkDir = WorkDir,
+ };
}
public sealed record RunResult(
@@ -427,6 +458,12 @@ public sealed record ValidatorConfig
public string? NoiseSkillsDir { get; init; }
public double NoiseDegradationLimit { get; init; } = 0.2;
public double NoiseMaxScenarioDegradation { get; init; } = 0.4;
+
+ /// When set, persist each scenario's averaged baseline to this file after the run.
+ public string? BaselineOut { get; init; }
+
+ /// When set, reuse the precomputed baseline from this file instead of re-running the baseline arm.
+ public string? BaselineFrom { get; init; }
}
public static class DefaultWeights
diff --git a/eng/skill-validator/src/README.md b/eng/skill-validator/src/README.md
index c3f346b446..c2b2d03fbf 100644
--- a/eng/skill-validator/src/README.md
+++ b/eng/skill-validator/src/README.md
@@ -73,6 +73,10 @@ skill-validator evaluate --model gpt-5.3-codex --judge-model claude-opus-4.6-fas
# Multiple runs for stability
skill-validator evaluate --runs 5 --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills
+# Compute a shared baseline once, then reuse it across multiple skills/agents
+skill-validator evaluate --baseline-out baseline.json --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills/skill-a
+skill-validator evaluate --baseline-from baseline.json --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills/skill-b
+
# Override the default results directory (.skill-validator-results)
skill-validator evaluate --results-dir ./my-results --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills
@@ -142,6 +146,8 @@ skill-validator check --json --plugin ./plugins/my-plugin
| `--confidence-level ` | `0.95` | Confidence level for statistical intervals (0–1) |
| `--judge-timeout ` | `300` | Judge LLM timeout in seconds |
| `--require-completion` | `true` | Fail if skill regresses task completion |
+| `--baseline-out <path>` | *(none)* | After running, persist each scenario's averaged baseline (no-skill/no-agent reference) to this file for reuse. Mutually exclusive with `--baseline-from`. |
+| `--baseline-from <path>` | *(none)* | Reuse a precomputed baseline from this file instead of re-running the baseline arm. Must match `--model`, `--judge-model`, and every scenario's prompt, setup inputs, and evaluation criteria. Mutually exclusive with `--baseline-out`. |
| `--verdict-warn-only` | `false` | Treat verdict failures as warnings (exit 0). Execution errors still fail. |
| `--no-overfitting-check` | `false` | Disable the LLM-based overfitting analysis (on by default) |
| `--overfitting-fix` | `false` | Generate `eval.fixed.yaml` with improved rubric items/assertions |
@@ -151,6 +157,21 @@ skill-validator check --json --plugin ./plugins/my-plugin
Models are validated on startup — invalid model names fail fast with a list of available models.
+### Shared baseline reuse
+
+Every evaluation runs each scenario through a **baseline arm** (the plain agent, with no skill, custom agent, plugin, or MCP server attached) to establish a reference the skill-enhanced run is compared against. When you evaluate many skills or agents against the same test scenarios, that baseline arm is re-run every time — redundant work that also introduces run-to-run variance into the comparison.
+
+`--baseline-out` and `--baseline-from` let you compute the baseline **once** and reuse it as a shared control group:
+
+1. **Produce** a baseline file with `--baseline-out baseline.json`. After the run, each scenario's averaged baseline result (honoring `--runs`) is written to the file.
+2. **Reuse** it with `--baseline-from baseline.json` on subsequent runs. The baseline arm is skipped entirely: the cached baseline supplies the baseline's assertion results, task-completion flag, and judge result, and serves as the reference for pairwise judging and metric deltas.
+
+The baseline file records the `--model` **and** `--judge-model`, and per scenario a SHA-256 of the prompt plus a composite SHA-256 over (a) its setup inputs — the fixtures copied via `copy_test_files`, explicit setup files, and setup commands — and (b) the evaluation criteria that shape the stored result (rubric, assertions, expect/reject tools, and the turn/token/timeout limits). The composite hash plays the role of a target/input SHA for the scenario. On reuse the validator fails fast if the agent model or the judge model differs from the one recorded in the file, or if any scenario's prompt-plus-setup-plus-criteria identity is missing from it, so a stale or mismatched baseline can never be silently applied — and two scenarios that share a prompt but feed the agent different fixtures (e.g. a different `build.binlog`) or use different rubrics never reuse each other's baseline. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status.
+
+> **Note:** Setup `commands` are fingerprinted by their text (the recipe), not the artifacts they produce, so baseline reuse assumes setup commands are deterministic/hermetic — a command whose output changes between runs (e.g. fetching `latest`) will not invalidate a cached baseline.
+
+The two options are mutually exclusive.
+
## Output
Results are displayed in the console with color-coded scores and metric deltas. By default, `json` and `markdown` reporters are enabled and write to `.skill-validator-results/` (override with `--results-dir`). File reporters write to that directory:
diff --git a/eng/skill-validator/src/SkillValidator.csproj b/eng/skill-validator/src/SkillValidator.csproj
index 77b51e897e..ca7037ad99 100644
--- a/eng/skill-validator/src/SkillValidator.csproj
+++ b/eng/skill-validator/src/SkillValidator.csproj
@@ -52,6 +52,14 @@
Fixed in 1.1.62. Drop this once the upstream chain bumps past 1.1.62.
-->
+
+
+
diff --git a/eng/skill-validator/src/SkillValidatorJsonContext.cs b/eng/skill-validator/src/SkillValidatorJsonContext.cs
index 2aaae673bf..e11f7c3597 100644
--- a/eng/skill-validator/src/SkillValidatorJsonContext.cs
+++ b/eng/skill-validator/src/SkillValidatorJsonContext.cs
@@ -17,6 +17,8 @@ namespace SkillValidator;
[JsonSerializable(typeof(ScenarioComparison))]
[JsonSerializable(typeof(RunResult))]
[JsonSerializable(typeof(RunMetrics))]
+[JsonSerializable(typeof(BaselineFile))]
+[JsonSerializable(typeof(BaselineScenarioEntry))]
[JsonSerializable(typeof(JudgeResult))]
[JsonSerializable(typeof(RubricScore))]
[JsonSerializable(typeof(AssertionResult))]
diff --git a/eng/skill-validator/src/docs/InvestigatingResults.md b/eng/skill-validator/src/docs/InvestigatingResults.md
index 7616caf9c5..e2c0a7ecd4 100644
--- a/eng/skill-validator/src/docs/InvestigatingResults.md
+++ b/eng/skill-validator/src/docs/InvestigatingResults.md
@@ -83,6 +83,8 @@ Each scenario includes two required runs (baseline + isolated). It may also incl
> **Note:** Scenarios do not have a `passed` field. To determine pass/fail for an individual scenario, check whether `improvementScore >= 0`. This is the effective score: when no plugin run is present it equals `isolatedImprovementScore`; when a plugin run is present it is the min of isolated and plugin scores. The `passed` field exists only at the verdict level (per-skill).
+> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` and `--judge-model` plus, per scenario, a SHA-256 of the prompt and a composite SHA-256 over its setup inputs (copied test files, explicit setup files, and setup commands) and its evaluation criteria (rubric, assertions, expect/reject tools, and turn/token/timeout limits); reuse fails fast if the agent model or judge model does not match the file, or if any scenario's prompt-plus-setup-plus-criteria identity is missing from it, so the baseline you compare against is always identity-matched and scenarios that share a prompt but differ in fixtures or rubrics cannot cross-contaminate. Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons.
+
### Breakdown fields
The `isolatedBreakdown` and `pluginBreakdown` objects show how each metric contributed to the improvement score. Each field is a raw delta (not yet weighted). The final score is computed as a weighted sum:
diff --git a/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
new file mode 100644
index 0000000000..e364ab7c07
--- /dev/null
+++ b/eng/skill-validator/tests/Evaluate/BaselineStoreTests.cs
@@ -0,0 +1,379 @@
+using System.Text.Json;
+using SkillValidator;
+using SkillValidator.Evaluate;
+
+namespace SkillValidator.Tests;
+
+public class BaselineStoreTests
+{
+ private const string Model = "model-x";
+ private const string Judge = "judge-x";
+
+ // Builds a canned baseline RunResult for the tests in this class: fixed metrics
+ // (token estimate 1000, four "bash" tool calls, task completed, no events) paired
+ // with a judge result that has a single "Quality" rubric entry.
+ // overallScore is passed both to that rubric entry and to the JudgeResult itself;
+ // output becomes Metrics.AgentOutput so tests can tell recorded baselines apart.
+ // NOTE(review): the Dictionary below appears without type arguments in this view —
+ // confirm against the real file.
+ private static RunResult MakeBaseline(double overallScore = 3, string output = "baseline output") =>
+ new(
+ new RunMetrics
+ {
+ TokenEstimate = 1000,
+ ToolCallCount = 4,
+ ToolCallBreakdown = new Dictionary { ["bash"] = 4 },
+ AgentOutput = output,
+ TaskCompleted = true,
+ Events = [],
+ },
+ new JudgeResult([new RubricScore("Quality", overallScore, "ok")], overallScore, "fine"));
+
+    // Creates an EvalScenario from just a name and a prompt; no other arguments are supplied.
+    private static EvalScenario Scenario(string name, string prompt)
+    {
+        return new EvalScenario(name, prompt);
+    }
+
+    // Returns a GUID-named .json path under the system temp directory.
+    // Only the path is built; no file is created.
+    private static string TempPath()
+    {
+        var tempRoot = Path.GetTempPath();
+        var fileName = $"sv-baseline-test-{Guid.NewGuid():N}.json";
+        return Path.Combine(tempRoot, fileName);
+    }
+
+    // ComputePromptSha must give the same 64-character value for the same prompt and
+    // a different value for a different prompt.
+    [Fact]
+    public void ComputePromptSha_IsDeterministicAndPromptSensitive()
+    {
+        const string prompt = "do the thing";
+
+        var first = BaselineStore.ComputePromptSha(prompt);
+        var repeated = BaselineStore.ComputePromptSha(prompt);
+        var different = BaselineStore.ComputePromptSha("do something else");
+
+        Assert.Equal(first, repeated);
+        Assert.NotEqual(first, different);
+        Assert.Equal(64, first.Length); // SHA-256 hex
+    }
+
+    // Records two scenarios with distinct outputs and judge scores, saves the store,
+    // reloads it for the same model/judge pair, and checks that each scenario gets
+    // back its own baseline rather than the other scenario's.
+    [Fact]
+    public void SaveThenLoad_RoundTripsBaselinePerScenario()
+    {
+        var path = TempPath();
+        try
+        {
+            var store = BaselineStore.ForWrite(Model, Judge);
+            var s1 = Scenario("alpha", "prompt one");
+            var s2 = Scenario("beta", "prompt two");
+            store.Record(s1, runs: 5, MakeBaseline(overallScore: 4, output: "out-1"));
+            store.Record(s2, runs: 5, MakeBaseline(overallScore: 2, output: "out-2"));
+            store.Save(path);
+
+            Assert.True(File.Exists(path));
+
+            var loaded = BaselineStore.Load(path, Model, Judge);
+            Assert.True(loaded.IsReuse);
+            Assert.Equal(2, loaded.Count);
+
+            var b1 = loaded.TryGetBaseline(s1);
+            var b2 = loaded.TryGetBaseline(s2);
+            Assert.NotNull(b1);
+            Assert.NotNull(b2);
+            Assert.Equal("out-1", b1!.Metrics.AgentOutput);
+            Assert.Equal(4, b1.JudgeResult.OverallScore);
+            Assert.Equal("out-2", b2!.Metrics.AgentOutput);
+            // "beta" was recorded with a different score on purpose; assert it too so a
+            // judge result attached to the wrong scenario cannot pass unnoticed.
+            Assert.Equal(2, b2.JudgeResult.OverallScore);
+        }
+        finally
+        {
+            // File.Delete does not throw when the file was never written.
+            File.Delete(path);
+        }
+    }
+
+ // Saves a one-scenario baseline for Model/Judge, then loads it while requesting a
+ // different agent model ("model-y"). Load must throw, and the exception message is
+ // expected to mention both the recorded model and the requested one.
+ // NOTE(review): Assert.Throws appears without its exception type argument in this
+ // view — confirm the expected exception type against the real file.
+ [Fact]
+ public void Load_ThrowsOnModelMismatch()
+ {
+ var path = TempPath();
+ try
+ {
+ var store = BaselineStore.ForWrite(Model, Judge);
+ store.Record(Scenario("alpha", "prompt one"), runs: 3, MakeBaseline());
+ store.Save(path);
+
+ var ex = Assert.Throws(() => BaselineStore.Load(path, "model-y", Judge));
+ Assert.Contains(Model, ex.Message);
+ Assert.Contains("model-y", ex.Message);
+ }
+ finally
+ {
+ File.Delete(path);
+ }
+ }
+
+ // Saves a one-scenario baseline for Model/Judge, then loads it while requesting a
+ // different judge model ("judge-y"). Load must throw, and the exception message is
+ // expected to mention both the recorded judge model and the requested one.
+ // NOTE(review): Assert.Throws appears without its exception type argument in this
+ // view — confirm the expected exception type against the real file.
+ [Fact]
+ public void Load_ThrowsOnJudgeModelMismatch()
+ {
+ var path = TempPath();
+ try
+ {
+ var store = BaselineStore.ForWrite(Model, Judge);
+ store.Record(Scenario("alpha", "prompt one"), runs: 3, MakeBaseline());
+ store.Save(path);
+
+ var ex = Assert.Throws(() => BaselineStore.Load(path, Model, "judge-y"));
+ Assert.Contains(Judge, ex.Message);
+ Assert.Contains("judge-y", ex.Message);
+ }
+ finally
+ {
+ File.Delete(path);
+ }
+ }
+
+ // Serializes a BaselineFile directly to disk (not through BaselineStore.Save) with
+ // Version set to CurrentVersion + 1, matching model/judge, and no scenarios. Load
+ // must throw with a message containing "unsupported version".
+ // NOTE(review): Assert.Throws appears without its exception type argument in this
+ // view — confirm the expected exception type against the real file.
+ [Fact]
+ public void Load_ThrowsOnUnsupportedVersion()
+ {
+ var path = TempPath();
+ try
+ {
+ var file = new BaselineFile(
+ Version: BaselineStore.CurrentVersion + 1,
+ Model: Model,
+ JudgeModel: Judge,
+ ValidatorVersion: "9.9.9",
+ CreatedAt: DateTime.UtcNow.ToString("o"),
+ Scenarios: []);
+ File.WriteAllText(path, JsonSerializer.Serialize(file, SkillValidatorJsonContext.Default.BaselineFile));
+
+ var ex = Assert.Throws(() => BaselineStore.Load(path, Model, Judge));
+ Assert.Contains("unsupported version", ex.Message);
+ }
+ finally
+ {
+ File.Delete(path);
+ }
+ }
+
+ // Load must throw when given a path where nothing has been written (TempPath only
+ // builds a file name; this test never creates the file).
+ // NOTE(review): Assert.Throws appears without its exception type argument in this
+ // view — confirm the expected exception type against the real file.
+ [Fact]
+ public void Load_ThrowsWhenFileMissing()
+ {
+ Assert.Throws(() => BaselineStore.Load(TempPath(), Model, Judge));
+ }
+
+    // A reloaded store that holds only "alpha" must report exactly one missing entry,
+    // starting with "beta", when asked about both scenarios (no eval paths supplied).
+    [Fact]
+    public void FindMissingScenarios_ReturnsScenariosWithoutCachedBaseline()
+    {
+        var baselinePath = TempPath();
+        try
+        {
+            var writer = BaselineStore.ForWrite(Model, Judge);
+            var cached = Scenario("alpha", "prompt one");
+            writer.Record(cached, runs: 5, MakeBaseline());
+            writer.Save(baselinePath);
+
+            var reader = BaselineStore.Load(baselinePath, Model, Judge);
+            var uncached = Scenario("beta", "prompt two");
+            var missing = reader.FindMissingScenarios([(cached, null), (uncached, null)]);
+
+            Assert.Single(missing);
+            Assert.StartsWith("beta", missing[0]);
+        }
+        finally
+        {
+            File.Delete(baselinePath);
+        }
+    }
+
+    // A store created with ForWrite reports IsReuse == false and returns null from
+    // TryGetBaseline.
+    [Fact]
+    public void WriteStore_IsNotReuse()
+    {
+        var writer = BaselineStore.ForWrite(Model, Judge);
+
+        Assert.False(writer.IsReuse);
+
+        var lookup = writer.TryGetBaseline(Scenario("alpha", "prompt one"));
+        Assert.Null(lookup);
+    }
+
+    // Creates a GUID-named directory under the system temp directory containing a stub
+    // eval.yaml ("scenarios: []") and one fixture file with the given name and content.
+    // Returns the path to eval.yaml. Nothing is cleaned up here; the tests that call
+    // this delete the directory themselves.
+    private static string MakeEvalDirWithFixture(string fixtureName, string fixtureContent)
+    {
+        var root = Path.Combine(Path.GetTempPath(), $"sv-baseline-fixture-{Guid.NewGuid():N}");
+        Directory.CreateDirectory(root);
+
+        var evalFile = Path.Combine(root, "eval.yaml");
+        File.WriteAllText(evalFile, "scenarios: []");
+        File.WriteAllText(Path.Combine(root, fixtureName), fixtureContent);
+
+        return evalFile;
+    }
+
+    // Creates an EvalScenario with the given name and prompt whose SetupConfig has
+    // CopyTestFiles enabled.
+    private static EvalScenario FixtureScenario(string name, string prompt)
+    {
+        var setup = new SetupConfig(CopyTestFiles: true);
+        return new EvalScenario(name, prompt, setup);
+    }
+
+    // For a copy-test-files scenario, ComputeTargetSha must be stable for the same eval
+    // directory, change when the fixture content changes, and differ from the value
+    // computed for a scenario that has no setup at all.
+    [Fact]
+    public void ComputeTargetSha_DiffersByFixtureContentAndIsStable()
+    {
+        var evalA = MakeEvalDirWithFixture("build.binlog", "AAAA");
+        try
+        {
+            var evalB = MakeEvalDirWithFixture("build.binlog", "BBBB");
+            try
+            {
+                var scenario = FixtureScenario("s", "investigate build.binlog");
+
+                var shaA1 = BaselineStore.ComputeTargetSha(scenario, evalA);
+                var shaA2 = BaselineStore.ComputeTargetSha(scenario, evalA);
+                var shaB = BaselineStore.ComputeTargetSha(scenario, evalB);
+
+                Assert.Equal(shaA1, shaA2);     // stable for identical inputs
+                Assert.NotEqual(shaA1, shaB);   // sensitive to fixture content
+                Assert.Equal(64, shaA1.Length);
+
+                // No setup → a stable, distinct constant.
+                var noSetup = BaselineStore.ComputeTargetSha(Scenario("s", "investigate build.binlog"), evalA);
+                Assert.NotEqual(shaA1, noSetup);
+            }
+            finally
+            {
+                Directory.Delete(Path.GetDirectoryName(evalB)!, recursive: true);
+            }
+        }
+        finally
+        {
+            // Nested cleanup: this still runs if creating or deleting the second
+            // directory throws, so neither temp directory is left behind.
+            Directory.Delete(Path.GetDirectoryName(evalA)!, recursive: true);
+        }
+    }
+
+    // With the prompt held constant and no eval path, changing any one of rubric,
+    // assertions, max turns, or expected tools must change the target SHA, while an
+    // equal rubric built separately must produce the same SHA.
+    [Fact]
+    public void ComputeTargetSha_DiffersByEvaluationCriteria()
+    {
+        const string prompt = "investigate the failure";
+        var plain = Scenario("s", prompt);
+        var rubricVariant = plain with { Rubric = ["Did it find the root cause?"] };
+        var assertionVariant = plain with { Assertions = [new Assertion(AssertionType.OutputContains, Value: "error")] };
+        var turnsVariant = plain with { MaxTurns = 5 };
+        var toolsVariant = plain with { ExpectTools = ["bash"] };
+
+        var plainSha = BaselineStore.ComputeTargetSha(plain, null);
+
+        // Each criterion that shapes the cached result must change the identity.
+        var rubricSha = BaselineStore.ComputeTargetSha(rubricVariant, null);
+        Assert.NotEqual(plainSha, rubricSha);
+        var assertionSha = BaselineStore.ComputeTargetSha(assertionVariant, null);
+        Assert.NotEqual(plainSha, assertionSha);
+        var turnsSha = BaselineStore.ComputeTargetSha(turnsVariant, null);
+        Assert.NotEqual(plainSha, turnsSha);
+        var toolsSha = BaselineStore.ComputeTargetSha(toolsVariant, null);
+        Assert.NotEqual(plainSha, toolsSha);
+
+        // Same criteria → stable identity.
+        var expectedRubricSha = BaselineStore.ComputeTargetSha(rubricVariant, null);
+        var rubricTwin = plain with { Rubric = ["Did it find the root cause?"] };
+        Assert.Equal(expectedRubricSha, BaselineStore.ComputeTargetSha(rubricTwin, null));
+    }
+
+    // Recording the same scenario twice must keep a single entry holding the first
+    // baseline, both in the in-memory count and after a save/load round trip.
+    [Fact]
+    public void Record_IsFirstWriterWins_ForSameScenarioIdentity()
+    {
+        var baselinePath = TempPath();
+        try
+        {
+            var writer = BaselineStore.ForWrite(Model, Judge);
+            var scenario = Scenario("alpha", "prompt one");
+
+            // Same identity recorded twice (e.g. two parallel targets sharing a scenario)
+            // with differing run-to-run results: the first record must win so --baseline-out
+            // is deterministic regardless of completion order.
+            var firstResult = MakeBaseline(output: "first");
+            writer.Record(scenario, runs: 5, firstResult);
+            var secondResult = MakeBaseline(output: "second");
+            writer.Record(scenario, runs: 5, secondResult);
+
+            Assert.Equal(1, writer.Count);
+
+            writer.Save(baselinePath);
+            var reloaded = BaselineStore.Load(baselinePath, Model, Judge);
+            var kept = reloaded.TryGetBaseline(scenario);
+            Assert.Equal("first", kept!.Metrics.AgentOutput);
+        }
+        finally
+        {
+            File.Delete(baselinePath);
+        }
+    }
+
+    // Changing the content of a file in a subdirectory of the eval directory must
+    // change the target SHA of a copy-test-files scenario.
+    [Fact]
+    public void ComputeTargetSha_IncludesNestedFixtureFiles()
+    {
+        // copy_test_files copies subdirectories recursively, so nested fixture content
+        // must participate in the target identity (mirrors AgentRunner.CopyDirectory).
+        var evalPath = MakeEvalDirWithFixture("top.txt", "top");
+        var evalDir = Path.GetDirectoryName(evalPath)!;
+        try
+        {
+            // The nested fixture is created inside the try so the temp directory is
+            // still removed if creating the subdirectory or writing the file fails.
+            var nestedDir = Path.Combine(evalDir, "sub");
+            Directory.CreateDirectory(nestedDir);
+            var nestedFile = Path.Combine(nestedDir, "data.bin");
+            File.WriteAllText(nestedFile, "v1");
+
+            var scenario = FixtureScenario("s", "investigate");
+            var before = BaselineStore.ComputeTargetSha(scenario, evalPath);
+
+            File.WriteAllText(nestedFile, "v2");
+            var after = BaselineStore.ComputeTargetSha(scenario, evalPath);
+
+            Assert.NotEqual(before, after); // nested file change invalidates reuse
+        }
+        finally
+        {
+            Directory.Delete(evalDir, recursive: true);
+        }
+    }
+
+    // Passing the bare name "eval.yaml" while the current directory is the eval
+    // directory must still hash sibling fixtures: rewriting build.binlog has to change
+    // the target SHA.
+    // NOTE: the test switches the process-wide current directory for its duration and
+    // restores the previous one in the finally block before deleting the temp directory.
+    [Fact]
+    public void ComputeTargetSha_HashesFixtures_WhenEvalPathIsBareFilename()
+    {
+        // A bare filename (no directory component) must still hash sibling fixtures:
+        // Path.GetDirectoryName returns "" for "eval.yaml", so without normalization
+        // fixture hashing is silently skipped and distinct fixtures collide.
+        var fullEvalPath = MakeEvalDirWithFixture("build.binlog", "AAAA");
+        var fixtureDir = Path.GetDirectoryName(fullEvalPath)!;
+        var previousCwd = Directory.GetCurrentDirectory();
+        try
+        {
+            Directory.SetCurrentDirectory(fixtureDir);
+            var scenario = FixtureScenario("s", "investigate build.binlog");
+            const string bareEvalPath = "eval.yaml";
+
+            var before = BaselineStore.ComputeTargetSha(scenario, bareEvalPath);
+            var fixtureFile = Path.Combine(fixtureDir, "build.binlog");
+            File.WriteAllText(fixtureFile, "BBBB");
+            var after = BaselineStore.ComputeTargetSha(scenario, bareEvalPath);
+
+            Assert.NotEqual(before, after); // fixture content participates in identity
+        }
+        finally
+        {
+            Directory.SetCurrentDirectory(previousCwd);
+            Directory.Delete(fixtureDir, recursive: true);
+        }
+    }
+
+    // Mutating a clone of the metrics (a scalar, a dictionary entry, and the assertion
+    // results list) must leave the source metrics unchanged, and the two collections
+    // must be distinct instances.
+    [Fact]
+    public void Clone_ProducesIndependentCopy()
+    {
+        var original = MakeBaseline(output: "src").Metrics;
+        original.JudgeInputTokens = 10;
+        original.ToolCallBreakdown["bash"] = 4;
+
+        var copy = original.Clone();
+        copy.JudgeInputTokens = 99;
+        copy.ToolCallBreakdown["bash"] = 1;
+        var extraAssertion = new Assertion(AssertionType.OutputContains, Value: "x");
+        copy.AssertionResults.Add(new AssertionResult(extraAssertion, true, ""));
+
+        // Mutating the clone must not leak back into the source — the cached baseline
+        // can be reused concurrently across parallel target evaluations.
+        Assert.Equal(10, original.JudgeInputTokens);
+        Assert.Equal(4, original.ToolCallBreakdown["bash"]);
+        Assert.Empty(original.AssertionResults);
+        Assert.NotSame(original.ToolCallBreakdown, copy.ToolCallBreakdown);
+        Assert.NotSame(original.AssertionResults, copy.AssertionResults);
+    }
+
+    // Two scenarios share one prompt but live in eval directories with different
+    // build.binlog content. A baseline saved only for case A must be served for case A
+    // and must not be served for case B; FindMissingScenarios must list case B together
+    // with its eval path.
+    [Fact]
+    public void SamePromptDifferentFixture_DoesNotReuseBaseline()
+    {
+        var path = TempPath();
+        try
+        {
+            var evalA = MakeEvalDirWithFixture("build.binlog", "case-A-binlog");
+            try
+            {
+                var evalB = MakeEvalDirWithFixture("build.binlog", "case-B-binlog");
+                try
+                {
+                    // Two cases share an identical prompt but feed different fixtures.
+                    const string sharedPrompt = "The binlog is at build.binlog. What went wrong?";
+                    var scenarioA = FixtureScenario("case-A", sharedPrompt);
+                    var scenarioB = FixtureScenario("case-B", sharedPrompt);
+
+                    // Persist a baseline only for case A.
+                    var store = BaselineStore.ForWrite(Model, Judge);
+                    store.Record(scenarioA, runs: 5, MakeBaseline(output: "A-baseline"), evalA);
+                    store.Save(path);
+
+                    var loaded = BaselineStore.Load(path, Model, Judge);
+
+                    // Case A reuses its baseline; case B must NOT (different targetSha).
+                    // Look case A up once and assert on that single result.
+                    var reusedA = loaded.TryGetBaseline(scenarioA, evalA);
+                    Assert.NotNull(reusedA);
+                    Assert.Equal("A-baseline", reusedA!.Metrics.AgentOutput);
+                    Assert.Null(loaded.TryGetBaseline(scenarioB, evalB));
+
+                    // FindMissingScenarios surfaces case B (with its eval path) despite the shared prompt.
+                    var missing = loaded.FindMissingScenarios([(scenarioA, evalA), (scenarioB, evalB)]);
+                    Assert.Single(missing);
+                    Assert.StartsWith("case-B", missing[0]);
+                    Assert.Contains(evalB, missing[0]);
+                }
+                finally
+                {
+                    Directory.Delete(Path.GetDirectoryName(evalB)!, recursive: true);
+                }
+            }
+            finally
+            {
+                // Nested cleanup: each resource is released even if creating or
+                // deleting a later one throws.
+                Directory.Delete(Path.GetDirectoryName(evalA)!, recursive: true);
+            }
+        }
+        finally
+        {
+            // File.Delete does not throw when the baseline file was never written.
+            File.Delete(path);
+        }
+    }
+}