dotnet · YuliiaKovalova · Jun 12, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
@@ -186,6 +186,37 @@ public sealed class RunMetrics
     public string AgentOutput { get; set; } = "";
     public List<AgentEvent> Events { get; set; } = [];
     public string WorkDir { get; set; } = "";
+
+    /// <summary>
+    /// Creates a per-run copy.  Scalar fields are copied by value and each mutable
+    /// collection is replaced by a fresh instance, so adding/removing entries on the clone
+    /// (or accumulating judge tokens) never affects the source.  The copy is shallow: the
+    /// collection elements themselves are not cloned.  This is essential when a cached
+    /// baseline is reused concurrently across parallel target evaluations: each gets its own copy.
+    /// </summary>
+    public RunMetrics Clone() => new()
+    {
+        TokenEstimate = TokenEstimate,
+        InputTokens = InputTokens,
+        OutputTokens = OutputTokens,
+        CacheReadTokens = CacheReadTokens,
+        CacheWriteTokens = CacheWriteTokens,
+        JudgeInputTokens = JudgeInputTokens,
+        JudgeOutputTokens = JudgeOutputTokens,
+        JudgeCacheReadTokens = JudgeCacheReadTokens,
+        JudgeCacheWriteTokens = JudgeCacheWriteTokens,
+        ToolCallCount = ToolCallCount,
+        ToolCallBreakdown = new Dictionary<string, int>(ToolCallBreakdown), // new dictionary, same entries; NOTE(review): a custom key comparer on the source, if any, is not carried over - confirm none is used
+        TurnCount = TurnCount,
+        WallTimeMs = WallTimeMs,
+        ErrorCount = ErrorCount,
+        TimedOut = TimedOut,
+        AssertionResults = [.. AssertionResults], // new list; elements are copied as-is, not cloned
+        TaskCompleted = TaskCompleted,
+        AgentOutput = AgentOutput,
+        Events = [.. Events], // new list; elements are copied as-is, not cloned
+        WorkDir = WorkDir,
+    };
 }
 
 public sealed record RunResult(
@@ -427,6 +458,12 @@ public sealed record ValidatorConfig
     public string? NoiseSkillsDir { get; init; }
     public double NoiseDegradationLimit { get; init; } = 0.2;
     public double NoiseMaxScenarioDegradation { get; init; } = 0.4;
+
+    /// <summary>When set, persist each scenario's averaged baseline to this file after the run. Presumably backs the <c>--baseline-out</c> option, which the README documents as mutually exclusive with <c>--baseline-from</c>.</summary>
+    public string? BaselineOut { get; init; }
+
+    /// <summary>When set, reuse the precomputed baseline from this file instead of re-running the baseline arm. Presumably backs the <c>--baseline-from</c> option, which the README documents as mutually exclusive with <c>--baseline-out</c>.</summary>
+    public string? BaselineFrom { get; init; }
 }
 
 public static class DefaultWeights

@@ -73,6 +73,10 @@ skill-validator evaluate --model gpt-5.3-codex --judge-model claude-opus-4.6-fas
 # Multiple runs for stability
 skill-validator evaluate --runs 5 --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills
 
+# Compute a shared baseline once, then reuse it across multiple skills/agents
+skill-validator evaluate --baseline-out baseline.json --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills/skill-a
+skill-validator evaluate --baseline-from baseline.json --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills/skill-b
+
 # Override the default results directory (.skill-validator-results)
 skill-validator evaluate --results-dir ./my-results --tests-dir ./tests/my-plugin ./plugins/my-plugin/skills
 
@@ -142,6 +146,8 @@ skill-validator check --json --plugin ./plugins/my-plugin
 | `--confidence-level <n>` | `0.95` | Confidence level for statistical intervals (0–1) |
 | `--judge-timeout <n>` | `300` | Judge LLM timeout in seconds |
 | `--require-completion` | `true` | Fail if skill regresses task completion |
+| `--baseline-out <path>` | *(none)* | After running, persist each scenario's averaged baseline (no-skill/no-agent reference) to this file for reuse. Mutually exclusive with `--baseline-from`. |
+| `--baseline-from <path>` | *(none)* | Reuse a precomputed baseline from this file instead of re-running the baseline arm. Must match `--model`, `--judge-model`, and every scenario's prompt, setup inputs, and evaluation criteria. Mutually exclusive with `--baseline-out`. |
 | `--verdict-warn-only` | `false` | Treat verdict failures as warnings (exit 0). Execution errors still fail. |
 | `--no-overfitting-check` | `false` | Disable the LLM-based overfitting analysis (on by default) |
 | `--overfitting-fix` | `false` | Generate `eval.fixed.yaml` with improved rubric items/assertions |
@@ -151,6 +157,21 @@ skill-validator check --json --plugin ./plugins/my-plugin
 
 Models are validated on startup — invalid model names fail fast with a list of available models.
 
+### Shared baseline reuse
+
+Every evaluation runs each scenario through a **baseline arm** (a run with no skill or agent loaded) to establish a reference the skill-enhanced run is compared against. When you evaluate many skills or agents against the same test scenarios, that baseline arm is re-run every time — redundant work that also introduces run-to-run variance into the comparison.
+
+`--baseline-out` and `--baseline-from` let you compute the baseline **once** and reuse it as a shared control group:
+
+1. **Produce** a baseline file with `--baseline-out baseline.json`. After the run, each scenario's averaged baseline result (honoring `--runs`) is written to the file.
+2. **Reuse** it with `--baseline-from baseline.json` on subsequent runs. The baseline arm is skipped entirely; the cached baseline is used for assertions, pairwise/independent judging, and metric deltas.
+
+The baseline file records the `--model` **and** `--judge-model`, and per scenario a SHA-256 of the prompt plus a composite SHA-256 over (a) its setup inputs — the fixtures copied via `copy_test_files`, explicit setup files, and setup commands — and (b) the evaluation criteria that shape the stored result (rubric, assertions, expect/reject tools, and the turn/token/timeout limits). This identity is the analog of a target/input SHA. On reuse the validator fails fast if the agent model or the judge model does not match the file, or if any scenario's prompt-plus-setup-plus-criteria identity is missing from it, so a stale or mismatched baseline can never be silently applied. Likewise, two scenarios that share a prompt but feed the agent different fixtures (e.g. a different `build.binlog`) or use different rubrics never reuse each other's baseline. Scenarios reused from the file are reported with the `baseline-reused` session phase and a `reused` baseline status.
+
+> **Note:** Setup `commands` are fingerprinted by their text (the recipe), not the artifacts they produce, so baseline reuse assumes setup commands are deterministic/hermetic — a command whose output changes between runs (e.g. fetching `latest`) will not invalidate a cached baseline.
+
+The two options are mutually exclusive.
+
 ## Output
 
 Results are displayed in the console with color-coded scores and metric deltas. By default, `json` and `markdown` reporters are enabled and write to `.skill-validator-results/` (override with `--results-dir`). File reporters write to that directory:

@@ -52,6 +52,14 @@
       Fixed in 1.1.62.  Drop this once the upstream chain bumps past 1.1.62.
     -->
     <PackageReference Include="Nerdbank.MessagePack" Version="1.2.4" />
+
+    <!--
+      Transitive pin: GitHub.Copilot.SDK -> StreamJsonRpc 2.24.84 drags in
+      MessagePack 2.5.198, which has a known high-severity vulnerability
+      (GHSA-hv8m-jj95-wg3x, out-of-bounds read in LZ4 decompression). Patched in
+      the v2 line as of 2.5.301.  Drop this once the upstream chain bumps to 2.5.301 or later.
+    -->
+    <PackageReference Include="MessagePack" Version="2.5.301" />
   </ItemGroup>
 
 </Project>
@@ -17,6 +17,8 @@ namespace SkillValidator;
 [JsonSerializable(typeof(ScenarioComparison))]
 [JsonSerializable(typeof(RunResult))]
 [JsonSerializable(typeof(RunMetrics))]
+[JsonSerializable(typeof(BaselineFile))]
+[JsonSerializable(typeof(BaselineScenarioEntry))]
 [JsonSerializable(typeof(JudgeResult))]
 [JsonSerializable(typeof(RubricScore))]
 [JsonSerializable(typeof(AssertionResult))]

@@ -83,6 +83,8 @@ Each scenario includes two required runs (baseline + isolated). It may also incl
 
 > **Note:** Scenarios do not have a `passed` field. To determine pass/fail for an individual scenario, check whether `improvementScore >= 0`. This is the effective score: when no plugin run is present it equals `isolatedImprovementScore`; when a plugin run is present it is the min of isolated and plugin scores. The `passed` field exists only at the verdict level (per-skill).
 
+> **Reused baselines:** When the run was invoked with `--baseline-from`, the `baseline` arm is not executed — its `metrics` and `judgeResult` come from the shared baseline file produced earlier with `--baseline-out` (computed once, honoring `--runs`). Such scenarios are reported with the `baseline-reused` session phase and a `reused` baseline status. The baseline file is keyed on `--model` and `--judge-model` plus, per scenario, a SHA-256 of the prompt and a composite SHA-256 over its setup inputs (copied test files, explicit setup files, and setup commands) and its evaluation criteria (rubric, assertions, expect/reject tools, and turn/token/timeout limits). Reuse fails fast if the agent model or judge model does not match the file, or if any scenario's prompt-plus-setup-plus-criteria identity is missing from it, so the baseline you compare against is always identity-matched, and scenarios that share a prompt but differ in fixtures or rubrics cannot cross-contaminate. Because the baseline output is identical across every skill/agent that consumes the same file, this acts as a shared control group and removes baseline run-to-run variance from cross-skill comparisons.
+
 ### Breakdown fields
 
 The `isolatedBreakdown` and `pluginBreakdown` objects show how each metric contributed to the improvement score. Each field is a raw delta (not yet weighted). The final score is computed as a weighted sum: