From d870730203d690d0cd1841bfb842cb86fee1f70b Mon Sep 17 00:00:00 2001 From: Dmitrii Korolev Date: Sat, 7 Mar 2026 12:55:06 +0100 Subject: [PATCH 1/2] use BPE tokenizer --- .../src/Services/SkillProfiler.cs | 30 ++++++++++++------- eng/skill-validator/src/SkillValidator.csproj | 4 ++- .../tests/SkillProfileTests.cs | 5 ++-- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/eng/skill-validator/src/Services/SkillProfiler.cs b/eng/skill-validator/src/Services/SkillProfiler.cs index eef736d147..42800718f7 100644 --- a/eng/skill-validator/src/Services/SkillProfiler.cs +++ b/eng/skill-validator/src/Services/SkillProfiler.cs @@ -1,4 +1,5 @@ using System.Text.RegularExpressions; +using Microsoft.ML.Tokenizers; using SkillValidator.Models; namespace SkillValidator.Services; @@ -6,6 +7,7 @@ namespace SkillValidator.Services; public sealed record SkillProfile( string Name, int TokenCount, + int BpeTokenCount, string ComplexityTier, // "compact" | "detailed" | "standard" | "comprehensive" int SectionCount, int CodeBlockCount, @@ -25,6 +27,10 @@ public static partial class SkillProfiler private const int TokenSweetHigh = 2500; private const int TokenWarnHigh = 5000; internal const int MaxDescriptionLength = 1024; + + // Lazy-initialized BPE tokenizer (cl100k_base, same BPE family as GPT-4/Claude) + private static readonly Lazy s_bpeTokenizer = new( + () => TiktokenTokenizer.CreateForModel("gpt-4")); internal const int MaxAggregateDescriptionLength = 15_000; private const int MaxNameLength = 64; private const int MaxCompatibilityLength = 500; @@ -33,7 +39,8 @@ public static partial class SkillProfiler public static SkillProfile AnalyzeSkill(SkillInfo skill) { var content = skill.SkillMdContent; - int tokenCount = (int)Math.Ceiling(content.Length / 4.0); + int chars4TokenCount = (int)Math.Ceiling(content.Length / 4.0); + int bpeTokenCount = s_bpeTokenizer.Value.CountTokens(content); bool hasFrontmatter = FrontmatterRegex().IsMatch(content); @@ -48,7 +55,7 @@ public static SkillProfile AnalyzeSkill(SkillInfo skill) bool hasWhenToUse = WhenToUseRegex().IsMatch(body); bool hasWhenNotToUse = WhenNotToUseRegex().IsMatch(body); - string complexityTier = tokenCount switch + string complexityTier = bpeTokenCount switch { < 400 => "compact", <= 2500 => "detailed", @@ -134,21 +141,21 @@ public static SkillProfile AnalyzeSkill(SkillInfo skill) } } - // --- Token size warnings --- - if (tokenCount > TokenWarnHigh) + // --- Token size warnings (based on BPE token count) --- + if (bpeTokenCount > TokenWarnHigh) { warnings.Add( - $"Skill is {tokenCount:N0} tokens — \"comprehensive\" skills hurt performance by 2.9pp on average. Consider splitting into 2–3 focused skills."); + $"Skill is {bpeTokenCount:N0} BPE tokens (chars/4 estimate: {chars4TokenCount:N0}) — \"comprehensive\" skills hurt performance by 2.9pp on average. Consider splitting into 2–3 focused skills."); } - else if (tokenCount > TokenSweetHigh) + else if (bpeTokenCount > TokenSweetHigh) { warnings.Add( - $"Skill is {tokenCount:N0} tokens — approaching \"comprehensive\" range where gains diminish."); + $"Skill is {bpeTokenCount:N0} BPE tokens (chars/4 estimate: {chars4TokenCount:N0}) — approaching \"comprehensive\" range where gains diminish."); } - else if (tokenCount < TokenSweetLow) + else if (bpeTokenCount < TokenSweetLow) { warnings.Add( - $"Skill is only {tokenCount} tokens — may be too sparse to provide actionable guidance."); + $"Skill is only {bpeTokenCount} BPE tokens (chars/4 estimate: {chars4TokenCount}) — may be too sparse to provide actionable guidance."); } if (sectionCount == 0) @@ -177,7 +184,8 @@ public static SkillProfile AnalyzeSkill(SkillInfo skill) return new SkillProfile( Name: skill.Name, - TokenCount: tokenCount, + TokenCount: chars4TokenCount, + BpeTokenCount: bpeTokenCount, ComplexityTier: complexityTier, SectionCount: sectionCount, CodeBlockCount: codeBlockCount, @@ -230,7 +238,7 @@ public static string FormatProfileLine(SkillProfile profile) }; return - $"📊 {profile.Name}: {profile.TokenCount:N0} tokens ({profile.ComplexityTier} {tierIndicator}), " + + $"📊 {profile.Name}: {profile.BpeTokenCount:N0} BPE tokens [chars/4: {profile.TokenCount:N0}] ({profile.ComplexityTier} {tierIndicator}), " + $"{profile.SectionCount} sections, {profile.CodeBlockCount} code blocks"; } diff --git a/eng/skill-validator/src/SkillValidator.csproj b/eng/skill-validator/src/SkillValidator.csproj index 0a5bd5dc6f..c48b07bdcf 100644 --- a/eng/skill-validator/src/SkillValidator.csproj +++ b/eng/skill-validator/src/SkillValidator.csproj @@ -18,7 +18,7 @@ true - --results-dir "$([MSBuild]::NormalizePath('$(ArtifactsPath)', 'TestResults', '$(AssemblyName)'))" --parallel-skills 3 --parallel-scenarios 3 --parallel-runs 3 + --results-dir "$([MSBuild]::NormalizePath('$(ArtifactsPath)', 'TestResults', '$(AssemblyName)'))" --parallel-skills 3 --parallel-scenarios 3 --parallel-runs 3 @@ -31,6 +31,8 @@ + + diff --git a/eng/skill-validator/tests/SkillProfileTests.cs b/eng/skill-validator/tests/SkillProfileTests.cs index 6371981062..b853664ef9 100644 --- a/eng/skill-validator/tests/SkillProfileTests.cs +++ b/eng/skill-validator/tests/SkillProfileTests.cs @@ -70,8 +70,9 @@ public void ClassifiesCompactSkills() [Fact] public void ClassifiesComprehensiveSkillsAndWarns() { - // >5000 tokens = >20000 chars - var content = "---\nname: foo\n---\n# Big\n" + new string('x', 25000); + // >5000 BPE tokens — use varied text since BPE compresses repeated chars efficiently + var content = "---\nname: foo\n---\n# Big\n" + string.Concat( + Enumerable.Range(0, 5000).Select(i => $"word{i} ")); var profile = SkillProfiler.AnalyzeSkill(MakeSkill(content)); Assert.Equal("comprehensive", profile.ComplexityTier); Assert.Contains(profile.Warnings, w => w.Contains("comprehensive")); From 77eb1be7ac59475c9e56975188818e2fb002311a Mon Sep 17 00:00:00 2001 From: Korolev Dmitry Date: Sun, 8 Mar 2026 19:31:52 +0100 Subject: [PATCH 2/2] selectivity test --- .../src/Commands/ValidateCommand.cs | 114 +++++++++++++++++- eng/skill-validator/src/Models/Models.cs | 23 +++- .../src/Services/AgentRunner.cs | 71 +++++++++++ .../src/Services/EvalSchema.cs | 11 +- eng/skill-validator/src/Services/Reporter.cs | 51 ++++++-- .../src/SkillValidatorJsonContext.cs | 2 + .../src/SkillValidatorYamlContext.cs | 1 + .../build-perf-diagnostics/eval.yaml | 14 +++ 8 files changed, 271 insertions(+), 16 deletions(-) diff --git a/eng/skill-validator/src/Commands/ValidateCommand.cs b/eng/skill-validator/src/Commands/ValidateCommand.cs index 15746cdd50..0493ba9bf0 100644 --- a/eng/skill-validator/src/Commands/ValidateCommand.cs +++ b/eng/skill-validator/src/Commands/ValidateCommand.cs @@ -30,6 +30,9 @@ public static RootCommand Create() var reporterOpt = new Option("--reporter") { Description = "Reporter (console, json, junit, markdown). Can be repeated.", AllowMultipleArgumentsPerToken = true }; var noOverfittingCheckOpt = new Option("--no-overfitting-check") { Description = "Disable LLM-based overfitting analysis (on by default)" }; var overfittingFixOpt = new Option("--overfitting-fix") { Description = "Generate a fixed eval.yaml with improved rubric items/assertions" }; + var selectivityTestOpt = new Option("--selectivity-test") { Description = "Run selectivity test using should_activate / should_not_activate prompts from eval.yaml" }; + var selectivityMinRecallOpt = new Option("--selectivity-min-recall") { Description = "Minimum recall (activation on should_activate prompts) to pass (0-1)", DefaultValueFactory = _ => 0.8 }; + var selectivityMinPrecisionOpt = new Option("--selectivity-min-precision") { Description = "Minimum precision (non-activation on should_not_activate prompts) to pass (0-1)", DefaultValueFactory = _ => 0.8 }; var command = new RootCommand("Validate that agent skills meaningfully improve agent performance") { @@ -53,6 +56,9 @@ public static RootCommand Create() reporterOpt, noOverfittingCheckOpt, overfittingFixOpt, + selectivityTestOpt, + selectivityMinRecallOpt, + selectivityMinPrecisionOpt, }; command.SetAction(async (parseResult, _) => @@ -98,6 +104,9 @@ public static RootCommand Create() TestsDir = parseResult.GetValue(testsDirOpt), OverfittingCheck = !parseResult.GetValue(noOverfittingCheckOpt), OverfittingFix = parseResult.GetValue(overfittingFixOpt), + SelectivityTest = parseResult.GetValue(selectivityTestOpt), + SelectivityMinRecall = parseResult.GetValue(selectivityMinRecallOpt), + SelectivityMinPrecision = parseResult.GetValue(selectivityMinPrecisionOpt), }; return await Run(config); @@ -333,6 +342,36 @@ internal static List CheckAggregateDescriptionLimits(IReadOnlyList 0 } || skill.EvalConfig.ShouldNotActivatePrompts is { Count: > 0 })) + { + log("🎯 Running selectivity test (standalone)..."); + var selectivityResult = await ExecuteSelectivityTest(skill, config, spinner); + log($"🎯 Selectivity: recall={selectivityResult.Recall:P0}, precision={selectivityResult.Precision:P0} — {(selectivityResult.Passed ? "PASSED" : "FAILED")}"); + + return new SkillVerdict + { + SkillName = skill.Name, + SkillPath = skill.Path, + Passed = selectivityResult.Passed, + Scenarios = [], + OverallImprovementScore = 0, + Reason = selectivityResult.Passed + ? "Selectivity test passed" + : $"Selectivity test failed: {selectivityResult.Reason}", + FailureKind = selectivityResult.Passed ? null : "selectivity_failure", + ProfileWarnings = profile.Warnings, + SelectivityResult = selectivityResult, + }; + } + + log("⏭ Skipping (no selectivity prompts in eval.yaml)"); + return null; + } + // Launch overfitting check in parallel with scenario execution var workDir = Path.GetTempPath(); Task overfittingTask = Task.FromResult(null); @@ -496,8 +535,8 @@ private static async Task ExecuteRun( runLog("running agents..."); var agentTasks = await Task.WhenAll( - AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, runLog)), - AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, runLog))); + AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, Log: runLog)), + AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, Log: runLog))); var baselineMetrics = agentTasks[0]; var withSkillMetrics = agentTasks[1]; @@ -642,4 +681,75 @@ private static string SanitizeErrorMessage(string? message) var singleLine = raw.ReplaceLineEndings(" "); return singleLine.Length > 150 ? singleLine[..150] + "…" : singleLine; } + + private static async Task ExecuteSelectivityTest(SkillInfo skill, ValidatorConfig config, Spinner spinner) + { + var prefix = $"[{skill.Name}/selectivity]"; + var log = (string msg) => spinner.Log($"{prefix} {msg}"); + + // Launch all probes in parallel + var tasks = new List>(); + + if (skill.EvalConfig!.ShouldActivatePrompts is { } activatePrompts) + { + foreach (var prompt in activatePrompts) + { + log($"Testing should_activate: \"{Truncate(prompt, 60)}\""); + tasks.Add(ProbeAndLog(skill, prompt, expectedActivation: true, config, log)); + } + } + + if (skill.EvalConfig.ShouldNotActivatePrompts is { } deactivatePrompts) + { + foreach (var prompt in deactivatePrompts) + { + log($"Testing should_not_activate: \"{Truncate(prompt, 60)}\""); + tasks.Add(ProbeAndLog(skill, prompt, expectedActivation: false, config, log)); + } + } + + var promptResults = (await Task.WhenAll(tasks)).ToList(); + + // Calculate recall: fraction of should_activate prompts that actually activated + var shouldActivateResults = promptResults.Where(r => r.ExpectedActivation).ToList(); + double recall = shouldActivateResults.Count > 0 + ? (double)shouldActivateResults.Count(r => r.SkillActivated) / shouldActivateResults.Count + : 1.0; + + // Calculate precision: fraction of should_not_activate prompts that correctly did NOT activate + var shouldNotActivateResults = promptResults.Where(r => !r.ExpectedActivation).ToList(); + double precision = shouldNotActivateResults.Count > 0 + ? (double)shouldNotActivateResults.Count(r => !r.SkillActivated) / shouldNotActivateResults.Count + : 1.0; + + bool passed = recall >= config.SelectivityMinRecall && precision >= config.SelectivityMinPrecision; + var reasons = new List(); + if (recall < config.SelectivityMinRecall) + reasons.Add($"Recall {recall:P0} below threshold {config.SelectivityMinRecall:P0}"); + if (precision < config.SelectivityMinPrecision) + reasons.Add($"Precision {precision:P0} below threshold {config.SelectivityMinPrecision:P0}"); + string reason = passed ? "Selectivity test passed" : string.Join("; ", reasons); + + return new SelectivityResult(promptResults, recall, precision, passed, reason); + } + + private static async Task ProbeAndLog( + SkillInfo skill, string prompt, bool expectedActivation, ValidatorConfig config, Action log) + { + var activated = await TestSkillActivation(skill, prompt, config); + if (expectedActivation) + log($" → {(activated ? "✅ activated" : "❌ NOT activated")}: \"{Truncate(prompt, 50)}\""); + else + log($" → {(activated ? "❌ activated (unexpected)" : "✅ correctly NOT activated")}: \"{Truncate(prompt, 50)}\""); + return new SelectivityPromptResult(prompt, ExpectedActivation: expectedActivation, SkillActivated: activated); + } + + private static async Task TestSkillActivation(SkillInfo skill, string prompt, ValidatorConfig config) + { + var scenario = new EvalScenario(Name: "selectivity-probe", Prompt: prompt, Rubric: [], Timeout: 15); + return await AgentRunner.ProbeSkillActivation(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose)); + } + + private static string Truncate(string value, int maxLength) => + value.Length <= maxLength ? value : value[..(maxLength - 1)] + "…"; } diff --git a/eng/skill-validator/src/Models/Models.cs b/eng/skill-validator/src/Models/Models.cs index 1c076e45b9..7158e15688 100644 --- a/eng/skill-validator/src/Models/Models.cs +++ b/eng/skill-validator/src/Models/Models.cs @@ -70,7 +70,10 @@ public sealed record EvalScenario( int? MaxTokens = null, bool ExpectActivation = true); -public sealed record EvalConfig(IReadOnlyList Scenarios); +public sealed record EvalConfig( + IReadOnlyList Scenarios, + IReadOnlyList? ShouldActivatePrompts = null, + IReadOnlyList? ShouldNotActivatePrompts = null); // --- Skill info --- @@ -227,6 +230,7 @@ public sealed class SkillVerdict public IReadOnlyList? ProfileWarnings { get; set; } public bool SkillNotActivated { get; set; } public OverfittingResult? OverfittingResult { get; set; } + public SelectivityResult? SelectivityResult { get; set; } } // --- Overfitting assessment --- @@ -274,6 +278,20 @@ public sealed record OverfittingJudgeOptions( int Timeout, string WorkDir); +// --- Selectivity test --- + +public sealed record SelectivityPromptResult( + string Prompt, + bool ExpectedActivation, + bool SkillActivated); + +public sealed record SelectivityResult( + IReadOnlyList PromptResults, + double Recall, + double Precision, + bool Passed, + string Reason); + // --- Config --- public sealed record ReporterSpec(ReporterType Type); @@ -308,6 +326,9 @@ public sealed record ValidatorConfig public string? TestsDir { get; init; } public bool OverfittingCheck { get; init; } = true; public bool OverfittingFix { get; init; } + public bool SelectivityTest { get; init; } + public double SelectivityMinRecall { get; init; } = 0.8; + public double SelectivityMinPrecision { get; init; } = 0.8; } public static class DefaultWeights diff --git a/eng/skill-validator/src/Services/AgentRunner.cs b/eng/skill-validator/src/Services/AgentRunner.cs index 7a79a630e6..13be42a481 100644 --- a/eng/skill-validator/src/Services/AgentRunner.cs +++ b/eng/skill-validator/src/Services/AgentRunner.cs @@ -287,6 +287,77 @@ public static async Task RunAgent(RunOptions options) return metrics; } + /// + /// Lightweight probe that sends a prompt and checks whether the skill is activated. + /// Exits immediately when a SkillInvokedEvent is seen, or waits for the session to + /// complete/timeout. Designed to run many probes in parallel via Task.WhenAll. + /// + public static async Task ProbeSkillActivation(RunOptions options) + { + var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}"); + Directory.CreateDirectory(workDir); + _workDirs.Add(workDir); + + if (options.Verbose) + { + var write = options.Log ?? (msg => Console.Error.WriteLine(msg)); + write($" 📂 {workDir} (skilled)"); + } + + bool skillActivated = false; + var done = new TaskCompletionSource(); + + try + { + var client = await GetSharedClient(options.Verbose); + await using var session = await client.CreateSessionAsync( + BuildSessionConfig(options.Skill, options.Model, workDir, options.Skill?.McpServers)); + + // 30s timeout — enough for the agent to reach the skill-loading decision + using var cts = new CancellationTokenSource(30_000); + cts.Token.Register(() => done.TrySetResult(skillActivated)); + + session.On(evt => + { + switch (evt) + { + // Skill loaded → we have our answer, bail immediately + case SkillInvokedEvent: + skillActivated = true; + done.TrySetResult(true); + break; + + // Session finished without loading the skill → not activated + case SessionIdleEvent: + done.TrySetResult(skillActivated); + break; + + case SessionErrorEvent err: + done.TrySetException(new InvalidOperationException(err.Data.Message ?? "Session error")); + break; + } + + if (options.Verbose && evt is SkillInvokedEvent si) + { + var write = options.Log ?? (m => Console.Error.WriteLine(m)); + write($" 📘 Skill invoked: {si.Data.Name}"); + } + if (options.Verbose && evt is ToolExecutionStartEvent ts) + { + var write = options.Log ?? (m => Console.Error.WriteLine(m)); + write($" 🔧 {ts.Data.ToolName}"); + } + }); + + await session.SendAsync(new MessageOptions { Prompt = options.Scenario.Prompt }); + return await done.Task; + } + catch + { + return skillActivated; + } + } + private static async Task SetupWorkDir(EvalScenario scenario, string? skillPath, string? evalPath) { var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}"); diff --git a/eng/skill-validator/src/Services/EvalSchema.cs b/eng/skill-validator/src/Services/EvalSchema.cs index d89ea92de8..b8c3e36928 100644 --- a/eng/skill-validator/src/Services/EvalSchema.cs +++ b/eng/skill-validator/src/Services/EvalSchema.cs @@ -21,7 +21,7 @@ public static EvalConfig ParseEvalConfig(string yamlContent) if (scenarios is not { Count: > 0 }) throw new InvalidOperationException("Eval config must have at least one scenario"); - return new EvalConfig(scenarios); + return new EvalConfig(scenarios, raw.Selectivity?.ShouldActivate, raw.Selectivity?.ShouldNotActivate); } public static (bool Success, EvalConfig? Data, IReadOnlyList? Errors) ValidateEvalConfig(string yamlContent) @@ -122,6 +122,15 @@ internal sealed class RawFrontmatter internal sealed class RawEvalConfig { public List? Scenarios { get; set; } + public RawSelectivity? Selectivity { get; set; } + } + + internal sealed class RawSelectivity + { + [YamlMember(Alias = "should_activate")] + public List? ShouldActivate { get; set; } + [YamlMember(Alias = "should_not_activate")] + public List? ShouldNotActivate { get; set; } } internal sealed class RawScenario diff --git a/eng/skill-validator/src/Services/Reporter.cs b/eng/skill-validator/src/Services/Reporter.cs index 86e8fccbb3..ffe00e13e3 100644 --- a/eng/skill-validator/src/Services/Reporter.cs +++ b/eng/skill-validator/src/Services/Reporter.cs @@ -62,22 +62,34 @@ private static void ReportConsole(IReadOnlyList verdicts, bool ver { var icon = verdict.Passed ? "\x1b[32m✓\x1b[0m" : "\x1b[31m✗\x1b[0m"; var name = $"\x1b[1m{verdict.SkillName}\x1b[0m"; - var score = FormatScore(verdict.OverallImprovementScore); - var scoreLine = $"{icon} {name} {score}"; - if (verdict.ConfidenceInterval is { } ci) + // Selectivity-only verdict: no scenarios or score to display + bool isSelectivityOnly = verdict.Scenarios.Count == 0 && verdict.SelectivityResult is not null; + + if (isSelectivityOnly) { - var ciStr = $"[{FormatPct(ci.Low)}, {FormatPct(ci.High)}]"; - var sigStr = verdict.IsSignificant == true - ? "\x1b[32msignificant\x1b[0m" - : "\x1b[33mnot significant\x1b[0m"; - scoreLine += $" \x1b[2m{ciStr}\x1b[0m {sigStr}"; + Console.WriteLine($"{icon} {name} \x1b[2m(selectivity only)\x1b[0m"); + Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m"); } - if (verdict.NormalizedGain is { } ng) - scoreLine += $" \x1b[2m(g={FormatPct(ng)})\x1b[0m"; + else + { + var score = FormatScore(verdict.OverallImprovementScore); + + var scoreLine = $"{icon} {name} {score}"; + if (verdict.ConfidenceInterval is { } ci) + { + var ciStr = $"[{FormatPct(ci.Low)}, {FormatPct(ci.High)}]"; + var sigStr = verdict.IsSignificant == true + ? "\x1b[32msignificant\x1b[0m" + : "\x1b[33mnot significant\x1b[0m"; + scoreLine += $" \x1b[2m{ciStr}\x1b[0m {sigStr}"; + } + if (verdict.NormalizedGain is { } ng) + scoreLine += $" \x1b[2m(g={FormatPct(ng)})\x1b[0m"; - Console.WriteLine(scoreLine); - Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m"); + Console.WriteLine(scoreLine); + Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m"); + } if (!verdict.Passed && verdict.ProfileWarnings is { Count: > 0 }) { @@ -132,6 +144,21 @@ private static void ReportConsole(IReadOnlyList verdicts, bool ver Console.WriteLine($" \x1b[2m•\x1b[0m [{item.Classification}] \x1b[2m{item.AssertionSummary}\x1b[0m\n \x1b[2m— {item.Reasoning}\x1b[0m"); } } + if (verdict.SelectivityResult is { } selResult) + { + Console.WriteLine(); + var selIcon = selResult.Passed ? "✅" : "🔴"; + Console.WriteLine($" 🎯 Selectivity: recall={selResult.Recall:P0}, precision={selResult.Precision:P0} {selIcon}"); + foreach (var pr in selResult.PromptResults) + { + var expected = pr.ExpectedActivation ? "should activate" : "should NOT activate"; + var correct = (pr.ExpectedActivation == pr.SkillActivated); + var prIcon = correct ? "\x1b[32m✓\x1b[0m" : "\x1b[31m✗\x1b[0m"; + var activatedStr = pr.SkillActivated ? "activated" : "not activated"; + var prompt = pr.Prompt.Length > 60 ? pr.Prompt[..59] + "…" : pr.Prompt; + Console.WriteLine($" {prIcon} \x1b[2m\"{prompt}\" — {expected} → {activatedStr}\x1b[0m"); + } + } if (verdict.Scenarios.Count > 0) { Console.WriteLine(); diff --git a/eng/skill-validator/src/SkillValidatorJsonContext.cs b/eng/skill-validator/src/SkillValidatorJsonContext.cs index 90e5420e72..fb370ef526 100644 --- a/eng/skill-validator/src/SkillValidatorJsonContext.cs +++ b/eng/skill-validator/src/SkillValidatorJsonContext.cs @@ -33,6 +33,8 @@ namespace SkillValidator; [JsonSerializable(typeof(PairwiseMagnitude))] [JsonSerializable(typeof(AssertionType))] [JsonSerializable(typeof(MCPServerDef))] +[JsonSerializable(typeof(SelectivityPromptResult))] +[JsonSerializable(typeof(SelectivityResult))] [JsonSerializable(typeof(JsonElement))] [JsonSerializable(typeof(Dictionary))] [JsonSerializable(typeof(Dictionary))] diff --git a/eng/skill-validator/src/SkillValidatorYamlContext.cs b/eng/skill-validator/src/SkillValidatorYamlContext.cs index 9f1dadf584..c671d7007b 100644 --- a/eng/skill-validator/src/SkillValidatorYamlContext.cs +++ b/eng/skill-validator/src/SkillValidatorYamlContext.cs @@ -10,4 +10,5 @@ namespace SkillValidator; [YamlSerializable(typeof(EvalSchema.RawSetup))] [YamlSerializable(typeof(EvalSchema.RawSetupFile))] [YamlSerializable(typeof(EvalSchema.RawAssertion))] +[YamlSerializable(typeof(EvalSchema.RawSelectivity))] public partial class SkillValidatorYamlContext : StaticContext; diff --git a/tests/dotnet-msbuild/build-perf-diagnostics/eval.yaml b/tests/dotnet-msbuild/build-perf-diagnostics/eval.yaml index be492fba00..8982b987e5 100644 --- a/tests/dotnet-msbuild/build-perf-diagnostics/eval.yaml +++ b/tests/dotnet-msbuild/build-perf-diagnostics/eval.yaml @@ -15,3 +15,17 @@ scenarios: - "Identified EnforceCodeStyleInBuild should be conditional on CI, not always true" - "Solution preserves full analyzer enforcement in CI pipelines while speeding dev builds" timeout: 160 + +selectivity: + should_activate: + - "My .NET build takes over 5 minutes, how can I speed it up?" + - "How do I analyze a binlog to find slow targets in MSBuild?" + - "Roslyn analyzers are making my compilation really slow, what can I do?" + - "I want to profile my MSBuild build to understand where time is being spent" + - "Our CI builds are fast but local dev builds are painfully slow, how do I diagnose this?" + should_not_activate: + - "How do I add a NuGet package reference to my project?" + - "My unit tests are failing with a NullReferenceException" + - "How do I configure Docker for my .NET application?" + - "What's the difference between .NET 8 and .NET 9?" + - "How do I set up Entity Framework Core migrations?"