// ---- file: Dashboard/Analysis/AnalysisModels.cs ----
using System;
using System.Collections.Generic;

namespace PerformanceMonitorDashboard.Analysis;

/// <summary>
/// A scored observation from collected data.
/// </summary>
public class Fact
{
    public string Source { get; set; } = string.Empty;
    public string Key { get; set; } = string.Empty;
    public double Value { get; set; }
    public double BaseSeverity { get; set; }
    public double Severity { get; set; }
    public int ServerId { get; set; }
    public string? DatabaseName { get; set; }

    /// <summary>
    /// Raw metric values for analysis and audit trail.
    /// Keys are metric-specific (e.g., "wait_time_ms", "waiting_tasks_count").
    /// </summary>
    public Dictionary<string, double> Metadata { get; set; } = new();

    /// <summary>
    /// Amplifiers that were evaluated for this fact.
    /// </summary>
    public List<AmplifierResult> AmplifierResults { get; set; } = new();
}

/// <summary>
/// Result of evaluating a single amplifier against the fact set.
/// </summary>
public class AmplifierResult
{
    public string Description { get; set; } = string.Empty;
    public bool Matched { get; set; }
    /// <summary>Boost applied when matched; 0.0 when the amplifier did not match.</summary>
    public double Boost { get; set; }
}

/// <summary>
/// A conditional edge in the relationship graph.
/// </summary>
public class Edge
{
    public string Source { get; set; } = string.Empty;
    public string Destination { get; set; } = string.Empty;
    public string Category { get; set; } = string.Empty;
    public string PredicateDescription { get; set; } = string.Empty;

    /// <summary>
    /// Evaluates whether this edge should be followed given the current fact set.
    /// Defaults to "never" so an unconfigured edge is inert.
    /// NOTE(review): argument type reconstructed as a key->Fact lookup (the shape
    /// FactScorer amplifiers use) — confirm against InferenceEngine.
    /// </summary>
    public Func<Dictionary<string, Fact>, bool> Predicate { get; set; } = _ => false;
}

/// <summary>
/// A complete analysis story — the path from root symptom to leaf recommendation.
/// </summary>
public class AnalysisStory
{
    public string RootFactKey { get; set; } = string.Empty;
    public double RootFactValue { get; set; }
    public double Severity { get; set; }
    public double Confidence { get; set; }
    public string Category { get; set; } = string.Empty;
    // NOTE(review): element type reconstructed as string (step keys, consistent with
    // the joined StoryPath below) — TODO confirm; could originally have been List<Edge>.
    public List<string> Path { get; set; } = new();
    public string StoryPath { get; set; } = string.Empty;
    public string StoryPathHash { get; set; } = string.Empty;
    public string StoryText { get; set; } = string.Empty;
    public string? LeafFactKey { get; set; }
    public double? LeafFactValue { get; set; }
    public int FactCount { get; set; }
    public bool IsAbsolution { get; set; }
}

/// <summary>
/// A persisted finding from a previous analysis run.
/// Maps to the analysis_findings DuckDB table.
/// </summary>
public class AnalysisFinding
{
    public long FindingId { get; set; }
    public DateTime AnalysisTime { get; set; }
    public int ServerId { get; set; }
    public string ServerName { get; set; } = string.Empty;
    public string? DatabaseName { get; set; }
    public DateTime? TimeRangeStart { get; set; }
    public DateTime? TimeRangeEnd { get; set; }
    public double Severity { get; set; }
    public double Confidence { get; set; }
    public string Category { get; set; } = string.Empty;
    public string StoryPath { get; set; } = string.Empty;
    public string StoryPathHash { get; set; } = string.Empty;
    public string StoryText { get; set; } = string.Empty;
    public string RootFactKey { get; set; } = string.Empty;
    public double? RootFactValue { get; set; }
    public string? LeafFactKey { get; set; }
    public double? LeafFactValue { get; set; }
    public int FactCount { get; set; }

    /// <summary>
    /// Drill-down data collected after graph traversal. Ephemeral — not persisted to DuckDB.
    /// Contains supporting detail keyed by category (e.g., "top_deadlocks", "queries_at_spike").
    /// NOTE(review): value type reconstructed as object — TODO confirm against the
    /// drill-down collector.
    /// </summary>
    public Dictionary<string, object>? DrillDown { get; set; }
}
+/// +public class AnalysisMuted +{ + public long MuteId { get; set; } + public int? ServerId { get; set; } + public string? DatabaseName { get; set; } + public string StoryPathHash { get; set; } = string.Empty; + public string StoryPath { get; set; } = string.Empty; + public DateTime MutedDate { get; set; } + public string? Reason { get; set; } +} + +/// +/// A user-configured exclusion filter. Maps to the analysis_exclusions DuckDB table. +/// +public class AnalysisExclusion +{ + public long ExclusionId { get; set; } + public string ExclusionType { get; set; } = string.Empty; + public string ExclusionValue { get; set; } = string.Empty; + public int? ServerId { get; set; } + public string? DatabaseName { get; set; } + public bool IsEnabled { get; set; } = true; + public DateTime CreatedDate { get; set; } + public string? Description { get; set; } +} + +/// +/// A severity threshold value. Maps to the analysis_thresholds DuckDB table. +/// +public class AnalysisThreshold +{ + public long ThresholdId { get; set; } + public string Category { get; set; } = string.Empty; + public string FactKey { get; set; } = string.Empty; + public string ThresholdType { get; set; } = string.Empty; + public double ThresholdValue { get; set; } + public int? ServerId { get; set; } + public string? DatabaseName { get; set; } + public bool IsEnabled { get; set; } = true; + public DateTime ModifiedDate { get; set; } +} diff --git a/Dashboard/Analysis/AnalysisService.cs b/Dashboard/Analysis/AnalysisService.cs new file mode 100644 index 0000000..a0d7b2f --- /dev/null +++ b/Dashboard/Analysis/AnalysisService.cs @@ -0,0 +1,323 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.Data.SqlClient; +using PerformanceMonitorDashboard.Helpers; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Orchestrates the full analysis pipeline: collect -> score -> traverse -> persist. +/// Can be run on-demand or on a timer. 
/// <summary>
/// Orchestrates the full analysis pipeline: collect -> score -> traverse -> persist.
/// Can be run on-demand or on a timer. Each run analyzes a single server's data
/// for a given time window and persists the findings.
/// Port of Lite's AnalysisService — uses SQL Server instead of DuckDB.
/// </summary>
public class AnalysisService
{
    private readonly string _connectionString;
    private readonly SqlServerFindingStore _findingStore;
    private readonly SqlServerFactCollector _collector;
    private readonly FactScorer _scorer;
    private readonly RelationshipGraph _graph;
    private readonly InferenceEngine _engine;
    private readonly SqlServerDrillDownCollector _drillDown;
    private readonly SqlServerAnomalyDetector _anomalyDetector;

    /// <summary>
    /// Minimum hours of collected data required before analysis will run.
    /// Short collection windows distort fraction-of-period calculations --
    /// 5 seconds of THREADPOOL looks alarming in a 16-minute window.
    /// </summary>
    internal double MinimumDataHours { get; set; } = 72;

    /// <summary>
    /// Raised after each analysis run completes, providing the findings for UI display.
    /// </summary>
    public event EventHandler<AnalysisCompletedEventArgs>? AnalysisCompleted;

    /// <summary>
    /// Whether an analysis is currently running.
    /// </summary>
    public bool IsAnalyzing { get; private set; }

    /// <summary>
    /// Time of the last completed analysis run.
    /// </summary>
    public DateTime? LastAnalysisTime { get; private set; }

    /// <summary>
    /// Set after AnalyzeAsync if insufficient data was found. Null if enough data exists.
    /// </summary>
    public string? InsufficientDataMessage { get; private set; }

    /// <summary>
    /// Creates an analysis service bound to one monitored server's database.
    /// </summary>
    /// <param name="connectionString">Connection string to the collected-data database.</param>
    /// <param name="planFetcher">Optional plan fetcher used during drill-down enrichment.</param>
    public AnalysisService(string connectionString, IPlanFetcher? planFetcher = null)
    {
        _connectionString = connectionString;
        _findingStore = new SqlServerFindingStore(connectionString);
        _collector = new SqlServerFactCollector(connectionString);
        _scorer = new FactScorer();
        _graph = new RelationshipGraph();
        _engine = new InferenceEngine(_graph);
        _drillDown = new SqlServerDrillDownCollector(connectionString, planFetcher);
        _anomalyDetector = new SqlServerAnomalyDetector(connectionString);
    }

    /// <summary>
    /// Runs the full analysis pipeline for a server.
    /// Default time range is the last 4 hours.
    /// </summary>
    public async Task<List<AnalysisFinding>> AnalyzeAsync(int serverId, string serverName, int hoursBack = 4)
    {
        var timeRangeEnd = DateTime.UtcNow;
        var timeRangeStart = timeRangeEnd.AddHours(-hoursBack);

        var context = new AnalysisContext
        {
            ServerId = serverId,
            ServerName = serverName,
            TimeRangeStart = timeRangeStart,
            TimeRangeEnd = timeRangeEnd
        };

        return await AnalyzeAsync(context);
    }

    /// <summary>
    /// Runs the full analysis pipeline with a specific context.
    /// Returns an empty list when a run is already in progress, when there is not
    /// enough collected data, or when the run fails (the failure is logged).
    /// </summary>
    public async Task<List<AnalysisFinding>> AnalyzeAsync(AnalysisContext context)
    {
        // Re-entrancy guard: only one analysis at a time.
        if (IsAnalyzing)
            return new List<AnalysisFinding>();

        IsAnalyzing = true;
        InsufficientDataMessage = null;

        try
        {
            // 0. Check minimum data span -- total history, not the analysis window.
            //    A server with 100h of total history can be analyzed over a 4h window.
            var dataSpanHours = await GetTotalDataSpanHoursAsync();
            if (dataSpanHours < MinimumDataHours)
            {
                var needed = MinimumDataHours >= 24
                    ? $"{MinimumDataHours / 24:F1} days"
                    : $"{MinimumDataHours:F0} hours";
                var have = dataSpanHours >= 24
                    ? $"{dataSpanHours / 24:F1} days"
                    : $"{dataSpanHours:F1} hours";

                InsufficientDataMessage =
                    $"Not enough data for reliable analysis. Need {needed} of collected data, " +
                    $"have {have}. Keep the collector running and try again later.";

                Logger.Info(
                    $"[AnalysisService] Skipping analysis for {context.ServerName}: {dataSpanHours:F1}h data, need {MinimumDataHours}h");

                LastAnalysisTime = DateTime.UtcNow;
                return new List<AnalysisFinding>();
            }

            // 1. Collect facts from SQL Server
            var facts = await _collector.CollectFactsAsync(context);

            if (facts.Count == 0)
            {
                LastAnalysisTime = DateTime.UtcNow;
                return new List<AnalysisFinding>();
            }

            // 1.5. Detect anomalies (compare analysis window against baseline)
            var anomalies = await _anomalyDetector.DetectAnomaliesAsync(context);
            facts.AddRange(anomalies);

            // 2. Score facts (base severity + amplifiers)
            _scorer.ScoreAll(facts);

            // 3. Build stories via graph traversal
            var stories = _engine.BuildStories(facts);

            // 4. Persist findings (filtering out muted)
            var findings = await _findingStore.SaveFindingsAsync(stories, context);

            // 5. Enrich findings with drill-down data (ephemeral, not persisted)
            await _drillDown.EnrichFindingsAsync(findings, context);

            LastAnalysisTime = DateTime.UtcNow;

            // 6. Notify listeners
            AnalysisCompleted?.Invoke(this, new AnalysisCompletedEventArgs
            {
                ServerId = context.ServerId,
                ServerName = context.ServerName,
                Findings = findings,
                AnalysisTime = LastAnalysisTime.Value
            });

            Logger.Info(
                $"[AnalysisService] Analysis complete for {context.ServerName}: {findings.Count} finding(s), " +
                $"highest severity {(findings.Count > 0 ? findings.Max(f => f.Severity) : 0):F2}");

            return findings;
        }
        catch (Exception ex)
        {
            // Deliberate best-effort: an analysis failure must not crash the host UI.
            // NOTE(review): only ex.Message is logged — stack trace is lost; confirm
            // Logger has an overload that accepts the exception.
            Logger.Error($"[AnalysisService] Analysis failed for {context.ServerName}: {ex.Message}");
            return new List<AnalysisFinding>();
        }
        finally
        {
            IsAnalyzing = false;
        }
    }

    /// <summary>
    /// Runs the collect + score pipeline without graph traversal.
    /// Returns raw scored facts with amplifier details for direct inspection.
    /// </summary>
    public async Task<List<Fact>> CollectAndScoreFactsAsync(int serverId, string serverName, int hoursBack = 4)
    {
        var timeRangeEnd = DateTime.UtcNow;
        var timeRangeStart = timeRangeEnd.AddHours(-hoursBack);

        var context = new AnalysisContext
        {
            ServerId = serverId,
            ServerName = serverName,
            TimeRangeStart = timeRangeStart,
            TimeRangeEnd = timeRangeEnd
        };

        try
        {
            var facts = await _collector.CollectFactsAsync(context);
            if (facts.Count == 0) return facts;
            _scorer.ScoreAll(facts);
            return facts;
        }
        catch (Exception ex)
        {
            Logger.Error($"[AnalysisService] Fact collection failed for {serverName}: {ex.Message}");
            return new List<Fact>();
        }
    }

    /// <summary>
    /// Compares analysis of two time periods, returning facts from both for comparison.
    /// On failure, logs and returns two empty lists.
    /// </summary>
    public async Task<(List<Fact> BaselineFacts, List<Fact> ComparisonFacts)> ComparePeriodsAsync(
        int serverId, string serverName,
        DateTime baselineStart, DateTime baselineEnd,
        DateTime comparisonStart, DateTime comparisonEnd)
    {
        var baselineContext = new AnalysisContext
        {
            ServerId = serverId,
            ServerName = serverName,
            TimeRangeStart = baselineStart,
            TimeRangeEnd = baselineEnd
        };

        var comparisonContext = new AnalysisContext
        {
            ServerId = serverId,
            ServerName = serverName,
            TimeRangeStart = comparisonStart,
            TimeRangeEnd = comparisonEnd
        };

        try
        {
            var baselineFacts = await _collector.CollectFactsAsync(baselineContext);
            var comparisonFacts = await _collector.CollectFactsAsync(comparisonContext);

            _scorer.ScoreAll(baselineFacts);
            _scorer.ScoreAll(comparisonFacts);

            return (baselineFacts, comparisonFacts);
        }
        catch (Exception ex)
        {
            Logger.Error($"[AnalysisService] Period comparison failed for {serverName}: {ex.Message}");
            return (new List<Fact>(), new List<Fact>());
        }
    }

    /// <summary>
    /// Gets the latest findings for a server without running a new analysis.
    /// </summary>
    public async Task<List<AnalysisFinding>> GetLatestFindingsAsync(int serverId)
    {
        return await _findingStore.GetLatestFindingsAsync(serverId);
    }

    /// <summary>
    /// Gets recent findings for a server within the given time range.
    /// </summary>
    public async Task<List<AnalysisFinding>> GetRecentFindingsAsync(int serverId, int hoursBack = 24)
    {
        return await _findingStore.GetRecentFindingsAsync(serverId, hoursBack);
    }

    /// <summary>
    /// Mutes a finding pattern so it won't appear in future runs.
    /// </summary>
    public async Task MuteFindingAsync(AnalysisFinding finding, string? reason = null)
    {
        await _findingStore.MuteStoryAsync(
            finding.ServerId, finding.StoryPathHash, finding.StoryPath, reason);
    }

    /// <summary>
    /// Cleans up old findings beyond the retention period.
    /// </summary>
    public async Task CleanupAsync(int retentionDays = 30)
    {
        await _findingStore.CleanupOldFindingsAsync(retentionDays);
    }

    /// <summary>
    /// Returns the total span of collected data (no time range filter).
    /// This answers "has this server been monitored long enough?" -- separate from
    /// the analysis window. A server with 100 hours of total history can safely
    /// be analyzed over a 4-hour window without dilution.
    /// Dashboard monitors one server per database, so no server_id filtering.
    /// </summary>
    private async Task<double> GetTotalDataSpanHoursAsync()
    {
        try
        {
            using var connection = new SqlConnection(_connectionString);
            await connection.OpenAsync();

            using var cmd = connection.CreateCommand();
            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT DATEDIFF(SECOND, MIN(collection_time), MAX(collection_time)) / 3600.0
FROM collect.wait_stats;";

            var result = await cmd.ExecuteScalarAsync();
            if (result == null || result is DBNull)
                return 0;

            return Convert.ToDouble(result);
        }
        catch
        {
            // Best-effort probe: treat any failure as "no data", which simply
            // blocks analysis until the collector has produced usable history.
            return 0;
        }
    }
}
+/// +public class AnalysisCompletedEventArgs : EventArgs +{ + public int ServerId { get; set; } + public string ServerName { get; set; } = string.Empty; + public List Findings { get; set; } = []; + public DateTime AnalysisTime { get; set; } +} diff --git a/Dashboard/Analysis/FactScorer.cs b/Dashboard/Analysis/FactScorer.cs new file mode 100644 index 0000000..8238298 --- /dev/null +++ b/Dashboard/Analysis/FactScorer.cs @@ -0,0 +1,867 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Assigns severity to facts using threshold formulas (Layer 1) +/// and contextual amplifiers (Layer 2). +/// +/// Layer 1: Base severity 0.0-1.0 from thresholds alone. +/// Layer 2: Amplifiers multiply base up to 2.0 max using corroborating facts. +/// +/// Formula: severity = min(base * (1.0 + sum(amplifiers)), 2.0) +/// +public class FactScorer +{ + /// + /// Scores all facts: Layer 1 (base severity), then Layer 2 (amplifiers). 
+ /// + public void ScoreAll(List facts) + { + // Layer 1: base severity from thresholds + foreach (var fact in facts) + { + fact.BaseSeverity = fact.Source switch + { + "waits" => ScoreWaitFact(fact), + "blocking" => ScoreBlockingFact(fact), + "cpu" => ScoreCpuFact(fact), + "io" => ScoreIoFact(fact), + "tempdb" => ScoreTempDbFact(fact), + "memory" => ScoreMemoryFact(fact), + "queries" => ScoreQueryFact(fact), + "perfmon" => ScorePerfmonFact(fact), + "database_config" => ScoreDatabaseConfigFact(fact), + "jobs" => ScoreJobFact(fact), + "disk" => ScoreDiskFact(fact), + "bad_actor" => ScoreBadActorFact(fact), + "anomaly" => ScoreAnomalyFact(fact), + _ => 0.0 + }; + } + + // Build lookup for amplifier evaluation (include context facts that amplifiers reference) + var contextSources = new HashSet + { "config", "cpu", "io", "tempdb", "memory", "queries", "perfmon", + "database_config", "jobs", "sessions", "disk", "bad_actor", "anomaly" }; + var factsByKey = facts + .Where(f => f.BaseSeverity > 0 || contextSources.Contains(f.Source)) + .ToDictionary(f => f.Key, f => f); + + // Layer 2: amplifiers boost base severity using corroborating facts + foreach (var fact in facts) + { + if (fact.BaseSeverity <= 0) + { + fact.Severity = 0; + continue; + } + + var amplifiers = GetAmplifiers(fact); + var totalBoost = 0.0; + + foreach (var amp in amplifiers) + { + var matched = amp.Predicate(factsByKey); + fact.AmplifierResults.Add(new AmplifierResult + { + Description = amp.Description, + Matched = matched, + Boost = matched ? amp.Boost : 0.0 + }); + + if (matched) totalBoost += amp.Boost; + } + + fact.Severity = Math.Min(fact.BaseSeverity * (1.0 + totalBoost), 2.0); + } + } + + /// + /// Scores a wait fact using the fraction-of-period formula. + /// Some waits have absolute minimum thresholds to filter out background noise. 
+ /// + private static double ScoreWaitFact(Fact fact) + { + var fraction = fact.Value; + if (fraction <= 0) return 0.0; + + // THREADPOOL: require both meaningful total wait time AND meaningful average. + // Tiny amounts are normal thread pool grow/shrink housekeeping, not exhaustion. + if (fact.Key == "THREADPOOL") + { + var waitTimeMs = fact.Metadata.GetValueOrDefault("wait_time_ms"); + var avgMs = fact.Metadata.GetValueOrDefault("avg_ms_per_wait"); + if (waitTimeMs < 3_600_000 || avgMs < 1_000) return 0.0; + } + + var thresholds = GetWaitThresholds(fact.Key); + if (thresholds == null) return 0.0; + + return ApplyThresholdFormula(fraction, thresholds.Value.concerning, thresholds.Value.critical); + } + + /// + /// Scores blocking/deadlock facts using events-per-hour thresholds. + /// + private static double ScoreBlockingFact(Fact fact) + { + var value = fact.Value; // events per hour + if (value <= 0) return 0.0; + + return fact.Key switch + { + // Blocking: concerning >10/hr, critical >50/hr + "BLOCKING_EVENTS" => ApplyThresholdFormula(value, 10, 50), + // Deadlocks: concerning >5/hr (no critical — any sustained deadlocking is bad) + "DEADLOCKS" => ApplyThresholdFormula(value, 5, null), + _ => 0.0 + }; + } + + /// + /// Scores CPU utilization. Value is average SQL CPU %. + /// + private static double ScoreCpuFact(Fact fact) + { + return fact.Key switch + { + // CPU %: concerning at 75%, critical at 95% + "CPU_SQL_PERCENT" => ApplyThresholdFormula(fact.Value, 75, 95), + // CPU spike: value is max CPU %. Concerning at 80%, critical at 95%. + // Only emitted when max is significantly above average (bursty). + "CPU_SPIKE" => ApplyThresholdFormula(fact.Value, 80, 95), + _ => 0.0 + }; + } + + /// + /// Scores I/O latency facts. Value is average latency in ms. 
+ /// + private static double ScoreIoFact(Fact fact) + { + return fact.Key switch + { + // Read latency: concerning at 20ms, critical at 50ms + "IO_READ_LATENCY_MS" => ApplyThresholdFormula(fact.Value, 20, 50), + // Write latency: concerning at 10ms, critical at 30ms + "IO_WRITE_LATENCY_MS" => ApplyThresholdFormula(fact.Value, 10, 30), + _ => 0.0 + }; + } + + /// + /// Scores TempDB usage. Value is usage fraction (reserved / total space). + /// + private static double ScoreTempDbFact(Fact fact) + { + return fact.Key switch + { + // TempDB usage: concerning at 75%, critical at 90% + "TEMPDB_USAGE" => ApplyThresholdFormula(fact.Value, 0.75, 0.90), + _ => 0.0 + }; + } + + /// + /// Scores memory grant facts. Only MEMORY_GRANT_PENDING (from resource semaphore) for now. + /// + private static double ScoreMemoryFact(Fact fact) + { + return fact.Key switch + { + // Grant waiters: concerning at 1, critical at 5 + "MEMORY_GRANT_PENDING" => ApplyThresholdFormula(fact.Value, 1, 5), + _ => 0.0 + }; + } + + /// + /// Scores query-level aggregate facts. + /// + private static double ScoreQueryFact(Fact fact) + { + return fact.Key switch + { + // Spills: concerning at 100, critical at 1000 in the period + "QUERY_SPILLS" => ApplyThresholdFormula(fact.Value, 100, 1000), + // High DOP queries: concerning at 5, critical at 20 in the period + "QUERY_HIGH_DOP" => ApplyThresholdFormula(fact.Value, 5, 20), + _ => 0.0 + }; + } + + /// + /// Scores perfmon counter facts. PLE is the classic memory pressure indicator. + /// + private static double ScorePerfmonFact(Fact fact) + { + return fact.Key switch + { + // PLE: lower is worse. Invert: concerning < 300, critical < 60 + "PERFMON_PLE" when fact.Value <= 0 => 1.0, + "PERFMON_PLE" when fact.Value < 60 => 1.0, + "PERFMON_PLE" when fact.Value < 300 => 0.5 + 0.5 * (300 - fact.Value) / 240, + "PERFMON_PLE" => 0.0, + _ => 0.0 + }; + } + + /// + /// Scores database configuration facts. + /// Auto-shrink and auto-close are always bad. 
+ /// RCSI-off gets a low base that only becomes visible through amplifiers + /// when reader/writer lock contention (LCK_M_S, LCK_M_IS) is present. + /// + private static double ScoreDatabaseConfigFact(Fact fact) + { + if (fact.Key != "DB_CONFIG") return 0.0; + + var autoShrink = fact.Metadata.GetValueOrDefault("auto_shrink_on_count"); + var autoClose = fact.Metadata.GetValueOrDefault("auto_close_on_count"); + var pageVerifyBad = fact.Metadata.GetValueOrDefault("page_verify_not_checksum_count"); + var rcsiOff = fact.Metadata.GetValueOrDefault("rcsi_off_count"); + + var score = 0.0; + + // Auto-shrink, auto-close, bad page verify are always concerning + if (autoShrink > 0 || autoClose > 0 || pageVerifyBad > 0) + score = Math.Max(score, Math.Min((autoShrink + autoClose + pageVerifyBad) * 0.3, 1.0)); + + // RCSI-off: low base (0.3) — below display threshold alone. + // Amplifiers for LCK_M_S/LCK_M_IS push it above 0.5 when reader/writer + // contention confirms RCSI would help. + if (rcsiOff > 0) + score = Math.Max(score, 0.3); + + return score; + } + + /// + /// Scores running job facts. Long-running jobs are a signal. + /// + private static double ScoreJobFact(Fact fact) + { + return fact.Key switch + { + // Long-running jobs: concerning at 1, critical at 3 + "RUNNING_JOBS" => ApplyThresholdFormula(fact.Value, 1, 3), + _ => 0.0 + }; + } + + /// + /// Scores disk space facts. Low free space is critical. + /// + private static double ScoreDiskFact(Fact fact) + { + if (fact.Key != "DISK_SPACE") return 0.0; + + var freePct = fact.Value; + // Invert: lower free space is worse. Critical < 5%, concerning < 10% + if (freePct < 0.05) return 1.0; + if (freePct < 0.10) return 0.5 + 0.5 * (0.10 - freePct) / 0.05; + if (freePct < 0.20) return 0.5 * (0.20 - freePct) / 0.10; + return 0.0; + } + + /// + /// Scores bad actor queries using execution count tier x per-execution impact. + /// A query running 100K times at 1ms CPU is different from 100K times at 5s CPU. 
+ /// The tier gets it in the door, per-execution impact determines how bad it is. + /// + private static double ScoreBadActorFact(Fact fact) + { + var execCount = fact.Metadata.GetValueOrDefault("execution_count"); + var avgCpuMs = fact.Metadata.GetValueOrDefault("avg_cpu_ms"); + var avgReads = fact.Metadata.GetValueOrDefault("avg_reads"); + + // Execution count tier base — higher tiers for more frequent queries + var tierBase = execCount switch + { + < 1_000 => 0.5, + < 10_000 => 0.7, + < 100_000 => 0.85, + _ => 1.0 + }; + + // Per-execution impact: use the worse of CPU or reads + // CPU: concerning at 50ms, critical at 2000ms + var cpuImpact = ApplyThresholdFormula(avgCpuMs, 50, 2000); + // Reads: concerning at 5K, critical at 250K + var readsImpact = ApplyThresholdFormula(avgReads, 5_000, 250_000); + + var impact = Math.Max(cpuImpact, readsImpact); + + // Final: tier * impact. Both must be meaningful. + // A high-frequency query with trivial per-execution cost won't score. + // A heavy query that only runs once won't score high either. + return tierBase * impact; + } + + /// + /// Scores anomaly facts based on deviation from baseline. + /// At 2σ → 0.5, at 4σ → 1.0. Higher deviations are more severe. + /// For count-based anomalies (blocking/deadlock spikes), uses ratio instead. 
+ /// + private static double ScoreAnomalyFact(Fact fact) + { + if (fact.Key.StartsWith("ANOMALY_CPU_SPIKE") || fact.Key.StartsWith("ANOMALY_READ_LATENCY") + || fact.Key.StartsWith("ANOMALY_WRITE_LATENCY")) + { + // Deviation-based scoring: 2σ = 0.5, 4σ = 1.0 + var deviation = fact.Metadata.GetValueOrDefault("deviation_sigma"); + var confidence = fact.Metadata.GetValueOrDefault("confidence", 1.0); + if (deviation < 2.0) return 0.0; + var base_score = 0.5 + 0.5 * Math.Min((deviation - 2.0) / 2.0, 1.0); + return base_score * confidence; + } + + if (fact.Key.StartsWith("ANOMALY_WAIT_")) + { + // Ratio-based scoring: 5x = 0.5, 20x = 1.0 + var ratio = fact.Metadata.GetValueOrDefault("ratio"); + if (ratio < 5) return 0.0; + return 0.5 + 0.5 * Math.Min((ratio - 5.0) / 15.0, 1.0); + } + + if (fact.Key.StartsWith("ANOMALY_BLOCKING_SPIKE") || fact.Key.StartsWith("ANOMALY_DEADLOCK_SPIKE")) + { + // Ratio-based: 3x = 0.5, 10x = 1.0 + var ratio = fact.Metadata.GetValueOrDefault("ratio"); + if (ratio < 3) return 0.0; + return 0.5 + 0.5 * Math.Min((ratio - 3.0) / 7.0, 1.0); + } + + return 0.0; + } + + /// + /// Generic threshold formula used by waits, latency, and count-based metrics. + /// Critical == null means "concerning only" — hitting concerning = 1.0. + /// + internal static double ApplyThresholdFormula(double value, double concerning, double? critical) + { + if (value <= 0) return 0.0; + + if (critical == null) + return Math.Min(value / concerning, 1.0); + + if (value >= critical.Value) + return 1.0; + + if (value >= concerning) + return 0.5 + 0.5 * (value - concerning) / (critical.Value - concerning); + + return 0.5 * (value / concerning); + } + + /// + /// Returns amplifier definitions for a fact. Each amplifier has a description, + /// a boost value, and a predicate that evaluates against the current fact set. + /// Amplifiers are defined per wait type and will grow as more fact categories are added. 
+ /// + private static List GetAmplifiers(Fact fact) + { + return fact.Key switch + { + "SOS_SCHEDULER_YIELD" => SosSchedulerYieldAmplifiers(), + "CXPACKET" => CxPacketAmplifiers(), + "THREADPOOL" => ThreadpoolAmplifiers(), + "PAGEIOLATCH_SH" or "PAGEIOLATCH_EX" => PageiolatchAmplifiers(), + "LATCH_EX" or "LATCH_SH" => LatchAmplifiers(), + "BLOCKING_EVENTS" => BlockingEventsAmplifiers(), + "DEADLOCKS" => DeadlockAmplifiers(), + "LCK" => LckAmplifiers(), + "CPU_SQL_PERCENT" => CpuSqlPercentAmplifiers(), + "CPU_SPIKE" => CpuSpikeAmplifiers(), + "IO_READ_LATENCY_MS" => IoReadLatencyAmplifiers(), + "IO_WRITE_LATENCY_MS" => IoWriteLatencyAmplifiers(), + "MEMORY_GRANT_PENDING" => MemoryGrantAmplifiers(), + "QUERY_SPILLS" => QuerySpillAmplifiers(), + "PERFMON_PLE" => PleAmplifiers(), + "DB_CONFIG" => DbConfigAmplifiers(), + "DISK_SPACE" => DiskSpaceAmplifiers(), + _ => [] + }; + } + + /// + /// SOS_SCHEDULER_YIELD: CPU starvation confirmed by parallelism waits. + /// More amplifiers added when config and CPU utilization facts are available. + /// + private static List SosSchedulerYieldAmplifiers() => + [ + new() + { + Description = "CXPACKET significant — parallelism consuming schedulers", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10) + }, + new() + { + Description = "THREADPOOL waits present — escalating to thread exhaustion", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("THREADPOOL") && facts["THREADPOOL"].BaseSeverity > 0 + }, + new() + { + Description = "SQL Server CPU > 80% — confirmed CPU saturation", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("CPU_SQL_PERCENT", out var cpu) && cpu.Value >= 80 + } + ]; + + /// + /// CXPACKET: parallelism waits confirmed by CPU pressure and bad config. + /// CXCONSUMER is grouped into CXPACKET by the collector. 
+ /// + private static List CxPacketAmplifiers() => + [ + new() + { + Description = "SOS_SCHEDULER_YIELD high — CPU starvation from parallelism", + Boost = 0.3, + Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.25) + }, + new() + { + Description = "THREADPOOL waits present — thread exhaustion cascade", + Boost = 0.4, + Predicate = facts => facts.ContainsKey("THREADPOOL") && facts["THREADPOOL"].BaseSeverity > 0 + }, + new() + { + Description = "CTFP at default (5) — too low for most workloads", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("CONFIG_CTFP", out var ctfp) && ctfp.Value <= 5 + }, + new() + { + Description = "MAXDOP at 0 — unlimited parallelism", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("CONFIG_MAXDOP", out var maxdop) && maxdop.Value == 0 + }, + new() + { + Description = "Queries running with DOP > 8 — excessive parallelism confirmed", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("QUERY_HIGH_DOP", out var dop) && dop.BaseSeverity > 0 + } + ]; + + /// + /// THREADPOOL: thread exhaustion confirmed by parallelism pressure. + /// Blocking and config amplifiers added later. + /// + private static List ThreadpoolAmplifiers() => + [ + new() + { + Description = "CXPACKET significant — parallel queries consuming thread pool", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10) + }, + new() + { + Description = "Lock contention present — blocked queries holding worker threads", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("LCK") && facts["LCK"].BaseSeverity >= 0.5 + } + ]; + + /// + /// PAGEIOLATCH: memory pressure confirmed by other waits. + /// Buffer pool, query, and config amplifiers added when those facts are available. 
+ /// + private static List PageiolatchAmplifiers() => + [ + new() + { + Description = "SOS_SCHEDULER_YIELD elevated — CPU pressure alongside I/O pressure", + Boost = 0.1, + Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.15) + }, + new() + { + Description = "Read latency > 20ms — confirmed disk I/O bottleneck", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("IO_READ_LATENCY_MS", out var io) && io.Value >= 20 + }, + new() + { + Description = "Memory grant waiters present — grants competing with buffer pool", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("MEMORY_GRANT_PENDING", out var mg) && mg.Value >= 1 + } + ]; + + /// + /// LATCH_EX/LATCH_SH: in-memory page latch contention. + /// Common causes: TempDB allocation contention, hot page updates, + /// parallel insert into heaps or narrow indexes. + /// + private static List LatchAmplifiers() => + [ + new() + { + Description = "TempDB usage elevated — latch contention likely on TempDB allocation pages", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("TEMPDB_USAGE", out var t) && t.BaseSeverity > 0 + }, + new() + { + Description = "CXPACKET significant — parallel operations amplifying latch contention", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10) + }, + new() + { + Description = "SOS_SCHEDULER_YIELD elevated — latch spinning contributing to CPU pressure", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.15) + } + ]; + + /// + /// BLOCKING_EVENTS: blocking confirmed by lock waits and deadlocks. 
+ /// + private static List BlockingEventsAmplifiers() => + [ + new() + { + Description = "Head blocker sleeping with open transaction — abandoned transaction pattern", + Boost = 0.4, + Predicate = facts => facts.TryGetValue("BLOCKING_EVENTS", out var f) + && f.Metadata.GetValueOrDefault("sleeping_blocker_count") > 0 + }, + new() + { + Description = "Lock contention waits elevated — blocking visible in wait stats", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("LCK") && facts["LCK"].BaseSeverity >= 0.3 + }, + new() + { + Description = "Deadlocks also present — blocking escalating to deadlocks", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0 + } + ]; + + /// + /// DEADLOCKS: deadlocks confirmed by blocking patterns. + /// + private static List DeadlockAmplifiers() => + [ + new() + { + Description = "Blocking events also present — systemic contention pattern", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0 + }, + new() + { + Description = "Reader/writer lock waits present — RCSI could prevent some deadlocks", + Boost = 0.3, + Predicate = facts => (facts.ContainsKey("LCK_M_S") && facts["LCK_M_S"].BaseSeverity > 0) + || (facts.ContainsKey("LCK_M_IS") && facts["LCK_M_IS"].BaseSeverity > 0) + }, + new() + { + Description = "Databases without RCSI — reader/writer isolation amplifying deadlocks", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db) && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0 + } + ]; + + /// + /// LCK (grouped general lock contention): confirmed by blocking reports and deadlocks. 
+ /// + private static List LckAmplifiers() => + [ + new() + { + Description = "Blocked process reports present — confirmed blocking events", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0 + }, + new() + { + Description = "Deadlocks present — lock contention escalating to deadlocks", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0 + }, + new() + { + Description = "THREADPOOL waits present — blocking causing thread exhaustion", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("THREADPOOL") && facts["THREADPOOL"].BaseSeverity > 0 + } + ]; + + /// + /// PLE: memory pressure confirmed by PAGEIOLATCH and RESOURCE_SEMAPHORE. + /// + private static List PleAmplifiers() => + [ + new() + { + Description = "PAGEIOLATCH waits present — buffer pool misses confirm memory pressure", + Boost = 0.3, + Predicate = facts => HasSignificantWait(facts, "PAGEIOLATCH_SH", 0.10) + || HasSignificantWait(facts, "PAGEIOLATCH_EX", 0.10) + }, + new() + { + Description = "RESOURCE_SEMAPHORE waits — memory grants competing with buffer pool", + Boost = 0.2, + Predicate = facts => facts.ContainsKey("RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].BaseSeverity > 0 + } + ]; + + /// + /// DB_CONFIG: database misconfiguration amplified by related symptoms. + /// RCSI-off amplifiers only fire when reader/writer lock contention is present — + /// LCK_M_S (shared lock waits) and LCK_M_IS (intent-shared) are readers blocked + /// by writers. RCSI eliminates these. Writer/writer conflicts (LCK_M_X, LCK_M_U) + /// are NOT helped by RCSI and should not trigger this amplifier. 
+ /// + private static List DbConfigAmplifiers() => + [ + new() + { + Description = "I/O latency elevated — auto_shrink may be causing fragmentation and I/O pressure", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("IO_READ_LATENCY_MS", out var io) && io.BaseSeverity > 0 + }, + new() + { + Description = "LCK_M_S waits — readers blocked by writers, RCSI would eliminate shared lock waits", + Boost = 0.5, + Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db) + && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0 + && facts.TryGetValue("LCK_M_S", out var lckS) && lckS.BaseSeverity > 0 + }, + new() + { + Description = "LCK_M_IS waits — intent-shared locks blocked by writers, RCSI would eliminate these", + Boost = 0.4, + Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db) + && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0 + && facts.TryGetValue("LCK_M_IS", out var lckIS) && lckIS.BaseSeverity > 0 + }, + new() + { + Description = "Deadlocks with reader/writer lock waits — RCSI eliminates reader/writer deadlocks", + Boost = 0.4, + Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db) + && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0 + && facts.TryGetValue("DEADLOCKS", out var dl) && dl.BaseSeverity > 0 + && (facts.TryGetValue("LCK_M_S", out var s) && s.BaseSeverity > 0 + || facts.TryGetValue("LCK_M_IS", out var i) && i.BaseSeverity > 0) + } + ]; + + /// + /// DISK_SPACE: low disk space amplified by I/O activity and TempDB pressure. 
+ /// + private static List DiskSpaceAmplifiers() => + [ + new() + { + Description = "TempDB usage elevated — growing TempDB on a nearly full volume", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("TEMPDB_USAGE", out var t) && t.BaseSeverity > 0 + }, + new() + { + Description = "Query spills present — spills to disk on a nearly full volume", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("QUERY_SPILLS", out var s) && s.BaseSeverity > 0 + } + ]; + + /// + /// CPU_SQL_PERCENT: CPU saturation confirmed by scheduler yields and parallelism. + /// + private static List CpuSqlPercentAmplifiers() => + [ + new() + { + Description = "SOS_SCHEDULER_YIELD elevated — scheduler pressure confirms CPU saturation", + Boost = 0.3, + Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.25) + }, + new() + { + Description = "CXPACKET significant — parallelism contributing to CPU load", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10) + } + ]; + + /// + /// CPU_SPIKE: bursty CPU event (max >> average) confirmed by scheduler + /// pressure, parallelism, or query spills during the spike. + /// + private static List CpuSpikeAmplifiers() => + [ + new() + { + Description = "SOS_SCHEDULER_YIELD present — scheduler pressure during CPU spike", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].BaseSeverity > 0 + }, + new() + { + Description = "CXPACKET significant — parallelism contributing to CPU spike", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10) + }, + new() + { + Description = "THREADPOOL waits present — CPU spike causing thread exhaustion", + Boost = 0.4, + Predicate = facts => facts.ContainsKey("THREADPOOL") && facts["THREADPOOL"].BaseSeverity > 0 + } + ]; + + /// + /// IO_READ_LATENCY_MS: read latency confirmed by PAGEIOLATCH waits. 
+ /// + private static List IoReadLatencyAmplifiers() => + [ + new() + { + Description = "PAGEIOLATCH waits elevated — buffer pool misses confirm I/O pressure", + Boost = 0.3, + Predicate = facts => HasSignificantWait(facts, "PAGEIOLATCH_SH", 0.10) + || HasSignificantWait(facts, "PAGEIOLATCH_EX", 0.10) + } + ]; + + /// + /// IO_WRITE_LATENCY_MS: write latency confirmed by WRITELOG waits. + /// + private static List IoWriteLatencyAmplifiers() => + [ + new() + { + Description = "WRITELOG waits elevated — transaction log I/O bottleneck confirmed", + Boost = 0.3, + Predicate = facts => HasSignificantWait(facts, "WRITELOG", 0.05) + } + ]; + + /// + /// MEMORY_GRANT_PENDING: grant pressure confirmed by RESOURCE_SEMAPHORE waits and spills. + /// + private static List MemoryGrantAmplifiers() => + [ + new() + { + Description = "RESOURCE_SEMAPHORE waits present — memory grant pressure in wait stats", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].BaseSeverity > 0 + }, + new() + { + Description = "Query spills present — queries running with insufficient memory grants", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("QUERY_SPILLS", out var s) && s.BaseSeverity > 0 + } + ]; + + /// + /// QUERY_SPILLS: spills confirmed by memory grant pressure. + /// + private static List QuerySpillAmplifiers() => + [ + new() + { + Description = "Memory grant waiters present — insufficient memory for query grants", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("MEMORY_GRANT_PENDING", out var mg) && mg.Value >= 1 + }, + new() + { + Description = "RESOURCE_SEMAPHORE waits — grant pressure visible in wait stats", + Boost = 0.2, + Predicate = facts => facts.ContainsKey("RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].BaseSeverity > 0 + } + ]; + + /// + /// Checks if a wait type is present with at least the given fraction of period. 
+ /// + private static bool HasSignificantWait(Dictionary facts, string waitType, double minFraction) + { + return facts.TryGetValue(waitType, out var fact) && fact.Value >= minFraction; + } + + /// + /// Default thresholds for wait types (fraction of examined period). + /// Returns null for unrecognized waits — they get severity 0. + /// + private static (double concerning, double? critical)? GetWaitThresholds(string waitType) + { + return waitType switch + { + // CPU pressure + "SOS_SCHEDULER_YIELD" => (0.75, null), + "THREADPOOL" => (0.01, null), + + // Memory pressure + "PAGEIOLATCH_SH" => (0.25, null), + "PAGEIOLATCH_EX" => (0.25, null), + "RESOURCE_SEMAPHORE" => (0.01, null), + + // Parallelism (CXCONSUMER is grouped into CXPACKET by collector) + "CXPACKET" => (0.25, null), + + // Log I/O + "WRITELOG" => (0.10, null), + + // Lock waits — serializable/repeatable read lock modes + "LCK_M_RS_S" => (0.01, null), + "LCK_M_RS_U" => (0.01, null), + "LCK_M_RIn_NL" => (0.01, null), + "LCK_M_RIn_S" => (0.01, null), + "LCK_M_RIn_U" => (0.01, null), + "LCK_M_RIn_X" => (0.01, null), + "LCK_M_RX_S" => (0.01, null), + "LCK_M_RX_U" => (0.01, null), + "LCK_M_RX_X" => (0.01, null), + + // Reader/writer blocking locks + "LCK_M_S" => (0.05, null), + "LCK_M_IS" => (0.05, null), + + // General lock contention (grouped X, U, IX, SIX, BU, etc.) + "LCK" => (0.10, null), + + // Schema locks — DDL operations, index rebuilds + "SCH_M" => (0.01, null), + + // Latch contention — page latch (not I/O latch) indicates + // in-memory contention, often TempDB allocation or hot pages + "LATCH_EX" => (0.25, null), + "LATCH_SH" => (0.25, null), + + _ => null + }; + } +} + +/// +/// An amplifier definition: a named predicate that boosts severity when matched. 
+/// +internal class AmplifierDefinition +{ + public string Description { get; set; } = string.Empty; + public double Boost { get; set; } + public Func, bool> Predicate { get; set; } = _ => false; +} diff --git a/Dashboard/Analysis/IFactCollector.cs b/Dashboard/Analysis/IFactCollector.cs new file mode 100644 index 0000000..38b6abb --- /dev/null +++ b/Dashboard/Analysis/IFactCollector.cs @@ -0,0 +1,31 @@ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Context for an analysis run — what server, what time range. +/// +public class AnalysisContext +{ + public int ServerId { get; set; } + public string ServerName { get; set; } = string.Empty; + public DateTime TimeRangeStart { get; set; } + public DateTime TimeRangeEnd { get; set; } + public List Exclusions { get; set; } = []; + + /// + /// Duration of the examined period in milliseconds. + /// + public double PeriodDurationMs => (TimeRangeEnd - TimeRangeStart).TotalMilliseconds; +} + +/// +/// Collects facts from a data source for analysis. +/// Implementations are per-app: DuckDB for Lite, SQL Server for Dashboard. +/// +public interface IFactCollector +{ + Task> CollectFactsAsync(AnalysisContext context); +} diff --git a/Dashboard/Analysis/IPlanFetcher.cs b/Dashboard/Analysis/IPlanFetcher.cs new file mode 100644 index 0000000..e77fea1 --- /dev/null +++ b/Dashboard/Analysis/IPlanFetcher.cs @@ -0,0 +1,19 @@ +using System.Threading.Tasks; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Fetches execution plan XML from SQL Server on demand. +/// Platform-agnostic interface — Lite implements via RemoteCollectorService's +/// SQL connection, Dashboard implements via DatabaseService's connection. +/// Used by DrillDownCollector to analyze plans for high-impact findings +/// without storing plan XML in DuckDB or SQL Server tables. 
+/// +public interface IPlanFetcher +{ + /// + /// Fetches the execution plan XML for a given plan_handle. + /// Returns null if the plan is no longer in cache. + /// + Task FetchPlanXmlAsync(int serverId, string planHandle); +} diff --git a/Dashboard/Analysis/InferenceEngine.cs b/Dashboard/Analysis/InferenceEngine.cs new file mode 100644 index 0000000..976bef4 --- /dev/null +++ b/Dashboard/Analysis/InferenceEngine.cs @@ -0,0 +1,165 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Security.Cryptography; +using System.Text; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Greedy traversal engine that builds analysis stories from scored facts +/// and the relationship graph. +/// +/// Algorithm: +/// 1. Start at the highest-severity fact as entry point +/// 2. Evaluate all edge predicates from current node +/// 3. Follow edge to highest-severity destination (that hasn't been visited) +/// 4. Repeat until leaf (no active edges or all destinations visited) +/// 5. The path IS the story +/// 6. Mark traversed facts as consumed, repeat from next highest-severity +/// 7. Stop when remaining facts are below 0.5 severity +/// +public class InferenceEngine +{ + private const double MinimumSeverityThreshold = 0.5; + private const int MaxPathDepth = 10; // Safety limit + + private readonly RelationshipGraph _graph; + + public InferenceEngine(RelationshipGraph graph) + { + _graph = graph; + } + + /// + /// Builds analysis stories by traversing the relationship graph + /// starting from the highest-severity facts. 
+ /// + public List BuildStories(List facts) + { + var stories = new List(); + var factsByKey = facts + .Where(f => f.Severity > 0) + .ToDictionary(f => f.Key, f => f); + var consumed = new HashSet(); + + // Process facts in severity order + var entryPoints = facts + .Where(f => f.Severity >= MinimumSeverityThreshold) + .OrderByDescending(f => f.Severity) + .ToList(); + + foreach (var entryFact in entryPoints) + { + if (consumed.Contains(entryFact.Key)) + continue; + + var path = Traverse(entryFact.Key, factsByKey, consumed); + + // Mark all facts in this path as consumed + foreach (var node in path) + consumed.Add(node); + + var story = BuildStory(path, factsByKey); + stories.Add(story); + } + + // Check for absolution — if no stories were generated at all + if (stories.Count == 0 && facts.Count > 0) + { + stories.Add(new AnalysisStory + { + RootFactKey = "server_health", + RootFactValue = 0, + Severity = 0, + Confidence = 1.0, + Category = "absolution", + Path = ["server_health"], + StoryPath = "server_health", + StoryPathHash = ComputeHash("server_health"), + StoryText = string.Empty, + IsAbsolution = true + }); + } + + return stories; + } + + /// + /// Greedy traversal from an entry point through the relationship graph. + /// Returns the path as a list of fact keys. 
+ /// </summary>
+ private List<string> Traverse(string startKey,
+ Dictionary<string, Fact> factsByKey,
+ HashSet<string> consumed)
+ {
+ var path = new List<string> { startKey };
+ var visited = new HashSet<string> { startKey };
+ var current = startKey;
+
+ for (var depth = 0; depth < MaxPathDepth; depth++)
+ {
+ var activeEdges = _graph.GetActiveEdges(current, factsByKey);
+
+ // Filter to destinations not already in this path and not consumed by prior stories
+ var candidates = activeEdges
+ .Where(e => !visited.Contains(e.Destination) && !consumed.Contains(e.Destination))
+ .Where(e => factsByKey.ContainsKey(e.Destination))
+ .OrderByDescending(e => factsByKey[e.Destination].Severity)
+ .ToList();
+
+ if (candidates.Count == 0)
+ break; // Leaf node — no more edges to follow
+
+ var best = candidates[0];
+ path.Add(best.Destination);
+ visited.Add(best.Destination);
+ current = best.Destination;
+ }
+
+ return path;
+ }
+
+ /// <summary>
+ /// Builds an AnalysisStory from a traversal path.
+ /// </summary>
+ private static AnalysisStory BuildStory(List<string> path, Dictionary<string, Fact> factsByKey)
+ {
+ var rootFact = factsByKey.GetValueOrDefault(path[0]);
+ var leafKey = path.Count > 1 ? path[^1] : null;
+ var leafFact = leafKey != null ? factsByKey.GetValueOrDefault(leafKey) : null;
+
+ var storyPath = string.Join(" → ", path);
+ var category = rootFact?.Source ?? "unknown";
+
+ // Confidence = what fraction of edge destinations had matching facts
+ // For single-node paths, confidence is 1.0 (we found the symptom, just no deeper cause)
+ var confidence = path.Count == 1 ? 1.0 : (path.Count - 1.0) / path.Count;
+
+ return new AnalysisStory
+ {
+ RootFactKey = path[0],
+ RootFactValue = rootFact?.Value ?? 0,
+ Severity = rootFact?.Severity ?? 
0,
+ Confidence = confidence,
+ Category = category,
+ Path = path,
+ StoryPath = storyPath,
+ StoryPathHash = ComputeHash(storyPath),
+ StoryText = string.Empty,
+ LeafFactKey = leafKey,
+ LeafFactValue = leafFact?.Value,
+ FactCount = path.Count,
+ IsAbsolution = false
+ };
+ }
+
+ /// <summary>
+ /// Stable hash for story path deduplication and muting.
+ /// </summary>
+ private static string ComputeHash(string storyPath)
+ {
+ var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(storyPath));
+ return Convert.ToHexString(bytes).ToLowerInvariant()[..16];
+ }
+}
diff --git a/Dashboard/Analysis/RelationshipGraph.cs b/Dashboard/Analysis/RelationshipGraph.cs
new file mode 100644
index 0000000..2650a7b
--- /dev/null
+++ b/Dashboard/Analysis/RelationshipGraph.cs
@@ -0,0 +1,325 @@
+using System.Collections.Generic;
+using System.Linq;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
+/// <summary>
+/// Defines conditional edges between facts. The graph encodes Erik's diagnostic
+/// reasoning: "when I see symptom X, what do I check next?"
+///
+/// Edges are code-defined (not data-driven) because they represent expert knowledge.
+/// Each edge has a predicate that evaluates against the current fact set to decide
+/// if the edge should be followed.
+///
+/// Built incrementally — new edges are added as new fact categories become available.
+/// </summary>
+public class RelationshipGraph
+{
+ private readonly Dictionary<string, List<Edge>> _edges = new();
+
+ public RelationshipGraph()
+ {
+ BuildGraph();
+ }
+
+ /// <summary>
+ /// Returns all edges originating from the given fact key,
+ /// filtered to only those whose predicates are true.
+ /// </summary>
+ public List<Edge> GetActiveEdges(string sourceKey, IReadOnlyDictionary<string, Fact> factsByKey)
+ {
+ if (!_edges.TryGetValue(sourceKey, out var edges))
+ return [];
+
+ return edges.Where(e => e.Predicate(factsByKey)).ToList();
+ }
+
+ /// <summary>
+ /// Returns all defined edges from a source (regardless of predicate).
+ /// Used for audit trail logging. 
+ /// + public List GetAllEdges(string sourceKey) + { + return _edges.TryGetValue(sourceKey, out var edges) ? edges : []; + } + + private void AddEdge(string source, string destination, string category, + string predicateDescription, System.Func, bool> predicate) + { + if (!_edges.ContainsKey(source)) + _edges[source] = []; + + _edges[source].Add(new Edge + { + Source = source, + Destination = destination, + Category = category, + PredicateDescription = predicateDescription, + Predicate = predicate + }); + } + + /// + /// Builds all edges in the relationship graph. + /// Organized by entry point category matching the design doc. + /// + private void BuildGraph() + { + BuildCpuPressureEdges(); + BuildMemoryPressureEdges(); + BuildBlockingEdges(); + BuildIoPressureEdges(); + BuildLatchEdges(); + BuildTempDbEdges(); + BuildQueryEdges(); + } + + /* ── CPU Pressure ── */ + + private void BuildCpuPressureEdges() + { + // SOS_SCHEDULER_YIELD → CXPACKET (parallelism contributing to CPU) + AddEdge("SOS_SCHEDULER_YIELD", "CXPACKET", "cpu_pressure", + "CXPACKET significant — parallelism consuming schedulers", + facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5); + + // SOS_SCHEDULER_YIELD → THREADPOOL (escalating to thread exhaustion) + AddEdge("SOS_SCHEDULER_YIELD", "THREADPOOL", "cpu_pressure", + "THREADPOOL waits present — escalating to thread exhaustion", + facts => HasFact(facts, "THREADPOOL") && facts["THREADPOOL"].Severity > 0); + + // CXPACKET → SOS (CPU starvation from parallelism) + AddEdge("CXPACKET", "SOS_SCHEDULER_YIELD", "parallelism", + "SOS_SCHEDULER_YIELD elevated — CPU starvation from parallelism", + facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Value >= 0.25); + + // CXPACKET → THREADPOOL (thread exhaustion cascade) + AddEdge("CXPACKET", "THREADPOOL", "parallelism", + "THREADPOOL waits present — thread exhaustion cascade", + facts => HasFact(facts, "THREADPOOL") && facts["THREADPOOL"].Severity > 0); + 
+ // THREADPOOL → CXPACKET (parallel queries consuming thread pool) + AddEdge("THREADPOOL", "CXPACKET", "thread_exhaustion", + "CXPACKET significant — parallel queries consuming thread pool", + facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5); + + // THREADPOOL → LCK (blocking causing thread buildup — stuck queries holding threads) + AddEdge("THREADPOOL", "LCK", "thread_exhaustion", + "Lock contention — blocked queries holding worker threads", + facts => HasFact(facts, "LCK") && facts["LCK"].Severity >= 0.5); + + // CPU_SQL_PERCENT → SOS_SCHEDULER_YIELD (CPU confirms scheduler pressure) + AddEdge("CPU_SQL_PERCENT", "SOS_SCHEDULER_YIELD", "cpu_pressure", + "Scheduler yields confirm CPU saturation", + facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Severity >= 0.5); + + // CPU_SQL_PERCENT → CXPACKET (CPU load from parallelism) + AddEdge("CPU_SQL_PERCENT", "CXPACKET", "cpu_pressure", + "Parallelism waits contributing to CPU load", + facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5); + + // SOS_SCHEDULER_YIELD → CPU_SQL_PERCENT (scheduler yields with high CPU) + AddEdge("SOS_SCHEDULER_YIELD", "CPU_SQL_PERCENT", "cpu_pressure", + "SQL CPU > 80% — confirms CPU is the bottleneck", + facts => HasFact(facts, "CPU_SQL_PERCENT") && facts["CPU_SQL_PERCENT"].Value >= 80); + + // CPU_SPIKE → SOS_SCHEDULER_YIELD (spike confirmed by scheduler pressure) + AddEdge("CPU_SPIKE", "SOS_SCHEDULER_YIELD", "cpu_spike", + "Scheduler yields — CPU spike caused scheduler starvation", + facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].BaseSeverity > 0); + + // CPU_SPIKE → CXPACKET (spike from parallelism) + AddEdge("CPU_SPIKE", "CXPACKET", "cpu_spike", + "Parallelism waits — parallel queries contributing to CPU spike", + facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.3); + } + + /* ── Memory Pressure ── */ + + private void BuildMemoryPressureEdges() + { + // 
PAGEIOLATCH_SH → RESOURCE_SEMAPHORE (memory grants contributing to buffer pressure) + AddEdge("PAGEIOLATCH_SH", "RESOURCE_SEMAPHORE", "memory_pressure", + "RESOURCE_SEMAPHORE present — memory grants competing with buffer pool", + facts => HasFact(facts, "RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].Severity > 0); + + // PAGEIOLATCH_EX → same + AddEdge("PAGEIOLATCH_EX", "RESOURCE_SEMAPHORE", "memory_pressure", + "RESOURCE_SEMAPHORE present — memory grants competing with buffer pool", + facts => HasFact(facts, "RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].Severity > 0); + + // RESOURCE_SEMAPHORE → PAGEIOLATCH (downstream I/O cascade) + AddEdge("RESOURCE_SEMAPHORE", "PAGEIOLATCH_SH", "memory_grants", + "PAGEIOLATCH elevated — memory grant pressure causing buffer pool shrinkage", + facts => HasFact(facts, "PAGEIOLATCH_SH") && facts["PAGEIOLATCH_SH"].Severity >= 0.5); + + // RESOURCE_SEMAPHORE → MEMORY_GRANT_PENDING (grant pressure confirmed by semaphore waiters) + AddEdge("RESOURCE_SEMAPHORE", "MEMORY_GRANT_PENDING", "memory_grants", + "Memory grant waiters present — queries queued for memory", + facts => HasFact(facts, "MEMORY_GRANT_PENDING") && facts["MEMORY_GRANT_PENDING"].BaseSeverity > 0); + + // RESOURCE_SEMAPHORE → QUERY_SPILLS (grant pressure causing spills) + AddEdge("RESOURCE_SEMAPHORE", "QUERY_SPILLS", "memory_grants", + "Query spills present — queries running with insufficient memory", + facts => HasFact(facts, "QUERY_SPILLS") && facts["QUERY_SPILLS"].BaseSeverity > 0); + + // MEMORY_GRANT_PENDING → RESOURCE_SEMAPHORE (waiters confirm RESOURCE_SEMAPHORE waits) + AddEdge("MEMORY_GRANT_PENDING", "RESOURCE_SEMAPHORE", "memory_grants", + "RESOURCE_SEMAPHORE waits — grant pressure visible in wait stats", + facts => HasFact(facts, "RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].Severity > 0); + + // MEMORY_GRANT_PENDING → QUERY_SPILLS (insufficient grants causing spills) + AddEdge("MEMORY_GRANT_PENDING", "QUERY_SPILLS", "memory_grants", + 
"Query spills — queries getting insufficient memory grants", + facts => HasFact(facts, "QUERY_SPILLS") && facts["QUERY_SPILLS"].BaseSeverity > 0); + + // PAGEIOLATCH_SH → IO_READ_LATENCY_MS (buffer miss confirmed by disk latency) + AddEdge("PAGEIOLATCH_SH", "IO_READ_LATENCY_MS", "memory_pressure", + "Read latency elevated — disk confirms buffer pool pressure", + facts => HasFact(facts, "IO_READ_LATENCY_MS") && facts["IO_READ_LATENCY_MS"].BaseSeverity > 0); + + // PAGEIOLATCH_EX → IO_READ_LATENCY_MS + AddEdge("PAGEIOLATCH_EX", "IO_READ_LATENCY_MS", "memory_pressure", + "Read latency elevated — disk confirms buffer pool pressure", + facts => HasFact(facts, "IO_READ_LATENCY_MS") && facts["IO_READ_LATENCY_MS"].BaseSeverity > 0); + } + + /* ── Blocking & Deadlocking ── */ + + private void BuildBlockingEdges() + { + // LCK → BLOCKING_EVENTS (lock waits confirmed by actual blocking reports) + AddEdge("LCK", "BLOCKING_EVENTS", "lock_contention", + "Blocked process reports present — confirmed blocking events", + facts => HasFact(facts, "BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0); + + // LCK → DEADLOCKS (lock contention escalating) + AddEdge("LCK", "DEADLOCKS", "lock_contention", + "Deadlocks present — lock contention escalating to deadlocks", + facts => HasFact(facts, "DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0); + + // BLOCKING_EVENTS → LCK (blocking confirmed by lock waits) + AddEdge("BLOCKING_EVENTS", "LCK", "blocking", + "Lock contention waits elevated — blocking visible in wait stats", + facts => HasFact(facts, "LCK") && facts["LCK"].Severity >= 0.5); + + // BLOCKING_EVENTS → DEADLOCKS (blocking escalating) + AddEdge("BLOCKING_EVENTS", "DEADLOCKS", "blocking", + "Deadlocks also present — blocking escalating to deadlocks", + facts => HasFact(facts, "DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0); + + // BLOCKING_EVENTS → THREADPOOL (blocking causing thread exhaustion) + AddEdge("BLOCKING_EVENTS", "THREADPOOL", "blocking", + "THREADPOOL 
waits present — blocked queries consuming worker threads", + facts => HasFact(facts, "THREADPOOL") && facts["THREADPOOL"].Severity > 0); + + // DEADLOCKS → BLOCKING_EVENTS (deadlocks with systemic blocking) + AddEdge("DEADLOCKS", "BLOCKING_EVENTS", "deadlocking", + "Blocking events also present — systemic contention pattern", + facts => HasFact(facts, "BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0); + + // DEADLOCKS → LCK_M_S (reader/writer deadlocks) + AddEdge("DEADLOCKS", "LCK_M_S", "deadlocking", + "Reader lock waits present — RCSI could prevent reader/writer deadlocks", + facts => HasFact(facts, "LCK_M_S") && facts["LCK_M_S"].BaseSeverity > 0); + + // LCK_M_S → DB_CONFIG (reader/writer contention → RCSI recommendation) + AddEdge("LCK_M_S", "DB_CONFIG", "lock_contention", + "Databases without RCSI — readers blocked by writers could be eliminated", + facts => HasFact(facts, "DB_CONFIG") + && facts["DB_CONFIG"].Metadata.GetValueOrDefault("rcsi_off_count") > 0 + && facts["DB_CONFIG"].BaseSeverity > 0); + + // DB_CONFIG → LCK_M_S (RCSI-off confirmed by reader/writer lock contention) + AddEdge("DB_CONFIG", "LCK_M_S", "config_issue", + "LCK_M_S waits — readers blocked by writers, RCSI would eliminate these", + facts => HasFact(facts, "LCK_M_S") && facts["LCK_M_S"].BaseSeverity > 0 + && HasFact(facts, "DB_CONFIG") + && facts["DB_CONFIG"].Metadata.GetValueOrDefault("rcsi_off_count") > 0); + + // THREADPOOL → BLOCKING_EVENTS (blocking causing thread buildup) + AddEdge("THREADPOOL", "BLOCKING_EVENTS", "thread_exhaustion", + "Blocking events present — blocked queries holding worker threads", + facts => HasFact(facts, "BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0); + } + + /* ── I/O Pressure ── */ + + private void BuildIoPressureEdges() + { + // IO_READ_LATENCY_MS → PAGEIOLATCH_SH (disk latency with buffer pool misses) + AddEdge("IO_READ_LATENCY_MS", "PAGEIOLATCH_SH", "io_pressure", + "PAGEIOLATCH waits — buffer pool misses driving read 
I/O", + facts => HasFact(facts, "PAGEIOLATCH_SH") && facts["PAGEIOLATCH_SH"].Severity >= 0.5); + + // IO_WRITE_LATENCY_MS → WRITELOG (write latency with log waits) + AddEdge("IO_WRITE_LATENCY_MS", "WRITELOG", "io_pressure", + "WRITELOG waits — transaction log I/O bottleneck", + facts => HasFact(facts, "WRITELOG") && facts["WRITELOG"].Severity > 0); + + // WRITELOG → IO_WRITE_LATENCY_MS (log waits confirmed by disk latency) + AddEdge("WRITELOG", "IO_WRITE_LATENCY_MS", "log_io", + "Write latency elevated — disk confirms log I/O bottleneck", + facts => HasFact(facts, "IO_WRITE_LATENCY_MS") && facts["IO_WRITE_LATENCY_MS"].BaseSeverity > 0); + } + + /* ── Latch Contention ── */ + + private void BuildLatchEdges() + { + // LATCH_EX → TEMPDB_USAGE (latch contention often from TempDB allocation) + AddEdge("LATCH_EX", "TEMPDB_USAGE", "latch_contention", + "TempDB usage — latch contention may be on TempDB allocation pages", + facts => HasFact(facts, "TEMPDB_USAGE") && facts["TEMPDB_USAGE"].BaseSeverity > 0); + + // LATCH_EX → CXPACKET (parallel operations amplifying latch contention) + AddEdge("LATCH_EX", "CXPACKET", "latch_contention", + "Parallelism waits — parallel operations amplifying page latch contention", + facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5); + } + + /* ── TempDB ── */ + + private void BuildTempDbEdges() + { + // TEMPDB_USAGE → PAGEIOLATCH_SH (tempdb pressure causing I/O) + AddEdge("TEMPDB_USAGE", "PAGEIOLATCH_SH", "tempdb_pressure", + "PAGEIOLATCH waits — TempDB pressure contributing to I/O", + facts => HasFact(facts, "PAGEIOLATCH_SH") && facts["PAGEIOLATCH_SH"].Severity >= 0.5); + + // TEMPDB_USAGE → QUERY_SPILLS (spills consuming tempdb) + AddEdge("TEMPDB_USAGE", "QUERY_SPILLS", "tempdb_pressure", + "Query spills — spilling to TempDB consuming space", + facts => HasFact(facts, "QUERY_SPILLS") && facts["QUERY_SPILLS"].BaseSeverity > 0); + } + + /* ── Query-Level ── */ + + private void BuildQueryEdges() + { + // QUERY_SPILLS → 
MEMORY_GRANT_PENDING (spills from insufficient grants) + AddEdge("QUERY_SPILLS", "MEMORY_GRANT_PENDING", "query_performance", + "Memory grant waiters — spills caused by insufficient memory grants", + facts => HasFact(facts, "MEMORY_GRANT_PENDING") && facts["MEMORY_GRANT_PENDING"].BaseSeverity > 0); + + // QUERY_SPILLS → TEMPDB_USAGE (spills consuming tempdb space) + AddEdge("QUERY_SPILLS", "TEMPDB_USAGE", "query_performance", + "TempDB usage elevated — spills consuming TempDB space", + facts => HasFact(facts, "TEMPDB_USAGE") && facts["TEMPDB_USAGE"].BaseSeverity > 0); + + // QUERY_HIGH_DOP → CXPACKET (high-DOP queries causing parallelism waits) + AddEdge("QUERY_HIGH_DOP", "CXPACKET", "query_performance", + "CXPACKET waits — high-DOP queries causing excessive parallelism", + facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5); + + // QUERY_HIGH_DOP → SOS_SCHEDULER_YIELD (high-DOP queries causing CPU pressure) + AddEdge("QUERY_HIGH_DOP", "SOS_SCHEDULER_YIELD", "query_performance", + "Scheduler yields — high-DOP queries saturating CPU", + facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Severity >= 0.5); + } + + private static bool HasFact(IReadOnlyDictionary facts, string key) + { + return facts.ContainsKey(key); + } +} diff --git a/Dashboard/Analysis/SqlServerAnomalyDetector.cs b/Dashboard/Analysis/SqlServerAnomalyDetector.cs new file mode 100644 index 0000000..bdf6664 --- /dev/null +++ b/Dashboard/Analysis/SqlServerAnomalyDetector.cs @@ -0,0 +1,543 @@ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; +using Microsoft.Data.SqlClient; +using PerformanceMonitorDashboard.Helpers; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Detects anomalies by comparing the analysis window's metrics against a +/// baseline period. When a metric deviates significantly from baseline +/// (mean + standard deviation), an ANOMALY fact is emitted. 
/// This is the "oh shit" mode -- detecting acute deviations that don't show
/// up in aggregate analysis because they're brief. A 5-minute CPU spike
/// that averages out over 4 hours is invisible to aggregate scoring but
/// obvious when compared against "what was this metric doing before?"
///
/// Baseline selection: uses the 24 hours preceding the analysis window.
/// If less data is available, uses whatever exists with lower confidence.
///
/// Port of Lite's AnomalyDetector -- uses SQL Server collect.* tables instead of DuckDB views.
/// No server_id filtering -- Dashboard monitors one server per database.
/// </summary>
public class SqlServerAnomalyDetector
{
    private readonly string _connectionString;

    /// <summary>
    /// Minimum number of baseline samples needed for reliable detection.
    /// Below this, anomalies are still detected but with reduced confidence.
    /// </summary>
    private const int MinBaselineSamples = 10;

    /// <summary>
    /// Number of standard deviations above baseline mean to flag as anomalous.
    /// </summary>
    private const double DeviationThreshold = 2.0;

    public SqlServerAnomalyDetector(string connectionString)
    {
        _connectionString = connectionString;
    }

    /// <summary>
    /// Detects anomalies by comparing the analysis window against a baseline period.
    /// Returns anomaly facts to be merged into the main fact list.
    /// </summary>
    public async Task<List<Fact>> DetectAnomaliesAsync(AnalysisContext context)
    {
        var anomalies = new List<Fact>();

        // Baseline: 24 hours preceding the analysis window
        var baselineEnd = context.TimeRangeStart;
        var baselineStart = baselineEnd.AddHours(-24);

        // Check if baseline period has any data at all -- if not, skip all anomaly detection.
        // Without baseline data, everything looks anomalous.
        if (!await HasBaselineDataAsync(baselineStart, baselineEnd))
            return anomalies;

        await DetectCpuAnomalies(context, baselineStart, baselineEnd, anomalies);
        await DetectWaitAnomalies(context, baselineStart, baselineEnd, anomalies);
        await DetectBlockingAnomalies(context, baselineStart, baselineEnd, anomalies);
        await DetectIoAnomalies(context, baselineStart, baselineEnd, anomalies);

        return anomalies;
    }

    /// <summary>
    /// Checks if the baseline period has any collected data.
    /// Uses wait_stats as canary -- if waits are collected, other data is too.
    /// Returns false (fail closed) on any error: no baseline visibility means
    /// anomaly detection is skipped entirely rather than flagging everything.
    /// </summary>
    private async Task<bool> HasBaselineDataAsync(DateTime baselineStart, DateTime baselineEnd)
    {
        try
        {
            using var connection = new SqlConnection(_connectionString);
            await connection.OpenAsync();

            using var cmd = connection.CreateCommand();
            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    (SELECT COUNT(*) FROM collect.wait_stats
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd)
  + (SELECT COUNT(*) FROM collect.cpu_utilization_stats
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd);";

            cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
            cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));

            var count = Convert.ToInt64(await cmd.ExecuteScalarAsync() ?? 0);
            return count > 0;
        }
        catch (Exception ex)
        {
            // Previously a bare `catch { return false; }` -- swallowing without a trace,
            // unlike every sibling detector. Still fail closed, but log why.
            Logger.Error($"[SqlServerAnomalyDetector] Baseline data check failed: {ex.Message}");
            return false;
        }
    }

    /// <summary>
    /// Detects CPU utilization anomalies by comparing the analysis window's peak
    /// against the baseline distribution (mean + stddev). Emits ANOMALY_CPU_SPIKE
    /// when the peak is >= DeviationThreshold sigma above baseline AND at least 50%
    /// absolute (low absolute values are never flagged).
    /// </summary>
    private async Task DetectCpuAnomalies(AnalysisContext context,
        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
    {
        try
        {
            using var connection = new SqlConnection(_connectionString);
            await connection.OpenAsync();

            // Get baseline stats
            using var baselineCmd = connection.CreateCommand();
            baselineCmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS mean_cpu,
    STDEV(CAST(sqlserver_cpu_utilization AS FLOAT)) AS stddev_cpu,
    COUNT(*) AS sample_count
FROM collect.cpu_utilization_stats
WHERE collection_time >= @baselineStart
AND collection_time < @baselineEnd;";

            baselineCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
            baselineCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));

            double baselineMean = 0, baselineStdDev = 0;
            long baselineSamples = 0;

            // Reader must be disposed before issuing the next command on this connection.
            using (var reader = await baselineCmd.ExecuteReaderAsync())
            {
                if (await reader.ReadAsync())
                {
                    baselineMean = reader.IsDBNull(0) ? 0 : Convert.ToDouble(reader.GetValue(0));
                    baselineStdDev = reader.IsDBNull(1) ? 0 : Convert.ToDouble(reader.GetValue(1));
                    baselineSamples = reader.IsDBNull(2) ? 0 : Convert.ToInt64(reader.GetValue(2));
                }
            }

            // Need at least 3 samples and non-degenerate variance for a sigma test.
            if (baselineSamples < 3 || baselineStdDev <= 0) return;

            // Get peak and average in the analysis window
            using var windowCmd = connection.CreateCommand();
            windowCmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    MAX(sqlserver_cpu_utilization) AS peak_cpu,
    AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS avg_cpu,
    COUNT(*) AS sample_count,
    (SELECT TOP 1 collection_time FROM collect.cpu_utilization_stats
     WHERE collection_time >= @windowStart AND collection_time < @windowEnd
     ORDER BY sqlserver_cpu_utilization DESC) AS peak_time
FROM collect.cpu_utilization_stats
WHERE collection_time >= @windowStart
AND collection_time < @windowEnd;";

            windowCmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
            windowCmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));

            using var windowReader = await windowCmd.ExecuteReaderAsync();
            if (!await windowReader.ReadAsync()) return;

            var peakCpu = windowReader.IsDBNull(0) ? 0.0 : Convert.ToDouble(windowReader.GetValue(0));
            var avgCpu = windowReader.IsDBNull(1) ? 0.0 : Convert.ToDouble(windowReader.GetValue(1));
            var windowSamples = windowReader.IsDBNull(2) ? 0L : Convert.ToInt64(windowReader.GetValue(2));
            var peakTime = windowReader.IsDBNull(3) ? (DateTime?)null : windowReader.GetDateTime(3);

            if (windowSamples == 0) return;

            // Check if peak deviates significantly from baseline
            var deviation = (peakCpu - baselineMean) / baselineStdDev;
            if (deviation < DeviationThreshold || peakCpu < 50) return; // Don't flag low absolute values

            // Confidence scales linearly until MinBaselineSamples is reached.
            var confidence = baselineSamples >= MinBaselineSamples
                ? 1.0
                : (double)baselineSamples / MinBaselineSamples;

            anomalies.Add(new Fact
            {
                Source = "anomaly",
                Key = "ANOMALY_CPU_SPIKE",
                Value = peakCpu,
                ServerId = context.ServerId,
                Metadata = new Dictionary<string, double>
                {
                    ["peak_cpu"] = peakCpu,
                    ["avg_cpu_in_window"] = avgCpu,
                    ["baseline_mean"] = baselineMean,
                    ["baseline_stddev"] = baselineStdDev,
                    ["deviation_sigma"] = deviation,
                    ["baseline_samples"] = baselineSamples,
                    ["window_samples"] = windowSamples,
                    ["confidence"] = confidence,
                    ["peak_time_ticks"] = peakTime?.Ticks ?? 0
                }
            });
        }
        catch (Exception ex)
        {
            Logger.Error($"[SqlServerAnomalyDetector] CPU anomaly detection failed: {ex.Message}");
        }
    }

    /// <summary>
    /// Detects wait stat anomalies -- significant waits in the analysis window
    /// that were absent or much lower in the baseline. Rates are normalized to
    /// per-hour before comparing because the two windows differ in length.
    /// </summary>
    private async Task DetectWaitAnomalies(AnalysisContext context,
        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
    {
        try
        {
            using var connection = new SqlConnection(_connectionString);
            await connection.OpenAsync();

            // Check if baseline has any wait data at all -- if not, skip
            using var checkCmd = connection.CreateCommand();
            checkCmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT COUNT(*) FROM collect.wait_stats
WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd;";

            checkCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
            checkCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));

            var baselineCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0);
            if (baselineCount == 0) return;

            // Get per-wait-type totals in both windows
            using var cmd = connection.CreateCommand();
            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH baseline AS (
    SELECT wait_type,
           CAST(SUM(wait_time_ms_delta) AS BIGINT) AS total_ms
    FROM collect.wait_stats
    WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd
    AND wait_time_ms_delta > 0
    GROUP BY wait_type
),
current_window AS (
    SELECT wait_type,
           CAST(SUM(wait_time_ms_delta) AS BIGINT) AS total_ms
    FROM collect.wait_stats
    WHERE collection_time >= @windowStart AND collection_time <= @windowEnd
    AND wait_time_ms_delta > 0
    GROUP BY wait_type
)
SELECT TOP 10
    c.wait_type,
    c.total_ms AS current_ms,
    COALESCE(b.total_ms, 0) AS baseline_ms
FROM current_window c
LEFT JOIN baseline b ON c.wait_type = b.wait_type
WHERE c.total_ms > 10000 -- At least 10 seconds of wait time
ORDER BY c.total_ms DESC;";

            cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
            cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
            cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
            cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));

            // Normalize to per-hour rates before comparing (windows are different lengths).
            // Hoisted out of the row loop -- these are invariant across rows.
            var baselineHours = (baselineEnd - baselineStart).TotalHours;
            var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours;
            if (baselineHours <= 0) baselineHours = 1;
            if (currentHours <= 0) currentHours = 1;

            using var reader = await cmd.ExecuteReaderAsync();
            while (await reader.ReadAsync())
            {
                var waitType = reader.GetString(0);
                var currentMs = Convert.ToInt64(reader.GetValue(1));
                var baselineMs = Convert.ToInt64(reader.GetValue(2));

                double ratio;
                string anomalyType;

                if (baselineMs == 0)
                {
                    // Wait type is brand new in the window: flag only if > 1 minute total.
                    ratio = currentMs > 60_000 ? 100.0 : 0;
                    anomalyType = "new";
                }
                else
                {
                    var baselineRate = baselineMs / baselineHours;
                    var currentRate = currentMs / currentHours;
                    ratio = baselineRate > 0 ? currentRate / baselineRate : 100.0;
                    anomalyType = "spike";
                }

                if (ratio < 5.0) continue; // Need at least 5x increase

                anomalies.Add(new Fact
                {
                    Source = "anomaly",
                    Key = $"ANOMALY_WAIT_{waitType}",
                    Value = currentMs,
                    ServerId = context.ServerId,
                    Metadata = new Dictionary<string, double>
                    {
                        ["current_ms"] = currentMs,
                        ["baseline_ms"] = baselineMs,
                        ["ratio"] = ratio,
                        ["is_new"] = anomalyType == "new" ? 1 : 0
                    }
                });
            }
        }
        catch (Exception ex)
        {
            Logger.Error($"[SqlServerAnomalyDetector] Wait anomaly detection failed: {ex.Message}");
        }
    }

    /// <summary>
    /// Detects blocking/deadlock anomalies -- events in the analysis window
    /// that are significantly above baseline rates (or newly appearing).
    /// </summary>
    private async Task DetectBlockingAnomalies(AnalysisContext context,
        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
    {
        try
        {
            using var connection = new SqlConnection(_connectionString);
            await connection.OpenAsync();

            // Check if baseline period has any data at all
            using var checkCmd = connection.CreateCommand();
            checkCmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd)
  + (SELECT COUNT(*) FROM collect.deadlocks
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd)
  + (SELECT COUNT(*) FROM collect.wait_stats
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd);";

            checkCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
            checkCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));

            var baselineDataCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0);
            if (baselineDataCount == 0) return; // No baseline data = can't detect anomaly

            using var cmd = connection.CreateCommand();
            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd) AS baseline_blocking,
    (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport
     WHERE collection_time >= @windowStart AND collection_time <= @windowEnd) AS current_blocking,
    (SELECT COUNT(*) FROM collect.deadlocks
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd) AS baseline_deadlocks,
    (SELECT COUNT(*) FROM collect.deadlocks
     WHERE collection_time >= @windowStart AND collection_time <= @windowEnd) AS current_deadlocks;";

            cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
            cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
            cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
            cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));

            using var reader = await cmd.ExecuteReaderAsync();
            if (!await reader.ReadAsync()) return;

            var baselineBlocking = Convert.ToInt64(reader.GetValue(0));
            var currentBlocking = Convert.ToInt64(reader.GetValue(1));
            var baselineDeadlocks = Convert.ToInt64(reader.GetValue(2));
            var currentDeadlocks = Convert.ToInt64(reader.GetValue(3));

            // Normalize to per-hour rates (windows are different lengths)
            var baselineHours = (baselineEnd - baselineStart).TotalHours;
            var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours;
            if (baselineHours <= 0) baselineHours = 1;
            if (currentHours <= 0) currentHours = 1;

            var baselineBlockingRate = baselineBlocking / baselineHours;
            var currentBlockingRate = currentBlocking / currentHours;
            var blockingRatio = baselineBlocking > 0 ? currentBlockingRate / baselineBlockingRate : 100.0;

            var baselineDeadlockRate = baselineDeadlocks / baselineHours;
            var currentDeadlockRate = currentDeadlocks / currentHours;
            var deadlockRatio = baselineDeadlocks > 0 ? currentDeadlockRate / baselineDeadlockRate : 100.0;

            // Blocking spike: at least 5 events AND 3x baseline rate (or new)
            if (currentBlocking >= 5 && (baselineBlocking == 0 || blockingRatio >= 3))
            {
                anomalies.Add(new Fact
                {
                    Source = "anomaly",
                    Key = "ANOMALY_BLOCKING_SPIKE",
                    Value = currentBlocking,
                    ServerId = context.ServerId,
                    Metadata = new Dictionary<string, double>
                    {
                        ["current_count"] = currentBlocking,
                        ["baseline_count"] = baselineBlocking,
                        ["ratio"] = blockingRatio
                    }
                });
            }

            // Deadlock spike: at least 3 events AND 3x baseline rate (or new)
            if (currentDeadlocks >= 3 && (baselineDeadlocks == 0 || deadlockRatio >= 3))
            {
                anomalies.Add(new Fact
                {
                    Source = "anomaly",
                    Key = "ANOMALY_DEADLOCK_SPIKE",
                    Value = currentDeadlocks,
                    ServerId = context.ServerId,
                    Metadata = new Dictionary<string, double>
                    {
                        ["current_count"] = currentDeadlocks,
                        ["baseline_count"] = baselineDeadlocks,
                        ["ratio"] = deadlockRatio
                    }
                });
            }
        }
        catch (Exception ex)
        {
            Logger.Error($"[SqlServerAnomalyDetector] Blocking anomaly detection failed: {ex.Message}");
        }
    }

    /// <summary>
    /// Detects I/O latency anomalies -- significant increase in read/write latency
    /// compared to baseline (sigma test against baseline mean/stddev).
    /// Absolute floors: reads must exceed 10ms, writes 5ms, to matter.
    /// </summary>
    private async Task DetectIoAnomalies(AnalysisContext context,
        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
    {
        try
        {
            using var connection = new SqlConnection(_connectionString);
            await connection.OpenAsync();

            using var cmd = connection.CreateCommand();
            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH baseline AS (
    SELECT
        AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS avg_read_lat,
        AVG(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS avg_write_lat,
        STDEV(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS stddev_read,
        STDEV(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS stddev_write,
        COUNT(*) AS samples
    FROM collect.file_io_stats
    WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd
    AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0)
),
current_window AS (
    SELECT
        AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS avg_read_lat,
        AVG(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS avg_write_lat
    FROM collect.file_io_stats
    WHERE collection_time >= @windowStart AND collection_time <= @windowEnd
    AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0)
)
SELECT b.avg_read_lat, b.stddev_read, c.avg_read_lat,
       b.avg_write_lat, b.stddev_write, c.avg_write_lat,
       b.samples
FROM baseline b, current_window c;";

            cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
            cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
            cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
            cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));

            using var reader = await cmd.ExecuteReaderAsync();
            if (!await reader.ReadAsync()) return;

            var baselineReadLat = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0));
            var stddevRead = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1));
            var currentReadLat = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2));
            var baselineWriteLat = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3));
            var stddevWrite = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4));
            var currentWriteLat = reader.IsDBNull(5) ? 0.0 : Convert.ToDouble(reader.GetValue(5));
            var samples = reader.IsDBNull(6) ? 0L : Convert.ToInt64(reader.GetValue(6));

            if (samples < 3) return;

            // Read latency anomaly
            if (stddevRead > 0 && currentReadLat > 10) // At least 10ms to matter
            {
                var readDeviation = (currentReadLat - baselineReadLat) / stddevRead;
                if (readDeviation >= DeviationThreshold)
                {
                    anomalies.Add(new Fact
                    {
                        Source = "anomaly",
                        Key = "ANOMALY_READ_LATENCY",
                        Value = currentReadLat,
                        ServerId = context.ServerId,
                        Metadata = new Dictionary<string, double>
                        {
                            ["current_latency_ms"] = currentReadLat,
                            ["baseline_mean_ms"] = baselineReadLat,
                            ["baseline_stddev_ms"] = stddevRead,
                            ["deviation_sigma"] = readDeviation,
                            ["baseline_samples"] = samples
                        }
                    });
                }
            }

            // Write latency anomaly
            if (stddevWrite > 0 && currentWriteLat > 5) // At least 5ms to matter
            {
                var writeDeviation = (currentWriteLat - baselineWriteLat) / stddevWrite;
                if (writeDeviation >= DeviationThreshold)
                {
                    anomalies.Add(new Fact
                    {
                        Source = "anomaly",
                        Key = "ANOMALY_WRITE_LATENCY",
                        Value = currentWriteLat,
                        ServerId = context.ServerId,
                        Metadata = new Dictionary<string, double>
                        {
                            ["current_latency_ms"] = currentWriteLat,
                            ["baseline_mean_ms"] = baselineWriteLat,
                            ["baseline_stddev_ms"] = stddevWrite,
                            ["deviation_sigma"] = writeDeviation,
                            ["baseline_samples"] = samples
                        }
                    });
                }
            }
        }
        catch (Exception ex)
        {
            Logger.Error($"[SqlServerAnomalyDetector] I/O anomaly detection failed: {ex.Message}");
        }
    }
}
diff --git a/Dashboard/Analysis/SqlServerDrillDownCollector.cs b/Dashboard/Analysis/SqlServerDrillDownCollector.cs
new file mode 100644
index 0000000..4010269
--- /dev/null
+++ b/Dashboard/Analysis/SqlServerDrillDownCollector.cs
@@ -0,0 +1,773 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.Data.SqlClient;
using PerformanceMonitorDashboard.Helpers;
using PerformanceMonitorDashboard.Mcp;
using PerformanceMonitorDashboard.Models;
using PerformanceMonitorDashboard.Services;

namespace PerformanceMonitorDashboard.Analysis;

/// <summary>
/// Enriches findings with drill-down data from SQL Server.
/// Runs after graph traversal, only for findings above severity threshold.
/// Each drill-down query is limited to top N results with truncated text.
///
/// This makes analyze_server self-sufficient -- instead of returning a list
/// of "next tools to call," findings include the actual supporting data.
///
/// Port of Lite's DrillDownCollector -- uses SQL Server collect.* tables instead of DuckDB views.
/// No server_id filtering -- Dashboard monitors one server per database.
/// </summary>
public class SqlServerDrillDownCollector
{
    private readonly string _connectionString;
    private readonly IPlanFetcher? _planFetcher;

    // Maximum characters of SQL text returned per row in drill-down payloads.
    private const int TextLimit = 500;

    public SqlServerDrillDownCollector(string connectionString, IPlanFetcher? planFetcher = null)
    {
        _connectionString = connectionString;
        _planFetcher = planFetcher;
    }

    /// <summary>
    /// Enriches each finding's DrillDown dictionary based on its story path.
    /// Findings below severity 0.5 are skipped. On error, whatever was collected
    /// before the failure is kept; an empty dictionary is normalized to null
    /// (same contract as the success path).
    /// </summary>
    public async Task EnrichFindingsAsync(List<AnalysisFinding> findings, AnalysisContext context)
    {
        foreach (var finding in findings)
        {
            if (finding.Severity < 0.5) continue;

            try
            {
                finding.DrillDown = new Dictionary<string, object>();
                var pathKeys = finding.StoryPath.Split(" -> ", StringSplitOptions.RemoveEmptyEntries).ToHashSet();

                if (pathKeys.Contains("DEADLOCKS"))
                    await CollectTopDeadlocks(finding, context);

                if (pathKeys.Contains("BLOCKING_EVENTS"))
                    await CollectTopBlockingChains(finding, context);

                if (pathKeys.Contains("CPU_SPIKE"))
                    await CollectQueriesAtSpike(finding, context);

                if (pathKeys.Contains("CPU_SQL_PERCENT") || pathKeys.Contains("CPU_SPIKE"))
                    await CollectTopCpuQueries(finding, context);

                if (pathKeys.Contains("QUERY_SPILLS"))
                    await CollectTopSpillingQueries(finding, context);

                if (pathKeys.Contains("IO_READ_LATENCY_MS") || pathKeys.Contains("IO_WRITE_LATENCY_MS"))
                    await CollectFileLatencyBreakdown(finding, context);

                if (pathKeys.Contains("LCK") || pathKeys.Contains("LCK_M_S") || pathKeys.Contains("LCK_M_IS"))
                    await CollectLockModeBreakdown(finding, context);

                if (pathKeys.Contains("DB_CONFIG"))
                    await CollectConfigIssues(finding, context);

                if (pathKeys.Contains("TEMPDB_USAGE"))
                    await CollectTempDbBreakdown(finding, context);

                if (pathKeys.Contains("MEMORY_GRANT_PENDING"))
                    await CollectPendingGrants(finding, context);

                if (pathKeys.Any(k => k.StartsWith("BAD_ACTOR_")))
                    await CollectBadActorDetail(finding, context);

                // Plan analysis: for findings with top queries, analyze their cached plans
                await CollectPlanAnalysis(finding, context);

                // Remove empty drill-down dictionaries
                if (finding.DrillDown.Count == 0)
                    finding.DrillDown = null;
            }
            catch (Exception ex)
            {
                Logger.Error(
                    $"[SqlServerDrillDownCollector] Drill-down failed for {finding.StoryPath}: {ex.GetType().Name}: {ex.Message}");
                // Keep whatever was collected before the error -- but don't leave an
                // empty dictionary behind. Fixed: previously an early failure left
                // DrillDown as an empty non-null dict, violating the "empty => null"
                // contract enforced on the success path.
                if (finding.DrillDown is { Count: 0 })
                    finding.DrillDown = null;
            }
        }
    }

    /// <summary>
    /// Collects the 3 most recent deadlocks in the analysis window
    /// (victim spid + truncated victim SQL) into "top_deadlocks".
    /// </summary>
    private async Task CollectTopDeadlocks(AnalysisFinding finding, AnalysisContext context)
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 3
    collection_time,
    event_date,
    spid,
    LEFT(CAST(query AS NVARCHAR(MAX)), 500) AS victim_sql
FROM collect.deadlocks
WHERE collection_time >= @startTime AND collection_time <= @endTime
ORDER BY collection_time DESC;";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        var items = new List<object>();
        using var reader = await cmd.ExecuteReaderAsync();
        while (await reader.ReadAsync())
        {
            items.Add(new
            {
                time = reader.IsDBNull(0) ? "" : reader.GetDateTime(0).ToString("o"),
                deadlock_time = reader.IsDBNull(1) ? "" : reader.GetDateTime(1).ToString("o"),
                victim = reader.IsDBNull(2) ? "" : reader.GetValue(2).ToString(),
                victim_sql = reader.IsDBNull(3) ? "" : reader.GetString(3)
            });
        }

        if (items.Count > 0)
            finding.DrillDown!["top_deadlocks"] = items;
    }

    /// <summary>
    /// Collects the 5 longest-waiting blocking events in the window into
    /// "top_blocking_chains". blocking_spid is not available in the blocked
    /// process report table and is reported as 0.
    /// </summary>
    private async Task CollectTopBlockingChains(AnalysisFinding finding, AnalysisContext context)
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 5
    collection_time,
    database_name,
    spid AS blocked_spid,
    0 AS blocking_spid,
    wait_time_ms,
    lock_mode,
    LEFT(CAST(query_text AS NVARCHAR(MAX)), 500) AS blocked_sql,
    LEFT(blocking_tree, 500) AS blocking_sql
FROM collect.blocking_BlockedProcessReport
WHERE collection_time >= @startTime AND collection_time <= @endTime
ORDER BY wait_time_ms DESC;";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        var items = new List<object>();
        using var reader = await cmd.ExecuteReaderAsync();
        while (await reader.ReadAsync())
        {
            items.Add(new
            {
                time = reader.IsDBNull(0) ? "" : reader.GetDateTime(0).ToString("o"),
                database = reader.IsDBNull(1) ? "" : reader.GetString(1),
                blocked_spid = reader.IsDBNull(2) ? 0 : Convert.ToInt32(reader.GetValue(2)),
                blocking_spid = reader.IsDBNull(3) ? 0 : Convert.ToInt32(reader.GetValue(3)),
                wait_time_ms = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)),
                lock_mode = reader.IsDBNull(5) ? "" : reader.GetString(5),
                blocked_sql = reader.IsDBNull(6) ? "" : reader.GetString(6),
                blocking_sql = reader.IsDBNull(7) ? "" : reader.GetString(7)
            });
        }

        if (items.Count > 0)
            finding.DrillDown!["top_blocking_chains"] = items;
    }

    /// <summary>
    /// Finds the CPU peak in the window, then collects the top 5 queries active
    /// within +/- 2 minutes of it ("spike_peak" + "queries_at_spike").
    /// Skips silently when collect.query_snapshots doesn't exist (the table is
    /// created dynamically by sp_WhoIsActive).
    /// </summary>
    private async Task CollectQueriesAtSpike(AnalysisFinding finding, AnalysisContext context)
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        // Check if query_snapshots table exists (created dynamically by sp_WhoIsActive)
        using var checkCmd = connection.CreateCommand();
        checkCmd.CommandText = "SELECT OBJECT_ID(N'collect.query_snapshots', N'U')";
        var tableExists = await checkCmd.ExecuteScalarAsync();
        if (tableExists == null || tableExists == DBNull.Value) return;

        // Step 1: Find when the spike occurred
        using var peakCmd = connection.CreateCommand();
        peakCmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 1 collection_time, sqlserver_cpu_utilization
FROM collect.cpu_utilization_stats
WHERE collection_time >= @startTime AND collection_time <= @endTime
ORDER BY sqlserver_cpu_utilization DESC;";

        peakCmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        peakCmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        DateTime? peakTime = null;
        int peakCpu = 0;
        using (var peakReader = await peakCmd.ExecuteReaderAsync())
        {
            if (await peakReader.ReadAsync())
            {
                peakTime = peakReader.GetDateTime(0);
                peakCpu = peakReader.GetInt32(1);
            }
        }

        if (peakTime == null) return;

        // Step 2: Get queries active within 2 minutes of peak
        using var queryCmd = connection.CreateCommand();
        queryCmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 5
    collection_time,
    [session_id],
    [database_name],
    [status],
    DATEDIFF(MILLISECOND, 0, [CPU]) AS cpu_time_ms,
    DATEDIFF(MILLISECOND, 0, [elapsed_time]) AS total_elapsed_time_ms,
    [reads] AS logical_reads,
    [wait_info] AS wait_type,
    0 AS dop,
    0 AS parallel_worker_count,
    LEFT(CAST([sql_text] AS NVARCHAR(MAX)), 500) AS query_text
FROM collect.query_snapshots
WHERE collection_time >= @spikeStart
AND collection_time <= @spikeEnd
AND CAST([sql_text] AS NVARCHAR(MAX)) NOT LIKE 'WAITFOR%'
ORDER BY DATEDIFF(MILLISECOND, 0, [CPU]) DESC;";

        queryCmd.Parameters.Add(new SqlParameter("@spikeStart", peakTime.Value.AddMinutes(-2)));
        queryCmd.Parameters.Add(new SqlParameter("@spikeEnd", peakTime.Value.AddMinutes(2)));

        var items = new List<object>();
        using (var reader = await queryCmd.ExecuteReaderAsync())
        {
            while (await reader.ReadAsync())
            {
                items.Add(new
                {
                    time = reader.IsDBNull(0) ? "" : reader.GetDateTime(0).ToString("o"),
                    session_id = reader.IsDBNull(1) ? 0 : Convert.ToInt32(reader.GetValue(1)),
                    database = reader.IsDBNull(2) ? "" : reader.GetString(2),
                    status = reader.IsDBNull(3) ? "" : reader.GetString(3),
                    cpu_time_ms = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)),
                    elapsed_time_ms = reader.IsDBNull(5) ? 0L : Convert.ToInt64(reader.GetValue(5)),
                    logical_reads = reader.IsDBNull(6) ? 0L : Convert.ToInt64(reader.GetValue(6)),
                    wait_type = reader.IsDBNull(7) ? "" : reader.GetString(7),
                    dop = reader.IsDBNull(8) ? 0 : Convert.ToInt32(reader.GetValue(8)),
                    parallel_workers = reader.IsDBNull(9) ? 0 : Convert.ToInt32(reader.GetValue(9)),
                    query_text = reader.IsDBNull(10) ? "" : reader.GetString(10)
                });
            }
        }

        if (items.Count > 0)
        {
            finding.DrillDown!["spike_peak"] = new
            {
                time = peakTime.Value.ToString("o"),
                cpu_percent = peakCpu
            };
            finding.DrillDown!["queries_at_spike"] = items;
        }
    }

    /// <summary>
    /// Collects the top 5 CPU-consuming query hashes in the window into
    /// "top_cpu_queries" (skipped if already populated by an earlier collector).
    /// total_worker_time deltas are microseconds; reported as milliseconds.
    /// </summary>
    private async Task CollectTopCpuQueries(AnalysisFinding finding, AnalysisContext context)
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 5
    database_name,
    CONVERT(VARCHAR(18), query_hash, 1) AS query_hash,
    CAST(SUM(total_worker_time_delta) AS BIGINT) AS total_cpu_us,
    CAST(SUM(execution_count_delta) AS BIGINT) AS exec_count,
    MAX(max_dop) AS max_dop,
    CAST(SUM(total_spills) AS BIGINT) AS spills,
    LEFT(CAST(DECOMPRESS(MAX(query_text)) AS NVARCHAR(MAX)), 500) AS query_text
FROM collect.query_stats
WHERE collection_time >= @startTime AND collection_time <= @endTime
AND total_worker_time_delta > 0
GROUP BY database_name, query_hash
ORDER BY CAST(SUM(total_worker_time_delta) AS BIGINT) DESC;";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        var items = new List<object>();
        using var reader = await cmd.ExecuteReaderAsync();
        while (await reader.ReadAsync())
        {
            items.Add(new
            {
                database = reader.IsDBNull(0) ? "" : reader.GetString(0),
                query_hash = reader.IsDBNull(1) ? "" : reader.GetString(1),
                // Convert accumulated microseconds to milliseconds for the payload.
                total_cpu_ms = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)) / 1000.0,
                execution_count = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)),
                max_dop = reader.IsDBNull(4) ? 0 : Convert.ToInt32(reader.GetValue(4)),
                spills = reader.IsDBNull(5) ? 0L : Convert.ToInt64(reader.GetValue(5)),
                query_text = reader.IsDBNull(6) ? "" : reader.GetString(6)
            });
        }

        if (items.Count > 0 && !finding.DrillDown!.ContainsKey("top_cpu_queries"))
            finding.DrillDown!["top_cpu_queries"] = items;
    }

    /// <summary>
    /// Collects the top 5 spilling query hashes in the window into "top_spilling_queries".
    /// </summary>
    private async Task CollectTopSpillingQueries(AnalysisFinding finding, AnalysisContext context)
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 5
    database_name,
    CONVERT(VARCHAR(18), query_hash, 1) AS query_hash,
    CAST(SUM(total_spills) AS BIGINT) AS total_spills,
    CAST(SUM(execution_count_delta) AS BIGINT) AS exec_count,
    LEFT(CAST(DECOMPRESS(MAX(query_text)) AS NVARCHAR(MAX)), 500) AS query_text
FROM collect.query_stats
WHERE collection_time >= @startTime AND collection_time <= @endTime
AND total_spills > 0
GROUP BY database_name, query_hash
ORDER BY CAST(SUM(total_spills) AS BIGINT) DESC;";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        var items = new List<object>();
        using var reader = await cmd.ExecuteReaderAsync();
        while (await reader.ReadAsync())
        {
            items.Add(new
            {
                database = reader.IsDBNull(0) ? "" : reader.GetString(0),
                query_hash = reader.IsDBNull(1) ? "" : reader.GetString(1),
                total_spills = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)),
                execution_count = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)),
                query_text = reader.IsDBNull(4) ? "" : reader.GetString(4)
            });
        }

        if (items.Count > 0)
            finding.DrillDown!["top_spilling_queries"] = items;
    }

    /// <summary>
    /// Collects per-database/per-file-type average read/write latency in the
    /// window into "file_latency_breakdown", worst read latency first.
    /// </summary>
    private async Task CollectFileLatencyBreakdown(AnalysisFinding finding, AnalysisContext context)
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 10
    database_name,
    file_type_desc AS file_type,
    AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS avg_read_ms,
    AVG(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS avg_write_ms,
    CAST(SUM(num_of_reads_delta) AS BIGINT) AS total_reads,
    CAST(SUM(num_of_writes_delta) AS BIGINT) AS total_writes
FROM collect.file_io_stats
WHERE collection_time >= @startTime AND collection_time <= @endTime
AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0)
GROUP BY database_name, file_type_desc
ORDER BY AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) DESC;";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        var items = new List<object>();
        using var reader = await cmd.ExecuteReaderAsync();
        while (await reader.ReadAsync())
        {
            items.Add(new
            {
                database = reader.IsDBNull(0) ? "" : reader.GetString(0),
                file_type = reader.IsDBNull(1) ? "" : reader.GetString(1),
                avg_read_latency_ms = reader.IsDBNull(2) ? 0.0 : Math.Round(Convert.ToDouble(reader.GetValue(2)), 2),
                avg_write_latency_ms = reader.IsDBNull(3) ? 0.0 : Math.Round(Convert.ToDouble(reader.GetValue(3)), 2),
                total_reads = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)),
                total_writes = reader.IsDBNull(5) ? 0L : Convert.ToInt64(reader.GetValue(5))
            });
        }

        if (items.Count > 0)
            finding.DrillDown!["file_latency_breakdown"] = items;
    }

    /// <summary>
    /// Collects the top 10 LCK% wait types by total wait time in the window
    /// into "lock_mode_breakdown".
    /// </summary>
    private async Task CollectLockModeBreakdown(AnalysisFinding finding, AnalysisContext context)
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 10
    wait_type,
    CAST(SUM(wait_time_ms_delta) AS BIGINT) AS total_wait_ms,
    CAST(SUM(waiting_tasks_count_delta) AS BIGINT) AS total_count
FROM collect.wait_stats
WHERE collection_time >= @startTime AND collection_time <= @endTime
AND wait_type LIKE 'LCK%'
AND wait_time_ms_delta > 0
GROUP BY wait_type
ORDER BY CAST(SUM(wait_time_ms_delta) AS BIGINT) DESC;";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        var items = new List<object>();
        using var reader = await cmd.ExecuteReaderAsync();
        while (await reader.ReadAsync())
        {
            items.Add(new
            {
                lock_type = reader.IsDBNull(0) ? "" : reader.GetString(0),
                total_wait_ms = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)),
                waiting_tasks = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2))
            });
        }

        if (items.Count > 0)
            finding.DrillDown!["lock_mode_breakdown"] = items;
    }

    /// <summary>
    /// Collects database configuration issues (auto-shrink/auto-close on,
    /// RCSI off, page verify not CHECKSUM) from the latest configuration snapshot.
    /// </summary>
    private async Task CollectConfigIssues(AnalysisFinding finding, AnalysisContext context)
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        // The Dashboard uses config.database_configuration_history which stores
        // settings as rows (setting_type, setting_name, setting_value) not columns.
        // Pivot the latest snapshot into the format we need.
+ using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH latest AS ( + SELECT database_name, setting_name, + CAST(setting_value AS NVARCHAR(256)) AS setting_value, + ROW_NUMBER() OVER (PARTITION BY database_name, setting_name ORDER BY collection_time DESC) AS rn + FROM config.database_configuration_history + WHERE setting_name IN ( + 'recovery_model_desc', 'is_auto_shrink_on', 'is_auto_close_on', + 'is_read_committed_snapshot_on', 'page_verify_option_desc', 'is_query_store_on' + ) +), +pivoted AS ( + SELECT + database_name, + MAX(CASE WHEN setting_name = 'recovery_model_desc' THEN setting_value END) AS recovery_model, + MAX(CASE WHEN setting_name = 'is_auto_shrink_on' THEN setting_value END) AS is_auto_shrink_on, + MAX(CASE WHEN setting_name = 'is_auto_close_on' THEN setting_value END) AS is_auto_close_on, + MAX(CASE WHEN setting_name = 'is_read_committed_snapshot_on' THEN setting_value END) AS is_rcsi_on, + MAX(CASE WHEN setting_name = 'page_verify_option_desc' THEN setting_value END) AS page_verify_option, + MAX(CASE WHEN setting_name = 'is_query_store_on' THEN setting_value END) AS is_query_store_on + FROM latest + WHERE rn = 1 + GROUP BY database_name +) +SELECT database_name, recovery_model, + is_auto_shrink_on, is_auto_close_on, + is_rcsi_on, page_verify_option, is_query_store_on +FROM pivoted +WHERE is_auto_shrink_on = '1' OR is_auto_close_on = '1' + OR is_rcsi_on = '0' OR page_verify_option != 'CHECKSUM' +ORDER BY database_name;"; + + var items = new List(); + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var issues = new List(); + var autoShrink = reader.IsDBNull(2) ? "" : reader.GetString(2); + var autoClose = reader.IsDBNull(3) ? "" : reader.GetString(3); + var rcsi = reader.IsDBNull(4) ? "" : reader.GetString(4); + var pageVerify = reader.IsDBNull(5) ? "" : reader.GetString(5); + var queryStore = reader.IsDBNull(6) ? 
"" : reader.GetString(6); + + if (autoShrink == "1") issues.Add("auto_shrink ON"); + if (autoClose == "1") issues.Add("auto_close ON"); + if (rcsi == "0") issues.Add("RCSI OFF"); + if (!string.IsNullOrEmpty(pageVerify) && pageVerify != "CHECKSUM") issues.Add($"page_verify={pageVerify}"); + + items.Add(new + { + database = reader.IsDBNull(0) ? "" : reader.GetString(0), + recovery_model = reader.IsDBNull(1) ? "" : reader.GetString(1), + rcsi = rcsi == "1", + query_store = queryStore == "1", + issues + }); + } + + if (items.Count > 0) + finding.DrillDown!["config_issues"] = items; + } + + private async Task CollectTempDbBreakdown(AnalysisFinding finding, AnalysisContext context) + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 5 + collection_time, + user_object_reserved_mb, + internal_object_reserved_mb, + version_store_reserved_mb, + unallocated_mb +FROM collect.tempdb_stats +WHERE collection_time >= @startTime AND collection_time <= @endTime +ORDER BY (user_object_reserved_mb + internal_object_reserved_mb + version_store_reserved_mb) DESC;"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + var items = new List(); + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + items.Add(new + { + time = reader.GetDateTime(0).ToString("o"), + user_objects_mb = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)), + internal_objects_mb = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)), + version_store_mb = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)), + unallocated_mb = reader.IsDBNull(4) ? 
0.0 : Convert.ToDouble(reader.GetValue(4)) + }); + } + + if (items.Count > 0) + finding.DrillDown!["tempdb_breakdown"] = items; + } + + private async Task CollectPendingGrants(AnalysisFinding finding, AnalysisContext context) + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 5 + collection_time, + target_memory_mb, total_memory_mb, available_memory_mb, + granted_memory_mb, used_memory_mb, + grantee_count, waiter_count, + timeout_error_count_delta, forced_grant_count_delta +FROM collect.memory_grant_stats +WHERE collection_time >= @startTime AND collection_time <= @endTime +AND waiter_count > 0 +ORDER BY waiter_count DESC;"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + var items = new List(); + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + items.Add(new + { + time = reader.IsDBNull(0) ? "" : reader.GetDateTime(0).ToString("o"), + target_memory_mb = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)), + total_memory_mb = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)), + available_memory_mb = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)), + granted_memory_mb = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4)), + used_memory_mb = reader.IsDBNull(5) ? 0.0 : Convert.ToDouble(reader.GetValue(5)), + grantee_count = reader.IsDBNull(6) ? 0 : reader.GetInt32(6), + waiter_count = reader.IsDBNull(7) ? 0 : reader.GetInt32(7), + timeout_errors = reader.IsDBNull(8) ? 0L : Convert.ToInt64(reader.GetValue(8)), + forced_grants = reader.IsDBNull(9) ? 
0L : Convert.ToInt64(reader.GetValue(9)) + }); + } + + if (items.Count > 0) + finding.DrillDown!["pending_grants"] = items; + } + + /// + /// For findings that have query hashes (bad actors), fetch the execution plan + /// live from SQL Server via IPlanFetcher, then run PlanAnalyzer to surface + /// warnings and missing indexes. No plan storage needed -- fetch on demand + /// only for queries that make it into high-impact findings. + /// + private async Task CollectPlanAnalysis(AnalysisFinding finding, AnalysisContext context) + { + if (finding.DrillDown == null || _planFetcher == null) return; + + // Only analyze plans for bad actor findings (1 plan each). + // Skip top_cpu_queries (5 plans would be too heavy). + if (!finding.RootFactKey.StartsWith("BAD_ACTOR_")) return; + + var queryHash = finding.RootFactKey.Replace("BAD_ACTOR_", ""); + if (string.IsNullOrEmpty(queryHash)) return; + + // Look up plan_handle from collect.query_stats for this query_hash + string? planHandle = null; + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 1 CONVERT(VARCHAR(130), plan_handle, 1) AS plan_handle +FROM collect.query_stats +WHERE query_hash = CONVERT(BINARY(8), @queryHash, 1) +AND plan_handle IS NOT NULL +ORDER BY collection_time DESC;"; + + cmd.Parameters.Add(new SqlParameter("@queryHash", queryHash)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (await reader.ReadAsync() && !reader.IsDBNull(0)) + planHandle = reader.GetString(0); + } + catch { return; } + + if (string.IsNullOrEmpty(planHandle)) return; + + // Fetch plan XML live from SQL Server + var planXml = await _planFetcher.FetchPlanXmlAsync(context.ServerId, planHandle); + if (string.IsNullOrEmpty(planXml)) return; + + try + { + var plan = ShowPlanParser.Parse(planXml); + PlanAnalyzer.Analyze(plan); + + var 
allWarnings = plan.Batches + .SelectMany(b => b.Statements) + .Where(s => s.RootNode != null) + .SelectMany(s => + { + var nodeWarnings = new List(); + CollectPlanNodes(s.RootNode!, nodeWarnings); + return s.PlanWarnings + .Concat(nodeWarnings.SelectMany(n => n.Warnings)); + }) + .ToList(); + + var missingIndexes = plan.AllMissingIndexes; + + if (allWarnings.Count == 0 && missingIndexes.Count == 0) return; + + finding.DrillDown["plan_analysis"] = new + { + query_hash = queryHash, + warning_count = allWarnings.Count, + critical_count = allWarnings.Count(w => w.Severity == PlanWarningSeverity.Critical), + warnings = allWarnings + .OrderByDescending(w => w.Severity) + .Take(10) + .Select(w => new + { + severity = w.Severity.ToString(), + type = w.WarningType, + message = McpHelpers.Truncate(w.Message, 300) + }), + missing_indexes = missingIndexes.Take(5).Select(idx => new + { + table = $"{idx.Schema}.{idx.Table}", + impact = idx.Impact, + create_statement = idx.CreateStatement + }) + }; + } + catch + { + // Plan parsing can fail on malformed XML -- skip silently + } + } + + private static void CollectPlanNodes(PlanNode node, List nodes) + { + nodes.Add(node); + foreach (var child in node.Children) + CollectPlanNodes(child, nodes); + } + + private async Task CollectBadActorDetail(AnalysisFinding finding, AnalysisContext context) + { + // Extract query_hash from the fact key (BAD_ACTOR_0x...) 
+ var queryHash = finding.RootFactKey.Replace("BAD_ACTOR_", ""); + if (string.IsNullOrEmpty(queryHash)) return; + + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + database_name, + CONVERT(VARCHAR(18), query_hash, 1) AS query_hash, + LEFT(CAST(DECOMPRESS(MAX(query_text)) AS NVARCHAR(MAX)), 500) AS query_text, + CAST(SUM(execution_count_delta) AS BIGINT) AS exec_count, + CASE WHEN SUM(execution_count_delta) > 0 + THEN CAST(SUM(total_worker_time_delta) AS FLOAT) / SUM(execution_count_delta) / 1000.0 + ELSE 0 END AS avg_cpu_ms, + CASE WHEN SUM(execution_count_delta) > 0 + THEN CAST(SUM(total_elapsed_time_delta) AS FLOAT) / SUM(execution_count_delta) / 1000.0 + ELSE 0 END AS avg_elapsed_ms, + CASE WHEN SUM(execution_count_delta) > 0 + THEN CAST(SUM(total_logical_reads_delta) AS FLOAT) / SUM(execution_count_delta) + ELSE 0 END AS avg_reads, + CAST(SUM(total_worker_time_delta) AS BIGINT) AS total_cpu_us, + CAST(SUM(total_logical_reads_delta) AS BIGINT) AS total_reads, + CAST(SUM(total_spills) AS BIGINT) AS total_spills, + MAX(max_dop) AS max_dop +FROM collect.query_stats +WHERE collection_time >= @startTime +AND collection_time <= @endTime +AND query_hash = CONVERT(BINARY(8), @queryHash, 1) +GROUP BY database_name, query_hash;"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + cmd.Parameters.Add(new SqlParameter("@queryHash", queryHash)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (await reader.ReadAsync()) + { + finding.DrillDown!["bad_actor_query"] = new + { + database = reader.IsDBNull(0) ? "" : reader.GetString(0), + query_hash = reader.IsDBNull(1) ? "" : reader.GetString(1), + query_text = reader.IsDBNull(2) ? 
"" : reader.GetString(2), + execution_count = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)), + avg_cpu_ms = reader.IsDBNull(4) ? 0.0 : Math.Round(Convert.ToDouble(reader.GetValue(4)), 2), + avg_elapsed_ms = reader.IsDBNull(5) ? 0.0 : Math.Round(Convert.ToDouble(reader.GetValue(5)), 2), + avg_reads = reader.IsDBNull(6) ? 0.0 : Math.Round(Convert.ToDouble(reader.GetValue(6)), 0), + total_cpu_ms = reader.IsDBNull(7) ? 0.0 : Convert.ToDouble(reader.GetValue(7)) / 1000.0, + total_reads = reader.IsDBNull(8) ? 0L : Convert.ToInt64(reader.GetValue(8)), + total_spills = reader.IsDBNull(9) ? 0L : Convert.ToInt64(reader.GetValue(9)), + max_dop = reader.IsDBNull(10) ? 0 : Convert.ToInt32(reader.GetValue(10)) + }; + } + } +} diff --git a/Dashboard/Analysis/SqlServerFactCollector.cs b/Dashboard/Analysis/SqlServerFactCollector.cs new file mode 100644 index 0000000..a99d9aa --- /dev/null +++ b/Dashboard/Analysis/SqlServerFactCollector.cs @@ -0,0 +1,1687 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.Data.SqlClient; +using PerformanceMonitorDashboard.Helpers; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Collects facts from SQL Server for the Dashboard analysis engine. +/// Each fact category has its own collection method, added incrementally. +/// Port of DuckDbFactCollector from Lite — queries collect.* tables instead of DuckDB views. 
+/// +public class SqlServerFactCollector : IFactCollector +{ + private readonly string _connectionString; + + public SqlServerFactCollector(string connectionString) + { + _connectionString = connectionString; + } + + public async Task> CollectFactsAsync(AnalysisContext context) + { + var facts = new List(); + + await CollectWaitStatsFactsAsync(context, facts); + GroupGeneralLockWaits(facts, context); + GroupParallelismWaits(facts, context); + await CollectBlockingFactsAsync(context, facts); + await CollectDeadlockFactsAsync(context, facts); + await CollectServerConfigFactsAsync(context, facts); + await CollectMemoryFactsAsync(context, facts); + await CollectDatabaseSizeFactAsync(context, facts); + await CollectServerMetadataFactsAsync(context, facts); + await CollectCpuUtilizationFactsAsync(context, facts); + await CollectIoLatencyFactsAsync(context, facts); + await CollectTempDbFactsAsync(context, facts); + await CollectMemoryGrantFactsAsync(context, facts); + await CollectQueryStatsFactsAsync(context, facts); + await CollectBadActorFactsAsync(context, facts); + await CollectPerfmonFactsAsync(context, facts); + await CollectMemoryClerkFactsAsync(context, facts); + await CollectDatabaseConfigFactsAsync(context, facts); + await CollectProcedureStatsFactsAsync(context, facts); + await CollectActiveQueryFactsAsync(context, facts); + await CollectRunningJobFactsAsync(context, facts); + await CollectSessionFactsAsync(context, facts); + await CollectTraceFlagFactsAsync(context, facts); + await CollectServerPropertiesFactsAsync(context, facts); + await CollectDiskSpaceFactsAsync(context, facts); + + return facts; + } + + /// + /// Collects wait stats facts — one Fact per significant wait type. + /// Value is wait_time_ms / period_duration_ms (fraction of examined period). 
+ /// + private async Task CollectWaitStatsFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var command = connection.CreateCommand(); + command.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + wait_type, + SUM(waiting_tasks_count_delta) AS total_waiting_tasks, + SUM(wait_time_ms_delta) AS total_wait_time_ms, + SUM(signal_wait_time_ms_delta) AS total_signal_wait_time_ms +FROM collect.wait_stats +WHERE collection_time >= @startTime +AND collection_time <= @endTime +AND wait_time_ms_delta > 0 +GROUP BY wait_type +ORDER BY SUM(wait_time_ms_delta) DESC"; + + command.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + command.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await command.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var waitType = reader.GetString(0); + var waitingTasks = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1)); + var waitTimeMs = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var signalWaitTimeMs = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)); + + if (waitTimeMs <= 0) continue; + + var fractionOfPeriod = waitTimeMs / context.PeriodDurationMs; + var avgMsPerWait = waitingTasks > 0 ? 
(double)waitTimeMs / waitingTasks : 0; + + facts.Add(new Fact + { + Source = "waits", + Key = waitType, + Value = fractionOfPeriod, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["wait_time_ms"] = waitTimeMs, + ["waiting_tasks_count"] = waitingTasks, + ["signal_wait_time_ms"] = signalWaitTimeMs, + ["resource_wait_time_ms"] = waitTimeMs - signalWaitTimeMs, + ["avg_ms_per_wait"] = avgMsPerWait, + ["period_duration_ms"] = context.PeriodDurationMs + } + }); + } + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectWaitStatsFactsAsync failed", ex); + } + } + + /// + /// Collects blocking facts from blocking_BlockedProcessReport. + /// Produces a single BLOCKING_EVENTS fact with event count, rate, and details. + /// Value is events per hour for threshold comparison. + /// + private async Task CollectBlockingFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var command = connection.CreateCommand(); + command.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + COUNT(*) AS event_count, + AVG(CAST(wait_time_ms AS FLOAT)) AS avg_wait_time_ms, + MAX(wait_time_ms) AS max_wait_time_ms, + COUNT(DISTINCT spid) AS distinct_head_blockers, + COUNT(CASE WHEN status = 'sleeping' THEN 1 END) AS sleeping_blocker_count +FROM collect.blocking_BlockedProcessReport +WHERE collection_time >= @startTime +AND collection_time <= @endTime"; + + command.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + command.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await command.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var eventCount = reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)); + if (eventCount <= 0) return; + + var avgWaitTimeMs = reader.IsDBNull(1) ? 
0.0 : Convert.ToDouble(reader.GetValue(1)); + var maxWaitTimeMs = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var distinctHeadBlockers = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)); + var sleepingBlockerCount = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)); + + var periodHours = context.PeriodDurationMs / 3_600_000.0; + var eventsPerHour = periodHours > 0 ? eventCount / periodHours : 0; + + facts.Add(new Fact + { + Source = "blocking", + Key = "BLOCKING_EVENTS", + Value = eventsPerHour, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["event_count"] = eventCount, + ["events_per_hour"] = eventsPerHour, + ["avg_wait_time_ms"] = avgWaitTimeMs, + ["max_wait_time_ms"] = maxWaitTimeMs, + ["distinct_head_blockers"] = distinctHeadBlockers, + ["sleeping_blocker_count"] = sleepingBlockerCount, + ["period_hours"] = periodHours + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectBlockingFactsAsync failed", ex); + } + } + + /// + /// Collects deadlock facts from the deadlocks table. + /// Produces a single DEADLOCKS fact with count and rate. + /// Value is deadlocks per hour for threshold comparison. + /// + private async Task CollectDeadlockFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var command = connection.CreateCommand(); + command.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT COUNT(*) AS deadlock_count +FROM collect.deadlocks +WHERE collection_time >= @startTime +AND collection_time <= @endTime"; + + command.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + command.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await command.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var deadlockCount = reader.IsDBNull(0) ? 
0L : Convert.ToInt64(reader.GetValue(0));
            if (deadlockCount <= 0) return;

            var periodHours = context.PeriodDurationMs / 3_600_000.0;
            var deadlocksPerHour = periodHours > 0 ? deadlockCount / periodHours : 0;

            facts.Add(new Fact
            {
                Source = "blocking",
                Key = "DEADLOCKS",
                Value = deadlocksPerHour,
                ServerId = context.ServerId,
                Metadata = new Dictionary<string, double>
                {
                    ["deadlock_count"] = deadlockCount,
                    ["deadlocks_per_hour"] = deadlocksPerHour,
                    ["period_hours"] = periodHours
                }
            });
        }
        catch (Exception ex)
        {
            Logger.Error("SqlServerFactCollector.CollectDeadlockFactsAsync failed", ex);
        }
    }

    /// <summary>
    /// Collects server configuration settings relevant to analysis.
    /// These become facts that amplifiers and the config audit tool can reference
    /// to make recommendations specific (e.g., "your CTFP is 50" vs "check CTFP").
    /// Emits one fact per setting: CONFIG_CTFP, CONFIG_MAXDOP, CONFIG_MAX_MEMORY_MB,
    /// CONFIG_MAX_WORKER_THREADS. Errors are logged and swallowed (analysis is best-effort).
    /// </summary>
    private async Task CollectServerConfigFactsAsync(AnalysisContext context, List<Fact> facts)
    {
        try
        {
            using var connection = new SqlConnection(_connectionString);
            await connection.OpenAsync();

            using var cmd = connection.CreateCommand();
            // BUG FIX: the previous query was "SELECT TOP 4 ... ORDER BY collection_time DESC"
            // over the history table. With more than one collection snapshot, the four most
            // recent rows can all belong to the same setting, yielding duplicate facts for one
            // config key and none for the others. Take the latest row per configuration_name
            // instead (same ROW_NUMBER "latest" pattern as CollectDatabaseSizeFactAsync).
            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH latest AS (
    SELECT
        configuration_name,
        CAST(value_in_use AS BIGINT) AS value_in_use,
        ROW_NUMBER() OVER (PARTITION BY configuration_name ORDER BY collection_time DESC) AS rn
    FROM config.server_configuration_history
    WHERE configuration_name IN (
        'cost threshold for parallelism',
        'max degree of parallelism',
        'max server memory (MB)',
        'max worker threads'
    )
)
SELECT configuration_name, value_in_use
FROM latest
WHERE rn = 1";

            using var reader = await cmd.ExecuteReaderAsync();
            while (await reader.ReadAsync())
            {
                var configName = reader.GetString(0);
                var value = Convert.ToDouble(reader.GetValue(1));

                // Map sp_configure names to stable fact keys; anything else is skipped.
                var factKey = configName switch
                {
                    "cost threshold for parallelism" => "CONFIG_CTFP",
                    "max degree of parallelism" => "CONFIG_MAXDOP",
                    "max server memory (MB)" => "CONFIG_MAX_MEMORY_MB",
                    "max worker threads" => "CONFIG_MAX_WORKER_THREADS",
                    _ => null
                };

                if (factKey == null) continue;

                facts.Add(new Fact
                {
                    Source = "config",
                    Key = factKey,
                    Value = value,
                    ServerId = context.ServerId,
                    Metadata = new Dictionary<string, double>
                    {
                        ["value_in_use"] = value
                    }
                });
            }
        }
        catch (Exception ex)
        {
            Logger.Error("SqlServerFactCollector.CollectServerConfigFactsAsync failed", ex);
        }
    }

    /// <summary>
    /// Collects memory stats: total physical RAM, buffer pool size, target memory.
    /// These facts enable edition-aware memory recommendations in the config audit.
    /// Uses the latest snapshot at or before the window end.
    /// </summary>
    private async Task CollectMemoryFactsAsync(AnalysisContext context, List<Fact> facts)
    {
        try
        {
            using var connection = new SqlConnection(_connectionString);
            await connection.OpenAsync();

            using var cmd = connection.CreateCommand();
            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 1
    total_physical_memory_mb,
    buffer_pool_mb,
    committed_target_memory_mb
FROM collect.memory_stats
WHERE collection_time <= @endTime
ORDER BY collection_time DESC";

            cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

            using var reader = await cmd.ExecuteReaderAsync();
            if (!await reader.ReadAsync()) return;

            var totalPhysical = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0));
            var bufferPool = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1));
            var targetMemory = reader.IsDBNull(2) ?
0.0 : Convert.ToDouble(reader.GetValue(2));

            if (totalPhysical > 0)
                facts.Add(new Fact { Source = "memory", Key = "MEMORY_TOTAL_PHYSICAL_MB", Value = totalPhysical, ServerId = context.ServerId });
            if (bufferPool > 0)
                facts.Add(new Fact { Source = "memory", Key = "MEMORY_BUFFER_POOL_MB", Value = bufferPool, ServerId = context.ServerId });
            if (targetMemory > 0)
                facts.Add(new Fact { Source = "memory", Key = "MEMORY_TARGET_MB", Value = targetMemory, ServerId = context.ServerId });
        }
        catch (Exception ex)
        {
            Logger.Error("SqlServerFactCollector.CollectMemoryFactsAsync failed", ex);
        }
    }

    /// <summary>
    /// Collects total database data size from file_io_stats.
    /// Sums the latest size_on_disk_bytes across all database files for the server
    /// (latest row per database/file as of the window end) and emits a single
    /// DATABASE_TOTAL_SIZE_MB fact when the total is positive.
    /// </summary>
    private async Task CollectDatabaseSizeFactAsync(AnalysisContext context, List<Fact> facts)
    {
        try
        {
            using var connection = new SqlConnection(_connectionString);
            await connection.OpenAsync();

            using var cmd = connection.CreateCommand();
            // ROW_NUMBER dedupes the history: one (database, file) row, most recent first.
            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH latest AS (
    SELECT
        database_name,
        file_name,
        size_on_disk_bytes,
        ROW_NUMBER() OVER (PARTITION BY database_name, file_name ORDER BY collection_time DESC) AS rn
    FROM collect.file_io_stats
    WHERE collection_time <= @endTime
    AND size_on_disk_bytes > 0
)
SELECT SUM(size_on_disk_bytes / 1048576.0) AS total_size_mb
FROM latest
WHERE rn = 1";

            cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

            using var reader = await cmd.ExecuteReaderAsync();
            if (!await reader.ReadAsync()) return;

            var totalSize = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0));
            if (totalSize > 0)
                facts.Add(new Fact { Source = "config", Key = "DATABASE_TOTAL_SIZE_MB", Value = totalSize, ServerId = context.ServerId });
        }
        catch (Exception ex)
        {
            Logger.Error("SqlServerFactCollector.CollectDatabaseSizeFactAsync failed", ex);
        }
    }

    /// <summary>
    /// Collects SQL Server edition and major version from the server_properties table.
    /// Emits SERVER_EDITION and SERVER_MAJOR_VERSION facts from the latest snapshot.
    /// </summary>
    private async Task CollectServerMetadataFactsAsync(AnalysisContext context, List<Fact> facts)
    {
        try
        {
            using var connection = new SqlConnection(_connectionString);
            await connection.OpenAsync();

            using var cmd = connection.CreateCommand();
            // ROBUSTNESS FIX: CHARINDEX('.') returns 0 when product_version contains no dot,
            // and LEFT(s, -1) then raises "Invalid length parameter", failing the whole query.
            // NULLIF(...,0) turns that edge case into NULL (fact simply skipped); behavior is
            // unchanged for normal 'major.minor.build.revision' values.
            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 1
    engine_edition,
    CAST(LEFT(product_version, NULLIF(CHARINDEX('.', product_version), 0) - 1) AS INT) AS major_version
FROM collect.server_properties
ORDER BY collection_time DESC";

            using var reader = await cmd.ExecuteReaderAsync();
            if (!await reader.ReadAsync()) return;

            var edition = reader.IsDBNull(0) ? 0 : Convert.ToInt32(reader.GetValue(0));
            var majorVersion = reader.IsDBNull(1) ? 0 : Convert.ToInt32(reader.GetValue(1));

            if (edition > 0)
                facts.Add(new Fact { Source = "config", Key = "SERVER_EDITION", Value = edition, ServerId = context.ServerId });
            if (majorVersion > 0)
                facts.Add(new Fact { Source = "config", Key = "SERVER_MAJOR_VERSION", Value = majorVersion, ServerId = context.ServerId });
        }
        catch (Exception ex)
        {
            Logger.Error("SqlServerFactCollector.CollectServerMetadataFactsAsync failed", ex);
        }
    }

    /// <summary>
    /// Collects CPU utilization: average and max SQL Server CPU % over the period.
    /// Value is average SQL CPU %. Corroborates SOS_SCHEDULER_YIELD.
/// 
private async Task CollectCpuUtilizationFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS avg_sql_cpu,
    MAX(sqlserver_cpu_utilization) AS max_sql_cpu,
    AVG(CAST(other_process_cpu_utilization AS FLOAT)) AS avg_other_cpu,
    MAX(other_process_cpu_utilization) AS max_other_cpu,
    COUNT(*) AS sample_count
FROM collect.cpu_utilization_stats
WHERE collection_time >= @startTime
AND collection_time <= @endTime";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        // Column accessors: NULL aggregates (empty time range) read as zero.
        double Dbl(int ordinal) => reader.IsDBNull(ordinal) ? 0.0 : Convert.ToDouble(reader.GetValue(ordinal));

        var sqlCpuAvg = Dbl(0);
        var sqlCpuMax = Dbl(1);
        var otherCpuAvg = Dbl(2);
        var otherCpuMax = Dbl(3);
        var samples = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4));

        // No collected samples in the window -> nothing to score.
        if (samples == 0) return;

        var averageFact = new Fact
        {
            Source = "cpu",
            Key = "CPU_SQL_PERCENT",
            Value = sqlCpuAvg,
            ServerId = context.ServerId,
            Metadata = new()
            {
                ["avg_sql_cpu"] = sqlCpuAvg,
                ["max_sql_cpu"] = sqlCpuMax,
                ["avg_other_cpu"] = otherCpuAvg,
                ["max_other_cpu"] = otherCpuMax,
                ["avg_total_cpu"] = sqlCpuAvg + otherCpuAvg,
                ["sample_count"] = samples
            }
        };
        facts.Add(averageFact);

        // Emit a CPU_SPIKE fact when max is high and significantly above average.
        // This catches bursty CPU events that average-based scoring misses entirely.
        // Requires max >= 80% AND at least 3x the average (or avg < 20% with max >= 80%).
        var isSpike = sqlCpuMax >= 80 && (sqlCpuAvg < 20 || sqlCpuMax / Math.Max(sqlCpuAvg, 1) >= 3);
        if (isSpike)
        {
            facts.Add(new Fact
            {
                Source = "cpu",
                Key = "CPU_SPIKE",
                Value = sqlCpuMax,
                ServerId = context.ServerId,
                // Same metadata instance as the average fact — identical audit trail.
                Metadata = averageFact.Metadata
            });
        }
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectCpuUtilizationFactsAsync failed", ex);
    }
}

/// 
/// Collects I/O latency from file_io_stats delta columns.
/// Computes average read and write latency across all database files.
/// 
private async Task CollectIoLatencyFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    SUM(io_stall_read_ms_delta) AS total_stall_read_ms,
    SUM(num_of_reads_delta) AS total_reads,
    SUM(io_stall_write_ms_delta) AS total_stall_write_ms,
    SUM(num_of_writes_delta) AS total_writes
FROM collect.file_io_stats
WHERE collection_time >= @startTime
AND collection_time <= @endTime
AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0)";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        long Lng(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));

        var stallReadMs = Lng(0);
        var reads = Lng(1);
        var stallWriteMs = Lng(2);
        var writes = Lng(3);

        // Read and write latency are emitted independently; either side may have no activity.
        if (reads > 0)
        {
            var readLatencyMs = (double)stallReadMs / reads;
            facts.Add(new Fact
            {
                Source = "io",
                Key = "IO_READ_LATENCY_MS",
                Value = readLatencyMs,
                ServerId = context.ServerId,
                Metadata = new()
                {
                    ["avg_read_latency_ms"] = readLatencyMs,
                    ["total_stall_read_ms"] = stallReadMs,
                    ["total_reads"] = reads
                }
            });
        }

        if (writes > 0)
        {
            var writeLatencyMs = (double)stallWriteMs / writes;
            facts.Add(new Fact
            {
                Source = "io",
                Key = "IO_WRITE_LATENCY_MS",
                Value = writeLatencyMs,
                ServerId = context.ServerId,
                Metadata = new()
                {
                    ["avg_write_latency_ms"] = writeLatencyMs,
                    ["total_stall_write_ms"] = stallWriteMs,
                    ["total_writes"] = writes
                }
            });
        }
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectIoLatencyFactsAsync failed", ex);
    }
}

/// 
/// Collects TempDB usage facts: max usage, version store size, and unallocated space.
/// Value is max total_reserved_mb over the period.
/// Dashboard uses computed columns (total_reserved_mb, etc.) from collect.tempdb_stats.
/// 
private async Task CollectTempDbFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    MAX(total_reserved_mb) AS max_total_reserved_mb,
    MAX(user_object_reserved_mb) AS max_user_object_mb,
    MAX(internal_object_reserved_mb) AS max_internal_object_mb,
    MAX(version_store_reserved_mb) AS max_version_store_mb,
    MIN(unallocated_mb) AS min_unallocated_mb,
    AVG(CAST(total_reserved_mb AS FLOAT)) AS avg_total_reserved_mb
FROM collect.tempdb_stats
WHERE collection_time >= @startTime
AND collection_time <= @endTime";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        double Dbl(int ordinal) => reader.IsDBNull(ordinal) ? 0.0 : Convert.ToDouble(reader.GetValue(ordinal));

        var peakReservedMb = Dbl(0);
        var peakUserObjMb = Dbl(1);
        var peakInternalObjMb = Dbl(2);
        var peakVersionStoreMb = Dbl(3);
        var lowestUnallocatedMb = Dbl(4);
        var meanReservedMb = Dbl(5);

        // No reserved space observed -> nothing worth reporting.
        if (peakReservedMb <= 0) return;

        // TempDB usage as fraction of total space (reserved + unallocated)
        var totalSpaceMb = peakReservedMb + lowestUnallocatedMb;
        var usageFraction = totalSpaceMb > 0 ? peakReservedMb / totalSpaceMb : 0;

        facts.Add(new Fact
        {
            Source = "tempdb",
            Key = "TEMPDB_USAGE",
            Value = usageFraction,
            ServerId = context.ServerId,
            Metadata = new()
            {
                ["max_reserved_mb"] = peakReservedMb,
                ["avg_reserved_mb"] = meanReservedMb,
                ["max_user_object_mb"] = peakUserObjMb,
                ["max_internal_object_mb"] = peakInternalObjMb,
                ["max_version_store_mb"] = peakVersionStoreMb,
                ["min_unallocated_mb"] = lowestUnallocatedMb,
                ["usage_fraction"] = usageFraction
            }
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectTempDbFactsAsync failed", ex);
    }
}

/// 
/// Collects memory grant facts from the memory_grant_stats table.
/// Detects grant waiters (sessions waiting for memory) and grant pressure.
/// 
private async Task CollectMemoryGrantFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    MAX(waiter_count) AS max_waiters,
    AVG(CAST(waiter_count AS FLOAT)) AS avg_waiters,
    MAX(grantee_count) AS max_grantees,
    SUM(timeout_error_count_delta) AS total_timeout_errors,
    SUM(forced_grant_count_delta) AS total_forced_grants
FROM collect.memory_grant_stats
WHERE collection_time >= @startTime
AND collection_time <= @endTime";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        long Lng(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));

        var peakWaiters = Lng(0);
        var meanWaiters = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1));
        var peakGrantees = Lng(2);
        var timeoutErrors = Lng(3);
        var forcedGrants = Lng(4);

        // Only create a fact if there's evidence of grant pressure
        if (peakWaiters <= 0 && timeoutErrors <= 0 && forcedGrants <= 0) return;

        facts.Add(new Fact
        {
            Source = "memory",
            Key = "MEMORY_GRANT_PENDING",
            Value = peakWaiters,
            ServerId = context.ServerId,
            Metadata = new()
            {
                ["max_waiters"] = peakWaiters,
                ["avg_waiters"] = meanWaiters,
                ["max_grantees"] = peakGrantees,
                ["total_timeout_errors"] = timeoutErrors,
                ["total_forced_grants"] = forcedGrants
            }
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectMemoryGrantFactsAsync failed", ex);
    }
}

/// 
/// Collects query-level aggregate facts from query_stats.
/// Focuses on spills (memory grant misestimates) and high-parallelism queries.
/// 
private async Task CollectQueryStatsFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    SUM(total_spills) AS total_spills,
    COUNT(CASE WHEN max_dop > 8 THEN 1 END) AS high_dop_queries,
    COUNT(CASE WHEN total_spills > 0 THEN 1 END) AS spilling_queries,
    SUM(execution_count_delta) AS total_executions,
    SUM(total_worker_time_delta) AS total_cpu_time_us
FROM collect.query_stats
WHERE collection_time >= @startTime
AND collection_time <= @endTime
AND execution_count_delta > 0";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        long Lng(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));

        var spills = Lng(0);
        var highDopCount = Lng(1);
        var spillingCount = Lng(2);
        var executions = Lng(3);
        var cpuTimeUs = Lng(4);

        // Spill fact and high-DOP fact are independent signals.
        if (spills > 0)
        {
            facts.Add(new Fact
            {
                Source = "queries",
                Key = "QUERY_SPILLS",
                Value = spills,
                ServerId = context.ServerId,
                Metadata = new()
                {
                    ["total_spills"] = spills,
                    ["spilling_query_count"] = spillingCount,
                    ["total_executions"] = executions
                }
            });
        }

        if (highDopCount > 0)
        {
            facts.Add(new Fact
            {
                Source = "queries",
                Key = "QUERY_HIGH_DOP",
                Value = highDopCount,
                ServerId = context.ServerId,
                Metadata = new()
                {
                    ["high_dop_query_count"] = highDopCount,
                    ["total_cpu_time_us"] = cpuTimeUs,
                    ["total_executions"] = executions
                }
            });
        }
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectQueryStatsFactsAsync failed", ex);
    }
}

/// 
/// Identifies individual queries that are consistently terrible ("bad actors").
/// These queries don't necessarily cause server-level symptoms but waste resources
/// on every execution. Detection uses execution count tiers x per-execution impact.
/// Top 5 worst offenders become individual BAD_ACTOR facts.
/// Dashboard query_hash is binary(8) — convert to hex string for fact key.
/// 
private async Task CollectBadActorFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 5
    database_name,
    CONVERT(VARCHAR(18), query_hash, 1) AS query_hash,
    CAST(SUM(execution_count_delta) AS BIGINT) AS exec_count,
    CASE WHEN SUM(execution_count_delta) > 0
        THEN CAST(SUM(total_worker_time_delta) AS FLOAT) / SUM(execution_count_delta) / 1000.0
        ELSE 0 END AS avg_cpu_ms,
    CASE WHEN SUM(execution_count_delta) > 0
        THEN CAST(SUM(total_elapsed_time_delta) AS FLOAT) / SUM(execution_count_delta) / 1000.0
        ELSE 0 END AS avg_elapsed_ms,
    CASE WHEN SUM(execution_count_delta) > 0
        THEN CAST(SUM(total_logical_reads_delta) AS FLOAT) / SUM(execution_count_delta)
        ELSE 0 END AS avg_reads,
    CAST(SUM(total_worker_time_delta) AS BIGINT) AS total_cpu_us,
    CAST(SUM(total_logical_reads_delta) AS BIGINT) AS total_reads,
    CAST(SUM(total_spills) AS BIGINT) AS total_spills,
    MAX(max_dop) AS max_dop,
    LEFT(CAST(DECOMPRESS(MAX(query_text)) AS NVARCHAR(MAX)), 200) AS query_text
FROM collect.query_stats
WHERE collection_time >= @startTime
AND collection_time <= @endTime
AND execution_count_delta > 0
GROUP BY database_name, query_hash
HAVING SUM(execution_count_delta) >= 100
ORDER BY CAST(SUM(total_worker_time_delta) AS FLOAT) / NULLIF(SUM(execution_count_delta), 0) *
    LOG(NULLIF(SUM(execution_count_delta), 0)) DESC";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();

        // Column accessors; NULLs degrade to empty string / zero.
        string Str(int ordinal) => reader.IsDBNull(ordinal) ? "" : reader.GetString(ordinal);
        double Dbl(int ordinal) => reader.IsDBNull(ordinal) ? 0.0 : Convert.ToDouble(reader.GetValue(ordinal));
        long Lng(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));

        while (await reader.ReadAsync())
        {
            var dbName = Str(0);
            var queryHash = Str(1);
            var execCount = Lng(2);
            var avgCpuMs = Dbl(3);
            var avgElapsedMs = Dbl(4);
            var avgReads = Dbl(5);
            var totalCpuUs = Lng(6);
            var totalReads = Lng(7);
            var totalSpills = Lng(8);
            var maxDop = reader.IsDBNull(9) ? 0 : Convert.ToInt32(reader.GetValue(9));
            // NOTE(review): query_text is fetched but never attached to the fact — confirm intended.
            var queryText = Str(10);

            // Skip low-impact queries — need meaningful per-execution cost
            if (avgCpuMs < 10 && avgReads < 1000) continue;

            facts.Add(new Fact
            {
                Source = "bad_actor",
                Key = $"BAD_ACTOR_{queryHash}",
                Value = avgCpuMs, // Primary scoring dimension
                ServerId = context.ServerId,
                DatabaseName = dbName,
                Metadata = new()
                {
                    ["execution_count"] = execCount,
                    ["avg_cpu_ms"] = avgCpuMs,
                    ["avg_elapsed_ms"] = avgElapsedMs,
                    ["avg_reads"] = avgReads,
                    ["total_cpu_us"] = totalCpuUs,
                    ["total_reads"] = totalReads,
                    ["total_spills"] = totalSpills,
                    ["max_dop"] = maxDop
                }
            });
        }
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectBadActorFactsAsync failed", ex);
    }
}

/// 
/// Collects key perfmon counters: Page Life Expectancy, Batch Requests/sec, compilations.
/// PLE is scored; others are throughput context for the AI.
/// 
private async Task CollectPerfmonFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH latest AS (
    SELECT
        counter_name,
        cntr_value,
        cntr_value_delta,
        ROW_NUMBER() OVER (PARTITION BY counter_name ORDER BY collection_time DESC) AS rn
    FROM collect.perfmon_stats
    WHERE collection_time >= @startTime
    AND collection_time <= @endTime
    AND counter_name IN ('Page life expectancy', 'Batch Requests/sec', 'SQL Compilations/sec', 'SQL Re-Compilations/sec')
)
SELECT counter_name, cntr_value, cntr_value_delta
FROM latest WHERE rn = 1";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();
        while (await reader.ReadAsync())
        {
            var counterName = reader.GetString(0);
            var rawValue = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1));
            var deltaValue = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2));

            // Map the counter to its fact key; unknown counters are skipped.
            var factKey = counterName switch
            {
                "Page life expectancy" => "PERFMON_PLE",
                "Batch Requests/sec" => "PERFMON_BATCH_REQ_SEC",
                "SQL Compilations/sec" => "PERFMON_COMPILATIONS_SEC",
                "SQL Re-Compilations/sec" => "PERFMON_RECOMPILATIONS_SEC",
                _ => null
            };
            if (factKey == null) continue;

            // For PLE, use the absolute value. For rate counters, use delta.
            var factValue = counterName == "Page life expectancy" ? (double)rawValue : (double)deltaValue;

            facts.Add(new Fact
            {
                Source = "perfmon",
                Key = factKey,
                Value = factValue,
                ServerId = context.ServerId,
                Metadata = new()
                {
                    ["cntr_value"] = rawValue,
                    ["delta_cntr_value"] = deltaValue
                }
            });
        }
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectPerfmonFactsAsync failed", ex);
    }
}

/// 
/// Collects top memory clerks by size. Context for understanding where memory is allocated.
/// Dashboard stores pages_kb — convert to MB for consistency with Lite facts.
/// 
private async Task CollectMemoryClerkFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH latest AS (
    SELECT
        clerk_type,
        SUM(pages_kb) / 1024.0 AS memory_mb,
        ROW_NUMBER() OVER (PARTITION BY clerk_type ORDER BY collection_time DESC) AS rn,
        collection_time
    FROM collect.memory_clerks_stats
    WHERE collection_time <= @endTime
    GROUP BY clerk_type, collection_time
)
SELECT TOP 10 clerk_type, memory_mb
FROM latest WHERE rn = 1 AND memory_mb > 0
ORDER BY memory_mb DESC";

        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();

        // Accumulate one metadata entry per clerk directly on the fact.
        var clerkFact = new Fact
        {
            Source = "memory",
            Key = "MEMORY_CLERKS",
            ServerId = context.ServerId
        };
        var combinedMb = 0.0;
        var clerkCount = 0;

        while (await reader.ReadAsync())
        {
            var clerkType = reader.GetString(0);
            var clerkMb = Convert.ToDouble(reader.GetValue(1));
            clerkFact.Metadata[clerkType] = clerkMb;
            combinedMb += clerkMb;
            clerkCount++;
        }

        if (clerkCount == 0) return;

        clerkFact.Metadata["total_top_clerks_mb"] = combinedMb;
        clerkFact.Metadata["clerk_count"] = clerkCount;
        clerkFact.Value = combinedMb;
        facts.Add(clerkFact);
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectMemoryClerkFactsAsync failed", ex);
    }
}

/// 
/// Collects database configuration facts: RCSI status, auto_shrink, auto_close,
/// recovery model. Aggregates counts across databases.
/// Dashboard stores config as individual setting rows in config.database_configuration_history.
/// We pivot from the per-setting rows into aggregated counts.
/// 
private async Task CollectDatabaseConfigFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH latest AS (
    SELECT
        database_name,
        setting_name,
        setting_value,
        ROW_NUMBER() OVER (PARTITION BY database_name, setting_name ORDER BY collection_time DESC) AS rn
    FROM config.database_configuration_history
    WHERE setting_type = 'database_option'
    AND database_name NOT IN ('master', 'msdb', 'model', 'tempdb')
),
pivoted AS (
    SELECT
        database_name,
        MAX(CASE WHEN setting_name = 'recovery_model_desc' THEN CAST(setting_value AS NVARCHAR(128)) END) AS recovery_model,
        MAX(CASE WHEN setting_name = 'is_auto_shrink_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_auto_shrink_on,
        MAX(CASE WHEN setting_name = 'is_auto_close_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_auto_close_on,
        MAX(CASE WHEN setting_name = 'is_read_committed_snapshot_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_read_committed_snapshot_on,
        MAX(CASE WHEN setting_name = 'is_auto_create_stats_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_auto_create_stats_on,
        MAX(CASE WHEN setting_name = 'is_auto_update_stats_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_auto_update_stats_on,
        MAX(CASE WHEN setting_name = 'page_verify_option_desc' THEN CAST(setting_value AS NVARCHAR(128)) END) AS page_verify_option,
        MAX(CASE WHEN setting_name = 'is_query_store_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_query_store_on
    FROM latest
    WHERE rn = 1
    GROUP BY database_name
)
SELECT
    COUNT(*) AS database_count,
    COUNT(CASE WHEN is_auto_shrink_on = '1' OR is_auto_shrink_on = 'True' THEN 1 END) AS auto_shrink_count,
    COUNT(CASE WHEN is_auto_close_on = '1' OR is_auto_close_on = 'True' THEN 1 END) AS auto_close_count,
    COUNT(CASE WHEN is_read_committed_snapshot_on = '0' OR is_read_committed_snapshot_on = 'False' THEN 1 END) AS rcsi_off_count,
    COUNT(CASE WHEN is_auto_create_stats_on = '0' OR is_auto_create_stats_on = 'False' THEN 1 END) AS auto_create_stats_off_count,
    COUNT(CASE WHEN is_auto_update_stats_on = '0' OR is_auto_update_stats_on = 'False' THEN 1 END) AS auto_update_stats_off_count,
    COUNT(CASE WHEN page_verify_option IS NOT NULL AND page_verify_option != 'CHECKSUM' THEN 1 END) AS page_verify_not_checksum_count,
    COUNT(CASE WHEN recovery_model = 'FULL' THEN 1 END) AS full_recovery_count,
    COUNT(CASE WHEN recovery_model = 'SIMPLE' THEN 1 END) AS simple_recovery_count,
    COUNT(CASE WHEN is_query_store_on = '1' OR is_query_store_on = 'True' THEN 1 END) AS query_store_on_count
FROM pivoted";

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        long Lng(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));

        var databaseCount = Lng(0);
        if (databaseCount == 0) return;

        facts.Add(new Fact
        {
            Source = "database_config",
            Key = "DB_CONFIG",
            Value = databaseCount,
            ServerId = context.ServerId,
            Metadata = new()
            {
                ["database_count"] = databaseCount,
                ["auto_shrink_on_count"] = Lng(1),
                ["auto_close_on_count"] = Lng(2),
                ["rcsi_off_count"] = Lng(3),
                ["auto_create_stats_off_count"] = Lng(4),
                ["auto_update_stats_off_count"] = Lng(5),
                ["page_verify_not_checksum_count"] = Lng(6),
                ["full_recovery_count"] = Lng(7),
                ["simple_recovery_count"] = Lng(8),
                ["query_store_on_count"] = Lng(9)
            }
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectDatabaseConfigFactsAsync failed", ex);
    }
}

/// 
/// Collects procedure stats: top procedure by delta CPU time in the period.
/// 
private async Task CollectProcedureStatsFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    COUNT(DISTINCT object_name) AS distinct_procs,
    SUM(execution_count_delta) AS total_executions,
    SUM(total_worker_time_delta) AS total_cpu_time_us,
    SUM(total_elapsed_time_delta) AS total_elapsed_time_us,
    SUM(total_logical_reads_delta) AS total_logical_reads
FROM collect.procedure_stats
WHERE collection_time >= @startTime
AND collection_time <= @endTime
AND execution_count_delta > 0";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        long Lng(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));

        var procCount = Lng(0);
        var executions = Lng(1);
        var cpuUs = Lng(2);
        var elapsedUs = Lng(3);
        var logicalReads = Lng(4);

        // No procedure executions in the window -> no fact.
        if (executions == 0) return;

        facts.Add(new Fact
        {
            Source = "queries",
            Key = "PROCEDURE_STATS",
            Value = cpuUs,
            ServerId = context.ServerId,
            Metadata = new()
            {
                ["distinct_procedures"] = procCount,
                ["total_executions"] = executions,
                ["total_cpu_time_us"] = cpuUs,
                ["total_elapsed_time_us"] = elapsedUs,
                ["total_logical_reads"] = logicalReads,
                ["avg_cpu_per_exec_us"] = executions > 0 ? (double)cpuUs / executions : 0
            }
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectProcedureStatsFactsAsync failed", ex);
    }
}

/// 
/// Collects active query snapshot facts: long-running queries, blocked sessions, high DOP.
/// Dashboard query_snapshots table is created by sp_WhoIsActive dynamically.
/// We query it if it exists.
/// 
private async Task CollectActiveQueryFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        // Check if the table exists first (created dynamically by sp_WhoIsActive)
        using var checkCmd = connection.CreateCommand();
        checkCmd.CommandText = "SELECT OBJECT_ID(N'collect.query_snapshots', N'U')";
        var objectId = await checkCmd.ExecuteScalarAsync();
        if (objectId is null or DBNull) return;

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    COUNT(*) AS total_snapshots,
    COUNT(CASE WHEN DATEDIFF(MILLISECOND, 0, [elapsed_time]) > 30000 THEN 1 END) AS long_running_count,
    COUNT(CASE WHEN [blocking_session_id] IS NOT NULL AND [blocking_session_id] != '' THEN 1 END) AS blocked_count,
    MAX(DATEDIFF(MILLISECOND, 0, [elapsed_time])) AS max_elapsed_ms,
    COUNT(DISTINCT [session_id]) AS distinct_sessions
FROM collect.query_snapshots
WHERE collection_time >= @startTime
AND collection_time <= @endTime";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        long Lng(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));

        var snapshots = Lng(0);
        if (snapshots == 0) return;

        var longRunning = Lng(1);

        facts.Add(new Fact
        {
            Source = "queries",
            Key = "ACTIVE_QUERIES",
            Value = longRunning,
            ServerId = context.ServerId,
            Metadata = new()
            {
                ["total_snapshots"] = snapshots,
                ["long_running_count"] = longRunning,
                ["blocked_count"] = Lng(2),
                ["max_elapsed_ms"] = Lng(3),
                ["distinct_sessions"] = Lng(4)
            }
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectActiveQueryFactsAsync failed", ex);
    }
}

/// 
/// Collects running job facts: jobs currently running long vs historical averages.
/// 
private async Task CollectRunningJobFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    COUNT(*) AS running_count,
    COUNT(CASE WHEN is_running_long = 1 THEN 1 END) AS running_long_count,
    MAX(percent_of_average) AS max_percent_of_avg,
    MAX(current_duration_seconds) AS max_duration_seconds
FROM collect.running_jobs
WHERE collection_time >= @startTime
AND collection_time <= @endTime";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        long Lng(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));

        var runningCount = Lng(0);
        if (runningCount == 0) return;

        var runningLongCount = Lng(1);
        var peakPercentOfAvg = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2));
        var peakDurationSeconds = Lng(3);

        facts.Add(new Fact
        {
            Source = "jobs",
            Key = "RUNNING_JOBS",
            Value = runningLongCount,
            ServerId = context.ServerId,
            Metadata = new()
            {
                ["running_count"] = runningCount,
                ["running_long_count"] = runningLongCount,
                ["max_percent_of_average"] = peakPercentOfAvg,
                ["max_duration_seconds"] = peakDurationSeconds
            }
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectRunningJobFactsAsync failed", ex);
    }
}

/// 
/// Collects session stats: connection counts, total connections.
/// Dashboard session_stats is a flat table (not per-program_name), so we adapt.
/// 
private async Task CollectSessionFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH latest AS (
    SELECT
        total_sessions,
        running_sessions,
        sleeping_sessions,
        dormant_sessions,
        databases_with_connections,
        top_application_connections,
        ROW_NUMBER() OVER (ORDER BY collection_time DESC) AS rn
    FROM collect.session_stats
    WHERE collection_time >= @startTime
    AND collection_time <= @endTime
)
SELECT
    total_sessions AS total_connections,
    running_sessions AS total_running,
    sleeping_sessions AS total_sleeping,
    dormant_sessions AS total_dormant,
    databases_with_connections AS distinct_apps,
    top_application_connections AS max_app_connections
FROM latest WHERE rn = 1";

        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        long Lng(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));

        var connections = Lng(0);
        if (connections == 0) return;

        facts.Add(new Fact
        {
            Source = "sessions",
            Key = "SESSION_STATS",
            Value = connections,
            ServerId = context.ServerId,
            Metadata = new()
            {
                ["total_connections"] = connections,
                ["total_running"] = Lng(1),
                ["total_sleeping"] = Lng(2),
                ["total_dormant"] = Lng(3),
                // NOTE(review): this key is fed by databases_with_connections (see SQL alias) — confirm naming.
                ["distinct_applications"] = Lng(4),
                ["max_app_connections"] = Lng(5)
            }
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectSessionFactsAsync failed", ex);
    }
}

/// 
/// Collects active global trace flags. Context for the AI to factor into recommendations.
/// 
private async Task CollectTraceFlagFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH latest AS (
    SELECT
        trace_flag,
        status,
        ROW_NUMBER() OVER (PARTITION BY trace_flag ORDER BY collection_time DESC) AS rn
    FROM config.trace_flags_history
    WHERE is_global = 1
)
SELECT trace_flag
FROM latest WHERE rn = 1 AND status = 1
ORDER BY trace_flag";

        using var reader = await cmd.ExecuteReaderAsync();

        // One metadata entry per enabled flag, accumulated directly on the fact.
        var flagFact = new Fact
        {
            Source = "config",
            Key = "TRACE_FLAGS",
            ServerId = context.ServerId
        };
        var enabledCount = 0;

        while (await reader.ReadAsync())
        {
            var flag = Convert.ToInt32(reader.GetValue(0));
            flagFact.Metadata[$"TF_{flag}"] = 1;
            enabledCount++;
        }

        if (enabledCount == 0) return;

        flagFact.Metadata["flag_count"] = enabledCount;
        flagFact.Value = enabledCount;
        facts.Add(flagFact);
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectTraceFlagFactsAsync failed", ex);
    }
}

/// 
/// Collects server hardware properties: CPU count, cores, sockets, memory.
/// Critical context for MAXDOP and memory recommendations.
/// 
private async Task CollectServerPropertiesFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 1
    cpu_count,
    hyperthread_ratio,
    physical_memory_mb,
    socket_count,
    cores_per_socket,
    is_hadr_enabled,
    edition,
    product_version
FROM collect.server_properties
ORDER BY collection_time DESC";

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        int Int(int ordinal) => reader.IsDBNull(ordinal) ? 0 : Convert.ToInt32(reader.GetValue(ordinal));

        var cpuCount = Int(0);
        var hyperthreadRatio = Int(1);
        var physicalMemoryMb = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2));
        var socketCount = Int(3);
        var coresPerSocket = Int(4);
        var hadrEnabled = !reader.IsDBNull(5) && Convert.ToBoolean(reader.GetValue(5));
        // NOTE(review): edition / product_version are selected but not read here — confirm intended.

        if (cpuCount == 0) return;

        facts.Add(new Fact
        {
            Source = "config",
            Key = "SERVER_HARDWARE",
            Value = cpuCount,
            ServerId = context.ServerId,
            Metadata = new()
            {
                ["cpu_count"] = cpuCount,
                ["hyperthread_ratio"] = hyperthreadRatio,
                ["physical_memory_mb"] = physicalMemoryMb,
                ["socket_count"] = socketCount,
                ["cores_per_socket"] = coresPerSocket,
                ["hadr_enabled"] = hadrEnabled ? 1 : 0
            }
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectServerPropertiesFactsAsync failed", ex);
    }
}

/// 
/// Collects disk space facts from database_size_stats: volume free space, file sizes.
/// </summary>
private async Task CollectDiskSpaceFactsAsync(AnalysisContext context, List<Fact> facts)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();

        using var cmd = connection.CreateCommand();
        // Latest row per mount point (rn = 1) as of the analysis window's end, then
        // aggregated across volumes: worst free %, worst absolute free MB, and totals.
        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH latest AS (
    SELECT
        volume_mount_point,
        volume_total_mb,
        volume_free_mb,
        ROW_NUMBER() OVER (PARTITION BY volume_mount_point ORDER BY collection_time DESC) AS rn
    FROM collect.database_size_stats
    WHERE collection_time <= @endTime
    AND volume_total_mb > 0
)
SELECT
    MIN(volume_free_mb * 1.0 / volume_total_mb) AS min_free_pct,
    MIN(volume_free_mb) AS min_free_mb,
    COUNT(DISTINCT volume_mount_point) AS volume_count,
    SUM(volume_total_mb) AS total_volume_mb,
    SUM(volume_free_mb) AS total_free_mb
FROM latest WHERE rn = 1";

        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        using var reader = await cmd.ExecuteReaderAsync();
        if (!await reader.ReadAsync()) return;

        // NULL min_free_pct defaults to 1.0 (fully free) so an empty aggregate never
        // reads as "disk full".
        var minFreePct = reader.IsDBNull(0) ? 1.0 : Convert.ToDouble(reader.GetValue(0));
        var minFreeMb = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1));
        var volumeCount = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2));
        var totalVolumeMb = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3));
        var totalFreeMb = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4));

        // No volumes observed in the window -> nothing to report.
        if (volumeCount == 0) return;

        facts.Add(new Fact
        {
            Source = "disk",
            Key = "DISK_SPACE",
            Value = minFreePct,
            ServerId = context.ServerId,
            Metadata = new Dictionary<string, double>
            {
                ["min_free_pct"] = minFreePct,
                ["min_free_mb"] = minFreeMb,
                ["volume_count"] = volumeCount,
                ["total_volume_mb"] = totalVolumeMb,
                ["total_free_mb"] = totalFreeMb
            }
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectDiskSpaceFactsAsync failed", ex);
    }
}

/// <summary>
/// Groups general lock waits (X, U, IX, SIX, BU, IU, UIX, etc.) into a single "LCK" fact.
/// Keeps individual facts for:
/// - LCK_M_S, LCK_M_IS (reader/writer blocking -- RCSI signal)
/// - LCK_M_RS_*, LCK_M_RIn_*, LCK_M_RX_* (serializable/repeatable read signal)
/// - SCH_M, SCH_S (schema locks -- DDL/index operations)
/// Individual constituent wait times are preserved in metadata as "{type}_ms" keys.
/// </summary>
private static void GroupGeneralLockWaits(List<Fact> facts, AnalysisContext context)
{
    var generalLocks = facts.Where(f => f.Source == "waits" && IsGeneralLockWait(f.Key)).ToList();
    if (generalLocks.Count == 0) return;

    var totalWaitTimeMs = generalLocks.Sum(f => f.Metadata.GetValueOrDefault("wait_time_ms"));
    var totalWaitingTasks = generalLocks.Sum(f => f.Metadata.GetValueOrDefault("waiting_tasks_count"));
    var totalSignalMs = generalLocks.Sum(f => f.Metadata.GetValueOrDefault("signal_wait_time_ms"));
    var avgMsPerWait = totalWaitingTasks > 0 ? totalWaitTimeMs / totalWaitingTasks : 0;
    // Fact value = fraction of the analysis window spent in general lock waits.
    var fractionOfPeriod = totalWaitTimeMs / context.PeriodDurationMs;

    var metadata = new Dictionary<string, double>
    {
        ["wait_time_ms"] = totalWaitTimeMs,
        ["waiting_tasks_count"] = totalWaitingTasks,
        ["signal_wait_time_ms"] = totalSignalMs,
        ["resource_wait_time_ms"] = totalWaitTimeMs - totalSignalMs,
        ["avg_ms_per_wait"] = avgMsPerWait,
        ["period_duration_ms"] = context.PeriodDurationMs,
        ["lock_type_count"] = generalLocks.Count
    };

    // Preserve individual constituent wait times for detailed analysis
    foreach (var lck in generalLocks)
        metadata[$"{lck.Key}_ms"] = lck.Metadata.GetValueOrDefault("wait_time_ms");

    // Remove individual facts, add grouped fact
    foreach (var lck in generalLocks)
        facts.Remove(lck);

    facts.Add(new Fact
    {
        Source = "waits",
        Key = "LCK",
        Value = fractionOfPeriod,
        ServerId = context.ServerId,
        Metadata = metadata
    });
}

/// <summary>
/// Groups all CX* parallelism waits (CXPACKET, CXCONSUMER, CXSYNC_PORT, CXSYNC_CONSUMER, etc.)
/// into a single "CXPACKET" fact. They all indicate the same thing: parallel queries are running.
/// Individual wait times are preserved in metadata for detailed analysis.
/// </summary>
private static void GroupParallelismWaits(List<Fact> facts, AnalysisContext context)
{
    var cxWaits = facts.Where(f => f.Source == "waits" && f.Key.StartsWith("CX", StringComparison.Ordinal)).ToList();
    // Unlike LCK grouping, a single CX* fact is left as-is; grouping only pays off with >1.
    if (cxWaits.Count <= 1) return;

    var totalWaitTimeMs = cxWaits.Sum(f => f.Metadata.GetValueOrDefault("wait_time_ms"));
    var totalWaitingTasks = cxWaits.Sum(f => f.Metadata.GetValueOrDefault("waiting_tasks_count"));
    var totalSignalMs = cxWaits.Sum(f => f.Metadata.GetValueOrDefault("signal_wait_time_ms"));
    var avgMsPerWait = totalWaitingTasks > 0 ? totalWaitTimeMs / totalWaitingTasks : 0;
    var fractionOfPeriod = totalWaitTimeMs / context.PeriodDurationMs;

    var metadata = new Dictionary<string, double>
    {
        ["wait_time_ms"] = totalWaitTimeMs,
        ["waiting_tasks_count"] = totalWaitingTasks,
        ["signal_wait_time_ms"] = totalSignalMs,
        ["resource_wait_time_ms"] = totalWaitTimeMs - totalSignalMs,
        ["avg_ms_per_wait"] = avgMsPerWait,
        ["period_duration_ms"] = context.PeriodDurationMs
    };

    // Preserve individual constituent wait times for detailed analysis
    foreach (var cx in cxWaits)
        metadata[$"{cx.Key}_ms"] = cx.Metadata.GetValueOrDefault("wait_time_ms");

    foreach (var cx in cxWaits)
        facts.Remove(cx);

    facts.Add(new Fact
    {
        Source = "waits",
        Key = "CXPACKET",
        Value = fractionOfPeriod,
        // Consistency fix: use context.ServerId like GroupGeneralLockWaits does, instead of
        // cxWaits[0].ServerId (the facts are collected for this context's server, so the
        // value should be the same — confirm against the waits collector).
        ServerId = context.ServerId,
        Metadata = metadata
    });
}

/// <summary>
/// Returns true for general lock waits that should be grouped into "LCK".
/// Excludes reader locks (S, IS), range locks (RS_*, RIn_*, RX_*), and schema locks.
/// </summary>
private static bool IsGeneralLockWait(string waitType)
{
    // Ordinal comparisons throughout: wait type names are machine identifiers, and the
    // CX* grouping above already uses StringComparison.Ordinal (CA1310 consistency fix).
    if (!waitType.StartsWith("LCK_M_", StringComparison.Ordinal)) return false;

    // Keep individual: reader/writer locks
    if (waitType is "LCK_M_S" or "LCK_M_IS") return false;

    // Keep individual: range locks (serializable/repeatable read)
    if (waitType.StartsWith("LCK_M_RS_", StringComparison.Ordinal) ||
        waitType.StartsWith("LCK_M_RIn_", StringComparison.Ordinal) ||
        waitType.StartsWith("LCK_M_RX_", StringComparison.Ordinal)) return false;

    // Everything else (X, U, IX, SIX, BU, IU, UIX, etc.) -> group
    return true;
}
}
diff --git a/Dashboard/Analysis/SqlServerFindingStore.cs b/Dashboard/Analysis/SqlServerFindingStore.cs
new file mode 100644
index 0000000..0fd0d73
--- /dev/null
+++ b/Dashboard/Analysis/SqlServerFindingStore.cs
@@ -0,0 +1,408 @@
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using Microsoft.Data.SqlClient;
using PerformanceMonitorDashboard.Helpers;

namespace PerformanceMonitorDashboard.Analysis;

/// <summary>
/// Persists analysis findings to SQL Server and checks for muted story hashes.
/// Handles the write side of the analysis pipeline -- after the engine produces
/// stories, SqlServerFindingStore saves them and filters out muted patterns.
/// Port of Lite's FindingStore -- uses SQL Server instead of DuckDB.
/// Auto-creates config.analysis_findings and config.analysis_muted tables if missing.
/// </summary>
public class SqlServerFindingStore
{
    private readonly string _connectionString;

    // Monotonic ID source for finding_id/mute_id rows written by this instance.
    // NOTE(review): seeding from UtcNow.Ticks risks primary-key collisions if two store
    // instances (or processes) write concurrently — confirm the single-writer assumption.
    private long _nextId;

    public SqlServerFindingStore(string connectionString)
    {
        _connectionString = connectionString;
        _nextId = DateTime.UtcNow.Ticks;
    }

    /// <summary>
    /// Ensures the analysis_findings and analysis_muted tables exist.
    /// Called before any read/write operation. Uses IF NOT EXISTS for idempotency.
/// </summary>
private async Task EnsureTablesExistAsync(SqlConnection connection)
{
    using var command = connection.CreateCommand();
    // Single batch creates both tables; each CREATE is guarded by an OBJECT_ID probe
    // so re-running is a no-op.
    command.CommandText = @"
IF OBJECT_ID(N'config.analysis_findings', N'U') IS NULL
BEGIN
    CREATE TABLE config.analysis_findings
    (
        finding_id bigint NOT NULL,
        analysis_time datetime2(7) NOT NULL,
        server_id integer NOT NULL,
        server_name nvarchar(256) NOT NULL,
        database_name nvarchar(256) NULL,
        time_range_start datetime2(7) NULL,
        time_range_end datetime2(7) NULL,
        severity float NOT NULL,
        confidence float NOT NULL,
        category nvarchar(256) NOT NULL,
        story_path nvarchar(2000) NOT NULL,
        story_path_hash nvarchar(256) NOT NULL,
        story_text nvarchar(4000) NOT NULL,
        root_fact_key nvarchar(256) NOT NULL,
        root_fact_value float NULL,
        leaf_fact_key nvarchar(256) NULL,
        leaf_fact_value float NULL,
        fact_count integer NOT NULL,
        CONSTRAINT PK_analysis_findings PRIMARY KEY CLUSTERED (finding_id)
            WITH (DATA_COMPRESSION = PAGE)
    );

    CREATE INDEX IX_analysis_findings_server_time
        ON config.analysis_findings (server_id, analysis_time DESC)
        WITH (DATA_COMPRESSION = PAGE);
END;

IF OBJECT_ID(N'config.analysis_muted', N'U') IS NULL
BEGIN
    CREATE TABLE config.analysis_muted
    (
        mute_id bigint NOT NULL,
        server_id integer NULL,
        story_path_hash nvarchar(256) NOT NULL,
        story_path nvarchar(2000) NOT NULL,
        muted_date datetime2(7) NOT NULL,
        reason nvarchar(1000) NULL,
        CONSTRAINT PK_analysis_muted PRIMARY KEY CLUSTERED (mute_id)
            WITH (DATA_COMPRESSION = PAGE)
    );

    CREATE INDEX IX_analysis_muted_server_hash
        ON config.analysis_muted (server_id, story_path_hash)
        WITH (DATA_COMPRESSION = PAGE);
END;";

    await command.ExecuteNonQueryAsync();
}

/// <summary>
/// Saves analysis stories as findings, filtering out any that match muted hashes.
/// Returns the list of findings that were actually saved (non-muted).
/// </summary>
public async Task<List<AnalysisFinding>> SaveFindingsAsync(
    List<AnalysisStory> stories, AnalysisContext context)
{
    var mutedHashes = await GetMutedHashesAsync(context.ServerId);
    var analysisTime = DateTime.UtcNow;
    var persisted = new List<AnalysisFinding>();

    foreach (var story in stories)
    {
        // Skip absolution stories (severity 0) -- they confirm health, not problems.
        // Skip anything the user has muted by story-path hash.
        if (story.Severity <= 0 || mutedHashes.Contains(story.StoryPathHash))
            continue;

        var finding = new AnalysisFinding
        {
            FindingId = _nextId++,
            AnalysisTime = analysisTime,
            ServerId = context.ServerId,
            ServerName = context.ServerName,
            TimeRangeStart = context.TimeRangeStart,
            TimeRangeEnd = context.TimeRangeEnd,
            Severity = story.Severity,
            Confidence = story.Confidence,
            Category = story.Category,
            StoryPath = story.StoryPath,
            StoryPathHash = story.StoryPathHash,
            StoryText = story.StoryText,
            RootFactKey = story.RootFactKey,
            RootFactValue = story.RootFactValue,
            LeafFactKey = story.LeafFactKey,
            LeafFactValue = story.LeafFactValue,
            FactCount = story.FactCount
        };

        // NOTE(review): InsertFindingAsync logs and swallows failures, so a finding can
        // appear in the returned "saved" list without a row actually landing — confirm
        // that best-effort semantics are intended here.
        await InsertFindingAsync(finding);
        persisted.Add(finding);
    }

    return persisted;
}

/// <summary>
/// Returns the most recent findings for a server within the given time range.
/// </summary>
public async Task<List<AnalysisFinding>> GetRecentFindingsAsync(
    int serverId, int hoursBack = 24, int limit = 100)
{
    var results = new List<AnalysisFinding>();

    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();
        await EnsureTablesExistAsync(connection);

        using var command = connection.CreateCommand();
        command.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP (@limit)
    finding_id, analysis_time, server_id, server_name, database_name,
    time_range_start, time_range_end, severity, confidence, category,
    story_path, story_path_hash, story_text,
    root_fact_key, root_fact_value, leaf_fact_key, leaf_fact_value, fact_count
FROM config.analysis_findings
WHERE server_id = @serverId
AND analysis_time >= @cutoff
ORDER BY analysis_time DESC, severity DESC;";

        command.Parameters.Add(new SqlParameter("@serverId", serverId));
        command.Parameters.Add(new SqlParameter("@cutoff", DateTime.UtcNow.AddHours(-hoursBack)));
        command.Parameters.Add(new SqlParameter("@limit", limit));

        using var reader = await command.ExecuteReaderAsync();
        while (await reader.ReadAsync())
            results.Add(ReadFinding(reader));
    }
    catch (Exception ex)
    {
        // Read failures degrade to an empty result rather than breaking the caller.
        Logger.Error($"[SqlServerFindingStore] GetRecentFindingsAsync failed: {ex.Message}");
    }

    return results;
}

/// <summary>
/// Returns the latest analysis run's findings for a server (most recent analysis_time).
/// </summary>
public async Task<List<AnalysisFinding>> GetLatestFindingsAsync(int serverId)
{
    var results = new List<AnalysisFinding>();

    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();
        await EnsureTablesExistAsync(connection);

        using var command = connection.CreateCommand();
        // Subquery pins the newest analysis_time for this server; the outer query
        // returns every finding from that run, worst first.
        command.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    finding_id, analysis_time, server_id, server_name, database_name,
    time_range_start, time_range_end, severity, confidence, category,
    story_path, story_path_hash, story_text,
    root_fact_key, root_fact_value, leaf_fact_key, leaf_fact_value, fact_count
FROM config.analysis_findings
WHERE server_id = @serverId
AND analysis_time = (
    SELECT MAX(analysis_time) FROM config.analysis_findings WHERE server_id = @serverId
)
ORDER BY severity DESC;";

        command.Parameters.Add(new SqlParameter("@serverId", serverId));

        using var reader = await command.ExecuteReaderAsync();
        while (await reader.ReadAsync())
            results.Add(ReadFinding(reader));
    }
    catch (Exception ex)
    {
        Logger.Error($"[SqlServerFindingStore] GetLatestFindingsAsync failed: {ex.Message}");
    }

    return results;
}

/// <summary>
/// Mutes a story pattern so it won't appear in future analysis runs.
/// </summary>
public async Task MuteStoryAsync(int serverId, string storyPathHash, string storyPath, string? reason = null)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();
        await EnsureTablesExistAsync(connection);

        using var command = connection.CreateCommand();
        command.CommandText = @"
INSERT INTO config.analysis_muted (mute_id, server_id, story_path_hash, story_path, muted_date, reason)
VALUES (@muteId, @serverId, @storyPathHash, @storyPath, @mutedDate, @reason);";

        command.Parameters.Add(new SqlParameter("@muteId", _nextId++));
        command.Parameters.Add(new SqlParameter("@serverId", serverId));
        command.Parameters.Add(new SqlParameter("@storyPathHash", storyPathHash));
        command.Parameters.Add(new SqlParameter("@storyPath", storyPath));
        command.Parameters.Add(new SqlParameter("@mutedDate", DateTime.UtcNow));
        command.Parameters.Add(new SqlParameter("@reason", (object?)reason ?? DBNull.Value));

        await command.ExecuteNonQueryAsync();
    }
    catch (Exception ex)
    {
        Logger.Error($"[SqlServerFindingStore] MuteStoryAsync failed: {ex.Message}");
    }
}

/// <summary>
/// Unmutes a story pattern.
/// </summary>
public async Task UnmuteStoryAsync(long muteId)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();
        await EnsureTablesExistAsync(connection);

        using var command = connection.CreateCommand();
        command.CommandText = "DELETE FROM config.analysis_muted WHERE mute_id = @muteId;";
        command.Parameters.Add(new SqlParameter("@muteId", muteId));
        await command.ExecuteNonQueryAsync();
    }
    catch (Exception ex)
    {
        Logger.Error($"[SqlServerFindingStore] UnmuteStoryAsync failed: {ex.Message}");
    }
}

/// <summary>
/// Cleans up old findings beyond the retention period.
/// </summary>
public async Task CleanupOldFindingsAsync(int retentionDays = 30)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();
        await EnsureTablesExistAsync(connection);

        using var command = connection.CreateCommand();
        command.CommandText = "DELETE FROM config.analysis_findings WHERE analysis_time < @cutoff;";
        command.Parameters.Add(new SqlParameter("@cutoff", DateTime.UtcNow.AddDays(-retentionDays)));
        await command.ExecuteNonQueryAsync();
    }
    catch (Exception ex)
    {
        Logger.Error($"[SqlServerFindingStore] CleanupOldFindingsAsync failed: {ex.Message}");
    }
}

/// <summary>
/// Loads the set of muted story-path hashes that apply to a server: rows scoped to
/// that server plus global rows (server_id IS NULL). On error, returns an empty set
/// so no finding is accidentally suppressed.
/// </summary>
private async Task<HashSet<string>> GetMutedHashesAsync(int serverId)
{
    var hashes = new HashSet<string>();

    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();
        await EnsureTablesExistAsync(connection);

        using var command = connection.CreateCommand();
        command.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT story_path_hash FROM config.analysis_muted
WHERE server_id = @serverId OR server_id IS NULL;";

        command.Parameters.Add(new SqlParameter("@serverId", serverId));

        using var reader = await command.ExecuteReaderAsync();
        while (await reader.ReadAsync())
            hashes.Add(reader.GetString(0));
    }
    catch (Exception ex)
    {
        Logger.Error($"[SqlServerFindingStore] GetMutedHashesAsync failed: {ex.Message}");
    }

    return hashes;
}

/// <summary>
/// Inserts one finding row. Failures are logged and swallowed (best-effort persistence).
/// </summary>
private async Task InsertFindingAsync(AnalysisFinding finding)
{
    try
    {
        using var connection = new SqlConnection(_connectionString);
        await connection.OpenAsync();
        await EnsureTablesExistAsync(connection);

        using var command = connection.CreateCommand();
        command.CommandText = @"
INSERT INTO config.analysis_findings
    (finding_id, analysis_time, server_id, server_name, database_name,
     time_range_start, time_range_end, severity, confidence, category,
     story_path, story_path_hash, story_text,
     root_fact_key, root_fact_value, leaf_fact_key, leaf_fact_value, fact_count)
VALUES
    (@findingId, @analysisTime, @serverId, @serverName, @databaseName,
     @timeRangeStart, @timeRangeEnd, @severity, @confidence, @category,
     @storyPath, @storyPathHash, @storyText,
     @rootFactKey, @rootFactValue, @leafFactKey, @leafFactValue, @factCount);";

        command.Parameters.Add(new SqlParameter("@findingId", finding.FindingId));
        command.Parameters.Add(new SqlParameter("@analysisTime", finding.AnalysisTime));
        command.Parameters.Add(new SqlParameter("@serverId", finding.ServerId));
        command.Parameters.Add(new SqlParameter("@serverName", finding.ServerName));
        command.Parameters.Add(new SqlParameter("@databaseName", (object?)finding.DatabaseName ?? DBNull.Value));
        command.Parameters.Add(new SqlParameter("@timeRangeStart", (object?)finding.TimeRangeStart ?? DBNull.Value));
        command.Parameters.Add(new SqlParameter("@timeRangeEnd", (object?)finding.TimeRangeEnd ?? DBNull.Value));
        command.Parameters.Add(new SqlParameter("@severity", finding.Severity));
        command.Parameters.Add(new SqlParameter("@confidence", finding.Confidence));
        command.Parameters.Add(new SqlParameter("@category", finding.Category));
        command.Parameters.Add(new SqlParameter("@storyPath", finding.StoryPath));
        command.Parameters.Add(new SqlParameter("@storyPathHash", finding.StoryPathHash));
        command.Parameters.Add(new SqlParameter("@storyText", finding.StoryText));
        command.Parameters.Add(new SqlParameter("@rootFactKey", finding.RootFactKey));
        command.Parameters.Add(new SqlParameter("@rootFactValue", (object?)finding.RootFactValue ?? DBNull.Value));
        command.Parameters.Add(new SqlParameter("@leafFactKey", (object?)finding.LeafFactKey ?? DBNull.Value));
        command.Parameters.Add(new SqlParameter("@leafFactValue", (object?)finding.LeafFactValue ?? DBNull.Value));
        command.Parameters.Add(new SqlParameter("@factCount", finding.FactCount));

        await command.ExecuteNonQueryAsync();
    }
    catch (Exception ex)
    {
        Logger.Error($"[SqlServerFindingStore] InsertFindingAsync failed: {ex.Message}");
    }
}

/// <summary>
/// Reads a single AnalysisFinding from a data reader row.
/// Column ordinals match the 18-column SELECT/INSERT lists used above.
/// </summary>
private static AnalysisFinding ReadFinding(SqlDataReader reader)
{
    return new AnalysisFinding
    {
        FindingId = reader.GetInt64(0),
        AnalysisTime = reader.GetDateTime(1),
        ServerId = reader.GetInt32(2),
        ServerName = reader.GetString(3),
        DatabaseName = reader.IsDBNull(4) ? null : reader.GetString(4),
        TimeRangeStart = reader.IsDBNull(5) ? null : reader.GetDateTime(5),
        TimeRangeEnd = reader.IsDBNull(6) ? null : reader.GetDateTime(6),
        Severity = reader.GetDouble(7),
        Confidence = reader.GetDouble(8),
        Category = reader.GetString(9),
        StoryPath = reader.GetString(10),
        StoryPathHash = reader.GetString(11),
        StoryText = reader.GetString(12),
        RootFactKey = reader.GetString(13),
        RootFactValue = reader.IsDBNull(14) ? null : reader.GetDouble(14),
        LeafFactKey = reader.IsDBNull(15) ? null : reader.GetString(15),
        LeafFactValue = reader.IsDBNull(16) ? null : reader.GetDouble(16),
        FactCount = reader.GetInt32(17)
    };
}
}
diff --git a/Dashboard/Analysis/SqlServerPlanFetcher.cs b/Dashboard/Analysis/SqlServerPlanFetcher.cs
new file mode 100644
index 0000000..f92099b
--- /dev/null
+++ b/Dashboard/Analysis/SqlServerPlanFetcher.cs
@@ -0,0 +1,58 @@
using System;
using System.Threading.Tasks;
using Microsoft.Data.SqlClient;
using PerformanceMonitorDashboard.Helpers;

namespace PerformanceMonitorDashboard.Analysis;

/// <summary>
/// Dashboard implementation of IPlanFetcher -- fetches execution plans from SQL Server
/// using the monitored server's connection string directly.
/// Simpler than Lite's SqlPlanFetcher because Dashboard has one connection string
/// per database (no need to look up servers by ID).
/// </summary>
public class SqlServerPlanFetcher : IPlanFetcher
{
    private readonly string _connectionString;

    public SqlServerPlanFetcher(string connectionString)
    {
        _connectionString = connectionString;
    }

    /// <summary>
    /// Fetches the XML execution plan for a plan handle via sys.dm_exec_query_plan.
    /// The handle is passed as its hex-string form and converted server-side with
    /// CONVERT(varbinary(64), @plan_handle, 1). Returns null for a missing/empty
    /// handle, a plan that is no longer cached, or any error (logged, not thrown).
    /// Short connect/command timeouts keep a slow target from stalling analysis.
    /// </summary>
    public async Task<string?> FetchPlanXmlAsync(int serverId, string planHandle)
    {
        if (string.IsNullOrEmpty(planHandle)) return null;

        try
        {
            var builder = new SqlConnectionStringBuilder(_connectionString)
            {
                ConnectTimeout = 10,
                CommandTimeout = 15
            };

            await using var connection = new SqlConnection(builder.ConnectionString);
            await connection.OpenAsync();

            await using var cmd = new SqlCommand(@"
SET NOCOUNT ON;
SELECT query_plan
FROM sys.dm_exec_query_plan(CONVERT(varbinary(64), @plan_handle, 1));", connection);

            cmd.CommandTimeout = 15;
            // Fix: use an explicit SqlParameter instead of AddWithValue. AddWithValue
            // infers the parameter type from the runtime value, and every other command
            // in this codebase uses the explicit Parameters.Add(new SqlParameter(...)) form.
            cmd.Parameters.Add(new SqlParameter("@plan_handle", planHandle));

            var result = await cmd.ExecuteScalarAsync();
            if (result == null || result is DBNull) return null;

            return result.ToString();
        }
        catch (Exception ex)
        {
            Logger.Error(
                $"[SqlServerPlanFetcher] Failed to fetch plan for handle {planHandle}: {ex.Message}");
            return null;
        }
    }
}
diff --git a/Dashboard/Mcp/McpAnalysisTools.cs b/Dashboard/Mcp/McpAnalysisTools.cs
new file mode 100644
index 0000000..72ffa00
--- /dev/null
+++ b/Dashboard/Mcp/McpAnalysisTools.cs
@@ -0,0 +1,465 @@
using System;
using System.ComponentModel;
using System.Linq;
using System.Text.Json;
using System.Threading.Tasks;
using ModelContextProtocol.Server;
using PerformanceMonitorDashboard.Analysis;
using PerformanceMonitorDashboard.Services;

namespace PerformanceMonitorDashboard.Mcp;

[McpServerToolType]
public sealed class McpAnalysisTools
{
    /// <summary>
    /// Creates an AnalysisService for the resolved server's connection.
    /// Dashboard creates per-request (each server has its own connection string).
+ /// + private static AnalysisService CreateAnalysisService(DatabaseService service) + { + var planFetcher = new SqlServerPlanFetcher(service.ConnectionString); + return new AnalysisService(service.ConnectionString, planFetcher); + } + + [McpServerTool(Name = "analyze_server"), Description("Runs the diagnostic inference engine against a server's collected data. Scores wait stats, blocking, memory, config, and other facts, then traverses a relationship graph to build evidence-backed stories about what's wrong and why. Returns structured findings with severity scores, evidence chains, drill-down data, and recommended next tools to call.")] + public static async Task AnalyzeServer( + ServerManager serverManager, + DatabaseServiceRegistry registry, + [Description("Server name or display name.")] string? server_name = null, + [Description("Hours of data to analyze. Default 4.")] int hours_back = 4) + { + var resolved = ServerResolver.Resolve(serverManager, registry, server_name); + if (resolved == null) + return $"Could not resolve server. Available servers:\n{ServerResolver.ListAvailableServers(serverManager)}"; + + var validation = McpHelpers.ValidateHoursBack(hours_back); + if (validation != null) return validation; + + try + { + var analysisService = CreateAnalysisService(resolved.Value.Service); + var serverId = resolved.Value.ServerName.GetHashCode(); + var findings = await analysisService.AnalyzeAsync(serverId, resolved.Value.ServerName, hours_back); + + if (analysisService.InsufficientDataMessage != null) + { + return JsonSerializer.Serialize(new + { + server = resolved.Value.ServerName, + status = "insufficient_data", + message = analysisService.InsufficientDataMessage + }, McpHelpers.JsonOptions); + } + + if (findings.Count == 0) + { + return JsonSerializer.Serialize(new + { + server = resolved.Value.ServerName, + status = "healthy", + message = "No significant findings. 
All metrics are within normal ranges.", + analysis_time = analysisService.LastAnalysisTime?.ToString("o") + }, McpHelpers.JsonOptions); + } + + return JsonSerializer.Serialize(new + { + server = resolved.Value.ServerName, + status = "findings", + finding_count = findings.Count, + analysis_time = analysisService.LastAnalysisTime?.ToString("o"), + time_range = new + { + start = findings[0].TimeRangeStart?.ToString("o"), + end = findings[0].TimeRangeEnd?.ToString("o") + }, + findings = findings.Select(f => new + { + severity = Math.Round(f.Severity, 2), + confidence = Math.Round(f.Confidence, 2), + category = f.Category, + root_fact = new { key = f.RootFactKey, value = f.RootFactValue }, + leaf_fact = f.LeafFactKey != null + ? new { key = f.LeafFactKey, value = f.LeafFactValue } + : null, + story_path = f.StoryPath, + story_path_hash = f.StoryPathHash, + fact_count = f.FactCount, + drill_down = f.DrillDown, + next_tools = ToolRecommendations.GetForStoryPath(f.StoryPath) + }) + }, McpHelpers.JsonOptions); + } + catch (Exception ex) + { + return McpHelpers.FormatError("analyze_server", ex); + } + } + + [McpServerTool(Name = "get_analysis_facts"), Description("Exposes the raw scored facts from the inference engine's collect+score pipeline. Shows every observation the engine sees with base severity, final severity after amplifiers, and which amplifiers matched.")] + public static async Task GetAnalysisFacts( + ServerManager serverManager, + DatabaseServiceRegistry registry, + [Description("Server name or display name.")] string? server_name = null, + [Description("Hours of data to analyze. Default 4.")] int hours_back = 4, + [Description("Filter to a specific source category. Omit for all.")] string? source = null, + [Description("Minimum severity to include. Default 0.")] double min_severity = 0) + { + var resolved = ServerResolver.Resolve(serverManager, registry, server_name); + if (resolved == null) + return $"Could not resolve server. 
Available servers:\n{ServerResolver.ListAvailableServers(serverManager)}"; + + var validation = McpHelpers.ValidateHoursBack(hours_back); + if (validation != null) return validation; + + try + { + var analysisService = CreateAnalysisService(resolved.Value.Service); + var serverId = resolved.Value.ServerName.GetHashCode(); + var facts = await analysisService.CollectAndScoreFactsAsync(serverId, resolved.Value.ServerName, hours_back); + + if (facts.Count == 0) + { + return JsonSerializer.Serialize(new + { + server = resolved.Value.ServerName, + fact_count = 0, + message = "No facts collected." + }, McpHelpers.JsonOptions); + } + + var filtered = facts.AsEnumerable(); + if (source != null) + filtered = filtered.Where(f => f.Source.Equals(source, StringComparison.OrdinalIgnoreCase)); + if (min_severity > 0) + filtered = filtered.Where(f => f.Severity >= min_severity); + + var result = filtered + .OrderByDescending(f => f.Severity) + .Select(f => new + { + source = f.Source, + key = f.Key, + value = Math.Round(f.Value, 6), + base_severity = Math.Round(f.BaseSeverity, 4), + severity = Math.Round(f.Severity, 4), + metadata = f.Metadata.ToDictionary( + m => m.Key, + m => Math.Round(m.Value, 2)), + amplifiers = f.AmplifierResults.Count > 0 + ? 
f.AmplifierResults.Select(a => new + { + description = a.Description, + matched = a.Matched, + boost = a.Boost + }) + : null + }) + .ToList(); + + return JsonSerializer.Serialize(new + { + server = resolved.Value.ServerName, + total_facts = facts.Count, + shown = result.Count, + filters = new { source, min_severity }, + facts = result + }, McpHelpers.JsonOptions); + } + catch (Exception ex) + { + return McpHelpers.FormatError("get_analysis_facts", ex); + } + } + + [McpServerTool(Name = "compare_analysis"), Description("Compares two time periods by running fact collection and scoring on each, showing what changed.")] + public static async Task CompareAnalysis( + ServerManager serverManager, + DatabaseServiceRegistry registry, + [Description("Server name or display name.")] string? server_name = null, + [Description("Hours back for the comparison period. Default 4.")] int hours_back = 4, + [Description("Hours back for the baseline period start. Default 28.")] int baseline_hours_back = 28) + { + var resolved = ServerResolver.Resolve(serverManager, registry, server_name); + if (resolved == null) + return $"Could not resolve server. 
Available servers:\n{ServerResolver.ListAvailableServers(serverManager)}"; + + var validation = McpHelpers.ValidateHoursBack(hours_back); + if (validation != null) return validation; + validation = McpHelpers.ValidateHoursBack(baseline_hours_back); + if (validation != null) return validation; + + if (baseline_hours_back <= hours_back) + return "baseline_hours_back must be greater than hours_back."; + + try + { + var analysisService = CreateAnalysisService(resolved.Value.Service); + var serverId = resolved.Value.ServerName.GetHashCode(); + + var now = DateTime.UtcNow; + var comparisonStart = now.AddHours(-hours_back); + var baselineEnd = now.AddHours(-baseline_hours_back + hours_back); + var baselineStart = now.AddHours(-baseline_hours_back); + + var (baselineFacts, comparisonFacts) = await analysisService.ComparePeriodsAsync( + serverId, resolved.Value.ServerName, baselineStart, baselineEnd, comparisonStart, now); + + var baselineByKey = baselineFacts.ToDictionary(f => f.Key, f => f); + var comparisonByKey = comparisonFacts.ToDictionary(f => f.Key, f => f); + var allKeys = baselineByKey.Keys.Union(comparisonByKey.Keys).ToHashSet(); + + var comparisons = allKeys + .Select(key => + { + baselineByKey.TryGetValue(key, out var baseline); + comparisonByKey.TryGetValue(key, out var comparison); + var severityDelta = (comparison?.Severity ?? 0) - (baseline?.Severity ?? 0); + + return new + { + key, + source = baseline?.Source ?? comparison?.Source ?? "unknown", + baseline_severity = baseline != null ? Math.Round(baseline.Severity, 4) : (double?)null, + comparison_severity = comparison != null ? Math.Round(comparison.Severity, 4) : (double?)null, + severity_delta = Math.Round(severityDelta, 4), + status = severityDelta > 0.1 ? "worse" : severityDelta < -0.1 ? 
"better" : "stable" + }; + }) + .OrderByDescending(c => Math.Abs(c.severity_delta)) + .ToList(); + + return JsonSerializer.Serialize(new + { + server = resolved.Value.ServerName, + summary = new + { + worse = comparisons.Count(c => c.status == "worse"), + better = comparisons.Count(c => c.status == "better"), + stable = comparisons.Count(c => c.status == "stable") + }, + facts = comparisons + }, McpHelpers.JsonOptions); + } + catch (Exception ex) + { + return McpHelpers.FormatError("compare_analysis", ex); + } + } + + [McpServerTool(Name = "audit_config"), Description("Evaluates SQL Server configuration settings against best practices, accounting for edition and server resources.")] + public static async Task AuditConfig( + ServerManager serverManager, + DatabaseServiceRegistry registry, + [Description("Server name or display name.")] string? server_name = null) + { + var resolved = ServerResolver.Resolve(serverManager, registry, server_name); + if (resolved == null) + return $"Could not resolve server. Available servers:\n{ServerResolver.ListAvailableServers(serverManager)}"; + + try + { + var analysisService = CreateAnalysisService(resolved.Value.Service); + var serverId = resolved.Value.ServerName.GetHashCode(); + var facts = await analysisService.CollectAndScoreFactsAsync(serverId, resolved.Value.ServerName, 1); + + var factsByKey = facts.ToDictionary(f => f.Key, f => f); + + var edition = factsByKey.TryGetValue("SERVER_EDITION", out var edFact) ? (int)edFact.Value : 0; + var totalMemoryMb = factsByKey.TryGetValue("MEMORY_TOTAL_PHYSICAL_MB", out var memFact) ? memFact.Value : 0; + + var editionName = edition switch + { + 2 => "Standard", + 3 => "Enterprise", + 4 => "Express", + _ => "Unknown" + }; + var isEnterprise = edition == 3; + var isExpress = edition == 4; + + var recommendations = new System.Collections.Generic.List(); + + if (factsByKey.TryGetValue("CONFIG_CTFP", out var ctfpFact)) + { + var ctfp = (int)ctfpFact.Value; + var status = ctfp <= 5 ? 
"warning" : ctfp < 25 ? "review" : ctfp > 100 ? "review" : "ok"; + var suggested = ctfp <= 5 || ctfp < 25 ? 50 : ctfp > 100 ? 50 : ctfp; + recommendations.Add(new { setting = "cost threshold for parallelism", current_value = ctfp, suggested_value = suggested, status }); + } + + if (factsByKey.TryGetValue("CONFIG_MAXDOP", out var maxdopFact)) + { + var maxdop = (int)maxdopFact.Value; + var suggested = maxdop == 0 ? (isExpress ? 1 : isEnterprise ? 8 : 4) : maxdop; + var status = maxdop == 0 ? "warning" : maxdop == 1 && !isExpress ? "review" : "ok"; + recommendations.Add(new { setting = "max degree of parallelism", current_value = maxdop, suggested_value = suggested, status }); + } + + if (factsByKey.TryGetValue("CONFIG_MAX_MEMORY_MB", out var maxMemFact)) + { + var maxMem = (int)maxMemFact.Value; + var status = maxMem == 2147483647 ? "warning" : "ok"; + recommendations.Add(new { setting = "max server memory (MB)", current_value = maxMem, suggested_value = maxMem, status }); + } + + return JsonSerializer.Serialize(new + { + server = resolved.Value.ServerName, + edition = editionName, + recommendations + }, McpHelpers.JsonOptions); + } + catch (Exception ex) + { + return McpHelpers.FormatError("audit_config", ex); + } + } + + [McpServerTool(Name = "get_analysis_findings"), Description("Gets persisted findings from previous analysis runs.")] + public static async Task GetAnalysisFindings( + ServerManager serverManager, + DatabaseServiceRegistry registry, + [Description("Server name or display name.")] string? server_name = null, + [Description("Hours of finding history. Default 24.")] int hours_back = 24) + { + var resolved = ServerResolver.Resolve(serverManager, registry, server_name); + if (resolved == null) + return $"Could not resolve server. 
Available servers:\n{ServerResolver.ListAvailableServers(serverManager)}"; + + try + { + var analysisService = CreateAnalysisService(resolved.Value.Service); + var serverId = resolved.Value.ServerName.GetHashCode(); + var findings = await analysisService.GetRecentFindingsAsync(serverId, hours_back); + + if (findings.Count == 0) + return JsonSerializer.Serialize(new { server = resolved.Value.ServerName, finding_count = 0, message = "No findings. Run analyze_server to generate new findings." }, McpHelpers.JsonOptions); + + return JsonSerializer.Serialize(new + { + server = resolved.Value.ServerName, + finding_count = findings.Count, + findings = findings.Select(f => new + { + severity = Math.Round(f.Severity, 2), + category = f.Category, + story_path = f.StoryPath, + story_path_hash = f.StoryPathHash, + analysis_time = f.AnalysisTime.ToString("o") + }) + }, McpHelpers.JsonOptions); + } + catch (Exception ex) + { + return McpHelpers.FormatError("get_analysis_findings", ex); + } + } + + [McpServerTool(Name = "mute_analysis_finding"), Description("Mutes a finding pattern so it won't appear in future analysis runs.")] + public static async Task MuteAnalysisFinding( + ServerManager serverManager, + DatabaseServiceRegistry registry, + [Description("The story_path_hash from the finding to mute.")] string story_path_hash, + [Description("Server name.")] string? server_name = null, + [Description("Optional reason for muting.")] string? reason = null) + { + var resolved = ServerResolver.Resolve(serverManager, registry, server_name); + if (resolved == null) + return $"Could not resolve server. 
Available servers:\n{ServerResolver.ListAvailableServers(serverManager)}"; + + try + { + var analysisService = CreateAnalysisService(resolved.Value.Service); + var serverId = resolved.Value.ServerName.GetHashCode(); + var finding = new AnalysisFinding { ServerId = serverId, StoryPathHash = story_path_hash, StoryPath = story_path_hash }; + await analysisService.MuteFindingAsync(finding, reason); + + return JsonSerializer.Serialize(new { status = "muted", story_path_hash, reason }, McpHelpers.JsonOptions); + } + catch (Exception ex) + { + return McpHelpers.FormatError("mute_analysis_finding", ex); + } + } +} + +/// +/// Maps fact keys to recommended MCP tools for further investigation. +/// Shared between Lite and Dashboard — same recommendations. +/// +internal static class ToolRecommendations +{ + private static readonly System.Collections.Generic.Dictionary> ByFactKey = new() + { + ["SOS_SCHEDULER_YIELD"] = [new("get_cpu_utilization", "Check CPU usage over time"), new("get_top_queries_by_cpu", "Find CPU-expensive queries")], + ["CXPACKET"] = [new("get_top_queries_by_cpu", "Find parallel queries", new() { ["parallel_only"] = "true" }), new("audit_config", "Check CTFP and MAXDOP")], + ["THREADPOOL"] = [new("get_top_queries_by_cpu", "Find resource-consuming queries"), new("get_blocking", "Check if blocking is holding threads")], + ["PAGEIOLATCH_SH"] = [new("get_file_io_stats", "Check I/O latency"), new("get_memory_stats", "Check buffer pool")], + ["PAGEIOLATCH_EX"] = [new("get_file_io_stats", "Check I/O latency"), new("get_memory_stats", "Check buffer pool")], + ["RESOURCE_SEMAPHORE"] = [new("get_resource_semaphore", "Check memory grants")], + ["WRITELOG"] = [new("get_file_io_stats", "Check log file latency")], + ["LCK"] = [new("get_blocking", "Get blocking details"), new("get_deadlocks", "Check for deadlocks")], + ["LCK_M_S"] = [new("get_blocking", "Get reader/writer blocking details")], + ["BLOCKING_EVENTS"] = [new("get_blocking", "Get detailed blocking reports"), 
new("get_deadlocks", "Check for deadlocks")], + ["DEADLOCKS"] = [new("get_deadlocks", "Get deadlock events"), new("get_deadlock_detail", "Get full deadlock XML")], + ["CPU_SQL_PERCENT"] = [new("get_cpu_utilization", "See CPU trend"), new("get_top_queries_by_cpu", "Find CPU queries")], + ["CPU_SPIKE"] = [new("get_cpu_utilization", "See when spike occurred"), new("get_top_queries_by_cpu", "Find queries that drove the spike")], + ["IO_READ_LATENCY_MS"] = [new("get_file_io_stats", "Check per-file latency"), new("get_memory_stats", "Check buffer pool")], + ["IO_WRITE_LATENCY_MS"] = [new("get_file_io_stats", "Check per-file latency")], + ["TEMPDB_USAGE"] = [new("get_tempdb_trend", "Track TempDB usage")], + ["MEMORY_GRANT_PENDING"] = [new("get_resource_semaphore", "Check memory grants")], + ["QUERY_SPILLS"] = [new("get_top_queries_by_cpu", "Find queries with spills")], + ["QUERY_HIGH_DOP"] = [new("get_top_queries_by_cpu", "Find high-DOP queries", new() { ["parallel_only"] = "true" })], + ["PERFMON_PLE"] = [new("get_memory_stats", "Check buffer pool"), new("get_memory_clerks", "See memory allocation")], + ["DB_CONFIG"] = [new("audit_config", "Check configuration")], + ["DISK_SPACE"] = [new("get_file_io_stats", "Check per-file sizes")], + ["LATCH_EX"] = [new("get_latch_stats", "Check latch contention"), new("get_tempdb_trend", "Check TempDB")], + ["BAD_ACTOR"] = [new("get_top_queries_by_cpu", "See full query stats"), new("analyze_query_plan", "Analyze the execution plan")], + ["ANOMALY_CPU"] = [new("get_cpu_utilization", "See CPU trend"), new("get_active_queries", "Find what ran during spike")], + ["ANOMALY_WAIT"] = [new("get_wait_stats", "See wait breakdown"), new("compare_analysis", "Compare current vs baseline")], + ["ANOMALY_BLOCKING"] = [new("get_blocking", "Get blocking details"), new("get_deadlocks", "Get deadlock events")], + ["ANOMALY_IO"] = [new("get_file_io_stats", "Check I/O latency"), new("get_memory_stats", "Check buffer pool")] + }; + + public static 
System.Collections.Generic.List GetForStoryPath(string storyPath) + { + var factKeys = storyPath.Split(" → ", StringSplitOptions.RemoveEmptyEntries); + var seen = new System.Collections.Generic.HashSet(); + var result = new System.Collections.Generic.List(); + + foreach (var key in factKeys) + { + if (!ByFactKey.TryGetValue(key, out var recommendations)) + { + if (key.StartsWith("BAD_ACTOR_")) + ByFactKey.TryGetValue("BAD_ACTOR", out recommendations); + else if (key.StartsWith("ANOMALY_CPU")) + ByFactKey.TryGetValue("ANOMALY_CPU", out recommendations); + else if (key.StartsWith("ANOMALY_WAIT_")) + ByFactKey.TryGetValue("ANOMALY_WAIT", out recommendations); + else if (key.StartsWith("ANOMALY_BLOCKING") || key.StartsWith("ANOMALY_DEADLOCK")) + ByFactKey.TryGetValue("ANOMALY_BLOCKING", out recommendations); + else if (key.StartsWith("ANOMALY_READ") || key.StartsWith("ANOMALY_WRITE")) + ByFactKey.TryGetValue("ANOMALY_IO", out recommendations); + if (recommendations == null) continue; + } + + foreach (var rec in recommendations) + { + if (!seen.Add(rec.Tool)) continue; + result.Add(rec.SuggestedParams != null && rec.SuggestedParams.Count > 0 + ? new { tool = rec.Tool, reason = rec.Reason, suggested_params = rec.SuggestedParams } + : (object)new { tool = rec.Tool, reason = rec.Reason }); + } + } + + return result; + } +} + +internal record ToolRecommendation( + string Tool, + string Reason, + System.Collections.Generic.Dictionary? 
SuggestedParams = null); diff --git a/Dashboard/Mcp/McpHostService.cs b/Dashboard/Mcp/McpHostService.cs index 7f92768..6a906db 100644 --- a/Dashboard/Mcp/McpHostService.cs +++ b/Dashboard/Mcp/McpHostService.cs @@ -90,7 +90,8 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) .WithTools() .WithTools() .WithTools() - .WithTools(); + .WithTools() + .WithTools(); _app = builder.Build(); _app.MapMcp(); diff --git a/Dashboard/Mcp/McpInstructions.cs b/Dashboard/Mcp/McpInstructions.cs index 9c91b8e..65ba0f9 100644 --- a/Dashboard/Mcp/McpInstructions.cs +++ b/Dashboard/Mcp/McpInstructions.cs @@ -177,11 +177,22 @@ You are connected to a SQL Server performance monitoring tool via Performance Mo - Join issues: OR clauses, high nested loop executions, many-to-many merge joins - UDF execution overhead, table variable usage, CTE multiple references + ### Diagnostic Analysis Tools + | Tool | Purpose | Key Parameters | + |------|---------|----------------| + | `analyze_server` | Runs the inference engine: scores facts, traverses relationship graph, returns evidence-backed findings with severity, drill-down data, and recommended next tools | `server_name`, `hours_back` (default 4) | + | `get_analysis_facts` | Exposes raw scored facts from the collect+score pipeline with base severity, amplifiers, and metadata | `server_name`, `hours_back` (default 4), `source` (filter), `min_severity` | + | `compare_analysis` | Compares two time periods showing severity deltas for each fact | `server_name`, `hours_back` (default 4), `baseline_hours_back` (default 28) | + | `audit_config` | Edition-aware configuration audit: CTFP, MAXDOP, max memory, max worker threads | `server_name` | + | `get_analysis_findings` | Retrieves persisted findings from previous analysis runs | `server_name`, `hours_back` (default 24) | + | `mute_analysis_finding` | Mutes a finding pattern by story_path_hash | `story_path_hash` (required), `server_name`, `reason` | + ## Recommended Workflow 1. 
**Start**: `list_servers` — see what's monitored and which servers are online 2. **Verify**: `get_collection_health` — check collectors are running successfully - 3. **Overview**: `get_daily_summary` — high-level health: blocking, deadlocks, CPU spikes, memory pressure + 3. **Diagnose**: `analyze_server` — run the inference engine for evidence-backed assessment with drill-down data + 4. **Overview**: `get_daily_summary` — high-level health: blocking, deadlocks, CPU spikes, memory pressure 4. **Drill down** based on findings: - High wait times → `get_wait_stats` → `get_wait_trend` to see changes - CPU pressure → `get_cpu_utilization` → `get_top_queries_by_cpu` or `get_expensive_queries` diff --git a/README.md b/README.md index e58e884..823bb57 100644 --- a/README.md +++ b/README.md @@ -1,685 +1,685 @@ -# SQL Server Performance Monitor - -**Free, open-source monitoring that replaces the tools charging you thousands per server per year.** 30+ collectors, real-time alerts, built-in MCP server for AI analysis. Nothing phones home. Your data stays on your server and your machine. - -**Supported:** SQL Server 2016–2025 | Azure SQL Managed Instance | AWS RDS for SQL Server | Azure SQL Database (Lite only) - -![Dashboard landing page with server health cards](Screenshots/Screenshot%20Dashboard%20landing%20page%20with%20server%20health%20cards.jpg) - - -![Full Dashboard — Resource Overview](Screenshots/Full%20Dashboard%20%E2%80%94%20Resource%20Overview.jpg) - ---- - -## Download - -**👉 Not sure which edition to pick? [Start with Lite.](https://github.com/erikdarlingdata/PerformanceMonitor/releases/latest)** One download, nothing installed on your server, data flowing in under 5 minutes. 
- -| | **[Full Edition](https://github.com/erikdarlingdata/PerformanceMonitor/releases/latest)** | **[Lite Edition](https://github.com/erikdarlingdata/PerformanceMonitor/releases/latest)** | -|---|---|---| -| **What it does** | Installs a `PerformanceMonitor` database with 30 T-SQL collectors running via SQL Agent. Separate dashboard app connects to view everything. | Single desktop app that monitors remotely. Stores data locally in DuckDB + Parquet. Nothing touches your server. | -| **Best for** | Production 24/7 monitoring, long-term baselining | Quick triage, Azure SQL DB, locked-down servers, consultants, firefighting | -| **Requires** | SQL Agent running ([see permissions](#permissions)) | `VIEW SERVER STATE` ([see permissions](#permissions)) | -| **Get started** | Run the installer, open the dashboard | Download, run, add a server, done | - -Both editions include real-time alerts (system tray + email), charts and graphs, dark and light themes, CSV export, and a built-in MCP server for AI-powered analysis with tools like Claude. - -All release binaries are digitally signed via [SignPath](https://signpath.io) — no more Windows SmartScreen warnings. - ---- - -## What People Are Saying - -> *"You guys make us DBAs look like absolute rockstars. I'm over here getting showered with praise, and all I do is use your scripts and follow your advice."* - -> *"replaced SentryOne and had it running in 10 minutes"* - -> *"I've had enough time to gather data and converse with Claude on this. It helped a lot to zone in on CPU starvation from the hypervisor on which the VM runs. IT team currently investigating the host configuration."* - ---- - -## What You Get - -🔍 **32 specialized T-SQL collectors** running on configurable schedules with named presets (Aggressive, Balanced, Low-Impact) — wait stats, query performance, blocking chains, deadlock graphs, memory grants, file I/O, tempdb, perfmon counters, FinOps/capacity, and more. 
Query text and execution plan collection can be disabled per-collector for sensitive environments. - -🚨 **Real-time alerts** for blocking, deadlocks, and high CPU — system tray notifications plus styled HTML emails with full XML attachments for offline analysis - -📊 **NOC-style dashboard** with green/yellow/red health cards, auto-refresh, configurable time ranges, and dark/light themes - -📋 **Graphical plan viewer** with native ShowPlan rendering, 30-rule PlanAnalyzer, operator-level cost breakdown, and a standalone mode for opening `.sqlplan` files without a server connection - -🤖 **Built-in MCP server** with 51-57 read-only tools for AI analysis — ask Claude Code or Cursor "what are the top wait types on my server?" and get answers from your actual monitoring data - -🧰 **Community tools installed automatically** — sp_WhoIsActive, sp_BlitzLock, sp_HealthParser, sp_HumanEventsBlockViewer - -🔒 **Your data never leaves** — no telemetry, no cloud dependency, no phoning home. Credentials stored in Windows Credential Manager with OS-level encryption. - ---- - -## More Screenshots - -### Lite Edition — Query Performance -![Lite Edition — Query Performance](Screenshots/Lite%20Edition%20%E2%80%94%20Query%20Performance.jpg) - -### Graphical Plan Viewer -![Graphical plan viewer with missing index suggestions and operator analysis](Screenshots/New%20Query%20Plan%20Viewer.jpg) - -### Alert Notifications -![Alert notification](Screenshots/Screenshot%20alert%20notification%20or%20email.jpg) - -### MCP Server — AI-Powered Analysis -![MCP server analysis](Screenshots/Screenshot%20MCP%20server%20analysis.jpg) - ---- - -## Quick Start — Lite Edition - -1. Download and extract **[PerformanceMonitorLite](https://github.com/erikdarlingdata/PerformanceMonitor/releases/latest)** (requires [.NET 8 Desktop Runtime](https://dotnet.microsoft.com/en-us/download/dotnet/8.0)) -2. Run `PerformanceMonitorLite.exe` -3. Click **+ Add Server**, enter connection details, test, save -4. 
Double-click the server in the sidebar to connect - -Data starts flowing within 1–5 minutes. That's it. No installation on your server, no Agent jobs, no sysadmin required. - -**Upgrading?** Click **Import Data** in the sidebar and point it at your old Lite folder — all historical data (DuckDB + Parquet archives) is imported into the new install. - -**Always On AG?** Enable **ReadOnlyIntent** in the connection settings to route Lite's monitoring queries to a readable secondary, keeping the primary clear. - -### Lite Collectors - -23 collectors run on independent, configurable schedules: - -| Collector | Default | Source | -|---|---|---| -| query_snapshots | 1 min | `sys.dm_exec_requests` + `sys.dm_exec_sessions` | -| blocked_process_report | 1 min | XE ring buffer session | -| waiting_tasks | 1 min | `sys.dm_os_waiting_tasks` | -| wait_stats | 1 min | `sys.dm_os_wait_stats` (deltas) | -| query_stats | 1 min | `sys.dm_exec_query_stats` (deltas) | -| procedure_stats | 1 min | `sys.dm_exec_procedure_stats` (deltas) | -| cpu_utilization | 1 min | `sys.dm_os_ring_buffers` scheduler monitor | -| file_io_stats | 1 min | `sys.dm_io_virtual_file_stats` (deltas) | -| memory_stats | 1 min | `sys.dm_os_sys_memory` + memory counters | -| memory_grant_stats | 1 min | `sys.dm_exec_query_memory_grants` | -| tempdb_stats | 1 min | `sys.dm_db_file_space_usage` | -| perfmon_stats | 1 min | `sys.dm_os_performance_counters` (deltas) | -| deadlocks | 1 min | `system_health` Extended Events session | -| session_stats | 1 min | `sys.dm_exec_sessions` active session tracking | -| memory_clerks | 5 min | `sys.dm_os_memory_clerks` | -| query_store | 5 min | Query Store DMVs (per database) | -| running_jobs | 5 min | `msdb` job history with duration vs avg/p95 | -| database_size_stats | 15 min | `sys.master_files` + `FILEPROPERTY` + `dm_os_volume_stats` | -| server_properties | 15 min | `SERVERPROPERTY()` hardware and licensing metadata | -| server_config | On connect | `sys.configurations` | 
-| database_config | On connect | `sys.databases` | -| database_scoped_config | On connect | Database-scoped configurations | -| trace_flags | On connect | `DBCC TRACESTATUS` | - -### Lite Data Storage - -- **Hot data** in DuckDB 1.5.0 — non-blocking checkpoints, free block reuse, stable file size without periodic resets -- **Archive** to Parquet with ZSTD compression (~10x reduction) — automatic monthly compaction keeps file count low (~75 files vs thousands) -- **Retention**: 3-month calendar-month rolling window -- Typical size: ~50–200 MB per server per week - -### Lite Configuration - -All configuration lives in the `config/` folder: - -| File | Purpose | -|---|---| -| `servers.json` | Server connections (passwords in Windows Credential Manager). Optional **Utility Database** per server for community procs installed outside master. | -| `settings.json` | Retention, MCP server, startup behavior, alert thresholds, SMTP configuration | -| `collection_schedule.json` | Per-collector enable/disable and frequency | -| `ignored_wait_types.json` | 144 benign wait types excluded by default | - ---- - -## Quick Start — Full Edition - -### Install - -Windows Authentication: - -``` -PerformanceMonitorInstaller.exe YourServerName -``` - -SQL Authentication: - -``` -PerformanceMonitorInstaller.exe YourServerName sa YourPassword -``` - -Entra ID (MFA) Authentication: - -``` -PerformanceMonitorInstaller.exe YourServerName --entra user@domain.com -``` - -Clean reinstall (drops existing database and all collected data): - -``` -PerformanceMonitorInstaller.exe YourServerName --reinstall -PerformanceMonitorInstaller.exe YourServerName sa YourPassword --reinstall -``` - -Uninstall (removes database, Agent jobs, and XE sessions): - -``` -PerformanceMonitorInstaller.exe YourServerName --uninstall -PerformanceMonitorInstaller.exe YourServerName sa YourPassword --uninstall -``` - -The installer automatically tests the connection, checks the SQL Server version (2016+ required), executes 
SQL scripts, downloads community dependencies, creates SQL Agent jobs, and runs initial data collection. A GUI installer (`InstallerGui/`) is also available with the same functionality. - -### CLI Installer Options - -| Option | Description | -|---|---| -| `SERVER` | SQL Server instance name (positional, required) | -| `USERNAME PASSWORD` | SQL Authentication credentials (positional, optional) | -| `--entra EMAIL` | Microsoft Entra ID interactive authentication (MFA) | -| `--reinstall` | Drop existing database and perform clean install | -| `--uninstall` | Remove database, Agent jobs, and XE sessions | -| `--reset-schedule` | Reset collection schedule to recommended defaults | -| `--preserve-jobs` | Keep existing SQL Agent job schedules during upgrade | -| `--encrypt=optional\|mandatory\|strict` | Connection encryption level (default: mandatory) | -| `--trust-cert` | Trust server certificate without validation (default: require valid cert) | -| `--help` | Show usage information and exit | - -**Environment variable:** Set `PM_SQL_PASSWORD` to avoid passing the password on the command line. - -### Exit Codes - -| Code | Meaning | -|---|---| -| `0` | Success | -| `1` | Invalid arguments | -| `2` | Connection failed | -| `3` | Critical file failed (scripts 01–03) | -| `4` | Partial installation (non-critical failures) | -| `5` | Version check failed (SQL Server 2014 or earlier) | -| `6` | SQL files not found | -| `7` | Uninstall failed | -| `8` | Upgrade script failed | - -### Post-Installation - -1. Ensure SQL Server Agent is running — the collection job executes every minute -2. Verify installation: - -```sql -SELECT * FROM PerformanceMonitor.config.current_version; - -SELECT TOP (20) * -FROM PerformanceMonitor.config.collection_log -ORDER BY collection_time DESC; -``` - -3. Launch the Dashboard (`Dashboard/` folder — build with `dotnet build` or use the release package). 
The Dashboard is a separate WPF application that runs on your workstation and connects to any SQL Server where the PerformanceMonitor database is installed. Add your server, enter credentials, and data appears immediately. - -### What Gets Installed - -- **PerformanceMonitor database** with collection tables and reporting views -- **32 collector stored procedures** for gathering metrics (including SQL Agent job monitoring) -- **Configurable collection** — query text and execution plan capture can be disabled per-collector via `config.collection_schedule` (`collect_query`, `collect_plan` columns) for sensitive or high-volume environments -- **Delta framework** for calculating per-second rates from cumulative DMVs -- **Community dependencies:** sp_WhoIsActive, sp_HealthParser, sp_HumanEventsBlockViewer, sp_BlitzLock -- **SQL Agent jobs:** Collection (every 1 minute), Data Retention (daily at 2:00 AM), and Hung Job Monitor (collection job watchdog, every 5 minutes) -- **Version tracking** in `config.installation_history` - -### Data Retention - -Default: 30 days (configurable per collector via the `retention_days` column in `config.collection_schedule`). - -Storage estimates: 5–10 GB per week, 20–40 GB per month. 
- -### Managed Platform Support - -The Full Edition supports Azure SQL Managed Instance and AWS RDS for SQL Server with some limitations: - -| Feature | On-Premises | Azure SQL MI | AWS RDS | -|---|---|---|---| -| All core collectors | Yes | Yes | Yes | -| Default trace collectors | Yes | Disabled automatically | Yes | -| System health XE (file target) | Yes | Disabled automatically | Yes | -| SQL Trace collectors | Yes | Disabled automatically | Yes | -| SQL Agent jobs | Yes | Yes | Yes | -| Running jobs collector | Yes | Yes | Disabled automatically | -| Blocked process threshold | Auto-configured | Auto-configured | Configure via RDS parameter group | -| sp_configure | Yes | Yes | Not available | - -**Azure SQL MI:** The installer automatically detects Engine Edition 8 and disables 4 collectors that require file system access or SQL Trace (default_trace, trace_management, trace_analysis, system_health). All other collectors work normally. - -**AWS RDS:** The installer automatically detects the `rdsadmin` database and disables the `running_jobs_collector` (requires `msdb.dbo.syssessions` which is restricted on RDS). It also gracefully handles restricted `sp_configure` and limited `msdb` permissions. SQL Agent jobs are created and owned by the installing login. The RDS master user is automatically enrolled in `SQLAgentUserRole`; for other logins, add them to `SQLAgentUserRole` in msdb before running the installer. - -### AWS RDS Parameter Group Configuration - -`sp_configure` is not available on AWS RDS for SQL Server. Features that depend on server-level configuration must be set through **AWS RDS Parameter Groups** instead. - -**Blocked process threshold** — Required for blocked process report collection. Without this, blocked process reports will not fire on RDS. - -1. Open the [AWS RDS Console](https://console.aws.amazon.com/rds/) and navigate to **Parameter groups** -2. 
Create a new parameter group (or modify the one attached to your instance): - - Family: `sqlserver-ee-16.0` (or your edition/version) - - Type: DB Parameter Group -3. Search for `blocked process threshold (s)` and set it to `5` (seconds) -4. Apply the parameter group to your RDS instance (may require a reboot if the parameter is static) -5. Verify it took effect: - - ```sql - SELECT - c.name, - c.value_in_use - FROM sys.configurations AS c - WHERE c.name = N'blocked process threshold (s)'; - ``` - -**Deadlocks** — No parameter group configuration is required. The SQL Server deadlock monitor runs automatically on all platforms, and the `xml_deadlock_report` Extended Event fires without any threshold setting. - -**Azure SQL Database** — The blocked process threshold is fixed at 20 seconds and cannot be changed. The `blocked_process_report` event fires automatically when blocking exceeds this duration. - ---- - -## Edition Comparison - -| Capability | Full | Lite | -|---|---|---| -| Target server installation | Required | None | -| SQL Server Agent | Required | Not needed | -| Azure SQL Managed Instance | Supported | Supported | -| AWS RDS for SQL Server | Supported | Supported | -| Azure SQL Database | Not supported | Supported | -| Multi-server from one seat | Per-server install | Built-in | -| Collectors | 32 | 23 | -| Agent job monitoring | Duration vs historical avg/p95 | Duration vs historical avg/p95 | -| Data storage | SQL Server (on target) | DuckDB + Parquet (local) | -| Execution plans | Collected and stored (can be disabled per-collector) | Download on demand | -| Graphical plan viewer | Built-in with 30-rule PlanAnalyzer | Built-in with 30-rule PlanAnalyzer | -| Standalone plan viewer | Open/paste/drag `.sqlplan` files | Open/paste/drag `.sqlplan` files | -| Community tools (sp_WhoIsActive, sp_BlitzLock) | Installed automatically | Not needed | -| Alerts (tray + email) | Blocking, deadlocks, CPU | Blocking, deadlocks, CPU | -| Dashboard | Separate app | 
Built-in | -| Themes | Dark and light | Dark and light | -| Portability | Server-bound | Single executable | -| MCP server (LLM integration) | Built into Dashboard (57 tools) | Built-in (51 tools) | - ---- - -## Dashboard Tabs - -### Full Edition Dashboard - -| Tab | Contents | -|---|---| -| **Overview** | Resource overview, daily summary, critical issues, server config changes, database config changes, trace flag changes, collection health | -| **Performance** | Performance trends, expensive queries, active queries, query stats, procedure stats, Query Store, Query Store regressions, query trace patterns | -| **Resource Metrics** | Server trends, wait stats, TempDB, file I/O latency, perfmon counters, default trace events, trace analysis, session stats, latch stats, spinlock stats | -| **Memory** | Memory overview, grants, clerks, plan cache, memory pressure events | -| **Locking** | Blocking chains, deadlocks, blocking/deadlock trends | -| **System Events** | Corruption events, contention, errors, I/O issues, scheduler issues, memory conditions | - -Plus a NOC-style landing page with server health cards (green/yellow/red severity indicators). 
- -### Lite Edition Dashboard - -| Tab | Contents | -|---|---| -| **Active Queries** | Running queries with session details, wait types, blocking, DOP, memory grants | -| **Wait Stats** | Filterable wait statistics chart with delta calculations | -| **CPU** | SQL Server CPU vs Other Processes over time | -| **Memory** | Physical memory overview, SQL Server memory trend, memory clerk breakdown | -| **Queries** | Performance trends, top queries and procedures by duration, Query Store integration | -| **File I/O** | Read/write I/O trends per database file | -| **TempDB** | Space usage breakdown and TempDB file I/O | -| **Blocking** | Blocking/deadlock trends, blocked process reports, deadlock history | -| **Perfmon** | Selectable SQL Server performance counters over time | -| **Configuration** | Server configuration, database configuration, scoped configuration, trace flags | -| **FinOps** | Utilization & provisioning analysis, database resource breakdown, storage growth (7d/30d), idle database detection, index analysis via sp_IndexCleanup, application connections, server inventory, cost optimization recommendations (enterprise feature audit, CPU/memory right-sizing, compression savings, dormant databases, dev/test detection), column-level filtering on all grids | - -Both editions feature auto-refresh, configurable time ranges, right-click CSV export, system tray integration, dark and light themes, and timezone display options (server time, local time, or UTC). - ---- - -## Alerts & Notifications - -Both editions include a real-time alert engine that monitors for performance issues and sends notifications via system tray balloons and email. 
- -### Alert Types - -| Metric | Default Threshold | Description | -|---|---|---| -| **Blocking** | 30 seconds (Full), 5 seconds (Lite) | Fires when the longest blocked session exceeds the threshold | -| **Deadlocks** | 1 | Fires when new deadlocks are detected since the last check | -| **Poison waits** | 100 ms avg | Fires when any poison wait type exceeds the average-ms-per-wait threshold | -| **Long-running queries** | 5 minutes | Fires when any query exceeds the elapsed-time threshold | -| **TempDB space** | 80% | Fires when TempDB usage exceeds the percentage threshold | -| **Long-running agent jobs** | 3× average | Fires when a job's current duration exceeds a multiple of its historical average | -| **High CPU** | 90% (Full), 80% (Lite) | Fires when total CPU (SQL + other) exceeds the threshold | -| **Server unreachable** | N/A | Fires when a monitored server goes offline or comes back online (tray + email) | - -All thresholds are configurable in Settings. - -**Poison wait types** monitored: [`THREADPOOL`](https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-os-wait-stats-transact-sql#threadpool) (worker thread exhaustion), [`RESOURCE_SEMAPHORE`](https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-os-wait-stats-transact-sql#resource_semaphore) (memory grant pressure), and [`RESOURCE_SEMAPHORE_QUERY_COMPILE`](https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-os-wait-stats-transact-sql#resource_semaphore_query_compile) (compilation memory pressure). These waits indicate severe resource starvation and should never occur under normal operation. 
- -### Notification Channels - -- **System tray** — balloon notifications with a configurable per-metric cooldown (default: 5 minutes) -- **Email (SMTP)** — styled HTML emails with a configurable per-metric cooldown (default: 15 minutes), plus configurable SMTP settings (server, port, SSL, authentication, recipients) - -Both cooldown periods are independently configurable in Settings under the Performance Alerts section. - -### Email Alerts - -Alert emails include: - -- **Metric summary** — what triggered the alert, current value vs threshold -- **Detail section** — recent blocking chains or deadlock participants with query text, wait times, lock modes, database names, and client application -- **XML attachment** — full `blocked_process_report.xml` or `deadlock_graph.xml` for offline analysis - -### Alert Behavior - -- **Resolved notifications** — when a condition clears (e.g., blocking ends), a "Cleared" notification fires -- **Server silencing** — right-click a server tab to acknowledge alerts, silence all alerts, or unsilence -- **Always-on** — the Dashboard alert engine runs independently of which tab is active, including when minimized to the system tray. The Lite edition's alert engine also runs regardless of tab visibility. -- **Alert history** — Dashboard keeps an in-memory alert log (accessible via MCP). Lite logs alerts to DuckDB (`config_alert_log`). -- **Alert muting** — create rules to suppress specific recurring alerts while still logging them. Rules match on server name, metric type, database, query text, wait type, or job name (AND logic across fields). Access via Settings → Manage Mute Rules, or right-click an alert in the Alert History tab. The context menu offers two muting options: **Mute This Alert** (pre-fills server + metric for a targeted rule) and **Mute Similar Alerts** (pre-fills metric only, matching across all servers). Muted alerts appear grayed out in alert history and are still recorded for auditability. 
Rules support optional expiration (1h, 24h, 7 days, or permanent). -- **Alert details** — right-click any alert in the Alert History tab and choose **View Details** to open a detail window. The window shows core alert fields (time, server, metric, value, threshold, notification type, status) plus context-sensitive details that vary by metric: query text and session info for long-running queries, job name and duration stats for anomalous agent jobs, per-wait-type breakdowns for poison waits, space usage by category for TempDB, and blocking/deadlock session counts. - ---- - -## Agent Job Monitoring - -Both editions monitor currently running SQL Agent jobs and flag jobs that are running longer than expected. - -| Metric | How It Works | -|---|---| -| **Current duration** | Elapsed time since the job started | -| **Average duration** | Historical mean from successful completions in `msdb.dbo.sysjobhistory` | -| **p95 duration** | 95th percentile from historical completions | -| **Running long flag** | Set when current duration exceeds the p95 threshold | - -The Full Edition collects this data via the `collect.running_jobs_collector` stored procedure (every 5 minutes). The Lite Edition queries `msdb` directly on each collection cycle. Both editions expose this data through the MCP `get_running_jobs` tool. - -Gracefully skipped on Azure SQL Database, AWS RDS for SQL Server, and environments without SQL Server Agent. - ---- - -## MCP Server (LLM Integration) - -Both editions include an embedded [Model Context Protocol](https://modelcontextprotocol.io) server that exposes monitoring data to LLM clients like Claude Code and Cursor. - -### Setup - -1. Enable the MCP server in Settings (checkbox + port, default `5151`) - - The port must be between **1024** and **65535**. Ports 0–1023 are well-known privileged ports reserved by the operating system. - - On save, the app checks whether the chosen port is already in use and warns you if there is a conflict. 
- - On startup, the app verifies the port is available before starting the MCP server. -2. Register with Claude Code: - -``` -claude mcp add --transport http --scope user sql-monitor http://localhost:5151/ -``` - -3. Open a new Claude Code session and ask questions like: - - "What servers are being monitored?" - - "What are the top wait types on my server?" - - "Are there any blocking or deadlock issues?" - - "Show me CPU utilization for the last 4 hours" - - "What are the most expensive queries by CPU?" - -### Available Tools - -Full Edition exposes 57 tools, Lite Edition exposes 51. Core tools are shared across both editions. - -| Category | Tools | -|---|---| -| Discovery | `list_servers` | -| Health | `get_server_summary`\*, `get_daily_summary`\*\*, `get_collection_health` | -| Alerts | `get_alert_history`, `get_alert_settings`, `get_mute_rules` | -| Waits | `get_wait_stats`, `get_wait_types`\*, `get_wait_trend`, `get_waiting_tasks`\* | -| Queries | `get_top_queries_by_cpu`, `get_top_procedures_by_cpu`, `get_query_store_top`, `get_expensive_queries`\*\*, `get_query_duration_trend`\*, `get_query_trend` | -| Active Queries | `get_active_queries` | -| CPU | `get_cpu_utilization` | -| Memory | `get_memory_stats`, `get_memory_trend`, `get_memory_clerks`, `get_memory_grants` | -| Blocking | `get_blocking`\*\*, `get_deadlocks`, `get_deadlock_detail`, `get_blocked_process_reports`\*, `get_blocked_process_xml`, `get_blocking_deadlock_stats`\*\*, `get_blocking_trend`\*, `get_deadlock_trend`\* | -| I/O | `get_file_io_stats`, `get_file_io_trend` | -| TempDB | `get_tempdb_trend` | -| Perfmon | `get_perfmon_stats`, `get_perfmon_trend` | -| Jobs | `get_running_jobs` | -| Configuration | `get_server_config`\*, `get_database_config`\*, `get_database_scoped_config`\*, `get_trace_flags`\* | -| Config History | `get_server_config_changes`\*\*, `get_database_config_changes`\*\*, `get_trace_flag_changes`\*\* | -| Server Info | `get_server_properties`, `get_database_sizes` | -| 
Sessions | `get_session_stats` | -| Scheduler | `get_cpu_scheduler_pressure`\*\* | -| Latch/Spinlock | `get_latch_stats`\*\*, `get_spinlock_stats`\*\* | -| Diagnostics | `get_plan_cache_bloat`\*\*, `get_critical_issues`\*\* | -| System Events | `get_default_trace_events`\*\*, `get_trace_analysis`\*\*, `get_memory_pressure_events`\*\* | -| Health Parser | `get_health_parser_system_health`\*\*, `get_health_parser_severe_errors`\*\*, `get_health_parser_io_issues`\*\*, `get_health_parser_scheduler_issues`\*\*, `get_health_parser_memory_conditions`\*\*, `get_health_parser_cpu_tasks`\*\*, `get_health_parser_memory_broker`\*\*, `get_health_parser_memory_node_oom`\*\* | -| Plan Analysis | `analyze_query_plan`, `analyze_procedure_plan`, `analyze_query_store_plan`, `analyze_plan_xml`, `get_plan_xml` | -| Diagnostic Analysis | `analyze_server`\*, `get_analysis_facts`\*, `compare_analysis`\*, `audit_config`\*, `get_analysis_findings`\*, `mute_analysis_finding`\* | - -\* Lite only | \*\* Full only - -Most tools accept optional `server_name` and `hours_back` parameters. If only one server is configured, `server_name` is auto-resolved. - -The MCP server binds to `localhost` only and does not accept remote connections. 
- ---- - -## Performance Impact - -### On Monitored Servers - -- All queries use `READ UNCOMMITTED` isolation -- Configurable collection intervals -- Full Edition: typical overhead <1% CPU, <100 MB memory -- Lite Edition: max 7 concurrent SQL connections, 30-second command timeout - -### Local Resources (Lite) - -- DuckDB: ~50–200 MB per server per week -- Parquet archives: ~10x compression with ZSTD -- ScottPlot charts use hardware-accelerated rendering - ---- - -## Troubleshooting - -### Full Edition - -Two diagnostic scripts in the `install/` folder: - -| Script | Purpose | -|---|---| -| `99_installer_troubleshooting.sql` | Quick health checks: collection log errors, schedule status, Agent job status, table row counts | -| `99_user_troubleshooting.sql` | Comprehensive diagnostics: runs collectors with `@debug = 1`, detailed timing and row counts | - -```sql -SELECT - collection_time, - collector_name, - error_message -FROM PerformanceMonitor.config.collection_log -WHERE collection_status = 'ERROR' -ORDER BY collection_time DESC; -``` - -### Lite Edition - -Application logs are written to the `logs/` folder. Collection success/failure is also logged to the `collection_log` table in DuckDB. - -Common issues: - -1. **No data after connecting** — Wait for the first collection cycle (1–5 minutes). Check logs for connection errors. -2. **Query Store tab empty** — Query Store must be enabled on the target database (`ALTER DATABASE [YourDB] SET QUERY_STORE = ON`). -3. **Blocked process reports empty** — Both editions attempt to auto-configure the blocked process threshold to 5 seconds via `sp_configure`. On **AWS RDS**, `sp_configure` is not available — you must set `blocked process threshold (s)` through an RDS Parameter Group (see "AWS RDS Parameter Group Configuration" above). On **Azure SQL Database**, the threshold is fixed at 20 seconds and cannot be changed. If you still see no data on other platforms, verify the login has `ALTER SETTINGS` permission. -4. 
**Connection failures** — Verify network connectivity, firewall rules, and that the login has the required [permissions](#permissions). For Azure SQL Database, use a contained database user with `VIEW DATABASE STATE`. - ---- - -## Permissions - -### Full Edition (On-Premises) - -The installer needs `sysadmin` to create the database, Agent jobs, and configure `sp_configure` settings. After installation, the collection jobs can run under a **least-privilege login** with these grants: - -```sql -USE [master]; -CREATE LOGIN [SQLServerPerfMon] WITH PASSWORD = N'YourStrongPassword'; -GRANT VIEW SERVER STATE TO [SQLServerPerfMon]; - -USE [PerformanceMonitor]; -CREATE USER [SQLServerPerfMon] FOR LOGIN [SQLServerPerfMon]; -ALTER ROLE [db_owner] ADD MEMBER [SQLServerPerfMon]; - -USE [msdb]; -CREATE USER [SQLServerPerfMon] FOR LOGIN [SQLServerPerfMon]; -ALTER ROLE [SQLAgentReaderRole] ADD MEMBER [SQLServerPerfMon]; -``` - -| Grant | Why | -|---|---| -| `VIEW SERVER STATE` | All DMV access (wait stats, query stats, memory, CPU, file I/O, etc.) | -| `db_owner` on PerformanceMonitor | Collectors insert data, create/alter tables, execute procedures. Scoped to just this database — not sysadmin. | -| `SQLAgentReaderRole` on msdb | Read `sysjobs`, `sysjobactivity`, `sysjobhistory` for the running jobs collector | - -**Optional** (gracefully skipped if missing): -- `ALTER SETTINGS` — installer sets `blocked process threshold` via `sp_configure`. Skipped with a warning if unavailable. -- `ALTER TRACE` — default trace collector. Skipped if denied. -- `DBCC TRACESTATUS` — server config collector skips trace flag detection if denied. - -Change the SQL Agent job owner to the new login after installation if you want to run under least privilege end-to-end. - -### Lite Edition (On-Premises) - -Nothing is installed on the target server. 
The login only needs: - -```sql -USE [master]; -GRANT VIEW SERVER STATE TO [YourLogin]; - --- Optional: for SQL Agent job monitoring -USE [msdb]; -CREATE USER [YourLogin] FOR LOGIN [YourLogin]; -ALTER ROLE [SQLAgentReaderRole] ADD MEMBER [YourLogin]; -``` - -### Azure SQL Database (Lite Only) - -Azure SQL Database doesn't support server-level logins. Create a **contained database user** directly on the target database: - -```sql --- Connect to your target database (not master) -CREATE USER [SQLServerPerfMon] WITH PASSWORD = 'YourStrongPassword'; -GRANT VIEW DATABASE STATE TO [SQLServerPerfMon]; -``` - -When connecting in Lite, specify the database name in the connection. SQL Agent and msdb are not available on Azure SQL Database — those collectors are skipped automatically. - -### Azure SQL Managed Instance - -Works like on-premises. Use server-level logins with `VIEW SERVER STATE`. SQL Agent is available. - -### AWS RDS for SQL Server - -Use the RDS master user for installation. The master user has the necessary permissions. For ongoing collection, `VIEW SERVER STATE` and msdb access work the same as on-premises, but `sp_configure` is not available (use RDS Parameter Groups instead — see above). - ---- - -## Folder Structure - -``` -Monitor/ -│ -│ Full Edition (server-installed collectors + separate dashboard) -├── install/ # 58 SQL installation scripts -├── upgrades/ # Version-specific upgrade scripts -├── Installer/ # CLI installer for Full Edition database (C#) -├── InstallerGui/ # GUI installer for Full Edition database (WPF) -├── Dashboard/ # Full Edition dashboard application (WPF) -│ -│ Lite Edition (standalone desktop app, nothing installed on server) -├── Lite/ # Lite Edition desktop application (WPF) -│ -│ Shared -└── README.md # This file -``` - ---- - -## Building from Source - -All projects target .NET 8.0. 
- -``` -# Full Edition Dashboard -dotnet build Dashboard/Dashboard.csproj - -# Lite Edition -dotnet build Lite/PerformanceMonitorLite.csproj - -# CLI Installer (self-contained) -dotnet publish Installer/PerformanceMonitorInstaller.csproj -c Release - -# GUI Installer -dotnet publish InstallerGui/InstallerGui.csproj -c Release -r win-x64 --self-contained -``` - ---- - -## Support & Sponsorship - -**This project is free and open source under the MIT License.** The software is fully functional with no features withheld — every user gets the same tool, same collectors, same MCP integration. - -However, some organizations have procurement or compliance policies that require a formal vendor relationship, a support agreement, or an invoice on file before software can be deployed to production. If that sounds familiar, two commercial support tiers are available: - -| Tier | Annual Cost | What You Get | -|------|-------------|--------------| -| **Supported** | $500/year | Email support (2-business-day response), compatibility guarantees for new SQL Server versions, vendor agreement and invoices for compliance, unlimited instances | -| **Priority** | $2,500/year | Next-business-day email response, quarterly live Q&A sessions, early access to new features, roadmap input, unlimited instances | - -Both tiers cover unlimited SQL Server instances. The software itself is identical — commercial support is about the relationship, not a feature gate. - -**[Read more about the free tool and commercial options](https://erikdarling.com/free-sql-server-performance-monitoring/)** | **[Purchase a support subscription](https://training.erikdarling.com/sql-monitoring)** - -If you find the project valuable, you can also support continued development: - -| | | -|---|---| -| **Sponsor on GitHub** | [Become a sponsor](https://github.com/sponsors/erikdarlingdata) to fund new features, ongoing maintenance, and SQL Server version support. 
| -| **Consulting Services** | [Hire me](https://training.erikdarling.com/sqlconsulting) for hands-on consulting if you need help analyzing the data this tool collects? Want expert assistance fixing the issues it uncovers? | - -Neither sponsorship nor consulting is required — use the tool freely. - ---- - -## Third-Party Components - -### sp_WhoIsActive - -- **Author:** Adam Machanic | **License:** GPLv3 -- **Repository:** https://github.com/amachanic/sp_whoisactive - -### DarlingData - -- **Author:** Erik Darling (Darling Data, LLC) | **License:** MIT -- **Repository:** https://github.com/erikdarlingdata/DarlingData - -### SQL Server First Responder Kit - -- **Author:** Brent Ozar Unlimited | **License:** MIT -- **Repository:** https://github.com/BrentOzarULTD/SQL-Server-First-Responder-Kit - -See [THIRD_PARTY_NOTICES.md](THIRD_PARTY_NOTICES.md) for complete license texts. - ---- - -## License - -Copyright (c) 2026 Darling Data, LLC. Licensed under the MIT License. See [LICENSE](LICENSE) for details. - -## Author - -Erik Darling — [erikdarling.com](https://erikdarling.com) — [Darling Data, LLC](https://darlingdata.com) +# SQL Server Performance Monitor + +**Free, open-source monitoring that replaces the tools charging you thousands per server per year.** 30+ collectors, real-time alerts, built-in MCP server for AI analysis. Nothing phones home. Your data stays on your server and your machine. + +**Supported:** SQL Server 2016–2025 | Azure SQL Managed Instance | AWS RDS for SQL Server | Azure SQL Database (Lite only) + +![Dashboard landing page with server health cards](Screenshots/Screenshot%20Dashboard%20landing%20page%20with%20server%20health%20cards.jpg) + + +![Full Dashboard — Resource Overview](Screenshots/Full%20Dashboard%20%E2%80%94%20Resource%20Overview.jpg) + +--- + +## Download + +**👉 Not sure which edition to pick? 
[Start with Lite.](https://github.com/erikdarlingdata/PerformanceMonitor/releases/latest)** One download, nothing installed on your server, data flowing in under 5 minutes. + +| | **[Full Edition](https://github.com/erikdarlingdata/PerformanceMonitor/releases/latest)** | **[Lite Edition](https://github.com/erikdarlingdata/PerformanceMonitor/releases/latest)** | +|---|---|---| +| **What it does** | Installs a `PerformanceMonitor` database with 32 T-SQL collectors running via SQL Agent. Separate dashboard app connects to view everything. | Single desktop app that monitors remotely. Stores data locally in DuckDB + Parquet. Nothing touches your server. | +| **Best for** | Production 24/7 monitoring, long-term baselining | Quick triage, Azure SQL DB, locked-down servers, consultants, firefighting | +| **Requires** | SQL Agent running ([see permissions](#permissions)) | `VIEW SERVER STATE` ([see permissions](#permissions)) | +| **Get started** | Run the installer, open the dashboard | Download, run, add a server, done | + +Both editions include real-time alerts (system tray + email), charts and graphs, dark and light themes, CSV export, and a built-in MCP server for AI-powered analysis with tools like Claude. + +All release binaries are digitally signed via [SignPath](https://signpath.io) — no more Windows SmartScreen warnings. + +--- + +## What People Are Saying + +> *"You guys make us DBAs look like absolute rockstars. I'm over here getting showered with praise, and all I do is use your scripts and follow your advice."* + +> *"replaced SentryOne and had it running in 10 minutes"* + +> *"I've had enough time to gather data and converse with Claude on this. It helped a lot to zone in on CPU starvation from the hypervisor on which the VM runs. 
IT team currently investigating the host configuration."* + +--- + +## What You Get + +🔍 **32 specialized T-SQL collectors** running on configurable schedules with named presets (Aggressive, Balanced, Low-Impact) — wait stats, query performance, blocking chains, deadlock graphs, memory grants, file I/O, tempdb, perfmon counters, FinOps/capacity, and more. Query text and execution plan collection can be disabled per-collector for sensitive environments. + +🚨 **Real-time alerts** for blocking, deadlocks, and high CPU — system tray notifications plus styled HTML emails with full XML attachments for offline analysis + +📊 **NOC-style dashboard** with green/yellow/red health cards, auto-refresh, configurable time ranges, and dark/light themes + +📋 **Graphical plan viewer** with native ShowPlan rendering, 30-rule PlanAnalyzer, operator-level cost breakdown, and a standalone mode for opening `.sqlplan` files without a server connection + +🤖 **Built-in MCP server** with 51-63 read-only tools for AI analysis — ask Claude Code or Cursor "what are the top wait types on my server?" and get answers from your actual monitoring data + +🧰 **Community tools installed automatically** — sp_WhoIsActive, sp_BlitzLock, sp_HealthParser, sp_HumanEventsBlockViewer + +🔒 **Your data never leaves** — no telemetry, no cloud dependency, no phoning home. Credentials stored in Windows Credential Manager with OS-level encryption. 
+ +--- + +## More Screenshots + +### Lite Edition — Query Performance +![Lite Edition — Query Performance](Screenshots/Lite%20Edition%20%E2%80%94%20Query%20Performance.jpg) + +### Graphical Plan Viewer +![Graphical plan viewer with missing index suggestions and operator analysis](Screenshots/New%20Query%20Plan%20Viewer.jpg) + +### Alert Notifications +![Alert notification](Screenshots/Screenshot%20alert%20notification%20or%20email.jpg) + +### MCP Server — AI-Powered Analysis +![MCP server analysis](Screenshots/Screenshot%20MCP%20server%20analysis.jpg) + +--- + +## Quick Start — Lite Edition + +1. Download and extract **[PerformanceMonitorLite](https://github.com/erikdarlingdata/PerformanceMonitor/releases/latest)** (requires [.NET 8 Desktop Runtime](https://dotnet.microsoft.com/en-us/download/dotnet/8.0)) +2. Run `PerformanceMonitorLite.exe` +3. Click **+ Add Server**, enter connection details, test, save +4. Double-click the server in the sidebar to connect + +Data starts flowing within 1–5 minutes. That's it. No installation on your server, no Agent jobs, no sysadmin required. + +**Upgrading?** Click **Import Data** in the sidebar and point it at your old Lite folder — all historical data (DuckDB + Parquet archives) is imported into the new install. + +**Always On AG?** Enable **ReadOnlyIntent** in the connection settings to route Lite's monitoring queries to a readable secondary, keeping the primary clear. 
+ +### Lite Collectors + +23 collectors run on independent, configurable schedules: + +| Collector | Default | Source | +|---|---|---| +| query_snapshots | 1 min | `sys.dm_exec_requests` + `sys.dm_exec_sessions` | +| blocked_process_report | 1 min | XE ring buffer session | +| waiting_tasks | 1 min | `sys.dm_os_waiting_tasks` | +| wait_stats | 1 min | `sys.dm_os_wait_stats` (deltas) | +| query_stats | 1 min | `sys.dm_exec_query_stats` (deltas) | +| procedure_stats | 1 min | `sys.dm_exec_procedure_stats` (deltas) | +| cpu_utilization | 1 min | `sys.dm_os_ring_buffers` scheduler monitor | +| file_io_stats | 1 min | `sys.dm_io_virtual_file_stats` (deltas) | +| memory_stats | 1 min | `sys.dm_os_sys_memory` + memory counters | +| memory_grant_stats | 1 min | `sys.dm_exec_query_memory_grants` | +| tempdb_stats | 1 min | `sys.dm_db_file_space_usage` | +| perfmon_stats | 1 min | `sys.dm_os_performance_counters` (deltas) | +| deadlocks | 1 min | `system_health` Extended Events session | +| session_stats | 1 min | `sys.dm_exec_sessions` active session tracking | +| memory_clerks | 5 min | `sys.dm_os_memory_clerks` | +| query_store | 5 min | Query Store DMVs (per database) | +| running_jobs | 5 min | `msdb` job history with duration vs avg/p95 | +| database_size_stats | 15 min | `sys.master_files` + `FILEPROPERTY` + `dm_os_volume_stats` | +| server_properties | 15 min | `SERVERPROPERTY()` hardware and licensing metadata | +| server_config | On connect | `sys.configurations` | +| database_config | On connect | `sys.databases` | +| database_scoped_config | On connect | Database-scoped configurations | +| trace_flags | On connect | `DBCC TRACESTATUS` | + +### Lite Data Storage + +- **Hot data** in DuckDB 1.5.0 — non-blocking checkpoints, free block reuse, stable file size without periodic resets +- **Archive** to Parquet with ZSTD compression (~10x reduction) — automatic monthly compaction keeps file count low (~75 files vs thousands) +- **Retention**: 3-month calendar-month 
rolling window +- Typical size: ~50–200 MB per server per week + +### Lite Configuration + +All configuration lives in the `config/` folder: + +| File | Purpose | +|---|---| +| `servers.json` | Server connections (passwords in Windows Credential Manager). Optional **Utility Database** per server for community procs installed outside master. | +| `settings.json` | Retention, MCP server, startup behavior, alert thresholds, SMTP configuration | +| `collection_schedule.json` | Per-collector enable/disable and frequency | +| `ignored_wait_types.json` | 144 benign wait types excluded by default | + +--- + +## Quick Start — Full Edition + +### Install + +Windows Authentication: + +``` +PerformanceMonitorInstaller.exe YourServerName +``` + +SQL Authentication: + +``` +PerformanceMonitorInstaller.exe YourServerName sa YourPassword +``` + +Entra ID (MFA) Authentication: + +``` +PerformanceMonitorInstaller.exe YourServerName --entra user@domain.com +``` + +Clean reinstall (drops existing database and all collected data): + +``` +PerformanceMonitorInstaller.exe YourServerName --reinstall +PerformanceMonitorInstaller.exe YourServerName sa YourPassword --reinstall +``` + +Uninstall (removes database, Agent jobs, and XE sessions): + +``` +PerformanceMonitorInstaller.exe YourServerName --uninstall +PerformanceMonitorInstaller.exe YourServerName sa YourPassword --uninstall +``` + +The installer automatically tests the connection, checks the SQL Server version (2016+ required), executes SQL scripts, downloads community dependencies, creates SQL Agent jobs, and runs initial data collection. A GUI installer (`InstallerGui/`) is also available with the same functionality. 
+ +### CLI Installer Options + +| Option | Description | +|---|---| +| `SERVER` | SQL Server instance name (positional, required) | +| `USERNAME PASSWORD` | SQL Authentication credentials (positional, optional) | +| `--entra EMAIL` | Microsoft Entra ID interactive authentication (MFA) | +| `--reinstall` | Drop existing database and perform clean install | +| `--uninstall` | Remove database, Agent jobs, and XE sessions | +| `--reset-schedule` | Reset collection schedule to recommended defaults | +| `--preserve-jobs` | Keep existing SQL Agent job schedules during upgrade | +| `--encrypt=optional\|mandatory\|strict` | Connection encryption level (default: mandatory) | +| `--trust-cert` | Trust server certificate without validation (default: require valid cert) | +| `--help` | Show usage information and exit | + +**Environment variable:** Set `PM_SQL_PASSWORD` to avoid passing the password on the command line. + +### Exit Codes + +| Code | Meaning | +|---|---| +| `0` | Success | +| `1` | Invalid arguments | +| `2` | Connection failed | +| `3` | Critical file failed (scripts 01–03) | +| `4` | Partial installation (non-critical failures) | +| `5` | Version check failed (SQL Server 2014 or earlier) | +| `6` | SQL files not found | +| `7` | Uninstall failed | +| `8` | Upgrade script failed | + +### Post-Installation + +1. Ensure SQL Server Agent is running — the collection job executes every minute +2. Verify installation: + +```sql +SELECT * FROM PerformanceMonitor.config.current_version; + +SELECT TOP (20) * +FROM PerformanceMonitor.config.collection_log +ORDER BY collection_time DESC; +``` + +3. Launch the Dashboard (`Dashboard/` folder — build with `dotnet build` or use the release package). The Dashboard is a separate WPF application that runs on your workstation and connects to any SQL Server where the PerformanceMonitor database is installed. Add your server, enter credentials, and data appears immediately. 
+ +### What Gets Installed + +- **PerformanceMonitor database** with collection tables and reporting views +- **32 collector stored procedures** for gathering metrics (including SQL Agent job monitoring) +- **Configurable collection** — query text and execution plan capture can be disabled per-collector via `config.collection_schedule` (`collect_query`, `collect_plan` columns) for sensitive or high-volume environments +- **Delta framework** for calculating per-second rates from cumulative DMVs +- **Community dependencies:** sp_WhoIsActive, sp_HealthParser, sp_HumanEventsBlockViewer, sp_BlitzLock +- **SQL Agent jobs:** Collection (every 1 minute), Data Retention (daily at 2:00 AM), and Hung Job Monitor (collection job watchdog, every 5 minutes) +- **Version tracking** in `config.installation_history` + +### Data Retention + +Default: 30 days (configurable per collector via the `retention_days` column in `config.collection_schedule`). + +Storage estimates: 5–10 GB per week, 20–40 GB per month. + +### Managed Platform Support + +The Full Edition supports Azure SQL Managed Instance and AWS RDS for SQL Server with some limitations: + +| Feature | On-Premises | Azure SQL MI | AWS RDS | +|---|---|---|---| +| All core collectors | Yes | Yes | Yes | +| Default trace collectors | Yes | Disabled automatically | Yes | +| System health XE (file target) | Yes | Disabled automatically | Yes | +| SQL Trace collectors | Yes | Disabled automatically | Yes | +| SQL Agent jobs | Yes | Yes | Yes | +| Running jobs collector | Yes | Yes | Disabled automatically | +| Blocked process threshold | Auto-configured | Auto-configured | Configure via RDS parameter group | +| sp_configure | Yes | Yes | Not available | + +**Azure SQL MI:** The installer automatically detects Engine Edition 8 and disables 4 collectors that require file system access or SQL Trace (default_trace, trace_management, trace_analysis, system_health). All other collectors work normally. 
+ +**AWS RDS:** The installer automatically detects the `rdsadmin` database and disables the `running_jobs_collector` (requires `msdb.dbo.syssessions` which is restricted on RDS). It also gracefully handles restricted `sp_configure` and limited `msdb` permissions. SQL Agent jobs are created and owned by the installing login. The RDS master user is automatically enrolled in `SQLAgentUserRole`; for other logins, add them to `SQLAgentUserRole` in msdb before running the installer. + +### AWS RDS Parameter Group Configuration + +`sp_configure` is not available on AWS RDS for SQL Server. Features that depend on server-level configuration must be set through **AWS RDS Parameter Groups** instead. + +**Blocked process threshold** — Required for blocked process report collection. Without this, blocked process reports will not fire on RDS. + +1. Open the [AWS RDS Console](https://console.aws.amazon.com/rds/) and navigate to **Parameter groups** +2. Create a new parameter group (or modify the one attached to your instance): + - Family: `sqlserver-ee-16.0` (or your edition/version) + - Type: DB Parameter Group +3. Search for `blocked process threshold (s)` and set it to `5` (seconds) +4. Apply the parameter group to your RDS instance (may require a reboot if the parameter is static) +5. Verify it took effect: + + ```sql + SELECT + c.name, + c.value_in_use + FROM sys.configurations AS c + WHERE c.name = N'blocked process threshold (s)'; + ``` + +**Deadlocks** — No parameter group configuration is required. The SQL Server deadlock monitor runs automatically on all platforms, and the `xml_deadlock_report` Extended Event fires without any threshold setting. + +**Azure SQL Database** — The blocked process threshold is fixed at 20 seconds and cannot be changed. The `blocked_process_report` event fires automatically when blocking exceeds this duration. 
+ +--- + +## Edition Comparison + +| Capability | Full | Lite | +|---|---|---| +| Target server installation | Required | None | +| SQL Server Agent | Required | Not needed | +| Azure SQL Managed Instance | Supported | Supported | +| AWS RDS for SQL Server | Supported | Supported | +| Azure SQL Database | Not supported | Supported | +| Multi-server from one seat | Per-server install | Built-in | +| Collectors | 32 | 23 | +| Agent job monitoring | Duration vs historical avg/p95 | Duration vs historical avg/p95 | +| Data storage | SQL Server (on target) | DuckDB + Parquet (local) | +| Execution plans | Collected and stored (can be disabled per-collector) | Download on demand | +| Graphical plan viewer | Built-in with 30-rule PlanAnalyzer | Built-in with 30-rule PlanAnalyzer | +| Standalone plan viewer | Open/paste/drag `.sqlplan` files | Open/paste/drag `.sqlplan` files | +| Community tools (sp_WhoIsActive, sp_BlitzLock) | Installed automatically | Not needed | +| Alerts (tray + email) | Blocking, deadlocks, CPU | Blocking, deadlocks, CPU | +| Dashboard | Separate app | Built-in | +| Themes | Dark and light | Dark and light | +| Portability | Server-bound | Single executable | +| MCP server (LLM integration) | Built into Dashboard (63 tools) | Built-in (51 tools) | + +--- + +## Dashboard Tabs + +### Full Edition Dashboard + +| Tab | Contents | +|---|---| +| **Overview** | Resource overview, daily summary, critical issues, server config changes, database config changes, trace flag changes, collection health | +| **Performance** | Performance trends, expensive queries, active queries, query stats, procedure stats, Query Store, Query Store regressions, query trace patterns | +| **Resource Metrics** | Server trends, wait stats, TempDB, file I/O latency, perfmon counters, default trace events, trace analysis, session stats, latch stats, spinlock stats | +| **Memory** | Memory overview, grants, clerks, plan cache, memory pressure events | +| **Locking** | Blocking chains, 
deadlocks, blocking/deadlock trends | +| **System Events** | Corruption events, contention, errors, I/O issues, scheduler issues, memory conditions | + +Plus a NOC-style landing page with server health cards (green/yellow/red severity indicators). + +### Lite Edition Dashboard + +| Tab | Contents | +|---|---| +| **Active Queries** | Running queries with session details, wait types, blocking, DOP, memory grants | +| **Wait Stats** | Filterable wait statistics chart with delta calculations | +| **CPU** | SQL Server CPU vs Other Processes over time | +| **Memory** | Physical memory overview, SQL Server memory trend, memory clerk breakdown | +| **Queries** | Performance trends, top queries and procedures by duration, Query Store integration | +| **File I/O** | Read/write I/O trends per database file | +| **TempDB** | Space usage breakdown and TempDB file I/O | +| **Blocking** | Blocking/deadlock trends, blocked process reports, deadlock history | +| **Perfmon** | Selectable SQL Server performance counters over time | +| **Configuration** | Server configuration, database configuration, scoped configuration, trace flags | +| **FinOps** | Utilization & provisioning analysis, database resource breakdown, storage growth (7d/30d), idle database detection, index analysis via sp_IndexCleanup, application connections, server inventory, cost optimization recommendations (enterprise feature audit, CPU/memory right-sizing, compression savings, dormant databases, dev/test detection), column-level filtering on all grids | + +Both editions feature auto-refresh, configurable time ranges, right-click CSV export, system tray integration, dark and light themes, and timezone display options (server time, local time, or UTC). + +--- + +## Alerts & Notifications + +Both editions include a real-time alert engine that monitors for performance issues and sends notifications via system tray balloons and email. 
+ +### Alert Types + +| Metric | Default Threshold | Description | +|---|---|---| +| **Blocking** | 30 seconds (Full), 5 seconds (Lite) | Fires when the longest blocked session exceeds the threshold | +| **Deadlocks** | 1 | Fires when new deadlocks are detected since the last check | +| **Poison waits** | 100 ms avg | Fires when any poison wait type exceeds the average-ms-per-wait threshold | +| **Long-running queries** | 5 minutes | Fires when any query exceeds the elapsed-time threshold | +| **TempDB space** | 80% | Fires when TempDB usage exceeds the percentage threshold | +| **Long-running agent jobs** | 3× average | Fires when a job's current duration exceeds a multiple of its historical average | +| **High CPU** | 90% (Full), 80% (Lite) | Fires when total CPU (SQL + other) exceeds the threshold | +| **Server unreachable** | N/A | Fires when a monitored server goes offline or comes back online (tray + email) | + +All thresholds are configurable in Settings. + +**Poison wait types** monitored: [`THREADPOOL`](https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-os-wait-stats-transact-sql#threadpool) (worker thread exhaustion), [`RESOURCE_SEMAPHORE`](https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-os-wait-stats-transact-sql#resource_semaphore) (memory grant pressure), and [`RESOURCE_SEMAPHORE_QUERY_COMPILE`](https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-os-wait-stats-transact-sql#resource_semaphore_query_compile) (compilation memory pressure). These waits indicate severe resource starvation and should never occur under normal operation. 
+ +### Notification Channels + +- **System tray** — balloon notifications with a configurable per-metric cooldown (default: 5 minutes) +- **Email (SMTP)** — styled HTML emails with a configurable per-metric cooldown (default: 15 minutes), plus configurable SMTP settings (server, port, SSL, authentication, recipients) + +Both cooldown periods are independently configurable in Settings under the Performance Alerts section. + +### Email Alerts + +Alert emails include: + +- **Metric summary** — what triggered the alert, current value vs threshold +- **Detail section** — recent blocking chains or deadlock participants with query text, wait times, lock modes, database names, and client application +- **XML attachment** — full `blocked_process_report.xml` or `deadlock_graph.xml` for offline analysis + +### Alert Behavior + +- **Resolved notifications** — when a condition clears (e.g., blocking ends), a "Cleared" notification fires +- **Server silencing** — right-click a server tab to acknowledge alerts, silence all alerts, or unsilence +- **Always-on** — the Dashboard alert engine runs independently of which tab is active, including when minimized to the system tray. The Lite edition's alert engine also runs regardless of tab visibility. +- **Alert history** — Dashboard keeps an in-memory alert log (accessible via MCP). Lite logs alerts to DuckDB (`config_alert_log`). +- **Alert muting** — create rules to suppress specific recurring alerts while still logging them. Rules match on server name, metric type, database, query text, wait type, or job name (AND logic across fields). Access via Settings → Manage Mute Rules, or right-click an alert in the Alert History tab. The context menu offers two muting options: **Mute This Alert** (pre-fills server + metric for a targeted rule) and **Mute Similar Alerts** (pre-fills metric only, matching across all servers). Muted alerts appear grayed out in alert history and are still recorded for auditability. 
Rules support optional expiration (1h, 24h, 7 days, or permanent). +- **Alert details** — right-click any alert in the Alert History tab and choose **View Details** to open a detail window. The window shows core alert fields (time, server, metric, value, threshold, notification type, status) plus context-sensitive details that vary by metric: query text and session info for long-running queries, job name and duration stats for anomalous agent jobs, per-wait-type breakdowns for poison waits, space usage by category for TempDB, and blocking/deadlock session counts. + +--- + +## Agent Job Monitoring + +Both editions monitor currently running SQL Agent jobs and flag jobs that are running longer than expected. + +| Metric | How It Works | +|---|---| +| **Current duration** | Elapsed time since the job started | +| **Average duration** | Historical mean from successful completions in `msdb.dbo.sysjobhistory` | +| **p95 duration** | 95th percentile from historical completions | +| **Running long flag** | Set when current duration exceeds the p95 threshold | + +The Full Edition collects this data via the `collect.running_jobs_collector` stored procedure (every 5 minutes). The Lite Edition queries `msdb` directly on each collection cycle. Both editions expose this data through the MCP `get_running_jobs` tool. + +Gracefully skipped on Azure SQL Database, AWS RDS for SQL Server, and environments without SQL Server Agent. + +--- + +## MCP Server (LLM Integration) + +Both editions include an embedded [Model Context Protocol](https://modelcontextprotocol.io) server that exposes monitoring data to LLM clients like Claude Code and Cursor. + +### Setup + +1. Enable the MCP server in Settings (checkbox + port, default `5151`) + - The port must be between **1024** and **65535**. Ports 0–1023 are well-known privileged ports reserved by the operating system. + - On save, the app checks whether the chosen port is already in use and warns you if there is a conflict. 
+ - On startup, the app verifies the port is available before starting the MCP server. +2. Register with Claude Code: + +``` +claude mcp add --transport http --scope user sql-monitor http://localhost:5151/ +``` + +3. Open a new Claude Code session and ask questions like: + - "What servers are being monitored?" + - "What are the top wait types on my server?" + - "Are there any blocking or deadlock issues?" + - "Show me CPU utilization for the last 4 hours" + - "What are the most expensive queries by CPU?" + +### Available Tools + +Full Edition exposes 63 tools, Lite Edition exposes 51. Core tools are shared across both editions. + +| Category | Tools | +|---|---| +| Discovery | `list_servers` | +| Health | `get_server_summary`\*, `get_daily_summary`\*\*, `get_collection_health` | +| Alerts | `get_alert_history`, `get_alert_settings`, `get_mute_rules` | +| Waits | `get_wait_stats`, `get_wait_types`\*, `get_wait_trend`, `get_waiting_tasks`\* | +| Queries | `get_top_queries_by_cpu`, `get_top_procedures_by_cpu`, `get_query_store_top`, `get_expensive_queries`\*\*, `get_query_duration_trend`\*, `get_query_trend` | +| Active Queries | `get_active_queries` | +| CPU | `get_cpu_utilization` | +| Memory | `get_memory_stats`, `get_memory_trend`, `get_memory_clerks`, `get_memory_grants` | +| Blocking | `get_blocking`\*\*, `get_deadlocks`, `get_deadlock_detail`, `get_blocked_process_reports`\*, `get_blocked_process_xml`, `get_blocking_deadlock_stats`\*\*, `get_blocking_trend`\*, `get_deadlock_trend`\* | +| I/O | `get_file_io_stats`, `get_file_io_trend` | +| TempDB | `get_tempdb_trend` | +| Perfmon | `get_perfmon_stats`, `get_perfmon_trend` | +| Jobs | `get_running_jobs` | +| Configuration | `get_server_config`\*, `get_database_config`\*, `get_database_scoped_config`\*, `get_trace_flags`\* | +| Config History | `get_server_config_changes`\*\*, `get_database_config_changes`\*\*, `get_trace_flag_changes`\*\* | +| Server Info | `get_server_properties`, `get_database_sizes` | +| 
Sessions | `get_session_stats` | +| Scheduler | `get_cpu_scheduler_pressure`\*\* | +| Latch/Spinlock | `get_latch_stats`\*\*, `get_spinlock_stats`\*\* | +| Diagnostics | `get_plan_cache_bloat`\*\*, `get_critical_issues`\*\* | +| System Events | `get_default_trace_events`\*\*, `get_trace_analysis`\*\*, `get_memory_pressure_events`\*\* | +| Health Parser | `get_health_parser_system_health`\*\*, `get_health_parser_severe_errors`\*\*, `get_health_parser_io_issues`\*\*, `get_health_parser_scheduler_issues`\*\*, `get_health_parser_memory_conditions`\*\*, `get_health_parser_cpu_tasks`\*\*, `get_health_parser_memory_broker`\*\*, `get_health_parser_memory_node_oom`\*\* | +| Plan Analysis | `analyze_query_plan`, `analyze_procedure_plan`, `analyze_query_store_plan`, `analyze_plan_xml`, `get_plan_xml` | +| Diagnostic Analysis | `analyze_server`\*, `get_analysis_facts`\*, `compare_analysis`\*, `audit_config`\*, `get_analysis_findings`\*, `mute_analysis_finding`\* | + +\* Lite only | \*\* Full only + +Most tools accept optional `server_name` and `hours_back` parameters. If only one server is configured, `server_name` is auto-resolved. + +The MCP server binds to `localhost` only and does not accept remote connections. 
+ +--- + +## Performance Impact + +### On Monitored Servers + +- All queries use `READ UNCOMMITTED` isolation +- Configurable collection intervals +- Full Edition: typical overhead <1% CPU, <100 MB memory +- Lite Edition: max 7 concurrent SQL connections, 30-second command timeout + +### Local Resources (Lite) + +- DuckDB: ~50–200 MB per server per week +- Parquet archives: ~10x compression with ZSTD +- ScottPlot charts use hardware-accelerated rendering + +--- + +## Troubleshooting + +### Full Edition + +Two diagnostic scripts in the `install/` folder: + +| Script | Purpose | +|---|---| +| `99_installer_troubleshooting.sql` | Quick health checks: collection log errors, schedule status, Agent job status, table row counts | +| `99_user_troubleshooting.sql` | Comprehensive diagnostics: runs collectors with `@debug = 1`, detailed timing and row counts | + +```sql +SELECT + collection_time, + collector_name, + error_message +FROM PerformanceMonitor.config.collection_log +WHERE collection_status = 'ERROR' +ORDER BY collection_time DESC; +``` + +### Lite Edition + +Application logs are written to the `logs/` folder. Collection success/failure is also logged to the `collection_log` table in DuckDB. + +Common issues: + +1. **No data after connecting** — Wait for the first collection cycle (1–5 minutes). Check logs for connection errors. +2. **Query Store tab empty** — Query Store must be enabled on the target database (`ALTER DATABASE [YourDB] SET QUERY_STORE = ON`). +3. **Blocked process reports empty** — Both editions attempt to auto-configure the blocked process threshold to 5 seconds via `sp_configure`. On **AWS RDS**, `sp_configure` is not available — you must set `blocked process threshold (s)` through an RDS Parameter Group (see "AWS RDS Parameter Group Configuration" above). On **Azure SQL Database**, the threshold is fixed at 20 seconds and cannot be changed. If you still see no data on other platforms, verify the login has `ALTER SETTINGS` permission. +4. 
**Connection failures** — Verify network connectivity, firewall rules, and that the login has the required [permissions](#permissions). For Azure SQL Database, use a contained database user with `VIEW DATABASE STATE`. + +--- + +## Permissions + +### Full Edition (On-Premises) + +The installer needs `sysadmin` to create the database, Agent jobs, and configure `sp_configure` settings. After installation, the collection jobs can run under a **least-privilege login** with these grants: + +```sql +USE [master]; +CREATE LOGIN [SQLServerPerfMon] WITH PASSWORD = N'YourStrongPassword'; +GRANT VIEW SERVER STATE TO [SQLServerPerfMon]; + +USE [PerformanceMonitor]; +CREATE USER [SQLServerPerfMon] FOR LOGIN [SQLServerPerfMon]; +ALTER ROLE [db_owner] ADD MEMBER [SQLServerPerfMon]; + +USE [msdb]; +CREATE USER [SQLServerPerfMon] FOR LOGIN [SQLServerPerfMon]; +ALTER ROLE [SQLAgentReaderRole] ADD MEMBER [SQLServerPerfMon]; +``` + +| Grant | Why | +|---|---| +| `VIEW SERVER STATE` | All DMV access (wait stats, query stats, memory, CPU, file I/O, etc.) | +| `db_owner` on PerformanceMonitor | Collectors insert data, create/alter tables, execute procedures. Scoped to just this database — not sysadmin. | +| `SQLAgentReaderRole` on msdb | Read `sysjobs`, `sysjobactivity`, `sysjobhistory` for the running jobs collector | + +**Optional** (gracefully skipped if missing): +- `ALTER SETTINGS` — installer sets `blocked process threshold` via `sp_configure`. Skipped with a warning if unavailable. +- `ALTER TRACE` — default trace collector. Skipped if denied. +- `DBCC TRACESTATUS` — server config collector skips trace flag detection if denied. + +Change the SQL Agent job owner to the new login after installation if you want to run under least privilege end-to-end. + +### Lite Edition (On-Premises) + +Nothing is installed on the target server. 
The login only needs: + +```sql +USE [master]; +GRANT VIEW SERVER STATE TO [YourLogin]; + +-- Optional: for SQL Agent job monitoring +USE [msdb]; +CREATE USER [YourLogin] FOR LOGIN [YourLogin]; +ALTER ROLE [SQLAgentReaderRole] ADD MEMBER [YourLogin]; +``` + +### Azure SQL Database (Lite Only) + +Azure SQL Database doesn't support server-level logins. Create a **contained database user** directly on the target database: + +```sql +-- Connect to your target database (not master) +CREATE USER [SQLServerPerfMon] WITH PASSWORD = 'YourStrongPassword'; +GRANT VIEW DATABASE STATE TO [SQLServerPerfMon]; +``` + +When connecting in Lite, specify the database name in the connection. SQL Agent and msdb are not available on Azure SQL Database — those collectors are skipped automatically. + +### Azure SQL Managed Instance + +Works like on-premises. Use server-level logins with `VIEW SERVER STATE`. SQL Agent is available. + +### AWS RDS for SQL Server + +Use the RDS master user for installation. The master user has the necessary permissions. For ongoing collection, `VIEW SERVER STATE` and msdb access work the same as on-premises, but `sp_configure` is not available (use RDS Parameter Groups instead — see above). + +--- + +## Folder Structure + +``` +Monitor/ +│ +│ Full Edition (server-installed collectors + separate dashboard) +├── install/ # 58 SQL installation scripts +├── upgrades/ # Version-specific upgrade scripts +├── Installer/ # CLI installer for Full Edition database (C#) +├── InstallerGui/ # GUI installer for Full Edition database (WPF) +├── Dashboard/ # Full Edition dashboard application (WPF) +│ +│ Lite Edition (standalone desktop app, nothing installed on server) +├── Lite/ # Lite Edition desktop application (WPF) +│ +│ Shared +└── README.md # This file +``` + +--- + +## Building from Source + +All projects target .NET 8.0. 
+ +``` +# Full Edition Dashboard +dotnet build Dashboard/Dashboard.csproj + +# Lite Edition +dotnet build Lite/PerformanceMonitorLite.csproj + +# CLI Installer (self-contained) +dotnet publish Installer/PerformanceMonitorInstaller.csproj -c Release + +# GUI Installer +dotnet publish InstallerGui/InstallerGui.csproj -c Release -r win-x64 --self-contained +``` + +--- + +## Support & Sponsorship + +**This project is free and open source under the MIT License.** The software is fully functional with no features withheld — every user gets the same tool, same collectors, same MCP integration. + +However, some organizations have procurement or compliance policies that require a formal vendor relationship, a support agreement, or an invoice on file before software can be deployed to production. If that sounds familiar, two commercial support tiers are available: + +| Tier | Annual Cost | What You Get | +|------|-------------|--------------| +| **Supported** | $500/year | Email support (2-business-day response), compatibility guarantees for new SQL Server versions, vendor agreement and invoices for compliance, unlimited instances | +| **Priority** | $2,500/year | Next-business-day email response, quarterly live Q&A sessions, early access to new features, roadmap input, unlimited instances | + +Both tiers cover unlimited SQL Server instances. The software itself is identical — commercial support is about the relationship, not a feature gate. + +**[Read more about the free tool and commercial options](https://erikdarling.com/free-sql-server-performance-monitoring/)** | **[Purchase a support subscription](https://training.erikdarling.com/sql-monitoring)** + +If you find the project valuable, you can also support continued development: + +| | | +|---|---| +| **Sponsor on GitHub** | [Become a sponsor](https://github.com/sponsors/erikdarlingdata) to fund new features, ongoing maintenance, and SQL Server version support. 
| **Consulting Services** | [Hire me](https://training.erikdarling.com/sqlconsulting) for hands-on consulting. Need help analyzing the data this tool collects? Want expert assistance fixing the issues it uncovers? |
+
+Neither sponsorship nor consulting is required — use the tool freely.
+
+---
+
+## Third-Party Components
+
+### sp_WhoIsActive
+
+- **Author:** Adam Machanic | **License:** GPLv3
+- **Repository:** https://github.com/amachanic/sp_whoisactive
+
+### DarlingData
+
+- **Author:** Erik Darling (Darling Data, LLC) | **License:** MIT
+- **Repository:** https://github.com/erikdarlingdata/DarlingData
+
+### SQL Server First Responder Kit
+
+- **Author:** Brent Ozar Unlimited | **License:** MIT
+- **Repository:** https://github.com/BrentOzarULTD/SQL-Server-First-Responder-Kit
+
+See [THIRD_PARTY_NOTICES.md](THIRD_PARTY_NOTICES.md) for complete license texts.
+
+---
+
+## License
+
+Copyright (c) 2026 Darling Data, LLC. Licensed under the MIT License. See [LICENSE](LICENSE) for details.
+
+## Author
+
+Erik Darling — [erikdarling.com](https://erikdarling.com) — [Darling Data, LLC](https://darlingdata.com)