diff --git a/.coderabbit.yaml b/.coderabbit.yaml
new file mode 100644
index 00000000..67c7ecc2
--- /dev/null
+++ b/.coderabbit.yaml
@@ -0,0 +1,67 @@
+# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
+
+language: "en-US"
+early_access: false
+enable_free_tier: true
+
+reviews:
+ profile: "chill"
+ high_level_summary: true
+ review_status: true
+ commit_status: true
+ collapse_walkthrough: true
+ sequence_diagrams: false
+ poem: false
+
+ path_filters:
+ - "!**/*.Designer.cs"
+ - "!**/bin/**"
+ - "!**/obj/**"
+ - "!**/publish/**"
+ - "!**/*.user"
+ - "!**/*.suo"
+
+ path_instructions:
+ - path: "Dashboard/**/*.cs"
+ instructions: >
+ This is a WPF .NET 8 desktop app (Dashboard) that reads from SQL Server.
+ Uses data binding, async/await patterns, and INotifyPropertyChanged.
+ Watch for: null reference risks, disposal of SQL connections,
+ thread safety with UI dispatch, and proper async patterns.
+ - path: "Lite/**/*.cs"
+ instructions: >
+ This is a WPF .NET 8 desktop app (Lite) that collects SQL Server DMV data
+ into a local DuckDB database. Uses ReaderWriterLockSlim for DB coordination.
+ Watch for: connection disposal, thread safety, DuckDB access patterns,
+ and proper async/await usage.
+ - path: "**/*.sql"
+ instructions: >
+ T-SQL stored procedures and scripts for SQL Server.
+ Watch for: SQL injection risks, missing error handling (TRY/CATCH),
+ proper use of SET NOCOUNT ON, and parameter sniffing concerns.
+ - path: "Installers/**"
+ instructions: >
+ WiX-based MSI installer projects. Be cautious about upgrade paths
+ and file versioning. Schema upgrades go in upgrades/ folder, not install scripts.
+
+ auto_review:
+ enabled: true
+ drafts: false
+ base_branches:
+ - "dev"
+ - "main"
+
+ tools:
+ gitleaks:
+ enabled: true
+ github-checks:
+ enabled: true
+
+chat:
+ auto_reply: true
+
+knowledge_base:
+ learnings:
+ scope: "local"
+ pull_requests:
+ scope: "local"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 05c86966..42d7bf2c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,87 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [2.3.0] - 2026-03-18
+
+### Important
+
+- **Schema upgrade**: Six columns widened across four tables (`query_stats`, `cpu_scheduler_stats`, `waiting_tasks`, `database_size_stats`) to match DMV documentation types. These are in-place ALTER COLUMN operations — fast on any table size, no data migration. Upgrade scripts run automatically via the CLI/GUI installer.
+- **SQL Server version check**: Both installers now reject SQL Server 2014 and earlier before running any scripts, with a clear error message. Azure MI (EngineEdition 8) is always accepted. ([#543])
+- **Installer adversarial tests**: 35 automated tests covering upgrade failures, data survival, idempotency, version detection fallback, file filtering, restricted permissions, and more. These run as part of pre-release validation. ([#543])
+
+### Added
+
+- **ErikAI analysis engine** — rule-based inference engine for Lite that scores server health across wait stats, CPU, memory, I/O, blocking, tempdb, and query performance. Surfaces actionable findings with severity, detail, and recommended actions. Includes anomaly detection (baseline comparison for acute deviations), bad actor detection (per-query scoring for consistently terrible queries), and CPU spike detection for bursty workloads. ([#589], [#593])
+- **ErikAI Dashboard port** — full analysis engine ported to Dashboard with SQL Server backend ([#590])
+- **FinOps cost optimization recommendations** — Phase 1-4 checks: enterprise feature audit, CPU/memory right-sizing, compression savings estimator, unused index cost quantification, dormant database detection, dev/test workload detection, VM right-sizing, storage tier optimization, reserved capacity candidates ([#564])
+- **FinOps High Impact Queries** — 80/20 analysis showing which queries consume the most resources across all dimensions ([#564])
+- **FinOps dollar-denominated cost attribution** — per-server monthly cost setting with proportional database-level breakdown ([#564])
+- **On-demand plan fetch** for bad actor and analysis findings — click to retrieve execution plans for flagged queries ([#604])
+- **Plan analysis integration** — findings include execution plan analysis when plans are available ([#594])
+- **Server unreachable email alerts** — Dashboard sends email (not just tray notification) when a monitored server goes offline or comes back online ([#529])
+- **Column filters on all FinOps DataGrids** — filter funnel icons on every column header across all 7 FinOps grids in Lite and Dashboard ([#562])
+- **Column filters on Dashboard** IdleDatabases, TempDB, and Index Analysis grids
+- **Lite data import** — "Import Data" button brings in monitoring history from a previous Lite install via parquet files, preserving trend data across version upgrades ([#566])
+- **Per-server Utility Database setting** — Lite can call community stored procedures (sp_IndexCleanup) from a database other than master ([#555])
+- **SQL Server version check** in both CLI and GUI installers — rejects 2014 and earlier with a clear message ([#543])
+- **Execution plan analysis MCP tools** for both Dashboard and Lite
+- **Full MCP tool coverage** — Dashboard expanded from 28 to 57 tools, Lite from 32 to 51 tools ([#576], [#577])
+- **Self-sufficient analyze_server drill-down** — MCP tool returns complete analysis, not breadcrumb trail ([#578])
+- **NuGet package dependency licenses** in THIRD_PARTY_NOTICES.md
+
+### Changed
+
+- **Azure SQL DB FinOps** — all collectors (database sizes, query stats, file I/O) now connect to each database individually instead of only querying master. Server Inventory uses dynamic SQL to avoid `sys.master_files` dependency. ([#557])
+- **Index Analysis scroll fix** — both summary and detail grids now use proportional heights instead of Auto, so they scroll independently with large result sets ([#554])
+- **Dashboard Add Server dialog** — increased MaxHeight from 700 to 850px so buttons are visible when SQL auth fields are shown
+- **GUI installer** — Uninstall button now correctly enables after a successful install
+- **GUI installer** — fixed encryption mapping and history logging ([#612])
+- **Dashboard visible sub-tab only refresh** on auto-refresh ticks ([#528])
+- Analysis engine decouples data maturity check from analysis window
+
+### Fixed
+
+- **Installer dropping database on every upgrade** — `00_uninstall.sql` excluded from install file list, installer aborts on upgrade failure, version detection fallback returns "1.0.0" instead of null ([#538], [#539])
+- **SQL dumps on mirroring passive servers** from FinOps collectors ([#535])
+- **RetrievedFromCache** always showing False ([#536])
+- **Arithmetic overflow** in query_stats collector for dop/thread columns ([#547])
+- **Lite perfmon chart bugs** and Dashboard ScottPlot crash handling ([#544], [#545])
+- **PLE=0 scoring bug** — was scored as harmless, now correctly flagged ([#543])
+- **PercentRank >1.0** bug in HealthCalculator
+- **6 verified Lite bugs** from code review ([#611])
+- **Enterprise feature audit text** — partitioning is not Enterprise-only
+- **FinOps collector scheduling**, server switch, and utilization bugs
+- **Dashboard drill-down** Unicode arrow in story path split
+- **Empty DataGrid scrollbar artifacts** — hide grids when empty across all FinOps tabs
+- **Query preview** — truncated in row, full text in tooltip
+
+[#529]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/529
+[#535]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/535
+[#536]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/536
+[#538]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/538
+[#539]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/539
+[#543]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/543
+[#544]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/544
+[#545]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/545
+[#547]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/547
+[#554]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/554
+[#555]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/555
+[#557]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/557
+[#562]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/562
+[#564]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/564
+[#566]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/566
+[#576]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/576
+[#577]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/577
+[#578]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/578
+[#528]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/528
+[#589]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/589
+[#590]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/590
+[#593]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/593
+[#594]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/594
+[#604]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/604
+[#611]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/611
+[#612]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/612
+
## [2.2.0] - 2026-03-11
**Contributors:** [@HannahVernon](https://github.com/HannahVernon), [@ClaudioESSilva](https://github.com/ClaudioESSilva), [@dphugo](https://github.com/dphugo), [@Orestes](https://github.com/Orestes) — thank you!
diff --git a/Dashboard/AddServerDialog.xaml b/Dashboard/AddServerDialog.xaml
index 84d3e5f1..ee06c9d9 100644
--- a/Dashboard/AddServerDialog.xaml
+++ b/Dashboard/AddServerDialog.xaml
@@ -2,7 +2,7 @@
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
Title="Add SQL Server"
- SizeToContent="Height" Width="450" MaxHeight="700"
+ SizeToContent="Height" Width="450" MaxHeight="850"
WindowStartupLocation="CenterOwner"
ResizeMode="NoResize"
Background="{DynamicResource BackgroundBrush}"
@@ -100,6 +100,14 @@
+
+
+
+
+
+
diff --git a/Dashboard/AddServerDialog.xaml.cs b/Dashboard/AddServerDialog.xaml.cs
index f4ad1a30..3bed93d5 100644
--- a/Dashboard/AddServerDialog.xaml.cs
+++ b/Dashboard/AddServerDialog.xaml.cs
@@ -42,6 +42,7 @@ public AddServerDialog(ServerConnection existingServer)
ServerNameTextBox.Text = existingServer.ServerName;
DescriptionTextBox.Text = existingServer.Description;
IsFavoriteCheckBox.IsChecked = existingServer.IsFavorite;
+ MonthlyCostTextBox.Text = existingServer.MonthlyCostUsd.ToString(System.Globalization.CultureInfo.InvariantCulture);
// Load encryption settings
EncryptModeComboBox.SelectedIndex = existingServer.EncryptMode switch
@@ -328,9 +329,15 @@ private async void Save_Click(object sender, RoutedEventArgs e)
ServerConnection.IsFavorite = IsFavoriteCheckBox.IsChecked == true;
ServerConnection.EncryptMode = GetSelectedEncryptMode();
ServerConnection.TrustServerCertificate = TrustServerCertificateCheckBox.IsChecked == true;
+ if (decimal.TryParse(MonthlyCostTextBox.Text, System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var editCost) && editCost >= 0)
+ ServerConnection.MonthlyCostUsd = editCost;
}
else
{
+ decimal monthlyCost = 0m;
+ if (decimal.TryParse(MonthlyCostTextBox.Text, System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var newCost) && newCost >= 0)
+ monthlyCost = newCost;
+
ServerConnection = new ServerConnection
{
DisplayName = displayName,
@@ -341,7 +348,8 @@ private async void Save_Click(object sender, RoutedEventArgs e)
CreatedDate = DateTime.Now,
LastConnected = DateTime.Now,
EncryptMode = GetSelectedEncryptMode(),
- TrustServerCertificate = TrustServerCertificateCheckBox.IsChecked == true
+ TrustServerCertificate = TrustServerCertificateCheckBox.IsChecked == true,
+ MonthlyCostUsd = monthlyCost
};
}
diff --git a/Dashboard/Analysis/AnalysisModels.cs b/Dashboard/Analysis/AnalysisModels.cs
new file mode 100644
index 00000000..76718852
--- /dev/null
+++ b/Dashboard/Analysis/AnalysisModels.cs
@@ -0,0 +1,152 @@
+using System;
+using System.Collections.Generic;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
+///
+/// A scored observation from collected data.
+///
+public class Fact
+{
+ public string Source { get; set; } = string.Empty;
+ public string Key { get; set; } = string.Empty;
+ public double Value { get; set; }
+ public double BaseSeverity { get; set; }
+ public double Severity { get; set; }
+ public int ServerId { get; set; }
+ public string? DatabaseName { get; set; }
+
+ ///
+ /// Raw metric values for analysis and audit trail.
+ /// Keys are metric-specific (e.g., "wait_time_ms", "waiting_tasks_count").
+ ///
+ public Dictionary<string, double> Metadata { get; set; } = [];
+
+ ///
+ /// Amplifiers that were evaluated for this fact.
+ ///
+ public List<AmplifierResult> AmplifierResults { get; set; } = [];
+}
+
+///
+/// Result of evaluating a single amplifier against the fact set.
+///
+public class AmplifierResult
+{
+ public string Description { get; set; } = string.Empty;
+ public bool Matched { get; set; }
+ public double Boost { get; set; }
+}
+
+///
+/// A conditional edge in the relationship graph.
+///
+public class Edge
+{
+ public string Source { get; set; } = string.Empty;
+ public string Destination { get; set; } = string.Empty;
+ public string Category { get; set; } = string.Empty;
+ public string PredicateDescription { get; set; } = string.Empty;
+
+ ///
+ /// Evaluates whether this edge should be followed given the current fact set.
+ ///
+ public Func<Dictionary<string, Fact>, bool> Predicate { get; set; } = _ => false;
+}
+
+///
+/// A complete analysis story — the path from root symptom to leaf recommendation.
+///
+public class AnalysisStory
+{
+ public string RootFactKey { get; set; } = string.Empty;
+ public double RootFactValue { get; set; }
+ public double Severity { get; set; }
+ public double Confidence { get; set; }
+ public string Category { get; set; } = string.Empty;
+ public List<Edge> Path { get; set; } = [];
+ public string StoryPath { get; set; } = string.Empty;
+ public string StoryPathHash { get; set; } = string.Empty;
+ public string StoryText { get; set; } = string.Empty;
+ public string? LeafFactKey { get; set; }
+ public double? LeafFactValue { get; set; }
+ public int FactCount { get; set; }
+ public bool IsAbsolution { get; set; }
+}
+
+///
+/// A persisted finding from a previous analysis run.
+/// Maps to the analysis_findings DuckDB table.
+///
+public class AnalysisFinding
+{
+ public long FindingId { get; set; }
+ public DateTime AnalysisTime { get; set; }
+ public int ServerId { get; set; }
+ public string ServerName { get; set; } = string.Empty;
+ public string? DatabaseName { get; set; }
+ public DateTime? TimeRangeStart { get; set; }
+ public DateTime? TimeRangeEnd { get; set; }
+ public double Severity { get; set; }
+ public double Confidence { get; set; }
+ public string Category { get; set; } = string.Empty;
+ public string StoryPath { get; set; } = string.Empty;
+ public string StoryPathHash { get; set; } = string.Empty;
+ public string StoryText { get; set; } = string.Empty;
+ public string RootFactKey { get; set; } = string.Empty;
+ public double? RootFactValue { get; set; }
+ public string? LeafFactKey { get; set; }
+ public double? LeafFactValue { get; set; }
+ public int FactCount { get; set; }
+
+ ///
+ /// Drill-down data collected after graph traversal. Ephemeral — not persisted to DuckDB.
+ /// Contains supporting detail keyed by category (e.g., "top_deadlocks", "queries_at_spike").
+ ///
+ public Dictionary<string, object>? DrillDown { get; set; }
+}
+
+///
+/// A muted finding pattern. Maps to the analysis_muted DuckDB table.
+///
+public class AnalysisMuted
+{
+ public long MuteId { get; set; }
+ public int? ServerId { get; set; }
+ public string? DatabaseName { get; set; }
+ public string StoryPathHash { get; set; } = string.Empty;
+ public string StoryPath { get; set; } = string.Empty;
+ public DateTime MutedDate { get; set; }
+ public string? Reason { get; set; }
+}
+
+///
+/// A user-configured exclusion filter. Maps to the analysis_exclusions DuckDB table.
+///
+public class AnalysisExclusion
+{
+ public long ExclusionId { get; set; }
+ public string ExclusionType { get; set; } = string.Empty;
+ public string ExclusionValue { get; set; } = string.Empty;
+ public int? ServerId { get; set; }
+ public string? DatabaseName { get; set; }
+ public bool IsEnabled { get; set; } = true;
+ public DateTime CreatedDate { get; set; }
+ public string? Description { get; set; }
+}
+
+///
+/// A severity threshold value. Maps to the analysis_thresholds DuckDB table.
+///
+public class AnalysisThreshold
+{
+ public long ThresholdId { get; set; }
+ public string Category { get; set; } = string.Empty;
+ public string FactKey { get; set; } = string.Empty;
+ public string ThresholdType { get; set; } = string.Empty;
+ public double ThresholdValue { get; set; }
+ public int? ServerId { get; set; }
+ public string? DatabaseName { get; set; }
+ public bool IsEnabled { get; set; } = true;
+ public DateTime ModifiedDate { get; set; }
+}
diff --git a/Dashboard/Analysis/AnalysisService.cs b/Dashboard/Analysis/AnalysisService.cs
new file mode 100644
index 00000000..a0d7b2fc
--- /dev/null
+++ b/Dashboard/Analysis/AnalysisService.cs
@@ -0,0 +1,323 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using Microsoft.Data.SqlClient;
+using PerformanceMonitorDashboard.Helpers;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
+///
+/// Orchestrates the full analysis pipeline: collect -> score -> traverse -> persist.
+/// Can be run on-demand or on a timer. Each run analyzes a single server's data
+/// for a given time window and persists the findings.
+/// Port of Lite's AnalysisService — uses SQL Server instead of DuckDB.
+///
+public class AnalysisService
+{
+ private readonly string _connectionString;
+ private readonly SqlServerFindingStore _findingStore;
+ private readonly SqlServerFactCollector _collector;
+ private readonly FactScorer _scorer;
+ private readonly RelationshipGraph _graph;
+ private readonly InferenceEngine _engine;
+ private readonly SqlServerDrillDownCollector _drillDown;
+ private readonly SqlServerAnomalyDetector _anomalyDetector;
+
+ ///
+ /// Minimum hours of collected data required before analysis will run.
+ /// Short collection windows distort fraction-of-period calculations --
+ /// 5 seconds of THREADPOOL looks alarming in a 16-minute window.
+ ///
+ internal double MinimumDataHours { get; set; } = 72;
+
+ ///
+ /// Raised after each analysis run completes, providing the findings for UI display.
+ ///
+ public event EventHandler<AnalysisCompletedEventArgs>? AnalysisCompleted;
+
+ ///
+ /// Whether an analysis is currently running.
+ ///
+ public bool IsAnalyzing { get; private set; }
+
+ ///
+ /// Time of the last completed analysis run.
+ ///
+ public DateTime? LastAnalysisTime { get; private set; }
+
+ ///
+ /// Set after AnalyzeAsync if insufficient data was found. Null if enough data exists.
+ ///
+ public string? InsufficientDataMessage { get; private set; }
+
+ public AnalysisService(string connectionString, IPlanFetcher? planFetcher = null)
+ {
+ _connectionString = connectionString;
+ _findingStore = new SqlServerFindingStore(connectionString);
+ _collector = new SqlServerFactCollector(connectionString);
+ _scorer = new FactScorer();
+ _graph = new RelationshipGraph();
+ _engine = new InferenceEngine(_graph);
+ _drillDown = new SqlServerDrillDownCollector(connectionString, planFetcher);
+ _anomalyDetector = new SqlServerAnomalyDetector(connectionString);
+ }
+
+ ///
+ /// Runs the full analysis pipeline for a server.
+ /// Default time range is the last 4 hours.
+ ///
+ public async Task<List<AnalysisFinding>> AnalyzeAsync(int serverId, string serverName, int hoursBack = 4)
+ {
+ var timeRangeEnd = DateTime.UtcNow;
+ var timeRangeStart = timeRangeEnd.AddHours(-hoursBack);
+
+ var context = new AnalysisContext
+ {
+ ServerId = serverId,
+ ServerName = serverName,
+ TimeRangeStart = timeRangeStart,
+ TimeRangeEnd = timeRangeEnd
+ };
+
+ return await AnalyzeAsync(context);
+ }
+
+ ///
+ /// Runs the full analysis pipeline with a specific context.
+ ///
+ public async Task<List<AnalysisFinding>> AnalyzeAsync(AnalysisContext context)
+ {
+ if (IsAnalyzing)
+ return [];
+
+ IsAnalyzing = true;
+ InsufficientDataMessage = null;
+
+ try
+ {
+ // 0. Check minimum data span -- total history, not the analysis window.
+ // A server with 100h of total history can be analyzed over a 4h window.
+ var dataSpanHours = await GetTotalDataSpanHoursAsync();
+ if (dataSpanHours < MinimumDataHours)
+ {
+ var needed = MinimumDataHours >= 24
+ ? $"{MinimumDataHours / 24:F1} days"
+ : $"{MinimumDataHours:F0} hours";
+ var have = dataSpanHours >= 24
+ ? $"{dataSpanHours / 24:F1} days"
+ : $"{dataSpanHours:F1} hours";
+
+ InsufficientDataMessage =
+ $"Not enough data for reliable analysis. Need {needed} of collected data, " +
+ $"have {have}. Keep the collector running and try again later.";
+
+ Logger.Info(
+ $"[AnalysisService] Skipping analysis for {context.ServerName}: {dataSpanHours:F1}h data, need {MinimumDataHours}h");
+
+ LastAnalysisTime = DateTime.UtcNow;
+ return [];
+ }
+
+ // 1. Collect facts from SQL Server
+ var facts = await _collector.CollectFactsAsync(context);
+
+ if (facts.Count == 0)
+ {
+ LastAnalysisTime = DateTime.UtcNow;
+ return [];
+ }
+
+ // 1.5. Detect anomalies (compare analysis window against baseline)
+ var anomalies = await _anomalyDetector.DetectAnomaliesAsync(context);
+ facts.AddRange(anomalies);
+
+ // 2. Score facts (base severity + amplifiers)
+ _scorer.ScoreAll(facts);
+
+ // 3. Build stories via graph traversal
+ var stories = _engine.BuildStories(facts);
+
+ // 4. Persist findings (filtering out muted)
+ var findings = await _findingStore.SaveFindingsAsync(stories, context);
+
+ // 5. Enrich findings with drill-down data (ephemeral, not persisted)
+ await _drillDown.EnrichFindingsAsync(findings, context);
+
+ LastAnalysisTime = DateTime.UtcNow;
+
+ // 6. Notify listeners
+ AnalysisCompleted?.Invoke(this, new AnalysisCompletedEventArgs
+ {
+ ServerId = context.ServerId,
+ ServerName = context.ServerName,
+ Findings = findings,
+ AnalysisTime = LastAnalysisTime.Value
+ });
+
+ Logger.Info(
+ $"[AnalysisService] Analysis complete for {context.ServerName}: {findings.Count} finding(s), " +
+ $"highest severity {(findings.Count > 0 ? findings.Max(f => f.Severity) : 0):F2}");
+
+ return findings;
+ }
+ catch (Exception ex)
+ {
+ Logger.Error($"[AnalysisService] Analysis failed for {context.ServerName}: {ex.Message}");
+ return [];
+ }
+ finally
+ {
+ IsAnalyzing = false;
+ }
+ }
+
+ ///
+ /// Runs the collect + score pipeline without graph traversal.
+ /// Returns raw scored facts with amplifier details for direct inspection.
+ ///
+ public async Task<List<Fact>> CollectAndScoreFactsAsync(int serverId, string serverName, int hoursBack = 4)
+ {
+ var timeRangeEnd = DateTime.UtcNow;
+ var timeRangeStart = timeRangeEnd.AddHours(-hoursBack);
+
+ var context = new AnalysisContext
+ {
+ ServerId = serverId,
+ ServerName = serverName,
+ TimeRangeStart = timeRangeStart,
+ TimeRangeEnd = timeRangeEnd
+ };
+
+ try
+ {
+ var facts = await _collector.CollectFactsAsync(context);
+ if (facts.Count == 0) return facts;
+ _scorer.ScoreAll(facts);
+ return facts;
+ }
+ catch (Exception ex)
+ {
+ Logger.Error($"[AnalysisService] Fact collection failed for {serverName}: {ex.Message}");
+ return [];
+ }
+ }
+
+ ///
+ /// Compares analysis of two time periods, returning facts from both for comparison.
+ ///
+ public async Task<(List<Fact> BaselineFacts, List<Fact> ComparisonFacts)> ComparePeriodsAsync(
+ int serverId, string serverName,
+ DateTime baselineStart, DateTime baselineEnd,
+ DateTime comparisonStart, DateTime comparisonEnd)
+ {
+ var baselineContext = new AnalysisContext
+ {
+ ServerId = serverId,
+ ServerName = serverName,
+ TimeRangeStart = baselineStart,
+ TimeRangeEnd = baselineEnd
+ };
+
+ var comparisonContext = new AnalysisContext
+ {
+ ServerId = serverId,
+ ServerName = serverName,
+ TimeRangeStart = comparisonStart,
+ TimeRangeEnd = comparisonEnd
+ };
+
+ try
+ {
+ var baselineFacts = await _collector.CollectFactsAsync(baselineContext);
+ var comparisonFacts = await _collector.CollectFactsAsync(comparisonContext);
+
+ _scorer.ScoreAll(baselineFacts);
+ _scorer.ScoreAll(comparisonFacts);
+
+ return (baselineFacts, comparisonFacts);
+ }
+ catch (Exception ex)
+ {
+ Logger.Error($"[AnalysisService] Period comparison failed for {serverName}: {ex.Message}");
+ return ([], []);
+ }
+ }
+
+ ///
+ /// Gets the latest findings for a server without running a new analysis.
+ ///
+ public async Task<List<AnalysisFinding>> GetLatestFindingsAsync(int serverId)
+ {
+ return await _findingStore.GetLatestFindingsAsync(serverId);
+ }
+
+ ///
+ /// Gets recent findings for a server within the given time range.
+ ///
+ public async Task<List<AnalysisFinding>> GetRecentFindingsAsync(int serverId, int hoursBack = 24)
+ {
+ return await _findingStore.GetRecentFindingsAsync(serverId, hoursBack);
+ }
+
+ ///
+ /// Mutes a finding pattern so it won't appear in future runs.
+ ///
+ public async Task MuteFindingAsync(AnalysisFinding finding, string? reason = null)
+ {
+ await _findingStore.MuteStoryAsync(
+ finding.ServerId, finding.StoryPathHash, finding.StoryPath, reason);
+ }
+
+ ///
+ /// Cleans up old findings beyond the retention period.
+ ///
+ public async Task CleanupAsync(int retentionDays = 30)
+ {
+ await _findingStore.CleanupOldFindingsAsync(retentionDays);
+ }
+
+ ///
+ /// Returns the total span of collected data (no time range filter).
+ /// This answers "has this server been monitored long enough?" -- separate from
+ /// the analysis window. A server with 100 hours of total history can safely
+ /// be analyzed over a 4-hour window without dilution.
+ /// Dashboard monitors one server per database, so no server_id filtering.
+ ///
+ private async Task GetTotalDataSpanHoursAsync()
+ {
+ try
+ {
+ using var connection = new SqlConnection(_connectionString);
+ await connection.OpenAsync();
+
+ using var cmd = connection.CreateCommand();
+ cmd.CommandText = @"
+SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+
+SELECT DATEDIFF(SECOND, MIN(collection_time), MAX(collection_time)) / 3600.0
+FROM collect.wait_stats;";
+
+ var result = await cmd.ExecuteScalarAsync();
+ if (result == null || result is DBNull)
+ return 0;
+
+ return Convert.ToDouble(result);
+ }
+ catch
+ {
+ return 0;
+ }
+ }
+}
+
+///
+/// Event args for when an analysis run completes.
+///
+public class AnalysisCompletedEventArgs : EventArgs
+{
+ public int ServerId { get; set; }
+ public string ServerName { get; set; } = string.Empty;
+ public List<AnalysisFinding> Findings { get; set; } = [];
+ public DateTime AnalysisTime { get; set; }
+}
diff --git a/Dashboard/Analysis/FactScorer.cs b/Dashboard/Analysis/FactScorer.cs
new file mode 100644
index 00000000..82382989
--- /dev/null
+++ b/Dashboard/Analysis/FactScorer.cs
@@ -0,0 +1,867 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
/// <summary>
/// Assigns severity to facts using threshold formulas (Layer 1)
/// and contextual amplifiers (Layer 2).
///
/// Layer 1: Base severity 0.0-1.0 from thresholds alone.
/// Layer 2: Amplifiers multiply base up to 2.0 max using corroborating facts.
///
/// Formula: severity = min(base * (1.0 + sum(amplifiers)), 2.0)
/// </summary>
public class FactScorer
{
    /// <summary>
    /// Scores all facts: Layer 1 (base severity), then Layer 2 (amplifiers).
    /// Mutates each fact's BaseSeverity, AmplifierResults, and Severity in place.
    /// </summary>
    public void ScoreAll(List<Fact> facts)
    {
        // Layer 1: base severity from thresholds, dispatched by fact source
        foreach (var fact in facts)
        {
            fact.BaseSeverity = fact.Source switch
            {
                "waits" => ScoreWaitFact(fact),
                "blocking" => ScoreBlockingFact(fact),
                "cpu" => ScoreCpuFact(fact),
                "io" => ScoreIoFact(fact),
                "tempdb" => ScoreTempDbFact(fact),
                "memory" => ScoreMemoryFact(fact),
                "queries" => ScoreQueryFact(fact),
                "perfmon" => ScorePerfmonFact(fact),
                "database_config" => ScoreDatabaseConfigFact(fact),
                "jobs" => ScoreJobFact(fact),
                "disk" => ScoreDiskFact(fact),
                "bad_actor" => ScoreBadActorFact(fact),
                "anomaly" => ScoreAnomalyFact(fact),
                _ => 0.0
            };
        }

        // Build lookup for amplifier evaluation (include context facts that amplifiers reference)
        var contextSources = new HashSet<string>
            { "config", "cpu", "io", "tempdb", "memory", "queries", "perfmon",
              "database_config", "jobs", "sessions", "disk", "bad_actor", "anomaly" };
        // NOTE(review): assumes fact keys are unique across the set — ToDictionary
        // throws on duplicates. Confirm collectors never emit the same key twice.
        var factsByKey = facts
            .Where(f => f.BaseSeverity > 0 || contextSources.Contains(f.Source))
            .ToDictionary(f => f.Key, f => f);

        // Layer 2: amplifiers boost base severity using corroborating facts
        foreach (var fact in facts)
        {
            if (fact.BaseSeverity <= 0)
            {
                fact.Severity = 0;
                continue;
            }

            var amplifiers = GetAmplifiers(fact);
            var totalBoost = 0.0;

            foreach (var amp in amplifiers)
            {
                var matched = amp.Predicate(factsByKey);
                // Record every evaluation (matched or not) for the audit trail.
                fact.AmplifierResults.Add(new AmplifierResult
                {
                    Description = amp.Description,
                    Matched = matched,
                    Boost = matched ? amp.Boost : 0.0
                });

                if (matched) totalBoost += amp.Boost;
            }

            fact.Severity = Math.Min(fact.BaseSeverity * (1.0 + totalBoost), 2.0);
        }
    }

    /// <summary>
    /// Scores a wait fact using the fraction-of-period formula.
    /// Some waits have absolute minimum thresholds to filter out background noise.
    /// </summary>
    private static double ScoreWaitFact(Fact fact)
    {
        var fraction = fact.Value;
        if (fraction <= 0) return 0.0;

        // THREADPOOL: require both meaningful total wait time AND meaningful average.
        // Tiny amounts are normal thread pool grow/shrink housekeeping, not exhaustion.
        if (fact.Key == "THREADPOOL")
        {
            var waitTimeMs = fact.Metadata.GetValueOrDefault("wait_time_ms");
            var avgMs = fact.Metadata.GetValueOrDefault("avg_ms_per_wait");
            if (waitTimeMs < 3_600_000 || avgMs < 1_000) return 0.0;
        }

        var thresholds = GetWaitThresholds(fact.Key);
        if (thresholds == null) return 0.0;

        return ApplyThresholdFormula(fraction, thresholds.Value.concerning, thresholds.Value.critical);
    }

    /// <summary>
    /// Scores blocking/deadlock facts using events-per-hour thresholds.
    /// </summary>
    private static double ScoreBlockingFact(Fact fact)
    {
        var value = fact.Value; // events per hour
        if (value <= 0) return 0.0;

        return fact.Key switch
        {
            // Blocking: concerning >10/hr, critical >50/hr
            "BLOCKING_EVENTS" => ApplyThresholdFormula(value, 10, 50),
            // Deadlocks: concerning >5/hr (no critical — any sustained deadlocking is bad)
            "DEADLOCKS" => ApplyThresholdFormula(value, 5, null),
            _ => 0.0
        };
    }

    /// <summary>
    /// Scores CPU utilization. Value is average SQL CPU %.
    /// </summary>
    private static double ScoreCpuFact(Fact fact)
    {
        return fact.Key switch
        {
            // CPU %: concerning at 75%, critical at 95%
            "CPU_SQL_PERCENT" => ApplyThresholdFormula(fact.Value, 75, 95),
            // CPU spike: value is max CPU %. Concerning at 80%, critical at 95%.
            // Only emitted when max is significantly above average (bursty).
            "CPU_SPIKE" => ApplyThresholdFormula(fact.Value, 80, 95),
            _ => 0.0
        };
    }

    /// <summary>
    /// Scores I/O latency facts. Value is average latency in ms.
    /// </summary>
    private static double ScoreIoFact(Fact fact)
    {
        return fact.Key switch
        {
            // Read latency: concerning at 20ms, critical at 50ms
            "IO_READ_LATENCY_MS" => ApplyThresholdFormula(fact.Value, 20, 50),
            // Write latency: concerning at 10ms, critical at 30ms
            "IO_WRITE_LATENCY_MS" => ApplyThresholdFormula(fact.Value, 10, 30),
            _ => 0.0
        };
    }

    /// <summary>
    /// Scores TempDB usage. Value is usage fraction (reserved / total space).
    /// </summary>
    private static double ScoreTempDbFact(Fact fact)
    {
        return fact.Key switch
        {
            // TempDB usage: concerning at 75%, critical at 90%
            "TEMPDB_USAGE" => ApplyThresholdFormula(fact.Value, 0.75, 0.90),
            _ => 0.0
        };
    }

    /// <summary>
    /// Scores memory grant facts. Only MEMORY_GRANT_PENDING (from resource semaphore) for now.
    /// </summary>
    private static double ScoreMemoryFact(Fact fact)
    {
        return fact.Key switch
        {
            // Grant waiters: concerning at 1, critical at 5
            "MEMORY_GRANT_PENDING" => ApplyThresholdFormula(fact.Value, 1, 5),
            _ => 0.0
        };
    }

    /// <summary>
    /// Scores query-level aggregate facts.
    /// </summary>
    private static double ScoreQueryFact(Fact fact)
    {
        return fact.Key switch
        {
            // Spills: concerning at 100, critical at 1000 in the period
            "QUERY_SPILLS" => ApplyThresholdFormula(fact.Value, 100, 1000),
            // High DOP queries: concerning at 5, critical at 20 in the period
            "QUERY_HIGH_DOP" => ApplyThresholdFormula(fact.Value, 5, 20),
            _ => 0.0
        };
    }

    /// <summary>
    /// Scores perfmon counter facts. PLE is the classic memory pressure indicator.
    /// </summary>
    private static double ScorePerfmonFact(Fact fact)
    {
        return fact.Key switch
        {
            // PLE: lower is worse. Invert: concerning < 300, critical < 60
            "PERFMON_PLE" when fact.Value <= 0 => 1.0,
            "PERFMON_PLE" when fact.Value < 60 => 1.0,
            "PERFMON_PLE" when fact.Value < 300 => 0.5 + 0.5 * (300 - fact.Value) / 240,
            "PERFMON_PLE" => 0.0,
            _ => 0.0
        };
    }

    /// <summary>
    /// Scores database configuration facts.
    /// Auto-shrink and auto-close are always bad.
    /// RCSI-off gets a low base that only becomes visible through amplifiers
    /// when reader/writer lock contention (LCK_M_S, LCK_M_IS) is present.
    /// </summary>
    private static double ScoreDatabaseConfigFact(Fact fact)
    {
        if (fact.Key != "DB_CONFIG") return 0.0;

        var autoShrink = fact.Metadata.GetValueOrDefault("auto_shrink_on_count");
        var autoClose = fact.Metadata.GetValueOrDefault("auto_close_on_count");
        var pageVerifyBad = fact.Metadata.GetValueOrDefault("page_verify_not_checksum_count");
        var rcsiOff = fact.Metadata.GetValueOrDefault("rcsi_off_count");

        var score = 0.0;

        // Auto-shrink, auto-close, bad page verify are always concerning:
        // 0.3 per offending database, capped at 1.0.
        if (autoShrink > 0 || autoClose > 0 || pageVerifyBad > 0)
            score = Math.Max(score, Math.Min((autoShrink + autoClose + pageVerifyBad) * 0.3, 1.0));

        // RCSI-off: low base (0.3) — below display threshold alone.
        // Amplifiers for LCK_M_S/LCK_M_IS push it above 0.5 when reader/writer
        // contention confirms RCSI would help.
        if (rcsiOff > 0)
            score = Math.Max(score, 0.3);

        return score;
    }

    /// <summary>
    /// Scores running job facts. Long-running jobs are a signal.
    /// </summary>
    private static double ScoreJobFact(Fact fact)
    {
        return fact.Key switch
        {
            // Long-running jobs: concerning at 1, critical at 3
            "RUNNING_JOBS" => ApplyThresholdFormula(fact.Value, 1, 3),
            _ => 0.0
        };
    }

    /// <summary>
    /// Scores disk space facts. Low free space is critical.
    /// </summary>
    private static double ScoreDiskFact(Fact fact)
    {
        if (fact.Key != "DISK_SPACE") return 0.0;

        var freePct = fact.Value;
        // Invert: lower free space is worse. Critical < 5%, concerning < 10%,
        // ramping from 0 starting below 20% free.
        if (freePct < 0.05) return 1.0;
        if (freePct < 0.10) return 0.5 + 0.5 * (0.10 - freePct) / 0.05;
        if (freePct < 0.20) return 0.5 * (0.20 - freePct) / 0.10;
        return 0.0;
    }

    /// <summary>
    /// Scores bad actor queries using execution count tier x per-execution impact.
    /// A query running 100K times at 1ms CPU is different from 100K times at 5s CPU.
    /// The tier gets it in the door, per-execution impact determines how bad it is.
    /// </summary>
    private static double ScoreBadActorFact(Fact fact)
    {
        var execCount = fact.Metadata.GetValueOrDefault("execution_count");
        var avgCpuMs = fact.Metadata.GetValueOrDefault("avg_cpu_ms");
        var avgReads = fact.Metadata.GetValueOrDefault("avg_reads");

        // Execution count tier base — higher tiers for more frequent queries
        var tierBase = execCount switch
        {
            < 1_000 => 0.5,
            < 10_000 => 0.7,
            < 100_000 => 0.85,
            _ => 1.0
        };

        // Per-execution impact: use the worse of CPU or reads
        // CPU: concerning at 50ms, critical at 2000ms
        var cpuImpact = ApplyThresholdFormula(avgCpuMs, 50, 2000);
        // Reads: concerning at 5K, critical at 250K
        var readsImpact = ApplyThresholdFormula(avgReads, 5_000, 250_000);

        var impact = Math.Max(cpuImpact, readsImpact);

        // Final: tier * impact. Both must be meaningful.
        // A high-frequency query with trivial per-execution cost won't score.
        // A heavy query that only runs once won't score high either.
        return tierBase * impact;
    }

    /// <summary>
    /// Scores anomaly facts based on deviation from baseline.
    /// At 2σ → 0.5, at 4σ → 1.0. Higher deviations are more severe.
    /// For count-based anomalies (blocking/deadlock spikes), uses ratio instead.
    /// </summary>
    private static double ScoreAnomalyFact(Fact fact)
    {
        // Ordinal comparison: keys are machine identifiers, not linguistic text.
        if (fact.Key.StartsWith("ANOMALY_CPU_SPIKE", StringComparison.Ordinal)
            || fact.Key.StartsWith("ANOMALY_READ_LATENCY", StringComparison.Ordinal)
            || fact.Key.StartsWith("ANOMALY_WRITE_LATENCY", StringComparison.Ordinal))
        {
            // Deviation-based scoring: 2σ = 0.5, 4σ = 1.0
            var deviation = fact.Metadata.GetValueOrDefault("deviation_sigma");
            var confidence = fact.Metadata.GetValueOrDefault("confidence", 1.0);
            if (deviation < 2.0) return 0.0;
            var baseScore = 0.5 + 0.5 * Math.Min((deviation - 2.0) / 2.0, 1.0);
            return baseScore * confidence;
        }

        if (fact.Key.StartsWith("ANOMALY_WAIT_", StringComparison.Ordinal))
        {
            // Ratio-based scoring: 5x = 0.5, 20x = 1.0
            var ratio = fact.Metadata.GetValueOrDefault("ratio");
            if (ratio < 5) return 0.0;
            return 0.5 + 0.5 * Math.Min((ratio - 5.0) / 15.0, 1.0);
        }

        if (fact.Key.StartsWith("ANOMALY_BLOCKING_SPIKE", StringComparison.Ordinal)
            || fact.Key.StartsWith("ANOMALY_DEADLOCK_SPIKE", StringComparison.Ordinal))
        {
            // Ratio-based: 3x = 0.5, 10x = 1.0
            var ratio = fact.Metadata.GetValueOrDefault("ratio");
            if (ratio < 3) return 0.0;
            return 0.5 + 0.5 * Math.Min((ratio - 3.0) / 7.0, 1.0);
        }

        return 0.0;
    }

    /// <summary>
    /// Generic threshold formula used by waits, latency, and count-based metrics.
    /// Critical == null means "concerning only" — hitting concerning = 1.0.
    /// Below concerning the score ramps linearly to 0.5; between concerning and
    /// critical it ramps from 0.5 to 1.0.
    /// </summary>
    internal static double ApplyThresholdFormula(double value, double concerning, double? critical)
    {
        if (value <= 0) return 0.0;

        if (critical == null)
            return Math.Min(value / concerning, 1.0);

        if (value >= critical.Value)
            return 1.0;

        if (value >= concerning)
            return 0.5 + 0.5 * (value - concerning) / (critical.Value - concerning);

        return 0.5 * (value / concerning);
    }

    /// <summary>
    /// Returns amplifier definitions for a fact. Each amplifier has a description,
    /// a boost value, and a predicate that evaluates against the current fact set.
    /// Amplifiers are defined per wait type and will grow as more fact categories are added.
    /// </summary>
    private static List<AmplifierDefinition> GetAmplifiers(Fact fact)
    {
        return fact.Key switch
        {
            "SOS_SCHEDULER_YIELD" => SosSchedulerYieldAmplifiers(),
            "CXPACKET" => CxPacketAmplifiers(),
            "THREADPOOL" => ThreadpoolAmplifiers(),
            "PAGEIOLATCH_SH" or "PAGEIOLATCH_EX" => PageiolatchAmplifiers(),
            "LATCH_EX" or "LATCH_SH" => LatchAmplifiers(),
            "BLOCKING_EVENTS" => BlockingEventsAmplifiers(),
            "DEADLOCKS" => DeadlockAmplifiers(),
            "LCK" => LckAmplifiers(),
            "CPU_SQL_PERCENT" => CpuSqlPercentAmplifiers(),
            "CPU_SPIKE" => CpuSpikeAmplifiers(),
            "IO_READ_LATENCY_MS" => IoReadLatencyAmplifiers(),
            "IO_WRITE_LATENCY_MS" => IoWriteLatencyAmplifiers(),
            "MEMORY_GRANT_PENDING" => MemoryGrantAmplifiers(),
            "QUERY_SPILLS" => QuerySpillAmplifiers(),
            "PERFMON_PLE" => PleAmplifiers(),
            "DB_CONFIG" => DbConfigAmplifiers(),
            "DISK_SPACE" => DiskSpaceAmplifiers(),
            _ => []
        };
    }

    /// <summary>
    /// SOS_SCHEDULER_YIELD: CPU starvation confirmed by parallelism waits.
    /// More amplifiers added when config and CPU utilization facts are available.
    /// </summary>
    private static List<AmplifierDefinition> SosSchedulerYieldAmplifiers() =>
    [
        new()
        {
            Description = "CXPACKET significant — parallelism consuming schedulers",
            Boost = 0.2,
            Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10)
        },
        new()
        {
            Description = "THREADPOOL waits present — escalating to thread exhaustion",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("THREADPOOL", out var tp) && tp.BaseSeverity > 0
        },
        new()
        {
            Description = "SQL Server CPU > 80% — confirmed CPU saturation",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("CPU_SQL_PERCENT", out var cpu) && cpu.Value >= 80
        }
    ];

    /// <summary>
    /// CXPACKET: parallelism waits confirmed by CPU pressure and bad config.
    /// CXCONSUMER is grouped into CXPACKET by the collector.
    /// </summary>
    private static List<AmplifierDefinition> CxPacketAmplifiers() =>
    [
        new()
        {
            Description = "SOS_SCHEDULER_YIELD high — CPU starvation from parallelism",
            Boost = 0.3,
            Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.25)
        },
        new()
        {
            Description = "THREADPOOL waits present — thread exhaustion cascade",
            Boost = 0.4,
            Predicate = facts => facts.TryGetValue("THREADPOOL", out var tp) && tp.BaseSeverity > 0
        },
        new()
        {
            Description = "CTFP at default (5) — too low for most workloads",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("CONFIG_CTFP", out var ctfp) && ctfp.Value <= 5
        },
        new()
        {
            Description = "MAXDOP at 0 — unlimited parallelism",
            Boost = 0.2,
            Predicate = facts => facts.TryGetValue("CONFIG_MAXDOP", out var maxdop) && maxdop.Value == 0
        },
        new()
        {
            Description = "Queries running with DOP > 8 — excessive parallelism confirmed",
            Boost = 0.2,
            Predicate = facts => facts.TryGetValue("QUERY_HIGH_DOP", out var dop) && dop.BaseSeverity > 0
        }
    ];

    /// <summary>
    /// THREADPOOL: thread exhaustion confirmed by parallelism pressure.
    /// Blocking and config amplifiers added later.
    /// </summary>
    private static List<AmplifierDefinition> ThreadpoolAmplifiers() =>
    [
        new()
        {
            Description = "CXPACKET significant — parallel queries consuming thread pool",
            Boost = 0.2,
            Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10)
        },
        new()
        {
            Description = "Lock contention present — blocked queries holding worker threads",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("LCK", out var lck) && lck.BaseSeverity >= 0.5
        }
    ];

    /// <summary>
    /// PAGEIOLATCH: memory pressure confirmed by other waits.
    /// Buffer pool, query, and config amplifiers added when those facts are available.
    /// </summary>
    private static List<AmplifierDefinition> PageiolatchAmplifiers() =>
    [
        new()
        {
            Description = "SOS_SCHEDULER_YIELD elevated — CPU pressure alongside I/O pressure",
            Boost = 0.1,
            Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.15)
        },
        new()
        {
            Description = "Read latency > 20ms — confirmed disk I/O bottleneck",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("IO_READ_LATENCY_MS", out var io) && io.Value >= 20
        },
        new()
        {
            Description = "Memory grant waiters present — grants competing with buffer pool",
            Boost = 0.2,
            Predicate = facts => facts.TryGetValue("MEMORY_GRANT_PENDING", out var mg) && mg.Value >= 1
        }
    ];

    /// <summary>
    /// LATCH_EX/LATCH_SH: in-memory page latch contention.
    /// Common causes: TempDB allocation contention, hot page updates,
    /// parallel insert into heaps or narrow indexes.
    /// </summary>
    private static List<AmplifierDefinition> LatchAmplifiers() =>
    [
        new()
        {
            Description = "TempDB usage elevated — latch contention likely on TempDB allocation pages",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("TEMPDB_USAGE", out var t) && t.BaseSeverity > 0
        },
        new()
        {
            Description = "CXPACKET significant — parallel operations amplifying latch contention",
            Boost = 0.2,
            Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10)
        },
        new()
        {
            Description = "SOS_SCHEDULER_YIELD elevated — latch spinning contributing to CPU pressure",
            Boost = 0.2,
            Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.15)
        }
    ];

    /// <summary>
    /// BLOCKING_EVENTS: blocking confirmed by lock waits and deadlocks.
    /// </summary>
    private static List<AmplifierDefinition> BlockingEventsAmplifiers() =>
    [
        new()
        {
            Description = "Head blocker sleeping with open transaction — abandoned transaction pattern",
            Boost = 0.4,
            Predicate = facts => facts.TryGetValue("BLOCKING_EVENTS", out var f)
                && f.Metadata.GetValueOrDefault("sleeping_blocker_count") > 0
        },
        new()
        {
            Description = "Lock contention waits elevated — blocking visible in wait stats",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("LCK", out var lck) && lck.BaseSeverity >= 0.3
        },
        new()
        {
            Description = "Deadlocks also present — blocking escalating to deadlocks",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("DEADLOCKS", out var dl) && dl.BaseSeverity > 0
        }
    ];

    /// <summary>
    /// DEADLOCKS: deadlocks confirmed by blocking patterns.
    /// </summary>
    private static List<AmplifierDefinition> DeadlockAmplifiers() =>
    [
        new()
        {
            Description = "Blocking events also present — systemic contention pattern",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("BLOCKING_EVENTS", out var be) && be.BaseSeverity > 0
        },
        new()
        {
            Description = "Reader/writer lock waits present — RCSI could prevent some deadlocks",
            Boost = 0.3,
            Predicate = facts => (facts.TryGetValue("LCK_M_S", out var s) && s.BaseSeverity > 0)
                || (facts.TryGetValue("LCK_M_IS", out var i) && i.BaseSeverity > 0)
        },
        new()
        {
            Description = "Databases without RCSI — reader/writer isolation amplifying deadlocks",
            Boost = 0.2,
            Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db) && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0
        }
    ];

    /// <summary>
    /// LCK (grouped general lock contention): confirmed by blocking reports and deadlocks.
    /// </summary>
    private static List<AmplifierDefinition> LckAmplifiers() =>
    [
        new()
        {
            Description = "Blocked process reports present — confirmed blocking events",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("BLOCKING_EVENTS", out var be) && be.BaseSeverity > 0
        },
        new()
        {
            Description = "Deadlocks present — lock contention escalating to deadlocks",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("DEADLOCKS", out var dl) && dl.BaseSeverity > 0
        },
        new()
        {
            Description = "THREADPOOL waits present — blocking causing thread exhaustion",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("THREADPOOL", out var tp) && tp.BaseSeverity > 0
        }
    ];

    /// <summary>
    /// PLE: memory pressure confirmed by PAGEIOLATCH and RESOURCE_SEMAPHORE.
    /// </summary>
    private static List<AmplifierDefinition> PleAmplifiers() =>
    [
        new()
        {
            Description = "PAGEIOLATCH waits present — buffer pool misses confirm memory pressure",
            Boost = 0.3,
            Predicate = facts => HasSignificantWait(facts, "PAGEIOLATCH_SH", 0.10)
                || HasSignificantWait(facts, "PAGEIOLATCH_EX", 0.10)
        },
        new()
        {
            Description = "RESOURCE_SEMAPHORE waits — memory grants competing with buffer pool",
            Boost = 0.2,
            Predicate = facts => facts.TryGetValue("RESOURCE_SEMAPHORE", out var rs) && rs.BaseSeverity > 0
        }
    ];

    /// <summary>
    /// DB_CONFIG: database misconfiguration amplified by related symptoms.
    /// RCSI-off amplifiers only fire when reader/writer lock contention is present —
    /// LCK_M_S (shared lock waits) and LCK_M_IS (intent-shared) are readers blocked
    /// by writers. RCSI eliminates these. Writer/writer conflicts (LCK_M_X, LCK_M_U)
    /// are NOT helped by RCSI and should not trigger this amplifier.
    /// </summary>
    private static List<AmplifierDefinition> DbConfigAmplifiers() =>
    [
        new()
        {
            Description = "I/O latency elevated — auto_shrink may be causing fragmentation and I/O pressure",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("IO_READ_LATENCY_MS", out var io) && io.BaseSeverity > 0
        },
        new()
        {
            Description = "LCK_M_S waits — readers blocked by writers, RCSI would eliminate shared lock waits",
            Boost = 0.5,
            Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db)
                && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0
                && facts.TryGetValue("LCK_M_S", out var lckS) && lckS.BaseSeverity > 0
        },
        new()
        {
            Description = "LCK_M_IS waits — intent-shared locks blocked by writers, RCSI would eliminate these",
            Boost = 0.4,
            Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db)
                && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0
                && facts.TryGetValue("LCK_M_IS", out var lckIS) && lckIS.BaseSeverity > 0
        },
        new()
        {
            Description = "Deadlocks with reader/writer lock waits — RCSI eliminates reader/writer deadlocks",
            Boost = 0.4,
            Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db)
                && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0
                && facts.TryGetValue("DEADLOCKS", out var dl) && dl.BaseSeverity > 0
                && (facts.TryGetValue("LCK_M_S", out var s) && s.BaseSeverity > 0
                    || facts.TryGetValue("LCK_M_IS", out var i) && i.BaseSeverity > 0)
        }
    ];

    /// <summary>
    /// DISK_SPACE: low disk space amplified by I/O activity and TempDB pressure.
    /// </summary>
    private static List<AmplifierDefinition> DiskSpaceAmplifiers() =>
    [
        new()
        {
            Description = "TempDB usage elevated — growing TempDB on a nearly full volume",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("TEMPDB_USAGE", out var t) && t.BaseSeverity > 0
        },
        new()
        {
            Description = "Query spills present — spills to disk on a nearly full volume",
            Boost = 0.2,
            Predicate = facts => facts.TryGetValue("QUERY_SPILLS", out var s) && s.BaseSeverity > 0
        }
    ];

    /// <summary>
    /// CPU_SQL_PERCENT: CPU saturation confirmed by scheduler yields and parallelism.
    /// </summary>
    private static List<AmplifierDefinition> CpuSqlPercentAmplifiers() =>
    [
        new()
        {
            Description = "SOS_SCHEDULER_YIELD elevated — scheduler pressure confirms CPU saturation",
            Boost = 0.3,
            Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.25)
        },
        new()
        {
            Description = "CXPACKET significant — parallelism contributing to CPU load",
            Boost = 0.2,
            Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10)
        }
    ];

    /// <summary>
    /// CPU_SPIKE: bursty CPU event (max >> average) confirmed by scheduler
    /// pressure, parallelism, or query spills during the spike.
    /// </summary>
    private static List<AmplifierDefinition> CpuSpikeAmplifiers() =>
    [
        new()
        {
            Description = "SOS_SCHEDULER_YIELD present — scheduler pressure during CPU spike",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("SOS_SCHEDULER_YIELD", out var sos) && sos.BaseSeverity > 0
        },
        new()
        {
            Description = "CXPACKET significant — parallelism contributing to CPU spike",
            Boost = 0.2,
            Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10)
        },
        new()
        {
            Description = "THREADPOOL waits present — CPU spike causing thread exhaustion",
            Boost = 0.4,
            Predicate = facts => facts.TryGetValue("THREADPOOL", out var tp) && tp.BaseSeverity > 0
        }
    ];

    /// <summary>
    /// IO_READ_LATENCY_MS: read latency confirmed by PAGEIOLATCH waits.
    /// </summary>
    private static List<AmplifierDefinition> IoReadLatencyAmplifiers() =>
    [
        new()
        {
            Description = "PAGEIOLATCH waits elevated — buffer pool misses confirm I/O pressure",
            Boost = 0.3,
            Predicate = facts => HasSignificantWait(facts, "PAGEIOLATCH_SH", 0.10)
                || HasSignificantWait(facts, "PAGEIOLATCH_EX", 0.10)
        }
    ];

    /// <summary>
    /// IO_WRITE_LATENCY_MS: write latency confirmed by WRITELOG waits.
    /// </summary>
    private static List<AmplifierDefinition> IoWriteLatencyAmplifiers() =>
    [
        new()
        {
            Description = "WRITELOG waits elevated — transaction log I/O bottleneck confirmed",
            Boost = 0.3,
            Predicate = facts => HasSignificantWait(facts, "WRITELOG", 0.05)
        }
    ];

    /// <summary>
    /// MEMORY_GRANT_PENDING: grant pressure confirmed by RESOURCE_SEMAPHORE waits and spills.
    /// </summary>
    private static List<AmplifierDefinition> MemoryGrantAmplifiers() =>
    [
        new()
        {
            Description = "RESOURCE_SEMAPHORE waits present — memory grant pressure in wait stats",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("RESOURCE_SEMAPHORE", out var rs) && rs.BaseSeverity > 0
        },
        new()
        {
            Description = "Query spills present — queries running with insufficient memory grants",
            Boost = 0.2,
            Predicate = facts => facts.TryGetValue("QUERY_SPILLS", out var s) && s.BaseSeverity > 0
        }
    ];

    /// <summary>
    /// QUERY_SPILLS: spills confirmed by memory grant pressure.
    /// </summary>
    private static List<AmplifierDefinition> QuerySpillAmplifiers() =>
    [
        new()
        {
            Description = "Memory grant waiters present — insufficient memory for query grants",
            Boost = 0.3,
            Predicate = facts => facts.TryGetValue("MEMORY_GRANT_PENDING", out var mg) && mg.Value >= 1
        },
        new()
        {
            Description = "RESOURCE_SEMAPHORE waits — grant pressure visible in wait stats",
            Boost = 0.2,
            Predicate = facts => facts.TryGetValue("RESOURCE_SEMAPHORE", out var rs) && rs.BaseSeverity > 0
        }
    ];

    /// <summary>
    /// Checks if a wait type is present with at least the given fraction of period.
    /// </summary>
    private static bool HasSignificantWait(IReadOnlyDictionary<string, Fact> facts, string waitType, double minFraction)
    {
        return facts.TryGetValue(waitType, out var fact) && fact.Value >= minFraction;
    }

    /// <summary>
    /// Default thresholds for wait types (fraction of examined period).
    /// Returns null for unrecognized waits — they get severity 0.
    /// </summary>
    private static (double concerning, double? critical)? GetWaitThresholds(string waitType)
    {
        return waitType switch
        {
            // CPU pressure
            "SOS_SCHEDULER_YIELD" => (0.75, null),
            "THREADPOOL" => (0.01, null),

            // Memory pressure
            "PAGEIOLATCH_SH" => (0.25, null),
            "PAGEIOLATCH_EX" => (0.25, null),
            "RESOURCE_SEMAPHORE" => (0.01, null),

            // Parallelism (CXCONSUMER is grouped into CXPACKET by collector)
            "CXPACKET" => (0.25, null),

            // Log I/O
            "WRITELOG" => (0.10, null),

            // Lock waits — serializable/repeatable read lock modes
            "LCK_M_RS_S" => (0.01, null),
            "LCK_M_RS_U" => (0.01, null),
            "LCK_M_RIn_NL" => (0.01, null),
            "LCK_M_RIn_S" => (0.01, null),
            "LCK_M_RIn_U" => (0.01, null),
            "LCK_M_RIn_X" => (0.01, null),
            "LCK_M_RX_S" => (0.01, null),
            "LCK_M_RX_U" => (0.01, null),
            "LCK_M_RX_X" => (0.01, null),

            // Reader/writer blocking locks
            "LCK_M_S" => (0.05, null),
            "LCK_M_IS" => (0.05, null),

            // General lock contention (grouped X, U, IX, SIX, BU, etc.)
            "LCK" => (0.10, null),

            // Schema locks — DDL operations, index rebuilds
            "SCH_M" => (0.01, null),

            // Latch contention — page latch (not I/O latch) indicates
            // in-memory contention, often TempDB allocation or hot pages
            "LATCH_EX" => (0.25, null),
            "LATCH_SH" => (0.25, null),

            _ => null
        };
    }
}
+
/// <summary>
/// An amplifier definition: a named predicate that boosts severity when matched.
/// </summary>
internal class AmplifierDefinition
{
    /// <summary>Human-readable explanation recorded in the audit trail.</summary>
    public string Description { get; set; } = string.Empty;

    /// <summary>Additive boost applied to the base severity when the predicate matches.</summary>
    public double Boost { get; set; }

    /// <summary>
    /// Evaluated against the scored fact set (keyed by fact key); defaults to
    /// never-match so an unconfigured amplifier is inert rather than throwing.
    /// </summary>
    public Func<IReadOnlyDictionary<string, Fact>, bool> Predicate { get; set; } = _ => false;
}
diff --git a/Dashboard/Analysis/IFactCollector.cs b/Dashboard/Analysis/IFactCollector.cs
new file mode 100644
index 00000000..38b6abbe
--- /dev/null
+++ b/Dashboard/Analysis/IFactCollector.cs
@@ -0,0 +1,31 @@
+using System;
+using System.Collections.Generic;
+using System.Threading.Tasks;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
/// <summary>
/// Context for an analysis run — what server, what time range.
/// </summary>
public class AnalysisContext
{
    public int ServerId { get; set; }
    public string ServerName { get; set; } = string.Empty;
    public DateTime TimeRangeStart { get; set; }
    public DateTime TimeRangeEnd { get; set; }

    // NOTE(review): element type was lost in transit; assuming keys/patterns to
    // exclude from analysis — confirm against the collectors that read this.
    public List<string> Exclusions { get; set; } = [];

    /// <summary>
    /// Duration of the examined period in milliseconds.
    /// Negative when TimeRangeEnd precedes TimeRangeStart — callers must order them.
    /// </summary>
    public double PeriodDurationMs => (TimeRangeEnd - TimeRangeStart).TotalMilliseconds;
}
+
/// <summary>
/// Collects facts from a data source for analysis.
/// Implementations are per-app: DuckDB for Lite, SQL Server for Dashboard.
/// </summary>
public interface IFactCollector
{
    /// <summary>
    /// Collects all facts for the server and time range described by
    /// <paramref name="context"/>.
    /// </summary>
    Task<List<Fact>> CollectFactsAsync(AnalysisContext context);
}
diff --git a/Dashboard/Analysis/IPlanFetcher.cs b/Dashboard/Analysis/IPlanFetcher.cs
new file mode 100644
index 00000000..e77fea18
--- /dev/null
+++ b/Dashboard/Analysis/IPlanFetcher.cs
@@ -0,0 +1,19 @@
+using System.Threading.Tasks;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
/// <summary>
/// Fetches execution plan XML from SQL Server on demand.
/// Platform-agnostic interface — Lite implements via RemoteCollectorService's
/// SQL connection, Dashboard implements via DatabaseService's connection.
/// Used by DrillDownCollector to analyze plans for high-impact findings
/// without storing plan XML in DuckDB or SQL Server tables.
/// </summary>
public interface IPlanFetcher
{
    /// <summary>
    /// Fetches the execution plan XML for a given plan_handle.
    /// Returns null if the plan is no longer in cache.
    /// </summary>
    Task<string?> FetchPlanXmlAsync(int serverId, string planHandle);
}
diff --git a/Dashboard/Analysis/InferenceEngine.cs b/Dashboard/Analysis/InferenceEngine.cs
new file mode 100644
index 00000000..976bef43
--- /dev/null
+++ b/Dashboard/Analysis/InferenceEngine.cs
@@ -0,0 +1,165 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Security.Cryptography;
+using System.Text;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
/// <summary>
/// Greedy traversal engine that builds analysis stories from scored facts
/// and the relationship graph.
///
/// Algorithm:
/// 1. Start at the highest-severity fact as entry point
/// 2. Evaluate all edge predicates from current node
/// 3. Follow edge to highest-severity destination (that hasn't been visited)
/// 4. Repeat until leaf (no active edges or all destinations visited)
/// 5. The path IS the story
/// 6. Mark traversed facts as consumed, repeat from next highest-severity
/// 7. Stop when remaining facts are below 0.5 severity
/// </summary>
public class InferenceEngine
{
    private const double MinimumSeverityThreshold = 0.5;
    private const int MaxPathDepth = 10; // Safety limit against pathological/cyclic graphs

    private readonly RelationshipGraph _graph;

    public InferenceEngine(RelationshipGraph graph)
    {
        // Fail fast instead of an NRE on first traversal.
        ArgumentNullException.ThrowIfNull(graph);
        _graph = graph;
    }

    /// <summary>
    /// Builds analysis stories by traversing the relationship graph
    /// starting from the highest-severity facts.
    /// Returns a single "absolution" story when facts exist but none
    /// cross the severity threshold (i.e., the server looks healthy).
    /// </summary>
    public List<AnalysisStory> BuildStories(List<Fact> facts)
    {
        var stories = new List<AnalysisStory>();
        var factsByKey = facts
            .Where(f => f.Severity > 0)
            .ToDictionary(f => f.Key, f => f);
        var consumed = new HashSet<string>();

        // Process facts in severity order
        var entryPoints = facts
            .Where(f => f.Severity >= MinimumSeverityThreshold)
            .OrderByDescending(f => f.Severity)
            .ToList();

        foreach (var entryFact in entryPoints)
        {
            // Skip entry points already explained by an earlier story.
            if (consumed.Contains(entryFact.Key))
                continue;

            var path = Traverse(entryFact.Key, factsByKey, consumed);

            // Mark all facts in this path as consumed
            foreach (var node in path)
                consumed.Add(node);

            var story = BuildStory(path, factsByKey);
            stories.Add(story);
        }

        // Check for absolution — if no stories were generated at all
        if (stories.Count == 0 && facts.Count > 0)
        {
            stories.Add(new AnalysisStory
            {
                RootFactKey = "server_health",
                RootFactValue = 0,
                Severity = 0,
                Confidence = 1.0,
                Category = "absolution",
                Path = ["server_health"],
                StoryPath = "server_health",
                StoryPathHash = ComputeHash("server_health"),
                StoryText = string.Empty,
                IsAbsolution = true
            });
        }

        return stories;
    }

    /// <summary>
    /// Greedy traversal from an entry point through the relationship graph.
    /// Returns the path as a list of fact keys (always containing at least the start).
    /// </summary>
    private List<string> Traverse(string startKey,
        Dictionary<string, Fact> factsByKey,
        HashSet<string> consumed)
    {
        var path = new List<string> { startKey };
        var visited = new HashSet<string> { startKey };
        var current = startKey;

        for (var depth = 0; depth < MaxPathDepth; depth++)
        {
            var activeEdges = _graph.GetActiveEdges(current, factsByKey);

            // Filter to destinations not already in this path and not consumed by prior stories
            var candidates = activeEdges
                .Where(e => !visited.Contains(e.Destination) && !consumed.Contains(e.Destination))
                .Where(e => factsByKey.ContainsKey(e.Destination))
                .OrderByDescending(e => factsByKey[e.Destination].Severity)
                .ToList();

            if (candidates.Count == 0)
                break; // Leaf node — no more edges to follow

            var best = candidates[0];
            path.Add(best.Destination);
            visited.Add(best.Destination);
            current = best.Destination;
        }

        return path;
    }

    /// <summary>
    /// Builds an AnalysisStory from a traversal path.
    /// </summary>
    private static AnalysisStory BuildStory(List<string> path, Dictionary<string, Fact> factsByKey)
    {
        var rootFact = factsByKey.GetValueOrDefault(path[0]);
        var leafKey = path.Count > 1 ? path[^1] : null;
        var leafFact = leafKey != null ? factsByKey.GetValueOrDefault(leafKey) : null;

        var storyPath = string.Join(" → ", path);
        var category = rootFact?.Source ?? "unknown";

        // Single-node paths get confidence 1.0 (we found the symptom, just no deeper
        // cause); multi-node paths use (n-1)/n — longer corroborated chains score higher.
        var confidence = path.Count == 1 ? 1.0 : (path.Count - 1.0) / path.Count;

        return new AnalysisStory
        {
            RootFactKey = path[0],
            RootFactValue = rootFact?.Severity ?? 0,
            Severity = rootFact?.Severity ?? 0,
            Confidence = confidence,
            Category = category,
            Path = path,
            StoryPath = storyPath,
            StoryPathHash = ComputeHash(storyPath),
            StoryText = string.Empty,
            LeafFactKey = leafKey,
            LeafFactValue = leafFact?.Severity,
            FactCount = path.Count,
            IsAbsolution = false
        };
    }

    /// <summary>
    /// Stable hash for story path deduplication and muting.
    /// First 16 hex chars (64 bits) of SHA-256, lowercase.
    /// </summary>
    private static string ComputeHash(string storyPath)
    {
        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(storyPath));
        return Convert.ToHexString(bytes).ToLowerInvariant()[..16];
    }
}
diff --git a/Dashboard/Analysis/RelationshipGraph.cs b/Dashboard/Analysis/RelationshipGraph.cs
new file mode 100644
index 00000000..2650a7bf
--- /dev/null
+++ b/Dashboard/Analysis/RelationshipGraph.cs
@@ -0,0 +1,325 @@
+using System.Collections.Generic;
+using System.Linq;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
+/// <summary>
+/// Defines conditional edges between facts. The graph encodes Erik's diagnostic
+/// reasoning: "when I see symptom X, what do I check next?"
+///
+/// Edges are code-defined (not data-driven) because they represent expert knowledge.
+/// Each edge has a predicate that evaluates against the current fact set to decide
+/// if the edge should be followed.
+///
+/// Built incrementally — new edges are added as new fact categories become available.
+/// </summary>
+public class RelationshipGraph
+{
+ // Adjacency list keyed by source fact key; each value holds that key's outgoing edges.
+ private readonly Dictionary> _edges = new();
+
+ /// <summary>Builds the complete edge set once at construction time.</summary>
+ public RelationshipGraph()
+ {
+ BuildGraph();
+ }
+
+ /// <summary>
+ /// Returns all edges originating from the given fact key,
+ /// filtered to only those whose predicates are true.
+ /// </summary>
+ public List GetActiveEdges(string sourceKey, IReadOnlyDictionary factsByKey)
+ {
+ // Unknown source key — no outgoing edges.
+ if (!_edges.TryGetValue(sourceKey, out var edges))
+ return [];
+
+ // Evaluate each edge's predicate against the current fact set.
+ return edges.Where(e => e.Predicate(factsByKey)).ToList();
+ }
+
+ /// <summary>
+ /// Returns all defined edges from a source (regardless of predicate).
+ /// Used for audit trail logging.
+ /// </summary>
+ public List GetAllEdges(string sourceKey)
+ {
+ // NOTE(review): when the key exists this returns the internal mutable list —
+ // a caller could mutate graph state. Consider returning a copy or read-only view.
+ return _edges.TryGetValue(sourceKey, out var edges) ? edges : [];
+ }
+
+ /// <summary>
+ /// Registers a directed edge from <paramref name="source"/> to <paramref name="destination"/>.
+ /// The predicate decides at traversal time whether the edge is active.
+ /// </summary>
+ private void AddEdge(string source, string destination, string category,
+ string predicateDescription, System.Func, bool> predicate)
+ {
+ // Single dictionary lookup instead of ContainsKey + indexer-set + indexer-get (triple lookup).
+ if (!_edges.TryGetValue(source, out var edges))
+ _edges[source] = edges = [];
+
+ edges.Add(new Edge
+ {
+ Source = source,
+ Destination = destination,
+ Category = category,
+ PredicateDescription = predicateDescription,
+ Predicate = predicate
+ });
+ }
+
+ /// <summary>
+ /// Builds all edges in the relationship graph.
+ /// Organized by entry point category matching the design doc.
+ /// </summary>
+ private void BuildGraph()
+ {
+ BuildCpuPressureEdges();
+ BuildMemoryPressureEdges();
+ BuildBlockingEdges();
+ BuildIoPressureEdges();
+ BuildLatchEdges();
+ BuildTempDbEdges();
+ BuildQueryEdges();
+ }
+
+ /* ── CPU Pressure ── */
+
+ /// <summary>Edges for CPU-pressure symptoms (SOS_SCHEDULER_YIELD, CXPACKET, THREADPOOL, CPU_*).
+ /// Severity thresholds are heuristic cutoffs chosen per edge.</summary>
+ private void BuildCpuPressureEdges()
+ {
+ // SOS_SCHEDULER_YIELD → CXPACKET (parallelism contributing to CPU)
+ AddEdge("SOS_SCHEDULER_YIELD", "CXPACKET", "cpu_pressure",
+ "CXPACKET significant — parallelism consuming schedulers",
+ facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5);
+
+ // SOS_SCHEDULER_YIELD → THREADPOOL (escalating to thread exhaustion)
+ AddEdge("SOS_SCHEDULER_YIELD", "THREADPOOL", "cpu_pressure",
+ "THREADPOOL waits present — escalating to thread exhaustion",
+ facts => HasFact(facts, "THREADPOOL") && facts["THREADPOOL"].Severity > 0);
+
+ // CXPACKET → SOS (CPU starvation from parallelism)
+ // NOTE(review): this predicate tests .Value where sibling edges test .Severity —
+ // confirm 0.25 is intended on the raw value scale.
+ AddEdge("CXPACKET", "SOS_SCHEDULER_YIELD", "parallelism",
+ "SOS_SCHEDULER_YIELD elevated — CPU starvation from parallelism",
+ facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Value >= 0.25);
+
+ // CXPACKET → THREADPOOL (thread exhaustion cascade)
+ AddEdge("CXPACKET", "THREADPOOL", "parallelism",
+ "THREADPOOL waits present — thread exhaustion cascade",
+ facts => HasFact(facts, "THREADPOOL") && facts["THREADPOOL"].Severity > 0);
+
+ // THREADPOOL → CXPACKET (parallel queries consuming thread pool)
+ AddEdge("THREADPOOL", "CXPACKET", "thread_exhaustion",
+ "CXPACKET significant — parallel queries consuming thread pool",
+ facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5);
+
+ // THREADPOOL → LCK (blocking causing thread buildup — stuck queries holding threads)
+ AddEdge("THREADPOOL", "LCK", "thread_exhaustion",
+ "Lock contention — blocked queries holding worker threads",
+ facts => HasFact(facts, "LCK") && facts["LCK"].Severity >= 0.5);
+
+ // CPU_SQL_PERCENT → SOS_SCHEDULER_YIELD (CPU confirms scheduler pressure)
+ AddEdge("CPU_SQL_PERCENT", "SOS_SCHEDULER_YIELD", "cpu_pressure",
+ "Scheduler yields confirm CPU saturation",
+ facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Severity >= 0.5);
+
+ // CPU_SQL_PERCENT → CXPACKET (CPU load from parallelism)
+ AddEdge("CPU_SQL_PERCENT", "CXPACKET", "cpu_pressure",
+ "Parallelism waits contributing to CPU load",
+ facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5);
+
+ // SOS_SCHEDULER_YIELD → CPU_SQL_PERCENT (scheduler yields with high CPU)
+ AddEdge("SOS_SCHEDULER_YIELD", "CPU_SQL_PERCENT", "cpu_pressure",
+ "SQL CPU > 80% — confirms CPU is the bottleneck",
+ facts => HasFact(facts, "CPU_SQL_PERCENT") && facts["CPU_SQL_PERCENT"].Value >= 80);
+
+ // CPU_SPIKE → SOS_SCHEDULER_YIELD (spike confirmed by scheduler pressure)
+ AddEdge("CPU_SPIKE", "SOS_SCHEDULER_YIELD", "cpu_spike",
+ "Scheduler yields — CPU spike caused scheduler starvation",
+ facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].BaseSeverity > 0);
+
+ // CPU_SPIKE → CXPACKET (spike from parallelism)
+ AddEdge("CPU_SPIKE", "CXPACKET", "cpu_spike",
+ "Parallelism waits — parallel queries contributing to CPU spike",
+ facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.3);
+ }
+
+ /* ── Memory Pressure ── */
+
+ /// <summary>Edges linking memory-grant pressure, buffer-pool pressure, spills, and read latency.</summary>
+ private void BuildMemoryPressureEdges()
+ {
+ // PAGEIOLATCH_SH → RESOURCE_SEMAPHORE (memory grants contributing to buffer pressure)
+ AddEdge("PAGEIOLATCH_SH", "RESOURCE_SEMAPHORE", "memory_pressure",
+ "RESOURCE_SEMAPHORE present — memory grants competing with buffer pool",
+ facts => HasFact(facts, "RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].Severity > 0);
+
+ // PAGEIOLATCH_EX → same
+ AddEdge("PAGEIOLATCH_EX", "RESOURCE_SEMAPHORE", "memory_pressure",
+ "RESOURCE_SEMAPHORE present — memory grants competing with buffer pool",
+ facts => HasFact(facts, "RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].Severity > 0);
+
+ // RESOURCE_SEMAPHORE → PAGEIOLATCH (downstream I/O cascade)
+ AddEdge("RESOURCE_SEMAPHORE", "PAGEIOLATCH_SH", "memory_grants",
+ "PAGEIOLATCH elevated — memory grant pressure causing buffer pool shrinkage",
+ facts => HasFact(facts, "PAGEIOLATCH_SH") && facts["PAGEIOLATCH_SH"].Severity >= 0.5);
+
+ // RESOURCE_SEMAPHORE → MEMORY_GRANT_PENDING (grant pressure confirmed by semaphore waiters)
+ AddEdge("RESOURCE_SEMAPHORE", "MEMORY_GRANT_PENDING", "memory_grants",
+ "Memory grant waiters present — queries queued for memory",
+ facts => HasFact(facts, "MEMORY_GRANT_PENDING") && facts["MEMORY_GRANT_PENDING"].BaseSeverity > 0);
+
+ // RESOURCE_SEMAPHORE → QUERY_SPILLS (grant pressure causing spills)
+ AddEdge("RESOURCE_SEMAPHORE", "QUERY_SPILLS", "memory_grants",
+ "Query spills present — queries running with insufficient memory",
+ facts => HasFact(facts, "QUERY_SPILLS") && facts["QUERY_SPILLS"].BaseSeverity > 0);
+
+ // MEMORY_GRANT_PENDING → RESOURCE_SEMAPHORE (waiters confirm RESOURCE_SEMAPHORE waits)
+ AddEdge("MEMORY_GRANT_PENDING", "RESOURCE_SEMAPHORE", "memory_grants",
+ "RESOURCE_SEMAPHORE waits — grant pressure visible in wait stats",
+ facts => HasFact(facts, "RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].Severity > 0);
+
+ // MEMORY_GRANT_PENDING → QUERY_SPILLS (insufficient grants causing spills)
+ AddEdge("MEMORY_GRANT_PENDING", "QUERY_SPILLS", "memory_grants",
+ "Query spills — queries getting insufficient memory grants",
+ facts => HasFact(facts, "QUERY_SPILLS") && facts["QUERY_SPILLS"].BaseSeverity > 0);
+
+ // PAGEIOLATCH_SH → IO_READ_LATENCY_MS (buffer miss confirmed by disk latency)
+ AddEdge("PAGEIOLATCH_SH", "IO_READ_LATENCY_MS", "memory_pressure",
+ "Read latency elevated — disk confirms buffer pool pressure",
+ facts => HasFact(facts, "IO_READ_LATENCY_MS") && facts["IO_READ_LATENCY_MS"].BaseSeverity > 0);
+
+ // PAGEIOLATCH_EX → IO_READ_LATENCY_MS
+ AddEdge("PAGEIOLATCH_EX", "IO_READ_LATENCY_MS", "memory_pressure",
+ "Read latency elevated — disk confirms buffer pool pressure",
+ facts => HasFact(facts, "IO_READ_LATENCY_MS") && facts["IO_READ_LATENCY_MS"].BaseSeverity > 0);
+ }
+
+ /* ── Blocking & Deadlocking ── */
+
+ /// <summary>Edges tying lock waits, blocked-process reports, deadlocks, and RCSI configuration together.</summary>
+ private void BuildBlockingEdges()
+ {
+ // LCK → BLOCKING_EVENTS (lock waits confirmed by actual blocking reports)
+ AddEdge("LCK", "BLOCKING_EVENTS", "lock_contention",
+ "Blocked process reports present — confirmed blocking events",
+ facts => HasFact(facts, "BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0);
+
+ // LCK → DEADLOCKS (lock contention escalating)
+ AddEdge("LCK", "DEADLOCKS", "lock_contention",
+ "Deadlocks present — lock contention escalating to deadlocks",
+ facts => HasFact(facts, "DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0);
+
+ // BLOCKING_EVENTS → LCK (blocking confirmed by lock waits)
+ AddEdge("BLOCKING_EVENTS", "LCK", "blocking",
+ "Lock contention waits elevated — blocking visible in wait stats",
+ facts => HasFact(facts, "LCK") && facts["LCK"].Severity >= 0.5);
+
+ // BLOCKING_EVENTS → DEADLOCKS (blocking escalating)
+ AddEdge("BLOCKING_EVENTS", "DEADLOCKS", "blocking",
+ "Deadlocks also present — blocking escalating to deadlocks",
+ facts => HasFact(facts, "DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0);
+
+ // BLOCKING_EVENTS → THREADPOOL (blocking causing thread exhaustion)
+ AddEdge("BLOCKING_EVENTS", "THREADPOOL", "blocking",
+ "THREADPOOL waits present — blocked queries consuming worker threads",
+ facts => HasFact(facts, "THREADPOOL") && facts["THREADPOOL"].Severity > 0);
+
+ // DEADLOCKS → BLOCKING_EVENTS (deadlocks with systemic blocking)
+ AddEdge("DEADLOCKS", "BLOCKING_EVENTS", "deadlocking",
+ "Blocking events also present — systemic contention pattern",
+ facts => HasFact(facts, "BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0);
+
+ // DEADLOCKS → LCK_M_S (reader/writer deadlocks)
+ AddEdge("DEADLOCKS", "LCK_M_S", "deadlocking",
+ "Reader lock waits present — RCSI could prevent reader/writer deadlocks",
+ facts => HasFact(facts, "LCK_M_S") && facts["LCK_M_S"].BaseSeverity > 0);
+
+ // LCK_M_S → DB_CONFIG (reader/writer contention → RCSI recommendation)
+ // NOTE(review): assumes Metadata values compare numerically with > 0 — confirm the
+ // Metadata value type (generic arguments are not visible in this diff).
+ AddEdge("LCK_M_S", "DB_CONFIG", "lock_contention",
+ "Databases without RCSI — readers blocked by writers could be eliminated",
+ facts => HasFact(facts, "DB_CONFIG")
+ && facts["DB_CONFIG"].Metadata.GetValueOrDefault("rcsi_off_count") > 0
+ && facts["DB_CONFIG"].BaseSeverity > 0);
+
+ // DB_CONFIG → LCK_M_S (RCSI-off confirmed by reader/writer lock contention)
+ AddEdge("DB_CONFIG", "LCK_M_S", "config_issue",
+ "LCK_M_S waits — readers blocked by writers, RCSI would eliminate these",
+ facts => HasFact(facts, "LCK_M_S") && facts["LCK_M_S"].BaseSeverity > 0
+ && HasFact(facts, "DB_CONFIG")
+ && facts["DB_CONFIG"].Metadata.GetValueOrDefault("rcsi_off_count") > 0);
+
+ // THREADPOOL → BLOCKING_EVENTS (blocking causing thread buildup)
+ AddEdge("THREADPOOL", "BLOCKING_EVENTS", "thread_exhaustion",
+ "Blocking events present — blocked queries holding worker threads",
+ facts => HasFact(facts, "BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0);
+ }
+
+ /* ── I/O Pressure ── */
+
+ /// <summary>Edges linking disk latency facts with PAGEIOLATCH and WRITELOG waits.</summary>
+ private void BuildIoPressureEdges()
+ {
+ // IO_READ_LATENCY_MS → PAGEIOLATCH_SH (disk latency with buffer pool misses)
+ AddEdge("IO_READ_LATENCY_MS", "PAGEIOLATCH_SH", "io_pressure",
+ "PAGEIOLATCH waits — buffer pool misses driving read I/O",
+ facts => HasFact(facts, "PAGEIOLATCH_SH") && facts["PAGEIOLATCH_SH"].Severity >= 0.5);
+
+ // IO_WRITE_LATENCY_MS → WRITELOG (write latency with log waits)
+ AddEdge("IO_WRITE_LATENCY_MS", "WRITELOG", "io_pressure",
+ "WRITELOG waits — transaction log I/O bottleneck",
+ facts => HasFact(facts, "WRITELOG") && facts["WRITELOG"].Severity > 0);
+
+ // WRITELOG → IO_WRITE_LATENCY_MS (log waits confirmed by disk latency)
+ AddEdge("WRITELOG", "IO_WRITE_LATENCY_MS", "log_io",
+ "Write latency elevated — disk confirms log I/O bottleneck",
+ facts => HasFact(facts, "IO_WRITE_LATENCY_MS") && facts["IO_WRITE_LATENCY_MS"].BaseSeverity > 0);
+ }
+
+ /* ── Latch Contention ── */
+
+ /// <summary>Edges from page-latch contention toward TempDB and parallelism amplifiers.</summary>
+ private void BuildLatchEdges()
+ {
+ // LATCH_EX → TEMPDB_USAGE (latch contention often from TempDB allocation)
+ AddEdge("LATCH_EX", "TEMPDB_USAGE", "latch_contention",
+ "TempDB usage — latch contention may be on TempDB allocation pages",
+ facts => HasFact(facts, "TEMPDB_USAGE") && facts["TEMPDB_USAGE"].BaseSeverity > 0);
+
+ // LATCH_EX → CXPACKET (parallel operations amplifying latch contention)
+ AddEdge("LATCH_EX", "CXPACKET", "latch_contention",
+ "Parallelism waits — parallel operations amplifying page latch contention",
+ facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5);
+ }
+
+ /* ── TempDB ── */
+
+ /// <summary>Edges from TempDB pressure toward downstream I/O and spill facts.</summary>
+ private void BuildTempDbEdges()
+ {
+ // TEMPDB_USAGE → PAGEIOLATCH_SH (tempdb pressure causing I/O)
+ AddEdge("TEMPDB_USAGE", "PAGEIOLATCH_SH", "tempdb_pressure",
+ "PAGEIOLATCH waits — TempDB pressure contributing to I/O",
+ facts => HasFact(facts, "PAGEIOLATCH_SH") && facts["PAGEIOLATCH_SH"].Severity >= 0.5);
+
+ // TEMPDB_USAGE → QUERY_SPILLS (spills consuming tempdb)
+ AddEdge("TEMPDB_USAGE", "QUERY_SPILLS", "tempdb_pressure",
+ "Query spills — spilling to TempDB consuming space",
+ facts => HasFact(facts, "QUERY_SPILLS") && facts["QUERY_SPILLS"].BaseSeverity > 0);
+ }
+
+ /* ── Query-Level ── */
+
+ /// <summary>Edges from query-level facts (spills, high DOP) toward their system-level causes.</summary>
+ private void BuildQueryEdges()
+ {
+ // QUERY_SPILLS → MEMORY_GRANT_PENDING (spills from insufficient grants)
+ AddEdge("QUERY_SPILLS", "MEMORY_GRANT_PENDING", "query_performance",
+ "Memory grant waiters — spills caused by insufficient memory grants",
+ facts => HasFact(facts, "MEMORY_GRANT_PENDING") && facts["MEMORY_GRANT_PENDING"].BaseSeverity > 0);
+
+ // QUERY_SPILLS → TEMPDB_USAGE (spills consuming tempdb space)
+ AddEdge("QUERY_SPILLS", "TEMPDB_USAGE", "query_performance",
+ "TempDB usage elevated — spills consuming TempDB space",
+ facts => HasFact(facts, "TEMPDB_USAGE") && facts["TEMPDB_USAGE"].BaseSeverity > 0);
+
+ // QUERY_HIGH_DOP → CXPACKET (high-DOP queries causing parallelism waits)
+ AddEdge("QUERY_HIGH_DOP", "CXPACKET", "query_performance",
+ "CXPACKET waits — high-DOP queries causing excessive parallelism",
+ facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5);
+
+ // QUERY_HIGH_DOP → SOS_SCHEDULER_YIELD (high-DOP queries causing CPU pressure)
+ AddEdge("QUERY_HIGH_DOP", "SOS_SCHEDULER_YIELD", "query_performance",
+ "Scheduler yields — high-DOP queries saturating CPU",
+ facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Severity >= 0.5);
+ }
+
+ /// <summary>True when the fact set contains the given key.</summary>
+ private static bool HasFact(IReadOnlyDictionary facts, string key)
+ {
+ return facts.ContainsKey(key);
+ }
+}
diff --git a/Dashboard/Analysis/SqlServerAnomalyDetector.cs b/Dashboard/Analysis/SqlServerAnomalyDetector.cs
new file mode 100644
index 00000000..bdf6664a
--- /dev/null
+++ b/Dashboard/Analysis/SqlServerAnomalyDetector.cs
@@ -0,0 +1,543 @@
+using System;
+using System.Collections.Generic;
+using System.Threading.Tasks;
+using Microsoft.Data.SqlClient;
+using PerformanceMonitorDashboard.Helpers;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
+/// <summary>
+/// Detects anomalies by comparing the analysis window's metrics against a
+/// baseline period. When a metric deviates significantly from baseline
+/// (mean + standard deviation), an ANOMALY fact is emitted.
+///
+/// This is the "oh shit" mode -- detecting acute deviations that don't show
+/// up in aggregate analysis because they're brief. A 5-minute CPU spike
+/// that averages out over 4 hours is invisible to aggregate scoring but
+/// obvious when compared against "what was this metric doing before?"
+///
+/// Baseline selection: uses the 24 hours preceding the analysis window.
+/// If less data is available, uses whatever exists with lower confidence.
+///
+/// Port of Lite's AnomalyDetector -- uses SQL Server collect.* tables instead of DuckDB views.
+/// No server_id filtering -- Dashboard monitors one server per database.
+/// </summary>
+public class SqlServerAnomalyDetector
+{
+ // Connection string for the monitored server's collection database.
+ private readonly string _connectionString;
+
+ /// <summary>
+ /// Minimum number of baseline samples needed for reliable detection.
+ /// Below this, anomalies are still detected but with reduced confidence.
+ /// </summary>
+ private const int MinBaselineSamples = 10;
+
+ /// <summary>
+ /// Number of standard deviations above baseline mean to flag as anomalous.
+ /// </summary>
+ private const double DeviationThreshold = 2.0;
+
+ /// <summary>
+ /// Creates a detector bound to the Dashboard's SQL Server connection string.
+ /// </summary>
+ /// <exception cref="ArgumentNullException">Thrown when <paramref name="connectionString"/> is null.</exception>
+ public SqlServerAnomalyDetector(string connectionString)
+ {
+ // Fail fast here rather than at the first query attempt.
+ ArgumentNullException.ThrowIfNull(connectionString);
+ _connectionString = connectionString;
+ }
+
+ /// <summary>
+ /// Detects anomalies by comparing the analysis window against a baseline period.
+ /// Returns anomaly facts to be merged into the main fact list.
+ /// </summary>
+ public async Task> DetectAnomaliesAsync(AnalysisContext context)
+ {
+ var anomalies = new List();
+
+ // Baseline: 24 hours preceding the analysis window
+ var baselineEnd = context.TimeRangeStart;
+ var baselineStart = baselineEnd.AddHours(-24);
+
+ // Check if baseline period has any data at all -- if not, skip all anomaly detection.
+ // Without baseline data, everything looks anomalous.
+ if (!await HasBaselineDataAsync(baselineStart, baselineEnd))
+ return anomalies;
+
+ // Detectors run sequentially; each opens its own connection and only appends to the list.
+ await DetectCpuAnomalies(context, baselineStart, baselineEnd, anomalies);
+ await DetectWaitAnomalies(context, baselineStart, baselineEnd, anomalies);
+ await DetectBlockingAnomalies(context, baselineStart, baselineEnd, anomalies);
+ await DetectIoAnomalies(context, baselineStart, baselineEnd, anomalies);
+
+ return anomalies;
+ }
+
+ /// <summary>
+ /// Checks if the baseline period has any collected data.
+ /// Uses wait_stats as canary -- if waits are collected, other data is too.
+ /// Returns false (never throws) when the check itself fails, so analysis degrades gracefully.
+ /// </summary>
+ private async Task HasBaselineDataAsync(DateTime baselineStart, DateTime baselineEnd)
+ {
+ try
+ {
+ using var connection = new SqlConnection(_connectionString);
+ await connection.OpenAsync();
+
+ using var cmd = connection.CreateCommand();
+ cmd.CommandText = @"
+SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+
+SELECT
+ (SELECT COUNT(*) FROM collect.wait_stats
+ WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd)
+ + (SELECT COUNT(*) FROM collect.cpu_utilization_stats
+ WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd);";
+
+ cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+ cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+
+ var count = Convert.ToInt64(await cmd.ExecuteScalarAsync() ?? 0);
+ return count > 0;
+ }
+ catch (Exception ex)
+ {
+ // Best-effort: treat an unreachable server or missing collection schema as
+ // "no baseline data", but log it — the previous bare catch hid connection
+ // and permission errors, unlike every other detector in this class.
+ Logger.Error($"[SqlServerAnomalyDetector] Baseline data check failed: {ex.Message}");
+ return false;
+ }
+ }
+
+ /// <summary>
+ /// Detects CPU utilization anomalies by comparing per-sample values
+ /// against the baseline distribution.
+ /// </summary>
+ private async Task DetectCpuAnomalies(AnalysisContext context,
+ DateTime baselineStart, DateTime baselineEnd, List anomalies)
+ {
+ try
+ {
+ using var connection = new SqlConnection(_connectionString);
+ await connection.OpenAsync();
+
+ // Get baseline stats
+ using var baselineCmd = connection.CreateCommand();
+ baselineCmd.CommandText = @"
+SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+
+SELECT
+ AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS mean_cpu,
+ STDEV(CAST(sqlserver_cpu_utilization AS FLOAT)) AS stddev_cpu,
+ COUNT(*) AS sample_count
+FROM collect.cpu_utilization_stats
+WHERE collection_time >= @baselineStart
+AND collection_time < @baselineEnd;";
+
+ baselineCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+ baselineCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+
+ double baselineMean = 0, baselineStdDev = 0;
+ long baselineSamples = 0;
+
+ // Reader is disposed by the using block before the next command runs on this connection.
+ using (var reader = await baselineCmd.ExecuteReaderAsync())
+ {
+ if (await reader.ReadAsync())
+ {
+ baselineMean = reader.IsDBNull(0) ? 0 : Convert.ToDouble(reader.GetValue(0));
+ baselineStdDev = reader.IsDBNull(1) ? 0 : Convert.ToDouble(reader.GetValue(1));
+ baselineSamples = reader.IsDBNull(2) ? 0 : Convert.ToInt64(reader.GetValue(2));
+ }
+ }
+
+ // Too few samples or zero variance: deviation math would be meaningless / divide by zero.
+ if (baselineSamples < 3 || baselineStdDev <= 0) return;
+
+ // Get peak and average in the analysis window
+ // NOTE(review): this query uses '< @windowEnd' while the other detectors use
+ // '<= @windowEnd' for the current window — confirm which boundary is intended.
+ using var windowCmd = connection.CreateCommand();
+ windowCmd.CommandText = @"
+SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+
+SELECT
+ MAX(sqlserver_cpu_utilization) AS peak_cpu,
+ AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS avg_cpu,
+ COUNT(*) AS sample_count,
+ (SELECT TOP 1 collection_time FROM collect.cpu_utilization_stats
+ WHERE collection_time >= @windowStart AND collection_time < @windowEnd
+ ORDER BY sqlserver_cpu_utilization DESC) AS peak_time
+FROM collect.cpu_utilization_stats
+WHERE collection_time >= @windowStart
+AND collection_time < @windowEnd;";
+
+ windowCmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
+ windowCmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));
+
+ using var windowReader = await windowCmd.ExecuteReaderAsync();
+ if (!await windowReader.ReadAsync()) return;
+
+ var peakCpu = windowReader.IsDBNull(0) ? 0.0 : Convert.ToDouble(windowReader.GetValue(0));
+ var avgCpu = windowReader.IsDBNull(1) ? 0.0 : Convert.ToDouble(windowReader.GetValue(1));
+ var windowSamples = windowReader.IsDBNull(2) ? 0L : Convert.ToInt64(windowReader.GetValue(2));
+ var peakTime = windowReader.IsDBNull(3) ? (DateTime?)null : windowReader.GetDateTime(3);
+
+ if (windowSamples == 0) return;
+
+ // Check if peak deviates significantly from baseline
+ var deviation = (peakCpu - baselineMean) / baselineStdDev;
+ if (deviation < DeviationThreshold || peakCpu < 50) return; // Don't flag low absolute values
+
+ // Confidence scales down linearly when the baseline is thinner than MinBaselineSamples.
+ var confidence = baselineSamples >= MinBaselineSamples ? 1.0 : (double)baselineSamples / MinBaselineSamples;
+
+ anomalies.Add(new Fact
+ {
+ Source = "anomaly",
+ Key = "ANOMALY_CPU_SPIKE",
+ Value = peakCpu,
+ ServerId = context.ServerId,
+ Metadata = new Dictionary
+ {
+ ["peak_cpu"] = peakCpu,
+ ["avg_cpu_in_window"] = avgCpu,
+ ["baseline_mean"] = baselineMean,
+ ["baseline_stddev"] = baselineStdDev,
+ ["deviation_sigma"] = deviation,
+ ["baseline_samples"] = baselineSamples,
+ ["window_samples"] = windowSamples,
+ ["confidence"] = confidence,
+ ["peak_time_ticks"] = peakTime?.Ticks ?? 0
+ }
+ });
+ }
+ catch (Exception ex)
+ {
+ Logger.Error($"[SqlServerAnomalyDetector] CPU anomaly detection failed: {ex.Message}");
+ }
+ }
+
+ /// <summary>
+ /// Detects wait stat anomalies -- significant waits in the analysis window
+ /// that were absent or much lower in the baseline.
+ /// </summary>
+ private async Task DetectWaitAnomalies(AnalysisContext context,
+ DateTime baselineStart, DateTime baselineEnd, List anomalies)
+ {
+ try
+ {
+ using var connection = new SqlConnection(_connectionString);
+ await connection.OpenAsync();
+
+ // Check if baseline has any wait data at all -- if not, skip
+ using var checkCmd = connection.CreateCommand();
+ checkCmd.CommandText = @"
+SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+
+SELECT COUNT(*) FROM collect.wait_stats
+WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd;";
+
+ checkCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+ checkCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+
+ var baselineCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0);
+ if (baselineCount == 0) return;
+
+ // Get per-wait-type totals in both windows
+ using var cmd = connection.CreateCommand();
+ cmd.CommandText = @"
+SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+
+;WITH baseline AS (
+ SELECT wait_type,
+ CAST(SUM(wait_time_ms_delta) AS BIGINT) AS total_ms
+ FROM collect.wait_stats
+ WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd
+ AND wait_time_ms_delta > 0
+ GROUP BY wait_type
+),
+current_window AS (
+ SELECT wait_type,
+ CAST(SUM(wait_time_ms_delta) AS BIGINT) AS total_ms
+ FROM collect.wait_stats
+ WHERE collection_time >= @windowStart AND collection_time <= @windowEnd
+ AND wait_time_ms_delta > 0
+ GROUP BY wait_type
+)
+SELECT TOP 10
+ c.wait_type,
+ c.total_ms AS current_ms,
+ COALESCE(b.total_ms, 0) AS baseline_ms
+FROM current_window c
+LEFT JOIN baseline b ON c.wait_type = b.wait_type
+WHERE c.total_ms > 10000 -- At least 10 seconds of wait time
+ORDER BY c.total_ms DESC;";
+
+ cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+ cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+ cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
+ cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));
+
+ // Normalize to per-hour rates before comparing (windows are different lengths).
+ // Hoisted out of the read loop below: these values never change per row.
+ var baselineHours = (baselineEnd - baselineStart).TotalHours;
+ var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours;
+ if (baselineHours <= 0) baselineHours = 1;
+ if (currentHours <= 0) currentHours = 1;
+
+ using var reader = await cmd.ExecuteReaderAsync();
+ while (await reader.ReadAsync())
+ {
+ var waitType = reader.GetString(0);
+ var currentMs = Convert.ToInt64(reader.GetValue(1));
+ var baselineMs = Convert.ToInt64(reader.GetValue(2));
+
+ double ratio;
+ string anomalyType;
+
+ if (baselineMs == 0)
+ {
+ // Wait type absent from baseline: only flag if it accrued real time in the window.
+ ratio = currentMs > 60_000 ? 100.0 : 0; // Only flag if > 1 minute total
+ anomalyType = "new";
+ }
+ else
+ {
+ var baselineRate = baselineMs / baselineHours;
+ var currentRate = currentMs / currentHours;
+ ratio = baselineRate > 0 ? currentRate / baselineRate : 100.0;
+ anomalyType = "spike";
+ }
+
+ if (ratio < 5.0) continue; // Need at least 5x increase
+
+ anomalies.Add(new Fact
+ {
+ Source = "anomaly",
+ Key = $"ANOMALY_WAIT_{waitType}",
+ Value = currentMs,
+ ServerId = context.ServerId,
+ Metadata = new Dictionary
+ {
+ ["current_ms"] = currentMs,
+ ["baseline_ms"] = baselineMs,
+ ["ratio"] = ratio,
+ ["is_new"] = anomalyType == "new" ? 1 : 0
+ }
+ });
+ }
+ }
+ catch (Exception ex)
+ {
+ Logger.Error($"[SqlServerAnomalyDetector] Wait anomaly detection failed: {ex.Message}");
+ }
+ }
+
+ /// <summary>
+ /// Detects blocking/deadlock anomalies -- events in the analysis window
+ /// that are significantly above baseline rates.
+ /// </summary>
+ private async Task DetectBlockingAnomalies(AnalysisContext context,
+ DateTime baselineStart, DateTime baselineEnd, List anomalies)
+ {
+ try
+ {
+ using var connection = new SqlConnection(_connectionString);
+ await connection.OpenAsync();
+
+ // Check if baseline period has any data at all
+ using var checkCmd = connection.CreateCommand();
+ checkCmd.CommandText = @"
+SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+
+SELECT
+ (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport
+ WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd)
+ + (SELECT COUNT(*) FROM collect.deadlocks
+ WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd)
+ + (SELECT COUNT(*) FROM collect.wait_stats
+ WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd);";
+
+ checkCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+ checkCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+
+ var baselineDataCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0);
+ if (baselineDataCount == 0) return; // No baseline data = can't detect anomaly
+
+ using var cmd = connection.CreateCommand();
+ cmd.CommandText = @"
+SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+
+SELECT
+ (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport
+ WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd) AS baseline_blocking,
+ (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport
+ WHERE collection_time >= @windowStart AND collection_time <= @windowEnd) AS current_blocking,
+ (SELECT COUNT(*) FROM collect.deadlocks
+ WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd) AS baseline_deadlocks,
+ (SELECT COUNT(*) FROM collect.deadlocks
+ WHERE collection_time >= @windowStart AND collection_time <= @windowEnd) AS current_deadlocks;";
+
+ cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+ cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+ cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
+ cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));
+
+ using var reader = await cmd.ExecuteReaderAsync();
+ if (!await reader.ReadAsync()) return;
+
+ var baselineBlocking = Convert.ToInt64(reader.GetValue(0));
+ var currentBlocking = Convert.ToInt64(reader.GetValue(1));
+ var baselineDeadlocks = Convert.ToInt64(reader.GetValue(2));
+ var currentDeadlocks = Convert.ToInt64(reader.GetValue(3));
+
+ // Normalize to per-hour rates (windows are different lengths)
+ var baselineHours = (baselineEnd - baselineStart).TotalHours;
+ var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours;
+ if (baselineHours <= 0) baselineHours = 1;
+ if (currentHours <= 0) currentHours = 1;
+
+ // Ratio defaults to 100 ("new") when the baseline had zero events.
+ var baselineBlockingRate = baselineBlocking / baselineHours;
+ var currentBlockingRate = currentBlocking / currentHours;
+ var blockingRatio = baselineBlocking > 0 ? currentBlockingRate / baselineBlockingRate : 100.0;
+
+ var baselineDeadlockRate = baselineDeadlocks / baselineHours;
+ var currentDeadlockRate = currentDeadlocks / currentHours;
+ var deadlockRatio = baselineDeadlocks > 0 ? currentDeadlockRate / baselineDeadlockRate : 100.0;
+
+ // Blocking spike: at least 5 events AND 3x baseline rate (or new)
+ if (currentBlocking >= 5 && (baselineBlocking == 0 || blockingRatio >= 3))
+ {
+ anomalies.Add(new Fact
+ {
+ Source = "anomaly",
+ Key = "ANOMALY_BLOCKING_SPIKE",
+ Value = currentBlocking,
+ ServerId = context.ServerId,
+ Metadata = new Dictionary
+ {
+ ["current_count"] = currentBlocking,
+ ["baseline_count"] = baselineBlocking,
+ ["ratio"] = blockingRatio
+ }
+ });
+ }
+
+ // Deadlock spike: at least 3 events AND 3x baseline rate (or new)
+ if (currentDeadlocks >= 3 && (baselineDeadlocks == 0 || deadlockRatio >= 3))
+ {
+ anomalies.Add(new Fact
+ {
+ Source = "anomaly",
+ Key = "ANOMALY_DEADLOCK_SPIKE",
+ Value = currentDeadlocks,
+ ServerId = context.ServerId,
+ Metadata = new Dictionary
+ {
+ ["current_count"] = currentDeadlocks,
+ ["baseline_count"] = baselineDeadlocks,
+ ["ratio"] = deadlockRatio
+ }
+ });
+ }
+ }
+ catch (Exception ex)
+ {
+ Logger.Error($"[SqlServerAnomalyDetector] Blocking anomaly detection failed: {ex.Message}");
+ }
+ }
+
+ /// <summary>
+ /// Detects I/O latency anomalies -- significant increase in read/write latency
+ /// compared to baseline.
+ /// </summary>
+ private async Task DetectIoAnomalies(AnalysisContext context,
+ DateTime baselineStart, DateTime baselineEnd, List anomalies)
+ {
+ try
+ {
+ using var connection = new SqlConnection(_connectionString);
+ await connection.OpenAsync();
+
+ // Per-sample latency = stall delta / operation delta; NULLIF avoids divide-by-zero,
+ // and the WHERE clause drops samples with no I/O at all.
+ // Cross join is intentional: both CTEs return exactly one aggregate row.
+ using var cmd = connection.CreateCommand();
+ cmd.CommandText = @"
+SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+
+;WITH baseline AS (
+ SELECT
+ AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS avg_read_lat,
+ AVG(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS avg_write_lat,
+ STDEV(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS stddev_read,
+ STDEV(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS stddev_write,
+ COUNT(*) AS samples
+ FROM collect.file_io_stats
+ WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd
+ AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0)
+),
+current_window AS (
+ SELECT
+ AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS avg_read_lat,
+ AVG(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS avg_write_lat
+ FROM collect.file_io_stats
+ WHERE collection_time >= @windowStart AND collection_time <= @windowEnd
+ AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0)
+)
+SELECT b.avg_read_lat, b.stddev_read, c.avg_read_lat,
+ b.avg_write_lat, b.stddev_write, c.avg_write_lat,
+ b.samples
+FROM baseline b, current_window c;";
+
+ cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+ cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+ cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
+ cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));
+
+ using var reader = await cmd.ExecuteReaderAsync();
+ if (!await reader.ReadAsync()) return;
+
+ var baselineReadLat = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0));
+ var stddevRead = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1));
+ var currentReadLat = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2));
+ var baselineWriteLat = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3));
+ var stddevWrite = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4));
+ var currentWriteLat = reader.IsDBNull(5) ? 0.0 : Convert.ToDouble(reader.GetValue(5));
+ var samples = reader.IsDBNull(6) ? 0L : Convert.ToInt64(reader.GetValue(6));
+
+ // Too few baseline samples for the stddev to mean anything.
+ if (samples < 3) return;
+
+ // Read latency anomaly
+ if (stddevRead > 0 && currentReadLat > 10) // At least 10ms to matter
+ {
+ var readDeviation = (currentReadLat - baselineReadLat) / stddevRead;
+ if (readDeviation >= DeviationThreshold)
+ {
+ anomalies.Add(new Fact
+ {
+ Source = "anomaly",
+ Key = "ANOMALY_READ_LATENCY",
+ Value = currentReadLat,
+ ServerId = context.ServerId,
+ Metadata = new Dictionary
+ {
+ ["current_latency_ms"] = currentReadLat,
+ ["baseline_mean_ms"] = baselineReadLat,
+ ["baseline_stddev_ms"] = stddevRead,
+ ["deviation_sigma"] = readDeviation,
+ ["baseline_samples"] = samples
+ }
+ });
+ }
+ }
+
+ // Write latency anomaly
+ if (stddevWrite > 0 && currentWriteLat > 5) // At least 5ms to matter
+ {
+ var writeDeviation = (currentWriteLat - baselineWriteLat) / stddevWrite;
+ if (writeDeviation >= DeviationThreshold)
+ {
+ anomalies.Add(new Fact
+ {
+ Source = "anomaly",
+ Key = "ANOMALY_WRITE_LATENCY",
+ Value = currentWriteLat,
+ ServerId = context.ServerId,
+ Metadata = new Dictionary
+ {
+ ["current_latency_ms"] = currentWriteLat,
+ ["baseline_mean_ms"] = baselineWriteLat,
+ ["baseline_stddev_ms"] = stddevWrite,
+ ["deviation_sigma"] = writeDeviation,
+ ["baseline_samples"] = samples
+ }
+ });
+ }
+ }
+ }
+ catch (Exception ex)
+ {
+ Logger.Error($"[SqlServerAnomalyDetector] I/O anomaly detection failed: {ex.Message}");
+ }
+ }
+}
diff --git a/Dashboard/Analysis/SqlServerDrillDownCollector.cs b/Dashboard/Analysis/SqlServerDrillDownCollector.cs
new file mode 100644
index 00000000..050ffe30
--- /dev/null
+++ b/Dashboard/Analysis/SqlServerDrillDownCollector.cs
@@ -0,0 +1,773 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using Microsoft.Data.SqlClient;
+using PerformanceMonitorDashboard.Helpers;
+using PerformanceMonitorDashboard.Mcp;
+using PerformanceMonitorDashboard.Models;
+using PerformanceMonitorDashboard.Services;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
+/// <summary>
+/// Enriches findings with drill-down data from SQL Server.
+/// Runs after graph traversal, only for findings above the severity threshold.
+/// Each drill-down query is limited to top N results with truncated text.
+///
+/// This makes analyze_server self-sufficient -- instead of returning a list
+/// of "next tools to call," findings include the actual supporting data.
+///
+/// Port of Lite's DrillDownCollector -- uses SQL Server collect.* tables instead of DuckDB views.
+/// No server_id filtering -- Dashboard monitors one server per database.
+/// </summary>
+public class SqlServerDrillDownCollector
+{
+ private readonly string _connectionString;
+ private readonly IPlanFetcher? _planFetcher;
+ private const int TextLimit = 500;
+
+ /// <summary>
+ /// Creates a collector bound to the monitored server's connection string.
+ /// </summary>
+ /// <param name="connectionString">SQL Server connection string used by every drill-down query; must not be null.</param>
+ /// <param name="planFetcher">Optional fetcher for cached plan XML; null disables plan-analysis drill-down.</param>
+ /// <exception cref="ArgumentNullException">Thrown when <paramref name="connectionString"/> is null.</exception>
+ public SqlServerDrillDownCollector(string connectionString, IPlanFetcher? planFetcher = null)
+ {
+ // Fail fast on a null connection string instead of deferring the failure
+ // to the first SqlConnection.Open inside a drill-down (.NET 8 throw helper).
+ ArgumentNullException.ThrowIfNull(connectionString);
+ _connectionString = connectionString;
+ _planFetcher = planFetcher;
+ }
+
+ /// <summary>
+ /// Enriches each finding's DrillDown dictionary based on its story path.
+ /// Findings below severity 0.5 are skipped; per-finding failures are logged
+ /// and never abort the overall enrichment pass.
+ /// </summary>
+ public async Task EnrichFindingsAsync(List findings, AnalysisContext context)
+ {
+ foreach (var finding in findings)
+ {
+ // Only drill into findings that crossed the severity threshold;
+ // below it, DrillDown is left untouched (null).
+ if (finding.Severity < 0.5) continue;
+
+ try
+ {
+ finding.DrillDown = new Dictionary();
+ // StoryPath is an arrow-delimited chain of graph node keys; each key
+ // routes to the collector(s) that can supply supporting data for it.
+ var pathKeys = finding.StoryPath.Split(" → ", StringSplitOptions.RemoveEmptyEntries).ToHashSet();
+
+ if (pathKeys.Contains("DEADLOCKS"))
+ await CollectTopDeadlocks(finding, context);
+
+ if (pathKeys.Contains("BLOCKING_EVENTS"))
+ await CollectTopBlockingChains(finding, context);
+
+ if (pathKeys.Contains("CPU_SPIKE"))
+ await CollectQueriesAtSpike(finding, context);
+
+ // NOTE: CPU_SPIKE deliberately triggers both the spike-window collector
+ // above and the aggregate top-CPU collector below.
+ if (pathKeys.Contains("CPU_SQL_PERCENT") || pathKeys.Contains("CPU_SPIKE"))
+ await CollectTopCpuQueries(finding, context);
+
+ if (pathKeys.Contains("QUERY_SPILLS"))
+ await CollectTopSpillingQueries(finding, context);
+
+ if (pathKeys.Contains("IO_READ_LATENCY_MS") || pathKeys.Contains("IO_WRITE_LATENCY_MS"))
+ await CollectFileLatencyBreakdown(finding, context);
+
+ // LCK covers generic lock waits; the two specific modes get the same breakdown.
+ if (pathKeys.Contains("LCK") || pathKeys.Contains("LCK_M_S") || pathKeys.Contains("LCK_M_IS"))
+ await CollectLockModeBreakdown(finding, context);
+
+ if (pathKeys.Contains("DB_CONFIG"))
+ await CollectConfigIssues(finding, context);
+
+ if (pathKeys.Contains("TEMPDB_USAGE"))
+ await CollectTempDbBreakdown(finding, context);
+
+ if (pathKeys.Contains("MEMORY_GRANT_PENDING"))
+ await CollectPendingGrants(finding, context);
+
+ // Bad-actor keys are prefix-matched because the suffix varies per actor.
+ if (pathKeys.Any(k => k.StartsWith("BAD_ACTOR_")))
+ await CollectBadActorDetail(finding, context);
+
+ // Plan analysis: for findings with top queries, analyze their cached plans
+ await CollectPlanAnalysis(finding, context);
+
+ // Remove empty drill-down dictionaries
+ if (finding.DrillDown.Count == 0)
+ finding.DrillDown = null;
+ }
+ catch (Exception ex)
+ {
+ Logger.Error(
+ $"[SqlServerDrillDownCollector] Drill-down failed for {finding.StoryPath}: {ex.GetType().Name}: {ex.Message}\n{ex.StackTrace}");
+ // Don't null out -- keep whatever was collected before the error
+ }
+ }
+ }
+
+ private async Task CollectTopDeadlocks(AnalysisFinding finding, AnalysisContext context)
+ {
+ using var connection = new SqlConnection(_connectionString);
+ await connection.OpenAsync();
+
+ using var cmd = connection.CreateCommand();
+ cmd.CommandText = @"
+SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+
+SELECT TOP 3
+ collection_time,
+ event_date,
+ spid,
+ LEFT(CAST(query AS NVARCHAR(MAX)), 500) AS victim_sql
+FROM collect.deadlocks
+WHERE collection_time >= @startTime AND collection_time <= @endTime
+ORDER BY collection_time DESC;";
+
+ cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
+ cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));
+
+ var items = new List