diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 00000000..67c7ecc2 --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,67 @@ +# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json + +language: "en-US" +early_access: false +enable_free_tier: true + +reviews: + profile: "chill" + high_level_summary: true + review_status: true + commit_status: true + collapse_walkthrough: true + sequence_diagrams: false + poem: false + + path_filters: + - "!**/*.Designer.cs" + - "!**/bin/**" + - "!**/obj/**" + - "!**/publish/**" + - "!**/*.user" + - "!**/*.suo" + + path_instructions: + - path: "Dashboard/**/*.cs" + instructions: > + This is a WPF .NET 8 desktop app (Dashboard) that reads from SQL Server. + Uses data binding, async/await patterns, and INotifyPropertyChanged. + Watch for: null reference risks, disposal of SQL connections, + thread safety with UI dispatch, and proper async patterns. + - path: "Lite/**/*.cs" + instructions: > + This is a WPF .NET 8 desktop app (Lite) that collects SQL Server DMV data + into a local DuckDB database. Uses ReaderWriterLockSlim for DB coordination. + Watch for: connection disposal, thread safety, DuckDB access patterns, + and proper async/await usage. + - path: "**/*.sql" + instructions: > + T-SQL stored procedures and scripts for SQL Server. + Watch for: SQL injection risks, missing error handling (TRY/CATCH), + proper use of SET NOCOUNT ON, and parameter sniffing concerns. + - path: "Installers/**" + instructions: > + WiX-based MSI installer projects. Be cautious about upgrade paths + and file versioning. Schema upgrades go in upgrades/ folder, not install scripts. + + auto_review: + enabled: true + drafts: false + base_branches: + - "dev" + - "main" + + tools: + gitleaks: + enabled: true + github-checks: + enabled: true + +chat: + auto_reply: true + +knowledge_base: + learnings: + scope: "local" + pull_requests: + scope: "local" diff --git a/CHANGELOG.md b/CHANGELOG.md index 05c86966..42d7bf2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,87 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [2.3.0] - 2026-03-18 + +### Important + +- **Schema upgrade**: Six columns widened across three tables (`query_stats`, `cpu_scheduler_stats`, `waiting_tasks`, `database_size_stats`) to match DMV documentation types. These are in-place ALTER COLUMN operations — fast on any table size, no data migration. Upgrade scripts run automatically via the CLI/GUI installer. +- **SQL Server version check**: Both installers now reject SQL Server 2014 and earlier before running any scripts, with a clear error message. Azure MI (EngineEdition 8) is always accepted. ([#543]) +- **Installer adversarial tests**: 35 automated tests covering upgrade failures, data survival, idempotency, version detection fallback, file filtering, restricted permissions, and more. These run as part of pre-release validation. ([#543]) + +### Added + +- **ErikAI analysis engine** — rule-based inference engine for Lite that scores server health across wait stats, CPU, memory, I/O, blocking, tempdb, and query performance. Surfaces actionable findings with severity, detail, and recommended actions. 
Includes anomaly detection (baseline comparison for acute deviations), bad actor detection (per-query scoring for consistently terrible queries), and CPU spike detection for bursty workloads. ([#589], [#593]) +- **ErikAI Dashboard port** — full analysis engine ported to Dashboard with SQL Server backend ([#590]) +- **FinOps cost optimization recommendations** — Phase 1-4 checks: enterprise feature audit, CPU/memory right-sizing, compression savings estimator, unused index cost quantification, dormant database detection, dev/test workload detection, VM right-sizing, storage tier optimization, reserved capacity candidates ([#564]) +- **FinOps High Impact Queries** — 80/20 analysis showing which queries consume the most resources across all dimensions ([#564]) +- **FinOps dollar-denominated cost attribution** — per-server monthly cost setting with proportional database-level breakdown ([#564]) +- **On-demand plan fetch** for bad actor and analysis findings — click to retrieve execution plans for flagged queries ([#604]) +- **Plan analysis integration** — findings include execution plan analysis when plans are available ([#594]) +- **Server unreachable email alerts** — Dashboard sends email (not just tray notification) when a monitored server goes offline or comes back online ([#529]) +- **Column filters on all FinOps DataGrids** — filter funnel icons on every column header across all 7 FinOps grids in Lite and Dashboard ([#562]) +- **Column filters on Dashboard** IdleDatabases, TempDB, and Index Analysis grids +- **Lite data import** — "Import Data" button brings in monitoring history from a previous Lite install via parquet files, preserving trend data across version upgrades ([#566]) +- **Per-server Utility Database setting** — Lite can call community stored procedures (sp_IndexCleanup) from a database other than master ([#555]) +- **SQL Server version check** in both CLI and GUI installers — rejects 2014 and earlier with a clear message ([#543]) +- **Execution plan analysis MCP tools** for both Dashboard and Lite +- **Full MCP tool coverage** — Dashboard expanded from 28 to 57 tools, Lite from 32 to 51 tools ([#576], [#577]) +- **Self-sufficient analyze_server drill-down** — MCP tool returns complete analysis, not breadcrumb trail ([#578]) +- **NuGet package dependency licenses** in THIRD_PARTY_NOTICES.md + +### Changed + +- **Azure SQL DB FinOps** — all collectors (database sizes, query stats, file I/O) now connect to each database individually instead of only querying master. Server Inventory uses dynamic SQL to avoid `sys.master_files` dependency. 
([#557]) +- **Index Analysis scroll fix** — both summary and detail grids now use proportional heights instead of Auto, so they scroll independently with large result sets ([#554]) +- **Dashboard Add Server dialog** — increased MaxHeight from 700 to 850px so buttons are visible when SQL auth fields are shown +- **GUI installer** — Uninstall button now correctly enables after a successful install +- **GUI installer** — fixed encryption mapping and history logging ([#612]) +- **Dashboard visible sub-tab only refresh** on auto-refresh ticks ([#528]) +- Analysis engine decouples data maturity check from analysis window + +### Fixed + +- **Installer dropping database on every upgrade** — `00_uninstall.sql` excluded from install file list, installer aborts on upgrade failure, version detection fallback returns "1.0.0" instead of null ([#538], [#539]) +- **SQL dumps on mirroring passive servers** from FinOps collectors ([#535]) +- **RetrievedFromCache** always showing False ([#536]) +- **Arithmetic overflow** in query_stats collector for dop/thread columns ([#547]) +- **Lite perfmon chart bugs** and Dashboard ScottPlot crash handling ([#544], [#545]) +- **PLE=0 scoring bug** — was scored as harmless, now correctly flagged ([#543]) +- **PercentRank >1.0** bug in HealthCalculator +- **6 verified Lite bugs** from code review ([#611]) +- **Enterprise feature audit text** — partitioning is not Enterprise-only +- **FinOps collector scheduling**, server switch, and utilization bugs +- **Dashboard drill-down** Unicode arrow in story path split +- **Empty DataGrid scrollbar artifacts** — hide grids when empty across all FinOps tabs +- **Query preview** — truncated in row, full text in tooltip + +[#529]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/529 +[#535]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/535 +[#536]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/536 +[#538]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/538 +[#539]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/539 +[#543]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/543 +[#544]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/544 +[#545]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/545 +[#547]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/547 +[#554]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/554 +[#555]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/555 +[#557]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/557 +[#562]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/562 +[#564]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/564 +[#566]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/566 +[#576]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/576 +[#577]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/577 +[#578]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/578 +[#528]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/528 +[#589]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/589 +[#590]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/590 +[#593]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/593 +[#594]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/594 +[#604]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/604 +[#611]: 
https://github.com/erikdarlingdata/PerformanceMonitor/issues/611 +[#612]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/612 + ## [2.2.0] - 2026-03-11 **Contributors:** [@HannahVernon](https://github.com/HannahVernon), [@ClaudioESSilva](https://github.com/ClaudioESSilva), [@dphugo](https://github.com/dphugo), [@Orestes](https://github.com/Orestes) — thank you! diff --git a/Dashboard/AddServerDialog.xaml b/Dashboard/AddServerDialog.xaml index 84d3e5f1..ee06c9d9 100644 --- a/Dashboard/AddServerDialog.xaml +++ b/Dashboard/AddServerDialog.xaml @@ -2,7 +2,7 @@ xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" Title="Add SQL Server" - SizeToContent="Height" Width="450" MaxHeight="700" + SizeToContent="Height" Width="450" MaxHeight="850" WindowStartupLocation="CenterOwner" ResizeMode="NoResize" Background="{DynamicResource BackgroundBrush}" @@ -100,6 +100,14 @@ + + + + + + diff --git a/Dashboard/AddServerDialog.xaml.cs b/Dashboard/AddServerDialog.xaml.cs index f4ad1a30..3bed93d5 100644 --- a/Dashboard/AddServerDialog.xaml.cs +++ b/Dashboard/AddServerDialog.xaml.cs @@ -42,6 +42,7 @@ public AddServerDialog(ServerConnection existingServer) ServerNameTextBox.Text = existingServer.ServerName; DescriptionTextBox.Text = existingServer.Description; IsFavoriteCheckBox.IsChecked = existingServer.IsFavorite; + MonthlyCostTextBox.Text = existingServer.MonthlyCostUsd.ToString(System.Globalization.CultureInfo.InvariantCulture); // Load encryption settings EncryptModeComboBox.SelectedIndex = existingServer.EncryptMode switch @@ -328,9 +329,15 @@ private async void Save_Click(object sender, RoutedEventArgs e) ServerConnection.IsFavorite = IsFavoriteCheckBox.IsChecked == true; ServerConnection.EncryptMode = GetSelectedEncryptMode(); ServerConnection.TrustServerCertificate = TrustServerCertificateCheckBox.IsChecked == true; + if (decimal.TryParse(MonthlyCostTextBox.Text, System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var editCost) && editCost >= 0) + ServerConnection.MonthlyCostUsd = editCost; } else { + decimal monthlyCost = 0m; + if (decimal.TryParse(MonthlyCostTextBox.Text, System.Globalization.NumberStyles.Any, System.Globalization.CultureInfo.InvariantCulture, out var newCost) && newCost >= 0) + monthlyCost = newCost; + ServerConnection = new ServerConnection { DisplayName = displayName, @@ -341,7 +348,8 @@ private async void Save_Click(object sender, RoutedEventArgs e) CreatedDate = DateTime.Now, LastConnected = DateTime.Now, EncryptMode = GetSelectedEncryptMode(), - TrustServerCertificate = TrustServerCertificateCheckBox.IsChecked == true + TrustServerCertificate = TrustServerCertificateCheckBox.IsChecked == true, + MonthlyCostUsd = monthlyCost }; } diff --git a/Dashboard/Analysis/AnalysisModels.cs b/Dashboard/Analysis/AnalysisModels.cs new file mode 100644 index 00000000..76718852 --- /dev/null +++ b/Dashboard/Analysis/AnalysisModels.cs @@ -0,0 +1,152 @@ +using System; +using System.Collections.Generic; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// A scored observation from collected data. +/// +public class Fact +{ + public string Source { get; set; } = string.Empty; + public string Key { get; set; } = string.Empty; + public double Value { get; set; } + public double BaseSeverity { get; set; } + public double Severity { get; set; } + public int ServerId { get; set; } + public string? 
DatabaseName { get; set; } + + /// + /// Raw metric values for analysis and audit trail. + /// Keys are metric-specific (e.g., "wait_time_ms", "waiting_tasks_count"). + /// + public Dictionary Metadata { get; set; } = []; + + /// + /// Amplifiers that were evaluated for this fact. + /// + public List AmplifierResults { get; set; } = []; +} + +/// +/// Result of evaluating a single amplifier against the fact set. +/// +public class AmplifierResult +{ + public string Description { get; set; } = string.Empty; + public bool Matched { get; set; } + public double Boost { get; set; } +} + +/// +/// A conditional edge in the relationship graph. +/// +public class Edge +{ + public string Source { get; set; } = string.Empty; + public string Destination { get; set; } = string.Empty; + public string Category { get; set; } = string.Empty; + public string PredicateDescription { get; set; } = string.Empty; + + /// + /// Evaluates whether this edge should be followed given the current fact set. + /// + public Func, bool> Predicate { get; set; } = _ => false; +} + +/// +/// A complete analysis story — the path from root symptom to leaf recommendation. +/// +public class AnalysisStory +{ + public string RootFactKey { get; set; } = string.Empty; + public double RootFactValue { get; set; } + public double Severity { get; set; } + public double Confidence { get; set; } + public string Category { get; set; } = string.Empty; + public List Path { get; set; } = []; + public string StoryPath { get; set; } = string.Empty; + public string StoryPathHash { get; set; } = string.Empty; + public string StoryText { get; set; } = string.Empty; + public string? LeafFactKey { get; set; } + public double? LeafFactValue { get; set; } + public int FactCount { get; set; } + public bool IsAbsolution { get; set; } +} + +/// +/// A persisted finding from a previous analysis run. +/// Maps to the analysis_findings DuckDB table. +/// +public class AnalysisFinding +{ + public long FindingId { get; set; } + public DateTime AnalysisTime { get; set; } + public int ServerId { get; set; } + public string ServerName { get; set; } = string.Empty; + public string? DatabaseName { get; set; } + public DateTime? TimeRangeStart { get; set; } + public DateTime? TimeRangeEnd { get; set; } + public double Severity { get; set; } + public double Confidence { get; set; } + public string Category { get; set; } = string.Empty; + public string StoryPath { get; set; } = string.Empty; + public string StoryPathHash { get; set; } = string.Empty; + public string StoryText { get; set; } = string.Empty; + public string RootFactKey { get; set; } = string.Empty; + public double? RootFactValue { get; set; } + public string? LeafFactKey { get; set; } + public double? LeafFactValue { get; set; } + public int FactCount { get; set; } + + /// + /// Drill-down data collected after graph traversal. Ephemeral — not persisted to DuckDB. + /// Contains supporting detail keyed by category (e.g., "top_deadlocks", "queries_at_spike"). + /// + public Dictionary? DrillDown { get; set; } +} + +/// +/// A muted finding pattern. Maps to the analysis_muted DuckDB table. +/// +public class AnalysisMuted +{ + public long MuteId { get; set; } + public int? ServerId { get; set; } + public string? DatabaseName { get; set; } + public string StoryPathHash { get; set; } = string.Empty; + public string StoryPath { get; set; } = string.Empty; + public DateTime MutedDate { get; set; } + public string? Reason { get; set; } +} + +/// +/// A user-configured exclusion filter. 
Maps to the analysis_exclusions DuckDB table. +/// +public class AnalysisExclusion +{ + public long ExclusionId { get; set; } + public string ExclusionType { get; set; } = string.Empty; + public string ExclusionValue { get; set; } = string.Empty; + public int? ServerId { get; set; } + public string? DatabaseName { get; set; } + public bool IsEnabled { get; set; } = true; + public DateTime CreatedDate { get; set; } + public string? Description { get; set; } +} + +/// +/// A severity threshold value. Maps to the analysis_thresholds DuckDB table. +/// +public class AnalysisThreshold +{ + public long ThresholdId { get; set; } + public string Category { get; set; } = string.Empty; + public string FactKey { get; set; } = string.Empty; + public string ThresholdType { get; set; } = string.Empty; + public double ThresholdValue { get; set; } + public int? ServerId { get; set; } + public string? DatabaseName { get; set; } + public bool IsEnabled { get; set; } = true; + public DateTime ModifiedDate { get; set; } +} diff --git a/Dashboard/Analysis/AnalysisService.cs b/Dashboard/Analysis/AnalysisService.cs new file mode 100644 index 00000000..a0d7b2fc --- /dev/null +++ b/Dashboard/Analysis/AnalysisService.cs @@ -0,0 +1,323 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.Data.SqlClient; +using PerformanceMonitorDashboard.Helpers; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Orchestrates the full analysis pipeline: collect -> score -> traverse -> persist. +/// Can be run on-demand or on a timer. Each run analyzes a single server's data +/// for a given time window and persists the findings. +/// Port of Lite's AnalysisService — uses SQL Server instead of DuckDB. +/// +public class AnalysisService +{ + private readonly string _connectionString; + private readonly SqlServerFindingStore _findingStore; + private readonly SqlServerFactCollector _collector; + private readonly FactScorer _scorer; + private readonly RelationshipGraph _graph; + private readonly InferenceEngine _engine; + private readonly SqlServerDrillDownCollector _drillDown; + private readonly SqlServerAnomalyDetector _anomalyDetector; + + /// + /// Minimum hours of collected data required before analysis will run. + /// Short collection windows distort fraction-of-period calculations -- + /// 5 seconds of THREADPOOL looks alarming in a 16-minute window. + /// + internal double MinimumDataHours { get; set; } = 72; + + /// + /// Raised after each analysis run completes, providing the findings for UI display. + /// + public event EventHandler? AnalysisCompleted; + + /// + /// Whether an analysis is currently running. + /// + public bool IsAnalyzing { get; private set; } + + /// + /// Time of the last completed analysis run. + /// + public DateTime? LastAnalysisTime { get; private set; } + + /// + /// Set after AnalyzeAsync if insufficient data was found. Null if enough data exists. + /// + public string? InsufficientDataMessage { get; private set; } + + public AnalysisService(string connectionString, IPlanFetcher? 
planFetcher = null) + { + _connectionString = connectionString; + _findingStore = new SqlServerFindingStore(connectionString); + _collector = new SqlServerFactCollector(connectionString); + _scorer = new FactScorer(); + _graph = new RelationshipGraph(); + _engine = new InferenceEngine(_graph); + _drillDown = new SqlServerDrillDownCollector(connectionString, planFetcher); + _anomalyDetector = new SqlServerAnomalyDetector(connectionString); + } + + /// + /// Runs the full analysis pipeline for a server. + /// Default time range is the last 4 hours. + /// + public async Task> AnalyzeAsync(int serverId, string serverName, int hoursBack = 4) + { + var timeRangeEnd = DateTime.UtcNow; + var timeRangeStart = timeRangeEnd.AddHours(-hoursBack); + + var context = new AnalysisContext + { + ServerId = serverId, + ServerName = serverName, + TimeRangeStart = timeRangeStart, + TimeRangeEnd = timeRangeEnd + }; + + return await AnalyzeAsync(context); + } + + /// + /// Runs the full analysis pipeline with a specific context. + /// + public async Task> AnalyzeAsync(AnalysisContext context) + { + if (IsAnalyzing) + return []; + + IsAnalyzing = true; + InsufficientDataMessage = null; + + try + { + // 0. Check minimum data span -- total history, not the analysis window. + // A server with 100h of total history can be analyzed over a 4h window. + var dataSpanHours = await GetTotalDataSpanHoursAsync(); + if (dataSpanHours < MinimumDataHours) + { + var needed = MinimumDataHours >= 24 + ? $"{MinimumDataHours / 24:F1} days" + : $"{MinimumDataHours:F0} hours"; + var have = dataSpanHours >= 24 + ? $"{dataSpanHours / 24:F1} days" + : $"{dataSpanHours:F1} hours"; + + InsufficientDataMessage = + $"Not enough data for reliable analysis. Need {needed} of collected data, " + + $"have {have}. Keep the collector running and try again later."; + + Logger.Info( + $"[AnalysisService] Skipping analysis for {context.ServerName}: {dataSpanHours:F1}h data, need {MinimumDataHours}h"); + + LastAnalysisTime = DateTime.UtcNow; + return []; + } + + // 1. Collect facts from SQL Server + var facts = await _collector.CollectFactsAsync(context); + + if (facts.Count == 0) + { + LastAnalysisTime = DateTime.UtcNow; + return []; + } + + // 1.5. Detect anomalies (compare analysis window against baseline) + var anomalies = await _anomalyDetector.DetectAnomaliesAsync(context); + facts.AddRange(anomalies); + + // 2. Score facts (base severity + amplifiers) + _scorer.ScoreAll(facts); + + // 3. Build stories via graph traversal + var stories = _engine.BuildStories(facts); + + // 4. Persist findings (filtering out muted) + var findings = await _findingStore.SaveFindingsAsync(stories, context); + + // 5. Enrich findings with drill-down data (ephemeral, not persisted) + await _drillDown.EnrichFindingsAsync(findings, context); + + LastAnalysisTime = DateTime.UtcNow; + + // 6. Notify listeners + AnalysisCompleted?.Invoke(this, new AnalysisCompletedEventArgs + { + ServerId = context.ServerId, + ServerName = context.ServerName, + Findings = findings, + AnalysisTime = LastAnalysisTime.Value + }); + + Logger.Info( + $"[AnalysisService] Analysis complete for {context.ServerName}: {findings.Count} finding(s), " + + $"highest severity {(findings.Count > 0 ? 
findings.Max(f => f.Severity) : 0):F2}"); + + return findings; + } + catch (Exception ex) + { + Logger.Error($"[AnalysisService] Analysis failed for {context.ServerName}: {ex.Message}"); + return []; + } + finally + { + IsAnalyzing = false; + } + } + + /// + /// Runs the collect + score pipeline without graph traversal. + /// Returns raw scored facts with amplifier details for direct inspection. + /// + public async Task> CollectAndScoreFactsAsync(int serverId, string serverName, int hoursBack = 4) + { + var timeRangeEnd = DateTime.UtcNow; + var timeRangeStart = timeRangeEnd.AddHours(-hoursBack); + + var context = new AnalysisContext + { + ServerId = serverId, + ServerName = serverName, + TimeRangeStart = timeRangeStart, + TimeRangeEnd = timeRangeEnd + }; + + try + { + var facts = await _collector.CollectFactsAsync(context); + if (facts.Count == 0) return facts; + _scorer.ScoreAll(facts); + return facts; + } + catch (Exception ex) + { + Logger.Error($"[AnalysisService] Fact collection failed for {serverName}: {ex.Message}"); + return []; + } + } + + /// + /// Compares analysis of two time periods, returning facts from both for comparison. + /// + public async Task<(List BaselineFacts, List ComparisonFacts)> ComparePeriodsAsync( + int serverId, string serverName, + DateTime baselineStart, DateTime baselineEnd, + DateTime comparisonStart, DateTime comparisonEnd) + { + var baselineContext = new AnalysisContext + { + ServerId = serverId, + ServerName = serverName, + TimeRangeStart = baselineStart, + TimeRangeEnd = baselineEnd + }; + + var comparisonContext = new AnalysisContext + { + ServerId = serverId, + ServerName = serverName, + TimeRangeStart = comparisonStart, + TimeRangeEnd = comparisonEnd + }; + + try + { + var baselineFacts = await _collector.CollectFactsAsync(baselineContext); + var comparisonFacts = await _collector.CollectFactsAsync(comparisonContext); + + _scorer.ScoreAll(baselineFacts); + _scorer.ScoreAll(comparisonFacts); + + return (baselineFacts, comparisonFacts); + } + catch (Exception ex) + { + Logger.Error($"[AnalysisService] Period comparison failed for {serverName}: {ex.Message}"); + return ([], []); + } + } + + /// + /// Gets the latest findings for a server without running a new analysis. + /// + public async Task> GetLatestFindingsAsync(int serverId) + { + return await _findingStore.GetLatestFindingsAsync(serverId); + } + + /// + /// Gets recent findings for a server within the given time range. + /// + public async Task> GetRecentFindingsAsync(int serverId, int hoursBack = 24) + { + return await _findingStore.GetRecentFindingsAsync(serverId, hoursBack); + } + + /// + /// Mutes a finding pattern so it won't appear in future runs. + /// + public async Task MuteFindingAsync(AnalysisFinding finding, string? reason = null) + { + await _findingStore.MuteStoryAsync( + finding.ServerId, finding.StoryPathHash, finding.StoryPath, reason); + } + + /// + /// Cleans up old findings beyond the retention period. + /// + public async Task CleanupAsync(int retentionDays = 30) + { + await _findingStore.CleanupOldFindingsAsync(retentionDays); + } + + /// + /// Returns the total span of collected data (no time range filter). + /// This answers "has this server been monitored long enough?" -- separate from + /// the analysis window. A server with 100 hours of total history can safely + /// be analyzed over a 4-hour window without dilution. + /// Dashboard monitors one server per database, so no server_id filtering. 
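+    /// Illustrative outcome (hypothetical values): with only 48 hours of collected history,
+    /// dataSpanHours = 48 is below the default MinimumDataHours of 72, so AnalyzeAsync sets
+    /// InsufficientDataMessage ("Need 3.0 days of collected data, have 2.0 days...") and
+    /// returns an empty list; once the total span passes 72 hours, a 4-hour analysis window runs normally.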
+ /// + private async Task GetTotalDataSpanHoursAsync() + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT DATEDIFF(SECOND, MIN(collection_time), MAX(collection_time)) / 3600.0 +FROM collect.wait_stats;"; + + var result = await cmd.ExecuteScalarAsync(); + if (result == null || result is DBNull) + return 0; + + return Convert.ToDouble(result); + } + catch + { + return 0; + } + } +} + +/// +/// Event args for when an analysis run completes. +/// +public class AnalysisCompletedEventArgs : EventArgs +{ + public int ServerId { get; set; } + public string ServerName { get; set; } = string.Empty; + public List Findings { get; set; } = []; + public DateTime AnalysisTime { get; set; } +} diff --git a/Dashboard/Analysis/FactScorer.cs b/Dashboard/Analysis/FactScorer.cs new file mode 100644 index 00000000..82382989 --- /dev/null +++ b/Dashboard/Analysis/FactScorer.cs @@ -0,0 +1,867 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Assigns severity to facts using threshold formulas (Layer 1) +/// and contextual amplifiers (Layer 2). +/// +/// Layer 1: Base severity 0.0-1.0 from thresholds alone. +/// Layer 2: Amplifiers multiply base up to 2.0 max using corroborating facts. +/// +/// Formula: severity = min(base * (1.0 + sum(amplifiers)), 2.0) +/// +public class FactScorer +{ + /// + /// Scores all facts: Layer 1 (base severity), then Layer 2 (amplifiers). + /// + public void ScoreAll(List facts) + { + // Layer 1: base severity from thresholds + foreach (var fact in facts) + { + fact.BaseSeverity = fact.Source switch + { + "waits" => ScoreWaitFact(fact), + "blocking" => ScoreBlockingFact(fact), + "cpu" => ScoreCpuFact(fact), + "io" => ScoreIoFact(fact), + "tempdb" => ScoreTempDbFact(fact), + "memory" => ScoreMemoryFact(fact), + "queries" => ScoreQueryFact(fact), + "perfmon" => ScorePerfmonFact(fact), + "database_config" => ScoreDatabaseConfigFact(fact), + "jobs" => ScoreJobFact(fact), + "disk" => ScoreDiskFact(fact), + "bad_actor" => ScoreBadActorFact(fact), + "anomaly" => ScoreAnomalyFact(fact), + _ => 0.0 + }; + } + + // Build lookup for amplifier evaluation (include context facts that amplifiers reference) + var contextSources = new HashSet + { "config", "cpu", "io", "tempdb", "memory", "queries", "perfmon", + "database_config", "jobs", "sessions", "disk", "bad_actor", "anomaly" }; + var factsByKey = facts + .Where(f => f.BaseSeverity > 0 || contextSources.Contains(f.Source)) + .ToDictionary(f => f.Key, f => f); + + // Layer 2: amplifiers boost base severity using corroborating facts + foreach (var fact in facts) + { + if (fact.BaseSeverity <= 0) + { + fact.Severity = 0; + continue; + } + + var amplifiers = GetAmplifiers(fact); + var totalBoost = 0.0; + + foreach (var amp in amplifiers) + { + var matched = amp.Predicate(factsByKey); + fact.AmplifierResults.Add(new AmplifierResult + { + Description = amp.Description, + Matched = matched, + Boost = matched ? amp.Boost : 0.0 + }); + + if (matched) totalBoost += amp.Boost; + } + + fact.Severity = Math.Min(fact.BaseSeverity * (1.0 + totalBoost), 2.0); + } + } + + /// + /// Scores a wait fact using the fraction-of-period formula. + /// Some waits have absolute minimum thresholds to filter out background noise. 
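+    /// Worked example (hypothetical numbers, applying the class-level formula with the
+    /// SOS_SCHEDULER_YIELD threshold of (0.75, null) from GetWaitThresholds):
+    ///   fraction 0.60 of the period  →  base = min(0.60 / 0.75, 1.0) = 0.80
+    ///   amplifiers matched: CXPACKET at 10%+ of period (+0.2), CPU_SQL_PERCENT at 80+ (+0.3)
+    ///   severity = min(0.80 * (1.0 + 0.5), 2.0) = 1.20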
+ /// + private static double ScoreWaitFact(Fact fact) + { + var fraction = fact.Value; + if (fraction <= 0) return 0.0; + + // THREADPOOL: require both meaningful total wait time AND meaningful average. + // Tiny amounts are normal thread pool grow/shrink housekeeping, not exhaustion. + if (fact.Key == "THREADPOOL") + { + var waitTimeMs = fact.Metadata.GetValueOrDefault("wait_time_ms"); + var avgMs = fact.Metadata.GetValueOrDefault("avg_ms_per_wait"); + if (waitTimeMs < 3_600_000 || avgMs < 1_000) return 0.0; + } + + var thresholds = GetWaitThresholds(fact.Key); + if (thresholds == null) return 0.0; + + return ApplyThresholdFormula(fraction, thresholds.Value.concerning, thresholds.Value.critical); + } + + /// + /// Scores blocking/deadlock facts using events-per-hour thresholds. + /// + private static double ScoreBlockingFact(Fact fact) + { + var value = fact.Value; // events per hour + if (value <= 0) return 0.0; + + return fact.Key switch + { + // Blocking: concerning >10/hr, critical >50/hr + "BLOCKING_EVENTS" => ApplyThresholdFormula(value, 10, 50), + // Deadlocks: concerning >5/hr (no critical — any sustained deadlocking is bad) + "DEADLOCKS" => ApplyThresholdFormula(value, 5, null), + _ => 0.0 + }; + } + + /// + /// Scores CPU utilization. Value is average SQL CPU %. + /// + private static double ScoreCpuFact(Fact fact) + { + return fact.Key switch + { + // CPU %: concerning at 75%, critical at 95% + "CPU_SQL_PERCENT" => ApplyThresholdFormula(fact.Value, 75, 95), + // CPU spike: value is max CPU %. Concerning at 80%, critical at 95%. + // Only emitted when max is significantly above average (bursty). + "CPU_SPIKE" => ApplyThresholdFormula(fact.Value, 80, 95), + _ => 0.0 + }; + } + + /// + /// Scores I/O latency facts. Value is average latency in ms. + /// + private static double ScoreIoFact(Fact fact) + { + return fact.Key switch + { + // Read latency: concerning at 20ms, critical at 50ms + "IO_READ_LATENCY_MS" => ApplyThresholdFormula(fact.Value, 20, 50), + // Write latency: concerning at 10ms, critical at 30ms + "IO_WRITE_LATENCY_MS" => ApplyThresholdFormula(fact.Value, 10, 30), + _ => 0.0 + }; + } + + /// + /// Scores TempDB usage. Value is usage fraction (reserved / total space). + /// + private static double ScoreTempDbFact(Fact fact) + { + return fact.Key switch + { + // TempDB usage: concerning at 75%, critical at 90% + "TEMPDB_USAGE" => ApplyThresholdFormula(fact.Value, 0.75, 0.90), + _ => 0.0 + }; + } + + /// + /// Scores memory grant facts. Only MEMORY_GRANT_PENDING (from resource semaphore) for now. + /// + private static double ScoreMemoryFact(Fact fact) + { + return fact.Key switch + { + // Grant waiters: concerning at 1, critical at 5 + "MEMORY_GRANT_PENDING" => ApplyThresholdFormula(fact.Value, 1, 5), + _ => 0.0 + }; + } + + /// + /// Scores query-level aggregate facts. + /// + private static double ScoreQueryFact(Fact fact) + { + return fact.Key switch + { + // Spills: concerning at 100, critical at 1000 in the period + "QUERY_SPILLS" => ApplyThresholdFormula(fact.Value, 100, 1000), + // High DOP queries: concerning at 5, critical at 20 in the period + "QUERY_HIGH_DOP" => ApplyThresholdFormula(fact.Value, 5, 20), + _ => 0.0 + }; + } + + /// + /// Scores perfmon counter facts. PLE is the classic memory pressure indicator. + /// + private static double ScorePerfmonFact(Fact fact) + { + return fact.Key switch + { + // PLE: lower is worse. 
Invert: concerning < 300, critical < 60 + "PERFMON_PLE" when fact.Value <= 0 => 1.0, + "PERFMON_PLE" when fact.Value < 60 => 1.0, + "PERFMON_PLE" when fact.Value < 300 => 0.5 + 0.5 * (300 - fact.Value) / 240, + "PERFMON_PLE" => 0.0, + _ => 0.0 + }; + } + + /// + /// Scores database configuration facts. + /// Auto-shrink and auto-close are always bad. + /// RCSI-off gets a low base that only becomes visible through amplifiers + /// when reader/writer lock contention (LCK_M_S, LCK_M_IS) is present. + /// + private static double ScoreDatabaseConfigFact(Fact fact) + { + if (fact.Key != "DB_CONFIG") return 0.0; + + var autoShrink = fact.Metadata.GetValueOrDefault("auto_shrink_on_count"); + var autoClose = fact.Metadata.GetValueOrDefault("auto_close_on_count"); + var pageVerifyBad = fact.Metadata.GetValueOrDefault("page_verify_not_checksum_count"); + var rcsiOff = fact.Metadata.GetValueOrDefault("rcsi_off_count"); + + var score = 0.0; + + // Auto-shrink, auto-close, bad page verify are always concerning + if (autoShrink > 0 || autoClose > 0 || pageVerifyBad > 0) + score = Math.Max(score, Math.Min((autoShrink + autoClose + pageVerifyBad) * 0.3, 1.0)); + + // RCSI-off: low base (0.3) — below display threshold alone. + // Amplifiers for LCK_M_S/LCK_M_IS push it above 0.5 when reader/writer + // contention confirms RCSI would help. + if (rcsiOff > 0) + score = Math.Max(score, 0.3); + + return score; + } + + /// + /// Scores running job facts. Long-running jobs are a signal. + /// + private static double ScoreJobFact(Fact fact) + { + return fact.Key switch + { + // Long-running jobs: concerning at 1, critical at 3 + "RUNNING_JOBS" => ApplyThresholdFormula(fact.Value, 1, 3), + _ => 0.0 + }; + } + + /// + /// Scores disk space facts. Low free space is critical. + /// + private static double ScoreDiskFact(Fact fact) + { + if (fact.Key != "DISK_SPACE") return 0.0; + + var freePct = fact.Value; + // Invert: lower free space is worse. Critical < 5%, concerning < 10% + if (freePct < 0.05) return 1.0; + if (freePct < 0.10) return 0.5 + 0.5 * (0.10 - freePct) / 0.05; + if (freePct < 0.20) return 0.5 * (0.20 - freePct) / 0.10; + return 0.0; + } + + /// + /// Scores bad actor queries using execution count tier x per-execution impact. + /// A query running 100K times at 1ms CPU is different from 100K times at 5s CPU. + /// The tier gets it in the door, per-execution impact determines how bad it is. + /// + private static double ScoreBadActorFact(Fact fact) + { + var execCount = fact.Metadata.GetValueOrDefault("execution_count"); + var avgCpuMs = fact.Metadata.GetValueOrDefault("avg_cpu_ms"); + var avgReads = fact.Metadata.GetValueOrDefault("avg_reads"); + + // Execution count tier base — higher tiers for more frequent queries + var tierBase = execCount switch + { + < 1_000 => 0.5, + < 10_000 => 0.7, + < 100_000 => 0.85, + _ => 1.0 + }; + + // Per-execution impact: use the worse of CPU or reads + // CPU: concerning at 50ms, critical at 2000ms + var cpuImpact = ApplyThresholdFormula(avgCpuMs, 50, 2000); + // Reads: concerning at 5K, critical at 250K + var readsImpact = ApplyThresholdFormula(avgReads, 5_000, 250_000); + + var impact = Math.Max(cpuImpact, readsImpact); + + // Final: tier * impact. Both must be meaningful. + // A high-frequency query with trivial per-execution cost won't score. + // A heavy query that only runs once won't score high either. + return tierBase * impact; + } + + /// + /// Scores anomaly facts based on deviation from baseline. + /// At 2σ → 0.5, at 4σ → 1.0. 
Higher deviations are more severe. + /// For count-based anomalies (blocking/deadlock spikes), uses ratio instead. + /// + private static double ScoreAnomalyFact(Fact fact) + { + if (fact.Key.StartsWith("ANOMALY_CPU_SPIKE") || fact.Key.StartsWith("ANOMALY_READ_LATENCY") + || fact.Key.StartsWith("ANOMALY_WRITE_LATENCY")) + { + // Deviation-based scoring: 2σ = 0.5, 4σ = 1.0 + var deviation = fact.Metadata.GetValueOrDefault("deviation_sigma"); + var confidence = fact.Metadata.GetValueOrDefault("confidence", 1.0); + if (deviation < 2.0) return 0.0; + var base_score = 0.5 + 0.5 * Math.Min((deviation - 2.0) / 2.0, 1.0); + return base_score * confidence; + } + + if (fact.Key.StartsWith("ANOMALY_WAIT_")) + { + // Ratio-based scoring: 5x = 0.5, 20x = 1.0 + var ratio = fact.Metadata.GetValueOrDefault("ratio"); + if (ratio < 5) return 0.0; + return 0.5 + 0.5 * Math.Min((ratio - 5.0) / 15.0, 1.0); + } + + if (fact.Key.StartsWith("ANOMALY_BLOCKING_SPIKE") || fact.Key.StartsWith("ANOMALY_DEADLOCK_SPIKE")) + { + // Ratio-based: 3x = 0.5, 10x = 1.0 + var ratio = fact.Metadata.GetValueOrDefault("ratio"); + if (ratio < 3) return 0.0; + return 0.5 + 0.5 * Math.Min((ratio - 3.0) / 7.0, 1.0); + } + + return 0.0; + } + + /// + /// Generic threshold formula used by waits, latency, and count-based metrics. + /// Critical == null means "concerning only" — hitting concerning = 1.0. + /// + internal static double ApplyThresholdFormula(double value, double concerning, double? critical) + { + if (value <= 0) return 0.0; + + if (critical == null) + return Math.Min(value / concerning, 1.0); + + if (value >= critical.Value) + return 1.0; + + if (value >= concerning) + return 0.5 + 0.5 * (value - concerning) / (critical.Value - concerning); + + return 0.5 * (value / concerning); + } + + /// + /// Returns amplifier definitions for a fact. Each amplifier has a description, + /// a boost value, and a predicate that evaluates against the current fact set. + /// Amplifiers are defined per wait type and will grow as more fact categories are added. + /// + private static List GetAmplifiers(Fact fact) + { + return fact.Key switch + { + "SOS_SCHEDULER_YIELD" => SosSchedulerYieldAmplifiers(), + "CXPACKET" => CxPacketAmplifiers(), + "THREADPOOL" => ThreadpoolAmplifiers(), + "PAGEIOLATCH_SH" or "PAGEIOLATCH_EX" => PageiolatchAmplifiers(), + "LATCH_EX" or "LATCH_SH" => LatchAmplifiers(), + "BLOCKING_EVENTS" => BlockingEventsAmplifiers(), + "DEADLOCKS" => DeadlockAmplifiers(), + "LCK" => LckAmplifiers(), + "CPU_SQL_PERCENT" => CpuSqlPercentAmplifiers(), + "CPU_SPIKE" => CpuSpikeAmplifiers(), + "IO_READ_LATENCY_MS" => IoReadLatencyAmplifiers(), + "IO_WRITE_LATENCY_MS" => IoWriteLatencyAmplifiers(), + "MEMORY_GRANT_PENDING" => MemoryGrantAmplifiers(), + "QUERY_SPILLS" => QuerySpillAmplifiers(), + "PERFMON_PLE" => PleAmplifiers(), + "DB_CONFIG" => DbConfigAmplifiers(), + "DISK_SPACE" => DiskSpaceAmplifiers(), + _ => [] + }; + } + + /// + /// SOS_SCHEDULER_YIELD: CPU starvation confirmed by parallelism waits. + /// More amplifiers added when config and CPU utilization facts are available. 
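+    /// (Illustrative mapping for the ApplyThresholdFormula helper above, using hypothetical
+    /// values with concerning = 20 and critical = 50, e.g. read latency in ms:
+    ///   value 10 → 0.5 * (10 / 20) = 0.25
+    ///   value 35 → 0.5 + 0.5 * (35 - 20) / (50 - 20) = 0.75
+    ///   value 60 → 1.0, since it is at or past critical.)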
+ /// + private static List SosSchedulerYieldAmplifiers() => + [ + new() + { + Description = "CXPACKET significant — parallelism consuming schedulers", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10) + }, + new() + { + Description = "THREADPOOL waits present — escalating to thread exhaustion", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("THREADPOOL") && facts["THREADPOOL"].BaseSeverity > 0 + }, + new() + { + Description = "SQL Server CPU > 80% — confirmed CPU saturation", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("CPU_SQL_PERCENT", out var cpu) && cpu.Value >= 80 + } + ]; + + /// + /// CXPACKET: parallelism waits confirmed by CPU pressure and bad config. + /// CXCONSUMER is grouped into CXPACKET by the collector. + /// + private static List CxPacketAmplifiers() => + [ + new() + { + Description = "SOS_SCHEDULER_YIELD high — CPU starvation from parallelism", + Boost = 0.3, + Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.25) + }, + new() + { + Description = "THREADPOOL waits present — thread exhaustion cascade", + Boost = 0.4, + Predicate = facts => facts.ContainsKey("THREADPOOL") && facts["THREADPOOL"].BaseSeverity > 0 + }, + new() + { + Description = "CTFP at default (5) — too low for most workloads", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("CONFIG_CTFP", out var ctfp) && ctfp.Value <= 5 + }, + new() + { + Description = "MAXDOP at 0 — unlimited parallelism", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("CONFIG_MAXDOP", out var maxdop) && maxdop.Value == 0 + }, + new() + { + Description = "Queries running with DOP > 8 — excessive parallelism confirmed", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("QUERY_HIGH_DOP", out var dop) && dop.BaseSeverity > 0 + } + ]; + + /// + /// THREADPOOL: thread exhaustion confirmed by parallelism pressure. + /// Blocking and config amplifiers added later. + /// + private static List ThreadpoolAmplifiers() => + [ + new() + { + Description = "CXPACKET significant — parallel queries consuming thread pool", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10) + }, + new() + { + Description = "Lock contention present — blocked queries holding worker threads", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("LCK") && facts["LCK"].BaseSeverity >= 0.5 + } + ]; + + /// + /// PAGEIOLATCH: memory pressure confirmed by other waits. + /// Buffer pool, query, and config amplifiers added when those facts are available. + /// + private static List PageiolatchAmplifiers() => + [ + new() + { + Description = "SOS_SCHEDULER_YIELD elevated — CPU pressure alongside I/O pressure", + Boost = 0.1, + Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.15) + }, + new() + { + Description = "Read latency > 20ms — confirmed disk I/O bottleneck", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("IO_READ_LATENCY_MS", out var io) && io.Value >= 20 + }, + new() + { + Description = "Memory grant waiters present — grants competing with buffer pool", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("MEMORY_GRANT_PENDING", out var mg) && mg.Value >= 1 + } + ]; + + /// + /// LATCH_EX/LATCH_SH: in-memory page latch contention. + /// Common causes: TempDB allocation contention, hot page updates, + /// parallel insert into heaps or narrow indexes. 
+ /// + private static List LatchAmplifiers() => + [ + new() + { + Description = "TempDB usage elevated — latch contention likely on TempDB allocation pages", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("TEMPDB_USAGE", out var t) && t.BaseSeverity > 0 + }, + new() + { + Description = "CXPACKET significant — parallel operations amplifying latch contention", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10) + }, + new() + { + Description = "SOS_SCHEDULER_YIELD elevated — latch spinning contributing to CPU pressure", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.15) + } + ]; + + /// + /// BLOCKING_EVENTS: blocking confirmed by lock waits and deadlocks. + /// + private static List BlockingEventsAmplifiers() => + [ + new() + { + Description = "Head blocker sleeping with open transaction — abandoned transaction pattern", + Boost = 0.4, + Predicate = facts => facts.TryGetValue("BLOCKING_EVENTS", out var f) + && f.Metadata.GetValueOrDefault("sleeping_blocker_count") > 0 + }, + new() + { + Description = "Lock contention waits elevated — blocking visible in wait stats", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("LCK") && facts["LCK"].BaseSeverity >= 0.3 + }, + new() + { + Description = "Deadlocks also present — blocking escalating to deadlocks", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0 + } + ]; + + /// + /// DEADLOCKS: deadlocks confirmed by blocking patterns. + /// + private static List DeadlockAmplifiers() => + [ + new() + { + Description = "Blocking events also present — systemic contention pattern", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0 + }, + new() + { + Description = "Reader/writer lock waits present — RCSI could prevent some deadlocks", + Boost = 0.3, + Predicate = facts => (facts.ContainsKey("LCK_M_S") && facts["LCK_M_S"].BaseSeverity > 0) + || (facts.ContainsKey("LCK_M_IS") && facts["LCK_M_IS"].BaseSeverity > 0) + }, + new() + { + Description = "Databases without RCSI — reader/writer isolation amplifying deadlocks", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db) && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0 + } + ]; + + /// + /// LCK (grouped general lock contention): confirmed by blocking reports and deadlocks. + /// + private static List LckAmplifiers() => + [ + new() + { + Description = "Blocked process reports present — confirmed blocking events", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0 + }, + new() + { + Description = "Deadlocks present — lock contention escalating to deadlocks", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0 + }, + new() + { + Description = "THREADPOOL waits present — blocking causing thread exhaustion", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("THREADPOOL") && facts["THREADPOOL"].BaseSeverity > 0 + } + ]; + + /// + /// PLE: memory pressure confirmed by PAGEIOLATCH and RESOURCE_SEMAPHORE. 
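+    /// Illustrative combination (hypothetical numbers): a PLE of 180 seconds scores a base of
+    /// 0.5 + 0.5 * (300 - 180) / 240 = 0.75 in ScorePerfmonFact; if PAGEIOLATCH_SH also covers
+    /// 10%+ of the period, the first amplifier below adds +0.3, giving min(0.75 * 1.3, 2.0) ≈ 0.98.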
+ /// + private static List PleAmplifiers() => + [ + new() + { + Description = "PAGEIOLATCH waits present — buffer pool misses confirm memory pressure", + Boost = 0.3, + Predicate = facts => HasSignificantWait(facts, "PAGEIOLATCH_SH", 0.10) + || HasSignificantWait(facts, "PAGEIOLATCH_EX", 0.10) + }, + new() + { + Description = "RESOURCE_SEMAPHORE waits — memory grants competing with buffer pool", + Boost = 0.2, + Predicate = facts => facts.ContainsKey("RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].BaseSeverity > 0 + } + ]; + + /// + /// DB_CONFIG: database misconfiguration amplified by related symptoms. + /// RCSI-off amplifiers only fire when reader/writer lock contention is present — + /// LCK_M_S (shared lock waits) and LCK_M_IS (intent-shared) are readers blocked + /// by writers. RCSI eliminates these. Writer/writer conflicts (LCK_M_X, LCK_M_U) + /// are NOT helped by RCSI and should not trigger this amplifier. + /// + private static List DbConfigAmplifiers() => + [ + new() + { + Description = "I/O latency elevated — auto_shrink may be causing fragmentation and I/O pressure", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("IO_READ_LATENCY_MS", out var io) && io.BaseSeverity > 0 + }, + new() + { + Description = "LCK_M_S waits — readers blocked by writers, RCSI would eliminate shared lock waits", + Boost = 0.5, + Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db) + && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0 + && facts.TryGetValue("LCK_M_S", out var lckS) && lckS.BaseSeverity > 0 + }, + new() + { + Description = "LCK_M_IS waits — intent-shared locks blocked by writers, RCSI would eliminate these", + Boost = 0.4, + Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db) + && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0 + && facts.TryGetValue("LCK_M_IS", out var lckIS) && lckIS.BaseSeverity > 0 + }, + new() + { + Description = "Deadlocks with reader/writer lock waits — RCSI eliminates reader/writer deadlocks", + Boost = 0.4, + Predicate = facts => facts.TryGetValue("DB_CONFIG", out var db) + && db.Metadata.GetValueOrDefault("rcsi_off_count") > 0 + && facts.TryGetValue("DEADLOCKS", out var dl) && dl.BaseSeverity > 0 + && (facts.TryGetValue("LCK_M_S", out var s) && s.BaseSeverity > 0 + || facts.TryGetValue("LCK_M_IS", out var i) && i.BaseSeverity > 0) + } + ]; + + /// + /// DISK_SPACE: low disk space amplified by I/O activity and TempDB pressure. + /// + private static List DiskSpaceAmplifiers() => + [ + new() + { + Description = "TempDB usage elevated — growing TempDB on a nearly full volume", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("TEMPDB_USAGE", out var t) && t.BaseSeverity > 0 + }, + new() + { + Description = "Query spills present — spills to disk on a nearly full volume", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("QUERY_SPILLS", out var s) && s.BaseSeverity > 0 + } + ]; + + /// + /// CPU_SQL_PERCENT: CPU saturation confirmed by scheduler yields and parallelism. 
+ /// + private static List CpuSqlPercentAmplifiers() => + [ + new() + { + Description = "SOS_SCHEDULER_YIELD elevated — scheduler pressure confirms CPU saturation", + Boost = 0.3, + Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.25) + }, + new() + { + Description = "CXPACKET significant — parallelism contributing to CPU load", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10) + } + ]; + + /// + /// CPU_SPIKE: bursty CPU event (max >> average) confirmed by scheduler + /// pressure, parallelism, or query spills during the spike. + /// + private static List CpuSpikeAmplifiers() => + [ + new() + { + Description = "SOS_SCHEDULER_YIELD present — scheduler pressure during CPU spike", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].BaseSeverity > 0 + }, + new() + { + Description = "CXPACKET significant — parallelism contributing to CPU spike", + Boost = 0.2, + Predicate = facts => HasSignificantWait(facts, "CXPACKET", 0.10) + }, + new() + { + Description = "THREADPOOL waits present — CPU spike causing thread exhaustion", + Boost = 0.4, + Predicate = facts => facts.ContainsKey("THREADPOOL") && facts["THREADPOOL"].BaseSeverity > 0 + } + ]; + + /// + /// IO_READ_LATENCY_MS: read latency confirmed by PAGEIOLATCH waits. + /// + private static List IoReadLatencyAmplifiers() => + [ + new() + { + Description = "PAGEIOLATCH waits elevated — buffer pool misses confirm I/O pressure", + Boost = 0.3, + Predicate = facts => HasSignificantWait(facts, "PAGEIOLATCH_SH", 0.10) + || HasSignificantWait(facts, "PAGEIOLATCH_EX", 0.10) + } + ]; + + /// + /// IO_WRITE_LATENCY_MS: write latency confirmed by WRITELOG waits. + /// + private static List IoWriteLatencyAmplifiers() => + [ + new() + { + Description = "WRITELOG waits elevated — transaction log I/O bottleneck confirmed", + Boost = 0.3, + Predicate = facts => HasSignificantWait(facts, "WRITELOG", 0.05) + } + ]; + + /// + /// MEMORY_GRANT_PENDING: grant pressure confirmed by RESOURCE_SEMAPHORE waits and spills. + /// + private static List MemoryGrantAmplifiers() => + [ + new() + { + Description = "RESOURCE_SEMAPHORE waits present — memory grant pressure in wait stats", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].BaseSeverity > 0 + }, + new() + { + Description = "Query spills present — queries running with insufficient memory grants", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("QUERY_SPILLS", out var s) && s.BaseSeverity > 0 + } + ]; + + /// + /// QUERY_SPILLS: spills confirmed by memory grant pressure. + /// + private static List QuerySpillAmplifiers() => + [ + new() + { + Description = "Memory grant waiters present — insufficient memory for query grants", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("MEMORY_GRANT_PENDING", out var mg) && mg.Value >= 1 + }, + new() + { + Description = "RESOURCE_SEMAPHORE waits — grant pressure visible in wait stats", + Boost = 0.2, + Predicate = facts => facts.ContainsKey("RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].BaseSeverity > 0 + } + ]; + + /// + /// Checks if a wait type is present with at least the given fraction of period. + /// + private static bool HasSignificantWait(Dictionary facts, string waitType, double minFraction) + { + return facts.TryGetValue(waitType, out var fact) && fact.Value >= minFraction; + } + + /// + /// Default thresholds for wait types (fraction of examined period). 
+ /// Returns null for unrecognized waits — they get severity 0. + /// + private static (double concerning, double? critical)? GetWaitThresholds(string waitType) + { + return waitType switch + { + // CPU pressure + "SOS_SCHEDULER_YIELD" => (0.75, null), + "THREADPOOL" => (0.01, null), + + // Memory pressure + "PAGEIOLATCH_SH" => (0.25, null), + "PAGEIOLATCH_EX" => (0.25, null), + "RESOURCE_SEMAPHORE" => (0.01, null), + + // Parallelism (CXCONSUMER is grouped into CXPACKET by collector) + "CXPACKET" => (0.25, null), + + // Log I/O + "WRITELOG" => (0.10, null), + + // Lock waits — serializable/repeatable read lock modes + "LCK_M_RS_S" => (0.01, null), + "LCK_M_RS_U" => (0.01, null), + "LCK_M_RIn_NL" => (0.01, null), + "LCK_M_RIn_S" => (0.01, null), + "LCK_M_RIn_U" => (0.01, null), + "LCK_M_RIn_X" => (0.01, null), + "LCK_M_RX_S" => (0.01, null), + "LCK_M_RX_U" => (0.01, null), + "LCK_M_RX_X" => (0.01, null), + + // Reader/writer blocking locks + "LCK_M_S" => (0.05, null), + "LCK_M_IS" => (0.05, null), + + // General lock contention (grouped X, U, IX, SIX, BU, etc.) + "LCK" => (0.10, null), + + // Schema locks — DDL operations, index rebuilds + "SCH_M" => (0.01, null), + + // Latch contention — page latch (not I/O latch) indicates + // in-memory contention, often TempDB allocation or hot pages + "LATCH_EX" => (0.25, null), + "LATCH_SH" => (0.25, null), + + _ => null + }; + } +} + +/// +/// An amplifier definition: a named predicate that boosts severity when matched. +/// +internal class AmplifierDefinition +{ + public string Description { get; set; } = string.Empty; + public double Boost { get; set; } + public Func, bool> Predicate { get; set; } = _ => false; +} diff --git a/Dashboard/Analysis/IFactCollector.cs b/Dashboard/Analysis/IFactCollector.cs new file mode 100644 index 00000000..38b6abbe --- /dev/null +++ b/Dashboard/Analysis/IFactCollector.cs @@ -0,0 +1,31 @@ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Context for an analysis run — what server, what time range. +/// +public class AnalysisContext +{ + public int ServerId { get; set; } + public string ServerName { get; set; } = string.Empty; + public DateTime TimeRangeStart { get; set; } + public DateTime TimeRangeEnd { get; set; } + public List Exclusions { get; set; } = []; + + /// + /// Duration of the examined period in milliseconds. + /// + public double PeriodDurationMs => (TimeRangeEnd - TimeRangeStart).TotalMilliseconds; +} + +/// +/// Collects facts from a data source for analysis. +/// Implementations are per-app: DuckDB for Lite, SQL Server for Dashboard. +/// +public interface IFactCollector +{ + Task> CollectFactsAsync(AnalysisContext context); +} diff --git a/Dashboard/Analysis/IPlanFetcher.cs b/Dashboard/Analysis/IPlanFetcher.cs new file mode 100644 index 00000000..e77fea18 --- /dev/null +++ b/Dashboard/Analysis/IPlanFetcher.cs @@ -0,0 +1,19 @@ +using System.Threading.Tasks; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Fetches execution plan XML from SQL Server on demand. +/// Platform-agnostic interface — Lite implements via RemoteCollectorService's +/// SQL connection, Dashboard implements via DatabaseService's connection. +/// Used by DrillDownCollector to analyze plans for high-impact findings +/// without storing plan XML in DuckDB or SQL Server tables. +/// +public interface IPlanFetcher +{ + /// + /// Fetches the execution plan XML for a given plan_handle. 
+ /// Returns null if the plan is no longer in cache. + /// + Task FetchPlanXmlAsync(int serverId, string planHandle); +} diff --git a/Dashboard/Analysis/InferenceEngine.cs b/Dashboard/Analysis/InferenceEngine.cs new file mode 100644 index 00000000..976bef43 --- /dev/null +++ b/Dashboard/Analysis/InferenceEngine.cs @@ -0,0 +1,165 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Security.Cryptography; +using System.Text; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Greedy traversal engine that builds analysis stories from scored facts +/// and the relationship graph. +/// +/// Algorithm: +/// 1. Start at the highest-severity fact as entry point +/// 2. Evaluate all edge predicates from current node +/// 3. Follow edge to highest-severity destination (that hasn't been visited) +/// 4. Repeat until leaf (no active edges or all destinations visited) +/// 5. The path IS the story +/// 6. Mark traversed facts as consumed, repeat from next highest-severity +/// 7. Stop when remaining facts are below 0.5 severity +/// +public class InferenceEngine +{ + private const double MinimumSeverityThreshold = 0.5; + private const int MaxPathDepth = 10; // Safety limit + + private readonly RelationshipGraph _graph; + + public InferenceEngine(RelationshipGraph graph) + { + _graph = graph; + } + + /// + /// Builds analysis stories by traversing the relationship graph + /// starting from the highest-severity facts. + /// + public List BuildStories(List facts) + { + var stories = new List(); + var factsByKey = facts + .Where(f => f.Severity > 0) + .ToDictionary(f => f.Key, f => f); + var consumed = new HashSet(); + + // Process facts in severity order + var entryPoints = facts + .Where(f => f.Severity >= MinimumSeverityThreshold) + .OrderByDescending(f => f.Severity) + .ToList(); + + foreach (var entryFact in entryPoints) + { + if (consumed.Contains(entryFact.Key)) + continue; + + var path = Traverse(entryFact.Key, factsByKey, consumed); + + // Mark all facts in this path as consumed + foreach (var node in path) + consumed.Add(node); + + var story = BuildStory(path, factsByKey); + stories.Add(story); + } + + // Check for absolution — if no stories were generated at all + if (stories.Count == 0 && facts.Count > 0) + { + stories.Add(new AnalysisStory + { + RootFactKey = "server_health", + RootFactValue = 0, + Severity = 0, + Confidence = 1.0, + Category = "absolution", + Path = ["server_health"], + StoryPath = "server_health", + StoryPathHash = ComputeHash("server_health"), + StoryText = string.Empty, + IsAbsolution = true + }); + } + + return stories; + } + + /// + /// Greedy traversal from an entry point through the relationship graph. + /// Returns the path as a list of fact keys. 
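+    /// Illustrative trace (hypothetical severities): entry point SOS_SCHEDULER_YIELD (1.2) has
+    /// active edges to CXPACKET (0.9) and THREADPOOL (0.6), so the greedy step follows CXPACKET;
+    /// from CXPACKET the only unvisited active edge is THREADPOOL, which has no further active
+    /// edges, so the path SOS_SCHEDULER_YIELD → CXPACKET → THREADPOOL becomes one story and all
+    /// three facts are consumed before the next entry point is tried.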
+
+    /// <summary>
+    /// Greedy traversal from an entry point through the relationship graph.
+    /// Returns the path as a list of fact keys.
+    /// </summary>
+    private List<string> Traverse(string startKey,
+        Dictionary<string, Fact> factsByKey,
+        HashSet<string> consumed)
+    {
+        var path = new List<string> { startKey };
+        var visited = new HashSet<string> { startKey };
+        var current = startKey;
+
+        for (var depth = 0; depth < MaxPathDepth; depth++)
+        {
+            var activeEdges = _graph.GetActiveEdges(current, factsByKey);
+
+            // Filter to destinations not already in this path and not consumed by prior stories
+            var candidates = activeEdges
+                .Where(e => !visited.Contains(e.Destination) && !consumed.Contains(e.Destination))
+                .Where(e => factsByKey.ContainsKey(e.Destination))
+                .OrderByDescending(e => factsByKey[e.Destination].Severity)
+                .ToList();
+
+            if (candidates.Count == 0)
+                break; // Leaf node — no more edges to follow
+
+            var best = candidates[0];
+            path.Add(best.Destination);
+            visited.Add(best.Destination);
+            current = best.Destination;
+        }
+
+        return path;
+    }
+
+    /// <summary>
+    /// Builds an AnalysisStory from a traversal path.
+    /// </summary>
+    private static AnalysisStory BuildStory(List<string> path, Dictionary<string, Fact> factsByKey)
+    {
+        var rootFact = factsByKey.GetValueOrDefault(path[0]);
+        var leafKey = path.Count > 1 ? path[^1] : null;
+        var leafFact = leafKey != null ? factsByKey.GetValueOrDefault(leafKey) : null;
+
+        var storyPath = string.Join(" → ", path);
+        var category = rootFact?.Source ?? "unknown";
+
+        // Confidence = what fraction of edge destinations had matching facts
+        // For single-node paths, confidence is 1.0 (we found the symptom, just no deeper cause)
+        var confidence = path.Count == 1 ? 1.0 : (path.Count - 1.0) / path.Count;
+
+        return new AnalysisStory
+        {
+            RootFactKey = path[0],
+            RootFactValue = rootFact?.Severity ?? 0,
+            Severity = rootFact?.Severity ?? 0,
+            Confidence = confidence,
+            Category = category,
+            Path = path,
+            StoryPath = storyPath,
+            StoryPathHash = ComputeHash(storyPath),
+            StoryText = string.Empty,
+            LeafFactKey = leafKey,
+            LeafFactValue = leafFact?.Severity,
+            FactCount = path.Count,
+            IsAbsolution = false
+        };
+    }
+
+    /// <summary>
+    /// Stable hash for story path deduplication and muting.
+    /// </summary>
+    private static string ComputeHash(string storyPath)
+    {
+        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(storyPath));
+        return Convert.ToHexString(bytes).ToLowerInvariant()[..16];
+    }
+}
diff --git a/Dashboard/Analysis/RelationshipGraph.cs b/Dashboard/Analysis/RelationshipGraph.cs
new file mode 100644
index 00000000..2650a7bf
--- /dev/null
+++ b/Dashboard/Analysis/RelationshipGraph.cs
@@ -0,0 +1,325 @@
+using System.Collections.Generic;
+using System.Linq;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
+/// <summary>
+/// Defines conditional edges between facts. The graph encodes Erik's diagnostic
+/// reasoning: "when I see symptom X, what do I check next?"
+///
+/// Edges are code-defined (not data-driven) because they represent expert knowledge.
+/// Each edge has a predicate that evaluates against the current fact set to decide
+/// if the edge should be followed.
+///
+/// Built incrementally — new edges are added as new fact categories become available.
+/// </summary>
+public class RelationshipGraph
+{
+    private readonly Dictionary<string, List<Edge>> _edges = new();
+
+    public RelationshipGraph()
+    {
+        BuildGraph();
+    }
+
+    /// <summary>
+    /// Returns all edges originating from the given fact key,
+    /// filtered to only those whose predicates are true.
+    /// </summary>
+    public List<Edge> GetActiveEdges(string sourceKey, IReadOnlyDictionary<string, Fact> factsByKey)
+    {
+        if (!_edges.TryGetValue(sourceKey, out var edges))
+            return [];
+
+        return edges.Where(e => e.Predicate(factsByKey)).ToList();
+    }
+
+    /// <summary>
+    /// Returns all defined edges from a source (regardless of predicate).
+    /// Used for audit trail logging.
+    /// </summary>
+    public List<Edge> GetAllEdges(string sourceKey)
+    {
+        return _edges.TryGetValue(sourceKey, out var edges) ? edges : [];
+    }
+
+    private void AddEdge(string source, string destination, string category,
+        string predicateDescription, System.Func<IReadOnlyDictionary<string, Fact>, bool> predicate)
+    {
+        if (!_edges.ContainsKey(source))
+            _edges[source] = [];
+
+        _edges[source].Add(new Edge
+        {
+            Source = source,
+            Destination = destination,
+            Category = category,
+            PredicateDescription = predicateDescription,
+            Predicate = predicate
+        });
+    }
+
+    /// <summary>
+    /// Builds all edges in the relationship graph.
+    /// Organized by entry point category matching the design doc.
+    /// </summary>
+    private void BuildGraph()
+    {
+        BuildCpuPressureEdges();
+        BuildMemoryPressureEdges();
+        BuildBlockingEdges();
+        BuildIoPressureEdges();
+        BuildLatchEdges();
+        BuildTempDbEdges();
+        BuildQueryEdges();
+    }
+
+    /* ── CPU Pressure ── */
+
+    private void BuildCpuPressureEdges()
+    {
+        // SOS_SCHEDULER_YIELD → CXPACKET (parallelism contributing to CPU)
+        AddEdge("SOS_SCHEDULER_YIELD", "CXPACKET", "cpu_pressure",
+            "CXPACKET significant — parallelism consuming schedulers",
+            facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5);
+
+        // SOS_SCHEDULER_YIELD → THREADPOOL (escalating to thread exhaustion)
+        AddEdge("SOS_SCHEDULER_YIELD", "THREADPOOL", "cpu_pressure",
+            "THREADPOOL waits present — escalating to thread exhaustion",
+            facts => HasFact(facts, "THREADPOOL") && facts["THREADPOOL"].Severity > 0);
+
+        // CXPACKET → SOS (CPU starvation from parallelism)
+        AddEdge("CXPACKET", "SOS_SCHEDULER_YIELD", "parallelism",
+            "SOS_SCHEDULER_YIELD elevated — CPU starvation from parallelism",
+            facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Value >= 0.25);
+
+        // CXPACKET → THREADPOOL (thread exhaustion cascade)
+        AddEdge("CXPACKET", "THREADPOOL", "parallelism",
+            "THREADPOOL waits present — thread exhaustion cascade",
+            facts => HasFact(facts, "THREADPOOL") && facts["THREADPOOL"].Severity > 0);
+
+        // THREADPOOL → CXPACKET (parallel queries consuming thread pool)
+        AddEdge("THREADPOOL", "CXPACKET", "thread_exhaustion",
+            "CXPACKET significant — parallel queries consuming thread pool",
+            facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5);
+
+        // THREADPOOL → LCK (blocking causing thread buildup — stuck queries holding threads)
+        AddEdge("THREADPOOL", "LCK", "thread_exhaustion",
+            "Lock contention — blocked queries holding worker threads",
+            facts => HasFact(facts, "LCK") && facts["LCK"].Severity >= 0.5);
+
+        // CPU_SQL_PERCENT → SOS_SCHEDULER_YIELD (CPU confirms scheduler pressure)
+        AddEdge("CPU_SQL_PERCENT", "SOS_SCHEDULER_YIELD", "cpu_pressure",
+            "Scheduler yields confirm CPU saturation",
+            facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Severity >= 0.5);
+
+        // CPU_SQL_PERCENT → CXPACKET (CPU load from parallelism)
+        AddEdge("CPU_SQL_PERCENT", "CXPACKET", "cpu_pressure",
+            "Parallelism waits contributing to CPU load",
+            facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5);
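+
+        // Note: several pairs above are intentionally bidirectional
+        // (SOS_SCHEDULER_YIELD ↔ CXPACKET, CXPACKET ↔ THREADPOOL) so a story
+        // can enter from whichever symptom scored highest; Traverse's visited
+        // set keeps a single path from ping-ponging between the two nodes.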
"cpu_pressure", + "SQL CPU > 80% — confirms CPU is the bottleneck", + facts => HasFact(facts, "CPU_SQL_PERCENT") && facts["CPU_SQL_PERCENT"].Value >= 80); + + // CPU_SPIKE → SOS_SCHEDULER_YIELD (spike confirmed by scheduler pressure) + AddEdge("CPU_SPIKE", "SOS_SCHEDULER_YIELD", "cpu_spike", + "Scheduler yields — CPU spike caused scheduler starvation", + facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].BaseSeverity > 0); + + // CPU_SPIKE → CXPACKET (spike from parallelism) + AddEdge("CPU_SPIKE", "CXPACKET", "cpu_spike", + "Parallelism waits — parallel queries contributing to CPU spike", + facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.3); + } + + /* ── Memory Pressure ── */ + + private void BuildMemoryPressureEdges() + { + // PAGEIOLATCH_SH → RESOURCE_SEMAPHORE (memory grants contributing to buffer pressure) + AddEdge("PAGEIOLATCH_SH", "RESOURCE_SEMAPHORE", "memory_pressure", + "RESOURCE_SEMAPHORE present — memory grants competing with buffer pool", + facts => HasFact(facts, "RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].Severity > 0); + + // PAGEIOLATCH_EX → same + AddEdge("PAGEIOLATCH_EX", "RESOURCE_SEMAPHORE", "memory_pressure", + "RESOURCE_SEMAPHORE present — memory grants competing with buffer pool", + facts => HasFact(facts, "RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].Severity > 0); + + // RESOURCE_SEMAPHORE → PAGEIOLATCH (downstream I/O cascade) + AddEdge("RESOURCE_SEMAPHORE", "PAGEIOLATCH_SH", "memory_grants", + "PAGEIOLATCH elevated — memory grant pressure causing buffer pool shrinkage", + facts => HasFact(facts, "PAGEIOLATCH_SH") && facts["PAGEIOLATCH_SH"].Severity >= 0.5); + + // RESOURCE_SEMAPHORE → MEMORY_GRANT_PENDING (grant pressure confirmed by semaphore waiters) + AddEdge("RESOURCE_SEMAPHORE", "MEMORY_GRANT_PENDING", "memory_grants", + "Memory grant waiters present — queries queued for memory", + facts => HasFact(facts, "MEMORY_GRANT_PENDING") && facts["MEMORY_GRANT_PENDING"].BaseSeverity > 0); + + // RESOURCE_SEMAPHORE → QUERY_SPILLS (grant pressure causing spills) + AddEdge("RESOURCE_SEMAPHORE", "QUERY_SPILLS", "memory_grants", + "Query spills present — queries running with insufficient memory", + facts => HasFact(facts, "QUERY_SPILLS") && facts["QUERY_SPILLS"].BaseSeverity > 0); + + // MEMORY_GRANT_PENDING → RESOURCE_SEMAPHORE (waiters confirm RESOURCE_SEMAPHORE waits) + AddEdge("MEMORY_GRANT_PENDING", "RESOURCE_SEMAPHORE", "memory_grants", + "RESOURCE_SEMAPHORE waits — grant pressure visible in wait stats", + facts => HasFact(facts, "RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].Severity > 0); + + // MEMORY_GRANT_PENDING → QUERY_SPILLS (insufficient grants causing spills) + AddEdge("MEMORY_GRANT_PENDING", "QUERY_SPILLS", "memory_grants", + "Query spills — queries getting insufficient memory grants", + facts => HasFact(facts, "QUERY_SPILLS") && facts["QUERY_SPILLS"].BaseSeverity > 0); + + // PAGEIOLATCH_SH → IO_READ_LATENCY_MS (buffer miss confirmed by disk latency) + AddEdge("PAGEIOLATCH_SH", "IO_READ_LATENCY_MS", "memory_pressure", + "Read latency elevated — disk confirms buffer pool pressure", + facts => HasFact(facts, "IO_READ_LATENCY_MS") && facts["IO_READ_LATENCY_MS"].BaseSeverity > 0); + + // PAGEIOLATCH_EX → IO_READ_LATENCY_MS + AddEdge("PAGEIOLATCH_EX", "IO_READ_LATENCY_MS", "memory_pressure", + "Read latency elevated — disk confirms buffer pool pressure", + facts => HasFact(facts, "IO_READ_LATENCY_MS") && facts["IO_READ_LATENCY_MS"].BaseSeverity > 0); + } + + /* ── Blocking & 
+
+    private void BuildBlockingEdges()
+    {
+        // LCK → BLOCKING_EVENTS (lock waits confirmed by actual blocking reports)
+        AddEdge("LCK", "BLOCKING_EVENTS", "lock_contention",
+            "Blocked process reports present — confirmed blocking events",
+            facts => HasFact(facts, "BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0);
+
+        // LCK → DEADLOCKS (lock contention escalating)
+        AddEdge("LCK", "DEADLOCKS", "lock_contention",
+            "Deadlocks present — lock contention escalating to deadlocks",
+            facts => HasFact(facts, "DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0);
+
+        // BLOCKING_EVENTS → LCK (blocking confirmed by lock waits)
+        AddEdge("BLOCKING_EVENTS", "LCK", "blocking",
+            "Lock contention waits elevated — blocking visible in wait stats",
+            facts => HasFact(facts, "LCK") && facts["LCK"].Severity >= 0.5);
+
+        // BLOCKING_EVENTS → DEADLOCKS (blocking escalating)
+        AddEdge("BLOCKING_EVENTS", "DEADLOCKS", "blocking",
+            "Deadlocks also present — blocking escalating to deadlocks",
+            facts => HasFact(facts, "DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0);
+
+        // BLOCKING_EVENTS → THREADPOOL (blocking causing thread exhaustion)
+        AddEdge("BLOCKING_EVENTS", "THREADPOOL", "blocking",
+            "THREADPOOL waits present — blocked queries consuming worker threads",
+            facts => HasFact(facts, "THREADPOOL") && facts["THREADPOOL"].Severity > 0);
+
+        // DEADLOCKS → BLOCKING_EVENTS (deadlocks with systemic blocking)
+        AddEdge("DEADLOCKS", "BLOCKING_EVENTS", "deadlocking",
+            "Blocking events also present — systemic contention pattern",
+            facts => HasFact(facts, "BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0);
+
+        // DEADLOCKS → LCK_M_S (reader/writer deadlocks)
+        AddEdge("DEADLOCKS", "LCK_M_S", "deadlocking",
+            "Reader lock waits present — RCSI could prevent reader/writer deadlocks",
+            facts => HasFact(facts, "LCK_M_S") && facts["LCK_M_S"].BaseSeverity > 0);
+
+        // LCK_M_S → DB_CONFIG (reader/writer contention → RCSI recommendation)
+        AddEdge("LCK_M_S", "DB_CONFIG", "lock_contention",
+            "Databases without RCSI — readers blocked by writers could be eliminated",
+            facts => HasFact(facts, "DB_CONFIG")
+                && facts["DB_CONFIG"].Metadata.GetValueOrDefault("rcsi_off_count") > 0
+                && facts["DB_CONFIG"].BaseSeverity > 0);
+
+        // DB_CONFIG → LCK_M_S (RCSI-off confirmed by reader/writer lock contention)
+        AddEdge("DB_CONFIG", "LCK_M_S", "config_issue",
+            "LCK_M_S waits — readers blocked by writers, RCSI would eliminate these",
+            facts => HasFact(facts, "LCK_M_S") && facts["LCK_M_S"].BaseSeverity > 0
+                && HasFact(facts, "DB_CONFIG")
+                && facts["DB_CONFIG"].Metadata.GetValueOrDefault("rcsi_off_count") > 0);
+
+        // THREADPOOL → BLOCKING_EVENTS (blocking causing thread buildup)
+        AddEdge("THREADPOOL", "BLOCKING_EVENTS", "thread_exhaustion",
+            "Blocking events present — blocked queries holding worker threads",
+            facts => HasFact(facts, "BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0);
+    }
+
+    /* ── I/O Pressure ── */
+
+    private void BuildIoPressureEdges()
+    {
+        // IO_READ_LATENCY_MS → PAGEIOLATCH_SH (disk latency with buffer pool misses)
+        AddEdge("IO_READ_LATENCY_MS", "PAGEIOLATCH_SH", "io_pressure",
+            "PAGEIOLATCH waits — buffer pool misses driving read I/O",
+            facts => HasFact(facts, "PAGEIOLATCH_SH") && facts["PAGEIOLATCH_SH"].Severity >= 0.5);
+
+        // IO_WRITE_LATENCY_MS → WRITELOG (write latency with log waits)
+        AddEdge("IO_WRITE_LATENCY_MS", "WRITELOG", "io_pressure",
+            "WRITELOG waits — transaction log I/O bottleneck",
+            facts => HasFact(facts, "WRITELOG") && facts["WRITELOG"].Severity > 0);
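+
+        // Field choice in these predicates is deliberate: .Severity >= x
+        // gates significance, .BaseSeverity > 0 merely checks the signal is
+        // present at all, and .Value compares the raw metric (e.g. SQL CPU
+        // percent >= 80 rather than a normalized score).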
facts["WRITELOG"].Severity > 0); + + // WRITELOG → IO_WRITE_LATENCY_MS (log waits confirmed by disk latency) + AddEdge("WRITELOG", "IO_WRITE_LATENCY_MS", "log_io", + "Write latency elevated — disk confirms log I/O bottleneck", + facts => HasFact(facts, "IO_WRITE_LATENCY_MS") && facts["IO_WRITE_LATENCY_MS"].BaseSeverity > 0); + } + + /* ── Latch Contention ── */ + + private void BuildLatchEdges() + { + // LATCH_EX → TEMPDB_USAGE (latch contention often from TempDB allocation) + AddEdge("LATCH_EX", "TEMPDB_USAGE", "latch_contention", + "TempDB usage — latch contention may be on TempDB allocation pages", + facts => HasFact(facts, "TEMPDB_USAGE") && facts["TEMPDB_USAGE"].BaseSeverity > 0); + + // LATCH_EX → CXPACKET (parallel operations amplifying latch contention) + AddEdge("LATCH_EX", "CXPACKET", "latch_contention", + "Parallelism waits — parallel operations amplifying page latch contention", + facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5); + } + + /* ── TempDB ── */ + + private void BuildTempDbEdges() + { + // TEMPDB_USAGE → PAGEIOLATCH_SH (tempdb pressure causing I/O) + AddEdge("TEMPDB_USAGE", "PAGEIOLATCH_SH", "tempdb_pressure", + "PAGEIOLATCH waits — TempDB pressure contributing to I/O", + facts => HasFact(facts, "PAGEIOLATCH_SH") && facts["PAGEIOLATCH_SH"].Severity >= 0.5); + + // TEMPDB_USAGE → QUERY_SPILLS (spills consuming tempdb) + AddEdge("TEMPDB_USAGE", "QUERY_SPILLS", "tempdb_pressure", + "Query spills — spilling to TempDB consuming space", + facts => HasFact(facts, "QUERY_SPILLS") && facts["QUERY_SPILLS"].BaseSeverity > 0); + } + + /* ── Query-Level ── */ + + private void BuildQueryEdges() + { + // QUERY_SPILLS → MEMORY_GRANT_PENDING (spills from insufficient grants) + AddEdge("QUERY_SPILLS", "MEMORY_GRANT_PENDING", "query_performance", + "Memory grant waiters — spills caused by insufficient memory grants", + facts => HasFact(facts, "MEMORY_GRANT_PENDING") && facts["MEMORY_GRANT_PENDING"].BaseSeverity > 0); + + // QUERY_SPILLS → TEMPDB_USAGE (spills consuming tempdb space) + AddEdge("QUERY_SPILLS", "TEMPDB_USAGE", "query_performance", + "TempDB usage elevated — spills consuming TempDB space", + facts => HasFact(facts, "TEMPDB_USAGE") && facts["TEMPDB_USAGE"].BaseSeverity > 0); + + // QUERY_HIGH_DOP → CXPACKET (high-DOP queries causing parallelism waits) + AddEdge("QUERY_HIGH_DOP", "CXPACKET", "query_performance", + "CXPACKET waits — high-DOP queries causing excessive parallelism", + facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.5); + + // QUERY_HIGH_DOP → SOS_SCHEDULER_YIELD (high-DOP queries causing CPU pressure) + AddEdge("QUERY_HIGH_DOP", "SOS_SCHEDULER_YIELD", "query_performance", + "Scheduler yields — high-DOP queries saturating CPU", + facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Severity >= 0.5); + } + + private static bool HasFact(IReadOnlyDictionary facts, string key) + { + return facts.ContainsKey(key); + } +} diff --git a/Dashboard/Analysis/SqlServerAnomalyDetector.cs b/Dashboard/Analysis/SqlServerAnomalyDetector.cs new file mode 100644 index 00000000..bdf6664a --- /dev/null +++ b/Dashboard/Analysis/SqlServerAnomalyDetector.cs @@ -0,0 +1,543 @@ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; +using Microsoft.Data.SqlClient; +using PerformanceMonitorDashboard.Helpers; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Detects anomalies by comparing the analysis window's metrics against a +/// baseline period. 
+/// (mean + standard deviation), an ANOMALY fact is emitted.
+///
+/// This is the "oh shit" mode -- detecting acute deviations that don't show
+/// up in aggregate analysis because they're brief. A 5-minute CPU spike
+/// that averages out over 4 hours is invisible to aggregate scoring but
+/// obvious when compared against "what was this metric doing before?"
+///
+/// Baseline selection: uses the 24 hours preceding the analysis window.
+/// If less data is available, uses whatever exists with lower confidence.
+///
+/// Port of Lite's AnomalyDetector -- uses SQL Server collect.* tables instead of DuckDB views.
+/// No server_id filtering -- Dashboard monitors one server per database.
+/// </summary>
+public class SqlServerAnomalyDetector
+{
+    private readonly string _connectionString;
+
+    /// <summary>
+    /// Minimum number of baseline samples needed for reliable detection.
+    /// Below this, anomalies are still detected but with reduced confidence.
+    /// </summary>
+    private const int MinBaselineSamples = 10;
+
+    /// <summary>
+    /// Number of standard deviations above baseline mean to flag as anomalous.
+    /// </summary>
+    private const double DeviationThreshold = 2.0;
+
+    public SqlServerAnomalyDetector(string connectionString)
+    {
+        _connectionString = connectionString;
+    }
+
+    /// <summary>
+    /// Detects anomalies by comparing the analysis window against a baseline period.
+    /// Returns anomaly facts to be merged into the main fact list.
+    /// </summary>
+    public async Task<List<Fact>> DetectAnomaliesAsync(AnalysisContext context)
+    {
+        var anomalies = new List<Fact>();
+
+        // Baseline: 24 hours preceding the analysis window
+        var baselineEnd = context.TimeRangeStart;
+        var baselineStart = baselineEnd.AddHours(-24);
+
+        // Check if baseline period has any data at all -- if not, skip all anomaly detection.
+        // Without baseline data, everything looks anomalous.
+        if (!await HasBaselineDataAsync(baselineStart, baselineEnd))
+            return anomalies;
+
+        await DetectCpuAnomalies(context, baselineStart, baselineEnd, anomalies);
+        await DetectWaitAnomalies(context, baselineStart, baselineEnd, anomalies);
+        await DetectBlockingAnomalies(context, baselineStart, baselineEnd, anomalies);
+        await DetectIoAnomalies(context, baselineStart, baselineEnd, anomalies);
+
+        return anomalies;
+    }
+
+    /// <summary>
+    /// Checks if the baseline period has any collected data.
+    /// Uses wait_stats as canary -- if waits are collected, other data is too.
+    /// </summary>
+    private async Task<bool> HasBaselineDataAsync(DateTime baselineStart, DateTime baselineEnd)
+    {
+        try
+        {
+            using var connection = new SqlConnection(_connectionString);
+            await connection.OpenAsync();
+
+            using var cmd = connection.CreateCommand();
+            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    (SELECT COUNT(*) FROM collect.wait_stats
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd)
    +
    (SELECT COUNT(*) FROM collect.cpu_utilization_stats
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd);";
+
+            cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+            cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+
+            var count = Convert.ToInt64(await cmd.ExecuteScalarAsync() ?? 0);
+            return count > 0;
+        }
+        catch { return false; }
+    }
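+
+    // Worked example of the deviation test used below: with a baseline mean
+    // of 35% CPU and a stddev of 8, a window peak of 62% scores
+    // (62 - 35) / 8 ≈ 3.4 sigma and is flagged (>= DeviationThreshold of 2.0,
+    // and above the 50% absolute floor in DetectCpuAnomalies). A 48% peak is
+    // only ~1.6 sigma and stays quiet.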
+
+    /// <summary>
+    /// Detects CPU utilization anomalies by comparing per-sample values
+    /// against the baseline distribution.
+    /// </summary>
+    private async Task DetectCpuAnomalies(AnalysisContext context,
+        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
+    {
+        try
+        {
+            using var connection = new SqlConnection(_connectionString);
+            await connection.OpenAsync();
+
+            // Get baseline stats
+            using var baselineCmd = connection.CreateCommand();
+            baselineCmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS mean_cpu,
    STDEV(CAST(sqlserver_cpu_utilization AS FLOAT)) AS stddev_cpu,
    COUNT(*) AS sample_count
FROM collect.cpu_utilization_stats
WHERE collection_time >= @baselineStart
AND collection_time < @baselineEnd;";
+
+            baselineCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+            baselineCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+
+            double baselineMean = 0, baselineStdDev = 0;
+            long baselineSamples = 0;
+
+            using (var reader = await baselineCmd.ExecuteReaderAsync())
+            {
+                if (await reader.ReadAsync())
+                {
+                    baselineMean = reader.IsDBNull(0) ? 0 : Convert.ToDouble(reader.GetValue(0));
+                    baselineStdDev = reader.IsDBNull(1) ? 0 : Convert.ToDouble(reader.GetValue(1));
+                    baselineSamples = reader.IsDBNull(2) ? 0 : Convert.ToInt64(reader.GetValue(2));
+                }
+            }
+
+            if (baselineSamples < 3 || baselineStdDev <= 0) return;
+
+            // Get peak and average in the analysis window
+            using var windowCmd = connection.CreateCommand();
+            windowCmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    MAX(sqlserver_cpu_utilization) AS peak_cpu,
    AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS avg_cpu,
    COUNT(*) AS sample_count,
    (SELECT TOP 1 collection_time FROM collect.cpu_utilization_stats
     WHERE collection_time >= @windowStart AND collection_time < @windowEnd
     ORDER BY sqlserver_cpu_utilization DESC) AS peak_time
FROM collect.cpu_utilization_stats
WHERE collection_time >= @windowStart
AND collection_time < @windowEnd;";
+
+            windowCmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
+            windowCmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));
+
+            using var windowReader = await windowCmd.ExecuteReaderAsync();
+            if (!await windowReader.ReadAsync()) return;
+
+            var peakCpu = windowReader.IsDBNull(0) ? 0.0 : Convert.ToDouble(windowReader.GetValue(0));
+            var avgCpu = windowReader.IsDBNull(1) ? 0.0 : Convert.ToDouble(windowReader.GetValue(1));
+            var windowSamples = windowReader.IsDBNull(2) ? 0L : Convert.ToInt64(windowReader.GetValue(2));
+            var peakTime = windowReader.IsDBNull(3) ? (DateTime?)null : windowReader.GetDateTime(3);
+
+            if (windowSamples == 0) return;
+
+            // Check if peak deviates significantly from baseline
+            var deviation = (peakCpu - baselineMean) / baselineStdDev;
+            if (deviation < DeviationThreshold || peakCpu < 50) return; // Don't flag low absolute values
+
+            var confidence = baselineSamples >= MinBaselineSamples ? 1.0 : (double)baselineSamples / MinBaselineSamples;
+
+            anomalies.Add(new Fact
+            {
+                Source = "anomaly",
+                Key = "ANOMALY_CPU_SPIKE",
+                Value = peakCpu,
+                ServerId = context.ServerId,
+                Metadata = new Dictionary<string, double>
+                {
+                    ["peak_cpu"] = peakCpu,
+                    ["avg_cpu_in_window"] = avgCpu,
+                    ["baseline_mean"] = baselineMean,
+                    ["baseline_stddev"] = baselineStdDev,
+                    ["deviation_sigma"] = deviation,
+                    ["baseline_samples"] = baselineSamples,
+                    ["window_samples"] = windowSamples,
+                    ["confidence"] = confidence,
+                    ["peak_time_ticks"] = peakTime?.Ticks ?? 0
+                }
+            });
+        }
+        catch (Exception ex)
+        {
+            Logger.Error($"[SqlServerAnomalyDetector] CPU anomaly detection failed: {ex.Message}");
+        }
+    }
+
+    /// <summary>
+    /// Detects wait stat anomalies -- significant waits in the analysis window
+    /// that were absent or much lower in the baseline.
+    /// </summary>
+    private async Task DetectWaitAnomalies(AnalysisContext context,
+        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
+    {
+        try
+        {
+            using var connection = new SqlConnection(_connectionString);
+            await connection.OpenAsync();
+
+            // Check if baseline has any wait data at all -- if not, skip
+            using var checkCmd = connection.CreateCommand();
+            checkCmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT COUNT(*) FROM collect.wait_stats
WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd;";
+
+            checkCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+            checkCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+
+            var baselineCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0);
+            if (baselineCount == 0) return;
+
+            // Get per-wait-type totals in both windows
+            using var cmd = connection.CreateCommand();
+            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH baseline AS (
    SELECT wait_type,
           CAST(SUM(wait_time_ms_delta) AS BIGINT) AS total_ms
    FROM collect.wait_stats
    WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd
    AND wait_time_ms_delta > 0
    GROUP BY wait_type
),
current_window AS (
    SELECT wait_type,
           CAST(SUM(wait_time_ms_delta) AS BIGINT) AS total_ms
    FROM collect.wait_stats
    WHERE collection_time >= @windowStart AND collection_time <= @windowEnd
    AND wait_time_ms_delta > 0
    GROUP BY wait_type
)
SELECT TOP 10
    c.wait_type,
    c.total_ms AS current_ms,
    COALESCE(b.total_ms, 0) AS baseline_ms
FROM current_window c
LEFT JOIN baseline b ON c.wait_type = b.wait_type
WHERE c.total_ms > 10000 -- At least 10 seconds of wait time
ORDER BY c.total_ms DESC;";
+
+            cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+            cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+            cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
+            cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));
+
+            using var reader = await cmd.ExecuteReaderAsync();
+            while (await reader.ReadAsync())
+            {
+                var waitType = reader.GetString(0);
+                var currentMs = Convert.ToInt64(reader.GetValue(1));
+                var baselineMs = Convert.ToInt64(reader.GetValue(2));
+
+                // Normalize to per-hour rates before comparing (windows are different lengths)
+                var baselineHours = (baselineEnd - baselineStart).TotalHours;
+                var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours;
+                if (baselineHours <= 0) baselineHours = 1;
+                if (currentHours <= 0) currentHours = 1;
+
+                double ratio;
+                string anomalyType;
+
+                if (baselineMs == 0)
+                {
+                    ratio = currentMs > 60_000 ? 100.0 : 0; // Only flag if > 1 minute total
+                    anomalyType = "new";
+                }
+                else
+                {
+                    var baselineRate = baselineMs / baselineHours;
+                    var currentRate = currentMs / currentHours;
+                    ratio = baselineRate > 0 ? currentRate / baselineRate : 100.0;
+                    anomalyType = "spike";
+                }
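+
+                // Example of the normalization above: 120,000 ms of a wait
+                // across a 24h baseline is 5,000 ms/hour; 100,000 ms inside a
+                // 4h window is 25,000 ms/hour. That is a 5.0x rate increase
+                // (right at the cutoff below) even though the raw window total
+                // is smaller than the baseline total.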
+
+                if (ratio < 5.0) continue; // Need at least 5x increase
+
+                anomalies.Add(new Fact
+                {
+                    Source = "anomaly",
+                    Key = $"ANOMALY_WAIT_{waitType}",
+                    Value = currentMs,
+                    ServerId = context.ServerId,
+                    Metadata = new Dictionary<string, double>
+                    {
+                        ["current_ms"] = currentMs,
+                        ["baseline_ms"] = baselineMs,
+                        ["ratio"] = ratio,
+                        ["is_new"] = anomalyType == "new" ? 1 : 0
+                    }
+                });
+            }
+        }
+        catch (Exception ex)
+        {
+            Logger.Error($"[SqlServerAnomalyDetector] Wait anomaly detection failed: {ex.Message}");
+        }
+    }
+
+    /// <summary>
+    /// Detects blocking/deadlock anomalies -- events in the analysis window
+    /// that are significantly above baseline rates.
+    /// </summary>
+    private async Task DetectBlockingAnomalies(AnalysisContext context,
+        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
+    {
+        try
+        {
+            using var connection = new SqlConnection(_connectionString);
+            await connection.OpenAsync();
+
+            // Check if baseline period has any data at all
+            using var checkCmd = connection.CreateCommand();
+            checkCmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd)
    +
    (SELECT COUNT(*) FROM collect.deadlocks
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd)
    +
    (SELECT COUNT(*) FROM collect.wait_stats
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd);";
+
+            checkCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+            checkCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+
+            var baselineDataCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0);
+            if (baselineDataCount == 0) return; // No baseline data = can't detect anomaly
+
+            using var cmd = connection.CreateCommand();
+            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT
    (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd) AS baseline_blocking,
    (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport
     WHERE collection_time >= @windowStart AND collection_time <= @windowEnd) AS current_blocking,
    (SELECT COUNT(*) FROM collect.deadlocks
     WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd) AS baseline_deadlocks,
    (SELECT COUNT(*) FROM collect.deadlocks
     WHERE collection_time >= @windowStart AND collection_time <= @windowEnd) AS current_deadlocks;";
+
+            cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+            cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+            cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
+            cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));
+
+            using var reader = await cmd.ExecuteReaderAsync();
+            if (!await reader.ReadAsync()) return;
+
+            var baselineBlocking = Convert.ToInt64(reader.GetValue(0));
+            var currentBlocking = Convert.ToInt64(reader.GetValue(1));
+            var baselineDeadlocks = Convert.ToInt64(reader.GetValue(2));
+            var currentDeadlocks = Convert.ToInt64(reader.GetValue(3));
+
+            // Normalize to per-hour rates (windows are different lengths)
+            var baselineHours = (baselineEnd - baselineStart).TotalHours;
+            var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours;
+            if (baselineHours <= 0) baselineHours = 1;
+            if (currentHours <= 0) currentHours = 1;
+
+            var baselineBlockingRate = baselineBlocking / baselineHours;
+            var currentBlockingRate = currentBlocking / currentHours;
+            var blockingRatio = baselineBlocking > 0 ? currentBlockingRate / baselineBlockingRate : 100.0;
+
+            var baselineDeadlockRate = baselineDeadlocks / baselineHours;
+            var currentDeadlockRate = currentDeadlocks / currentHours;
+            var deadlockRatio = baselineDeadlocks > 0 ? currentDeadlockRate / baselineDeadlockRate : 100.0;
+
+            // Blocking spike: at least 5 events AND 3x baseline rate (or new)
+            if (currentBlocking >= 5 && (baselineBlocking == 0 || blockingRatio >= 3))
+            {
+                anomalies.Add(new Fact
+                {
+                    Source = "anomaly",
+                    Key = "ANOMALY_BLOCKING_SPIKE",
+                    Value = currentBlocking,
+                    ServerId = context.ServerId,
+                    Metadata = new Dictionary<string, double>
+                    {
+                        ["current_count"] = currentBlocking,
+                        ["baseline_count"] = baselineBlocking,
+                        ["ratio"] = blockingRatio
+                    }
+                });
+            }
+
+            // Deadlock spike: at least 3 events AND 3x baseline rate (or new)
+            if (currentDeadlocks >= 3 && (baselineDeadlocks == 0 || deadlockRatio >= 3))
+            {
+                anomalies.Add(new Fact
+                {
+                    Source = "anomaly",
+                    Key = "ANOMALY_DEADLOCK_SPIKE",
+                    Value = currentDeadlocks,
+                    ServerId = context.ServerId,
+                    Metadata = new Dictionary<string, double>
+                    {
+                        ["current_count"] = currentDeadlocks,
+                        ["baseline_count"] = baselineDeadlocks,
+                        ["ratio"] = deadlockRatio
+                    }
+                });
+            }
+        }
+        catch (Exception ex)
+        {
+            Logger.Error($"[SqlServerAnomalyDetector] Blocking anomaly detection failed: {ex.Message}");
+        }
+    }
+
+    /// <summary>
+    /// Detects I/O latency anomalies -- significant increase in read/write latency
+    /// compared to baseline.
+    /// </summary>
+    private async Task DetectIoAnomalies(AnalysisContext context,
+        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
+    {
+        try
+        {
+            using var connection = new SqlConnection(_connectionString);
+            await connection.OpenAsync();
+
+            using var cmd = connection.CreateCommand();
+            cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH baseline AS (
    SELECT
        AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS avg_read_lat,
        AVG(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS avg_write_lat,
        STDEV(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS stddev_read,
        STDEV(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS stddev_write,
        COUNT(*) AS samples
    FROM collect.file_io_stats
    WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd
    AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0)
),
current_window AS (
    SELECT
        AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS avg_read_lat,
        AVG(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS avg_write_lat
    FROM collect.file_io_stats
    WHERE collection_time >= @windowStart AND collection_time <= @windowEnd
    AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0)
)
SELECT b.avg_read_lat, b.stddev_read, c.avg_read_lat,
       b.avg_write_lat, b.stddev_write, c.avg_write_lat,
       b.samples
FROM baseline b, current_window c;";
+
+            cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart));
+            cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd));
+            cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart));
+            cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd));
+
+            using var reader = await cmd.ExecuteReaderAsync();
+            if (!await reader.ReadAsync()) return;
+
+            var baselineReadLat = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0));
+            var stddevRead = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1));
+            var currentReadLat = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2));
+            var baselineWriteLat = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3));
+            var stddevWrite = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4));
+            var currentWriteLat = reader.IsDBNull(5) ? 0.0 : Convert.ToDouble(reader.GetValue(5));
+            var samples = reader.IsDBNull(6) ? 0L : Convert.ToInt64(reader.GetValue(6));
+
+            if (samples < 3) return;
+
+            // Read latency anomaly
+            if (stddevRead > 0 && currentReadLat > 10) // At least 10ms to matter
+            {
+                var readDeviation = (currentReadLat - baselineReadLat) / stddevRead;
+                if (readDeviation >= DeviationThreshold)
+                {
+                    anomalies.Add(new Fact
+                    {
+                        Source = "anomaly",
+                        Key = "ANOMALY_READ_LATENCY",
+                        Value = currentReadLat,
+                        ServerId = context.ServerId,
+                        Metadata = new Dictionary<string, double>
+                        {
+                            ["current_latency_ms"] = currentReadLat,
+                            ["baseline_mean_ms"] = baselineReadLat,
+                            ["baseline_stddev_ms"] = stddevRead,
+                            ["deviation_sigma"] = readDeviation,
+                            ["baseline_samples"] = samples
+                        }
+                    });
+                }
+            }
+
+            // Write latency anomaly
+            if (stddevWrite > 0 && currentWriteLat > 5) // At least 5ms to matter
+            {
+                var writeDeviation = (currentWriteLat - baselineWriteLat) / stddevWrite;
+                if (writeDeviation >= DeviationThreshold)
+                {
+                    anomalies.Add(new Fact
+                    {
+                        Source = "anomaly",
+                        Key = "ANOMALY_WRITE_LATENCY",
+                        Value = currentWriteLat,
+                        ServerId = context.ServerId,
+                        Metadata = new Dictionary<string, double>
+                        {
+                            ["current_latency_ms"] = currentWriteLat,
+                            ["baseline_mean_ms"] = baselineWriteLat,
+                            ["baseline_stddev_ms"] = stddevWrite,
+                            ["deviation_sigma"] = writeDeviation,
+                            ["baseline_samples"] = samples
+                        }
+                    });
+                }
+            }
+        }
+        catch (Exception ex)
+        {
+            Logger.Error($"[SqlServerAnomalyDetector] I/O anomaly detection failed: {ex.Message}");
+        }
+    }
+}
diff --git a/Dashboard/Analysis/SqlServerDrillDownCollector.cs b/Dashboard/Analysis/SqlServerDrillDownCollector.cs
new file mode 100644
index 00000000..050ffe30
--- /dev/null
+++ b/Dashboard/Analysis/SqlServerDrillDownCollector.cs
@@ -0,0 +1,773 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using Microsoft.Data.SqlClient;
+using PerformanceMonitorDashboard.Helpers;
+using PerformanceMonitorDashboard.Mcp;
+using PerformanceMonitorDashboard.Models;
+using PerformanceMonitorDashboard.Services;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
+/// <summary>
+/// Enriches findings with drill-down data from SQL Server.
+/// Runs after graph traversal, only for findings above severity threshold.
+/// Each drill-down query is limited to top N results with truncated text.
+///
+/// This makes analyze_server self-sufficient -- instead of returning a list
+/// of "next tools to call," findings include the actual supporting data.
+///
+/// Port of Lite's DrillDownCollector -- uses SQL Server collect.* tables instead of DuckDB views.
+/// No server_id filtering -- Dashboard monitors one server per database.
+/// </summary>
+public class SqlServerDrillDownCollector
+{
+    private readonly string _connectionString;
+    private readonly IPlanFetcher? _planFetcher;
+    private const int TextLimit = 500;
+
+    public SqlServerDrillDownCollector(string connectionString, IPlanFetcher? planFetcher = null)
+    {
+        _connectionString = connectionString;
+        _planFetcher = planFetcher;
+    }
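+
+    // Illustrative dispatch: a finding whose StoryPath is
+    // "CPU_SPIKE → SOS_SCHEDULER_YIELD → CPU_SQL_PERCENT" splits into the key
+    // set {CPU_SPIKE, SOS_SCHEDULER_YIELD, CPU_SQL_PERCENT}, which fires
+    // CollectQueriesAtSpike and CollectTopCpuQueries below; a path containing
+    // DEADLOCKS or TEMPDB_USAGE would pull in those collectors instead.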
+
+    /// <summary>
+    /// Enriches each finding's DrillDown dictionary based on its story path.
+    /// </summary>
+    public async Task EnrichFindingsAsync(List<AnalysisFinding> findings, AnalysisContext context)
+    {
+        foreach (var finding in findings)
+        {
+            if (finding.Severity < 0.5) continue;
+
+            try
+            {
+                finding.DrillDown = new Dictionary<string, object>();
+                var pathKeys = finding.StoryPath.Split(" → ", StringSplitOptions.RemoveEmptyEntries).ToHashSet();
+
+                if (pathKeys.Contains("DEADLOCKS"))
+                    await CollectTopDeadlocks(finding, context);
+
+                if (pathKeys.Contains("BLOCKING_EVENTS"))
+                    await CollectTopBlockingChains(finding, context);
+
+                if (pathKeys.Contains("CPU_SPIKE"))
+                    await CollectQueriesAtSpike(finding, context);
+
+                if (pathKeys.Contains("CPU_SQL_PERCENT") || pathKeys.Contains("CPU_SPIKE"))
+                    await CollectTopCpuQueries(finding, context);
+
+                if (pathKeys.Contains("QUERY_SPILLS"))
+                    await CollectTopSpillingQueries(finding, context);
+
+                if (pathKeys.Contains("IO_READ_LATENCY_MS") || pathKeys.Contains("IO_WRITE_LATENCY_MS"))
+                    await CollectFileLatencyBreakdown(finding, context);
+
+                if (pathKeys.Contains("LCK") || pathKeys.Contains("LCK_M_S") || pathKeys.Contains("LCK_M_IS"))
+                    await CollectLockModeBreakdown(finding, context);
+
+                if (pathKeys.Contains("DB_CONFIG"))
+                    await CollectConfigIssues(finding, context);
+
+                if (pathKeys.Contains("TEMPDB_USAGE"))
+                    await CollectTempDbBreakdown(finding, context);
+
+                if (pathKeys.Contains("MEMORY_GRANT_PENDING"))
+                    await CollectPendingGrants(finding, context);
+
+                if (pathKeys.Any(k => k.StartsWith("BAD_ACTOR_")))
+                    await CollectBadActorDetail(finding, context);
+
+                // Plan analysis: for findings with top queries, analyze their cached plans
+                await CollectPlanAnalysis(finding, context);
+
+                // Remove empty drill-down dictionaries
+                if (finding.DrillDown.Count == 0)
+                    finding.DrillDown = null;
+            }
+            catch (Exception ex)
+            {
+                Logger.Error(
+                    $"[SqlServerDrillDownCollector] Drill-down failed for {finding.StoryPath}: {ex.GetType().Name}: {ex.Message}\n{ex.StackTrace}");
+                // Don't null out -- keep whatever was collected before the error
+            }
+        }
+    }
+
+    private async Task CollectTopDeadlocks(AnalysisFinding finding, AnalysisContext context)
+    {
+        using var connection = new SqlConnection(_connectionString);
+        await connection.OpenAsync();
+
+        using var cmd = connection.CreateCommand();
+        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 3
    collection_time,
    event_date,
    spid,
    LEFT(CAST(query AS NVARCHAR(MAX)), 500) AS victim_sql
FROM collect.deadlocks
WHERE collection_time >= @startTime AND collection_time <= @endTime
ORDER BY collection_time DESC;";
+
+        cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
+        cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));
+
+        var items = new List<object>();
+        using var reader = await cmd.ExecuteReaderAsync();
+        while (await reader.ReadAsync())
+        {
+            items.Add(new
+            {
+                time = reader.IsDBNull(0) ? "" : reader.GetDateTime(0).ToString("o"),
+                deadlock_time = reader.IsDBNull(1) ? "" : reader.GetDateTime(1).ToString("o"),
+                victim = reader.IsDBNull(2) ? "" : reader.GetValue(2).ToString(),
+                victim_sql = reader.IsDBNull(3) ? "" : reader.GetString(3)
"" : reader.GetString(3) + }); + } + + if (items.Count > 0) + finding.DrillDown!["top_deadlocks"] = items; + } + + private async Task CollectTopBlockingChains(AnalysisFinding finding, AnalysisContext context) + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 5 + collection_time, + database_name, + spid AS blocked_spid, + 0 AS blocking_spid, + wait_time_ms, + lock_mode, + LEFT(CAST(query_text AS NVARCHAR(MAX)), 500) AS blocked_sql, + LEFT(blocking_tree, 500) AS blocking_sql +FROM collect.blocking_BlockedProcessReport +WHERE collection_time >= @startTime AND collection_time <= @endTime +ORDER BY wait_time_ms DESC;"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + var items = new List(); + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + items.Add(new + { + time = reader.IsDBNull(0) ? "" : reader.GetDateTime(0).ToString("o"), + database = reader.IsDBNull(1) ? "" : reader.GetString(1), + blocked_spid = reader.IsDBNull(2) ? 0 : Convert.ToInt32(reader.GetValue(2)), + blocking_spid = reader.IsDBNull(3) ? 0 : Convert.ToInt32(reader.GetValue(3)), + wait_time_ms = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)), + lock_mode = reader.IsDBNull(5) ? "" : reader.GetString(5), + blocked_sql = reader.IsDBNull(6) ? "" : reader.GetString(6), + blocking_sql = reader.IsDBNull(7) ? "" : reader.GetString(7) + }); + } + + if (items.Count > 0) + finding.DrillDown!["top_blocking_chains"] = items; + } + + private async Task CollectQueriesAtSpike(AnalysisFinding finding, AnalysisContext context) + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + // Check if query_snapshots table exists (created dynamically by sp_WhoIsActive) + using var checkCmd = connection.CreateCommand(); + checkCmd.CommandText = "SELECT OBJECT_ID(N'collect.query_snapshots', N'U')"; + var tableExists = await checkCmd.ExecuteScalarAsync(); + if (tableExists == null || tableExists == DBNull.Value) return; + + // Step 1: Find when the spike occurred + using var peakCmd = connection.CreateCommand(); + peakCmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 1 collection_time, sqlserver_cpu_utilization +FROM collect.cpu_utilization_stats +WHERE collection_time >= @startTime AND collection_time <= @endTime +ORDER BY sqlserver_cpu_utilization DESC;"; + + peakCmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + peakCmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + DateTime? 
+
+        DateTime? peakTime = null;
+        int peakCpu = 0;
+        using (var peakReader = await peakCmd.ExecuteReaderAsync())
+        {
+            if (await peakReader.ReadAsync())
+            {
+                peakTime = peakReader.GetDateTime(0);
+                peakCpu = peakReader.GetInt32(1);
+            }
+        }
+
+        if (peakTime == null) return;
+
+        // Step 2: Get queries active within 2 minutes of peak
+        using var queryCmd = connection.CreateCommand();
+        queryCmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP 5
    collection_time,
    [session_id],
    [database_name],
    [status],
    DATEDIFF(MILLISECOND, 0, [CPU]) AS cpu_time_ms,
    DATEDIFF(MILLISECOND, 0, [elapsed_time]) AS total_elapsed_time_ms,
    [reads] AS logical_reads,
    [wait_info] AS wait_type,
    0 AS dop,
    0 AS parallel_worker_count,
    LEFT(CAST([sql_text] AS NVARCHAR(MAX)), 500) AS query_text
FROM collect.query_snapshots
WHERE collection_time >= @spikeStart
AND collection_time <= @spikeEnd
AND CAST([sql_text] AS NVARCHAR(MAX)) NOT LIKE 'WAITFOR%'
ORDER BY DATEDIFF(MILLISECOND, 0, [CPU]) DESC;";
+
+        queryCmd.Parameters.Add(new SqlParameter("@spikeStart", peakTime.Value.AddMinutes(-2)));
+        queryCmd.Parameters.Add(new SqlParameter("@spikeEnd", peakTime.Value.AddMinutes(2)));
+
+        var items = new List<object>();
+        using (var reader = await queryCmd.ExecuteReaderAsync())
+        {
+            while (await reader.ReadAsync())
+            {
+                items.Add(new
+                {
+                    time = reader.IsDBNull(0) ? "" : reader.GetDateTime(0).ToString("o"),
+                    session_id = reader.IsDBNull(1) ? 0 : Convert.ToInt32(reader.GetValue(1)),
+                    database = reader.IsDBNull(2) ? "" : reader.GetString(2),
+                    status = reader.IsDBNull(3) ? "" : reader.GetString(3),
+                    cpu_time_ms = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)),
+                    elapsed_time_ms = reader.IsDBNull(5) ? 0L : Convert.ToInt64(reader.GetValue(5)),
+                    logical_reads = reader.IsDBNull(6) ? 0L : Convert.ToInt64(reader.GetValue(6)),
+                    wait_type = reader.IsDBNull(7) ? "" : reader.GetString(7),
+                    dop = reader.IsDBNull(8) ? 0 : Convert.ToInt32(reader.GetValue(8)),
+                    parallel_workers = reader.IsDBNull(9) ? 0 : Convert.ToInt32(reader.GetValue(9)),
+                    query_text = reader.IsDBNull(10) ? "" : reader.GetString(10)
"" : reader.GetString(10) + }); + } + } + + if (items.Count > 0) + { + finding.DrillDown!["spike_peak"] = new + { + time = peakTime.Value.ToString("o"), + cpu_percent = peakCpu + }; + finding.DrillDown!["queries_at_spike"] = items; + } + } + + private async Task CollectTopCpuQueries(AnalysisFinding finding, AnalysisContext context) + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 5 + database_name, + CONVERT(VARCHAR(18), query_hash, 1) AS query_hash, + CAST(SUM(total_worker_time_delta) AS BIGINT) AS total_cpu_us, + CAST(SUM(execution_count_delta) AS BIGINT) AS exec_count, + MAX(max_dop) AS max_dop, + CAST(SUM(total_spills) AS BIGINT) AS spills, + LEFT(CAST(DECOMPRESS(MAX(query_text)) AS NVARCHAR(MAX)), 500) AS query_text +FROM collect.query_stats +WHERE collection_time >= @startTime AND collection_time <= @endTime +AND total_worker_time_delta > 0 +GROUP BY database_name, query_hash +ORDER BY CAST(SUM(total_worker_time_delta) AS BIGINT) DESC;"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + var items = new List(); + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + items.Add(new + { + database = reader.IsDBNull(0) ? "" : reader.GetString(0), + query_hash = reader.IsDBNull(1) ? "" : reader.GetString(1), + total_cpu_ms = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)) / 1000.0, + execution_count = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)), + max_dop = reader.IsDBNull(4) ? 0 : Convert.ToInt32(reader.GetValue(4)), + spills = reader.IsDBNull(5) ? 0L : Convert.ToInt64(reader.GetValue(5)), + query_text = reader.IsDBNull(6) ? "" : reader.GetString(6) + }); + } + + if (items.Count > 0 && !finding.DrillDown!.ContainsKey("top_cpu_queries")) + finding.DrillDown!["top_cpu_queries"] = items; + } + + private async Task CollectTopSpillingQueries(AnalysisFinding finding, AnalysisContext context) + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 5 + database_name, + CONVERT(VARCHAR(18), query_hash, 1) AS query_hash, + CAST(SUM(total_spills) AS BIGINT) AS total_spills, + CAST(SUM(execution_count_delta) AS BIGINT) AS exec_count, + LEFT(CAST(DECOMPRESS(MAX(query_text)) AS NVARCHAR(MAX)), 500) AS query_text +FROM collect.query_stats +WHERE collection_time >= @startTime AND collection_time <= @endTime +AND total_spills > 0 +GROUP BY database_name, query_hash +ORDER BY CAST(SUM(total_spills) AS BIGINT) DESC;"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + var items = new List(); + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + items.Add(new + { + database = reader.IsDBNull(0) ? "" : reader.GetString(0), + query_hash = reader.IsDBNull(1) ? "" : reader.GetString(1), + total_spills = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)), + execution_count = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)), + query_text = reader.IsDBNull(4) ? 
"" : reader.GetString(4) + }); + } + + if (items.Count > 0) + finding.DrillDown!["top_spilling_queries"] = items; + } + + private async Task CollectFileLatencyBreakdown(AnalysisFinding finding, AnalysisContext context) + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 10 + database_name, + file_type_desc AS file_type, + AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS avg_read_ms, + AVG(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS avg_write_ms, + CAST(SUM(num_of_reads_delta) AS BIGINT) AS total_reads, + CAST(SUM(num_of_writes_delta) AS BIGINT) AS total_writes +FROM collect.file_io_stats +WHERE collection_time >= @startTime AND collection_time <= @endTime +AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0) +GROUP BY database_name, file_type_desc +ORDER BY AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) DESC;"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + var items = new List(); + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + items.Add(new + { + database = reader.IsDBNull(0) ? "" : reader.GetString(0), + file_type = reader.IsDBNull(1) ? "" : reader.GetString(1), + avg_read_latency_ms = reader.IsDBNull(2) ? 0.0 : Math.Round(Convert.ToDouble(reader.GetValue(2)), 2), + avg_write_latency_ms = reader.IsDBNull(3) ? 0.0 : Math.Round(Convert.ToDouble(reader.GetValue(3)), 2), + total_reads = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)), + total_writes = reader.IsDBNull(5) ? 0L : Convert.ToInt64(reader.GetValue(5)) + }); + } + + if (items.Count > 0) + finding.DrillDown!["file_latency_breakdown"] = items; + } + + private async Task CollectLockModeBreakdown(AnalysisFinding finding, AnalysisContext context) + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 10 + wait_type, + CAST(SUM(wait_time_ms_delta) AS BIGINT) AS total_wait_ms, + CAST(SUM(waiting_tasks_count_delta) AS BIGINT) AS total_count +FROM collect.wait_stats +WHERE collection_time >= @startTime AND collection_time <= @endTime +AND wait_type LIKE 'LCK%' +AND wait_time_ms_delta > 0 +GROUP BY wait_type +ORDER BY CAST(SUM(wait_time_ms_delta) AS BIGINT) DESC;"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + var items = new List(); + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + items.Add(new + { + lock_type = reader.IsDBNull(0) ? "" : reader.GetString(0), + total_wait_ms = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)), + waiting_tasks = reader.IsDBNull(2) ? 
+
+    private async Task CollectConfigIssues(AnalysisFinding finding, AnalysisContext context)
+    {
+        using var connection = new SqlConnection(_connectionString);
+        await connection.OpenAsync();
+
+        // The Dashboard uses config.database_configuration_history which stores
+        // settings as rows (setting_type, setting_name, setting_value) not columns.
+        // Pivot the latest snapshot into the format we need.
+        using var cmd = connection.CreateCommand();
+        cmd.CommandText = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

;WITH latest AS (
    SELECT database_name, setting_name,
           CAST(setting_value AS NVARCHAR(256)) AS setting_value,
           ROW_NUMBER() OVER (PARTITION BY database_name, setting_name ORDER BY collection_time DESC) AS rn
    FROM config.database_configuration_history
    WHERE setting_name IN (
        'recovery_model_desc', 'is_auto_shrink_on', 'is_auto_close_on',
        'is_read_committed_snapshot_on', 'page_verify_option_desc', 'is_query_store_on'
    )
),
pivoted AS (
    SELECT
        database_name,
        MAX(CASE WHEN setting_name = 'recovery_model_desc' THEN setting_value END) AS recovery_model,
        MAX(CASE WHEN setting_name = 'is_auto_shrink_on' THEN setting_value END) AS is_auto_shrink_on,
        MAX(CASE WHEN setting_name = 'is_auto_close_on' THEN setting_value END) AS is_auto_close_on,
        MAX(CASE WHEN setting_name = 'is_read_committed_snapshot_on' THEN setting_value END) AS is_rcsi_on,
        MAX(CASE WHEN setting_name = 'page_verify_option_desc' THEN setting_value END) AS page_verify_option,
        MAX(CASE WHEN setting_name = 'is_query_store_on' THEN setting_value END) AS is_query_store_on
    FROM latest
    WHERE rn = 1
    GROUP BY database_name
)
SELECT database_name, recovery_model,
       is_auto_shrink_on, is_auto_close_on,
       is_rcsi_on, page_verify_option, is_query_store_on
FROM pivoted
WHERE is_auto_shrink_on = '1' OR is_auto_close_on = '1'
   OR is_rcsi_on = '0' OR page_verify_option != 'CHECKSUM'
ORDER BY database_name;";
+
+        var items = new List<object>();
+        using var reader = await cmd.ExecuteReaderAsync();
+        while (await reader.ReadAsync())
+        {
+            var issues = new List<string>();
+            var autoShrink = reader.IsDBNull(2) ? "" : reader.GetString(2);
+            var autoClose = reader.IsDBNull(3) ? "" : reader.GetString(3);
+            var rcsi = reader.IsDBNull(4) ? "" : reader.GetString(4);
+            var pageVerify = reader.IsDBNull(5) ? "" : reader.GetString(5);
+            var queryStore = reader.IsDBNull(6) ? "" : reader.GetString(6);
+
+            if (autoShrink == "1") issues.Add("auto_shrink ON");
+            if (autoClose == "1") issues.Add("auto_close ON");
+            if (rcsi == "0") issues.Add("RCSI OFF");
+            if (!string.IsNullOrEmpty(pageVerify) && pageVerify != "CHECKSUM") issues.Add($"page_verify={pageVerify}");
+
+            items.Add(new
+            {
+                database = reader.IsDBNull(0) ? "" : reader.GetString(0),
+                recovery_model = reader.IsDBNull(1) ? "" : reader.GetString(1),
"" : reader.GetString(1), + rcsi = rcsi == "1", + query_store = queryStore == "1", + issues + }); + } + + if (items.Count > 0) + finding.DrillDown!["config_issues"] = items; + } + + private async Task CollectTempDbBreakdown(AnalysisFinding finding, AnalysisContext context) + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 5 + collection_time, + user_object_reserved_mb, + internal_object_reserved_mb, + version_store_reserved_mb, + unallocated_mb +FROM collect.tempdb_stats +WHERE collection_time >= @startTime AND collection_time <= @endTime +ORDER BY (user_object_reserved_mb + internal_object_reserved_mb + version_store_reserved_mb) DESC;"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + var items = new List(); + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + items.Add(new + { + time = reader.GetDateTime(0).ToString("o"), + user_objects_mb = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)), + internal_objects_mb = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)), + version_store_mb = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)), + unallocated_mb = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4)) + }); + } + + if (items.Count > 0) + finding.DrillDown!["tempdb_breakdown"] = items; + } + + private async Task CollectPendingGrants(AnalysisFinding finding, AnalysisContext context) + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 5 + collection_time, + target_memory_mb, total_memory_mb, available_memory_mb, + granted_memory_mb, used_memory_mb, + grantee_count, waiter_count, + timeout_error_count_delta, forced_grant_count_delta +FROM collect.memory_grant_stats +WHERE collection_time >= @startTime AND collection_time <= @endTime +AND waiter_count > 0 +ORDER BY waiter_count DESC;"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + var items = new List(); + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + items.Add(new + { + time = reader.IsDBNull(0) ? "" : reader.GetDateTime(0).ToString("o"), + target_memory_mb = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)), + total_memory_mb = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)), + available_memory_mb = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)), + granted_memory_mb = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4)), + used_memory_mb = reader.IsDBNull(5) ? 0.0 : Convert.ToDouble(reader.GetValue(5)), + grantee_count = reader.IsDBNull(6) ? 0 : reader.GetInt32(6), + waiter_count = reader.IsDBNull(7) ? 0 : reader.GetInt32(7), + timeout_errors = reader.IsDBNull(8) ? 0L : Convert.ToInt64(reader.GetValue(8)), + forced_grants = reader.IsDBNull(9) ? 
0L : Convert.ToInt64(reader.GetValue(9)) + }); + } + + if (items.Count > 0) + finding.DrillDown!["pending_grants"] = items; + } + + /// + /// For findings that have query hashes (bad actors), fetch the execution plan + /// live from SQL Server via IPlanFetcher, then run PlanAnalyzer to surface + /// warnings and missing indexes. No plan storage needed -- fetch on demand + /// only for queries that make it into high-impact findings. + /// + private async Task CollectPlanAnalysis(AnalysisFinding finding, AnalysisContext context) + { + if (finding.DrillDown == null || _planFetcher == null) return; + + // Only analyze plans for bad actor findings (1 plan each). + // Skip top_cpu_queries (5 plans would be too heavy). + if (!finding.RootFactKey.StartsWith("BAD_ACTOR_")) return; + + var queryHash = finding.RootFactKey.Replace("BAD_ACTOR_", ""); + if (string.IsNullOrEmpty(queryHash)) return; + + // Look up plan_handle from collect.query_stats for this query_hash + string? planHandle = null; + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 1 CONVERT(VARCHAR(130), plan_handle, 1) AS plan_handle +FROM collect.query_stats +WHERE query_hash = CONVERT(BINARY(8), @queryHash, 1) +AND plan_handle IS NOT NULL +ORDER BY collection_time DESC;"; + + cmd.Parameters.Add(new SqlParameter("@queryHash", queryHash)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (await reader.ReadAsync() && !reader.IsDBNull(0)) + planHandle = reader.GetString(0); + } + catch { return; } + + if (string.IsNullOrEmpty(planHandle)) return; + + // Fetch plan XML live from SQL Server + var planXml = await _planFetcher.FetchPlanXmlAsync(context.ServerId, planHandle); + if (string.IsNullOrEmpty(planXml)) return; + + try + { + var plan = ShowPlanParser.Parse(planXml); + PlanAnalyzer.Analyze(plan); + + var allWarnings = plan.Batches + .SelectMany(b => b.Statements) + .Where(s => s.RootNode != null) + .SelectMany(s => + { + var nodeWarnings = new List(); + CollectPlanNodes(s.RootNode!, nodeWarnings); + return s.PlanWarnings + .Concat(nodeWarnings.SelectMany(n => n.Warnings)); + }) + .ToList(); + + var missingIndexes = plan.AllMissingIndexes; + + if (allWarnings.Count == 0 && missingIndexes.Count == 0) return; + + finding.DrillDown["plan_analysis"] = new + { + query_hash = queryHash, + warning_count = allWarnings.Count, + critical_count = allWarnings.Count(w => w.Severity == PlanWarningSeverity.Critical), + warnings = allWarnings + .OrderByDescending(w => w.Severity) + .Take(10) + .Select(w => new + { + severity = w.Severity.ToString(), + type = w.WarningType, + message = McpHelpers.Truncate(w.Message, 300) + }), + missing_indexes = missingIndexes.Take(5).Select(idx => new + { + table = $"{idx.Schema}.{idx.Table}", + impact = idx.Impact, + create_statement = idx.CreateStatement + }) + }; + } + catch + { + // Plan parsing can fail on malformed XML -- skip silently + } + } + + private static void CollectPlanNodes(PlanNode node, List nodes) + { + nodes.Add(node); + foreach (var child in node.Children) + CollectPlanNodes(child, nodes); + } + + private async Task CollectBadActorDetail(AnalysisFinding finding, AnalysisContext context) + { + // Extract query_hash from the fact key (BAD_ACTOR_0x...) 
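+        // e.g. a RootFactKey like "BAD_ACTOR_0x90F7BBD863F8C7F1" (hash value illustrative) becomes "0x90F7BBD863F8C7F1".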
+ var queryHash = finding.RootFactKey.Replace("BAD_ACTOR_", ""); + if (string.IsNullOrEmpty(queryHash)) return; + + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + database_name, + CONVERT(VARCHAR(18), query_hash, 1) AS query_hash, + LEFT(CAST(DECOMPRESS(MAX(query_text)) AS NVARCHAR(MAX)), 500) AS query_text, + CAST(SUM(execution_count_delta) AS BIGINT) AS exec_count, + CASE WHEN SUM(execution_count_delta) > 0 + THEN CAST(SUM(total_worker_time_delta) AS FLOAT) / SUM(execution_count_delta) / 1000.0 + ELSE 0 END AS avg_cpu_ms, + CASE WHEN SUM(execution_count_delta) > 0 + THEN CAST(SUM(total_elapsed_time_delta) AS FLOAT) / SUM(execution_count_delta) / 1000.0 + ELSE 0 END AS avg_elapsed_ms, + CASE WHEN SUM(execution_count_delta) > 0 + THEN CAST(SUM(total_logical_reads_delta) AS FLOAT) / SUM(execution_count_delta) + ELSE 0 END AS avg_reads, + CAST(SUM(total_worker_time_delta) AS BIGINT) AS total_cpu_us, + CAST(SUM(total_logical_reads_delta) AS BIGINT) AS total_reads, + CAST(SUM(total_spills) AS BIGINT) AS total_spills, + MAX(max_dop) AS max_dop +FROM collect.query_stats +WHERE collection_time >= @startTime +AND collection_time <= @endTime +AND query_hash = CONVERT(BINARY(8), @queryHash, 1) +GROUP BY database_name, query_hash;"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + cmd.Parameters.Add(new SqlParameter("@queryHash", queryHash)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (await reader.ReadAsync()) + { + finding.DrillDown!["bad_actor_query"] = new + { + database = reader.IsDBNull(0) ? "" : reader.GetString(0), + query_hash = reader.IsDBNull(1) ? "" : reader.GetString(1), + query_text = reader.IsDBNull(2) ? "" : reader.GetString(2), + execution_count = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)), + avg_cpu_ms = reader.IsDBNull(4) ? 0.0 : Math.Round(Convert.ToDouble(reader.GetValue(4)), 2), + avg_elapsed_ms = reader.IsDBNull(5) ? 0.0 : Math.Round(Convert.ToDouble(reader.GetValue(5)), 2), + avg_reads = reader.IsDBNull(6) ? 0.0 : Math.Round(Convert.ToDouble(reader.GetValue(6)), 0), + total_cpu_ms = reader.IsDBNull(7) ? 0.0 : Convert.ToDouble(reader.GetValue(7)) / 1000.0, + total_reads = reader.IsDBNull(8) ? 0L : Convert.ToInt64(reader.GetValue(8)), + total_spills = reader.IsDBNull(9) ? 0L : Convert.ToInt64(reader.GetValue(9)), + max_dop = reader.IsDBNull(10) ? 0 : Convert.ToInt32(reader.GetValue(10)) + }; + } + } +} diff --git a/Dashboard/Analysis/SqlServerFactCollector.cs b/Dashboard/Analysis/SqlServerFactCollector.cs new file mode 100644 index 00000000..a99d9aa1 --- /dev/null +++ b/Dashboard/Analysis/SqlServerFactCollector.cs @@ -0,0 +1,1687 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.Data.SqlClient; +using PerformanceMonitorDashboard.Helpers; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Collects facts from SQL Server for the Dashboard analysis engine. +/// Each fact category has its own collection method, added incrementally. +/// Port of DuckDbFactCollector from Lite — queries collect.* tables instead of DuckDB views. 
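+/// Every collector catches and logs its own exceptions, so a single failed query never aborts the analysis run.
+/// Illustrative usage: var facts = await new SqlServerFactCollector(connectionString).CollectFactsAsync(context);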
+/// </summary>
+public class SqlServerFactCollector : IFactCollector
+{
+    private readonly string _connectionString;
+
+    public SqlServerFactCollector(string connectionString)
+    {
+        _connectionString = connectionString;
+    }
+
+    public async Task<List<Fact>> CollectFactsAsync(AnalysisContext context)
+    {
+        var facts = new List<Fact>();
+
+        await CollectWaitStatsFactsAsync(context, facts);
+        GroupGeneralLockWaits(facts, context);
+        GroupParallelismWaits(facts, context);
+        await CollectBlockingFactsAsync(context, facts);
+        await CollectDeadlockFactsAsync(context, facts);
+        await CollectServerConfigFactsAsync(context, facts);
+        await CollectMemoryFactsAsync(context, facts);
+        await CollectDatabaseSizeFactAsync(context, facts);
+        await CollectServerMetadataFactsAsync(context, facts);
+        await CollectCpuUtilizationFactsAsync(context, facts);
+        await CollectIoLatencyFactsAsync(context, facts);
+        await CollectTempDbFactsAsync(context, facts);
+        await CollectMemoryGrantFactsAsync(context, facts);
+        await CollectQueryStatsFactsAsync(context, facts);
+        await CollectBadActorFactsAsync(context, facts);
+        await CollectPerfmonFactsAsync(context, facts);
+        await CollectMemoryClerkFactsAsync(context, facts);
+        await CollectDatabaseConfigFactsAsync(context, facts);
+        await CollectProcedureStatsFactsAsync(context, facts);
+        await CollectActiveQueryFactsAsync(context, facts);
+        await CollectRunningJobFactsAsync(context, facts);
+        await CollectSessionFactsAsync(context, facts);
+        await CollectTraceFlagFactsAsync(context, facts);
+        await CollectServerPropertiesFactsAsync(context, facts);
+        await CollectDiskSpaceFactsAsync(context, facts);
+
+        return facts;
+    }
+
+    /// <summary>
+    /// Collects wait stats facts — one Fact per significant wait type.
+    /// Value is wait_time_ms / period_duration_ms (fraction of examined period).
+    /// </summary>
+    private async Task CollectWaitStatsFactsAsync(AnalysisContext context, List<Fact> facts)
+    {
+        try
+        {
+            using var connection = new SqlConnection(_connectionString);
+            await connection.OpenAsync();
+
+            using var command = connection.CreateCommand();
+            command.CommandText = @"
+SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
+
+SELECT
+    wait_type,
+    SUM(waiting_tasks_count_delta) AS total_waiting_tasks,
+    SUM(wait_time_ms_delta) AS total_wait_time_ms,
+    SUM(signal_wait_time_ms_delta) AS total_signal_wait_time_ms
+FROM collect.wait_stats
+WHERE collection_time >= @startTime
+AND collection_time <= @endTime
+AND wait_time_ms_delta > 0
+GROUP BY wait_type
+ORDER BY SUM(wait_time_ms_delta) DESC";
+
+            command.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
+            command.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));
+
+            using var reader = await command.ExecuteReaderAsync();
+            while (await reader.ReadAsync())
+            {
+                var waitType = reader.GetString(0);
+                var waitingTasks = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1));
+                var waitTimeMs = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2));
+                var signalWaitTimeMs = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3));
+
+                if (waitTimeMs <= 0) continue;
+
+                var fractionOfPeriod = waitTimeMs / context.PeriodDurationMs;
+                var avgMsPerWait = waitingTasks > 0 ?
(double)waitTimeMs / waitingTasks : 0; + + facts.Add(new Fact + { + Source = "waits", + Key = waitType, + Value = fractionOfPeriod, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["wait_time_ms"] = waitTimeMs, + ["waiting_tasks_count"] = waitingTasks, + ["signal_wait_time_ms"] = signalWaitTimeMs, + ["resource_wait_time_ms"] = waitTimeMs - signalWaitTimeMs, + ["avg_ms_per_wait"] = avgMsPerWait, + ["period_duration_ms"] = context.PeriodDurationMs + } + }); + } + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectWaitStatsFactsAsync failed", ex); + } + } + + /// + /// Collects blocking facts from blocking_BlockedProcessReport. + /// Produces a single BLOCKING_EVENTS fact with event count, rate, and details. + /// Value is events per hour for threshold comparison. + /// + private async Task CollectBlockingFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var command = connection.CreateCommand(); + command.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + COUNT(*) AS event_count, + AVG(CAST(wait_time_ms AS FLOAT)) AS avg_wait_time_ms, + MAX(wait_time_ms) AS max_wait_time_ms, + COUNT(DISTINCT spid) AS distinct_head_blockers, + COUNT(CASE WHEN status = 'sleeping' THEN 1 END) AS sleeping_blocker_count +FROM collect.blocking_BlockedProcessReport +WHERE collection_time >= @startTime +AND collection_time <= @endTime"; + + command.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + command.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await command.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var eventCount = reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)); + if (eventCount <= 0) return; + + var avgWaitTimeMs = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var maxWaitTimeMs = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var distinctHeadBlockers = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)); + var sleepingBlockerCount = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)); + + var periodHours = context.PeriodDurationMs / 3_600_000.0; + var eventsPerHour = periodHours > 0 ? eventCount / periodHours : 0; + + facts.Add(new Fact + { + Source = "blocking", + Key = "BLOCKING_EVENTS", + Value = eventsPerHour, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["event_count"] = eventCount, + ["events_per_hour"] = eventsPerHour, + ["avg_wait_time_ms"] = avgWaitTimeMs, + ["max_wait_time_ms"] = maxWaitTimeMs, + ["distinct_head_blockers"] = distinctHeadBlockers, + ["sleeping_blocker_count"] = sleepingBlockerCount, + ["period_hours"] = periodHours + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectBlockingFactsAsync failed", ex); + } + } + + /// + /// Collects deadlock facts from the deadlocks table. + /// Produces a single DEADLOCKS fact with count and rate. + /// Value is deadlocks per hour for threshold comparison. 
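+    /// e.g. 6 deadlocks collected over a 2-hour window produce a fact value of 3.0.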
+ /// + private async Task CollectDeadlockFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var command = connection.CreateCommand(); + command.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT COUNT(*) AS deadlock_count +FROM collect.deadlocks +WHERE collection_time >= @startTime +AND collection_time <= @endTime"; + + command.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + command.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await command.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var deadlockCount = reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)); + if (deadlockCount <= 0) return; + + var periodHours = context.PeriodDurationMs / 3_600_000.0; + var deadlocksPerHour = periodHours > 0 ? deadlockCount / periodHours : 0; + + facts.Add(new Fact + { + Source = "blocking", + Key = "DEADLOCKS", + Value = deadlocksPerHour, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["deadlock_count"] = deadlockCount, + ["deadlocks_per_hour"] = deadlocksPerHour, + ["period_hours"] = periodHours + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectDeadlockFactsAsync failed", ex); + } + } + + /// + /// Collects server configuration settings relevant to analysis. + /// These become facts that amplifiers and the config audit tool can reference + /// to make recommendations specific (e.g., "your CTFP is 50" vs "check CTFP"). + /// + private async Task CollectServerConfigFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 4 + configuration_name, + CAST(value_in_use AS BIGINT) AS value_in_use +FROM config.server_configuration_history +WHERE configuration_name IN ( + 'cost threshold for parallelism', + 'max degree of parallelism', + 'max server memory (MB)', + 'max worker threads' +) +ORDER BY collection_time DESC"; + + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var configName = reader.GetString(0); + var value = Convert.ToDouble(reader.GetValue(1)); + + var factKey = configName switch + { + "cost threshold for parallelism" => "CONFIG_CTFP", + "max degree of parallelism" => "CONFIG_MAXDOP", + "max server memory (MB)" => "CONFIG_MAX_MEMORY_MB", + "max worker threads" => "CONFIG_MAX_WORKER_THREADS", + _ => null + }; + + if (factKey == null) continue; + + facts.Add(new Fact + { + Source = "config", + Key = factKey, + Value = value, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["value_in_use"] = value + } + }); + } + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectServerConfigFactsAsync failed", ex); + } + } + + /// + /// Collects memory stats: total physical RAM, buffer pool size, target memory. + /// These facts enable edition-aware memory recommendations in the config audit. 
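+    /// (Standard Edition, for example, caps the buffer pool at 128 GB, so "max server memory" above that is largely wasted.)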
+ /// + private async Task CollectMemoryFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 1 + total_physical_memory_mb, + buffer_pool_mb, + committed_target_memory_mb +FROM collect.memory_stats +WHERE collection_time <= @endTime +ORDER BY collection_time DESC"; + + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var totalPhysical = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var bufferPool = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var targetMemory = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)); + + if (totalPhysical > 0) + facts.Add(new Fact { Source = "memory", Key = "MEMORY_TOTAL_PHYSICAL_MB", Value = totalPhysical, ServerId = context.ServerId }); + if (bufferPool > 0) + facts.Add(new Fact { Source = "memory", Key = "MEMORY_BUFFER_POOL_MB", Value = bufferPool, ServerId = context.ServerId }); + if (targetMemory > 0) + facts.Add(new Fact { Source = "memory", Key = "MEMORY_TARGET_MB", Value = targetMemory, ServerId = context.ServerId }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectMemoryFactsAsync failed", ex); + } + } + + /// + /// Collects total database data size from file_io_stats. + /// Sums the latest size_on_disk_bytes across all database files for the server. + /// + private async Task CollectDatabaseSizeFactAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH latest AS ( + SELECT + database_name, + file_name, + size_on_disk_bytes, + ROW_NUMBER() OVER (PARTITION BY database_name, file_name ORDER BY collection_time DESC) AS rn + FROM collect.file_io_stats + WHERE collection_time <= @endTime + AND size_on_disk_bytes > 0 +) +SELECT SUM(size_on_disk_bytes / 1048576.0) AS total_size_mb +FROM latest +WHERE rn = 1"; + + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var totalSize = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + if (totalSize > 0) + facts.Add(new Fact { Source = "config", Key = "DATABASE_TOTAL_SIZE_MB", Value = totalSize, ServerId = context.ServerId }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectDatabaseSizeFactAsync failed", ex); + } + } + + /// + /// Collects SQL Server edition and major version from the server_properties table. 
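+    /// (engine_edition 8 = Azure SQL Managed Instance; major version 15 = SQL Server 2019, 16 = SQL Server 2022.)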
+ /// + private async Task CollectServerMetadataFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 1 + engine_edition, + CAST(LEFT(product_version, CHARINDEX('.', product_version) - 1) AS INT) AS major_version +FROM collect.server_properties +ORDER BY collection_time DESC"; + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var edition = reader.IsDBNull(0) ? 0 : Convert.ToInt32(reader.GetValue(0)); + var majorVersion = reader.IsDBNull(1) ? 0 : Convert.ToInt32(reader.GetValue(1)); + + if (edition > 0) + facts.Add(new Fact { Source = "config", Key = "SERVER_EDITION", Value = edition, ServerId = context.ServerId }); + if (majorVersion > 0) + facts.Add(new Fact { Source = "config", Key = "SERVER_MAJOR_VERSION", Value = majorVersion, ServerId = context.ServerId }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectServerMetadataFactsAsync failed", ex); + } + } + + /// + /// Collects CPU utilization: average and max SQL Server CPU % over the period. + /// Value is average SQL CPU %. Corroborates SOS_SCHEDULER_YIELD. + /// + private async Task CollectCpuUtilizationFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS avg_sql_cpu, + MAX(sqlserver_cpu_utilization) AS max_sql_cpu, + AVG(CAST(other_process_cpu_utilization AS FLOAT)) AS avg_other_cpu, + MAX(other_process_cpu_utilization) AS max_other_cpu, + COUNT(*) AS sample_count +FROM collect.cpu_utilization_stats +WHERE collection_time >= @startTime +AND collection_time <= @endTime"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var avgSqlCpu = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var maxSqlCpu = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var avgOtherCpu = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)); + var maxOtherCpu = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)); + var sampleCount = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)); + + if (sampleCount == 0) return; + + var cpuMetadata = new Dictionary + { + ["avg_sql_cpu"] = avgSqlCpu, + ["max_sql_cpu"] = maxSqlCpu, + ["avg_other_cpu"] = avgOtherCpu, + ["max_other_cpu"] = maxOtherCpu, + ["avg_total_cpu"] = avgSqlCpu + avgOtherCpu, + ["sample_count"] = sampleCount + }; + + facts.Add(new Fact + { + Source = "cpu", + Key = "CPU_SQL_PERCENT", + Value = avgSqlCpu, + ServerId = context.ServerId, + Metadata = cpuMetadata + }); + + // Emit a CPU_SPIKE fact when max is high and significantly above average. + // This catches bursty CPU events that average-based scoring misses entirely. + // Requires max >= 80% AND at least 3x the average (or avg < 20% with max >= 80%). 
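+            // e.g. avg 15% / max 85% -> spike (low average); avg 40% / max 85% -> no spike (ratio ~2.1x); avg 25% / max 90% -> spike (ratio 3.6x).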
+ if (maxSqlCpu >= 80 && (avgSqlCpu < 20 || maxSqlCpu / Math.Max(avgSqlCpu, 1) >= 3)) + { + facts.Add(new Fact + { + Source = "cpu", + Key = "CPU_SPIKE", + Value = maxSqlCpu, + ServerId = context.ServerId, + Metadata = cpuMetadata + }); + } + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectCpuUtilizationFactsAsync failed", ex); + } + } + + /// + /// Collects I/O latency from file_io_stats delta columns. + /// Computes average read and write latency across all database files. + /// + private async Task CollectIoLatencyFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + SUM(io_stall_read_ms_delta) AS total_stall_read_ms, + SUM(num_of_reads_delta) AS total_reads, + SUM(io_stall_write_ms_delta) AS total_stall_write_ms, + SUM(num_of_writes_delta) AS total_writes +FROM collect.file_io_stats +WHERE collection_time >= @startTime +AND collection_time <= @endTime +AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0)"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var totalStallReadMs = reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)); + var totalReads = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1)); + var totalStallWriteMs = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var totalWrites = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)); + + if (totalReads > 0) + { + var avgReadLatency = (double)totalStallReadMs / totalReads; + facts.Add(new Fact + { + Source = "io", + Key = "IO_READ_LATENCY_MS", + Value = avgReadLatency, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["avg_read_latency_ms"] = avgReadLatency, + ["total_stall_read_ms"] = totalStallReadMs, + ["total_reads"] = totalReads + } + }); + } + + if (totalWrites > 0) + { + var avgWriteLatency = (double)totalStallWriteMs / totalWrites; + facts.Add(new Fact + { + Source = "io", + Key = "IO_WRITE_LATENCY_MS", + Value = avgWriteLatency, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["avg_write_latency_ms"] = avgWriteLatency, + ["total_stall_write_ms"] = totalStallWriteMs, + ["total_writes"] = totalWrites + } + }); + } + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectIoLatencyFactsAsync failed", ex); + } + } + + /// + /// Collects TempDB usage facts: max usage, version store size, and unallocated space. + /// Value is max total_reserved_mb over the period. + /// Dashboard uses computed columns (total_reserved_mb, etc.) from collect.tempdb_stats. 
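+    /// Usage fraction = max reserved / (max reserved + min unallocated), so values near 1.0 mean tempdb was nearly full at peak.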
+ /// + private async Task CollectTempDbFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + MAX(total_reserved_mb) AS max_total_reserved_mb, + MAX(user_object_reserved_mb) AS max_user_object_mb, + MAX(internal_object_reserved_mb) AS max_internal_object_mb, + MAX(version_store_reserved_mb) AS max_version_store_mb, + MIN(unallocated_mb) AS min_unallocated_mb, + AVG(CAST(total_reserved_mb AS FLOAT)) AS avg_total_reserved_mb +FROM collect.tempdb_stats +WHERE collection_time >= @startTime +AND collection_time <= @endTime"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var maxReserved = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var maxUserObj = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var maxInternalObj = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)); + var maxVersionStore = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)); + var minUnallocated = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4)); + var avgReserved = reader.IsDBNull(5) ? 0.0 : Convert.ToDouble(reader.GetValue(5)); + + if (maxReserved <= 0) return; + + // TempDB usage as fraction of total space (reserved + unallocated) + var totalSpace = maxReserved + minUnallocated; + var usageFraction = totalSpace > 0 ? maxReserved / totalSpace : 0; + + facts.Add(new Fact + { + Source = "tempdb", + Key = "TEMPDB_USAGE", + Value = usageFraction, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["max_reserved_mb"] = maxReserved, + ["avg_reserved_mb"] = avgReserved, + ["max_user_object_mb"] = maxUserObj, + ["max_internal_object_mb"] = maxInternalObj, + ["max_version_store_mb"] = maxVersionStore, + ["min_unallocated_mb"] = minUnallocated, + ["usage_fraction"] = usageFraction + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectTempDbFactsAsync failed", ex); + } + } + + /// + /// Collects memory grant facts from the memory_grant_stats table. + /// Detects grant waiters (sessions waiting for memory) and grant pressure. + /// + private async Task CollectMemoryGrantFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + MAX(waiter_count) AS max_waiters, + AVG(CAST(waiter_count AS FLOAT)) AS avg_waiters, + MAX(grantee_count) AS max_grantees, + SUM(timeout_error_count_delta) AS total_timeout_errors, + SUM(forced_grant_count_delta) AS total_forced_grants +FROM collect.memory_grant_stats +WHERE collection_time >= @startTime +AND collection_time <= @endTime"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var maxWaiters = reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)); + var avgWaiters = reader.IsDBNull(1) ? 
0.0 : Convert.ToDouble(reader.GetValue(1)); + var maxGrantees = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var totalTimeouts = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)); + var totalForcedGrants = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)); + + // Only create a fact if there's evidence of grant pressure + if (maxWaiters <= 0 && totalTimeouts <= 0 && totalForcedGrants <= 0) return; + + facts.Add(new Fact + { + Source = "memory", + Key = "MEMORY_GRANT_PENDING", + Value = maxWaiters, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["max_waiters"] = maxWaiters, + ["avg_waiters"] = avgWaiters, + ["max_grantees"] = maxGrantees, + ["total_timeout_errors"] = totalTimeouts, + ["total_forced_grants"] = totalForcedGrants + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectMemoryGrantFactsAsync failed", ex); + } + } + + /// + /// Collects query-level aggregate facts from query_stats. + /// Focuses on spills (memory grant misestimates) and high-parallelism queries. + /// + private async Task CollectQueryStatsFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + SUM(total_spills) AS total_spills, + COUNT(CASE WHEN max_dop > 8 THEN 1 END) AS high_dop_queries, + COUNT(CASE WHEN total_spills > 0 THEN 1 END) AS spilling_queries, + SUM(execution_count_delta) AS total_executions, + SUM(total_worker_time_delta) AS total_cpu_time_us +FROM collect.query_stats +WHERE collection_time >= @startTime +AND collection_time <= @endTime +AND execution_count_delta > 0"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var totalSpills = reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)); + var highDopQueries = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1)); + var spillingQueries = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var totalExecutions = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)); + var totalCpuTimeUs = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)); + + if (totalSpills > 0) + { + facts.Add(new Fact + { + Source = "queries", + Key = "QUERY_SPILLS", + Value = totalSpills, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["total_spills"] = totalSpills, + ["spilling_query_count"] = spillingQueries, + ["total_executions"] = totalExecutions + } + }); + } + + if (highDopQueries > 0) + { + facts.Add(new Fact + { + Source = "queries", + Key = "QUERY_HIGH_DOP", + Value = highDopQueries, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["high_dop_query_count"] = highDopQueries, + ["total_cpu_time_us"] = totalCpuTimeUs, + ["total_executions"] = totalExecutions + } + }); + } + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectQueryStatsFactsAsync failed", ex); + } + } + + /// + /// Identifies individual queries that are consistently terrible ("bad actors"). + /// These queries don't necessarily cause server-level symptoms but waste resources + /// on every execution. Detection uses execution count tiers x per-execution impact. 
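+    /// Ranking weights average CPU per execution by the log of execution count, so both cost and call frequency matter.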
+ /// Top 5 worst offenders become individual BAD_ACTOR facts. + /// Dashboard query_hash is binary(8) — convert to hex string for fact key. + /// + private async Task CollectBadActorFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 5 + database_name, + CONVERT(VARCHAR(18), query_hash, 1) AS query_hash, + CAST(SUM(execution_count_delta) AS BIGINT) AS exec_count, + CASE WHEN SUM(execution_count_delta) > 0 + THEN CAST(SUM(total_worker_time_delta) AS FLOAT) / SUM(execution_count_delta) / 1000.0 + ELSE 0 END AS avg_cpu_ms, + CASE WHEN SUM(execution_count_delta) > 0 + THEN CAST(SUM(total_elapsed_time_delta) AS FLOAT) / SUM(execution_count_delta) / 1000.0 + ELSE 0 END AS avg_elapsed_ms, + CASE WHEN SUM(execution_count_delta) > 0 + THEN CAST(SUM(total_logical_reads_delta) AS FLOAT) / SUM(execution_count_delta) + ELSE 0 END AS avg_reads, + CAST(SUM(total_worker_time_delta) AS BIGINT) AS total_cpu_us, + CAST(SUM(total_logical_reads_delta) AS BIGINT) AS total_reads, + CAST(SUM(total_spills) AS BIGINT) AS total_spills, + MAX(max_dop) AS max_dop, + LEFT(CAST(DECOMPRESS(MAX(query_text)) AS NVARCHAR(MAX)), 200) AS query_text +FROM collect.query_stats +WHERE collection_time >= @startTime +AND collection_time <= @endTime +AND execution_count_delta > 0 +GROUP BY database_name, query_hash +HAVING SUM(execution_count_delta) >= 100 +ORDER BY CAST(SUM(total_worker_time_delta) AS FLOAT) / NULLIF(SUM(execution_count_delta), 0) * + LOG(NULLIF(SUM(execution_count_delta), 0)) DESC"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var dbName = reader.IsDBNull(0) ? "" : reader.GetString(0); + var queryHash = reader.IsDBNull(1) ? "" : reader.GetString(1); + var execCount = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var avgCpuMs = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)); + var avgElapsedMs = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4)); + var avgReads = reader.IsDBNull(5) ? 0.0 : Convert.ToDouble(reader.GetValue(5)); + var totalCpuUs = reader.IsDBNull(6) ? 0L : Convert.ToInt64(reader.GetValue(6)); + var totalReads = reader.IsDBNull(7) ? 0L : Convert.ToInt64(reader.GetValue(7)); + var totalSpills = reader.IsDBNull(8) ? 0L : Convert.ToInt64(reader.GetValue(8)); + var maxDop = reader.IsDBNull(9) ? 0 : Convert.ToInt32(reader.GetValue(9)); + var queryText = reader.IsDBNull(10) ? 
"" : reader.GetString(10); + + // Skip low-impact queries — need meaningful per-execution cost + if (avgCpuMs < 10 && avgReads < 1000) continue; + + facts.Add(new Fact + { + Source = "bad_actor", + Key = $"BAD_ACTOR_{queryHash}", + Value = avgCpuMs, // Primary scoring dimension + ServerId = context.ServerId, + DatabaseName = dbName, + Metadata = new Dictionary + { + ["execution_count"] = execCount, + ["avg_cpu_ms"] = avgCpuMs, + ["avg_elapsed_ms"] = avgElapsedMs, + ["avg_reads"] = avgReads, + ["total_cpu_us"] = totalCpuUs, + ["total_reads"] = totalReads, + ["total_spills"] = totalSpills, + ["max_dop"] = maxDop + } + }); + } + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectBadActorFactsAsync failed", ex); + } + } + + /// + /// Collects key perfmon counters: Page Life Expectancy, Batch Requests/sec, compilations. + /// PLE is scored; others are throughput context for the AI. + /// + private async Task CollectPerfmonFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH latest AS ( + SELECT + counter_name, + cntr_value, + cntr_value_delta, + ROW_NUMBER() OVER (PARTITION BY counter_name ORDER BY collection_time DESC) AS rn + FROM collect.perfmon_stats + WHERE collection_time >= @startTime + AND collection_time <= @endTime + AND counter_name IN ('Page life expectancy', 'Batch Requests/sec', 'SQL Compilations/sec', 'SQL Re-Compilations/sec') +) +SELECT counter_name, cntr_value, cntr_value_delta +FROM latest WHERE rn = 1"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var counterName = reader.GetString(0); + var cntrValue = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1)); + var deltaValue = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + + var (factKey, source) = counterName switch + { + "Page life expectancy" => ("PERFMON_PLE", "perfmon"), + "Batch Requests/sec" => ("PERFMON_BATCH_REQ_SEC", "perfmon"), + "SQL Compilations/sec" => ("PERFMON_COMPILATIONS_SEC", "perfmon"), + "SQL Re-Compilations/sec" => ("PERFMON_RECOMPILATIONS_SEC", "perfmon"), + _ => (null, null) + }; + + if (factKey == null) continue; + + // For PLE, use the absolute value. For rate counters, use delta. + var value = counterName == "Page life expectancy" ? (double)cntrValue : (double)deltaValue; + + facts.Add(new Fact + { + Source = source!, + Key = factKey, + Value = value, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["cntr_value"] = cntrValue, + ["delta_cntr_value"] = deltaValue + } + }); + } + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectPerfmonFactsAsync failed", ex); + } + } + + /// + /// Collects top memory clerks by size. Context for understanding where memory is allocated. + /// Dashboard stores pages_kb — convert to MB for consistency with Lite facts. 
+ /// + private async Task CollectMemoryClerkFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH latest AS ( + SELECT + clerk_type, + SUM(pages_kb) / 1024.0 AS memory_mb, + ROW_NUMBER() OVER (PARTITION BY clerk_type ORDER BY collection_time DESC) AS rn, + collection_time + FROM collect.memory_clerks_stats + WHERE collection_time <= @endTime + GROUP BY clerk_type, collection_time +) +SELECT TOP 10 clerk_type, memory_mb +FROM latest WHERE rn = 1 AND memory_mb > 0 +ORDER BY memory_mb DESC"; + + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + var metadata = new Dictionary(); + var totalMb = 0.0; + var clerkCount = 0; + + while (await reader.ReadAsync()) + { + var clerkType = reader.GetString(0); + var memoryMb = Convert.ToDouble(reader.GetValue(1)); + metadata[clerkType] = memoryMb; + totalMb += memoryMb; + clerkCount++; + } + + if (clerkCount == 0) return; + + metadata["total_top_clerks_mb"] = totalMb; + metadata["clerk_count"] = clerkCount; + + facts.Add(new Fact + { + Source = "memory", + Key = "MEMORY_CLERKS", + Value = totalMb, + ServerId = context.ServerId, + Metadata = metadata + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectMemoryClerkFactsAsync failed", ex); + } + } + + /// + /// Collects database configuration facts: RCSI status, auto_shrink, auto_close, + /// recovery model. Aggregates counts across databases. + /// Dashboard stores config as individual setting rows in config.database_configuration_history. + /// We pivot from the per-setting rows into aggregated counts. 
+ /// + private async Task CollectDatabaseConfigFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH latest AS ( + SELECT + database_name, + setting_name, + setting_value, + ROW_NUMBER() OVER (PARTITION BY database_name, setting_name ORDER BY collection_time DESC) AS rn + FROM config.database_configuration_history + WHERE setting_type = 'database_option' + AND database_name NOT IN ('master', 'msdb', 'model', 'tempdb') +), +pivoted AS ( + SELECT + database_name, + MAX(CASE WHEN setting_name = 'recovery_model_desc' THEN CAST(setting_value AS NVARCHAR(128)) END) AS recovery_model, + MAX(CASE WHEN setting_name = 'is_auto_shrink_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_auto_shrink_on, + MAX(CASE WHEN setting_name = 'is_auto_close_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_auto_close_on, + MAX(CASE WHEN setting_name = 'is_read_committed_snapshot_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_read_committed_snapshot_on, + MAX(CASE WHEN setting_name = 'is_auto_create_stats_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_auto_create_stats_on, + MAX(CASE WHEN setting_name = 'is_auto_update_stats_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_auto_update_stats_on, + MAX(CASE WHEN setting_name = 'page_verify_option_desc' THEN CAST(setting_value AS NVARCHAR(128)) END) AS page_verify_option, + MAX(CASE WHEN setting_name = 'is_query_store_on' THEN CAST(setting_value AS NVARCHAR(10)) END) AS is_query_store_on + FROM latest + WHERE rn = 1 + GROUP BY database_name +) +SELECT + COUNT(*) AS database_count, + COUNT(CASE WHEN is_auto_shrink_on = '1' OR is_auto_shrink_on = 'True' THEN 1 END) AS auto_shrink_count, + COUNT(CASE WHEN is_auto_close_on = '1' OR is_auto_close_on = 'True' THEN 1 END) AS auto_close_count, + COUNT(CASE WHEN is_read_committed_snapshot_on = '0' OR is_read_committed_snapshot_on = 'False' THEN 1 END) AS rcsi_off_count, + COUNT(CASE WHEN is_auto_create_stats_on = '0' OR is_auto_create_stats_on = 'False' THEN 1 END) AS auto_create_stats_off_count, + COUNT(CASE WHEN is_auto_update_stats_on = '0' OR is_auto_update_stats_on = 'False' THEN 1 END) AS auto_update_stats_off_count, + COUNT(CASE WHEN page_verify_option IS NOT NULL AND page_verify_option != 'CHECKSUM' THEN 1 END) AS page_verify_not_checksum_count, + COUNT(CASE WHEN recovery_model = 'FULL' THEN 1 END) AS full_recovery_count, + COUNT(CASE WHEN recovery_model = 'SIMPLE' THEN 1 END) AS simple_recovery_count, + COUNT(CASE WHEN is_query_store_on = '1' OR is_query_store_on = 'True' THEN 1 END) AS query_store_on_count +FROM pivoted"; + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var dbCount = reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)); + if (dbCount == 0) return; + + var autoShrink = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1)); + var autoClose = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var rcsiOff = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)); + var autoCreateOff = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)); + var autoUpdateOff = reader.IsDBNull(5) ? 0L : Convert.ToInt64(reader.GetValue(5)); + var pageVerifyBad = reader.IsDBNull(6) ? 
0L : Convert.ToInt64(reader.GetValue(6)); + var fullRecovery = reader.IsDBNull(7) ? 0L : Convert.ToInt64(reader.GetValue(7)); + var simpleRecovery = reader.IsDBNull(8) ? 0L : Convert.ToInt64(reader.GetValue(8)); + var queryStoreOn = reader.IsDBNull(9) ? 0L : Convert.ToInt64(reader.GetValue(9)); + + facts.Add(new Fact + { + Source = "database_config", + Key = "DB_CONFIG", + Value = dbCount, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["database_count"] = dbCount, + ["auto_shrink_on_count"] = autoShrink, + ["auto_close_on_count"] = autoClose, + ["rcsi_off_count"] = rcsiOff, + ["auto_create_stats_off_count"] = autoCreateOff, + ["auto_update_stats_off_count"] = autoUpdateOff, + ["page_verify_not_checksum_count"] = pageVerifyBad, + ["full_recovery_count"] = fullRecovery, + ["simple_recovery_count"] = simpleRecovery, + ["query_store_on_count"] = queryStoreOn + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectDatabaseConfigFactsAsync failed", ex); + } + } + + /// + /// Collects procedure stats: top procedure by delta CPU time in the period. + /// + private async Task CollectProcedureStatsFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + COUNT(DISTINCT object_name) AS distinct_procs, + SUM(execution_count_delta) AS total_executions, + SUM(total_worker_time_delta) AS total_cpu_time_us, + SUM(total_elapsed_time_delta) AS total_elapsed_time_us, + SUM(total_logical_reads_delta) AS total_logical_reads +FROM collect.procedure_stats +WHERE collection_time >= @startTime +AND collection_time <= @endTime +AND execution_count_delta > 0"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var distinctProcs = reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)); + var totalExecs = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1)); + var totalCpuUs = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var totalElapsedUs = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)); + var totalReads = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)); + + if (totalExecs == 0) return; + + facts.Add(new Fact + { + Source = "queries", + Key = "PROCEDURE_STATS", + Value = totalCpuUs, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["distinct_procedures"] = distinctProcs, + ["total_executions"] = totalExecs, + ["total_cpu_time_us"] = totalCpuUs, + ["total_elapsed_time_us"] = totalElapsedUs, + ["total_logical_reads"] = totalReads, + ["avg_cpu_per_exec_us"] = totalExecs > 0 ? (double)totalCpuUs / totalExecs : 0 + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectProcedureStatsFactsAsync failed", ex); + } + } + + /// + /// Collects active query snapshot facts: long-running queries, blocked sessions, high DOP. + /// Dashboard query_snapshots table is created by sp_WhoIsActive dynamically. + /// We query it if it exists. 
+ /// + private async Task CollectActiveQueryFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + // Check if the table exists first (created dynamically by sp_WhoIsActive) + using var checkCmd = connection.CreateCommand(); + checkCmd.CommandText = "SELECT OBJECT_ID(N'collect.query_snapshots', N'U')"; + var tableExists = await checkCmd.ExecuteScalarAsync(); + if (tableExists == null || tableExists == DBNull.Value) return; + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + COUNT(*) AS total_snapshots, + COUNT(CASE WHEN DATEDIFF(MILLISECOND, 0, [elapsed_time]) > 30000 THEN 1 END) AS long_running_count, + COUNT(CASE WHEN [blocking_session_id] IS NOT NULL AND [blocking_session_id] != '' THEN 1 END) AS blocked_count, + MAX(DATEDIFF(MILLISECOND, 0, [elapsed_time])) AS max_elapsed_ms, + COUNT(DISTINCT [session_id]) AS distinct_sessions +FROM collect.query_snapshots +WHERE collection_time >= @startTime +AND collection_time <= @endTime"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var totalSnapshots = reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)); + if (totalSnapshots == 0) return; + + var longRunning = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1)); + var blocked = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var maxElapsed = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)); + var distinctSessions = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)); + + facts.Add(new Fact + { + Source = "queries", + Key = "ACTIVE_QUERIES", + Value = longRunning, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["total_snapshots"] = totalSnapshots, + ["long_running_count"] = longRunning, + ["blocked_count"] = blocked, + ["max_elapsed_ms"] = maxElapsed, + ["distinct_sessions"] = distinctSessions + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectActiveQueryFactsAsync failed", ex); + } + } + + /// + /// Collects running job facts: jobs currently running long vs historical averages. + /// + private async Task CollectRunningJobFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + COUNT(*) AS running_count, + COUNT(CASE WHEN is_running_long = 1 THEN 1 END) AS running_long_count, + MAX(percent_of_average) AS max_percent_of_avg, + MAX(current_duration_seconds) AS max_duration_seconds +FROM collect.running_jobs +WHERE collection_time >= @startTime +AND collection_time <= @endTime"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var runningCount = reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)); + if (runningCount == 0) return; + + var runningLong = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1)); + var maxPctAvg = reader.IsDBNull(2) ? 
0.0 : Convert.ToDouble(reader.GetValue(2)); + var maxDuration = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)); + + facts.Add(new Fact + { + Source = "jobs", + Key = "RUNNING_JOBS", + Value = runningLong, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["running_count"] = runningCount, + ["running_long_count"] = runningLong, + ["max_percent_of_average"] = maxPctAvg, + ["max_duration_seconds"] = maxDuration + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectRunningJobFactsAsync failed", ex); + } + } + + /// + /// Collects session stats: connection counts, total connections. + /// Dashboard session_stats is a flat table (not per-program_name), so we adapt. + /// + private async Task CollectSessionFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH latest AS ( + SELECT + total_sessions, + running_sessions, + sleeping_sessions, + dormant_sessions, + databases_with_connections, + top_application_connections, + ROW_NUMBER() OVER (ORDER BY collection_time DESC) AS rn + FROM collect.session_stats + WHERE collection_time >= @startTime + AND collection_time <= @endTime +) +SELECT + total_sessions AS total_connections, + running_sessions AS total_running, + sleeping_sessions AS total_sleeping, + dormant_sessions AS total_dormant, + databases_with_connections AS distinct_apps, + top_application_connections AS max_app_connections +FROM latest WHERE rn = 1"; + + cmd.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var totalConns = reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)); + if (totalConns == 0) return; + + var totalRunning = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1)); + var totalSleeping = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var totalDormant = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)); + var distinctApps = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)); + var maxAppConns = reader.IsDBNull(5) ? 0L : Convert.ToInt64(reader.GetValue(5)); + + facts.Add(new Fact + { + Source = "sessions", + Key = "SESSION_STATS", + Value = totalConns, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["total_connections"] = totalConns, + ["total_running"] = totalRunning, + ["total_sleeping"] = totalSleeping, + ["total_dormant"] = totalDormant, + ["distinct_applications"] = distinctApps, + ["max_app_connections"] = maxAppConns + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectSessionFactsAsync failed", ex); + } + } + + /// + /// Collects active global trace flags. Context for the AI to factor into recommendations. 
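+    /// Only flags that are both global and enabled (is_global = 1, status = 1) become TF_* metadata entries.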
+ /// + private async Task CollectTraceFlagFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH latest AS ( + SELECT + trace_flag, + status, + ROW_NUMBER() OVER (PARTITION BY trace_flag ORDER BY collection_time DESC) AS rn + FROM config.trace_flags_history + WHERE is_global = 1 +) +SELECT trace_flag +FROM latest WHERE rn = 1 AND status = 1 +ORDER BY trace_flag"; + + using var reader = await cmd.ExecuteReaderAsync(); + var metadata = new Dictionary(); + var flagCount = 0; + + while (await reader.ReadAsync()) + { + var flag = Convert.ToInt32(reader.GetValue(0)); + metadata[$"TF_{flag}"] = 1; + flagCount++; + } + + if (flagCount == 0) return; + + metadata["flag_count"] = flagCount; + + facts.Add(new Fact + { + Source = "config", + Key = "TRACE_FLAGS", + Value = flagCount, + ServerId = context.ServerId, + Metadata = metadata + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectTraceFlagFactsAsync failed", ex); + } + } + + /// + /// Collects server hardware properties: CPU count, cores, sockets, memory. + /// Critical context for MAXDOP and memory recommendations. + /// + private async Task CollectServerPropertiesFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 1 + cpu_count, + hyperthread_ratio, + physical_memory_mb, + socket_count, + cores_per_socket, + is_hadr_enabled, + edition, + product_version +FROM collect.server_properties +ORDER BY collection_time DESC"; + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var cpuCount = reader.IsDBNull(0) ? 0 : Convert.ToInt32(reader.GetValue(0)); + var htRatio = reader.IsDBNull(1) ? 0 : Convert.ToInt32(reader.GetValue(1)); + var physicalMemMb = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var socketCount = reader.IsDBNull(3) ? 0 : Convert.ToInt32(reader.GetValue(3)); + var coresPerSocket = reader.IsDBNull(4) ? 0 : Convert.ToInt32(reader.GetValue(4)); + var hadrEnabled = !reader.IsDBNull(5) && Convert.ToBoolean(reader.GetValue(5)); + + if (cpuCount == 0) return; + + facts.Add(new Fact + { + Source = "config", + Key = "SERVER_HARDWARE", + Value = cpuCount, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["cpu_count"] = cpuCount, + ["hyperthread_ratio"] = htRatio, + ["physical_memory_mb"] = physicalMemMb, + ["socket_count"] = socketCount, + ["cores_per_socket"] = coresPerSocket, + ["hadr_enabled"] = hadrEnabled ? 1 : 0 + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectServerPropertiesFactsAsync failed", ex); + } + } + + /// + /// Collects disk space facts from database_size_stats: volume free space, file sizes. 
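+    /// Value is the minimum free-space fraction across all volumes, so the fullest volume drives the score.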
+ /// + private async Task CollectDiskSpaceFactsAsync(AnalysisContext context, List facts) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH latest AS ( + SELECT + volume_mount_point, + volume_total_mb, + volume_free_mb, + ROW_NUMBER() OVER (PARTITION BY volume_mount_point ORDER BY collection_time DESC) AS rn + FROM collect.database_size_stats + WHERE collection_time <= @endTime + AND volume_total_mb > 0 +) +SELECT + MIN(volume_free_mb * 1.0 / volume_total_mb) AS min_free_pct, + MIN(volume_free_mb) AS min_free_mb, + COUNT(DISTINCT volume_mount_point) AS volume_count, + SUM(volume_total_mb) AS total_volume_mb, + SUM(volume_free_mb) AS total_free_mb +FROM latest WHERE rn = 1"; + + cmd.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var minFreePct = reader.IsDBNull(0) ? 1.0 : Convert.ToDouble(reader.GetValue(0)); + var minFreeMb = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var volumeCount = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var totalVolumeMb = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)); + var totalFreeMb = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4)); + + if (volumeCount == 0) return; + + facts.Add(new Fact + { + Source = "disk", + Key = "DISK_SPACE", + Value = minFreePct, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["min_free_pct"] = minFreePct, + ["min_free_mb"] = minFreeMb, + ["volume_count"] = volumeCount, + ["total_volume_mb"] = totalVolumeMb, + ["total_free_mb"] = totalFreeMb + } + }); + } + catch (Exception ex) + { + Logger.Error("SqlServerFactCollector.CollectDiskSpaceFactsAsync failed", ex); + } + } + + /// + /// Groups general lock waits (X, U, IX, SIX, BU, IU, UIX, etc.) into a single "LCK" fact. + /// Keeps individual facts for: + /// - LCK_M_S, LCK_M_IS (reader/writer blocking -- RCSI signal) + /// - LCK_M_RS_*, LCK_M_RIn_*, LCK_M_RX_* (serializable/repeatable read signal) + /// - SCH_M, SCH_S (schema locks -- DDL/index operations) + /// Individual constituent wait times are preserved in metadata as "{type}_ms" keys. + /// + private static void GroupGeneralLockWaits(List facts, AnalysisContext context) + { + var generalLocks = facts.Where(f => f.Source == "waits" && IsGeneralLockWait(f.Key)).ToList(); + if (generalLocks.Count == 0) return; + + var totalWaitTimeMs = generalLocks.Sum(f => f.Metadata.GetValueOrDefault("wait_time_ms")); + var totalWaitingTasks = generalLocks.Sum(f => f.Metadata.GetValueOrDefault("waiting_tasks_count")); + var totalSignalMs = generalLocks.Sum(f => f.Metadata.GetValueOrDefault("signal_wait_time_ms")); + var avgMsPerWait = totalWaitingTasks > 0 ? 
totalWaitTimeMs / totalWaitingTasks : 0; + var fractionOfPeriod = totalWaitTimeMs / context.PeriodDurationMs; + + var metadata = new Dictionary + { + ["wait_time_ms"] = totalWaitTimeMs, + ["waiting_tasks_count"] = totalWaitingTasks, + ["signal_wait_time_ms"] = totalSignalMs, + ["resource_wait_time_ms"] = totalWaitTimeMs - totalSignalMs, + ["avg_ms_per_wait"] = avgMsPerWait, + ["period_duration_ms"] = context.PeriodDurationMs, + ["lock_type_count"] = generalLocks.Count + }; + + // Preserve individual constituent wait times for detailed analysis + foreach (var lck in generalLocks) + metadata[$"{lck.Key}_ms"] = lck.Metadata.GetValueOrDefault("wait_time_ms"); + + // Remove individual facts, add grouped fact + foreach (var lck in generalLocks) + facts.Remove(lck); + + facts.Add(new Fact + { + Source = "waits", + Key = "LCK", + Value = fractionOfPeriod, + ServerId = context.ServerId, + Metadata = metadata + }); + } + + /// + /// Groups all CX* parallelism waits (CXPACKET, CXCONSUMER, CXSYNC_PORT, CXSYNC_CONSUMER, etc.) + /// into a single "CXPACKET" fact. They all indicate the same thing: parallel queries are running. + /// Individual wait times are preserved in metadata for detailed analysis. + /// + private static void GroupParallelismWaits(List facts, AnalysisContext context) + { + var cxWaits = facts.Where(f => f.Source == "waits" && f.Key.StartsWith("CX", StringComparison.Ordinal)).ToList(); + if (cxWaits.Count <= 1) return; + + var totalWaitTimeMs = cxWaits.Sum(f => f.Metadata.GetValueOrDefault("wait_time_ms")); + var totalWaitingTasks = cxWaits.Sum(f => f.Metadata.GetValueOrDefault("waiting_tasks_count")); + var totalSignalMs = cxWaits.Sum(f => f.Metadata.GetValueOrDefault("signal_wait_time_ms")); + var avgMsPerWait = totalWaitingTasks > 0 ? totalWaitTimeMs / totalWaitingTasks : 0; + var fractionOfPeriod = totalWaitTimeMs / context.PeriodDurationMs; + + var metadata = new Dictionary + { + ["wait_time_ms"] = totalWaitTimeMs, + ["waiting_tasks_count"] = totalWaitingTasks, + ["signal_wait_time_ms"] = totalSignalMs, + ["resource_wait_time_ms"] = totalWaitTimeMs - totalSignalMs, + ["avg_ms_per_wait"] = avgMsPerWait, + ["period_duration_ms"] = context.PeriodDurationMs + }; + + // Preserve individual constituent wait times for detailed analysis + foreach (var cx in cxWaits) + metadata[$"{cx.Key}_ms"] = cx.Metadata.GetValueOrDefault("wait_time_ms"); + + foreach (var cx in cxWaits) + facts.Remove(cx); + + facts.Add(new Fact + { + Source = "waits", + Key = "CXPACKET", + Value = fractionOfPeriod, + ServerId = cxWaits[0].ServerId, + Metadata = metadata + }); + } + + /// + /// Returns true for general lock waits that should be grouped into "LCK". + /// Excludes reader locks (S, IS), range locks (RS_*, RIn_*, RX_*), and schema locks. + /// + private static bool IsGeneralLockWait(string waitType) + { + if (!waitType.StartsWith("LCK_M_")) return false; + + // Keep individual: reader/writer locks + if (waitType is "LCK_M_S" or "LCK_M_IS") return false; + + // Keep individual: range locks (serializable/repeatable read) + if (waitType.StartsWith("LCK_M_RS_") || + waitType.StartsWith("LCK_M_RIn_") || + waitType.StartsWith("LCK_M_RX_")) return false; + + // Everything else (X, U, IX, SIX, BU, IU, UIX, etc.) 
-> group + return true; + } +} diff --git a/Dashboard/Analysis/SqlServerFindingStore.cs b/Dashboard/Analysis/SqlServerFindingStore.cs new file mode 100644 index 00000000..0fd0d73e --- /dev/null +++ b/Dashboard/Analysis/SqlServerFindingStore.cs @@ -0,0 +1,408 @@ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; +using Microsoft.Data.SqlClient; +using PerformanceMonitorDashboard.Helpers; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Persists analysis findings to SQL Server and checks for muted story hashes. +/// Handles the write side of the analysis pipeline -- after the engine produces +/// stories, SqlServerFindingStore saves them and filters out muted patterns. +/// Port of Lite's FindingStore -- uses SQL Server instead of DuckDB. +/// Auto-creates config.analysis_findings and config.analysis_muted tables if missing. +/// +public class SqlServerFindingStore +{ + private readonly string _connectionString; + private long _nextId; + + public SqlServerFindingStore(string connectionString) + { + _connectionString = connectionString; + _nextId = DateTime.UtcNow.Ticks; + } + + /// + /// Ensures the analysis_findings and analysis_muted tables exist. + /// Called before any read/write operation. Uses IF NOT EXISTS for idempotency. + /// + private async Task EnsureTablesExistAsync(SqlConnection connection) + { + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +IF OBJECT_ID(N'config.analysis_findings', N'U') IS NULL +BEGIN + CREATE TABLE config.analysis_findings + ( + finding_id bigint NOT NULL, + analysis_time datetime2(7) NOT NULL, + server_id integer NOT NULL, + server_name nvarchar(256) NOT NULL, + database_name nvarchar(256) NULL, + time_range_start datetime2(7) NULL, + time_range_end datetime2(7) NULL, + severity float NOT NULL, + confidence float NOT NULL, + category nvarchar(256) NOT NULL, + story_path nvarchar(2000) NOT NULL, + story_path_hash nvarchar(256) NOT NULL, + story_text nvarchar(4000) NOT NULL, + root_fact_key nvarchar(256) NOT NULL, + root_fact_value float NULL, + leaf_fact_key nvarchar(256) NULL, + leaf_fact_value float NULL, + fact_count integer NOT NULL, + CONSTRAINT PK_analysis_findings PRIMARY KEY CLUSTERED (finding_id) + WITH (DATA_COMPRESSION = PAGE) + ); + + CREATE INDEX IX_analysis_findings_server_time + ON config.analysis_findings (server_id, analysis_time DESC) + WITH (DATA_COMPRESSION = PAGE); +END; + +IF OBJECT_ID(N'config.analysis_muted', N'U') IS NULL +BEGIN + CREATE TABLE config.analysis_muted + ( + mute_id bigint NOT NULL, + server_id integer NULL, + story_path_hash nvarchar(256) NOT NULL, + story_path nvarchar(2000) NOT NULL, + muted_date datetime2(7) NOT NULL, + reason nvarchar(1000) NULL, + CONSTRAINT PK_analysis_muted PRIMARY KEY CLUSTERED (mute_id) + WITH (DATA_COMPRESSION = PAGE) + ); + + CREATE INDEX IX_analysis_muted_server_hash + ON config.analysis_muted (server_id, story_path_hash) + WITH (DATA_COMPRESSION = PAGE); +END;"; + + await cmd.ExecuteNonQueryAsync(); + } + + /// + /// Saves analysis stories as findings, filtering out any that match muted hashes. + /// Returns the list of findings that were actually saved (non-muted). 
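+ /// Stories with a severity of zero or below (absolution stories) are skipped entirely -- they confirm health rather than flag a problem.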
+ /// + public async Task> SaveFindingsAsync( + List stories, AnalysisContext context) + { + var mutedHashes = await GetMutedHashesAsync(context.ServerId); + var analysisTime = DateTime.UtcNow; + var saved = new List(); + + foreach (var story in stories) + { + // Skip absolution stories (severity 0) -- they confirm health, not problems + if (story.Severity <= 0) + continue; + + if (mutedHashes.Contains(story.StoryPathHash)) + continue; + + var finding = new AnalysisFinding + { + FindingId = _nextId++, + AnalysisTime = analysisTime, + ServerId = context.ServerId, + ServerName = context.ServerName, + TimeRangeStart = context.TimeRangeStart, + TimeRangeEnd = context.TimeRangeEnd, + Severity = story.Severity, + Confidence = story.Confidence, + Category = story.Category, + StoryPath = story.StoryPath, + StoryPathHash = story.StoryPathHash, + StoryText = story.StoryText, + RootFactKey = story.RootFactKey, + RootFactValue = story.RootFactValue, + LeafFactKey = story.LeafFactKey, + LeafFactValue = story.LeafFactValue, + FactCount = story.FactCount + }; + + await InsertFindingAsync(finding); + saved.Add(finding); + } + + return saved; + } + + /// + /// Returns the most recent findings for a server within the given time range. + /// + public async Task> GetRecentFindingsAsync( + int serverId, int hoursBack = 24, int limit = 100) + { + var findings = new List(); + + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + await EnsureTablesExistAsync(connection); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP (@limit) + finding_id, analysis_time, server_id, server_name, database_name, + time_range_start, time_range_end, severity, confidence, category, + story_path, story_path_hash, story_text, + root_fact_key, root_fact_value, leaf_fact_key, leaf_fact_value, fact_count +FROM config.analysis_findings +WHERE server_id = @serverId +AND analysis_time >= @cutoff +ORDER BY analysis_time DESC, severity DESC;"; + + cmd.Parameters.Add(new SqlParameter("@serverId", serverId)); + cmd.Parameters.Add(new SqlParameter("@cutoff", DateTime.UtcNow.AddHours(-hoursBack))); + cmd.Parameters.Add(new SqlParameter("@limit", limit)); + + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + findings.Add(ReadFinding(reader)); + } + } + catch (Exception ex) + { + Logger.Error($"[SqlServerFindingStore] GetRecentFindingsAsync failed: {ex.Message}"); + } + + return findings; + } + + /// + /// Returns the latest analysis run's findings for a server (most recent analysis_time). 
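+ /// Results are ordered by severity descending so the worst findings surface first.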
+ /// + public async Task> GetLatestFindingsAsync(int serverId) + { + var findings = new List(); + + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + await EnsureTablesExistAsync(connection); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + finding_id, analysis_time, server_id, server_name, database_name, + time_range_start, time_range_end, severity, confidence, category, + story_path, story_path_hash, story_text, + root_fact_key, root_fact_value, leaf_fact_key, leaf_fact_value, fact_count +FROM config.analysis_findings +WHERE server_id = @serverId +AND analysis_time = ( + SELECT MAX(analysis_time) FROM config.analysis_findings WHERE server_id = @serverId +) +ORDER BY severity DESC;"; + + cmd.Parameters.Add(new SqlParameter("@serverId", serverId)); + + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + findings.Add(ReadFinding(reader)); + } + } + catch (Exception ex) + { + Logger.Error($"[SqlServerFindingStore] GetLatestFindingsAsync failed: {ex.Message}"); + } + + return findings; + } + + /// + /// Mutes a story pattern so it won't appear in future analysis runs. + /// + public async Task MuteStoryAsync(int serverId, string storyPathHash, string storyPath, string? reason = null) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + await EnsureTablesExistAsync(connection); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +INSERT INTO config.analysis_muted (mute_id, server_id, story_path_hash, story_path, muted_date, reason) +VALUES (@muteId, @serverId, @storyPathHash, @storyPath, @mutedDate, @reason);"; + + cmd.Parameters.Add(new SqlParameter("@muteId", _nextId++)); + cmd.Parameters.Add(new SqlParameter("@serverId", serverId)); + cmd.Parameters.Add(new SqlParameter("@storyPathHash", storyPathHash)); + cmd.Parameters.Add(new SqlParameter("@storyPath", storyPath)); + cmd.Parameters.Add(new SqlParameter("@mutedDate", DateTime.UtcNow)); + cmd.Parameters.Add(new SqlParameter("@reason", (object?)reason ?? DBNull.Value)); + + await cmd.ExecuteNonQueryAsync(); + } + catch (Exception ex) + { + Logger.Error($"[SqlServerFindingStore] MuteStoryAsync failed: {ex.Message}"); + } + } + + /// + /// Unmutes a story pattern. + /// + public async Task UnmuteStoryAsync(long muteId) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + await EnsureTablesExistAsync(connection); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = "DELETE FROM config.analysis_muted WHERE mute_id = @muteId;"; + cmd.Parameters.Add(new SqlParameter("@muteId", muteId)); + await cmd.ExecuteNonQueryAsync(); + } + catch (Exception ex) + { + Logger.Error($"[SqlServerFindingStore] UnmuteStoryAsync failed: {ex.Message}"); + } + } + + /// + /// Cleans up old findings beyond the retention period. 
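+ /// Deletes findings older than the retention cutoff (default 30 days); mute entries in config.analysis_muted are not touched.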
+ /// + public async Task CleanupOldFindingsAsync(int retentionDays = 30) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + await EnsureTablesExistAsync(connection); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = "DELETE FROM config.analysis_findings WHERE analysis_time < @cutoff;"; + cmd.Parameters.Add(new SqlParameter("@cutoff", DateTime.UtcNow.AddDays(-retentionDays))); + await cmd.ExecuteNonQueryAsync(); + } + catch (Exception ex) + { + Logger.Error($"[SqlServerFindingStore] CleanupOldFindingsAsync failed: {ex.Message}"); + } + } + + private async Task> GetMutedHashesAsync(int serverId) + { + var hashes = new HashSet(); + + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + await EnsureTablesExistAsync(connection); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT story_path_hash FROM config.analysis_muted +WHERE server_id = @serverId OR server_id IS NULL;"; + + cmd.Parameters.Add(new SqlParameter("@serverId", serverId)); + + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + hashes.Add(reader.GetString(0)); + } + catch (Exception ex) + { + Logger.Error($"[SqlServerFindingStore] GetMutedHashesAsync failed: {ex.Message}"); + } + + return hashes; + } + + private async Task InsertFindingAsync(AnalysisFinding finding) + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + await EnsureTablesExistAsync(connection); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +INSERT INTO config.analysis_findings + (finding_id, analysis_time, server_id, server_name, database_name, + time_range_start, time_range_end, severity, confidence, category, + story_path, story_path_hash, story_text, + root_fact_key, root_fact_value, leaf_fact_key, leaf_fact_value, fact_count) +VALUES + (@findingId, @analysisTime, @serverId, @serverName, @databaseName, + @timeRangeStart, @timeRangeEnd, @severity, @confidence, @category, + @storyPath, @storyPathHash, @storyText, + @rootFactKey, @rootFactValue, @leafFactKey, @leafFactValue, @factCount);"; + + cmd.Parameters.Add(new SqlParameter("@findingId", finding.FindingId)); + cmd.Parameters.Add(new SqlParameter("@analysisTime", finding.AnalysisTime)); + cmd.Parameters.Add(new SqlParameter("@serverId", finding.ServerId)); + cmd.Parameters.Add(new SqlParameter("@serverName", finding.ServerName)); + cmd.Parameters.Add(new SqlParameter("@databaseName", (object?)finding.DatabaseName ?? DBNull.Value)); + cmd.Parameters.Add(new SqlParameter("@timeRangeStart", (object?)finding.TimeRangeStart ?? DBNull.Value)); + cmd.Parameters.Add(new SqlParameter("@timeRangeEnd", (object?)finding.TimeRangeEnd ?? DBNull.Value)); + cmd.Parameters.Add(new SqlParameter("@severity", finding.Severity)); + cmd.Parameters.Add(new SqlParameter("@confidence", finding.Confidence)); + cmd.Parameters.Add(new SqlParameter("@category", finding.Category)); + cmd.Parameters.Add(new SqlParameter("@storyPath", finding.StoryPath)); + cmd.Parameters.Add(new SqlParameter("@storyPathHash", finding.StoryPathHash)); + cmd.Parameters.Add(new SqlParameter("@storyText", finding.StoryText)); + cmd.Parameters.Add(new SqlParameter("@rootFactKey", finding.RootFactKey)); + cmd.Parameters.Add(new SqlParameter("@rootFactValue", (object?)finding.RootFactValue ?? 
DBNull.Value)); + cmd.Parameters.Add(new SqlParameter("@leafFactKey", (object?)finding.LeafFactKey ?? DBNull.Value)); + cmd.Parameters.Add(new SqlParameter("@leafFactValue", (object?)finding.LeafFactValue ?? DBNull.Value)); + cmd.Parameters.Add(new SqlParameter("@factCount", finding.FactCount)); + + await cmd.ExecuteNonQueryAsync(); + } + catch (Exception ex) + { + Logger.Error($"[SqlServerFindingStore] InsertFindingAsync failed: {ex.Message}"); + } + } + + /// + /// Reads a single AnalysisFinding from a data reader row. + /// + private static AnalysisFinding ReadFinding(SqlDataReader reader) + { + return new AnalysisFinding + { + FindingId = reader.GetInt64(0), + AnalysisTime = reader.GetDateTime(1), + ServerId = reader.GetInt32(2), + ServerName = reader.GetString(3), + DatabaseName = reader.IsDBNull(4) ? null : reader.GetString(4), + TimeRangeStart = reader.IsDBNull(5) ? null : reader.GetDateTime(5), + TimeRangeEnd = reader.IsDBNull(6) ? null : reader.GetDateTime(6), + Severity = reader.GetDouble(7), + Confidence = reader.GetDouble(8), + Category = reader.GetString(9), + StoryPath = reader.GetString(10), + StoryPathHash = reader.GetString(11), + StoryText = reader.GetString(12), + RootFactKey = reader.GetString(13), + RootFactValue = reader.IsDBNull(14) ? null : reader.GetDouble(14), + LeafFactKey = reader.IsDBNull(15) ? null : reader.GetString(15), + LeafFactValue = reader.IsDBNull(16) ? null : reader.GetDouble(16), + FactCount = reader.GetInt32(17) + }; + } +} diff --git a/Dashboard/Analysis/SqlServerPlanFetcher.cs b/Dashboard/Analysis/SqlServerPlanFetcher.cs new file mode 100644 index 00000000..f92099bc --- /dev/null +++ b/Dashboard/Analysis/SqlServerPlanFetcher.cs @@ -0,0 +1,58 @@ +using System; +using System.Threading.Tasks; +using Microsoft.Data.SqlClient; +using PerformanceMonitorDashboard.Helpers; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Dashboard implementation of IPlanFetcher -- fetches execution plans from SQL Server +/// using the monitored server's connection string directly. +/// Simpler than Lite's SqlPlanFetcher because Dashboard has one connection string +/// per database (no need to look up servers by ID). 
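+/// Returns the plan XML as a string, or null when the handle is empty, the plan is no longer in cache, or the fetch fails (errors are logged, not thrown).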
+/// </summary>
+public class SqlServerPlanFetcher : IPlanFetcher
+{
+    private readonly string _connectionString;
+
+    public SqlServerPlanFetcher(string connectionString)
+    {
+        _connectionString = connectionString;
+    }
+
+    public async Task<string?> FetchPlanXmlAsync(int serverId, string planHandle)
+    {
+        if (string.IsNullOrEmpty(planHandle)) return null;
+
+        try
+        {
+            var builder = new SqlConnectionStringBuilder(_connectionString)
+            {
+                ConnectTimeout = 10,
+                CommandTimeout = 15
+            };
+
+            await using var connection = new SqlConnection(builder.ConnectionString);
+            await connection.OpenAsync();
+
+            await using var cmd = new SqlCommand(@"
+SET NOCOUNT ON;
+SELECT query_plan
+FROM sys.dm_exec_query_plan(CONVERT(varbinary(64), @plan_handle, 1));", connection);
+
+            cmd.CommandTimeout = 15;
+            cmd.Parameters.AddWithValue("@plan_handle", planHandle);
+
+            var result = await cmd.ExecuteScalarAsync();
+            if (result == null || result is DBNull) return null;
+
+            return result.ToString();
+        }
+        catch (Exception ex)
+        {
+            Logger.Error(
+                $"[SqlServerPlanFetcher] Failed to fetch plan for handle {planHandle}: {ex.Message}");
+            return null;
+        }
+    }
+}
diff --git a/Dashboard/Controls/FinOpsContent.xaml b/Dashboard/Controls/FinOpsContent.xaml
index 14ae05c2..dfae3e8a 100644
--- a/Dashboard/Controls/FinOpsContent.xaml
+++ b/Dashboard/Controls/FinOpsContent.xaml
@@ -46,6 +46,117 @@
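A minimal usage sketch for the new Dashboard analysis pieces above. The `stories`, `context`, `connectionString`, and `planHandle` names are placeholders assumed to come from the rule engine and app configuration (not part of this change), and the fragment is assumed to run inside an async method; only the SqlServerFindingStore and SqlServerPlanFetcher calls are from the code in this diff.

// Illustrative only -- "stories", "context", "connectionString", and "planHandle"
// are assumed to be produced elsewhere (rule engine and configuration).
var store = new SqlServerFindingStore(connectionString);

// Persist the current run; muted patterns and severity-0 stories are filtered out.
var saved = await store.SaveFindingsAsync(stories, context);

// Pull the most recent run back, worst findings first.
var latest = await store.GetLatestFindingsAsync(context.ServerId);

// Optionally fetch the execution plan for a flagged query (hex plan handle string).
var planFetcher = new SqlServerPlanFetcher(connectionString);
var planXml = await planFetcher.FetchPlanXmlAsync(context.ServerId, planHandle);

// Hide a known-noisy pattern from future runs, then trim old history.
if (saved.Count > 0)
    await store.MuteStoryAsync(context.ServerId, saved[0].StoryPathHash, saved[0].StoryPath, "expected during nightly ETL");
await store.CleanupOldFindingsAsync(retentionDays: 30);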