diff --git a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs index 53278381022..c134139db00 100644 --- a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs +++ b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs @@ -968,12 +968,17 @@ private void ReportTimeout( HashSet processedHelixJobs) { var timeout = TimeSpan.FromMinutes(_options.MaximumWaitMinutes); - var unfinishedJobs = latestAssociatedJobs - .Where(j => !j.IsCompleted || !processedHelixJobs.Contains(j.JobName)) + IReadOnlyList latestAttempts = GetLatestHelixJobAttempts(latestAssociatedJobs); + var unfinishedJobs = latestAttempts + .Where(j => !j.IsCompleted) + .OrderBy(j => j.JobName, StringComparer.OrdinalIgnoreCase) + .ToList(); + var unprocessedCompletedJobs = latestAttempts + .Where(j => j.IsCompleted && !processedHelixJobs.Contains(j.JobName)) .OrderBy(j => j.JobName, StringComparer.OrdinalIgnoreCase) .ToList(); - if (unfinishedJobs.Count == 0) + if (unfinishedJobs.Count == 0 && unprocessedCompletedJobs.Count == 0) { _logger.LogCritical("Helix Job Monitor timed out after {TimeoutMinutes} minute(s) ({Timeout}). No unfinished Helix jobs were tracked at the time of timeout.", timeout.TotalMinutes, @@ -981,12 +986,25 @@ private void ReportTimeout( return; } - _logger.LogError( - $"Helix Job Monitor timed out after {{TimeoutMinutes}} minute(s) ({{Timeout}}). {{UnfinishedCount}} Helix job(s) had not finished:{Environment.NewLine}- {{UnfinishedJobs}}{Environment.NewLine}", - timeout.TotalMinutes, - timeout, - unfinishedJobs.Count, - string.Join(Environment.NewLine + "- ", unfinishedJobs.Select(j => $"{j.DetailsUri} ({j.QueueId})"))); + if (unfinishedJobs.Count > 0) + { + _logger.LogError( + $"Helix Job Monitor timed out after {{TimeoutMinutes}} minute(s) ({{Timeout}}). {{UnfinishedCount}} Helix job(s) had not finished:{Environment.NewLine}- {{UnfinishedJobs}}{Environment.NewLine}", + timeout.TotalMinutes, + timeout, + unfinishedJobs.Count, + string.Join(Environment.NewLine + "- ", unfinishedJobs.Select(j => $"{j.DetailsUri} ({j.QueueId})"))); + } + + if (unprocessedCompletedJobs.Count > 0) + { + _logger.LogWarning( + $"Helix Job Monitor timed out after {{TimeoutMinutes}} minute(s) ({{Timeout}}). {{UnprocessedCompletedCount}} Helix job(s) completed on Helix but their test result processing in AzDO did not finish before timeout:{Environment.NewLine}- {{UnprocessedCompletedJobs}}{Environment.NewLine}", + timeout.TotalMinutes, + timeout, + unprocessedCompletedJobs.Count, + string.Join(Environment.NewLine + "- ", unprocessedCompletedJobs.Select(j => $"{j.DetailsUri} ({j.QueueId})"))); + } } public void Dispose() diff --git a/src/Microsoft.DotNet.Helix/Sdk.Tests/Microsoft.DotNet.Helix.Sdk.Tests/JobMonitorRunnerTests.cs b/src/Microsoft.DotNet.Helix/Sdk.Tests/Microsoft.DotNet.Helix.Sdk.Tests/JobMonitorRunnerTests.cs index 6cd7053e315..bb04719f1b3 100644 --- a/src/Microsoft.DotNet.Helix/Sdk.Tests/Microsoft.DotNet.Helix.Sdk.Tests/JobMonitorRunnerTests.cs +++ b/src/Microsoft.DotNet.Helix/Sdk.Tests/Microsoft.DotNet.Helix.Sdk.Tests/JobMonitorRunnerTests.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Reflection; using System.Threading; using System.Threading.Tasks; using Microsoft.DotNet.Helix.Client.Models; @@ -1217,6 +1218,94 @@ public async Task MonitorTimesOut_CancelsLatestInFlightHelixJobs() helix.CanceledJobs.OrderBy(jobName => jobName, StringComparer.OrdinalIgnoreCase).ToArray()); } + [Fact] + public async Task ReportTimeout_OnlyInFlightLatestAttempts_AlignsWithCancelInFlightCount() + { + var azdo = new FakeAzureDevOpsService(); + var helix = new FakeHelixService(); + var logger = new RecordingLogger(); + HelixJobInfo[] associatedJobs = + [ + HelixJob("helix-running-a", "running", queueId: "queue-a"), + HelixJob("helix-running-b", "waiting", queueId: "queue-b"), + ]; + var processedHelixJobs = new HashSet(StringComparer.OrdinalIgnoreCase); + var runner = CreateRunner(azdo, helix, logger: logger); + + InvokeReportTimeout(runner, associatedJobs, processedHelixJobs); + await InvokeCancelInFlightHelixJobsAsync(runner, associatedJobs, processedHelixJobs); + + string unfinishedMessage = Assert.Single(logger.Messages, message => + message.Contains("Helix job(s) had not finished", StringComparison.Ordinal)); + Assert.Contains("2 Helix job(s) had not finished", unfinishedMessage, StringComparison.Ordinal); + Assert.Contains("helix-running-a", unfinishedMessage, StringComparison.Ordinal); + Assert.Contains("helix-running-b", unfinishedMessage, StringComparison.Ordinal); + Assert.DoesNotContain(logger.Messages, message => + message.Contains("completed on Helix but their test result processing in AzDO did not finish before timeout", StringComparison.Ordinal)); + Assert.Contains(logger.Messages, message => + message.Contains("Attempting to cancel 2 in-flight Helix job(s)", StringComparison.Ordinal)); + Assert.Equal( + ["helix-running-a", "helix-running-b"], + helix.CanceledJobs.OrderBy(jobName => jobName, StringComparer.OrdinalIgnoreCase).ToArray()); + } + + [Fact] + public void ReportTimeout_OnlyCompletedUnprocessedJobs_LogsWarningCategoryOnly() + { + var logger = new RecordingLogger(); + HelixJobInfo[] associatedJobs = + [ + HelixJob("helix-finished", "finished", queueId: "queue-finished"), + ]; + var processedHelixJobs = new HashSet(StringComparer.OrdinalIgnoreCase); + var runner = CreateRunner(new FakeAzureDevOpsService(), new FakeHelixService(), logger: logger); + + InvokeReportTimeout(runner, associatedJobs, processedHelixJobs); + + Assert.DoesNotContain(logger.Messages, message => + message.Contains("Helix job(s) had not finished", StringComparison.Ordinal)); + Assert.Contains(logger.Messages, message => + message.Contains("1 Helix job(s) completed on Helix but their test result processing in AzDO did not finish before timeout", StringComparison.Ordinal) + && message.Contains("helix-finished", StringComparison.Ordinal)); + } + + [Fact] + public async Task ReportTimeout_MixedStateWithSupersededAttempt_SplitsAndFiltersCategories() + { + var azdo = new FakeAzureDevOpsService(); + var helix = new FakeHelixService(); + var logger = new RecordingLogger(); + HelixJobInfo[] associatedJobs = + [ + HelixJob("helix-old-attempt", "running", queueId: "queue-old"), + HelixJob("helix-new-attempt", "running", queueId: "queue-new", previousHelixJobName: "helix-old-attempt"), + HelixJob("helix-completed", "finished", queueId: "queue-completed"), + ]; + var processedHelixJobs = new HashSet(StringComparer.OrdinalIgnoreCase); + var runner = CreateRunner(azdo, helix, logger: logger); + + InvokeReportTimeout(runner, associatedJobs, processedHelixJobs); + await InvokeCancelInFlightHelixJobsAsync(runner, associatedJobs, processedHelixJobs); + + string unfinishedMessage = Assert.Single(logger.Messages, message => + message.Contains("Helix job(s) had not finished", StringComparison.Ordinal)); + Assert.Contains("1 Helix job(s) had not finished", unfinishedMessage, StringComparison.Ordinal); + Assert.Contains("helix-new-attempt", unfinishedMessage, StringComparison.Ordinal); + Assert.DoesNotContain("helix-old-attempt", unfinishedMessage, StringComparison.Ordinal); + Assert.DoesNotContain("helix-completed", unfinishedMessage, StringComparison.Ordinal); + + string unprocessedCompletedMessage = Assert.Single(logger.Messages, message => + message.Contains("completed on Helix but their test result processing in AzDO did not finish before timeout", StringComparison.Ordinal)); + Assert.Contains("1 Helix job(s) completed on Helix", unprocessedCompletedMessage, StringComparison.Ordinal); + Assert.Contains("helix-completed", unprocessedCompletedMessage, StringComparison.Ordinal); + Assert.DoesNotContain("helix-old-attempt", unprocessedCompletedMessage, StringComparison.Ordinal); + Assert.DoesNotContain("helix-new-attempt", unprocessedCompletedMessage, StringComparison.Ordinal); + + Assert.Contains(logger.Messages, message => + message.Contains("Attempting to cancel 1 in-flight Helix job(s)", StringComparison.Ordinal)); + Assert.Equal(["helix-new-attempt"], helix.CanceledJobs.OrderBy(j => j, StringComparer.OrdinalIgnoreCase).ToArray()); + } + /// /// Regression: a Helix job that was "running" in the first poll and "finished" in a /// later poll must not be listed in the timeout error message or cancelled, because the @@ -2717,6 +2806,31 @@ private static JobMonitorRunner CreateRunner( return new(options, logger ?? NullLogger.Instance, azdo, helix, NoDelay); } + private static void InvokeReportTimeout( + JobMonitorRunner runner, + IEnumerable associatedJobs, + HashSet processedHelixJobs) + { + MethodInfo reportTimeoutMethod = typeof(JobMonitorRunner).GetMethod( + "ReportTimeout", + BindingFlags.Instance | BindingFlags.NonPublic); + Assert.NotNull(reportTimeoutMethod); + reportTimeoutMethod!.Invoke(runner, [associatedJobs, processedHelixJobs]); + } + + private static async Task InvokeCancelInFlightHelixJobsAsync( + JobMonitorRunner runner, + IEnumerable associatedJobs, + HashSet processedHelixJobs) + { + MethodInfo cancelMethod = typeof(JobMonitorRunner).GetMethod( + "CancelInFlightHelixJobsAsync", + BindingFlags.Instance | BindingFlags.NonPublic); + Assert.NotNull(cancelMethod); + Task cancelTask = (Task)cancelMethod!.Invoke(runner, [associatedJobs, processedHelixJobs, CancellationToken.None])!; + await cancelTask; + } + private sealed class RecordingLogger : ILogger { public List Messages { get; } = [];