From 8d8584905005433b579cd0d7b3c58ad862cfe30c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 23 May 2026 01:43:39 +0000 Subject: [PATCH 1/6] Initial plan From 4a6f8cbc5d4dc816d53f3fd185a0ab30bbaeca0c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 23 May 2026 01:48:30 +0000 Subject: [PATCH 2/6] Log in-progress jobs on JobMonitor timeout and cover with test Agent-Logs-Url: https://github.com/dotnet/arcade/sessions/c103e3cf-afe4-44dc-8ce7-56e30b442048 Co-authored-by: mmitche <8725170+mmitche@users.noreply.github.com> --- .../JobMonitor/JobMonitorRunner.cs | 82 ++++++++++++++++--- .../JobMonitorRunnerTests.cs | 12 ++- 2 files changed, 82 insertions(+), 12 deletions(-) diff --git a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs index f6c7de47efa..423829b4bd6 100644 --- a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs +++ b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs @@ -122,6 +122,7 @@ private async Task RunCoreAsync(CancellationToken cancellationToken) IReadOnlySet alreadyProcessed = await _azdo.GetProcessedHelixJobNamesAsync(cancellationToken); HashSet processedHelixJobs = new(alreadyProcessed, StringComparer.OrdinalIgnoreCase); HashSet associatedJobs = new(HelixJobInfo.ByJobNameComparer); + var latestTimelineRecords = new List(); try { @@ -130,6 +131,7 @@ private async Task RunCoreAsync(CancellationToken cancellationToken) return await RunLoopAsync( processedHelixJobs, associatedJobs, + latestTimelineRecords, jobResubmission, cancellationToken); } @@ -141,7 +143,7 @@ private async Task RunCoreAsync(CancellationToken cancellationToken) // lost (and tests that observe UploadedJobNames immediately after a cancelled // run see a partial set). await WaitForPendingTestResultUploadsAsync(CancellationToken.None); - ReportTimeout(associatedJobs, processedHelixJobs); + ReportTimeout(associatedJobs, processedHelixJobs, latestTimelineRecords); // On cancellation (typically the overall timeout), proactively cancel any // Helix jobs we know about that haven't finished yet so they don't keep @@ -203,6 +205,7 @@ private async Task CancelInFlightHelixJobsAsync( private async Task RunLoopAsync( HashSet processedHelixJobs, HashSet associatedJobs, + List latestTimelineRecords, JobResubmissionResult jobResubmission, CancellationToken cancellationToken) { @@ -232,6 +235,8 @@ private async Task RunLoopAsync( string.IsNullOrEmpty(j.StageName) || string.Equals(j.StageName, _options.StageName, StringComparison.OrdinalIgnoreCase)) ]; + latestTimelineRecords.Clear(); + latestTimelineRecords.AddRange(timelineRecords); associatedJobs.UnionWith(associatedJobsWithBuild); @@ -963,28 +968,83 @@ public int GetHashCode((string ChainKey, string WorkItemName) obj) private void ReportTimeout( IEnumerable latestAssociatedJobs, - HashSet processedHelixJobs) + HashSet processedHelixJobs, + IReadOnlyList latestTimelineRecords) { var timeout = TimeSpan.FromMinutes(_options.MaximumWaitMinutes); - var unfinishedJobs = latestAssociatedJobs + var unfinishedHelixJobs = GetLatestHelixJobAttempts(latestAssociatedJobs) .Where(j => !j.IsCompleted || !processedHelixJobs.Contains(j.JobName)) .OrderBy(j => j.JobName, StringComparer.OrdinalIgnoreCase) .ToList(); + var inProgressPipelineJobs = GetInProgressNonMonitorPipelineJobs(latestTimelineRecords, _options.JobMonitorName) + .OrderBy(r => r.Name, StringComparer.OrdinalIgnoreCase) + .ToList(); - if (unfinishedJobs.Count == 0) + if (unfinishedHelixJobs.Count == 0 && inProgressPipelineJobs.Count == 0) { - _logger.LogCritical("Helix Job Monitor timed out after {TimeoutMinutes} minute(s) ({Timeout}). No unfinished Helix jobs were tracked at the time of timeout.", + _logger.LogCritical("Helix Job Monitor timed out after {TimeoutMinutes} minute(s) ({Timeout}). No unfinished Helix or Azure DevOps jobs were tracked at the time of timeout.", timeout.TotalMinutes, timeout); return; } - _logger.LogError( - $"Helix Job Monitor timed out after {{TimeoutMinutes}} minute(s) ({{Timeout}}). {{UnfinishedCount}} Helix job(s) had not finished:{Environment.NewLine}- {{UnfinishedJobs}}{Environment.NewLine}", - timeout.TotalMinutes, - timeout, - unfinishedJobs.Count, - string.Join(Environment.NewLine + "- ", unfinishedJobs.Select(j => $"{j.DetailsUri} ({j.QueueId})"))); + if (unfinishedHelixJobs.Count > 0) + { + _logger.LogError( + $"Helix Job Monitor timed out after {{TimeoutMinutes}} minute(s) ({{Timeout}}). {{UnfinishedCount}} Helix job(s) were unfinished or unprocessed:{Environment.NewLine}- {{UnfinishedJobs}}{Environment.NewLine}", + timeout.TotalMinutes, + timeout, + unfinishedHelixJobs.Count, + string.Join(Environment.NewLine + "- ", unfinishedHelixJobs.Select(FormatUnfinishedHelixJobForTimeoutLog))); + } + + if (inProgressPipelineJobs.Count > 0) + { + _logger.LogError( + $"At timeout, {{InProgressCount}} non-monitor Azure DevOps pipeline job(s) were still in progress or queued:{Environment.NewLine}- {{InProgressJobs}}{Environment.NewLine}", + inProgressPipelineJobs.Count, + string.Join(Environment.NewLine + "- ", inProgressPipelineJobs.Select(FormatInProgressPipelineJobForTimeoutLog))); + } + } + + private static IEnumerable GetInProgressNonMonitorPipelineJobs( + IEnumerable timelineRecords, + string jobMonitorName) + => (timelineRecords ?? []) + .Where(r => string.Equals(r.Type, "Job", StringComparison.OrdinalIgnoreCase)) + .Where(r => !string.Equals(r.State, "completed", StringComparison.OrdinalIgnoreCase)) + .Where(r => !IsMonitorRecord(r, jobMonitorName)); + + private static string FormatUnfinishedHelixJobForTimeoutLog(HelixJobInfo helixJob) + { + string workItemCount = helixJob.InitialWorkItemCount?.ToString() ?? "unknown"; + return $"{helixJob.DisplayName} [status={helixJob.Status}, initialWorkItems={workItemCount}]{Environment.NewLine} {helixJob.DetailsUri}"; + } + + private static string FormatInProgressPipelineJobForTimeoutLog(AzureDevOpsTimelineRecord timelineRecord) + { + string state = string.IsNullOrEmpty(timelineRecord.State) ? "unknown" : timelineRecord.State; + string result = string.IsNullOrEmpty(timelineRecord.Result) ? "none" : timelineRecord.Result; + string name = string.IsNullOrEmpty(timelineRecord.Name) ? timelineRecord.ReferenceName : timelineRecord.Name; + if (string.IsNullOrEmpty(name)) + { + name = timelineRecord.Identifier; + } + + return $"{name} [state={state}, result={result}]"; + } + + private static bool IsMonitorRecord(AzureDevOpsTimelineRecord timelineRecord, string jobMonitorName) + { + if (timelineRecord is null) + { + return false; + } + + return !string.IsNullOrEmpty(jobMonitorName) + && (string.Equals(timelineRecord.ReferenceName, jobMonitorName, StringComparison.OrdinalIgnoreCase) + || string.Equals(timelineRecord.Name, jobMonitorName, StringComparison.OrdinalIgnoreCase) + || string.Equals(timelineRecord.Identifier, jobMonitorName, StringComparison.OrdinalIgnoreCase)); } public void Dispose() diff --git a/src/Microsoft.DotNet.Helix/Sdk.Tests/Microsoft.DotNet.Helix.Sdk.Tests/JobMonitorRunnerTests.cs b/src/Microsoft.DotNet.Helix/Sdk.Tests/Microsoft.DotNet.Helix.Sdk.Tests/JobMonitorRunnerTests.cs index 74d85f635db..8e12115a925 100644 --- a/src/Microsoft.DotNet.Helix/Sdk.Tests/Microsoft.DotNet.Helix.Sdk.Tests/JobMonitorRunnerTests.cs +++ b/src/Microsoft.DotNet.Helix/Sdk.Tests/Microsoft.DotNet.Helix.Sdk.Tests/JobMonitorRunnerTests.cs @@ -1180,6 +1180,7 @@ public async Task MonitorTimesOut_CancelsLatestInFlightHelixJobs() { var azdo = new FakeAzureDevOpsService(); var helix = new FakeHelixService(); + var logger = new RecordingLogger(); azdo.AddTimelineResponse( MonitorJob(), @@ -1200,7 +1201,7 @@ public async Task MonitorTimesOut_CancelsLatestInFlightHelixJobs() }); using var cts = new CancellationTokenSource(); - var runner = new JobMonitorRunner(DefaultOptions(), NullLogger.Instance, azdo, helix, + var runner = new JobMonitorRunner(DefaultOptions(), logger, azdo, helix, (_, _) => { cts.Cancel(); @@ -1214,6 +1215,15 @@ public async Task MonitorTimesOut_CancelsLatestInFlightHelixJobs() Assert.Equal( ["helix-new-attempt", "helix-running", "helix-waiting"], helix.CanceledJobs.OrderBy(jobName => jobName, StringComparer.OrdinalIgnoreCase).ToArray()); + Assert.Contains(logger.Messages, message => + message.Contains("Helix job(s) were unfinished or unprocessed", StringComparison.Ordinal) + && message.Contains("helix-new-attempt", StringComparison.Ordinal) + && message.Contains("status=running", StringComparison.Ordinal) + && message.Contains("helix-waiting", StringComparison.Ordinal) + && message.Contains("status=waiting", StringComparison.Ordinal)); + Assert.Contains(logger.Messages, message => + message.Contains("non-monitor Azure DevOps pipeline job(s) were still in progress or queued", StringComparison.Ordinal) + && message.Contains("Test Linux [state=inProgress, result=none]", StringComparison.Ordinal)); } /// From 502c1ef39e9a2facffed732e6e0c612cbe349c94 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 23 May 2026 01:52:00 +0000 Subject: [PATCH 3/6] Address validation feedback on timeout logging helpers Agent-Logs-Url: https://github.com/dotnet/arcade/sessions/c103e3cf-afe4-44dc-8ce7-56e30b442048 Co-authored-by: mmitche <8725170+mmitche@users.noreply.github.com> --- src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs index 423829b4bd6..001030fe806 100644 --- a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs +++ b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs @@ -1008,17 +1008,17 @@ private void ReportTimeout( } private static IEnumerable GetInProgressNonMonitorPipelineJobs( - IEnumerable timelineRecords, + IReadOnlyList timelineRecords, string jobMonitorName) - => (timelineRecords ?? []) + => timelineRecords .Where(r => string.Equals(r.Type, "Job", StringComparison.OrdinalIgnoreCase)) .Where(r => !string.Equals(r.State, "completed", StringComparison.OrdinalIgnoreCase)) .Where(r => !IsMonitorRecord(r, jobMonitorName)); private static string FormatUnfinishedHelixJobForTimeoutLog(HelixJobInfo helixJob) { - string workItemCount = helixJob.InitialWorkItemCount?.ToString() ?? "unknown"; - return $"{helixJob.DisplayName} [status={helixJob.Status}, initialWorkItems={workItemCount}]{Environment.NewLine} {helixJob.DetailsUri}"; + string workItemCountText = helixJob.InitialWorkItemCount?.ToString() ?? "unknown"; + return $"{helixJob.DisplayName} [status={helixJob.Status}, initialWorkItems={workItemCountText}]{Environment.NewLine} {helixJob.DetailsUri}"; } private static string FormatInProgressPipelineJobForTimeoutLog(AzureDevOpsTimelineRecord timelineRecord) From 8fcae77dd9f5f01ca6ca589b5ca7138a50b75e09 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 26 May 2026 15:08:55 +0000 Subject: [PATCH 4/6] Fix timeout AzDO job sorting when timeline Name is null --- src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs index 001030fe806..233ebf74f78 100644 --- a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs +++ b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs @@ -977,7 +977,7 @@ private void ReportTimeout( .OrderBy(j => j.JobName, StringComparer.OrdinalIgnoreCase) .ToList(); var inProgressPipelineJobs = GetInProgressNonMonitorPipelineJobs(latestTimelineRecords, _options.JobMonitorName) - .OrderBy(r => r.Name, StringComparer.OrdinalIgnoreCase) + .OrderBy(r => r.Name ?? r.ReferenceName ?? r.Identifier ?? string.Empty, StringComparer.OrdinalIgnoreCase) .ToList(); if (unfinishedHelixJobs.Count == 0 && inProgressPipelineJobs.Count == 0) From ebacaf610fcc1d6155c1667955db1a3f8b1b3aec Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 27 May 2026 18:49:21 +0000 Subject: [PATCH 5/6] Use shared monitor-job filtering for timeout diagnostics --- .../JobMonitor/HelixJobMonitorUtilities.cs | 9 +++++++-- .../JobMonitor/JobMonitorRunner.cs | 19 ++----------------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/src/Microsoft.DotNet.Helix/JobMonitor/HelixJobMonitorUtilities.cs b/src/Microsoft.DotNet.Helix/JobMonitor/HelixJobMonitorUtilities.cs index 88e9f961681..ae02a27896d 100644 --- a/src/Microsoft.DotNet.Helix/JobMonitor/HelixJobMonitorUtilities.cs +++ b/src/Microsoft.DotNet.Helix/JobMonitor/HelixJobMonitorUtilities.cs @@ -56,7 +56,7 @@ public sealed class PreviousAttemptReference public static class HelixJobMonitorUtilities { public static bool AreNonMonitorJobsComplete(IEnumerable records, string jobMonitorName) - => GetRelevantJobRecords(records, jobMonitorName).All(IsTerminal); + => GetNonMonitorJobRecords(records, jobMonitorName).All(IsTerminal); public static bool HasFailedNonMonitorJobs(IEnumerable records, string jobMonitorName) => HasFailedNonMonitorJobs(records, jobMonitorName, new HashSet(StringComparer.OrdinalIgnoreCase)); @@ -65,12 +65,17 @@ public static bool HasFailedNonMonitorJobs( IEnumerable records, string jobMonitorName, IReadOnlySet ignoredJobNames) - => GetRelevantJobRecords(records, jobMonitorName) + => GetNonMonitorJobRecords(records, jobMonitorName) .Where(r => ignoredJobNames == null || !ignoredJobNames.Contains(r.ReferenceName)) .Any(r => string.Equals(r.Result, "failed", StringComparison.OrdinalIgnoreCase) || string.Equals(r.Result, "canceled", StringComparison.OrdinalIgnoreCase)); + public static IEnumerable GetNonMonitorJobRecords( + IEnumerable records, + string jobMonitorName) + => GetRelevantJobRecords(records, jobMonitorName); + /// /// Returns the subset of that belongs to the pipeline stage /// named , including the Stage record itself and any diff --git a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs index 233ebf74f78..8cc5644d828 100644 --- a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs +++ b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs @@ -1010,10 +1010,8 @@ private void ReportTimeout( private static IEnumerable GetInProgressNonMonitorPipelineJobs( IReadOnlyList timelineRecords, string jobMonitorName) - => timelineRecords - .Where(r => string.Equals(r.Type, "Job", StringComparison.OrdinalIgnoreCase)) - .Where(r => !string.Equals(r.State, "completed", StringComparison.OrdinalIgnoreCase)) - .Where(r => !IsMonitorRecord(r, jobMonitorName)); + => HelixJobMonitorUtilities.GetNonMonitorJobRecords(timelineRecords, jobMonitorName) + .Where(r => !string.Equals(r.State, "completed", StringComparison.OrdinalIgnoreCase)); private static string FormatUnfinishedHelixJobForTimeoutLog(HelixJobInfo helixJob) { @@ -1034,19 +1032,6 @@ private static string FormatInProgressPipelineJobForTimeoutLog(AzureDevOpsTimeli return $"{name} [state={state}, result={result}]"; } - private static bool IsMonitorRecord(AzureDevOpsTimelineRecord timelineRecord, string jobMonitorName) - { - if (timelineRecord is null) - { - return false; - } - - return !string.IsNullOrEmpty(jobMonitorName) - && (string.Equals(timelineRecord.ReferenceName, jobMonitorName, StringComparison.OrdinalIgnoreCase) - || string.Equals(timelineRecord.Name, jobMonitorName, StringComparison.OrdinalIgnoreCase) - || string.Equals(timelineRecord.Identifier, jobMonitorName, StringComparison.OrdinalIgnoreCase)); - } - public void Dispose() { (_azdo as IDisposable)?.Dispose(); From aa1486c14a29d8921507f1e11e84ed2ca13bd212 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 27 May 2026 18:53:25 +0000 Subject: [PATCH 6/6] Reuse shared non-monitor job record filtering --- .../JobMonitor/HelixJobMonitorUtilities.cs | 6 +++--- src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.DotNet.Helix/JobMonitor/HelixJobMonitorUtilities.cs b/src/Microsoft.DotNet.Helix/JobMonitor/HelixJobMonitorUtilities.cs index ae02a27896d..d714ad27f08 100644 --- a/src/Microsoft.DotNet.Helix/JobMonitor/HelixJobMonitorUtilities.cs +++ b/src/Microsoft.DotNet.Helix/JobMonitor/HelixJobMonitorUtilities.cs @@ -56,7 +56,7 @@ public sealed class PreviousAttemptReference public static class HelixJobMonitorUtilities { public static bool AreNonMonitorJobsComplete(IEnumerable records, string jobMonitorName) - => GetNonMonitorJobRecords(records, jobMonitorName).All(IsTerminal); + => GetRelevantNonMonitorJobRecords(records, jobMonitorName).All(IsTerminal); public static bool HasFailedNonMonitorJobs(IEnumerable records, string jobMonitorName) => HasFailedNonMonitorJobs(records, jobMonitorName, new HashSet(StringComparer.OrdinalIgnoreCase)); @@ -65,13 +65,13 @@ public static bool HasFailedNonMonitorJobs( IEnumerable records, string jobMonitorName, IReadOnlySet ignoredJobNames) - => GetNonMonitorJobRecords(records, jobMonitorName) + => GetRelevantNonMonitorJobRecords(records, jobMonitorName) .Where(r => ignoredJobNames == null || !ignoredJobNames.Contains(r.ReferenceName)) .Any(r => string.Equals(r.Result, "failed", StringComparison.OrdinalIgnoreCase) || string.Equals(r.Result, "canceled", StringComparison.OrdinalIgnoreCase)); - public static IEnumerable GetNonMonitorJobRecords( + public static IEnumerable GetRelevantNonMonitorJobRecords( IEnumerable records, string jobMonitorName) => GetRelevantJobRecords(records, jobMonitorName); diff --git a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs index 8cc5644d828..fc00208d3a9 100644 --- a/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs +++ b/src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs @@ -1010,7 +1010,7 @@ private void ReportTimeout( private static IEnumerable GetInProgressNonMonitorPipelineJobs( IReadOnlyList timelineRecords, string jobMonitorName) - => HelixJobMonitorUtilities.GetNonMonitorJobRecords(timelineRecords, jobMonitorName) + => HelixJobMonitorUtilities.GetRelevantNonMonitorJobRecords(timelineRecords, jobMonitorName) .Where(r => !string.Equals(r.State, "completed", StringComparison.OrdinalIgnoreCase)); private static string FormatUnfinishedHelixJobForTimeoutLog(HelixJobInfo helixJob)