Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 27 additions & 9 deletions src/Microsoft.DotNet.Helix/JobMonitor/JobMonitorRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -968,25 +968,43 @@ private void ReportTimeout(
HashSet<string> processedHelixJobs)
{
var timeout = TimeSpan.FromMinutes(_options.MaximumWaitMinutes);
var unfinishedJobs = latestAssociatedJobs
.Where(j => !j.IsCompleted || !processedHelixJobs.Contains(j.JobName))
IReadOnlyList<HelixJobInfo> latestAttempts = GetLatestHelixJobAttempts(latestAssociatedJobs);
var unfinishedJobs = latestAttempts
.Where(j => !j.IsCompleted)
.OrderBy(j => j.JobName, StringComparer.OrdinalIgnoreCase)
.ToList();
var unprocessedCompletedJobs = latestAttempts
.Where(j => j.IsCompleted && !processedHelixJobs.Contains(j.JobName))
.OrderBy(j => j.JobName, StringComparer.OrdinalIgnoreCase)
.ToList();

if (unfinishedJobs.Count == 0)
if (unfinishedJobs.Count == 0 && unprocessedCompletedJobs.Count == 0)
{
_logger.LogCritical("Helix Job Monitor timed out after {TimeoutMinutes} minute(s) ({Timeout}). No unfinished Helix jobs were tracked at the time of timeout.",
timeout.TotalMinutes,
timeout);
return;
}

_logger.LogError(
$"Helix Job Monitor timed out after {{TimeoutMinutes}} minute(s) ({{Timeout}}). {{UnfinishedCount}} Helix job(s) had not finished:{Environment.NewLine}- {{UnfinishedJobs}}{Environment.NewLine}",
timeout.TotalMinutes,
timeout,
unfinishedJobs.Count,
string.Join(Environment.NewLine + "- ", unfinishedJobs.Select(j => $"{j.DetailsUri} ({j.QueueId})")));
if (unfinishedJobs.Count > 0)
{
_logger.LogError(
$"Helix Job Monitor timed out after {{TimeoutMinutes}} minute(s) ({{Timeout}}). {{UnfinishedCount}} Helix job(s) had not finished:{Environment.NewLine}- {{UnfinishedJobs}}{Environment.NewLine}",
timeout.TotalMinutes,
timeout,
unfinishedJobs.Count,
string.Join(Environment.NewLine + "- ", unfinishedJobs.Select(j => $"{j.DetailsUri} ({j.QueueId})")));
}

if (unprocessedCompletedJobs.Count > 0)
{
_logger.LogWarning(
$"Helix Job Monitor timed out after {{TimeoutMinutes}} minute(s) ({{Timeout}}). {{UnprocessedCompletedCount}} Helix job(s) completed on Helix but their test result processing in AzDO did not finish before timeout:{Environment.NewLine}- {{UnprocessedCompletedJobs}}{Environment.NewLine}",
timeout.TotalMinutes,
timeout,
unprocessedCompletedJobs.Count,
string.Join(Environment.NewLine + "- ", unprocessedCompletedJobs.Select(j => $"{j.DetailsUri} ({j.QueueId})")));
}
}

public void Dispose()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Reflection;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.DotNet.Helix.Client.Models;
Expand Down Expand Up @@ -1217,6 +1218,94 @@ public async Task MonitorTimesOut_CancelsLatestInFlightHelixJobs()
helix.CanceledJobs.OrderBy(jobName => jobName, StringComparer.OrdinalIgnoreCase).ToArray());
}

[Fact]
public async Task ReportTimeout_OnlyInFlightLatestAttempts_AlignsWithCancelInFlightCount()
{
var azdo = new FakeAzureDevOpsService();
var helix = new FakeHelixService();
var logger = new RecordingLogger();
HelixJobInfo[] associatedJobs =
[
HelixJob("helix-running-a", "running", queueId: "queue-a"),
HelixJob("helix-running-b", "waiting", queueId: "queue-b"),
];
var processedHelixJobs = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var runner = CreateRunner(azdo, helix, logger: logger);

InvokeReportTimeout(runner, associatedJobs, processedHelixJobs);
await InvokeCancelInFlightHelixJobsAsync(runner, associatedJobs, processedHelixJobs);

string unfinishedMessage = Assert.Single(logger.Messages, message =>
message.Contains("Helix job(s) had not finished", StringComparison.Ordinal));
Assert.Contains("2 Helix job(s) had not finished", unfinishedMessage, StringComparison.Ordinal);
Assert.Contains("helix-running-a", unfinishedMessage, StringComparison.Ordinal);
Assert.Contains("helix-running-b", unfinishedMessage, StringComparison.Ordinal);
Assert.DoesNotContain(logger.Messages, message =>
message.Contains("completed on Helix but their test result processing in AzDO did not finish before timeout", StringComparison.Ordinal));
Assert.Contains(logger.Messages, message =>
message.Contains("Attempting to cancel 2 in-flight Helix job(s)", StringComparison.Ordinal));
Assert.Equal(
["helix-running-a", "helix-running-b"],
helix.CanceledJobs.OrderBy(jobName => jobName, StringComparer.OrdinalIgnoreCase).ToArray());
}

[Fact]
public void ReportTimeout_OnlyCompletedUnprocessedJobs_LogsWarningCategoryOnly()
{
var logger = new RecordingLogger();
HelixJobInfo[] associatedJobs =
[
HelixJob("helix-finished", "finished", queueId: "queue-finished"),
];
var processedHelixJobs = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var runner = CreateRunner(new FakeAzureDevOpsService(), new FakeHelixService(), logger: logger);

InvokeReportTimeout(runner, associatedJobs, processedHelixJobs);

Assert.DoesNotContain(logger.Messages, message =>
message.Contains("Helix job(s) had not finished", StringComparison.Ordinal));
Assert.Contains(logger.Messages, message =>
message.Contains("1 Helix job(s) completed on Helix but their test result processing in AzDO did not finish before timeout", StringComparison.Ordinal)
&& message.Contains("helix-finished", StringComparison.Ordinal));
}

[Fact]
public async Task ReportTimeout_MixedStateWithSupersededAttempt_SplitsAndFiltersCategories()
{
var azdo = new FakeAzureDevOpsService();
var helix = new FakeHelixService();
var logger = new RecordingLogger();
HelixJobInfo[] associatedJobs =
[
HelixJob("helix-old-attempt", "running", queueId: "queue-old"),
HelixJob("helix-new-attempt", "running", queueId: "queue-new", previousHelixJobName: "helix-old-attempt"),
HelixJob("helix-completed", "finished", queueId: "queue-completed"),
];
var processedHelixJobs = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var runner = CreateRunner(azdo, helix, logger: logger);

InvokeReportTimeout(runner, associatedJobs, processedHelixJobs);
await InvokeCancelInFlightHelixJobsAsync(runner, associatedJobs, processedHelixJobs);

string unfinishedMessage = Assert.Single(logger.Messages, message =>
message.Contains("Helix job(s) had not finished", StringComparison.Ordinal));
Assert.Contains("1 Helix job(s) had not finished", unfinishedMessage, StringComparison.Ordinal);
Assert.Contains("helix-new-attempt", unfinishedMessage, StringComparison.Ordinal);
Assert.DoesNotContain("helix-old-attempt", unfinishedMessage, StringComparison.Ordinal);
Assert.DoesNotContain("helix-completed", unfinishedMessage, StringComparison.Ordinal);

string unprocessedCompletedMessage = Assert.Single(logger.Messages, message =>
message.Contains("completed on Helix but their test result processing in AzDO did not finish before timeout", StringComparison.Ordinal));
Assert.Contains("1 Helix job(s) completed on Helix", unprocessedCompletedMessage, StringComparison.Ordinal);
Assert.Contains("helix-completed", unprocessedCompletedMessage, StringComparison.Ordinal);
Assert.DoesNotContain("helix-old-attempt", unprocessedCompletedMessage, StringComparison.Ordinal);
Assert.DoesNotContain("helix-new-attempt", unprocessedCompletedMessage, StringComparison.Ordinal);

Assert.Contains(logger.Messages, message =>
message.Contains("Attempting to cancel 1 in-flight Helix job(s)", StringComparison.Ordinal));
Assert.Equal(["helix-new-attempt"], helix.CanceledJobs.OrderBy(j => j, StringComparer.OrdinalIgnoreCase).ToArray());
}

/// <summary>
/// Regression: a Helix job that was "running" in the first poll and "finished" in a
/// later poll must not be listed in the timeout error message or cancelled, because the
Expand Down Expand Up @@ -2717,6 +2806,31 @@ private static JobMonitorRunner CreateRunner(
return new(options, logger ?? NullLogger.Instance, azdo, helix, NoDelay);
}

private static void InvokeReportTimeout(
JobMonitorRunner runner,
IEnumerable<HelixJobInfo> associatedJobs,
HashSet<string> processedHelixJobs)
{
MethodInfo reportTimeoutMethod = typeof(JobMonitorRunner).GetMethod(
"ReportTimeout",
BindingFlags.Instance | BindingFlags.NonPublic);
Assert.NotNull(reportTimeoutMethod);
reportTimeoutMethod!.Invoke(runner, [associatedJobs, processedHelixJobs]);
Comment thread
premun marked this conversation as resolved.
}

private static async Task InvokeCancelInFlightHelixJobsAsync(
JobMonitorRunner runner,
IEnumerable<HelixJobInfo> associatedJobs,
HashSet<string> processedHelixJobs)
{
MethodInfo cancelMethod = typeof(JobMonitorRunner).GetMethod(
"CancelInFlightHelixJobsAsync",
BindingFlags.Instance | BindingFlags.NonPublic);
Assert.NotNull(cancelMethod);
Task cancelTask = (Task)cancelMethod!.Invoke(runner, [associatedJobs, processedHelixJobs, CancellationToken.None])!;
await cancelTask;
Comment thread
premun marked this conversation as resolved.
}

private sealed class RecordingLogger : ILogger
{
public List<string> Messages { get; } = [];
Expand Down
Loading