From ef97acab4f6b9db2bd446cfe245b79cabb362fca Mon Sep 17 00:00:00 2001
From: chcosta
Date: Fri, 29 May 2026 16:09:15 -0700
Subject: [PATCH 1/3] Log Helix queue health summary on job submission (opt-in)

Surfaces the new `queueStats` block returned by `POST /api/jobs` (queue
depth, estimated wait, average run duration, snapshot time) so that users
who submit Helix jobs can see queue health at submit time.

Behavior:
* Off by default. Opt in via MSBuild `EnableShowHelixQueueStats=true` or,
  for direct JobSender consumers, `.WithShowQueueStats(true)` on the
  fluent `IJobDefinition`.
* Emits a short, human-readable summary with snapshot time converted to
  the local timezone.
* Emits an MSBuild `warning :` line when the estimated wait exceeds the
  30-minute SLA (the queue is at capacity / unhealthy) or when the stats
  snapshot is more than 15 minutes stale.
* Prints a one-time preview-feature banner and a one-time link to the
  dnceng First Responders Teams channel per process to keep log noise low
  across multi-job builds.

Changes:
* New `Models.QueueStats` partial bound to the `queueStats` JSON.
* Partial `Models.JobCreationResult` extension exposing `QueueStats`
  (keeps the generated client file untouched).
* New `IJobDefinition.WithShowQueueStats(bool)` fluent method and
  `JobDefinition` implementation gating the new log output.
* New `SendHelixJob.EnableShowHelixQueueStats` task input plumbed via
  `Microsoft.DotNet.Helix.Sdk.MonoQueue.targets` and defaulted to `false`
  in `Microsoft.DotNet.Helix.Sdk.props`.
--- .../Client/CSharp/JobCreationResult.cs | 13 +++ .../Client/CSharp/QueueStats.cs | 29 +++++ .../JobSender/IJobDefinition.cs | 7 ++ .../JobSender/JobDefinition.cs | 106 ++++++++++++++++++ .../Sdk/SendHelixJob.cs | 9 +- ...crosoft.DotNet.Helix.Sdk.MonoQueue.targets | 1 + .../tools/Microsoft.DotNet.Helix.Sdk.props | 9 ++ 7 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 src/Microsoft.DotNet.Helix/Client/CSharp/JobCreationResult.cs create mode 100644 src/Microsoft.DotNet.Helix/Client/CSharp/QueueStats.cs diff --git a/src/Microsoft.DotNet.Helix/Client/CSharp/JobCreationResult.cs b/src/Microsoft.DotNet.Helix/Client/CSharp/JobCreationResult.cs new file mode 100644 index 00000000000..efc0d725ed8 --- /dev/null +++ b/src/Microsoft.DotNet.Helix/Client/CSharp/JobCreationResult.cs @@ -0,0 +1,13 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using Newtonsoft.Json; + +namespace Microsoft.DotNet.Helix.Client.Models +{ + public partial class JobCreationResult + { + [JsonProperty("queueStats")] + public QueueStats QueueStats { get; set; } + } +} diff --git a/src/Microsoft.DotNet.Helix/Client/CSharp/QueueStats.cs b/src/Microsoft.DotNet.Helix/Client/CSharp/QueueStats.cs new file mode 100644 index 00000000000..1ffb8a5fced --- /dev/null +++ b/src/Microsoft.DotNet.Helix/Client/CSharp/QueueStats.cs @@ -0,0 +1,29 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using Newtonsoft.Json; + +namespace Microsoft.DotNet.Helix.Client.Models +{ + public partial class QueueStats + { + [JsonProperty("queueName")] + public string QueueName { get; set; } + + [JsonProperty("depth")] + public int? Depth { get; set; } + + [JsonProperty("averageRunDuration")] + public TimeSpan? AverageRunDuration { get; set; } + + [JsonProperty("estimatedWait")] + public TimeSpan? 
EstimatedWait { get; set; } + + [JsonProperty("estimatedWaitMethod")] + public string EstimatedWaitMethod { get; set; } + + [JsonProperty("generatedAt")] + public DateTimeOffset? GeneratedAt { get; set; } + } +} diff --git a/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs b/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs index 2a99e318209..ade89409941 100644 --- a/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs +++ b/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs @@ -175,6 +175,13 @@ public interface IJobDefinition /// Fluent job builder. IJobDefinition WithMaxRetryCount(int? maxRetryCount); + /// + /// Opts in to logging the preview Helix queue health summary (estimated wait, depth, + /// snapshot time) returned in the job creation response. Off by default. + /// + /// Fluent job builder. + IJobDefinition WithShowQueueStats(bool showQueueStats); + /// /// Sends the fully specified job to execution. /// diff --git a/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs b/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs index 1c52f9a4811..ddb0b8e4ed0 100644 --- a/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs +++ b/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs @@ -47,6 +47,7 @@ public JobDefinition(IJob jobApi) public string ResultContainerPrefix { get; private set; } public IDictionary CorrelationPayloads { get; } = new Dictionary(); public int? 
MaxRetryCount { get; private set; } + public bool ShowQueueStats { get; private set; } public string StorageAccountConnectionString { get; private set; } public string TargetContainerName { get; set; } = DefaultContainerName; public string TargetResultsContainerName { get; set; } = DefaultContainerName; @@ -241,9 +242,108 @@ public async Task SendAsync(Action log, CancellationToken canc string jobStartIdentifier = Guid.NewGuid().ToString("N"); var newJob = await JobApi.NewAsync(creationRequest, jobStartIdentifier, cancellationToken: cancellationToken).ConfigureAwait(false); + if (ShowQueueStats) + { + LogQueueStats(log, queueId, newJob?.QueueStats); + } + return new SentJob(JobApi, newJob); } + // Helix SLA threshold; estimated waits above this are flagged as queue-at-capacity / unhealthy. + private static readonly TimeSpan QueueWaitSlaThreshold = TimeSpan.FromMinutes(30); + + // If the Observer snapshot is older than this, the reported numbers may not reflect current queue state. + private static readonly TimeSpan SnapshotStaleThreshold = TimeSpan.FromMinutes(15); + + private const string FirstRespondersUrl = "https://teams.microsoft.com/l/channel/19%3Aafba3d1545dd45d7b79f34c1821f6055%40thread.skype/First%20Responders?groupId=4d73664c-9f2f-450d-82a5-c2f02756606d&tenantId=72f988bf-86f1-41af-91ab-2d7cd011db47"; + + private static int s_queueStatsHeaderShown; + private static int s_firstRespondersHintShown; + + private static void LogQueueStats(Action log, string queueId, Models.QueueStats stats) + { + if (log == null || stats == null) + { + return; + } + + string depth = stats.Depth?.ToString(CultureInfo.InvariantCulture) ?? "unknown"; + string avgRun = FormatTimeSpan(stats.AverageRunDuration); + string estWait = FormatTimeSpan(stats.EstimatedWait); + string snapshot = FormatSnapshotTime(stats.GeneratedAt); + + bool overSla = stats.EstimatedWait is TimeSpan wait && wait > QueueWaitSlaThreshold; + TimeSpan? snapshotAge = stats.GeneratedAt is DateTimeOffset gen + ? 
DateTimeOffset.UtcNow - gen + : (TimeSpan?)null; + bool stale = snapshotAge is TimeSpan age && age > SnapshotStaleThreshold; + + string healthTag = overSla ? " [AT CAPACITY]" : string.Empty; + string staleTag = stale ? " (stale)" : string.Empty; + + if (Interlocked.Exchange(ref s_queueStatsHeaderShown, 1) == 0) + { + log("note : Helix queue health reporting is a preview feature; data and format may change."); + } + + log($"Helix queue '{queueId}' health{healthTag}:"); + log($" Estimated wait : {estWait} (queue depth: {depth}, avg run: {avgRun})"); + log($" Snapshot taken : {snapshot}{staleTag}"); + + if (overSla) + { + log($"warning : Helix queue '{queueId}' estimated wait of {estWait} exceeds the {QueueWaitSlaThreshold.TotalMinutes:F0}-minute SLA - the queue is at capacity or unhealthy. Jobs may take longer than usual to start."); + } + + if (stale) + { + log($"warning : Helix queue '{queueId}' health snapshot is {FormatTimeSpan(snapshotAge)} old (threshold {SnapshotStaleThreshold.TotalMinutes:F0}m) - reported wait/depth may not reflect current queue state."); + } + + if (Interlocked.Exchange(ref s_firstRespondersHintShown, 1) == 0) + { + log($" Questions about Helix queue health? Reach the dnceng First Responders channel: {FirstRespondersUrl}"); + } + } + + private static string FormatTimeSpan(TimeSpan? value) + { + if (value is not TimeSpan ts) + { + return "unknown"; + } + + if (ts.TotalDays >= 1) + { + return $"{(int)ts.TotalDays}d {ts.Hours}h {ts.Minutes}m"; + } + if (ts.TotalHours >= 1) + { + return $"{(int)ts.TotalHours}h {ts.Minutes}m"; + } + if (ts.TotalMinutes >= 1) + { + return $"{(int)ts.TotalMinutes}m {ts.Seconds}s"; + } + return $"{ts.Seconds}s"; + } + + private static string FormatSnapshotTime(DateTimeOffset? value) + { + if (value is not DateTimeOffset utc) + { + return "unknown"; + } + + DateTime local = utc.LocalDateTime; + string tz = TimeZoneInfo.Local.IsDaylightSavingTime(local) + ? 
TimeZoneInfo.Local.DaylightName + : TimeZoneInfo.Local.StandardName; + + return $"{local.ToString("yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture)} {tz}"; + } + private void WarnForImpendingRemoval(Action log, QueueInfo queueInfo) { DateTime whenItExpires = DateTime.MaxValue; @@ -354,6 +454,12 @@ public IJobDefinition WithMaxRetryCount(int? maxRetryCount) return this; } + public IJobDefinition WithShowQueueStats(bool showQueueStats) + { + ShowQueueStats = showQueueStats; + return this; + } + internal void AddWorkItem(WorkItemDefinition workItemDefinition) { _workItems.Add(workItemDefinition); diff --git a/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs b/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs index 58af4284325..976f73fd6dc 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs +++ b/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs @@ -148,6 +148,12 @@ public static class MetadataNames /// public int MaxRetryCount { get; set; } + /// + /// When true, the preview Helix queue health summary (estimated wait, depth, snapshot + /// time) is logged after job submission. Off by default. 
+ /// + public bool EnableShowHelixQueueStats { get; set; } + private CommandPayload _commandPayload; protected override async Task ExecuteCore(CancellationToken cancellationToken) @@ -175,7 +181,8 @@ protected override async Task ExecuteCore(CancellationToken cancellationToken) IJobDefinition def = currentHelixApi.Job.Define() .WithType(Type) .WithTargetQueue(TargetQueue) - .WithMaxRetryCount(MaxRetryCount); + .WithMaxRetryCount(MaxRetryCount) + .WithShowQueueStats(EnableShowHelixQueueStats); Log.LogMessage($"Initialized job definition with type '{Type}', and target queue '{TargetQueue}'"); if (!string.IsNullOrEmpty(Creator)) diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.MonoQueue.targets b/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.MonoQueue.targets index eda40deee2e..0d4e0351799 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.MonoQueue.targets +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.MonoQueue.targets @@ -84,6 +84,7 @@ BaseUri="$(HelixBaseUri)" AccessToken="$(HelixAccessToken)" MaxRetryCount="$(MaxRetryCount)" + EnableShowHelixQueueStats="$(EnableShowHelixQueueStats)" PreCommands="$(HelixPreCommands)" PostCommands="$(HelixPostCommands)" CorrelationPayloads="@(HelixCorrelationPayload)" diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.props b/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.props index c26e4f79ec3..f0cd20b9bbb 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.props +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.props @@ -30,6 +30,15 @@ false + + + false + + From 571734e8520b0f06a611e970fef4206d2fb5852f Mon Sep 17 00:00:00 2001 From: Christopher Costa Date: Fri, 29 May 2026 16:35:19 -0700 Subject: [PATCH 2/3] Potential fix for pull request finding prefer server returned queue name rather than client side queue name Co-authored-by: Copilot Autofix 
powered by AI <175728472+Copilot@users.noreply.github.com> --- src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs b/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs index ddb0b8e4ed0..100c502df04 100644 --- a/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs +++ b/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs @@ -287,18 +287,20 @@ private static void LogQueueStats(Action log, string queueId, Models.Que log("note : Helix queue health reporting is a preview feature; data and format may change."); } - log($"Helix queue '{queueId}' health{healthTag}:"); + string queueName = string.IsNullOrEmpty(stats.QueueName) ? queueId : stats.QueueName; + + log($"Helix queue '{queueName}' health{healthTag}:"); log($" Estimated wait : {estWait} (queue depth: {depth}, avg run: {avgRun})"); log($" Snapshot taken : {snapshot}{staleTag}"); if (overSla) { - log($"warning : Helix queue '{queueId}' estimated wait of {estWait} exceeds the {QueueWaitSlaThreshold.TotalMinutes:F0}-minute SLA - the queue is at capacity or unhealthy. Jobs may take longer than usual to start."); + log($"warning : Helix queue '{queueName}' estimated wait of {estWait} exceeds the {QueueWaitSlaThreshold.TotalMinutes:F0}-minute SLA - the queue is at capacity or unhealthy. 
Jobs may take longer than usual to start."); } if (stale) { - log($"warning : Helix queue '{queueId}' health snapshot is {FormatTimeSpan(snapshotAge)} old (threshold {SnapshotStaleThreshold.TotalMinutes:F0}m) - reported wait/depth may not reflect current queue state."); + log($"warning : Helix queue '{queueName}' health snapshot is {FormatTimeSpan(snapshotAge)} old (threshold {SnapshotStaleThreshold.TotalMinutes:F0}m) - reported wait/depth may not reflect current queue state."); } if (Interlocked.Exchange(ref s_firstRespondersHintShown, 1) == 0) From 7ed51bdda80ddaeb2bf1a3bc83f6e960b08cd350 Mon Sep 17 00:00:00 2001 From: chcosta Date: Mon, 1 Jun 2026 15:20:05 -0700 Subject: [PATCH 3/3] Rename WithShowQueueStats(bool) to WithQueueStats() per PR feedback Addresses reviewer feedback on #16922: the boolean parameter was a 'boolean trap' at call sites. Renamed to a no-arg WithQueueStats() opt-in method; SendHelixJob now branches on the EnableShowHelixQueueStats MSBuild flag instead of piping the bool through the fluent API. --- src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs | 2 +- src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs | 4 ++-- src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs b/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs index ade89409941..3f80365bd7d 100644 --- a/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs +++ b/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs @@ -180,7 +180,7 @@ public interface IJobDefinition /// snapshot time) returned in the job creation response. Off by default. /// /// Fluent job builder. - IJobDefinition WithShowQueueStats(bool showQueueStats); + IJobDefinition WithQueueStats(); /// /// Sends the fully specified job to execution. 
diff --git a/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs b/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs index 100c502df04..b9ccd89ecdc 100644 --- a/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs +++ b/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs @@ -456,9 +456,9 @@ public IJobDefinition WithMaxRetryCount(int? maxRetryCount) return this; } - public IJobDefinition WithShowQueueStats(bool showQueueStats) + public IJobDefinition WithQueueStats() { - ShowQueueStats = showQueueStats; + ShowQueueStats = true; return this; } diff --git a/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs b/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs index 976f73fd6dc..2012872ad23 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs +++ b/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs @@ -181,8 +181,11 @@ protected override async Task ExecuteCore(CancellationToken cancellationToken) IJobDefinition def = currentHelixApi.Job.Define() .WithType(Type) .WithTargetQueue(TargetQueue) - .WithMaxRetryCount(MaxRetryCount) - .WithShowQueueStats(EnableShowHelixQueueStats); + .WithMaxRetryCount(MaxRetryCount); + if (EnableShowHelixQueueStats) + { + def = def.WithQueueStats(); + } Log.LogMessage($"Initialized job definition with type '{Type}', and target queue '{TargetQueue}'"); if (!string.IsNullOrEmpty(Creator))