diff --git a/src/Microsoft.DotNet.Helix/Client/CSharp/JobCreationResult.cs b/src/Microsoft.DotNet.Helix/Client/CSharp/JobCreationResult.cs new file mode 100644 index 00000000000..efc0d725ed8 --- /dev/null +++ b/src/Microsoft.DotNet.Helix/Client/CSharp/JobCreationResult.cs @@ -0,0 +1,13 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using Newtonsoft.Json; + +namespace Microsoft.DotNet.Helix.Client.Models +{ + public partial class JobCreationResult + { + [JsonProperty("queueStats")] + public QueueStats QueueStats { get; set; } + } +} diff --git a/src/Microsoft.DotNet.Helix/Client/CSharp/QueueStats.cs b/src/Microsoft.DotNet.Helix/Client/CSharp/QueueStats.cs new file mode 100644 index 00000000000..1ffb8a5fced --- /dev/null +++ b/src/Microsoft.DotNet.Helix/Client/CSharp/QueueStats.cs @@ -0,0 +1,29 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using Newtonsoft.Json; + +namespace Microsoft.DotNet.Helix.Client.Models +{ + public partial class QueueStats + { + [JsonProperty("queueName")] + public string QueueName { get; set; } + + [JsonProperty("depth")] + public int? Depth { get; set; } + + [JsonProperty("averageRunDuration")] + public TimeSpan? AverageRunDuration { get; set; } + + [JsonProperty("estimatedWait")] + public TimeSpan? EstimatedWait { get; set; } + + [JsonProperty("estimatedWaitMethod")] + public string EstimatedWaitMethod { get; set; } + + [JsonProperty("generatedAt")] + public DateTimeOffset? GeneratedAt { get; set; } + } +} diff --git a/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs b/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs index 2a99e318209..3f80365bd7d 100644 --- a/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs +++ b/src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs @@ -175,6 +175,13 @@ public interface IJobDefinition /// Fluent job builder. IJobDefinition WithMaxRetryCount(int? maxRetryCount); + /// + /// Opts in to logging the preview Helix queue health summary (estimated wait, depth, + /// snapshot time) returned in the job creation response. Off by default. + /// + /// Fluent job builder. + IJobDefinition WithQueueStats(); + /// /// Sends the fully specified job to execution. /// diff --git a/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs b/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs index 1c52f9a4811..b9ccd89ecdc 100644 --- a/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs +++ b/src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs @@ -47,6 +47,7 @@ public JobDefinition(IJob jobApi) public string ResultContainerPrefix { get; private set; } public IDictionary CorrelationPayloads { get; } = new Dictionary(); public int? MaxRetryCount { get; private set; } + public bool ShowQueueStats { get; private set; } public string StorageAccountConnectionString { get; private set; } public string TargetContainerName { get; set; } = DefaultContainerName; public string TargetResultsContainerName { get; set; } = DefaultContainerName; @@ -241,9 +242,110 @@ public async Task SendAsync(Action log, CancellationToken canc string jobStartIdentifier = Guid.NewGuid().ToString("N"); var newJob = await JobApi.NewAsync(creationRequest, jobStartIdentifier, cancellationToken: cancellationToken).ConfigureAwait(false); + if (ShowQueueStats) + { + LogQueueStats(log, queueId, newJob?.QueueStats); + } + return new SentJob(JobApi, newJob); } + // Helix SLA threshold; estimated waits above this are flagged as queue-at-capacity / unhealthy. + private static readonly TimeSpan QueueWaitSlaThreshold = TimeSpan.FromMinutes(30); + + // If the Observer snapshot is older than this, the reported numbers may not reflect current queue state. + private static readonly TimeSpan SnapshotStaleThreshold = TimeSpan.FromMinutes(15); + + private const string FirstRespondersUrl = "https://teams.microsoft.com/l/channel/19%3Aafba3d1545dd45d7b79f34c1821f6055%40thread.skype/First%20Responders?groupId=4d73664c-9f2f-450d-82a5-c2f02756606d&tenantId=72f988bf-86f1-41af-91ab-2d7cd011db47"; + + private static int s_queueStatsHeaderShown; + private static int s_firstRespondersHintShown; + + private static void LogQueueStats(Action log, string queueId, Models.QueueStats stats) + { + if (log == null || stats == null) + { + return; + } + + string depth = stats.Depth?.ToString(CultureInfo.InvariantCulture) ?? "unknown"; + string avgRun = FormatTimeSpan(stats.AverageRunDuration); + string estWait = FormatTimeSpan(stats.EstimatedWait); + string snapshot = FormatSnapshotTime(stats.GeneratedAt); + + bool overSla = stats.EstimatedWait is TimeSpan wait && wait > QueueWaitSlaThreshold; + TimeSpan? snapshotAge = stats.GeneratedAt is DateTimeOffset gen + ? DateTimeOffset.UtcNow - gen + : (TimeSpan?)null; + bool stale = snapshotAge is TimeSpan age && age > SnapshotStaleThreshold; + + string healthTag = overSla ? " [AT CAPACITY]" : string.Empty; + string staleTag = stale ? " (stale)" : string.Empty; + + if (Interlocked.Exchange(ref s_queueStatsHeaderShown, 1) == 0) + { + log("note : Helix queue health reporting is a preview feature; data and format may change."); + } + + string queueName = string.IsNullOrEmpty(stats.QueueName) ? queueId : stats.QueueName; + + log($"Helix queue '{queueName}' health{healthTag}:"); + log($" Estimated wait : {estWait} (queue depth: {depth}, avg run: {avgRun})"); + log($" Snapshot taken : {snapshot}{staleTag}"); + + if (overSla) + { + log($"warning : Helix queue '{queueName}' estimated wait of {estWait} exceeds the {QueueWaitSlaThreshold.TotalMinutes:F0}-minute SLA - the queue is at capacity or unhealthy. Jobs may take longer than usual to start."); + } + + if (stale) + { + log($"warning : Helix queue '{queueName}' health snapshot is {FormatTimeSpan(snapshotAge)} old (threshold {SnapshotStaleThreshold.TotalMinutes:F0}m) - reported wait/depth may not reflect current queue state."); + } + + if (Interlocked.Exchange(ref s_firstRespondersHintShown, 1) == 0) + { + log($" Questions about Helix queue health? Reach the dnceng First Responders channel: {FirstRespondersUrl}"); + } + } + + private static string FormatTimeSpan(TimeSpan? value) + { + if (value is not TimeSpan ts) + { + return "unknown"; + } + + if (ts.TotalDays >= 1) + { + return $"{(int)ts.TotalDays}d {ts.Hours}h {ts.Minutes}m"; + } + if (ts.TotalHours >= 1) + { + return $"{(int)ts.TotalHours}h {ts.Minutes}m"; + } + if (ts.TotalMinutes >= 1) + { + return $"{(int)ts.TotalMinutes}m {ts.Seconds}s"; + } + return $"{ts.Seconds}s"; + } + + private static string FormatSnapshotTime(DateTimeOffset? value) + { + if (value is not DateTimeOffset utc) + { + return "unknown"; + } + + DateTime local = utc.LocalDateTime; + string tz = TimeZoneInfo.Local.IsDaylightSavingTime(local) + ? TimeZoneInfo.Local.DaylightName + : TimeZoneInfo.Local.StandardName; + + return $"{local.ToString("yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture)} {tz}"; + } + private void WarnForImpendingRemoval(Action log, QueueInfo queueInfo) { DateTime whenItExpires = DateTime.MaxValue; @@ -354,6 +456,12 @@ public IJobDefinition WithMaxRetryCount(int? maxRetryCount) return this; } + public IJobDefinition WithQueueStats() + { + ShowQueueStats = true; + return this; + } + internal void AddWorkItem(WorkItemDefinition workItemDefinition) { _workItems.Add(workItemDefinition); diff --git a/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs b/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs index 58af4284325..2012872ad23 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs +++ b/src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs @@ -148,6 +148,12 @@ public static class MetadataNames /// public int MaxRetryCount { get; set; } + /// + /// When true, the preview Helix queue health summary (estimated wait, depth, snapshot + /// time) is logged after job submission. Off by default. + /// + public bool EnableShowHelixQueueStats { get; set; } + private CommandPayload _commandPayload; protected override async Task ExecuteCore(CancellationToken cancellationToken) @@ -176,6 +182,10 @@ protected override async Task ExecuteCore(CancellationToken cancellationToken) .WithType(Type) .WithTargetQueue(TargetQueue) .WithMaxRetryCount(MaxRetryCount); + if (EnableShowHelixQueueStats) + { + def = def.WithQueueStats(); + } Log.LogMessage($"Initialized job definition with type '{Type}', and target queue '{TargetQueue}'"); if (!string.IsNullOrEmpty(Creator)) diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.MonoQueue.targets b/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.MonoQueue.targets index 62b05066a3a..4dc8eddabb2 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.MonoQueue.targets +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.MonoQueue.targets @@ -84,6 +84,7 @@ BaseUri="$(HelixBaseUri)" AccessToken="$(HelixAccessToken)" MaxRetryCount="$(MaxRetryCount)" + EnableShowHelixQueueStats="$(EnableShowHelixQueueStats)" PreCommands="$(HelixPreCommands)" PostCommands="$(HelixPostCommands)" CorrelationPayloads="@(HelixCorrelationPayload)" diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.props b/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.props index c26e4f79ec3..f0cd20b9bbb 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.props +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/Microsoft.DotNet.Helix.Sdk.props @@ -30,6 +30,15 @@ false + + + false + +