Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/Microsoft.DotNet.Helix/Client/CSharp/JobCreationResult.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using Newtonsoft.Json;

namespace Microsoft.DotNet.Helix.Client.Models
{
public partial class JobCreationResult
{
[JsonProperty("queueStats")]
public QueueStats QueueStats { get; set; }
}
}
29 changes: 29 additions & 0 deletions src/Microsoft.DotNet.Helix/Client/CSharp/QueueStats.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using Newtonsoft.Json;

namespace Microsoft.DotNet.Helix.Client.Models
{
public partial class QueueStats
{
[JsonProperty("queueName")]
public string QueueName { get; set; }

[JsonProperty("depth")]
public int? Depth { get; set; }

[JsonProperty("averageRunDuration")]
public TimeSpan? AverageRunDuration { get; set; }

[JsonProperty("estimatedWait")]
public TimeSpan? EstimatedWait { get; set; }

[JsonProperty("estimatedWaitMethod")]
public string EstimatedWaitMethod { get; set; }

[JsonProperty("generatedAt")]
public DateTimeOffset? GeneratedAt { get; set; }
}
}
7 changes: 7 additions & 0 deletions src/Microsoft.DotNet.Helix/JobSender/IJobDefinition.cs
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,13 @@ public interface IJobDefinition
/// <returns>Fluent job builder.</returns>
IJobDefinition WithMaxRetryCount(int? maxRetryCount);

/// <summary>
/// Opts in to logging the preview Helix queue health summary (estimated wait, depth,
/// snapshot time) returned in the job creation response. Off by default.
/// </summary>
/// <returns>Fluent job builder.</returns>
IJobDefinition WithQueueStats();

/// <summary>
/// <para>Sends the fully specified job to execution.</para>
///
Expand Down
108 changes: 108 additions & 0 deletions src/Microsoft.DotNet.Helix/JobSender/JobDefinition.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ public JobDefinition(IJob jobApi)
public string ResultContainerPrefix { get; private set; }
public IDictionary<IPayload, string> CorrelationPayloads { get; } = new Dictionary<IPayload, string>();
public int? MaxRetryCount { get; private set; }
public bool ShowQueueStats { get; private set; }
public string StorageAccountConnectionString { get; private set; }
public string TargetContainerName { get; set; } = DefaultContainerName;
public string TargetResultsContainerName { get; set; } = DefaultContainerName;
Expand Down Expand Up @@ -241,9 +242,110 @@ public async Task<ISentJob> SendAsync(Action<string> log, CancellationToken canc
string jobStartIdentifier = Guid.NewGuid().ToString("N");
var newJob = await JobApi.NewAsync(creationRequest, jobStartIdentifier, cancellationToken: cancellationToken).ConfigureAwait(false);

if (ShowQueueStats)
{
LogQueueStats(log, queueId, newJob?.QueueStats);
}

return new SentJob(JobApi, newJob);
}

// Helix SLA threshold; estimated waits above this are flagged as queue-at-capacity / unhealthy.
private static readonly TimeSpan QueueWaitSlaThreshold = TimeSpan.FromMinutes(30);

// If the Observer snapshot is older than this, the reported numbers may not reflect current queue state.
private static readonly TimeSpan SnapshotStaleThreshold = TimeSpan.FromMinutes(15);

private const string FirstRespondersUrl = "https://teams.microsoft.com/l/channel/19%3Aafba3d1545dd45d7b79f34c1821f6055%40thread.skype/First%20Responders?groupId=4d73664c-9f2f-450d-82a5-c2f02756606d&tenantId=72f988bf-86f1-41af-91ab-2d7cd011db47";

private static int s_queueStatsHeaderShown;
private static int s_firstRespondersHintShown;

private static void LogQueueStats(Action<string> log, string queueId, Models.QueueStats stats)
{
if (log == null || stats == null)
{
return;
}

string depth = stats.Depth?.ToString(CultureInfo.InvariantCulture) ?? "unknown";
string avgRun = FormatTimeSpan(stats.AverageRunDuration);
string estWait = FormatTimeSpan(stats.EstimatedWait);
string snapshot = FormatSnapshotTime(stats.GeneratedAt);

bool overSla = stats.EstimatedWait is TimeSpan wait && wait > QueueWaitSlaThreshold;
TimeSpan? snapshotAge = stats.GeneratedAt is DateTimeOffset gen
? DateTimeOffset.UtcNow - gen
: (TimeSpan?)null;
bool stale = snapshotAge is TimeSpan age && age > SnapshotStaleThreshold;

string healthTag = overSla ? " [AT CAPACITY]" : string.Empty;
string staleTag = stale ? " (stale)" : string.Empty;

if (Interlocked.Exchange(ref s_queueStatsHeaderShown, 1) == 0)
{
log("note : Helix queue health reporting is a preview feature; data and format may change.");
}

string queueName = string.IsNullOrEmpty(stats.QueueName) ? queueId : stats.QueueName;

log($"Helix queue '{queueName}' health{healthTag}:");
log($" Estimated wait : {estWait} (queue depth: {depth}, avg run: {avgRun})");
log($" Snapshot taken : {snapshot}{staleTag}");

if (overSla)
{
log($"warning : Helix queue '{queueName}' estimated wait of {estWait} exceeds the {QueueWaitSlaThreshold.TotalMinutes:F0}-minute SLA - the queue is at capacity or unhealthy. Jobs may take longer than usual to start.");
}

if (stale)
{
log($"warning : Helix queue '{queueName}' health snapshot is {FormatTimeSpan(snapshotAge)} old (threshold {SnapshotStaleThreshold.TotalMinutes:F0}m) - reported wait/depth may not reflect current queue state.");
}

if (Interlocked.Exchange(ref s_firstRespondersHintShown, 1) == 0)
{
log($" Questions about Helix queue health? Reach the dnceng First Responders channel: {FirstRespondersUrl}");
}
}

private static string FormatTimeSpan(TimeSpan? value)
{
if (value is not TimeSpan ts)
{
return "unknown";
}

if (ts.TotalDays >= 1)
{
return $"{(int)ts.TotalDays}d {ts.Hours}h {ts.Minutes}m";
}
if (ts.TotalHours >= 1)
{
return $"{(int)ts.TotalHours}h {ts.Minutes}m";
}
if (ts.TotalMinutes >= 1)
{
return $"{(int)ts.TotalMinutes}m {ts.Seconds}s";
}
return $"{ts.Seconds}s";
}

private static string FormatSnapshotTime(DateTimeOffset? value)
{
if (value is not DateTimeOffset utc)
{
return "unknown";
}

DateTime local = utc.LocalDateTime;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using local time seems inconsistent. Is UTC better? I haven't looked around but I feel that UTC is much more common in our builds.

string tz = TimeZoneInfo.Local.IsDaylightSavingTime(local)
? TimeZoneInfo.Local.DaylightName
: TimeZoneInfo.Local.StandardName;

return $"{local.ToString("yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture)} {tz}";
}

private void WarnForImpendingRemoval(Action<string> log, QueueInfo queueInfo)
{
DateTime whenItExpires = DateTime.MaxValue;
Expand Down Expand Up @@ -354,6 +456,12 @@ public IJobDefinition WithMaxRetryCount(int? maxRetryCount)
return this;
}

public IJobDefinition WithQueueStats()
{
ShowQueueStats = true;
return this;
}

internal void AddWorkItem(WorkItemDefinition workItemDefinition)
{
_workItems.Add(workItemDefinition);
Expand Down
10 changes: 10 additions & 0 deletions src/Microsoft.DotNet.Helix/Sdk/SendHelixJob.cs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,12 @@ public static class MetadataNames
/// </summary>
public int MaxRetryCount { get; set; }

/// <summary>
/// When true, the preview Helix queue health summary (estimated wait, depth, snapshot
/// time) is logged after job submission. Off by default.
/// </summary>
public bool EnableShowHelixQueueStats { get; set; }

private CommandPayload _commandPayload;

protected override async Task ExecuteCore(CancellationToken cancellationToken)
Expand Down Expand Up @@ -176,6 +182,10 @@ protected override async Task ExecuteCore(CancellationToken cancellationToken)
.WithType(Type)
.WithTargetQueue(TargetQueue)
.WithMaxRetryCount(MaxRetryCount);
if (EnableShowHelixQueueStats)
{
def = def.WithQueueStats();
}
Log.LogMessage($"Initialized job definition with type '{Type}', and target queue '{TargetQueue}'");

if (!string.IsNullOrEmpty(Creator))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
BaseUri="$(HelixBaseUri)"
AccessToken="$(HelixAccessToken)"
MaxRetryCount="$(MaxRetryCount)"
EnableShowHelixQueueStats="$(EnableShowHelixQueueStats)"
PreCommands="$(HelixPreCommands)"
PostCommands="$(HelixPostCommands)"
CorrelationPayloads="@(HelixCorrelationPayload)"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@
<EnableHelixJobMonitor Condition="'$(EnableHelixJobMonitor)' != 'true'">false</EnableHelixJobMonitor>
</PropertyGroup>

<!--
Feature flag: when 'true', the Helix Sdk logs a preview queue-health summary (estimated wait,
queue depth, snapshot time) for each submitted job, plus warnings when the estimated wait
exceeds the SLA or the snapshot is stale. Off by default.
-->
<PropertyGroup>
<EnableShowHelixQueueStats Condition="'$(EnableShowHelixQueueStats)' != 'true'">false</EnableShowHelixQueueStats>
</PropertyGroup>

<!-- TODO: Remove Architecture="*" on Runtime="NET" UsingTask declarations below when https://github.com/dotnet/msbuild/issues/13739 is fixed and a new msbuild is consumed with the fix. -->
<UsingTask TaskName="SendHelixJob" AssemblyFile="$(MicrosoftDotNetHelixSdkTasksAssembly)" Runtime="NET" Architecture="*" TaskFactory="$(MicrosoftDotNetHelixSdkTasksFactory)" />
<UsingTask TaskName="WaitForHelixJobCompletion" AssemblyFile="$(MicrosoftDotNetHelixSdkTasksAssembly)" Runtime="NET" Architecture="*" TaskFactory="$(MicrosoftDotNetHelixSdkTasksFactory)" />
Expand Down
Loading