From f4c335a7e3e9bb8fa600b423b64c187f8e52a3be Mon Sep 17 00:00:00 2001 From: Noah Gilson Date: Thu, 26 Feb 2026 12:22:18 -0800 Subject: [PATCH] Improve Helix Error Reporting # Motivators When helix fails, there are several core problems that make the build analysis tab no longer helpful. 1. Failures do not appear, and instead this error is displayed from `dotnet-helix-service`: This is a helix work item crash with status: `BadExit`. 2. Work Item failures simply say `Test has failed. Check the log: log_link` when they could actually provide the error, so we don't have to click and read the log. 3. When I click the link on '[Console]' it takes me here: https://dev.azure.com/dnceng-public/public/_build/results?buildId=1310620&view=ms.vss-test-web.build-test-results-tab&runId=36607218&resultId=113379&paneView=debug This view is not helpful - here is what it shows me. "The Helix Work Item failed. Often this is due to a test crash. Please see the 'Artifacts' tab above for additional logs." The failures for a "Work Item" are not helpful and provide nothing actionable. What I really want to see is a specific test failure link for that build, https://dev.azure.com/dnceng-public/public/_build/results?buildId=1310620&view=ms.vss-test-web.build-test-results-tab&runId=36607090&resultId=100720&paneView=debug such as this link, which shows ''Expected string to be or empty because Expected command to not output to stderr but it was not:' # Fixes Adds more error details from the log into the output messages. Note, I have not worked in the Arcade repo so I'm not familiar with this logic. It may also be controversial to create a longer string in the output by showing more error details. My goal is to speed up diagnostic time and reduce the number of clicks it takes to see what failed in a PR. --- .../Sdk/CheckAzurePipelinesTestResults.cs | 25 +++- .../Sdk/CheckHelixJobStatus.cs | 26 +++- .../CreateFailedTestsForFailedWorkItems.cs | 68 +++++++++- .../Sdk/GetHelixWorkItems.cs | 117 ++++++++++++++++++ .../reporter/test_results_reader/__init__.py | 65 +++++++++- 5 files changed, 293 insertions(+), 8 deletions(-) diff --git a/src/Microsoft.DotNet.Helix/Sdk/CheckAzurePipelinesTestResults.cs b/src/Microsoft.DotNet.Helix/Sdk/CheckAzurePipelinesTestResults.cs index 07c145507e7..bb847afbb60 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/CheckAzurePipelinesTestResults.cs +++ b/src/Microsoft.DotNet.Helix/Sdk/CheckAzurePipelinesTestResults.cs @@ -147,7 +147,30 @@ private async Task LogErrorsForFailedRun(HttpClient client, int testRunId) if (workItem != null) { - Log.LogError(FailureCategory.Test, $"Test {name} has failed. Check the Test tab or this console log: {workItem.GetMetadata("ConsoleOutputUri")}"); + string consoleUri = workItem.GetMetadata("ConsoleOutputUri"); + string consoleErrorText = workItem.GetMetadata("ConsoleErrorText"); + string exitCode = workItem.GetMetadata("ExitCode"); + + var sb = new System.Text.StringBuilder(); + sb.Append($"Test {name} has failed."); + + if (!string.IsNullOrEmpty(exitCode)) + { + sb.Append($" (exit code {exitCode})"); + } + + sb.Append($" Check the Test tab or this console log: {consoleUri}"); + + if (!string.IsNullOrEmpty(consoleErrorText)) + { + sb.AppendLine(); + sb.AppendLine("Error details:"); + ? consoleErrorText.Substring(0, 2000) + "..." + : consoleErrorText; + sb.Append(truncated); + } + + Log.LogError(FailureCategory.Test, sb.ToString()); } else { diff --git a/src/Microsoft.DotNet.Helix/Sdk/CheckHelixJobStatus.cs b/src/Microsoft.DotNet.Helix/Sdk/CheckHelixJobStatus.cs index 2cc315a4899..97df9fe0c73 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/CheckHelixJobStatus.cs +++ b/src/Microsoft.DotNet.Helix/Sdk/CheckHelixJobStatus.cs @@ -40,8 +40,32 @@ protected override Task ExecuteCore(CancellationToken cancellationToken) var jobName = workItem.GetMetadata("JobName"); var workItemName = workItem.GetMetadata("WorkItemName"); var consoleUri = workItem.GetMetadata("ConsoleOutputUri"); + var exitCode = workItem.GetMetadata("ExitCode"); + var consoleErrorText = workItem.GetMetadata("ConsoleErrorText"); - Log.LogError(FailureCategory.Test, $"Work item {workItemName} in job {jobName} has failed.\nFailure log: {consoleUri}{accessTokenSuffix}"); + var sb = new System.Text.StringBuilder(); + sb.Append($"Work item {workItemName} in job {jobName} has failed."); + + if (!string.IsNullOrEmpty(exitCode)) + { + sb.Append($" (exit code {exitCode})"); + } + + sb.AppendLine(); + sb.AppendLine($"Failure log: {consoleUri}{accessTokenSuffix}"); + + if (!string.IsNullOrEmpty(consoleErrorText)) + { + sb.AppendLine(); + sb.AppendLine("Error details:"); + // Truncate to avoid excessively long MSBuild error messages + string truncated = consoleErrorText.Length > 2000 + ? consoleErrorText.Substring(0, 2000) + "..." + : consoleErrorText; + sb.Append(truncated); + } + + Log.LogError(FailureCategory.Test, sb.ToString()); } } } diff --git a/src/Microsoft.DotNet.Helix/Sdk/CreateFailedTestsForFailedWorkItems.cs b/src/Microsoft.DotNet.Helix/Sdk/CreateFailedTestsForFailedWorkItems.cs index 1fa4d85411c..493ac4aca70 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/CreateFailedTestsForFailedWorkItems.cs +++ b/src/Microsoft.DotNet.Helix/Sdk/CreateFailedTestsForFailedWorkItems.cs @@ -26,11 +26,73 @@ protected override async Task ExecuteCoreAsync(HttpClient client) var testRunId = workItem.GetMetadata("TestRunId"); var failed = workItem.GetMetadata("Failed") == "true"; - await CreateFakeTestResultAsync(client, testRunId, jobName, workItemName, failed); + // Build a meaningful error message from available diagnostics + string errorMessage = null; + if (failed) + { + errorMessage = BuildErrorMessage(workItem, jobName, workItemName); + } + + await CreateFakeTestResultAsync(client, testRunId, jobName, workItemName, failed, errorMessage); + } + } + + private static string BuildErrorMessage(ITaskItem workItem, string jobName, string workItemName) + { + var sb = new StringBuilder(); + + string exitCode = workItem.GetMetadata("ExitCode"); + if (!string.IsNullOrEmpty(exitCode)) + { + sb.AppendLine($"Helix work item exited with code {exitCode}."); + } + + string helixErrors = workItem.GetMetadata("HelixErrors"); + if (!string.IsNullOrEmpty(helixErrors)) + { + try + { + var errors = JsonConvert.DeserializeObject(helixErrors); + if (errors?.Length > 0) + { + sb.AppendLine("Helix errors:"); + foreach (var error in errors) + { + sb.AppendLine($" - {error}"); + } + } + } + catch (JsonException) + { + // Ignore deserialization failures + } + } + + string consoleErrorText = workItem.GetMetadata("ConsoleErrorText"); + if (!string.IsNullOrEmpty(consoleErrorText)) + { + sb.AppendLine(); + sb.AppendLine("Test output:"); + sb.AppendLine(consoleErrorText); } + + if (sb.Length == 0) + { + // Fallback to original generic message if we couldn't extract anything + sb.AppendLine("The Helix Work Item failed. Often this is due to a test crash. Please see the 'Artifacts' tab above for additional logs."); + } + + string consoleUri = workItem.GetMetadata("ConsoleOutputUri"); + if (!string.IsNullOrEmpty(consoleUri)) + { + sb.AppendLine(); + sb.AppendLine($"Full console log: {consoleUri}"); + } + + return sb.ToString().TrimEnd(); } - private async Task CreateFakeTestResultAsync(HttpClient client, string testRunId, string jobName, string workItemFriendlyName, bool failed) + private async Task CreateFakeTestResultAsync(HttpClient client, string testRunId, string jobName, string workItemFriendlyName, bool failed, string errorMessage) { var testResultData = await RetryAsync( async () => @@ -51,7 +113,7 @@ private async Task CreateFakeTestResultAsync(HttpClient client, string test ["testCaseTitle"] = $"{workItemFriendlyName} Work Item", ["outcome"] = failed ? "Failed" : "Passed", ["state"] = "Completed", - ["errorMessage"] = failed ? "The Helix Work Item failed. Often this is due to a test crash. Please see the 'Artifacts' tab above for additional logs." : null, + ["errorMessage"] = failed ? errorMessage : null, ["durationInMs"] = 60 * 1000, // Use a non-zero duration so that the graphs look better. ["comment"] = new JObject { diff --git a/src/Microsoft.DotNet.Helix/Sdk/GetHelixWorkItems.cs b/src/Microsoft.DotNet.Helix/Sdk/GetHelixWorkItems.cs index d8d25d70c19..1c10d673528 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/GetHelixWorkItems.cs +++ b/src/Microsoft.DotNet.Helix/Sdk/GetHelixWorkItems.cs @@ -5,7 +5,9 @@ using System.Collections; using System.Collections.Generic; using System.Collections.Immutable; +using System.IO; using System.Linq; +using System.Text; using System.Threading; using System.Threading.Tasks; using Microsoft.Build.Framework; @@ -113,11 +115,126 @@ ITaskItem2 CreateTaskItem(string workItemName, IDictionary metad Log.LogWarningFromException(ex); } + // Fetch console output and work item details for failed items so we can surface + // actual error messages instead of the generic "work item failed" text. + try + { + var details = await HelixApi.WorkItem.DetailsAsync(wi, jobName, cancellationToken).ConfigureAwait(false); + if (details.ExitCode.HasValue) + { + metadata["ExitCode"] = details.ExitCode.Value.ToString(); + } + if (details.Errors != null && details.Errors.Count > 0) + { + metadata["HelixErrors"] = JsonConvert.SerializeObject(details.Errors.Select(e => e.Message)); + } + } + catch (Exception ex) + { + Log.LogMessage(MessageImportance.Low, $"Failed to get work item details for {wi}: {ex.Message}"); + } + + try + { + using (var consoleStream = await HelixApi.WorkItem.ConsoleLogAsync(wi, jobName, cancellationToken).ConfigureAwait(false)) + using (var reader = new StreamReader(consoleStream, Encoding.UTF8)) + { + string fullConsole = await reader.ReadToEndAsync().ConfigureAwait(false); + if (!string.IsNullOrEmpty(fullConsole)) + { + // Extract the tail of the console output (last ~100 lines) to capture errors near the crash + string[] lines = fullConsole.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries); + int startIndex = Math.Max(0, lines.Length - 100); + string tail = string.Join(Environment.NewLine, lines, startIndex, lines.Length - startIndex); + + // Also try to extract specific test failure messages from the console output + string errorExcerpt = ExtractTestFailureMessages(fullConsole, tail); + metadata["ConsoleErrorText"] = errorExcerpt; + } + } + } + catch (Exception ex) + { + Log.LogMessage(MessageImportance.Low, $"Failed to fetch console output for {wi}: {ex.Message}"); + } + workItems.Add(CreateTaskItem($"{jobName}/{wi}", metadata)); await Task.Delay(DelayBetweenHelixApiCallsInMs, cancellationToken); } return workItems; } + + /// + /// Extracts actionable test failure messages from console output. + /// Looks for common patterns from dotnet test / xUnit / MSTest failure output. + /// Falls back to the tail of the console if no specific patterns are found. + /// + private static string ExtractTestFailureMessages(string fullConsole, string tail) + { + var sb = new StringBuilder(); + + // Look for "Failed" test result lines from dotnet test console output + // Pattern: " Failed TestClassName.MethodName [duration]" + string[] allLines = fullConsole.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries); + + bool inFailureBlock = false; + int failureLineCount = 0; + const int maxFailureLines = 30; // Cap per failure block to avoid giant output + int totalFailuresFound = 0; + const int maxTotalFailures = 5; // Only show first N failures + + foreach (string line in allLines) + { + string trimmed = line.TrimStart(); + + // Detect start of a failure block + if (trimmed.StartsWith("Failed ", StringComparison.OrdinalIgnoreCase) || + trimmed.StartsWith("Error Message:", StringComparison.OrdinalIgnoreCase) || + trimmed.StartsWith("Assert.", StringComparison.OrdinalIgnoreCase)) + { + if (!inFailureBlock) + { + totalFailuresFound++; + if (totalFailuresFound > maxTotalFailures) + { + sb.AppendLine($"... and more failures (showing first {maxTotalFailures})"); + break; + } + inFailureBlock = true; + failureLineCount = 0; + if (sb.Length > 0) + sb.AppendLine(); + } + } + + if (inFailureBlock) + { + sb.AppendLine(line); + failureLineCount++; + + // End the block after enough context or on a blank-ish line after error content + if (failureLineCount >= maxFailureLines || + (failureLineCount > 3 && string.IsNullOrWhiteSpace(trimmed))) + { + inFailureBlock = false; + } + } + } + + // If we found structured failure messages, return them + if (sb.Length > 0) + { + return sb.ToString().TrimEnd(); + } + + // Otherwise, return the tail of the console output as-is + // Truncate to ~4000 chars to fit in AzDO errorMessage field + if (tail.Length > 4000) + { + tail = "..." + tail.Substring(tail.Length - 4000); + } + return tail; + } } } diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/azure-pipelines/reporter/test_results_reader/__init__.py b/src/Microsoft.DotNet.Helix/Sdk/tools/azure-pipelines/reporter/test_results_reader/__init__.py index f80af8dcb82..7e67605d4f4 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/azure-pipelines/reporter/test_results_reader/__init__.py +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/azure-pipelines/reporter/test_results_reader/__init__.py @@ -13,8 +13,65 @@ def __no_results_result(): work_item_name = get_env("HELIX_WORKITEM_FRIENDLYNAME") if exitCode != "0": - # if we have a catastrophic failure, we want to create the fake test result with attached dump files and logs (if available) - return + # Work item crashed / exited non-zero without producing result files. + # Try to capture useful error information from available sources. + failure_message = "Work item exited with code {}. No test result files were produced.".format(exitCode) + attachments = [] + + # Try to capture console output from the test process + # The stdout/stderr of the test command is available in the work item's console log + # but we can also check for any error output files + upload_root = get_env("HELIX_WORKITEM_UPLOAD_ROOT") + workitem_root = get_env("HELIX_WORKITEM_ROOT") + + # Look for blame sequence files that indicate which test was running when crash occurred + error_details = [] + for search_dir in [os.getcwd(), upload_root, workitem_root]: + if not search_dir or not os.path.isdir(search_dir): + continue + for root, dirs, files in os.walk(search_dir): + for f in files: + fpath = os.path.join(root, f) + try: + # Blame sequence file tells us which test was running + if f == "Sequence.xml" or f.endswith("_Sequence.xml"): + with open(fpath, 'r', errors='replace') as seq_file: + content = seq_file.read() + error_details.append("Blame sequence (last test running at crash):\n" + content[-2000:]) + attachments.append(TestResultAttachment( + name=u"Sequence.xml", + text=content, + )) + # Capture any .log files that might have error details + elif f.endswith("TestLog.log") or f == "dotnetTestLog.log": + with open(fpath, 'r', errors='replace') as log_file: + content = log_file.read() + # Get last 3000 chars which likely contain the error + tail = content[-3000:] if len(content) > 3000 else content + error_details.append("Test log tail:\n" + tail) + attachments.append(TestResultAttachment( + name=f, + text=content, + )) + except Exception as e: + log.warning("Failed to read {}: {}".format(fpath, e)) + + if error_details: + failure_message += "\n\n" + "\n\n".join(error_details) + + return TestResult( + name=u'{}.WorkItemExecution'.format(work_item_name), + kind=u'unknown', + type_name=u'{}'.format(work_item_name), + method=u'WorkItemExecution', + duration=1, + result=u'Fail', + exception_type=None, + failure_message=failure_message, + stack_trace=None, + skip_reason=None, + attachments=attachments, + ) else: result = 'Pass' failure_message = None @@ -53,4 +110,6 @@ def read_results(dirs_to_check: List[str]) -> Iterable[TestResult]: if not found: log.warn('No results file found in any of the following formats: {}'.format(', '.join((f.name for f in all_formats)))) - yield __no_results_result() + result = __no_results_result() + if result is not None: + yield result