From 3b9a17fc3c03be30100d4ae35fd05bf73cd5d6db Mon Sep 17 00:00:00 2001 From: jankratochvilcz Date: Fri, 22 May 2026 16:07:14 +0200 Subject: [PATCH 1/2] Shrink TaskNodesDieAfterBuild WaitForExit budget 15000ms -> 2000ms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 24h of post-merge telemetry from dnceng-public pipeline 75 (introduced by #13828) shows actual elapsedMs distributions: net10.0/x64: min=6 p50=8 p95=44 max=191 (37 runs) net472/x86: min=98 p50=105 p95=129 max=136 (20 runs) The 15000ms budget gave ~80x headroom over the worst case. Shrink to 2000ms (~10x of worst observed) — still comfortable margin, and large enough to avoid bringing back the 3000ms flake that motivated #13828. The Stopwatch + elapsedMs/timeoutMs telemetry stays in place so the next regression will be visible in the test output without another bump. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../BackEnd/TaskHostFactory_Tests.cs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/Build.UnitTests/BackEnd/TaskHostFactory_Tests.cs b/src/Build.UnitTests/BackEnd/TaskHostFactory_Tests.cs index a2e9e19e42d..c2b1814e448 100644 --- a/src/Build.UnitTests/BackEnd/TaskHostFactory_Tests.cs +++ b/src/Build.UnitTests/BackEnd/TaskHostFactory_Tests.cs @@ -95,13 +95,15 @@ public void TaskNodesDieAfterBuild(bool taskHostFactorySpecified, bool envVariab string capturedName = SafeGetProcessField(() => taskHostNode.ProcessName); string capturedStart = SafeGetProcessField(() => taskHostNode.StartTime.ToString("O", CultureInfo.InvariantCulture)); - // The task host should exit shortly after the build completes. Use a generous - // timeout because slow CI agents have been observed to take up to ~10s for the - // child process to drain stdio and exit. - // TELEMETRY: elapsedMs is logged so a future iteration can tune this back down - // to a tight-but-safe value. 
If observed elapsed never approaches the timeout, - // shrink TaskHostExitTimeoutMs in a follow-up PR. - const int TaskHostExitTimeoutMs = 15000; + // The task host should exit shortly after the build completes. Use a + // tight-but-safe timeout based on 24h of post-merge telemetry from + // dnceng-public pipeline 75 (net10/x64 p95=44ms max=191ms; net472/x86 + // p95=129ms max=136ms across 57 runs on main). 2000ms gives ~10x of the + // worst observed elapsed without bringing back the original 3000ms flake + // that motivated #13828. + // TELEMETRY: elapsedMs is still logged so future regressions are visible + // in the test output without another bump. + const int TaskHostExitTimeoutMs = 2000; Stopwatch sw = Stopwatch.StartNew(); bool exited = taskHostNode.WaitForExit(TaskHostExitTimeoutMs); sw.Stop(); From aaeda9f20394b948f2aaa6a45428edc22c0d9911 Mon Sep 17 00:00:00 2001 From: Jan Kratochvil Date: Mon, 25 May 2026 11:18:14 +0200 Subject: [PATCH 2/2] Adjust TaskNodesDieAfterBuild budget 2000ms -> 5000ms After 4 days of telemetry on the bumped 15000ms ceiling (0 failures over 77 builds in dnceng-public PR CI), the bumped budget is clearly sufficient, but the only direct elapsed-ms distribution we have covers a single day (57 runs), which is too small a sample to know how tight we can safely go. 2000ms was an aggressive shrink based on that 1-day worst-observed 191ms. 5000ms still gives a ~67% reduction from 15000ms while preserving ~26x headroom over worst-observed elapsed, and is safely above the previously-flaking 3000ms threshold that motivated the original bump. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/Build.UnitTests/BackEnd/TaskHostFactory_Tests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Build.UnitTests/BackEnd/TaskHostFactory_Tests.cs b/src/Build.UnitTests/BackEnd/TaskHostFactory_Tests.cs index c2b1814e448..650f9b36aff 100644 --- a/src/Build.UnitTests/BackEnd/TaskHostFactory_Tests.cs +++ b/src/Build.UnitTests/BackEnd/TaskHostFactory_Tests.cs @@ -103,7 +103,7 @@ public void TaskNodesDieAfterBuild(bool taskHostFactorySpecified, bool envVariab // that motivated #13828. // TELEMETRY: elapsedMs is still logged so future regressions are visible // in the test output without another bump. - const int TaskHostExitTimeoutMs = 2000; + const int TaskHostExitTimeoutMs = 5000; Stopwatch sw = Stopwatch.StartNew(); bool exited = taskHostNode.WaitForExit(TaskHostExitTimeoutMs); sw.Stop();