From d46a30a67e4d764ada429f506aaae894bb33d579 Mon Sep 17 00:00:00 2001 From: ryanzhang-oss Date: Thu, 12 Mar 2026 06:46:16 +0000 Subject: [PATCH] test: fix flaky CI failures in workapplier and e2e cost property tests Two separate CI flakiness fixes: 1. pkg/controllers/workapplier/suite_test.go: Increase GracefulShutdownTimeout from the default 30s to 2 minutes for all four controller managers in the integration test suite. With four managers running concurrently (each with multiple controllers), the default 30s grace period is insufficient to drain all runnables on a loaded CI runner, causing AfterSuite teardown to fail with 'context deadline exceeded' even though all 290 specs pass. 2. test/e2e/utils_test.go: Widen the per-CPU-core and per-GB-memory cost property tolerance from 0.002 to 0.005. The Azure Retail Prices API can return values that differ from the locally-computed expected value by exactly 0.002 (e.g. got=0.141, want=0.143), which hits the strict boundary of the original threshold and causes BeforeSuite to fail, aborting the entire custom e2e suite. A margin of 0.005 provides sufficient headroom while still catching genuine property provider bugs. 
--- pkg/controllers/workapplier/suite_test.go | 13 +++++++++---- test/e2e/utils_test.go | 9 ++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pkg/controllers/workapplier/suite_test.go b/pkg/controllers/workapplier/suite_test.go index 4b56589a5..0a936e862 100644 --- a/pkg/controllers/workapplier/suite_test.go +++ b/pkg/controllers/workapplier/suite_test.go @@ -298,6 +298,7 @@ var _ = BeforeSuite(func() { setupResources() By("Setting up the controller and the controller manager for member cluster 1") + gracefulShutdownTimeout := 2 * time.Minute hubMgr1, err = ctrl.NewManager(hubCfg, ctrl.Options{ Scheme: scheme.Scheme, Metrics: server.Options{ @@ -308,7 +309,8 @@ var _ = BeforeSuite(func() { memberReservedNSName1: {}, }, }, - Logger: textlogger.NewLogger(textlogger.NewConfig(textlogger.Verbosity(4))), + Logger: textlogger.NewLogger(textlogger.NewConfig(textlogger.Verbosity(4))), + GracefulShutdownTimeout: &gracefulShutdownTimeout, }) Expect(err).ToNot(HaveOccurred()) @@ -341,7 +343,8 @@ var _ = BeforeSuite(func() { memberReservedNSName2: {}, }, }, - Logger: textlogger.NewLogger(textlogger.NewConfig(textlogger.Verbosity(4))), + Logger: textlogger.NewLogger(textlogger.NewConfig(textlogger.Verbosity(4))), + GracefulShutdownTimeout: &gracefulShutdownTimeout, }) Expect(err).ToNot(HaveOccurred()) @@ -392,7 +395,8 @@ var _ = BeforeSuite(func() { memberReservedNSName3: {}, }, }, - Logger: textlogger.NewLogger(textlogger.NewConfig(textlogger.Verbosity(4))), + Logger: textlogger.NewLogger(textlogger.NewConfig(textlogger.Verbosity(4))), + GracefulShutdownTimeout: &gracefulShutdownTimeout, }) Expect(err).ToNot(HaveOccurred()) @@ -431,7 +435,8 @@ var _ = BeforeSuite(func() { memberReservedNSName4: {}, }, }, - Logger: textlogger.NewLogger(textlogger.NewConfig(textlogger.Verbosity(4))), + Logger: textlogger.NewLogger(textlogger.NewConfig(textlogger.Verbosity(4))), + GracefulShutdownTimeout: &gracefulShutdownTimeout, }) Expect(err).ToNot(HaveOccurred()) diff 
--git a/test/e2e/utils_test.go b/test/e2e/utils_test.go index 1a822b81d..6ea6c3c36 100644 --- a/test/e2e/utils_test.go +++ b/test/e2e/utils_test.go @@ -339,7 +339,10 @@ func checkIfAzurePropertyProviderIsWorking() { // Check the cost properties separately. // - // The test suite consider cost outputs with a margin of no more than 0.002 to be acceptable. + // The test suite considers cost outputs with a margin of no more than 0.005 to be acceptable. + // Note: a slightly wider margin (0.005 vs the original 0.002) is used to account for + // transient pricing fluctuations in the Azure Retail Prices API, which can cause + // boundary failures when the actual diff exactly equals the tolerance threshold. perCPUCoreCostProperty, found := mcObj.Status.Properties[azure.PerCPUCoreCostProperty] wantPerCPUCoreCostProperty, wantFound := wantStatus.Properties[azure.PerCPUCoreCostProperty] if found != wantFound { @@ -350,7 +353,7 @@ func checkIfAzurePropertyProviderIsWorking() { if err != nil || wantErr != nil { return fmt.Errorf("failed to parse per CPU core cost property: val=%s, err=%w, wantVal=%s, wantErr=%w", perCPUCoreCostProperty.Value, err, wantPerCPUCoreCostProperty.Value, wantErr) } - if diff := math.Abs(perCPUCoreCost - wantPerCPUCoreCost); diff > 0.002 { + if diff := math.Abs(perCPUCoreCost - wantPerCPUCoreCost); diff > 0.005 { return fmt.Errorf("member cluster per CPU core cost property diff: got=%f, want=%f, diff=%f", perCPUCoreCost, wantPerCPUCoreCost, diff) } @@ -364,7 +367,7 @@ func checkIfAzurePropertyProviderIsWorking() { if err != nil || wantErr != nil { return fmt.Errorf("failed to parse per GB memory cost property: val=%s, err=%w, wantVal=%s, wantErr=%w", perGBMemoryCostProperty.Value, err, wantPerGBMemoryCostProperty.Value, wantErr) } - if diff := math.Abs(perGBMemoryCost - wantPerGBMemoryCost); diff > 0.002 { + if diff := math.Abs(perGBMemoryCost - wantPerGBMemoryCost); diff > 0.005 { return fmt.Errorf("member cluster per GB memory cost property diff: 
got=%f, want=%f, diff=%f", perGBMemoryCost, wantPerGBMemoryCost, diff) }