From ac8fe6a011498588e151deb72f23e4ed5c99f277 Mon Sep 17 00:00:00 2001 From: Steve Goodwin Date: Tue, 30 Jun 2026 11:31:45 -0400 Subject: [PATCH] OCPBUGS-67134: add grace period before reporting Available=False The console operator immediately reports Available=False when deployment replicas drop to zero, even during brief disruptions (~10s) that self-recover. Add a 2-minute grace period that suppresses the condition when the deployment was recently available, preventing false alarms during disruptive CI tests while still reporting genuine outages. Co-Authored-By: Claude Opus 4.6 --- pkg/console/operator/operator.go | 2 + pkg/console/operator/sync_v400.go | 35 +++++-- pkg/console/operator/sync_v400_test.go | 140 +++++++++++++++++++++++++ 3 files changed, 170 insertions(+), 7 deletions(-) diff --git a/pkg/console/operator/operator.go b/pkg/console/operator/operator.go index 5ae748af7..2c30f6ce2 100644 --- a/pkg/console/operator/operator.go +++ b/pkg/console/operator/operator.go @@ -94,6 +94,8 @@ type consoleOperator struct { trackables trackables monitoringDeploymentLister appsv1listers.DeploymentLister + + lastDeploymentAvailableTime time.Time } type trackables struct { diff --git a/pkg/console/operator/sync_v400.go b/pkg/console/operator/sync_v400.go index b0b1edb54..35a673831 100644 --- a/pkg/console/operator/sync_v400.go +++ b/pkg/console/operator/sync_v400.go @@ -8,6 +8,7 @@ import ( "slices" "sort" "strings" + "time" // kube appsv1 "k8s.io/api/apps/v1" @@ -46,6 +47,13 @@ import ( telemetry "github.com/openshift/console-operator/pkg/console/telemetry" ) +// deploymentAvailableGracePeriod is the duration the operator tolerates zero +// available replicas before reporting DeploymentAvailable=False. This absorbs +// brief disruptions (e.g. node reboots during conformance-serial tests) that +// take all replicas offline for ~10 seconds. Matches the Degraded inertia +// duration in library-go. +const deploymentAvailableGracePeriod = 2 * time.Minute + // The sync loop starts from zero and works its way through the requirements for a running console. // If at any point something is missing, it creates/updates that piece and immediately dies. // The next loop will pick up where they previous left off and move the process forward one step. @@ -221,13 +229,7 @@ func (co *consoleOperator) sync_v400(ctx context.Context, controllerContext fact return nil }())) - statusHandler.AddCondition(status.HandleAvailable(func() (prefix string, reason string, err error) { - prefix = "Deployment" - if !deploymentsub.IsAvailable(actualDeployment) { - return prefix, "InsufficientReplicas", fmt.Errorf("%v replicas available for console deployment", actualDeployment.Status.ReadyReplicas) - } - return prefix, "", nil - }())) + statusHandler.AddCondition(status.HandleAvailable(co.evaluateDeploymentAvailability(actualDeployment))) // if we survive the gauntlet, we need to update the console config with the // public hostname so that the world can know the console is ready to roll @@ -338,6 +340,25 @@ func (co *consoleOperator) SyncDeployment( return deployment, "", nil } +// evaluateDeploymentAvailability checks whether the console deployment should +// be reported as available. When replicas drop to zero during a transient +// disruption (e.g. node reboot in conformance-serial tests), the operator +// suppresses Available=False for up to deploymentAvailableGracePeriod to +// avoid alarming ClusterOperator condition blips. +func (co *consoleOperator) evaluateDeploymentAvailability(deployment *appsv1.Deployment) (prefix string, reason string, err error) { + prefix = "Deployment" + if deploymentsub.IsAvailable(deployment) { + co.lastDeploymentAvailableTime = time.Now() + return prefix, "", nil + } + if sinceLast := time.Since(co.lastDeploymentAvailableTime); !co.lastDeploymentAvailableTime.IsZero() && sinceLast <= deploymentAvailableGracePeriod { + klog.V(4).Infof("deployment has 0 available replicas but was available %v ago, within %v grace period", + sinceLast, deploymentAvailableGracePeriod) + return prefix, "", nil + } + return prefix, "InsufficientReplicas", fmt.Errorf("%v replicas available for console deployment", deployment.Status.ReadyReplicas) +} + // apply configmap (needs route) // by the time we get to the configmap, we can assume the route exits & is configured properly // therefore no additional error handling is needed here. diff --git a/pkg/console/operator/sync_v400_test.go b/pkg/console/operator/sync_v400_test.go index 985945f30..74ba23c2c 100644 --- a/pkg/console/operator/sync_v400_test.go +++ b/pkg/console/operator/sync_v400_test.go @@ -5,6 +5,7 @@ import ( "encoding/json" "sort" "testing" + "time" "github.com/go-test/deep" @@ -539,3 +540,142 @@ func TestGetTelemetryConfiguration_DisconnectedClusterNoError(t *testing.T) { t.Error("expected ACCOUNT_MAIL key to be present even on disconnected cluster") } } + +// TestEvaluateDeploymentAvailability tests the grace period logic for +// OCPBUGS-67134: the operator should suppress brief Available=False blips +// when all replicas are temporarily offline during disruptive operations. +func TestEvaluateDeploymentAvailability(t *testing.T) { + makeDeployment := func(availableReplicas int32) *appsv1.Deployment { + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "console", + Namespace: "openshift-console", + }, + Status: appsv1.DeploymentStatus{ + AvailableReplicas: availableReplicas, + ReadyReplicas: availableReplicas, + }, + } + } + + t.Run("available deployment reports Available=True and updates timestamp", func(t *testing.T) { + co := &consoleOperator{} + deployment := makeDeployment(2) + + prefix, reason, err := co.evaluateDeploymentAvailability(deployment) + + if err != nil { + t.Errorf("expected no error, got: %v", err) + } + if reason != "" { + t.Errorf("expected empty reason, got: %q", reason) + } + if prefix != "Deployment" { + t.Errorf("expected prefix 'Deployment', got: %q", prefix) + } + if co.lastDeploymentAvailableTime.IsZero() { + t.Error("expected lastDeploymentAvailableTime to be set") + } + }) + + t.Run("unavailable with no prior availability reports Available=False immediately", func(t *testing.T) { + co := &consoleOperator{} + deployment := makeDeployment(0) + + _, reason, err := co.evaluateDeploymentAvailability(deployment) + + if err == nil { + t.Error("expected error, got nil") + } + if reason != "InsufficientReplicas" { + t.Errorf("expected reason 'InsufficientReplicas', got: %q", reason) + } + }) + + t.Run("unavailable within grace period reports Available=True (suppressed)", func(t *testing.T) { + co := &consoleOperator{ + lastDeploymentAvailableTime: time.Now().Add(-10 * time.Second), + } + deployment := makeDeployment(0) + + _, reason, err := co.evaluateDeploymentAvailability(deployment) + + if err != nil { + t.Errorf("expected no error within grace period, got: %v", err) + } + if reason != "" { + t.Errorf("expected empty reason within grace period, got: %q", reason) + } + }) + + t.Run("unavailable beyond grace period reports Available=False", func(t *testing.T) { + co := &consoleOperator{ + lastDeploymentAvailableTime: time.Now().Add(-3 * time.Minute), + } + deployment := makeDeployment(0) + + _, reason, err := co.evaluateDeploymentAvailability(deployment) + + if err == nil { + t.Error("expected error beyond grace period, got nil") + } + if reason != "InsufficientReplicas" { + t.Errorf("expected reason 'InsufficientReplicas', got: %q", reason) + } + }) + + t.Run("recovery after blip resets timestamp", func(t *testing.T) { + co := &consoleOperator{ + lastDeploymentAvailableTime: time.Now().Add(-30 * time.Second), + } + + // Simulate: was available, went to 0, then recovered + deployment := makeDeployment(0) + _, _, err := co.evaluateDeploymentAvailability(deployment) + if err != nil { + t.Error("expected suppression within grace period") + } + + // Recovery + deployment = makeDeployment(2) + before := co.lastDeploymentAvailableTime + _, _, err = co.evaluateDeploymentAvailability(deployment) + if err != nil { + t.Errorf("expected no error on recovery, got: %v", err) + } + if !co.lastDeploymentAvailableTime.After(before) { + t.Error("expected lastDeploymentAvailableTime to be updated on recovery") + } + }) + + t.Run("unavailable just inside grace period boundary reports Available=True", func(t *testing.T) { + co := &consoleOperator{ + lastDeploymentAvailableTime: time.Now().Add(-deploymentAvailableGracePeriod + time.Second), + } + deployment := makeDeployment(0) + + _, reason, err := co.evaluateDeploymentAvailability(deployment) + + if err != nil { + t.Errorf("expected no error just inside grace period, got: %v", err) + } + if reason != "" { + t.Errorf("expected empty reason just inside grace period, got: %q", reason) + } + }) + + t.Run("error message includes ready replica count", func(t *testing.T) { + co := &consoleOperator{} + deployment := makeDeployment(0) + + _, _, err := co.evaluateDeploymentAvailability(deployment) + + if err == nil { + t.Fatal("expected error, got nil") + } + expected := "0 replicas available for console deployment" + if err.Error() != expected { + t.Errorf("expected error message %q, got %q", expected, err.Error()) + } + }) +}