From ac8fe6a011498588e151deb72f23e4ed5c99f277 Mon Sep 17 00:00:00 2001
From: Steve Goodwin <sgoodwin@redhat.com>
Date: Tue, 30 Jun 2026 11:31:45 -0400
Subject: [PATCH] OCPBUGS-67134: add grace period before reporting
 Available=False

The console operator immediately reports Available=False when
deployment replicas drop to zero, even during brief disruptions
(~10s) that self-recover. Add a 2-minute grace period that
suppresses the condition when the deployment was recently available,
preventing false alarms during disruptive CI tests while still
reporting genuine outages.

Co-Authored-By: Claude Opus 4.6
---
 pkg/console/operator/operator.go       |   2 +
 pkg/console/operator/sync_v400.go      |  35 +++++--
 pkg/console/operator/sync_v400_test.go | 140 +++++++++++++++++++++++++
 3 files changed, 170 insertions(+), 7 deletions(-)

diff --git a/pkg/console/operator/operator.go b/pkg/console/operator/operator.go
index 5ae748af7..2c30f6ce2 100644
--- a/pkg/console/operator/operator.go
+++ b/pkg/console/operator/operator.go
@@ -94,6 +94,8 @@ type consoleOperator struct {
 	trackables trackables
 
 	monitoringDeploymentLister appsv1listers.DeploymentLister
+
+	lastDeploymentAvailableTime time.Time
 }
 
 type trackables struct {
diff --git a/pkg/console/operator/sync_v400.go b/pkg/console/operator/sync_v400.go
index b0b1edb54..35a673831 100644
--- a/pkg/console/operator/sync_v400.go
+++ b/pkg/console/operator/sync_v400.go
@@ -8,6 +8,7 @@ import (
 	"slices"
 	"sort"
 	"strings"
+	"time"
 
 	// kube
 	appsv1 "k8s.io/api/apps/v1"
@@ -46,6 +47,13 @@ import (
 	telemetry "github.com/openshift/console-operator/pkg/console/telemetry"
 )
 
+// deploymentAvailableGracePeriod is the duration the operator tolerates zero
+// available replicas before reporting DeploymentAvailable=False. This absorbs
+// brief disruptions (e.g. node reboots during conformance-serial tests) that
+// take all replicas offline for ~10 seconds. Matches the Degraded inertia
+// duration in library-go.
+const deploymentAvailableGracePeriod = 2 * time.Minute
+
 // The sync loop starts from zero and works its way through the requirements for a running console.
 // If at any point something is missing, it creates/updates that piece and immediately dies.
 // The next loop will pick up where they previous left off and move the process forward one step.
@@ -221,13 +229,7 @@ func (co *consoleOperator) sync_v400(ctx context.Context, controllerContext fact
 		return nil
 	}()))
 
-	statusHandler.AddCondition(status.HandleAvailable(func() (prefix string, reason string, err error) {
-		prefix = "Deployment"
-		if !deploymentsub.IsAvailable(actualDeployment) {
-			return prefix, "InsufficientReplicas", fmt.Errorf("%v replicas available for console deployment", actualDeployment.Status.ReadyReplicas)
-		}
-		return prefix, "", nil
-	}()))
+	statusHandler.AddCondition(status.HandleAvailable(co.evaluateDeploymentAvailability(actualDeployment)))
 
 	// if we survive the gauntlet, we need to update the console config with the
 	// public hostname so that the world can know the console is ready to roll
@@ -338,6 +340,25 @@ func (co *consoleOperator) SyncDeployment(
 	return deployment, "", nil
 }
 
+// evaluateDeploymentAvailability checks whether the console deployment should
+// be reported as available. When replicas drop to zero during a transient
+// disruption (e.g. node reboot in conformance-serial tests), the operator
+// suppresses Available=False for up to deploymentAvailableGracePeriod to
+// avoid alarming ClusterOperator condition blips.
+func (co *consoleOperator) evaluateDeploymentAvailability(deployment *appsv1.Deployment) (prefix string, reason string, err error) {
+	prefix = "Deployment"
+	if deploymentsub.IsAvailable(deployment) {
+		co.lastDeploymentAvailableTime = time.Now()
+		return prefix, "", nil
+	}
+	if sinceLast := time.Since(co.lastDeploymentAvailableTime); !co.lastDeploymentAvailableTime.IsZero() && sinceLast <= deploymentAvailableGracePeriod {
+		klog.V(4).Infof("deployment has 0 available replicas but was available %v ago, within %v grace period",
+			sinceLast, deploymentAvailableGracePeriod)
+		return prefix, "", nil
+	}
+	return prefix, "InsufficientReplicas", fmt.Errorf("%v replicas available for console deployment", deployment.Status.ReadyReplicas)
+}
+
 // apply configmap (needs route)
 // by the time we get to the configmap, we can assume the route exits & is configured properly
 // therefore no additional error handling is needed here.
diff --git a/pkg/console/operator/sync_v400_test.go b/pkg/console/operator/sync_v400_test.go
index 985945f30..74ba23c2c 100644
--- a/pkg/console/operator/sync_v400_test.go
+++ b/pkg/console/operator/sync_v400_test.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"sort"
 	"testing"
+	"time"
 
 	"github.com/go-test/deep"
 
@@ -539,3 +540,142 @@ func TestGetTelemetryConfiguration_DisconnectedClusterNoError(t *testing.T) {
 		t.Error("expected ACCOUNT_MAIL key to be present even on disconnected cluster")
 	}
 }
+
+// TestEvaluateDeploymentAvailability tests the grace period logic for
+// OCPBUGS-67134: the operator should suppress brief Available=False blips
+// when all replicas are temporarily offline during disruptive operations.
+func TestEvaluateDeploymentAvailability(t *testing.T) {
+	makeDeployment := func(availableReplicas int32) *appsv1.Deployment {
+		return &appsv1.Deployment{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "console",
+				Namespace: "openshift-console",
+			},
+			Status: appsv1.DeploymentStatus{
+				AvailableReplicas: availableReplicas,
+				ReadyReplicas:     availableReplicas,
+			},
+		}
+	}
+
+	t.Run("available deployment reports Available=True and updates timestamp", func(t *testing.T) {
+		co := &consoleOperator{}
+		deployment := makeDeployment(2)
+
+		prefix, reason, err := co.evaluateDeploymentAvailability(deployment)
+
+		if err != nil {
+			t.Errorf("expected no error, got: %v", err)
+		}
+		if reason != "" {
+			t.Errorf("expected empty reason, got: %q", reason)
+		}
+		if prefix != "Deployment" {
+			t.Errorf("expected prefix 'Deployment', got: %q", prefix)
+		}
+		if co.lastDeploymentAvailableTime.IsZero() {
+			t.Error("expected lastDeploymentAvailableTime to be set")
+		}
+	})
+
+	t.Run("unavailable with no prior availability reports Available=False immediately", func(t *testing.T) {
+		co := &consoleOperator{}
+		deployment := makeDeployment(0)
+
+		_, reason, err := co.evaluateDeploymentAvailability(deployment)
+
+		if err == nil {
+			t.Error("expected error, got nil")
+		}
+		if reason != "InsufficientReplicas" {
+			t.Errorf("expected reason 'InsufficientReplicas', got: %q", reason)
+		}
+	})
+
+	t.Run("unavailable within grace period reports Available=True (suppressed)", func(t *testing.T) {
+		co := &consoleOperator{
+			lastDeploymentAvailableTime: time.Now().Add(-10 * time.Second),
+		}
+		deployment := makeDeployment(0)
+
+		_, reason, err := co.evaluateDeploymentAvailability(deployment)
+
+		if err != nil {
+			t.Errorf("expected no error within grace period, got: %v", err)
+		}
+		if reason != "" {
+			t.Errorf("expected empty reason within grace period, got: %q", reason)
+		}
+	})
+
+	t.Run("unavailable beyond grace period reports Available=False", func(t *testing.T) {
+		co := &consoleOperator{
+			lastDeploymentAvailableTime: time.Now().Add(-3 * time.Minute),
+		}
+		deployment := makeDeployment(0)
+
+		_, reason, err := co.evaluateDeploymentAvailability(deployment)
+
+		if err == nil {
+			t.Error("expected error beyond grace period, got nil")
+		}
+		if reason != "InsufficientReplicas" {
+			t.Errorf("expected reason 'InsufficientReplicas', got: %q", reason)
+		}
+	})
+
+	t.Run("recovery after blip resets timestamp", func(t *testing.T) {
+		co := &consoleOperator{
+			lastDeploymentAvailableTime: time.Now().Add(-30 * time.Second),
+		}
+
+		// Simulate: was available, went to 0, then recovered
+		deployment := makeDeployment(0)
+		_, _, err := co.evaluateDeploymentAvailability(deployment)
+		if err != nil {
+			t.Error("expected suppression within grace period")
+		}
+
+		// Recovery
+		deployment = makeDeployment(2)
+		before := co.lastDeploymentAvailableTime
+		_, _, err = co.evaluateDeploymentAvailability(deployment)
+		if err != nil {
+			t.Errorf("expected no error on recovery, got: %v", err)
+		}
+		if !co.lastDeploymentAvailableTime.After(before) {
+			t.Error("expected lastDeploymentAvailableTime to be updated on recovery")
+		}
+	})
+
+	t.Run("unavailable just inside grace period boundary reports Available=True", func(t *testing.T) {
+		co := &consoleOperator{
+			lastDeploymentAvailableTime: time.Now().Add(-deploymentAvailableGracePeriod + time.Second),
+		}
+		deployment := makeDeployment(0)
+
+		_, reason, err := co.evaluateDeploymentAvailability(deployment)
+
+		if err != nil {
+			t.Errorf("expected no error just inside grace period, got: %v", err)
+		}
+		if reason != "" {
+			t.Errorf("expected empty reason just inside grace period, got: %q", reason)
+		}
+	})
+
+	t.Run("error message includes ready replica count", func(t *testing.T) {
+		co := &consoleOperator{}
+		deployment := makeDeployment(0)
+
+		_, _, err := co.evaluateDeploymentAvailability(deployment)
+
+		if err == nil {
+			t.Fatal("expected error, got nil")
+		}
+		expected := "0 replicas available for console deployment"
+		if err.Error() != expected {
+			t.Errorf("expected error message %q, got %q", expected, err.Error())
+		}
+	})
+}