From c72d8bfd39550eff2df60cb21cad4fba510ce77b Mon Sep 17 00:00:00 2001
From: David Yu <david.yu@redpanda.com>
Date: Wed, 20 May 2026 12:04:37 -0700
Subject: [PATCH] operator: defer rolling restart while a recently replaced pod
 is still coming up
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During a rolling restart the operator deletes one pod at a time and waits
for the cluster to report healthy before proceeding. In practice the
Redpanda health API lags behind pod state: after deleting pod A, the new
pod A can appear in the cache with the correct revision before Redpanda
detects broker A's departure, so isHealthy remains true and the operator
immediately deletes pod B — causing two pods to be unavailable
simultaneously.

HasRecentlyReplacedPods in pool.go closes this race. Before entering the
rolling loop (and only when there are pods to roll), it checks whether
any pod that already carries the latest StatefulSet revision is not yet
Running+Ready, or any pod is still terminating. If so, the reconciler
defers with a requeue rather than proceeding to the next deletion. This
is narrower than "all pods ready" — pods with the old revision that
happen to be unhealthy for unrelated reasons do not block the roll.

Backport of the guard introduced on main in #1446.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../operator-Fixed-20260520-110000.yaml       | 12 +++++
 .../redpanda/redpanda_controller.go           | 12 +++++
 operator/internal/lifecycle/pool.go           | 46 +++++++++++++++++++
 3 files changed, 70 insertions(+)
 create mode 100644 .changes/unreleased/operator-Fixed-20260520-110000.yaml

diff --git a/.changes/unreleased/operator-Fixed-20260520-110000.yaml b/.changes/unreleased/operator-Fixed-20260520-110000.yaml
new file mode 100644
index 000000000..b2829d9ad
--- /dev/null
+++ b/.changes/unreleased/operator-Fixed-20260520-110000.yaml
@@ -0,0 +1,12 @@
+project: operator
+kind: Fixed
+body: |
+  Closed a race during rolling restart that could cause two broker pods to be
+  unavailable simultaneously. After the operator deletes pod A, the new pod A
+  can appear in the cache with the latest StatefulSet revision before Redpanda
+  detects broker A's departure, so isHealthy remains true and the operator
+  immediately deletes pod B. The rolling loop now defers (requeues) when any
+  pod that already carries the latest StatefulSet revision is not yet
+  Running+Ready, or any pod is still terminating — narrower than "all pods
+  ready" so pods unhealthy for unrelated reasons don't block the roll.
+time: 2026-05-20T11:00:00.000000-07:00
diff --git a/operator/internal/controller/redpanda/redpanda_controller.go b/operator/internal/controller/redpanda/redpanda_controller.go
index 9ece4335e..67d4cdb58 100644
--- a/operator/internal/controller/redpanda/redpanda_controller.go
+++ b/operator/internal/controller/redpanda/redpanda_controller.go
@@ -496,6 +496,18 @@ func (r *RedpandaReconciler) reconcileDecommission(ctx context.Context, state *c
 
 	// finally, we make sure we roll every pod that is not in-sync with its statefulset
 	rollSet := state.pools.PodsToRoll()
+
+	// Don't start rolling while a recently replaced pod is still coming up.
+	// The cluster health view (brokerMap, isHealthy) lags behind pod state,
+	// and rolling a second pod before the first one's replacement is ready
+	// would cause two pods to be unavailable simultaneously.
+	// Only check when there are actually pods to roll — otherwise we'd block
+	// normal reconciliation when a pod is unready for unrelated reasons.
+	if len(rollSet) > 0 && state.pools.HasRecentlyReplacedPods() {
+		logger.V(log.DebugLevel).Info("recently replaced pods not ready, deferring rolling restart")
+		return ctrl.Result{RequeueAfter: requeueTimeout}, nil
+	}
+
 	rolled := false
 	for _, pod := range rollSet {
 		shouldRoll, continueExecution := false, false
diff --git a/operator/internal/lifecycle/pool.go b/operator/internal/lifecycle/pool.go
index 167f0c418..633d8b558 100644
--- a/operator/internal/lifecycle/pool.go
+++ b/operator/internal/lifecycle/pool.go
@@ -303,6 +303,52 @@ func (p *PoolTracker) ToDelete() []*appsv1.StatefulSet {
 	return sortByName(sets)
 }
 
+// HasRecentlyReplacedPods returns true if any existing pod has the latest
+// revision (i.e., was recently recreated during a rolling restart) but is not
+// yet fully ready, or if any pod is still terminating. This is a narrower
+// check than "all pods ready" — it specifically targets the window between a
+// pod replacement and the new pod's readiness probe passing, during which the
+// Redpanda health API has not yet caught up.
+func (p *PoolTracker) HasRecentlyReplacedPods() bool {
+	for _, pool := range p.existingPools {
+		latestRevision := ""
+		if len(pool.revisions) > 0 {
+			latestRevision = pool.revisions[len(pool.revisions)-1].Name
+		}
+
+		for _, withOrdinals := range pool.pods {
+			pod := withOrdinals.pod
+
+			// A pod being deleted is always a reason to wait.
+			if pod.DeletionTimestamp != nil {
+				return true
+			}
+
+			// Only check pods that already have the latest revision — these
+			// are the ones that were just replaced by a prior roll pass.
+			if latestRevision == "" || pod.Labels[appsv1.StatefulSetRevisionLabel] != latestRevision {
+				continue
+			}
+
+			if pod.Status.Phase != corev1.PodRunning {
+				return true
+			}
+
+			ready := false
+			for _, cond := range pod.Status.Conditions {
+				if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
+					ready = true
+					break
+				}
+			}
+			if !ready {
+				return true
+			}
+		}
+	}
+	return false
+}
+
 // PodsToRoll returns a list of pods that need to be rolled
 // because their association ControllerRevision does not match
 // the latest applied to the StatefulSet.