diff --git a/.changes/unreleased/operator-Fixed-20260520-110000.yaml b/.changes/unreleased/operator-Fixed-20260520-110000.yaml new file mode 100644 index 000000000..b2829d9ad --- /dev/null +++ b/.changes/unreleased/operator-Fixed-20260520-110000.yaml @@ -0,0 +1,12 @@ +project: operator +kind: Fixed +body: | + Closed a race during rolling restart that could cause two broker pods to be + unavailable simultaneously. After the operator deletes pod A, the new pod A + can appear in the cache with the latest StatefulSet revision before Redpanda + detects broker A's departure, so isHealthy remains true and the operator + immediately deletes pod B. The rolling loop now defers (requeues) when any + pod that already carries the latest StatefulSet revision is not yet + Running+Ready, or any pod is still terminating — narrower than "all pods + ready" so pods unhealthy for unrelated reasons don't block the roll. +time: 2026-05-20T11:00:00.000000-07:00 diff --git a/operator/internal/controller/redpanda/redpanda_controller.go b/operator/internal/controller/redpanda/redpanda_controller.go index 9ece4335e..67d4cdb58 100644 --- a/operator/internal/controller/redpanda/redpanda_controller.go +++ b/operator/internal/controller/redpanda/redpanda_controller.go @@ -496,6 +496,18 @@ func (r *RedpandaReconciler) reconcileDecommission(ctx context.Context, state *c // finally, we make sure we roll every pod that is not in-sync with its statefulset rollSet := state.pools.PodsToRoll() + + // Don't start rolling while a recently replaced pod is still coming up. + // The cluster health view (brokerMap, isHealthy) lags behind pod state, + // and rolling a second pod before the first one's replacement is ready + // would cause two pods to be unavailable simultaneously. + // Only check when there are actually pods to roll — otherwise we'd block + // normal reconciliation when a pod is unready for unrelated reasons. + if len(rollSet) > 0 && state.pools.HasRecentlyReplacedPods() { + logger.V(log.DebugLevel).Info("recently replaced pods not ready, deferring rolling restart") + return ctrl.Result{RequeueAfter: requeueTimeout}, nil + } + rolled := false for _, pod := range rollSet { shouldRoll, continueExecution := false, false diff --git a/operator/internal/lifecycle/pool.go b/operator/internal/lifecycle/pool.go index 167f0c418..633d8b558 100644 --- a/operator/internal/lifecycle/pool.go +++ b/operator/internal/lifecycle/pool.go @@ -303,6 +303,52 @@ func (p *PoolTracker) ToDelete() []*appsv1.StatefulSet { return sortByName(sets) } +// HasRecentlyReplacedPods returns true if any existing pod has the latest +// revision (i.e., was recently recreated during a rolling restart) but is not +// yet fully ready, or if any pod is still terminating. This is a narrower +// check than "all pods ready" — it specifically targets the window between a +// pod replacement and the new pod's readiness probe passing, during which the +// Redpanda health API has not yet caught up. +func (p *PoolTracker) HasRecentlyReplacedPods() bool { + for _, pool := range p.existingPools { + latestRevision := "" + if len(pool.revisions) > 0 { + latestRevision = pool.revisions[len(pool.revisions)-1].Name + } + + for _, withOrdinals := range pool.pods { + pod := withOrdinals.pod + + // A pod being deleted is always a reason to wait. + if pod.DeletionTimestamp != nil { + return true + } + + // Only check pods that already have the latest revision — these + // are the ones that were just replaced by a prior roll pass. + if latestRevision == "" || pod.Labels[appsv1.StatefulSetRevisionLabel] != latestRevision { + continue + } + + if pod.Status.Phase != corev1.PodRunning { + return true + } + + ready := false + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue { + ready = true + break + } + } + if !ready { + return true + } + } + } + return false +} + // PodsToRoll returns a list of pods that need to be rolled // because their association ControllerRevision does not match // the latest applied to the StatefulSet.