From c72d8bfd39550eff2df60cb21cad4fba510ce77b Mon Sep 17 00:00:00 2001 From: David Yu Date: Wed, 20 May 2026 12:04:37 -0700 Subject: [PATCH] operator: defer rolling restart while a recently replaced pod is still coming up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During a rolling restart the operator deletes one pod at a time and waits for the cluster to report healthy before proceeding. In practice the Redpanda health API lags behind pod state: after deleting pod A, the new pod A can appear in the cache with the correct revision before Redpanda detects broker A's departure, so isHealthy remains true and the operator immediately deletes pod B — causing two pods to be unavailable simultaneously. HasRecentlyReplacedPods in pool.go closes this race. Before entering the rolling loop (and only when there are pods to roll), it checks whether any pod that already carries the latest StatefulSet revision is not yet Running+Ready, or any pod is still terminating. If so, the reconciler defers with a requeue rather than proceeding to the next deletion. This is narrower than "all pods ready" — pods with the old revision that happen to be unhealthy for unrelated reasons do not block the roll. Backport of the guard introduced on main in #1446. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../operator-Fixed-20260520-110000.yaml | 12 +++++ .../redpanda/redpanda_controller.go | 12 +++++ operator/internal/lifecycle/pool.go | 46 +++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 .changes/unreleased/operator-Fixed-20260520-110000.yaml diff --git a/.changes/unreleased/operator-Fixed-20260520-110000.yaml b/.changes/unreleased/operator-Fixed-20260520-110000.yaml new file mode 100644 index 000000000..b2829d9ad --- /dev/null +++ b/.changes/unreleased/operator-Fixed-20260520-110000.yaml @@ -0,0 +1,12 @@ +project: operator +kind: Fixed +body: | + Closed a race during rolling restart that could cause two broker pods to be + unavailable simultaneously. After the operator deletes pod A, the new pod A + can appear in the cache with the latest StatefulSet revision before Redpanda + detects broker A's departure, so isHealthy remains true and the operator + immediately deletes pod B. The rolling loop now defers (requeues) when any + pod that already carries the latest StatefulSet revision is not yet + Running+Ready, or any pod is still terminating — narrower than "all pods + ready" so pods unhealthy for unrelated reasons don't block the roll. +time: 2026-05-20T11:00:00.000000-07:00 diff --git a/operator/internal/controller/redpanda/redpanda_controller.go b/operator/internal/controller/redpanda/redpanda_controller.go index 9ece4335e..67d4cdb58 100644 --- a/operator/internal/controller/redpanda/redpanda_controller.go +++ b/operator/internal/controller/redpanda/redpanda_controller.go @@ -496,6 +496,18 @@ func (r *RedpandaReconciler) reconcileDecommission(ctx context.Context, state *c // finally, we make sure we roll every pod that is not in-sync with its statefulset rollSet := state.pools.PodsToRoll() + + // Don't start rolling while a recently replaced pod is still coming up. + // The cluster health view (brokerMap, isHealthy) lags behind pod state, + // and rolling a second pod before the first one's replacement is ready + // would cause two pods to be unavailable simultaneously. + // Only check when there are actually pods to roll — otherwise we'd block + // normal reconciliation when a pod is unready for unrelated reasons. + if len(rollSet) > 0 && state.pools.HasRecentlyReplacedPods() { + logger.V(log.DebugLevel).Info("recently replaced pods not ready, deferring rolling restart") + return ctrl.Result{RequeueAfter: requeueTimeout}, nil + } + rolled := false for _, pod := range rollSet { shouldRoll, continueExecution := false, false diff --git a/operator/internal/lifecycle/pool.go b/operator/internal/lifecycle/pool.go index 167f0c418..633d8b558 100644 --- a/operator/internal/lifecycle/pool.go +++ b/operator/internal/lifecycle/pool.go @@ -303,6 +303,52 @@ func (p *PoolTracker) ToDelete() []*appsv1.StatefulSet { return sortByName(sets) } +// HasRecentlyReplacedPods returns true if any existing pod has the latest +// revision (i.e., was recently recreated during a rolling restart) but is not +// yet fully ready, or if any pod is still terminating. This is a narrower +// check than "all pods ready" — it specifically targets the window between a +// pod replacement and the new pod's readiness probe passing, during which the +// Redpanda health API has not yet caught up. +func (p *PoolTracker) HasRecentlyReplacedPods() bool { + for _, pool := range p.existingPools { + latestRevision := "" + if len(pool.revisions) > 0 { + latestRevision = pool.revisions[len(pool.revisions)-1].Name + } + + for _, withOrdinals := range pool.pods { + pod := withOrdinals.pod + + // A pod being deleted is always a reason to wait. + if pod.DeletionTimestamp != nil { + return true + } + + // Only check pods that already have the latest revision — these + // are the ones that were just replaced by a prior roll pass. + if latestRevision == "" || pod.Labels[appsv1.StatefulSetRevisionLabel] != latestRevision { + continue + } + + if pod.Status.Phase != corev1.PodRunning { + return true + } + + ready := false + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue { + ready = true + break + } + } + if !ready { + return true + } + } + } + return false +} + // PodsToRoll returns a list of pods that need to be rolled // because their association ControllerRevision does not match // the latest applied to the StatefulSet.