Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .changes/unreleased/operator-Fixed-20260520-110000.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
project: operator
kind: Fixed
body: |
Closed a race during rolling restart that could cause two broker pods to be
unavailable simultaneously. After the operator deletes pod A, the new pod A
can appear in the cache with the latest StatefulSet revision before Redpanda
detects broker A's departure, so isHealthy remains true and the operator
immediately deletes pod B. The rolling loop now defers (requeues) when any
pod that already carries the latest StatefulSet revision is not yet
Running+Ready, or any pod is still terminating — narrower than "all pods
ready" so pods unhealthy for unrelated reasons don't block the roll.
time: 2026-05-20T11:00:00.000000-07:00
12 changes: 12 additions & 0 deletions operator/internal/controller/redpanda/redpanda_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,18 @@ func (r *RedpandaReconciler) reconcileDecommission(ctx context.Context, state *c

// finally, we make sure we roll every pod that is not in-sync with its statefulset
rollSet := state.pools.PodsToRoll()

// Don't start rolling while a recently replaced pod is still coming up.
// The cluster health view (brokerMap, isHealthy) lags behind pod state,
// and rolling a second pod before the first one's replacement is ready
// would cause two pods to be unavailable simultaneously.
// Only check when there are actually pods to roll — otherwise we'd block
// normal reconciliation when a pod is unready for unrelated reasons.
if len(rollSet) > 0 && state.pools.HasRecentlyReplacedPods() {
logger.V(log.DebugLevel).Info("recently replaced pods not ready, deferring rolling restart")
return ctrl.Result{RequeueAfter: requeueTimeout}, nil
}

rolled := false
for _, pod := range rollSet {
shouldRoll, continueExecution := false, false
Expand Down
46 changes: 46 additions & 0 deletions operator/internal/lifecycle/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,52 @@ func (p *PoolTracker) ToDelete() []*appsv1.StatefulSet {
return sortByName(sets)
}

// HasRecentlyReplacedPods reports whether the rolling restart should hold off
// because some pod in an existing pool is still in flux. It returns true as
// soon as it finds either of the following:
//
//   - a pod with a non-nil DeletionTimestamp (it is still terminating), or
//   - a pod whose StatefulSet revision label equals the name of the last entry
//     in the pool's revisions (treated here as the latest revision) and that
//     is either not in the Running phase or has no PodReady condition with
//     status True.
//
// Pods on any other revision are ignored unless they are terminating, which
// makes this deliberately narrower than an "all pods ready" check. If a pool
// has no revisions, only the terminating check applies to its pods.
func (p *PoolTracker) HasRecentlyReplacedPods() bool {
	for _, pool := range p.existingPools {
		// Name of the newest known revision for this pool; stays empty when
		// the pool has no revisions recorded.
		var newest string
		if n := len(pool.revisions); n > 0 {
			newest = pool.revisions[n-1].Name
		}

		for _, entry := range pool.pods {
			candidate := entry.pod

			switch {
			case candidate.DeletionTimestamp != nil:
				// Terminating pods always block, regardless of revision.
				return true
			case newest == "" || candidate.Labels[appsv1.StatefulSetRevisionLabel] != newest:
				// Not on the newest revision: this pod is out of scope for
				// the readiness check below.
				continue
			case candidate.Status.Phase != corev1.PodRunning:
				return true
			}

			// The pod is on the newest revision and Running; it must also
			// report a PodReady condition with status True.
			isReady := false
			for _, condition := range candidate.Status.Conditions {
				if condition.Type != corev1.PodReady {
					continue
				}
				if condition.Status == corev1.ConditionTrue {
					isReady = true
					break
				}
			}
			if !isReady {
				return true
			}
		}
	}

	return false
}

// PodsToRoll returns a list of pods that need to be rolled
// because their association ControllerRevision does not match
// the latest applied to the StatefulSet.
Expand Down
Loading