From ba392a3ead9722d92d9e36d4920a83e7a5fe67a7 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Tue, 3 Mar 2026 17:02:22 -0500 Subject: [PATCH 1/4] Do not set Degraded=True on transient errors CloudConfigReconciler: gate transient errors behind a 2-minute window Three related fixes to stop upgrade-time API blips from immediately setting CloudConfigControllerDegraded=True: 1. Infrastructure NotFound now calls setAvailableCondition (nil return) instead of setDegradedCondition, matching the main controller's existing behaviour. 2. Errors are classified as transient (API blips: all Get/Create/Update calls, feature-gate informer not yet synced) or permanent (config problems that won't self-heal: nil platformStatus, unsupported platform, missing user config key, nil FeatureGateAccess, transform failure). 3. handleTransientError() only sets degraded after consecutiveFailureSince has been set for longer than transientDegradedThreshold (2 min); handleDegradeError() sets degraded immediately and returns nil so controller-runtime does not requeue (existing watches re-trigger when the underlying config changes). clearFailureWindow() is called at every successful reconcile exit. 
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Nolan Brubaker --- .../cloud_config_sync_controller.go | 133 ++++++++++-------- .../cloud_config_sync_controller_test.go | 49 ++++++- 2 files changed, 119 insertions(+), 63 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index dc50e4acf..30077cf66 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "reflect" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -29,42 +30,48 @@ const ( // Controller conditions for the Cluster Operator resource cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable" cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded" + + // transientDegradedThreshold is how long transient errors must persist before + // the controller sets CloudConfigControllerDegraded=True. This prevents brief + // API server blips during upgrades from immediately degrading the operator. 
+ transientDegradedThreshold = 2 * time.Minute ) type CloudConfigReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - FeatureGateAccess featuregates.FeatureGateAccess + Scheme *runtime.Scheme + FeatureGateAccess featuregates.FeatureGateAccess + consecutiveFailureSince *time.Time // nil when the last reconcile succeeded } func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { klog.V(1).Infof("Syncing cloud-conf ConfigMap") infra := &configv1.Infrastructure{} - if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil { - klog.Errorf("infrastructure resource not found") - if err := r.setDegradedCondition(ctx); err != nil { + if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { + // No cloud platform: mirror the main controller's behaviour of returning Available. + klog.Infof("Infrastructure cluster does not exist. Skipping...") + r.clearFailureWindow() + if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } - return ctrl.Result{}, err + return ctrl.Result{}, nil + } else if err != nil { + return r.handleTransientError(ctx, err) } network := &configv1.Network{} if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // nil 
platformStatus is a permanent misconfiguration. + return r.handleDegradeError(ctx, err) } if !syncNeeded { + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } @@ -74,11 +81,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus) if err != nil { + // Unsupported platform won't change without a cluster reconfigure. klog.Errorf("unable to get cloud config transformer function; unsupported platform") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleDegradeError(ctx, err) } sourceCM := &corev1.ConfigMap{} @@ -101,12 +106,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) managedConfigFound = true } else if errors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") - } else if err != nil { - klog.Errorf("unable to get managed cloud-config for sync") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + } else { + return r.handleTransientError(ctx, err) } } @@ -119,38 +120,26 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to default cloud config.") } else if err != nil { - klog.Errorf("unable to get cloud-config for sync: %v", err) - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for 
cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } } sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // User-supplied key mismatch: permanent until the ConfigMap or Infrastructure changes. + return r.handleDegradeError(ctx, err) } - // Check if FeatureGateAccess is configured if r.FeatureGateAccess == nil { - klog.Errorf("FeatureGateAccess is not configured") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, fmt.Errorf("FeatureGateAccess is not configured") + // Operator misconfiguration at startup: permanent. + return r.handleDegradeError(ctx, fmt.Errorf("FeatureGateAccess is not configured")) } features, err := r.FeatureGateAccess.CurrentFeatureGates() if err != nil { + // The feature-gate informer may not have synced yet: transient. klog.Errorf("unable to get feature gates: %v", err) - if errD := r.setDegradedCondition(ctx); errD != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", errD) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } if cloudConfigTransformerFn != nil { @@ -159,10 +148,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // we're not expecting users to put their data in the former. 
output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // Platform-specific transform failed on the current config data: permanent. + return r.handleDegradeError(ctx, err) } sourceCM.Data[defaultConfigKey] = output } @@ -175,16 +162,13 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // If the config does not exist, it will be created later, so we can ignore a Not Found error if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !errors.IsNotFound(err) { - klog.Errorf("unable to get target cloud-config for sync") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } // Note that the source config map is actually a *transformed* source config map if r.isCloudConfigEqual(sourceCM, targetCM) { klog.V(1).Infof("source and target cloud-config content are equal, no sync needed") + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } @@ -193,12 +177,10 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.syncCloudConfigData(ctx, sourceCM, targetCM); err != nil { klog.Errorf("unable to sync cloud config") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed 
to set conditions for cloud config controller: %v", err) } @@ -206,6 +188,45 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil } +// clearFailureWindow resets the transient-error tracking. Call this on every +// successful reconcile so the 2-minute window restarts fresh on the next failure. +func (r *CloudConfigReconciler) clearFailureWindow() { + r.consecutiveFailureSince = nil +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. +func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + now := r.Clock.Now() + if r.consecutiveFailureSince == nil { + r.consecutiveFailureSince = &now + klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) + return ctrl.Result{}, err + } + elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) + if elapsed < transientDegradedThreshold { + klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets CloudConfigControllerDegraded=True immediately and +// returns nil so controller-runtime does NOT requeue. An existing watch on the +// relevant resource will re-trigger reconciliation when the problem is fixed. 
+func (r *CloudConfigReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("CloudConfigReconciler: permanent error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, nil +} + func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) { if platformStatus == nil { return false, fmt.Errorf("platformStatus is required") diff --git a/pkg/controllers/cloud_config_sync_controller_test.go b/pkg/controllers/cloud_config_sync_controller_test.go index 513e944e3..5e1a6ce28 100644 --- a/pkg/controllers/cloud_config_sync_controller_test.go +++ b/pkg/controllers/cloud_config_sync_controller_test.go @@ -509,7 +509,7 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(len(allCMs.Items)).To(BeEquivalentTo(1)) }) - It("should error if a user-specified configmap key isn't present", func() { + It("should degrade immediately if a user-specified configmap key isn't present", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) infraResource.Spec.CloudConfig.Key = "notfound" Expect(cl.Create(ctx, infraResource)).To(Succeed()) @@ -518,8 +518,19 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(cl.Status().Update(ctx, infraResource.DeepCopy())).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).To(ContainSubstring("specified in infra resource does not exist in source configmap")) - + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == 
cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) }) It("should continue with reconcile when feature gates are available", func() { @@ -584,15 +595,39 @@ var _ = Describe("Cloud config sync reconciler", func() { }) }) - It("reconcile should fail if no infra resource found", func() { + It("reconcile should succeed and be available if no infra resource found", func() { _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("infrastructures.config.openshift.io \"cluster\" not found")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var availCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(configv1.ConditionTrue)) }) - It("should fail if no PlatformStatus in infra resource presented ", func() { + It("should degrade immediately if no PlatformStatus in infra resource", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) Expect(cl.Create(ctx, infraResource)).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("platformStatus is required")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + 
Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) }) }) From 165ff0a97f633b836bdc78d7d2e8a140e71257f0 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Wed, 4 Mar 2026 09:48:45 -0500 Subject: [PATCH 2/4] Handle transient errors for CA controller Mirror the CloudConfigReconciler pattern: transient API errors (Proxy get, system trust bundle read, ConfigMap write) are silently requeued and only set Degraded=True after transientDegradedThreshold (2 min) has elapsed. Errors that indicate corrupt cert data (merge failures) set Degraded=True immediately and return nil so controller-runtime does not requeue; existing watches re-trigger reconciliation when the data changes. Also adds two direct unit tests that verify the threshold gating via a fake clock, without running through the manager. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Nolan Brubaker --- .../trusted_ca_bundle_controller.go | 79 ++++++++++++----- .../trusted_ca_bundle_controller_test.go | 87 +++++++++++++++++++ 2 files changed, 142 insertions(+), 24 deletions(-) diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index ef33709e1..e6e788548 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -6,6 +6,7 @@ import ( "crypto/x509" "fmt" "os" + "time" "github.com/openshift/api/annotations" configv1 "github.com/openshift/api/config/v1" @@ -41,8 +42,9 @@ const ( type TrustedCABundleReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - trustBundlePath string + Scheme *runtime.Scheme + trustBundlePath string + consecutiveFailureSince *time.Time // nil when the last reconcile succeeded } // isSpecTrustedCASet returns true if spec.trustedCA of proxyConfig is set. 
@@ -59,16 +61,14 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ // Request object not found, could have been deleted after reconcile request. // Return and don't requeue klog.Infof("proxy not found; reconciliation will be skipped") + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } return reconcile.Result{}, nil } - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - // Error reading the object - requeue the request. - return reconcile.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) + // Non-NotFound: transient API error. + return r.handleTransientError(ctx, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err)) } // Check if changed config map in 'openshift-config' namespace is proxy trusted ca. @@ -77,43 +77,37 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - + // Note: clearFailureWindow is intentionally NOT called here. This path did not + // exercise the full reconcile logic, so an ongoing transient failure window + // (set by a previous reconcile pass) should not be reset. klog.V(1).Infof("changed config map %s is not a proxy trusted ca, skipping", req) return reconcile.Result{}, nil } systemTrustBundle, err := r.getSystemTrustBundle() if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) + // Node cert store may be updating during upgrade: transient. 
+ return r.handleTransientError(ctx, fmt.Errorf("failed to get system trust bundle: %v", err)) } proxyCABundle, mergedTrustBundle, err := r.addProxyCABundle(ctx, proxyConfig, systemTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: persistent. + return r.handleDegradeError(ctx, fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) } _, mergedTrustBundle, err = r.addCloudConfigCABundle(ctx, proxyCABundle, mergedTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: persistent. 
+ return r.handleDegradeError(ctx, fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) } ccmTrustedConfigMap := r.makeCABundleConfigMap(mergedTrustBundle) if err := r.createOrUpdateConfigMap(ctx, ccmTrustedConfigMap); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) + return r.handleTransientError(ctx, fmt.Errorf("can not update target trust bundle configmap: %v", err)) } + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } @@ -121,6 +115,43 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, nil } +func (r *TrustedCABundleReconciler) clearFailureWindow() { + r.consecutiveFailureSince = nil +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. 
+func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + now := r.Clock.Now() + if r.consecutiveFailureSince == nil { + r.consecutiveFailureSince = &now + klog.V(4).Infof("TrustedCABundleReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) + return ctrl.Result{}, err + } + elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) + if elapsed < transientDegradedThreshold { + klog.V(4).Infof("TrustedCABundleReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("TrustedCABundleReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets TrustedCABundleControllerControllerDegraded=True immediately and +// returns nil so controller-runtime does NOT requeue. An existing watch on the +// relevant resource will re-trigger reconciliation when the problem is fixed. +func (r *TrustedCABundleReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("TrustedCABundleReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, nil +} + // addProxyCABundle checks ca bundle referred by Proxy resource and adds it to passed bundle // in case if proxy one is valid. // This function returns added bundle as first value, result as second and an error if it was occurred. 
diff --git a/pkg/controllers/trusted_ca_bundle_controller_test.go b/pkg/controllers/trusted_ca_bundle_controller_test.go index 2c112413f..db1be8c07 100644 --- a/pkg/controllers/trusted_ca_bundle_controller_test.go +++ b/pkg/controllers/trusted_ca_bundle_controller_test.go @@ -16,6 +16,7 @@ import ( "k8s.io/client-go/tools/record" clocktesting "k8s.io/utils/clock/testing" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/config" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -309,6 +310,92 @@ var _ = Describe("Trusted CA bundle sync controller", func() { }) }) +var _ = Describe("Trusted CA bundle reconciler unit tests", func() { + ctx := context.Background() + + AfterEach(func() { + co := &v1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Expect(cl.Delete(ctx, co)).To(Succeed()) + Eventually(func() bool { + return apierrors.IsNotFound(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)) + }).Should(BeTrue()) + } + }) + + It("reconcile should succeed and be available if no proxy resource found", func() { + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: systemCAValid, + } + + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(Succeed()) + + co := &v1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var availCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(v1.ConditionTrue)) + }) + + It("should not degrade on transient 
error before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: "/broken/ca/path.pem", // unreadable → transient error + } + + // Create a Proxy so the Proxy get succeeds and we reach the system trust bundle read. + proxy := &v1.Proxy{ObjectMeta: metav1.ObjectMeta{Name: proxyResourceName}} + Expect(cl.Create(ctx, proxy)).To(Succeed()) + DeferCleanup(func() { _ = cl.Delete(ctx, proxy) }) + + // First reconcile: transient failure starts; error is returned but no degraded condition set. + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + co := &v1.ClusterOperator{} + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), "should not be degraded before threshold") + } + } + } + + // Advance clock past the degraded threshold. + fakeClock.Step(transientDegradedThreshold + time.Second) + + // Second reconcile: threshold exceeded, controller sets degraded. 
+ _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(v1.ConditionTrue)) + }) +}) + var _ = Describe("Trusted CA reconciler methods", func() { It("Get system CA should be fine if bundle is valid", func() { reconciler := &TrustedCABundleReconciler{ From 8e7f3f91e8cd3cddc0dba5d9b50cca572f2469fb Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Wed, 4 Mar 2026 17:16:48 -0500 Subject: [PATCH 3/4] Fix transient error handling in CloudOperatorReconciler and tests Add handleTransientError/handleDegradeError methods to CloudOperatorReconciler with an aggregatedTransientDegradedThreshold of 2m30s (longer than the sub-controller threshold of 2m, to accommodate sub-controller recovery time). Fix test: handleTransientError test was stepping the clock by transientDegradedThreshold (2m, the sub-controller constant) instead of aggregatedTransientDegradedThreshold (2m30s), so the threshold was never exceeded and the degraded condition was never set. 
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Nolan Brubaker --- .../cloud_config_sync_controller.go | 6 +- pkg/controllers/clusteroperator_controller.go | 94 +++++++++++++------ .../clusteroperator_controller_test.go | 71 ++++++++++++++ .../trusted_ca_bundle_controller.go | 3 +- 4 files changed, 140 insertions(+), 34 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index 30077cf66..caa4e237f 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -32,8 +32,9 @@ const ( cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded" // transientDegradedThreshold is how long transient errors must persist before - // the controller sets CloudConfigControllerDegraded=True. This prevents brief + // the controller sets Degraded=True. This prevents brief // API server blips during upgrades from immediately degrading the operator. + // Applies to both CloudConfigController and TrustedCAController. transientDegradedThreshold = 2 * time.Minute ) @@ -51,10 +52,11 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { // No cloud platform: mirror the main controller's behaviour of returning Available. klog.Infof("Infrastructure cluster does not exist. Skipping...") - r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } + // Skip if the infrastructure resource doesn't exist. 
+ r.clearFailureWindow() return ctrl.Result{}, nil } else if err != nil { return r.handleTransientError(ctx, err) } diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index d94066e4c..7ed5f5d11 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -19,6 +19,7 @@ package controllers import ( "context" "fmt" + "time" configv1 "github.com/openshift/api/config/v1" operatorv1 "github.com/openshift/api/operator/v1" @@ -45,16 +46,24 @@ const ( // Condition type for Cloud Controller ownership cloudControllerOwnershipCondition = "CloudControllerOwner" + + // aggregatedTransientDegradedThreshold is how long transient errors must persist before + // the controller sets Degraded=True. + // This prevents brief API server blips during upgrades from immediately degrading the operator. + // Applies to top-level operator, and is longer in order + // to accommodate changes in the lower-level operators. 
+ aggregatedTransientDegradedThreshold = 2*time.Minute + (30 * time.Second) ) // CloudOperatorReconciler reconciles a ClusterOperator object type CloudOperatorReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - watcher ObjectWatcher - ImagesFile string - FeatureGateAccess featuregates.FeatureGateAccess - TLSProfileSpec configv1.TLSProfileSpec + Scheme *runtime.Scheme + watcher ObjectWatcher + ImagesFile string + FeatureGateAccess featuregates.FeatureGateAccess + TLSProfileSpec configv1.TLSProfileSpec + consecutiveFailureSince *time.Time // nil when the last reconcile succeeded } // +kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list;watch;create;update;patch;delete @@ -69,59 +78,43 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) infra := &configv1.Infrastructure{} if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { klog.Infof("Infrastructure cluster does not exist. 
Skipping...") - if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } - + // It's ok for the infrastructure cluster to not exist + r.clearFailureWindow() return ctrl.Result{}, nil } else if err != nil { klog.Errorf("Unable to retrive Infrastructure object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides) if err != nil { klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err) - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } else if !allowedToProvision { + // We're not allowed to provision, but didn't have any failures. 
+ r.clearFailureWindow() return ctrl.Result{}, nil } clusterProxy := &configv1.Proxy{} if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !errors.IsNotFound(err) { klog.Errorf("Unable to retrive Proxy object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSProfileSpec) if err != nil { klog.Errorf("Unable to build operator config %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleDegradeError(ctx, conditionOverrides, err) } if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { klog.Errorf("Unable to sync operands: %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { @@ -134,9 +127,48 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) return ctrl.Result{}, err } + // successful reconcile, make sure the failure window is cleared. 
+ r.clearFailureWindow() return ctrl.Result{}, nil } +func (r *CloudOperatorReconciler) clearFailureWindow() { + r.consecutiveFailureSince = nil +} + +// handleTransientError records the start of a failure window and degrades the +// operator only after aggregatedTransientDegradedThreshold has elapsed. It always returns +// a non-nil error so controller-runtime requeues with exponential backoff. +func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + now := r.Clock.Now() + if r.consecutiveFailureSince == nil { + r.consecutiveFailureSince = &now + klog.V(4).Infof("CloudOperatorReconciler: transient failure started (%v), will degrade after %s", err, aggregatedTransientDegradedThreshold) + return ctrl.Result{}, err + } + elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) + if elapsed < aggregatedTransientDegradedThreshold { + klog.V(4).Infof("CloudOperatorReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, aggregatedTransientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudOperatorReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets OperatorDegraded=True immediately and returns nil so +// controller-runtime does NOT requeue. Existing watches on Infrastructure, +// ConfigMaps, and Secrets will re-trigger reconciliation when the problem is fixed. 
+func (r *CloudOperatorReconciler) handleDegradeError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + klog.Errorf("CloudOperatorReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, nil // do not requeue; a watch event will re-trigger +} + func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) error { // Deploy resources for platform resources, err := cloud.GetResources(config) diff --git a/pkg/controllers/clusteroperator_controller_test.go b/pkg/controllers/clusteroperator_controller_test.go index 041670762..987b5be2d 100644 --- a/pkg/controllers/clusteroperator_controller_test.go +++ b/pkg/controllers/clusteroperator_controller_test.go @@ -2,6 +2,7 @@ package controllers import ( "context" + "fmt" "time" . 
"github.com/onsi/ginkgo/v2" @@ -618,3 +619,73 @@ var _ = Describe("Apply resources should", func() { }) }) + +var _ = Describe("CloudOperatorReconciler error handling", func() { + ctx := context.Background() + + AfterEach(func() { + co := &configv1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Eventually(func() bool { + err := cl.Delete(ctx, co) + return err == nil || apierrors.IsNotFound(err) + }).Should(BeTrue()) + } + Eventually(apierrors.IsNotFound(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue()) + }) + + It("handleDegradeError should set OperatorDegraded=True immediately and return nil error", func() { + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + _, err := reconciler.handleDegradeError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test persistent error")) + Expect(err).NotTo(HaveOccurred()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue()) + }) + + It("handleTransientError should not degrade before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + // Pre-create the ClusterOperator so that setStatusDegraded can update its status + // subresource when the threshold is exceeded (status subresource updates require the + // 
object to already exist in the cluster). + co := &configv1.ClusterOperator{} + co.SetName(clusterOperatorName) + Expect(cl.Create(ctx, co)).To(Succeed()) + + // First reconcile: transient failure starts; error returned but no degraded condition set. + _, err := reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeFalse(), + "should not be degraded before threshold") + + // Advance clock past the degraded threshold. + fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second) + + // Second reconcile: threshold exceeded, controller sets degraded. + _, err = reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue()) + }) +}) diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index e6e788548..5921448d7 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -61,10 +61,11 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ // Request object not found, could have been deleted after reconcile request. // Return and don't requeue klog.Infof("proxy not found; reconciliation will be skipped") - r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } + // We tolerate the proxy config not being found. 
+ r.clearFailureWindow() return reconcile.Result{}, nil } // Non-NotFound: transient API error. From 22658584414fed8ee74a945e682436d677ed11e5 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Thu, 5 Mar 2026 16:46:41 -0500 Subject: [PATCH 4/4] Fix race on consecutiveFailureSince in cloud config sync test The test "config should not be updated if source and target config content are identical" called reconciler.Reconcile() directly while the manager was also running the same reconciler in a background goroutine. Both goroutines could access consecutiveFailureSince concurrently, which the Go race detector flags. Use a fresh CloudConfigReconciler instance (not registered with the manager) for the direct call. It shares the thread-safe API client but owns its own consecutiveFailureSince field, so there is no shared mutable state with the manager's copy. Co-Authored-By: Claude Sonnet 4.6 --- .../cloud_config_sync_controller_test.go | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pkg/controllers/cloud_config_sync_controller_test.go b/pkg/controllers/cloud_config_sync_controller_test.go index 5e1a6ce28..dcfcd02c7 100644 --- a/pkg/controllers/cloud_config_sync_controller_test.go +++ b/pkg/controllers/cloud_config_sync_controller_test.go @@ -348,8 +348,26 @@ var _ = Describe("Cloud config sync controller", func() { }, timeout).Should(Succeed()) initialCMresourceVersion := syncedCloudConfigMap.ResourceVersion + // Introducing the consecutiveFailureWindow means that there's a field that could be racy + // between the manager calling Reconcile and the test calling Reconcile. + // In production, we only have 1 instance of the reconciler running. + // Create a fresh reconciler that is NOT registered with the manager. + // It shares the same API client (thread-safe) but has its own + // consecutiveFailureSince field, so no data race with the manager's copy. 
+ freshReconciler := &CloudConfigReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: targetNamespaceName, + }, + Scheme: scheme.Scheme, + FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting( + nil, []configv1.FeatureGateName{"AWSServiceLBNetworkSecurityGroup"}, nil, nil, + ), + } + request := reconcile.Request{NamespacedName: client.ObjectKey{Name: "foo", Namespace: "bar"}} - _, err := reconciler.Reconcile(ctx, request) + _, err := freshReconciler.Reconcile(ctx, request) Expect(err).Should(Succeed()) Expect(cl.Get(ctx, syncedConfigMapKey, syncedCloudConfigMap)).Should(Succeed())