diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index dc50e4acf..caa4e237f 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "reflect" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -29,42 +30,50 @@ const ( // Controller conditions for the Cluster Operator resource cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable" cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded" + + // transientDegradedThreshold is how long transient errors must persist before + // the controller sets Degraded=True. This prevents brief + // API server blips during upgrades from immediately degrading the operator. + // Applies to both CloudConfigController and TrustedCAController. + transientDegradedThreshold = 2 * time.Minute ) type CloudConfigReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - FeatureGateAccess featuregates.FeatureGateAccess + Scheme *runtime.Scheme + FeatureGateAccess featuregates.FeatureGateAccess + consecutiveFailureSince *time.Time // nil when the last reconcile succeeded } func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { klog.V(1).Infof("Syncing cloud-conf ConfigMap") infra := &configv1.Infrastructure{} - if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil { - klog.Errorf("infrastructure resource not found") - if err := r.setDegradedCondition(ctx); err != nil { + if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { + // No cloud platform: mirror the main controller's behaviour of returning Available. + klog.Infof("Infrastructure cluster does not exist. 
Skipping...") + if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } - return ctrl.Result{}, err + // Skip if the infrastructure resource doesn't exist. + r.clearFailureWindow() + return ctrl.Result{}, nil + } else if err != nil { + return r.handleTransientError(ctx, err) } network := &configv1.Network{} if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // nil platformStatus is a permanent misconfiguration. + return r.handleDegradeError(ctx, err) } if !syncNeeded { + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } @@ -74,11 +83,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus) if err != nil { + // Unsupported platform won't change without a cluster reconfigure. 
klog.Errorf("unable to get cloud config transformer function; unsupported platform") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleDegradeError(ctx, err) } sourceCM := &corev1.ConfigMap{} @@ -101,12 +108,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) managedConfigFound = true } else if errors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") - } else if err != nil { - klog.Errorf("unable to get managed cloud-config for sync") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + } else { + return r.handleTransientError(ctx, err) } } @@ -119,38 +122,26 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to default cloud config.") } else if err != nil { - klog.Errorf("unable to get cloud-config for sync: %v", err) - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } } sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // User-supplied key mismatch: permanent until the ConfigMap or Infrastructure changes. 
+ return r.handleDegradeError(ctx, err) } - // Check if FeatureGateAccess is configured if r.FeatureGateAccess == nil { - klog.Errorf("FeatureGateAccess is not configured") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, fmt.Errorf("FeatureGateAccess is not configured") + // Operator misconfiguration at startup: permanent. + return r.handleDegradeError(ctx, fmt.Errorf("FeatureGateAccess is not configured")) } features, err := r.FeatureGateAccess.CurrentFeatureGates() if err != nil { + // The feature-gate informer may not have synced yet: transient. klog.Errorf("unable to get feature gates: %v", err) - if errD := r.setDegradedCondition(ctx); errD != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", errD) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } if cloudConfigTransformerFn != nil { @@ -159,10 +150,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // we're not expecting users to put their data in the former. output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // Platform-specific transform failed on the current config data: permanent. 
+ return r.handleDegradeError(ctx, err) } sourceCM.Data[defaultConfigKey] = output } @@ -175,16 +164,13 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // If the config does not exist, it will be created later, so we can ignore a Not Found error if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !errors.IsNotFound(err) { - klog.Errorf("unable to get target cloud-config for sync") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } // Note that the source config map is actually a *transformed* source config map if r.isCloudConfigEqual(sourceCM, targetCM) { klog.V(1).Infof("source and target cloud-config content are equal, no sync needed") + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } @@ -193,12 +179,10 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.syncCloudConfigData(ctx, sourceCM, targetCM); err != nil { klog.Errorf("unable to sync cloud config") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } @@ -206,6 +190,45 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil } +// clearFailureWindow resets the transient-error tracking. Call this on every +// successful reconcile so the 2-minute window restarts fresh on the next failure. 
+func (r *CloudConfigReconciler) clearFailureWindow() { + r.consecutiveFailureSince = nil +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. +func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + now := r.Clock.Now() + if r.consecutiveFailureSince == nil { + r.consecutiveFailureSince = &now + klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) + return ctrl.Result{}, err + } + elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) + if elapsed < transientDegradedThreshold { + klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets CloudConfigControllerDegraded=True immediately and +// returns nil so controller-runtime does NOT requeue. An existing watch on the +// relevant resource will re-trigger reconciliation when the problem is fixed. 
+func (r *CloudConfigReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("CloudConfigReconciler: permanent error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, nil +} + func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) { if platformStatus == nil { return false, fmt.Errorf("platformStatus is required") diff --git a/pkg/controllers/cloud_config_sync_controller_test.go b/pkg/controllers/cloud_config_sync_controller_test.go index 513e944e3..dcfcd02c7 100644 --- a/pkg/controllers/cloud_config_sync_controller_test.go +++ b/pkg/controllers/cloud_config_sync_controller_test.go @@ -348,8 +348,26 @@ var _ = Describe("Cloud config sync controller", func() { }, timeout).Should(Succeed()) initialCMresourceVersion := syncedCloudConfigMap.ResourceVersion + // Introducing the consecutiveFailureWindow means that there's a field that could be racy + // between the manager calling Reconcile and the test calling Reconcile. + // In production, we only have 1 instance of the reconciler running. + // Create a fresh reconciler that is NOT registered with the manager. + // It shares the same API client (thread-safe) but has its own + // consecutiveFailureSince field, so no data race with the manager's copy. 
+ freshReconciler := &CloudConfigReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: targetNamespaceName, + }, + Scheme: scheme.Scheme, + FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting( + nil, []configv1.FeatureGateName{"AWSServiceLBNetworkSecurityGroup"}, nil, nil, + ), + } + request := reconcile.Request{NamespacedName: client.ObjectKey{Name: "foo", Namespace: "bar"}} - _, err := reconciler.Reconcile(ctx, request) + _, err := freshReconciler.Reconcile(ctx, request) Expect(err).Should(Succeed()) Expect(cl.Get(ctx, syncedConfigMapKey, syncedCloudConfigMap)).Should(Succeed()) @@ -509,7 +527,7 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(len(allCMs.Items)).To(BeEquivalentTo(1)) }) - It("should error if a user-specified configmap key isn't present", func() { + It("should degrade immediately if a user-specified configmap key isn't present", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) infraResource.Spec.CloudConfig.Key = "notfound" Expect(cl.Create(ctx, infraResource)).To(Succeed()) @@ -518,8 +536,19 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(cl.Status().Update(ctx, infraResource.DeepCopy())).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).To(ContainSubstring("specified in infra resource does not exist in source configmap")) - + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) 
}) It("should continue with reconcile when feature gates are available", func() { @@ -584,15 +613,39 @@ var _ = Describe("Cloud config sync reconciler", func() { }) }) - It("reconcile should fail if no infra resource found", func() { + It("reconcile should succeed and be available if no infra resource found", func() { _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("infrastructures.config.openshift.io \"cluster\" not found")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var availCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(configv1.ConditionTrue)) }) - It("should fail if no PlatformStatus in infra resource presented ", func() { + It("should degrade immediately if no PlatformStatus in infra resource", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) Expect(cl.Create(ctx, infraResource)).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("platformStatus is required")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) }) }) diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index 
d94066e4c..7ed5f5d11 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -19,6 +19,7 @@ package controllers import ( "context" "fmt" + "time" configv1 "github.com/openshift/api/config/v1" operatorv1 "github.com/openshift/api/operator/v1" @@ -45,16 +46,24 @@ const ( // Condition type for Cloud Controller ownership cloudControllerOwnershipCondition = "CloudControllerOwner" + + // aggregatedTransientDegradedThreshold is how long transient errors must persist before + // the controller sets Degraded=True. + // This prevents brief API server blips during upgrades from immediately degrading the operator. + // Applies to top-level operator, and is longer in order + // to accommodate changes in the lower-level operators. + aggregatedTransientDegradedThreshold = 2*time.Minute + (30 * time.Second) ) // CloudOperatorReconciler reconciles a ClusterOperator object type CloudOperatorReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - watcher ObjectWatcher - ImagesFile string - FeatureGateAccess featuregates.FeatureGateAccess - TLSProfileSpec configv1.TLSProfileSpec + Scheme *runtime.Scheme + watcher ObjectWatcher + ImagesFile string + FeatureGateAccess featuregates.FeatureGateAccess + TLSProfileSpec configv1.TLSProfileSpec + consecutiveFailureSince *time.Time // nil when the last reconcile succeeded } // +kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list;watch;create;update;patch;delete @@ -69,59 +78,43 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) infra := &configv1.Infrastructure{} if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { klog.Infof("Infrastructure cluster does not exist. 
Skipping...") - if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } - + // It's ok for the infrastructure cluster to not exist + r.clearFailureWindow() return ctrl.Result{}, nil } else if err != nil { klog.Errorf("Unable to retrive Infrastructure object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides) if err != nil { klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err) - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } else if !allowedToProvision { + // We're not allowed to provision, but didn't have any failures. 
+ r.clearFailureWindow() return ctrl.Result{}, nil } clusterProxy := &configv1.Proxy{} if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !errors.IsNotFound(err) { klog.Errorf("Unable to retrive Proxy object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSProfileSpec) if err != nil { klog.Errorf("Unable to build operator config %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleDegradeError(ctx, conditionOverrides, err) } if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { klog.Errorf("Unable to sync operands: %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { @@ -134,9 +127,48 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) return ctrl.Result{}, err } + // successful reconcile, make sure the failure window is cleared. 
+ r.clearFailureWindow() return ctrl.Result{}, nil } +func (r *CloudOperatorReconciler) clearFailureWindow() { + r.consecutiveFailureSince = nil +} + +// handleTransientError records the start of a failure window and degrades the +// operator only after aggregatedTransientDegradedThreshold has elapsed. It always returns +// a non-nil error so controller-runtime requeues with exponential backoff. +func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + now := r.Clock.Now() + if r.consecutiveFailureSince == nil { + r.consecutiveFailureSince = &now + klog.V(4).Infof("CloudOperatorReconciler: transient failure started (%v), will degrade after %s", err, aggregatedTransientDegradedThreshold) + return ctrl.Result{}, err + } + elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) + if elapsed < aggregatedTransientDegradedThreshold { + klog.V(4).Infof("CloudOperatorReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, aggregatedTransientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudOperatorReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets OperatorDegraded=True immediately and returns nil so +// controller-runtime does NOT requeue. Existing watches on Infrastructure, +// ConfigMaps, and Secrets will re-trigger reconciliation when the problem is fixed. 
+func (r *CloudOperatorReconciler) handleDegradeError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + klog.Errorf("CloudOperatorReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, nil // do not requeue; a watch event will re-trigger +} + func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) error { // Deploy resources for platform resources, err := cloud.GetResources(config) diff --git a/pkg/controllers/clusteroperator_controller_test.go b/pkg/controllers/clusteroperator_controller_test.go index 041670762..987b5be2d 100644 --- a/pkg/controllers/clusteroperator_controller_test.go +++ b/pkg/controllers/clusteroperator_controller_test.go @@ -2,6 +2,7 @@ package controllers import ( "context" + "fmt" "time" . 
"github.com/onsi/ginkgo/v2" @@ -618,3 +619,73 @@ var _ = Describe("Apply resources should", func() { }) }) + +var _ = Describe("CloudOperatorReconciler error handling", func() { + ctx := context.Background() + + AfterEach(func() { + co := &configv1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Eventually(func() bool { + err := cl.Delete(ctx, co) + return err == nil || apierrors.IsNotFound(err) + }).Should(BeTrue()) + } + Eventually(apierrors.IsNotFound(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue()) + }) + + It("handleDegradeError should set OperatorDegraded=True immediately and return nil error", func() { + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + _, err := reconciler.handleDegradeError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test persistent error")) + Expect(err).NotTo(HaveOccurred()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue()) + }) + + It("handleTransientError should not degrade before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + // Pre-create the ClusterOperator so that setStatusDegraded can update its status + // subresource when the threshold is exceeded (status subresource updates require the + // 
object to already exist in the cluster). + co := &configv1.ClusterOperator{} + co.SetName(clusterOperatorName) + Expect(cl.Create(ctx, co)).To(Succeed()) + + // First reconcile: transient failure starts; error returned but no degraded condition set. + _, err := reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeFalse(), + "should not be degraded before threshold") + + // Advance clock past the degraded threshold. + fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second) + + // Second reconcile: threshold exceeded, controller sets degraded. + _, err = reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue()) + }) +}) diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index ef33709e1..5921448d7 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -6,6 +6,7 @@ import ( "crypto/x509" "fmt" "os" + "time" "github.com/openshift/api/annotations" configv1 "github.com/openshift/api/config/v1" @@ -41,8 +42,9 @@ const ( type TrustedCABundleReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - trustBundlePath string + Scheme *runtime.Scheme + trustBundlePath string + consecutiveFailureSince *time.Time // nil when the last reconcile succeeded } // isSpecTrustedCASet returns true if spec.trustedCA of proxyConfig is set. 
@@ -62,13 +64,12 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } + // We tolerate the proxy config not being found. + r.clearFailureWindow() return reconcile.Result{}, nil } - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - // Error reading the object - requeue the request. - return reconcile.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) + // Non-NotFound: transient API error. + return r.handleTransientError(ctx, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err)) } // Check if changed config map in 'openshift-config' namespace is proxy trusted ca. @@ -77,43 +78,37 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - + // Note: clearFailureWindow is intentionally NOT called here. This path did not + // exercise the full reconcile logic, so an ongoing transient failure window + // (set by a previous reconcile pass) should not be reset. klog.V(1).Infof("changed config map %s is not a proxy trusted ca, skipping", req) return reconcile.Result{}, nil } systemTrustBundle, err := r.getSystemTrustBundle() if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) + // Node cert store may be updating during upgrade: transient. 
+ return r.handleTransientError(ctx, fmt.Errorf("failed to get system trust bundle: %v", err)) } proxyCABundle, mergedTrustBundle, err := r.addProxyCABundle(ctx, proxyConfig, systemTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: persistent. + return r.handleDegradeError(ctx, fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) } _, mergedTrustBundle, err = r.addCloudConfigCABundle(ctx, proxyCABundle, mergedTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: persistent. 
+ return r.handleDegradeError(ctx, fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) } ccmTrustedConfigMap := r.makeCABundleConfigMap(mergedTrustBundle) if err := r.createOrUpdateConfigMap(ctx, ccmTrustedConfigMap); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) + return r.handleTransientError(ctx, fmt.Errorf("can not update target trust bundle configmap: %v", err)) } + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } @@ -121,6 +116,43 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, nil } +func (r *TrustedCABundleReconciler) clearFailureWindow() { + r.consecutiveFailureSince = nil +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. 
+func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + now := r.Clock.Now() + if r.consecutiveFailureSince == nil { + r.consecutiveFailureSince = &now + klog.V(4).Infof("TrustedCABundleReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) + return ctrl.Result{}, err + } + elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) + if elapsed < transientDegradedThreshold { + klog.V(4).Infof("TrustedCABundleReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("TrustedCABundleReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets TrustedCABundleControllerControllerDegraded=True immediately and +// returns nil so controller-runtime does NOT requeue. An existing watch on the +// relevant resource will re-trigger reconciliation when the problem is fixed. +func (r *TrustedCABundleReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("TrustedCABundleReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, nil +} + // addProxyCABundle checks ca bundle referred by Proxy resource and adds it to passed bundle // in case if proxy one is valid. // This function returns added bundle as first value, result as second and an error if it was occurred. 
diff --git a/pkg/controllers/trusted_ca_bundle_controller_test.go b/pkg/controllers/trusted_ca_bundle_controller_test.go index 2c112413f..db1be8c07 100644 --- a/pkg/controllers/trusted_ca_bundle_controller_test.go +++ b/pkg/controllers/trusted_ca_bundle_controller_test.go @@ -16,6 +16,7 @@ import ( "k8s.io/client-go/tools/record" clocktesting "k8s.io/utils/clock/testing" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/config" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -309,6 +310,92 @@ var _ = Describe("Trusted CA bundle sync controller", func() { }) }) +var _ = Describe("Trusted CA bundle reconciler unit tests", func() { + ctx := context.Background() + + AfterEach(func() { + co := &v1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Expect(cl.Delete(ctx, co)).To(Succeed()) + Eventually(func() bool { + return apierrors.IsNotFound(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)) + }).Should(BeTrue()) + } + }) + + It("reconcile should succeed and be available if no proxy resource found", func() { + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: systemCAValid, + } + + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(Succeed()) + + co := &v1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var availCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(v1.ConditionTrue)) + }) + + It("should not degrade on transient 
error before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: "/broken/ca/path.pem", // unreadable → transient error + } + + // Create a Proxy so the Proxy get succeeds and we reach the system trust bundle read. + proxy := &v1.Proxy{ObjectMeta: metav1.ObjectMeta{Name: proxyResourceName}} + Expect(cl.Create(ctx, proxy)).To(Succeed()) + DeferCleanup(func() { _ = cl.Delete(ctx, proxy) }) + + // First reconcile: transient failure starts; error is returned but no degraded condition set. + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + co := &v1.ClusterOperator{} + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), "should not be degraded before threshold") + } + } + } + + // Advance clock past the degraded threshold. + fakeClock.Step(transientDegradedThreshold + time.Second) + + // Second reconcile: threshold exceeded, controller sets degraded. 
+ _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(v1.ConditionTrue)) + }) +}) + var _ = Describe("Trusted CA reconciler methods", func() { It("Get system CA should be fine if bundle is valid", func() { reconciler := &TrustedCABundleReconciler{