Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 79 additions & 56 deletions pkg/controllers/cloud_config_sync_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"reflect"
"time"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
Expand All @@ -29,42 +30,50 @@ const (
// Controller conditions for the Cluster Operator resource
cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable"
cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded"

// transientDegradedThreshold is how long transient errors must persist before
// the controller sets Degraded=True. This prevents brief
// API server blips during upgrades from immediately degrading the operator.
// Applies to both CloudConfigController and TrustedCAController.
transientDegradedThreshold = 2 * time.Minute
)

type CloudConfigReconciler struct {
ClusterOperatorStatusClient
Scheme *runtime.Scheme
FeatureGateAccess featuregates.FeatureGateAccess
Scheme *runtime.Scheme
FeatureGateAccess featuregates.FeatureGateAccess
consecutiveFailureSince *time.Time // nil when the last reconcile succeeded
}

func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
klog.V(1).Infof("Syncing cloud-conf ConfigMap")

infra := &configv1.Infrastructure{}
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil {
klog.Errorf("infrastructure resource not found")
if err := r.setDegradedCondition(ctx); err != nil {
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) {
// No cloud platform: mirror the main controller's behaviour of returning Available.
klog.Infof("Infrastructure cluster does not exist. Skipping...")
if err := r.setAvailableCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// Skip if the infrastructure resource doesn't exist.
r.clearFailureWindow()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid having to set this on every possible happy exit path, what about creating named return variables and using a defer that checks if the error return is nil before calling this?

return ctrl.Result{}, nil
} else if err != nil {
return r.handleTransientError(ctx, err)
}

network := &configv1.Network{}
if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err)
}
return ctrl.Result{}, err
return r.handleTransientError(ctx, err)
}

syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// nil platformStatus is a permanent misconfiguration.
return r.handleDegradeError(ctx, err)
}
if !syncNeeded {
r.clearFailureWindow()
if err := r.setAvailableCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
Expand All @@ -74,11 +83,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus)
if err != nil {
// Unsupported platform won't change without a cluster reconfigure.
klog.Errorf("unable to get cloud config transformer function; unsupported platform")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return r.handleDegradeError(ctx, err)
}

sourceCM := &corev1.ConfigMap{}
Expand All @@ -101,12 +108,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
managedConfigFound = true
} else if errors.IsNotFound(err) {
klog.Warningf("managed cloud-config is not found, falling back to infrastructure config")
} else if err != nil {
klog.Errorf("unable to get managed cloud-config for sync")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
} else {
return r.handleTransientError(ctx, err)
}
}

Expand All @@ -119,38 +122,26 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) {
klog.Warningf("managed cloud-config is not found, falling back to default cloud config.")
} else if err != nil {
klog.Errorf("unable to get cloud-config for sync: %v", err)
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return r.handleTransientError(ctx, err)
}
}

sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// User-supplied key mismatch: permanent until the ConfigMap or Infrastructure changes.
return r.handleDegradeError(ctx, err)
}

// Check if FeatureGateAccess is configured
if r.FeatureGateAccess == nil {
klog.Errorf("FeatureGateAccess is not configured")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, fmt.Errorf("FeatureGateAccess is not configured")
// Operator misconfiguration at startup: permanent.
return r.handleDegradeError(ctx, fmt.Errorf("FeatureGateAccess is not configured"))
}

features, err := r.FeatureGateAccess.CurrentFeatureGates()
if err != nil {
// The feature-gate informer may not have synced yet: transient.
klog.Errorf("unable to get feature gates: %v", err)
if errD := r.setDegradedCondition(ctx); errD != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", errD)
}
return ctrl.Result{}, err
return r.handleTransientError(ctx, err)
}

if cloudConfigTransformerFn != nil {
Expand All @@ -159,10 +150,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
// we're not expecting users to put their data in the former.
output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// Platform-specific transform failed on the current config data: permanent.
return r.handleDegradeError(ctx, err)
}
sourceCM.Data[defaultConfigKey] = output
}
Expand All @@ -175,16 +164,13 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

// If the config does not exist, it will be created later, so we can ignore a Not Found error
if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !errors.IsNotFound(err) {
klog.Errorf("unable to get target cloud-config for sync")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return r.handleTransientError(ctx, err)
}

// Note that the source config map is actually a *transformed* source config map
if r.isCloudConfigEqual(sourceCM, targetCM) {
klog.V(1).Infof("source and target cloud-config content are equal, no sync needed")
r.clearFailureWindow()
if err := r.setAvailableCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
Expand All @@ -193,19 +179,56 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

if err := r.syncCloudConfigData(ctx, sourceCM, targetCM); err != nil {
klog.Errorf("unable to sync cloud config")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return r.handleTransientError(ctx, err)
}

r.clearFailureWindow()
if err := r.setAvailableCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}

return ctrl.Result{}, nil
}

// clearFailureWindow resets the transient-error tracking. Call this on every
// successful reconcile so the transientDegradedThreshold window restarts fresh
// on the next failure, rather than carrying over a stale start time.
func (r *CloudConfigReconciler) clearFailureWindow() {
	// nil means "the last reconcile succeeded"; see handleTransientError.
	r.consecutiveFailureSince = nil
}

// handleTransientError records the start of a failure window and degrades the
// controller only after transientDegradedThreshold has elapsed since the first
// consecutive failure. It always returns a non-nil error so controller-runtime
// requeues the request with exponential backoff.
func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) {
	now := r.Clock.Now()
	if r.consecutiveFailureSince == nil {
		// First failure after a success: open the window but do not degrade yet.
		r.consecutiveFailureSince = &now
		klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold)
		return ctrl.Result{}, err
	}
	// Reuse the single Now() sample taken above so the logged duration and the
	// threshold comparison agree (the original sampled the clock twice).
	elapsed := now.Sub(*r.consecutiveFailureSince)
	if elapsed < transientDegradedThreshold {
		klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err)
		return ctrl.Result{}, err
	}
	klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err)
	if setErr := r.setDegradedCondition(ctx); setErr != nil {
		// Wrap with %w so callers can still unwrap the underlying API error
		// with errors.Is/errors.As.
		return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr)
	}
	return ctrl.Result{}, err
}

// handleDegradeError sets CloudConfigControllerDegraded=True immediately and
// returns a nil error so controller-runtime does NOT requeue. An existing watch
// on the relevant resource will re-trigger reconciliation when the problem is
// fixed, so retrying with backoff would only add noise for a permanent error.
func (r *CloudConfigReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) {
	klog.Errorf("CloudConfigReconciler: permanent error, setting degraded: %v", err)
	if setErr := r.setDegradedCondition(ctx); setErr != nil {
		// Wrap with %w so callers can still unwrap the underlying API error
		// with errors.Is/errors.As.
		return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %w", setErr)
	}
	return ctrl.Result{}, nil
}

func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) {
if platformStatus == nil {
return false, fmt.Errorf("platformStatus is required")
Expand Down
69 changes: 61 additions & 8 deletions pkg/controllers/cloud_config_sync_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,26 @@ var _ = Describe("Cloud config sync controller", func() {
}, timeout).Should(Succeed())
initialCMresourceVersion := syncedCloudConfigMap.ResourceVersion

// Introducing the consecutiveFailureWindow means that there's a field that could be racy
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't love this; definitely open to alternatives.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using some sort of atomic timestamp or lock?

// between the manager calling Reconcile and the test calling Reconcile.
// In production, we only have 1 instance of the reconciler running.
// Create a fresh reconciler that is NOT registered with the manager.
// It shares the same API client (thread-safe) but has its own
// consecutiveFailureSince field, so no data race with the manager's copy.
freshReconciler := &CloudConfigReconciler{
ClusterOperatorStatusClient: ClusterOperatorStatusClient{
Client: cl,
Clock: clocktesting.NewFakePassiveClock(time.Now()),
ManagedNamespace: targetNamespaceName,
},
Scheme: scheme.Scheme,
FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting(
nil, []configv1.FeatureGateName{"AWSServiceLBNetworkSecurityGroup"}, nil, nil,
),
}

request := reconcile.Request{NamespacedName: client.ObjectKey{Name: "foo", Namespace: "bar"}}
_, err := reconciler.Reconcile(ctx, request)
_, err := freshReconciler.Reconcile(ctx, request)
Expect(err).Should(Succeed())

Expect(cl.Get(ctx, syncedConfigMapKey, syncedCloudConfigMap)).Should(Succeed())
Expand Down Expand Up @@ -509,7 +527,7 @@ var _ = Describe("Cloud config sync reconciler", func() {
Expect(len(allCMs.Items)).To(BeEquivalentTo(1))
})

It("should error if a user-specified configmap key isn't present", func() {
It("should degrade immediately if a user-specified configmap key isn't present", func() {
infraResource := makeInfrastructureResource(configv1.AWSPlatformType)
infraResource.Spec.CloudConfig.Key = "notfound"
Expect(cl.Create(ctx, infraResource)).To(Succeed())
Expand All @@ -518,8 +536,19 @@ var _ = Describe("Cloud config sync reconciler", func() {
Expect(cl.Status().Update(ctx, infraResource.DeepCopy())).To(Succeed())

_, err := reconciler.Reconcile(context.TODO(), ctrl.Request{})
Expect(err.Error()).To(ContainSubstring("specified in infra resource does not exist in source configmap"))

Expect(err).To(Succeed())

co := &configv1.ClusterOperator{}
Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
var degradedCond *configv1.ClusterOperatorStatusCondition
for i := range co.Status.Conditions {
if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition {
degradedCond = &co.Status.Conditions[i]
break
}
}
Expect(degradedCond).NotTo(BeNil())
Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue))
})

It("should continue with reconcile when feature gates are available", func() {
Expand Down Expand Up @@ -584,15 +613,39 @@ var _ = Describe("Cloud config sync reconciler", func() {
})
})

It("reconcile should fail if no infra resource found", func() {
It("reconcile should succeed and be available if no infra resource found", func() {
_, err := reconciler.Reconcile(context.TODO(), ctrl.Request{})
Expect(err.Error()).Should(BeEquivalentTo("infrastructures.config.openshift.io \"cluster\" not found"))
Expect(err).To(Succeed())

co := &configv1.ClusterOperator{}
Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
var availCond *configv1.ClusterOperatorStatusCondition
for i := range co.Status.Conditions {
if co.Status.Conditions[i].Type == cloudConfigControllerAvailableCondition {
availCond = &co.Status.Conditions[i]
break
}
}
Expect(availCond).NotTo(BeNil())
Expect(availCond.Status).To(Equal(configv1.ConditionTrue))
})

It("should fail if no PlatformStatus in infra resource presented ", func() {
It("should degrade immediately if no PlatformStatus in infra resource", func() {
infraResource := makeInfrastructureResource(configv1.AWSPlatformType)
Expect(cl.Create(ctx, infraResource)).To(Succeed())
_, err := reconciler.Reconcile(context.TODO(), ctrl.Request{})
Expect(err.Error()).Should(BeEquivalentTo("platformStatus is required"))
Expect(err).To(Succeed())

co := &configv1.ClusterOperator{}
Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
var degradedCond *configv1.ClusterOperatorStatusCondition
for i := range co.Status.Conditions {
if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition {
degradedCond = &co.Status.Conditions[i]
break
}
}
Expect(degradedCond).NotTo(BeNil())
Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue))
})
})
Loading