From a8c51ce913344fbaaeb33ebaa314699f54b94bec Mon Sep 17 00:00:00 2001 From: John Kyros Date: Wed, 20 May 2026 16:42:10 -0500 Subject: [PATCH 1/3] fix(karpenter): Stop controllers fighting over HCP status So right now we have multiple controllers writing HCP status, and several of them are doing full Update() calls instead of patches, which means that the whole object gets overwritten. This results in race conditions such that things like e.g. vcpus get wiped from the status object when something else writes the object. This just swaps all those Status().Update() with Patch + MergeFrom + oplocks so the controllers should get 409'd and retry instead of stomping on fields updated by other controllers. fix(hcco): Stop JSON patch from eating nillable required fields So our conversion from Update() to Patch() was clever, but our patch calls can't write a nil, the field just get omitted, and we have fields where that nil is normal, expected, and necessary, so this tries to work around it by building an explicit JSON patch (because there is seemingly no other way to say that the field needs to be included even though it's empty). Signed-off-by: John Kyros --- .../hostedcontrolplane_controller.go | 17 +- .../controllers/hcpstatus/hcpstatus.go | 75 +++- .../controllers/hcpstatus/hcpstatus_test.go | 330 ++++++++++++++++++ .../controllers/resources/resources.go | 13 +- .../controllers/etcdbackup/reconciler.go | 3 +- .../controllers/etcdbackup/reconciler_test.go | 75 ++++ .../controllers/local_ignitionprovider.go | 3 +- 7 files changed, 494 insertions(+), 22 deletions(-) diff --git a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go index 629ab0e2b606..b83a6ff54c81 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go +++ b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go @@ -1998,10 +1998,11 @@ func (r *HostedControlPlaneReconciler) reconcileValidIDPConfigurationCondition(c Message: fmt.Sprintf("failed to initialize identity providers: %v", err), } } - // Update the condition on the HCP if it has changed + // Patch the condition on the HCP if it has changed + originalHCP := hcp.DeepCopy() if meta.SetStatusCondition(&hcp.Status.Conditions, new) { - if err := r.Status().Update(ctx, hcp); err != nil { - return fmt.Errorf("failed to update valid IDP configuration condition: %w", err) + if err := r.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { + return fmt.Errorf("failed to patch valid IDP configuration condition: %w", err) } } return nil @@ -2518,14 +2519,15 @@ func (r *HostedControlPlaneReconciler) removeCloudResources(ctx context.Context, if resourcesDestroyedCond != nil && resourcesDestroyedCond.Message != "" { message = fmt.Sprintf("%s (last status: %s)", message, resourcesDestroyedCond.Message) } + originalHCP := hcp.DeepCopy() meta.SetStatusCondition(&hcp.Status.Conditions, metav1.Condition{ Type: string(hyperv1.CloudResourcesDestroyed), Status: metav1.ConditionFalse, Reason: string(hyperv1.CloudResourcesDeletionTimedOutReason), Message: message, }) - if err := r.Status().Update(ctx, hcp); err != nil { - return false, fmt.Errorf("failed to update cloud resources destroyed condition: %w", err) + if err := r.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { + return false, fmt.Errorf("failed to patch cloud resources destroyed condition: %w", err) } return true, nil } @@ -2550,6 +2552,7 @@ func (r *HostedControlPlaneReconciler) removeCloudResources(ctx context.Context, return false, nil } if cvoScaledDownCond == nil || cvoScaledDownCond.Status != metav1.ConditionTrue { + originalHCP := hcp.DeepCopy() cvoScaledDownCond = &metav1.Condition{ Type: string(hyperv1.CVOScaledDown), Status: metav1.ConditionTrue, @@ -2557,8 +2560,8 @@ func (r *HostedControlPlaneReconciler) removeCloudResources(ctx context.Context, LastTransitionTime: metav1.Now(), } meta.SetStatusCondition(&hcp.Status.Conditions, *cvoScaledDownCond) - if err := r.Status().Update(ctx, hcp); err != nil { - return false, fmt.Errorf("failed to set CVO scaled down condition: %w", err) + if err := r.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { + return false, fmt.Errorf("failed to patch CVO scaled down condition: %w", err) } } return false, nil diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus.go b/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus.go index 6ffe25792713..df6b02064726 100644 --- a/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus.go +++ b/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus.go @@ -1,9 +1,10 @@ package hcpstatus import ( + "bytes" "context" + "encoding/json" "fmt" - "reflect" hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" "github.com/openshift/hypershift/control-plane-operator/hostedclusterconfigoperator/operator" @@ -11,6 +12,7 @@ import ( configv1 "github.com/openshift/api/config/v1" + "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -74,17 +76,76 @@ func (h *hcpStatusReconciler) Reconcile(ctx context.Context, req reconcile.Reque return reconcile.Result{}, err } - if !reflect.DeepEqual(hcp.Status, originalHCP.Status) { - log.Info("Updating HCP status with new configuration and version status") - if err := h.mgtClusterClient.Status().Update(ctx, hcp); err != nil { - return reconcile.Result{}, fmt.Errorf("failed to update hcp: %w", err) - } - log.Info("Successfully updated HCP status") + if equality.Semantic.DeepEqual(hcp.Status, originalHCP.Status) { + return reconcile.Result{}, nil + } + + // Use JSON Patch (RFC 6902) instead of JSON Merge Patch (RFC 7386). + // Merge patch interprets null as "delete field" (RFC 7386 §7), which corrupts + // +required +nullable fields in configv1.ClusterVersionStatus that have no + // omitempty and serialize nil as JSON null: + // - AvailableUpdates []configv1.Release `json:"availableUpdates"` — nil slice → null + // - CompletionTime *metav1.Time `json:"completionTime"` (in UpdateHistory) — nil pointer → null + // JSON Patch "replace"/"add" ops carry null as a literal value, preserving it correctly. + patch, err := buildStatusPatch(originalHCP, hcp) + if err != nil { + return reconcile.Result{}, fmt.Errorf("failed to build status patch: %w", err) + } + + log.Info("Patching HCP status with new configuration and version status") + if err := h.mgtClusterClient.Status().Patch(ctx, hcp, + crclient.RawPatch(types.JSONPatchType, patch)); err != nil { + return reconcile.Result{}, fmt.Errorf("failed to patch hcp status: %w", err) } + log.Info("Successfully patched HCP status") return reconcile.Result{}, nil } +type jsonPatchOp struct { + Op string `json:"op"` + Path string `json:"path"` + Value interface{} `json:"value,omitempty"` +} + +func buildStatusPatch(original, modified *hyperv1.HostedControlPlane) ([]byte, error) { + origJSON, err := json.Marshal(original.Status) + if err != nil { + return nil, err + } + modJSON, err := json.Marshal(modified.Status) + if err != nil { + return nil, err + } + + var origMap, modMap map[string]json.RawMessage + if err := json.Unmarshal(origJSON, &origMap); err != nil { + return nil, err + } + if err := json.Unmarshal(modJSON, &modMap); err != nil { + return nil, err + } + + // Optimistic lock: fail if the object was modified since our read. + ops := []jsonPatchOp{{Op: "test", Path: "/metadata/resourceVersion", Value: original.ResourceVersion}} + + for key, modVal := range modMap { + origVal, exists := origMap[key] + if !exists { + ops = append(ops, jsonPatchOp{Op: "add", Path: "/status/" + key, Value: modVal}) + } else if !bytes.Equal(origVal, modVal) { + ops = append(ops, jsonPatchOp{Op: "replace", Path: "/status/" + key, Value: modVal}) + } + } + for key := range origMap { + if _, exists := modMap[key]; !exists { + ops = append(ops, jsonPatchOp{Op: "remove", Path: "/status/" + key}) + } + } + + return json.Marshal(ops) +} + // findClusterOperatorStatusCondition is identical to meta.FindStatusCondition except that it works on config1.ClusterOperatorStatusCondition instead of // metav1.StatusCondition func findClusterOperatorStatusCondition(conditions []configv1.ClusterOperatorStatusCondition, conditionType configv1.ClusterStatusConditionType) *configv1.ClusterOperatorStatusCondition { diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus_test.go b/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus_test.go index 5b391355d313..73655f694287 100644 --- a/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus_test.go +++ b/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus_test.go @@ -1,6 +1,7 @@ package hcpstatus import ( + "encoding/json" "testing" . "github.com/onsi/gomega" @@ -209,3 +210,332 @@ func TestHCPStatusReconciler(t *testing.T) { }) } } + +func TestBuildStatusPatch(t *testing.T) { + t.Parallel() + + t.Run("When versionStatus changes, it should produce a replace op with optimistic lock", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + VersionStatus: &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.16.0", Image: "quay.io/old:latest"}, + }, + }, + } + original := hcp.DeepCopy() + + hcp.Status.VersionStatus = &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.17.0", Image: "quay.io/new:latest"}, + } + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(2)) + + g.Expect(ops[0].Op).To(Equal("test")) + g.Expect(ops[0].Path).To(Equal("/metadata/resourceVersion")) + g.Expect(ops[0].Value).To(Equal("100")) + + g.Expect(ops[1].Op).To(Equal("replace")) + g.Expect(ops[1].Path).To(Equal("/status/versionStatus")) + }) + + t.Run("When versionStatus is added for the first time, it should produce an add op", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + } + original := hcp.DeepCopy() + + hcp.Status.VersionStatus = &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.17.0", Image: "quay.io/test:latest"}, + } + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(2)) + g.Expect(ops[1].Op).To(Equal("add")) + g.Expect(ops[1].Path).To(Equal("/status/versionStatus")) + }) + + t.Run("When nothing changes, it should produce only the test op", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + Conditions: []metav1.Condition{ + {Type: "ConditionA", Status: metav1.ConditionTrue, Reason: "OK"}, + }, + }, + } + original := hcp.DeepCopy() + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(1)) + g.Expect(ops[0].Op).To(Equal("test")) + g.Expect(ops[0].Path).To(Equal("/metadata/resourceVersion")) + }) + + t.Run("When conditions change, it should replace the entire conditions array", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + Conditions: []metav1.Condition{ + {Type: "ConditionA", Status: metav1.ConditionTrue, Reason: "OK"}, + {Type: "ConditionB", Status: metav1.ConditionTrue, Reason: "OK"}, + }, + }, + } + original := hcp.DeepCopy() + + meta.SetStatusCondition(&hcp.Status.Conditions, metav1.Condition{ + Type: "ConditionA", + Status: metav1.ConditionFalse, + Reason: "NowBad", + }) + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(2)) + g.Expect(ops[1].Op).To(Equal("replace")) + g.Expect(ops[1].Path).To(Equal("/status/conditions")) + + conditionsJSON, err := json.Marshal(ops[1].Value) + g.Expect(err).ToNot(HaveOccurred()) + var conditions []metav1.Condition + g.Expect(json.Unmarshal(conditionsJSON, &conditions)).To(Succeed()) + g.Expect(conditions).To(HaveLen(2), "all conditions from the read should be in the patch") + }) + + t.Run("When versionStatus has nil availableUpdates and nil completionTime, the patch should preserve null values", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + } + original := hcp.DeepCopy() + + startedTime := metav1.Now() + hcp.Status.VersionStatus = &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.17.0", Image: "quay.io/test:latest"}, + History: []configv1.UpdateHistory{ + { + State: configv1.PartialUpdate, + StartedTime: startedTime, + // CompletionTime is nil — update in progress + Version: "4.17.0", + Image: "quay.io/test:latest", + }, + }, + // AvailableUpdates is nil — CVO hasn't checked yet + } + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + // Parse the raw JSON to verify null handling. + // JSON Patch (RFC 6902) uses "value": null to mean "set to null", + // unlike JSON Merge Patch (RFC 7386) where null means "delete". + var rawOps []json.RawMessage + g.Expect(json.Unmarshal(patchBytes, &rawOps)).To(Succeed()) + g.Expect(rawOps).To(HaveLen(2)) + + // Parse the versionStatus op's value to check null fields + var op struct { + Op string `json:"op"` + Path string `json:"path"` + Value json.RawMessage `json:"value"` + } + g.Expect(json.Unmarshal(rawOps[1], &op)).To(Succeed()) + g.Expect(op.Op).To(Equal("add")) + g.Expect(op.Path).To(Equal("/status/versionStatus")) + + var vs map[string]interface{} + g.Expect(json.Unmarshal(op.Value, &vs)).To(Succeed()) + + // availableUpdates should be null (nil slice serializes as null with no omitempty) + au, ok := vs["availableUpdates"] + g.Expect(ok).To(BeTrue(), "availableUpdates key must be present in the patch") + g.Expect(au).To(BeNil(), "availableUpdates should be null — JSON Patch preserves this correctly") + + // completionTime in history should be null (nil *metav1.Time) + history := vs["history"].([]interface{}) + entry := history[0].(map[string]interface{}) + ct, ok := entry["completionTime"] + g.Expect(ok).To(BeTrue(), "completionTime key must be present in the patch") + g.Expect(ct).To(BeNil(), "completionTime should be null — JSON Patch preserves this correctly") + }) + + t.Run("When configuration changes, it should produce a replace op", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + Configuration: &hyperv1.ConfigurationStatus{}, + }, + } + original := hcp.DeepCopy() + + hcp.Status.Configuration = &hyperv1.ConfigurationStatus{ + Authentication: configv1.AuthenticationStatus{ + IntegratedOAuthMetadata: configv1.ConfigMapNameReference{Name: "oauth-metadata"}, + }, + } + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(2)) + g.Expect(ops[1].Op).To(Equal("replace")) + g.Expect(ops[1].Path).To(Equal("/status/configuration")) + }) + + t.Run("When versionStatus is removed, it should produce a remove op", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + VersionStatus: &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.17.0", Image: "quay.io/test:latest"}, + }, + }, + } + original := hcp.DeepCopy() + + hcp.Status.VersionStatus = nil + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(2)) + g.Expect(ops[0].Op).To(Equal("test")) + g.Expect(ops[1].Op).To(Equal("remove")) + g.Expect(ops[1].Path).To(Equal("/status/versionStatus")) + }) + + t.Run("When multiple fields change, it should produce ops for each changed field", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + VersionStatus: &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.16.0"}, + }, + Conditions: []metav1.Condition{ + {Type: "ConditionA", Status: metav1.ConditionTrue, Reason: "OK"}, + }, + Configuration: &hyperv1.ConfigurationStatus{}, + Version: "4.16.0", + ReleaseImage: "quay.io/old:latest", + }, + } + original := hcp.DeepCopy() + + hcp.Status.VersionStatus.Desired.Version = "4.17.0" + meta.SetStatusCondition(&hcp.Status.Conditions, metav1.Condition{ + Type: "ConditionA", Status: metav1.ConditionFalse, Reason: "Bad", + }) + hcp.Status.Configuration = &hyperv1.ConfigurationStatus{ + Authentication: configv1.AuthenticationStatus{ + IntegratedOAuthMetadata: configv1.ConfigMapNameReference{Name: "new"}, + }, + } + hcp.Status.Version = "4.17.0" + hcp.Status.ReleaseImage = "quay.io/new:latest" + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + // test + versionStatus + conditions + configuration + version + releaseImage + g.Expect(ops).To(HaveLen(6)) + + paths := make([]string, len(ops)) + for i, op := range ops { + paths[i] = op.Path + } + g.Expect(paths).To(ContainElements( + "/metadata/resourceVersion", + "/status/versionStatus", + "/status/conditions", + "/status/configuration", + "/status/version", + "/status/releaseImage", + )) + }) +} diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go b/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go index 8e4170836d68..a3407b30f872 100644 --- a/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go +++ b/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go @@ -570,12 +570,13 @@ func (r *reconciler) reconcileClusterRecovery(ctx context.Context, log logr.Logg condition.Message = "Hosted cluster recovery finished" } + originalHCP := hcp.DeepCopy() meta.SetStatusCondition(&hcp.Status.Conditions, *condition) log.Info("setting condition", "type", condition.Type, "status", condition.Status, "message", condition.Message) - if err := r.cpClient.Status().Update(ctx, hcp); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update status on hcp for hosted cluster recovery: %w. Condition error message: %v", err, condition.Message) + if err := r.cpClient.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to patch status on hcp for hosted cluster recovery: %w. Condition error message: %v", err, condition.Message) } - log.Info("successfully updated hcp status with recovery condition") + log.Info("successfully patched hcp status with recovery condition") if !finished { return ctrl.Result{RequeueAfter: 120 * time.Second}, nil @@ -1705,7 +1706,7 @@ func (r *reconciler) patchHCPStatusCondition(ctx context.Context, hcp *hyperv1.H if !meta.SetStatusCondition(&hcp.Status.Conditions, *condition) { return nil // No status change; avoid unnecessary API call. } - if err := r.cpClient.Status().Patch(ctx, hcp, client.MergeFrom(originalHCP)); err != nil { + if err := r.cpClient.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { return fmt.Errorf("failed to update HostedControlPlane status with %s condition: %w", condition.Type, err) } log.Info(string(condition.Type) + " condition updated") @@ -2746,8 +2747,8 @@ func (r *reconciler) destroyCloudResources(ctx context.Context, hcp *hyperv1.Hos meta.SetStatusCondition(&hcp.Status.Conditions, *resourcesDestroyedCond) if !equality.Semantic.DeepEqual(hcp, originalHCP) { - if err := r.cpClient.Status().Update(ctx, hcp); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set resources destroyed condition: %w", err) + if err := r.cpClient.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to patch resources destroyed condition: %w", err) } } diff --git a/hypershift-operator/controllers/etcdbackup/reconciler.go b/hypershift-operator/controllers/etcdbackup/reconciler.go index bd665bdda2db..6c8e4f8672b2 100644 --- a/hypershift-operator/controllers/etcdbackup/reconciler.go +++ b/hypershift-operator/controllers/etcdbackup/reconciler.go @@ -299,9 +299,10 @@ func (r *HCPEtcdBackupReconciler) setCondition(backup *hyperv1.HCPEtcdBackup, co // updateHCPBackupCondition sets a condition on the HostedControlPlane to bubble // up the etcd backup status. The HC controller propagates this to the HostedCluster. func (r *HCPEtcdBackupReconciler) updateHCPBackupCondition(ctx context.Context, hcp *hyperv1.HostedControlPlane, condition metav1.Condition) error { + originalHCP := hcp.DeepCopy() condition.ObservedGeneration = hcp.Generation meta.SetStatusCondition(&hcp.Status.Conditions, condition) - return r.Status().Update(ctx, hcp) + return r.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})) } // updateHostedClusterBackupURL persists the snapshot URL in the HostedCluster diff --git a/hypershift-operator/controllers/etcdbackup/reconciler_test.go b/hypershift-operator/controllers/etcdbackup/reconciler_test.go index d585b24c5241..7f15a5223ed6 100644 --- a/hypershift-operator/controllers/etcdbackup/reconciler_test.go +++ b/hypershift-operator/controllers/etcdbackup/reconciler_test.go @@ -1786,3 +1786,78 @@ func TestGetSnapshotURLFromPod(t *testing.T) { }) } } + +func TestUpdateHCPBackupCondition(t *testing.T) { + t.Parallel() + + t.Run("When patching a backup condition, it should not stomp unrelated status fields", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := newHostedControlPlane() + hcp.Generation = 3 + hcp.Status.AutoNode = hyperv1.AutoNodeStatus{ + VCPUs: ptr.To[int32](8), + } + + r := newReconciler(hcp) + + err := r.updateHCPBackupCondition(t.Context(), hcp, metav1.Condition{ + Type: string(hyperv1.EtcdBackupSucceeded), + Status: metav1.ConditionTrue, + Reason: "BackupComplete", + Message: "backup finished", + }) + g.Expect(err).ToNot(HaveOccurred()) + + var updated hyperv1.HostedControlPlane + g.Expect(r.Get(t.Context(), types.NamespacedName{ + Name: testHCPName, + Namespace: testHCPNamespace, + }, &updated)).To(Succeed()) + + cond := meta.FindStatusCondition(updated.Status.Conditions, string(hyperv1.EtcdBackupSucceeded)) + g.Expect(cond).ToNot(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) + g.Expect(cond.Reason).To(Equal("BackupComplete")) + g.Expect(cond.ObservedGeneration).To(Equal(int64(3))) + + g.Expect(updated.Status.AutoNode.VCPUs).To(Equal(ptr.To[int32](8)), + "autoNode should be preserved — patch must not stomp unrelated fields") + }) + + t.Run("When patching a backup condition onto an HCP with existing conditions, it should preserve them", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := newHostedControlPlane() + hcp.Status.Conditions = []metav1.Condition{ + { + Type: string(hyperv1.ClusterVersionAvailable), + Status: metav1.ConditionTrue, + Reason: "OK", + }, + } + + r := newReconciler(hcp) + + err := r.updateHCPBackupCondition(t.Context(), hcp, metav1.Condition{ + Type: string(hyperv1.EtcdBackupSucceeded), + Status: metav1.ConditionTrue, + Reason: "BackupComplete", + Message: "backup finished", + }) + g.Expect(err).ToNot(HaveOccurred()) + + var updated hyperv1.HostedControlPlane + g.Expect(r.Get(t.Context(), types.NamespacedName{ + Name: testHCPName, + Namespace: testHCPNamespace, + }, &updated)).To(Succeed()) + + g.Expect(updated.Status.Conditions).To(HaveLen(2), + "both the existing condition and the new backup condition should be present") + g.Expect(meta.FindStatusCondition(updated.Status.Conditions, string(hyperv1.ClusterVersionAvailable))).ToNot(BeNil()) + g.Expect(meta.FindStatusCondition(updated.Status.Conditions, string(hyperv1.EtcdBackupSucceeded))).ToNot(BeNil()) + }) +} diff --git a/ignition-server/controllers/local_ignitionprovider.go b/ignition-server/controllers/local_ignitionprovider.go index 7a17217734d7..98fcbee3ede9 100644 --- a/ignition-server/controllers/local_ignitionprovider.go +++ b/ignition-server/controllers/local_ignitionprovider.go @@ -765,6 +765,7 @@ func (r *LocalIgnitionProvider) reconcileValidReleaseInfoCondition(ctx context.C } hostedControlPlane := hcpList.Items[0] + originalHCP := hostedControlPlane.DeepCopy() if len(releaseImageProvider.GetMissingImages()) == 0 { meta.SetStatusCondition(&hostedControlPlane.Status.Conditions, metav1.Condition{ @@ -784,7 +785,7 @@ func (r *LocalIgnitionProvider) reconcileValidReleaseInfoCondition(ctx context.C }) } - return r.Client.Status().Update(ctx, &hostedControlPlane) + return r.Client.Status().Patch(ctx, &hostedControlPlane, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})) } // copyFile copies a file named src to dst, preserving attributes. From 6610122a7d2cb748ccd7c06396950fad3fb80125 Mon Sep 17 00:00:00 2001 From: John Kyros Date: Fri, 29 May 2026 03:06:19 -0500 Subject: [PATCH 2/3] feat(karpenter): make e2e verify vcpus don't flap Signed-off-by: John Kyros --- test/e2e/karpenter_test.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/e2e/karpenter_test.go b/test/e2e/karpenter_test.go index e93c189a4b2f..fc42580b5248 100644 --- a/test/e2e/karpenter_test.go +++ b/test/e2e/karpenter_test.go @@ -1428,6 +1428,7 @@ func testBillingConsolidationAndPDB(ctx context.Context, mgtClient, guestClient // Before any Karpenter nodes are provisioned, Karpenter vCPUs should be 0. waitForAutoNodeStatusVCPUs(t, ctx, mgtClient, hostedCluster, 0) + waitForAutoNodeStatusVCPUsStable(t, ctx, mgtClient, hostedCluster, 0, 30*time.Second) baseline, found := getVCPUsMetric(t, ctx, mgtClient, hostedCluster) g.Expect(found).To(BeTrue(), "billing metric should exist before Karpenter nodes are provisioned") @@ -1451,6 +1452,7 @@ func testBillingConsolidationAndPDB(ctx context.Context, mgtClient, guestClient // t3.xlarge = 4 vCPUs; 2 nodes = 8 Karpenter vCPUs on top of baseline waitForAutoNodeStatusVCPUs(t, ctx, mgtClient, hostedCluster, 8) + waitForAutoNodeStatusVCPUsStable(t, ctx, mgtClient, hostedCluster, 8, 30*time.Second) waitForBillingMetricVCPUs(t, ctx, mgtClient, hostedCluster, baseline+8) t.Logf("Scaling workload to 1 replica to verify deprovisioning and consolidation") @@ -1464,6 +1466,7 @@ func testBillingConsolidationAndPDB(ctx context.Context, mgtClient, guestClient // t3.xlarge = 4 vCPUs; 1 node = 4 Karpenter vCPUs on top of baseline waitForAutoNodeStatusVCPUs(t, ctx, mgtClient, hostedCluster, 4) + waitForAutoNodeStatusVCPUsStable(t, ctx, mgtClient, hostedCluster, 4, 30*time.Second) waitForBillingMetricVCPUs(t, ctx, mgtClient, hostedCluster, baseline+4) // Create a blocking PDB and leave everything dangling so cluster teardown @@ -1651,6 +1654,22 @@ func waitForAutoNodeStatusVCPUs(t *testing.T, ctx context.Context, mgtClient crc ) } +// waitForAutoNodeStatusVCPUsStable asserts that AutoNode.VCPUs does not flap +// away from the expected value over the given duration. +func waitForAutoNodeStatusVCPUsStable(t *testing.T, ctx context.Context, mgtClient crclient.Client, hostedCluster *hyperv1.HostedCluster, expected int32, duration time.Duration) { + t.Helper() + g := NewWithT(t) + + t.Logf("Asserting AutoNode.VCPUs remains stable at %d for %s", expected, duration) + g.Consistently(func(g Gomega) { + hc := &hyperv1.HostedCluster{} + g.Expect(mgtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hc)).To(Succeed()) + g.Expect(hc.Status.AutoNode.VCPUs).NotTo(BeNil(), "AutoNode.VCPUs became nil") + g.Expect(*hc.Status.AutoNode.VCPUs).To(Equal(expected)) + }).WithTimeout(duration).WithPolling(2 * time.Second).Should(Succeed()) + t.Logf("AutoNode.VCPUs remained stable at %d for %s", expected, duration) +} + // waitForBillingMetricVCPUs polls until the hypershift_cluster_vcpus metric // converges to the expected total (native + Karpenter). func waitForBillingMetricVCPUs(t *testing.T, ctx context.Context, mgtClient crclient.Client, hostedCluster *hyperv1.HostedCluster, expectedTotal int32) { From fb2ea1cfb3d344362f28da3c7f8c4292cc9bb56a Mon Sep 17 00:00:00 2001 From: John Kyros Date: Fri, 29 May 2026 20:22:53 -0500 Subject: [PATCH 3/3] fixup! fix(karpenter): Stop controllers fighting over HCP status --- .../controllers/resources/resources.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go b/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go index a3407b30f872..8356918defa3 100644 --- a/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go +++ b/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go @@ -1707,7 +1707,7 @@ func (r *reconciler) patchHCPStatusCondition(ctx context.Context, hcp *hyperv1.H return nil // No status change; avoid unnecessary API call. } if err := r.cpClient.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { - return fmt.Errorf("failed to update HostedControlPlane status with %s condition: %w", condition.Type, err) + return fmt.Errorf("failed to patch HostedControlPlane status with %s condition: %w", condition.Type, err) } log.Info(string(condition.Type) + " condition updated") return nil