diff --git a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go index 629ab0e2b606..b83a6ff54c81 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go +++ b/control-plane-operator/controllers/hostedcontrolplane/hostedcontrolplane_controller.go @@ -1998,10 +1998,11 @@ func (r *HostedControlPlaneReconciler) reconcileValidIDPConfigurationCondition(c Message: fmt.Sprintf("failed to initialize identity providers: %v", err), } } - // Update the condition on the HCP if it has changed + // Patch the condition on the HCP if it has changed + originalHCP := hcp.DeepCopy() if meta.SetStatusCondition(&hcp.Status.Conditions, new) { - if err := r.Status().Update(ctx, hcp); err != nil { - return fmt.Errorf("failed to update valid IDP configuration condition: %w", err) + if err := r.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { + return fmt.Errorf("failed to patch valid IDP configuration condition: %w", err) } } return nil @@ -2518,14 +2519,15 @@ func (r *HostedControlPlaneReconciler) removeCloudResources(ctx context.Context, if resourcesDestroyedCond != nil && resourcesDestroyedCond.Message != "" { message = fmt.Sprintf("%s (last status: %s)", message, resourcesDestroyedCond.Message) } + originalHCP := hcp.DeepCopy() meta.SetStatusCondition(&hcp.Status.Conditions, metav1.Condition{ Type: string(hyperv1.CloudResourcesDestroyed), Status: metav1.ConditionFalse, Reason: string(hyperv1.CloudResourcesDeletionTimedOutReason), Message: message, }) - if err := r.Status().Update(ctx, hcp); err != nil { - return false, fmt.Errorf("failed to update cloud resources destroyed condition: %w", err) + if err := r.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { + return false, fmt.Errorf("failed to patch cloud resources destroyed condition: %w", err) } return true, nil } @@ -2550,6 +2552,7 @@ func (r *HostedControlPlaneReconciler) removeCloudResources(ctx context.Context, return false, nil } if cvoScaledDownCond == nil || cvoScaledDownCond.Status != metav1.ConditionTrue { + originalHCP := hcp.DeepCopy() cvoScaledDownCond = &metav1.Condition{ Type: string(hyperv1.CVOScaledDown), Status: metav1.ConditionTrue, @@ -2557,8 +2560,8 @@ func (r *HostedControlPlaneReconciler) removeCloudResources(ctx context.Context, LastTransitionTime: metav1.Now(), } meta.SetStatusCondition(&hcp.Status.Conditions, *cvoScaledDownCond) - if err := r.Status().Update(ctx, hcp); err != nil { - return false, fmt.Errorf("failed to set CVO scaled down condition: %w", err) + if err := r.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { + return false, fmt.Errorf("failed to patch CVO scaled down condition: %w", err) } } return false, nil diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus.go b/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus.go index 6ffe25792713..df6b02064726 100644 --- a/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus.go +++ b/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus.go @@ -1,9 +1,10 @@ package hcpstatus import ( + "bytes" "context" + "encoding/json" "fmt" - "reflect" hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" "github.com/openshift/hypershift/control-plane-operator/hostedclusterconfigoperator/operator" @@ -11,6 +12,7 @@ import ( configv1 "github.com/openshift/api/config/v1" + "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -74,17 +76,76 @@ func (h *hcpStatusReconciler) Reconcile(ctx context.Context, req reconcile.Reque return reconcile.Result{}, err } - if !reflect.DeepEqual(hcp.Status, originalHCP.Status) { - log.Info("Updating HCP status with new configuration and version status") - if err := h.mgtClusterClient.Status().Update(ctx, hcp); err != nil { - return reconcile.Result{}, fmt.Errorf("failed to update hcp: %w", err) - } - log.Info("Successfully updated HCP status") + if equality.Semantic.DeepEqual(hcp.Status, originalHCP.Status) { + return reconcile.Result{}, nil + } + + // Use JSON Patch (RFC 6902) instead of JSON Merge Patch (RFC 7386). + // Merge patch interprets null as "delete field" (RFC 7386 §7), which corrupts + // +required +nullable fields in configv1.ClusterVersionStatus that have no + // omitempty and serialize nil as JSON null: + // - AvailableUpdates []configv1.Release `json:"availableUpdates"` — nil slice → null + // - CompletionTime *metav1.Time `json:"completionTime"` (in UpdateHistory) — nil pointer → null + // JSON Patch "replace"/"add" ops carry null as a literal value, preserving it correctly. + patch, err := buildStatusPatch(originalHCP, hcp) + if err != nil { + return reconcile.Result{}, fmt.Errorf("failed to build status patch: %w", err) + } + + log.Info("Patching HCP status with new configuration and version status") + if err := h.mgtClusterClient.Status().Patch(ctx, hcp, + crclient.RawPatch(types.JSONPatchType, patch)); err != nil { + return reconcile.Result{}, fmt.Errorf("failed to patch hcp status: %w", err) } + log.Info("Successfully patched HCP status") return reconcile.Result{}, nil } +type jsonPatchOp struct { + Op string `json:"op"` + Path string `json:"path"` + Value interface{} `json:"value,omitempty"` +} + +func buildStatusPatch(original, modified *hyperv1.HostedControlPlane) ([]byte, error) { + origJSON, err := json.Marshal(original.Status) + if err != nil { + return nil, err + } + modJSON, err := json.Marshal(modified.Status) + if err != nil { + return nil, err + } + + var origMap, modMap map[string]json.RawMessage + if err := json.Unmarshal(origJSON, &origMap); err != nil { + return nil, err + } + if err := json.Unmarshal(modJSON, &modMap); err != nil { + return nil, err + } + + // Optimistic lock: fail if the object was modified since our read. + ops := []jsonPatchOp{{Op: "test", Path: "/metadata/resourceVersion", Value: original.ResourceVersion}} + + for key, modVal := range modMap { + origVal, exists := origMap[key] + if !exists { + ops = append(ops, jsonPatchOp{Op: "add", Path: "/status/" + key, Value: modVal}) + } else if !bytes.Equal(origVal, modVal) { + ops = append(ops, jsonPatchOp{Op: "replace", Path: "/status/" + key, Value: modVal}) + } + } + for key := range origMap { + if _, exists := modMap[key]; !exists { + ops = append(ops, jsonPatchOp{Op: "remove", Path: "/status/" + key}) + } + } + + return json.Marshal(ops) +} + // findClusterOperatorStatusCondition is identical to meta.FindStatusCondition except that it works on config1.ClusterOperatorStatusCondition instead of // metav1.StatusCondition func findClusterOperatorStatusCondition(conditions []configv1.ClusterOperatorStatusCondition, conditionType configv1.ClusterStatusConditionType) *configv1.ClusterOperatorStatusCondition { diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus_test.go b/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus_test.go index 5b391355d313..73655f694287 100644 --- a/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus_test.go +++ b/control-plane-operator/hostedclusterconfigoperator/controllers/hcpstatus/hcpstatus_test.go @@ -1,6 +1,7 @@ package hcpstatus import ( + "encoding/json" "testing" . "github.com/onsi/gomega" @@ -209,3 +210,332 @@ func TestHCPStatusReconciler(t *testing.T) { }) } } + +func TestBuildStatusPatch(t *testing.T) { + t.Parallel() + + t.Run("When versionStatus changes, it should produce a replace op with optimistic lock", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + VersionStatus: &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.16.0", Image: "quay.io/old:latest"}, + }, + }, + } + original := hcp.DeepCopy() + + hcp.Status.VersionStatus = &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.17.0", Image: "quay.io/new:latest"}, + } + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(2)) + + g.Expect(ops[0].Op).To(Equal("test")) + g.Expect(ops[0].Path).To(Equal("/metadata/resourceVersion")) + g.Expect(ops[0].Value).To(Equal("100")) + + g.Expect(ops[1].Op).To(Equal("replace")) + g.Expect(ops[1].Path).To(Equal("/status/versionStatus")) + }) + + t.Run("When versionStatus is added for the first time, it should produce an add op", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + } + original := hcp.DeepCopy() + + hcp.Status.VersionStatus = &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.17.0", Image: "quay.io/test:latest"}, + } + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(2)) + g.Expect(ops[1].Op).To(Equal("add")) + g.Expect(ops[1].Path).To(Equal("/status/versionStatus")) + }) + + t.Run("When nothing changes, it should produce only the test op", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + Conditions: []metav1.Condition{ + {Type: "ConditionA", Status: metav1.ConditionTrue, Reason: "OK"}, + }, + }, + } + original := hcp.DeepCopy() + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(1)) + g.Expect(ops[0].Op).To(Equal("test")) + g.Expect(ops[0].Path).To(Equal("/metadata/resourceVersion")) + }) + + t.Run("When conditions change, it should replace the entire conditions array", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + Conditions: []metav1.Condition{ + {Type: "ConditionA", Status: metav1.ConditionTrue, Reason: "OK"}, + {Type: "ConditionB", Status: metav1.ConditionTrue, Reason: "OK"}, + }, + }, + } + original := hcp.DeepCopy() + + meta.SetStatusCondition(&hcp.Status.Conditions, metav1.Condition{ + Type: "ConditionA", + Status: metav1.ConditionFalse, + Reason: "NowBad", + }) + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(2)) + g.Expect(ops[1].Op).To(Equal("replace")) + g.Expect(ops[1].Path).To(Equal("/status/conditions")) + + conditionsJSON, err := json.Marshal(ops[1].Value) + g.Expect(err).ToNot(HaveOccurred()) + var conditions []metav1.Condition + g.Expect(json.Unmarshal(conditionsJSON, &conditions)).To(Succeed()) + g.Expect(conditions).To(HaveLen(2), "all conditions from the read should be in the patch") + }) + + t.Run("When versionStatus has nil availableUpdates and nil completionTime, the patch should preserve null values", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + } + original := hcp.DeepCopy() + + startedTime := metav1.Now() + hcp.Status.VersionStatus = &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.17.0", Image: "quay.io/test:latest"}, + History: []configv1.UpdateHistory{ + { + State: configv1.PartialUpdate, + StartedTime: startedTime, + // CompletionTime is nil — update in progress + Version: "4.17.0", + Image: "quay.io/test:latest", + }, + }, + // AvailableUpdates is nil — CVO hasn't checked yet + } + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + // Parse the raw JSON to verify null handling. + // JSON Patch (RFC 6902) uses "value": null to mean "set to null", + // unlike JSON Merge Patch (RFC 7386) where null means "delete". + var rawOps []json.RawMessage + g.Expect(json.Unmarshal(patchBytes, &rawOps)).To(Succeed()) + g.Expect(rawOps).To(HaveLen(2)) + + // Parse the versionStatus op's value to check null fields + var op struct { + Op string `json:"op"` + Path string `json:"path"` + Value json.RawMessage `json:"value"` + } + g.Expect(json.Unmarshal(rawOps[1], &op)).To(Succeed()) + g.Expect(op.Op).To(Equal("add")) + g.Expect(op.Path).To(Equal("/status/versionStatus")) + + var vs map[string]interface{} + g.Expect(json.Unmarshal(op.Value, &vs)).To(Succeed()) + + // availableUpdates should be null (nil slice serializes as null with no omitempty) + au, ok := vs["availableUpdates"] + g.Expect(ok).To(BeTrue(), "availableUpdates key must be present in the patch") + g.Expect(au).To(BeNil(), "availableUpdates should be null — JSON Patch preserves this correctly") + + // completionTime in history should be null (nil *metav1.Time) + history := vs["history"].([]interface{}) + entry := history[0].(map[string]interface{}) + ct, ok := entry["completionTime"] + g.Expect(ok).To(BeTrue(), "completionTime key must be present in the patch") + g.Expect(ct).To(BeNil(), "completionTime should be null — JSON Patch preserves this correctly") + }) + + t.Run("When configuration changes, it should produce a replace op", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + Configuration: &hyperv1.ConfigurationStatus{}, + }, + } + original := hcp.DeepCopy() + + hcp.Status.Configuration = &hyperv1.ConfigurationStatus{ + Authentication: configv1.AuthenticationStatus{ + IntegratedOAuthMetadata: configv1.ConfigMapNameReference{Name: "oauth-metadata"}, + }, + } + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(2)) + g.Expect(ops[1].Op).To(Equal("replace")) + g.Expect(ops[1].Path).To(Equal("/status/configuration")) + }) + + t.Run("When versionStatus is removed, it should produce a remove op", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + VersionStatus: &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.17.0", Image: "quay.io/test:latest"}, + }, + }, + } + original := hcp.DeepCopy() + + hcp.Status.VersionStatus = nil + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + g.Expect(ops).To(HaveLen(2)) + g.Expect(ops[0].Op).To(Equal("test")) + g.Expect(ops[1].Op).To(Equal("remove")) + g.Expect(ops[1].Path).To(Equal("/status/versionStatus")) + }) + + t.Run("When multiple fields change, it should produce ops for each changed field", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test-ns", + ResourceVersion: "100", + }, + Status: hyperv1.HostedControlPlaneStatus{ + VersionStatus: &hyperv1.ClusterVersionStatus{ + Desired: configv1.Release{Version: "4.16.0"}, + }, + Conditions: []metav1.Condition{ + {Type: "ConditionA", Status: metav1.ConditionTrue, Reason: "OK"}, + }, + Configuration: &hyperv1.ConfigurationStatus{}, + Version: "4.16.0", + ReleaseImage: "quay.io/old:latest", + }, + } + original := hcp.DeepCopy() + + hcp.Status.VersionStatus.Desired.Version = "4.17.0" + meta.SetStatusCondition(&hcp.Status.Conditions, metav1.Condition{ + Type: "ConditionA", Status: metav1.ConditionFalse, Reason: "Bad", + }) + hcp.Status.Configuration = &hyperv1.ConfigurationStatus{ + Authentication: configv1.AuthenticationStatus{ + IntegratedOAuthMetadata: configv1.ConfigMapNameReference{Name: "new"}, + }, + } + hcp.Status.Version = "4.17.0" + hcp.Status.ReleaseImage = "quay.io/new:latest" + + patchBytes, err := buildStatusPatch(original, hcp) + g.Expect(err).ToNot(HaveOccurred()) + + var ops []jsonPatchOp + g.Expect(json.Unmarshal(patchBytes, &ops)).To(Succeed()) + + // test + versionStatus + conditions + configuration + version + releaseImage + g.Expect(ops).To(HaveLen(6)) + + paths := make([]string, len(ops)) + for i, op := range ops { + paths[i] = op.Path + } + g.Expect(paths).To(ContainElements( + "/metadata/resourceVersion", + "/status/versionStatus", + "/status/conditions", + "/status/configuration", + "/status/version", + "/status/releaseImage", + )) + }) +} diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go b/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go index 8e4170836d68..8356918defa3 100644 --- a/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go +++ b/control-plane-operator/hostedclusterconfigoperator/controllers/resources/resources.go @@ -570,12 +570,13 @@ func (r *reconciler) reconcileClusterRecovery(ctx context.Context, log logr.Logg condition.Message = "Hosted cluster recovery finished" } + originalHCP := hcp.DeepCopy() meta.SetStatusCondition(&hcp.Status.Conditions, *condition) log.Info("setting condition", "type", condition.Type, "status", condition.Status, "message", condition.Message) - if err := r.cpClient.Status().Update(ctx, hcp); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update status on hcp for hosted cluster recovery: %w. Condition error message: %v", err, condition.Message) + if err := r.cpClient.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to patch status on hcp for hosted cluster recovery: %w. Condition error message: %v", err, condition.Message) } - log.Info("successfully updated hcp status with recovery condition") + log.Info("successfully patched hcp status with recovery condition") if !finished { return ctrl.Result{RequeueAfter: 120 * time.Second}, nil @@ -1705,8 +1706,8 @@ func (r *reconciler) patchHCPStatusCondition(ctx context.Context, hcp *hyperv1.H if !meta.SetStatusCondition(&hcp.Status.Conditions, *condition) { return nil // No status change; avoid unnecessary API call. } - if err := r.cpClient.Status().Patch(ctx, hcp, client.MergeFrom(originalHCP)); err != nil { - return fmt.Errorf("failed to update HostedControlPlane status with %s condition: %w", condition.Type, err) + if err := r.cpClient.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { + return fmt.Errorf("failed to patch HostedControlPlane status with %s condition: %w", condition.Type, err) } log.Info(string(condition.Type) + " condition updated") return nil @@ -2746,8 +2747,8 @@ func (r *reconciler) destroyCloudResources(ctx context.Context, hcp *hyperv1.Hos meta.SetStatusCondition(&hcp.Status.Conditions, *resourcesDestroyedCond) if !equality.Semantic.DeepEqual(hcp, originalHCP) { - if err := r.cpClient.Status().Update(ctx, hcp); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set resources destroyed condition: %w", err) + if err := r.cpClient.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to patch resources destroyed condition: %w", err) } } diff --git a/hypershift-operator/controllers/etcdbackup/reconciler.go b/hypershift-operator/controllers/etcdbackup/reconciler.go index bd665bdda2db..6c8e4f8672b2 100644 --- a/hypershift-operator/controllers/etcdbackup/reconciler.go +++ b/hypershift-operator/controllers/etcdbackup/reconciler.go @@ -299,9 +299,10 @@ func (r *HCPEtcdBackupReconciler) setCondition(backup *hyperv1.HCPEtcdBackup, co // updateHCPBackupCondition sets a condition on the HostedControlPlane to bubble // up the etcd backup status. The HC controller propagates this to the HostedCluster. func (r *HCPEtcdBackupReconciler) updateHCPBackupCondition(ctx context.Context, hcp *hyperv1.HostedControlPlane, condition metav1.Condition) error { + originalHCP := hcp.DeepCopy() condition.ObservedGeneration = hcp.Generation meta.SetStatusCondition(&hcp.Status.Conditions, condition) - return r.Status().Update(ctx, hcp) + return r.Status().Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})) } // updateHostedClusterBackupURL persists the snapshot URL in the HostedCluster diff --git a/hypershift-operator/controllers/etcdbackup/reconciler_test.go b/hypershift-operator/controllers/etcdbackup/reconciler_test.go index d585b24c5241..7f15a5223ed6 100644 --- a/hypershift-operator/controllers/etcdbackup/reconciler_test.go +++ b/hypershift-operator/controllers/etcdbackup/reconciler_test.go @@ -1786,3 +1786,78 @@ func TestGetSnapshotURLFromPod(t *testing.T) { }) } } + +func TestUpdateHCPBackupCondition(t *testing.T) { + t.Parallel() + + t.Run("When patching a backup condition, it should not stomp unrelated status fields", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := newHostedControlPlane() + hcp.Generation = 3 + hcp.Status.AutoNode = hyperv1.AutoNodeStatus{ + VCPUs: ptr.To[int32](8), + } + + r := newReconciler(hcp) + + err := r.updateHCPBackupCondition(t.Context(), hcp, metav1.Condition{ + Type: string(hyperv1.EtcdBackupSucceeded), + Status: metav1.ConditionTrue, + Reason: "BackupComplete", + Message: "backup finished", + }) + g.Expect(err).ToNot(HaveOccurred()) + + var updated hyperv1.HostedControlPlane + g.Expect(r.Get(t.Context(), types.NamespacedName{ + Name: testHCPName, + Namespace: testHCPNamespace, + }, &updated)).To(Succeed()) + + cond := meta.FindStatusCondition(updated.Status.Conditions, string(hyperv1.EtcdBackupSucceeded)) + g.Expect(cond).ToNot(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) + g.Expect(cond.Reason).To(Equal("BackupComplete")) + g.Expect(cond.ObservedGeneration).To(Equal(int64(3))) + + g.Expect(updated.Status.AutoNode.VCPUs).To(Equal(ptr.To[int32](8)), + "autoNode should be preserved — patch must not stomp unrelated fields") + }) + + t.Run("When patching a backup condition onto an HCP with existing conditions, it should preserve them", func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + hcp := newHostedControlPlane() + hcp.Status.Conditions = []metav1.Condition{ + { + Type: string(hyperv1.ClusterVersionAvailable), + Status: metav1.ConditionTrue, + Reason: "OK", + }, + } + + r := newReconciler(hcp) + + err := r.updateHCPBackupCondition(t.Context(), hcp, metav1.Condition{ + Type: string(hyperv1.EtcdBackupSucceeded), + Status: metav1.ConditionTrue, + Reason: "BackupComplete", + Message: "backup finished", + }) + g.Expect(err).ToNot(HaveOccurred()) + + var updated hyperv1.HostedControlPlane + g.Expect(r.Get(t.Context(), types.NamespacedName{ + Name: testHCPName, + Namespace: testHCPNamespace, + }, &updated)).To(Succeed()) + + g.Expect(updated.Status.Conditions).To(HaveLen(2), + "both the existing condition and the new backup condition should be present") + g.Expect(meta.FindStatusCondition(updated.Status.Conditions, string(hyperv1.ClusterVersionAvailable))).ToNot(BeNil()) + g.Expect(meta.FindStatusCondition(updated.Status.Conditions, string(hyperv1.EtcdBackupSucceeded))).ToNot(BeNil()) + }) +} diff --git a/ignition-server/controllers/local_ignitionprovider.go b/ignition-server/controllers/local_ignitionprovider.go index 7a17217734d7..98fcbee3ede9 100644 --- a/ignition-server/controllers/local_ignitionprovider.go +++ b/ignition-server/controllers/local_ignitionprovider.go @@ -765,6 +765,7 @@ func (r *LocalIgnitionProvider) reconcileValidReleaseInfoCondition(ctx context.C } hostedControlPlane := hcpList.Items[0] + originalHCP := hostedControlPlane.DeepCopy() if len(releaseImageProvider.GetMissingImages()) == 0 { meta.SetStatusCondition(&hostedControlPlane.Status.Conditions, metav1.Condition{ @@ -784,7 +785,7 @@ func (r *LocalIgnitionProvider) reconcileValidReleaseInfoCondition(ctx context.C }) } - return r.Client.Status().Update(ctx, &hostedControlPlane) + return r.Client.Status().Patch(ctx, &hostedControlPlane, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})) } // copyFile copies a file named src to dst, preserving attributes. diff --git a/test/e2e/karpenter_test.go b/test/e2e/karpenter_test.go index e93c189a4b2f..fc42580b5248 100644 --- a/test/e2e/karpenter_test.go +++ b/test/e2e/karpenter_test.go @@ -1428,6 +1428,7 @@ func testBillingConsolidationAndPDB(ctx context.Context, mgtClient, guestClient // Before any Karpenter nodes are provisioned, Karpenter vCPUs should be 0. waitForAutoNodeStatusVCPUs(t, ctx, mgtClient, hostedCluster, 0) + waitForAutoNodeStatusVCPUsStable(t, ctx, mgtClient, hostedCluster, 0, 30*time.Second) baseline, found := getVCPUsMetric(t, ctx, mgtClient, hostedCluster) g.Expect(found).To(BeTrue(), "billing metric should exist before Karpenter nodes are provisioned") @@ -1451,6 +1452,7 @@ func testBillingConsolidationAndPDB(ctx context.Context, mgtClient, guestClient // t3.xlarge = 4 vCPUs; 2 nodes = 8 Karpenter vCPUs on top of baseline waitForAutoNodeStatusVCPUs(t, ctx, mgtClient, hostedCluster, 8) + waitForAutoNodeStatusVCPUsStable(t, ctx, mgtClient, hostedCluster, 8, 30*time.Second) waitForBillingMetricVCPUs(t, ctx, mgtClient, hostedCluster, baseline+8) t.Logf("Scaling workload to 1 replica to verify deprovisioning and consolidation") @@ -1464,6 +1466,7 @@ func testBillingConsolidationAndPDB(ctx context.Context, mgtClient, guestClient // t3.xlarge = 4 vCPUs; 1 node = 4 Karpenter vCPUs on top of baseline waitForAutoNodeStatusVCPUs(t, ctx, mgtClient, hostedCluster, 4) + waitForAutoNodeStatusVCPUsStable(t, ctx, mgtClient, hostedCluster, 4, 30*time.Second) waitForBillingMetricVCPUs(t, ctx, mgtClient, hostedCluster, baseline+4) // Create a blocking PDB and leave everything dangling so cluster teardown @@ -1651,6 +1654,22 @@ func waitForAutoNodeStatusVCPUs(t *testing.T, ctx context.Context, mgtClient crc ) } +// waitForAutoNodeStatusVCPUsStable asserts that AutoNode.VCPUs does not flap +// away from the expected value over the given duration. +func waitForAutoNodeStatusVCPUsStable(t *testing.T, ctx context.Context, mgtClient crclient.Client, hostedCluster *hyperv1.HostedCluster, expected int32, duration time.Duration) { + t.Helper() + g := NewWithT(t) + + t.Logf("Asserting AutoNode.VCPUs remains stable at %d for %s", expected, duration) + g.Consistently(func(g Gomega) { + hc := &hyperv1.HostedCluster{} + g.Expect(mgtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hc)).To(Succeed()) + g.Expect(hc.Status.AutoNode.VCPUs).NotTo(BeNil(), "AutoNode.VCPUs became nil") + g.Expect(*hc.Status.AutoNode.VCPUs).To(Equal(expected)) + }).WithTimeout(duration).WithPolling(2 * time.Second).Should(Succeed()) + t.Logf("AutoNode.VCPUs remained stable at %d for %s", expected, duration) +} + // waitForBillingMetricVCPUs polls until the hypershift_cluster_vcpus metric // converges to the expected total (native + Karpenter). func waitForBillingMetricVCPUs(t *testing.T, ctx context.Context, mgtClient crclient.Client, hostedCluster *hyperv1.HostedCluster, expectedTotal int32) {