From 7ea78aff3c38002c2bce2962a34e79afdd3c13c6 Mon Sep 17 00:00:00 2001 From: r4mek Date: Mon, 16 Feb 2026 12:53:05 +0530 Subject: [PATCH 1/5] unnecessary machine creation fix --- pkg/controller/machineset.go | 65 ++++++++++--------------- pkg/util/provider/machineutils/utils.go | 2 +- 2 files changed, 26 insertions(+), 41 deletions(-) diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index 6a5701e3e..8e33c9a40 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -335,51 +335,16 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 return nil } - var activeMachines, staleMachines, machinesWithUpdateSuccessfulLabel []*v1alpha1.Machine + var machinesWithUpdateSuccessfulLabelCount int for _, m := range allMachines { // Skip machines that are in the process of being updated. if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] == v1alpha1.LabelValueNodeUpdateSuccessful { klog.V(3).Infof("Ignoring machine %s moved to new machine set during inplace update", m.Name) - machinesWithUpdateSuccessfulLabel = append(machinesWithUpdateSuccessfulLabel, m) - continue - } - - if machineutils.IsMachineFailed(m) || machineutils.IsMachineTriggeredForDeletion(m) { - staleMachines = append(staleMachines, m) - } else if machineutils.IsMachineActive(m) { - activeMachines = append(activeMachines, m) - } - } - - if len(staleMachines) >= 1 { - klog.V(3).Infof("Deleting stale machines %s", getMachineKeys(staleMachines)) - } - if err := c.terminateMachines(ctx, staleMachines, machineSet); err != nil { - // TODO: proper error handling needs to happen here - klog.Errorf("failed to terminate stale machines for machineset %s: %v", machineSet.Name, err) - } - - diff := len(activeMachines) - int(machineSet.Spec.Replicas) - // Removing the "update successful" label from a machine and scaling up the MachineSet are two separate operations. - // If the MachineSet is scaled up first and the "update successful" label is removed later, this controller might - // incorrectly assume that additional machines need to be created, as it ignores machines with the "update successful" label. - // Conversely, if the "update successful" label is removed first and the MachineSet is scaled up afterward, this controller - // might incorrectly assume that more machines need to be deleted, as machines without the label will be counted as active. - // To address this, we first check if the difference between the current active replicas and the desired replicas is negative. - // If it is, we then check if the difference plus the number of machines with the "update successful" label is greater than or equal to 0. - // In such cases, we set the difference to 0 because it is unclear whether the machine deployment controller still needs to increase - // the replicas of the MachineSet. Once the machine deployment controller removes the "update successful" label, the ambiguity will - // be resolved, and normal operations can proceed. This approach ensures that the controller does not create additional machines - // if the MachineSet is scaled up before the "update successful" label is removed. - if diff < 0 { - if diff+len(machinesWithUpdateSuccessfulLabel) >= 0 { - return nil + machinesWithUpdateSuccessfulLabelCount++ } - diff += len(machinesWithUpdateSuccessfulLabel) } - klog.V(4).Infof("Difference between current active replicas and desired replicas - %d", diff) - + diff := len(allMachines) - machinesWithUpdateSuccessfulLabelCount - int(machineSet.Spec.Replicas) if diff < 0 { // If MachineSet is frozen and no deletion timestamp, don't process it if machineSet.Labels["freeze"] == "True" && machineSet.DeletionTimestamp == nil { @@ -457,8 +422,8 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 } klog.V(2).Infof("Too many replicas for %v %s/%s, need %d, deleting %d", machineSet.Kind, machineSet.Namespace, machineSet.Name, (machineSet.Spec.Replicas), diff) - logMachinesWithPriority1(activeMachines) - machinesToDelete := getMachinesToDelete(activeMachines, diff) + logMachinesWithPriority1(allMachines) + machinesToDelete := getMachinesToDelete(allMachines, diff) logMachinesToDelete(machinesToDelete) // Snapshot the UIDs (ns/name) of the machines we're expecting to see @@ -478,6 +443,26 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 } } + var staleMachines []*v1alpha1.Machine + for _, m := range allMachines { + // Skip machines that are in the process of being updated. + if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] == v1alpha1.LabelValueNodeUpdateSuccessful { + continue + } + + if machineutils.IsMachineFailed(m) { + staleMachines = append(staleMachines, m) + } + } + + if len(staleMachines) >= 1 { + klog.V(3).Infof("Deleting stale machines %s", getMachineKeys(staleMachines)) + } + if err := c.terminateMachines(ctx, staleMachines, machineSet); err != nil { + // TODO: proper error handling needs to happen here + klog.Errorf("failed to terminate stale machines for machineset %s: %v", machineSet.Name, err) + } + return nil } diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index 31ed3cf89..460c0f9ff 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -123,5 +123,5 @@ func IsMachineFailed(p *v1alpha1.Machine) bool { // IsMachineTriggeredForDeletion checks if machine was triggered for deletion func IsMachineTriggeredForDeletion(m *v1alpha1.Machine) bool { - return m.Annotations[MachinePriority] == "1" || m.Annotations[TriggerDeletionByMCM] == "true" + return m.Annotations[MachinePriority] == "1" } From f36f38106150a04c5358d0a538fb94642f0e4ed5 Mon Sep 17 00:00:00 2001 From: r4mek Date: Mon, 16 Feb 2026 13:37:49 +0530 Subject: [PATCH 2/5] nit --- pkg/controller/machineset.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index 8e33c9a40..1607a4d5c 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -457,10 +457,10 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 if len(staleMachines) >= 1 { klog.V(3).Infof("Deleting stale machines %s", getMachineKeys(staleMachines)) - } - if err := c.terminateMachines(ctx, staleMachines, machineSet); err != nil { - // TODO: proper error handling needs to happen here - klog.Errorf("failed to terminate stale machines for machineset %s: %v", machineSet.Name, err) + if err := c.terminateMachines(ctx, staleMachines, machineSet); err != nil { + // TODO: proper error handling needs to happen here + klog.Errorf("failed to terminate stale machines for machineset %s: %v", machineSet.Name, err) + } } return nil From 3eeb7887e4271d1e6e573a99a01c19c3843003a5 Mon Sep 17 00:00:00 2001 From: r4mek Date: Tue, 17 Feb 2026 13:53:57 +0530 Subject: [PATCH 3/5] handling in-place update scenarios --- pkg/controller/machineset.go | 52 +++++++++++++++++-------------- pkg/controller/machineset_test.go | 9 +++--- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index 1607a4d5c..95ed38a24 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -335,17 +335,19 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 return nil } - var machinesWithUpdateSuccessfulLabelCount int + allMachinesDiff := len(allMachines) - int(machineSet.Spec.Replicas) + var machinesWithoutUpdateSuccessfulLabel []*v1alpha1.Machine for _, m := range allMachines { - // Skip machines that are in the process of being updated. - if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] == v1alpha1.LabelValueNodeUpdateSuccessful { - klog.V(3).Infof("Ignoring machine %s moved to new machine set during inplace update", m.Name) - machinesWithUpdateSuccessfulLabelCount++ + if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] != v1alpha1.LabelValueNodeUpdateSuccessful { + machinesWithoutUpdateSuccessfulLabel = append(machinesWithoutUpdateSuccessfulLabel, m) } } + machinesWithoutUpdateSuccessfulLabelDiff := len(machinesWithoutUpdateSuccessfulLabel) - int(machineSet.Spec.Replicas) - diff := len(allMachines) - machinesWithUpdateSuccessfulLabelCount - int(machineSet.Spec.Replicas) - if diff < 0 { + // During in-place updates, ScaleUps are disabled in the oldMachineSet and + // in newMachineSet, its ReplicaCount would never increase before a machine from oldMahineSet is moved. + // So no need for any special case during in-place updates. + if allMachinesDiff < 0 { // If MachineSet is frozen and no deletion timestamp, don't process it if machineSet.Labels["freeze"] == "True" && machineSet.DeletionTimestamp == nil { klog.V(2).Infof("MachineSet %q is frozen, and hence not processing", machineSet.Name) @@ -359,20 +361,20 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 return nil } - diff *= -1 - if diff > BurstReplicas { - diff = BurstReplicas + allMachinesDiff *= -1 + if allMachinesDiff > BurstReplicas { + allMachinesDiff = BurstReplicas } // TODO: Track UIDs of creates just like deletes. The problem currently // is we'd need to wait on the result of a create to record the machine's // UID, which would require locking *across* the create, which will turn // into a performance bottleneck. We should generate a UID for the machine // beforehand and store it via ExpectCreations. - if err := c.expectations.ExpectCreations(machineSetKey, diff); err != nil { + if err := c.expectations.ExpectCreations(machineSetKey, allMachinesDiff); err != nil { // TODO: proper error handling needs to happen here klog.Errorf("failed expect creations for machineset %s: %v", machineSet.Name, err) } - klog.V(2).Infof("Too few replicas for MachineSet %s, need %d, creating %d", machineSet.Name, (machineSet.Spec.Replicas), diff) + klog.V(2).Infof("Too few replicas for MachineSet %s, need %d, creating %d", machineSet.Name, (machineSet.Spec.Replicas), allMachinesDiff) // Batch the machine creates. Batch sizes start at SlowStartInitialBatchSize // and double with each successful iteration in a kind of "slow start". // This handles attempts to start large numbers of machines that would @@ -381,7 +383,7 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 // prevented from spamming the API service with the machine create requests // after one of its machines fails. Conveniently, this also prevents the // event spam that those failures would generate. - successfulCreations, err := slowStartBatch(diff, SlowStartInitialBatchSize, func() error { + successfulCreations, err := slowStartBatch(allMachinesDiff, SlowStartInitialBatchSize, func() error { boolPtr := func(b bool) *bool { return &b } controllerRef := &metav1.OwnerReference{ APIVersion: controllerKindMachineSet.GroupVersion().String(), // #ToCheck @@ -408,7 +410,7 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 // Any skipped machines that we never attempted to start shouldn't be expected. // The skipped machines will be retried later. The next controller resync will // retry the slow start process. - if skippedMachines := diff - successfulCreations; skippedMachines > 0 { + if skippedMachines := allMachinesDiff - successfulCreations; skippedMachines > 0 { klog.V(2).Infof("Slow-start failure. Skipping creation of %d machines, decrementing expectations for %v %v/%v", skippedMachines, machineSet.Kind, machineSet.Namespace, machineSet.Name) for range skippedMachines { // Decrement the expected number of creates because the informer won't observe this machine @@ -416,14 +418,18 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 } } return err - } else if diff > 0 { - if diff > BurstReplicas { - diff = BurstReplicas - } - klog.V(2).Infof("Too many replicas for %v %s/%s, need %d, deleting %d", machineSet.Kind, machineSet.Namespace, machineSet.Name, (machineSet.Spec.Replicas), diff) - - logMachinesWithPriority1(allMachines) - machinesToDelete := getMachinesToDelete(allMachines, diff) + } else if machinesWithoutUpdateSuccessfulLabelDiff > 0 { + // During in-place updates, in the newMachineSet, it can happen that a machine has come from the oldMachineSet + // but the ReplicaCount of newMachineSet has not increased yet. + // In such cases, we should not delete the machine immediately, + // otherwise it can cause unnecessary machine deletion and creation during in-place updates. + if machinesWithoutUpdateSuccessfulLabelDiff > BurstReplicas { + machinesWithoutUpdateSuccessfulLabelDiff = BurstReplicas + } + klog.V(2).Infof("Too many replicas for %v %s/%s, need %d, deleting %d", machineSet.Kind, machineSet.Namespace, machineSet.Name, (machineSet.Spec.Replicas), machinesWithoutUpdateSuccessfulLabelDiff) + + logMachinesWithPriority1(machinesWithoutUpdateSuccessfulLabel) + machinesToDelete := getMachinesToDelete(machinesWithoutUpdateSuccessfulLabel, machinesWithoutUpdateSuccessfulLabelDiff) logMachinesToDelete(machinesToDelete) // Snapshot the UIDs (ns/name) of the machines we're expecting to see @@ -444,7 +450,7 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 } var staleMachines []*v1alpha1.Machine - for _, m := range allMachines { + for _, m := range machinesWithoutUpdateSuccessfulLabel { // Skip machines that are in the process of being updated. if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] == v1alpha1.LabelValueNodeUpdateSuccessful { continue diff --git a/pkg/controller/machineset_test.go b/pkg/controller/machineset_test.go index a9fbc9d8c..88d847017 100644 --- a/pkg/controller/machineset_test.go +++ b/pkg/controller/machineset_test.go @@ -844,23 +844,24 @@ var _ = Describe("machineset", func() { Expect(Err).Should(BeNil()) }) - It("should delete MachinePriority=1 machines and spawn replacement machine", func() { + It("should delete MachinePriority=1 machines", func() { stop := make(chan struct{}) defer close(stop) objects := []runtime.Object{} staleMachine := testActiveMachine1.DeepCopy() staleMachine.Annotations[machineutils.MachinePriority] = "1" + testActiveMachine4.Status.CurrentStatus.Phase = MachineRunning - objects = append(objects, testMachineSet, staleMachine, testActiveMachine2, testActiveMachine3) + objects = append(objects, testMachineSet, staleMachine, testActiveMachine2, testActiveMachine3, testActiveMachine4) c, trackers := createController(stop, testNamespace, objects, nil, nil) defer trackers.Stop() waitForCacheSync(stop, c) machines, _ := c.controlMachineClient.Machines(testNamespace).List(context.TODO(), metav1.ListOptions{}) - Expect(len(machines.Items)).To(Equal(int(testMachineSet.Spec.Replicas))) + Expect(len(machines.Items)).To(Equal(int(testMachineSet.Spec.Replicas + 1))) - beforeMachines := []*machinev1.Machine{staleMachine, testActiveMachine2, testActiveMachine3} + beforeMachines := []*machinev1.Machine{staleMachine, testActiveMachine2, testActiveMachine3, testActiveMachine4} err := c.manageReplicas(context.TODO(), beforeMachines, testMachineSet) Expect(err).Should(BeNil()) waitForCacheSync(stop, c) From 266657a47cec48968a5ced121056a08cc8d424b7 Mon Sep 17 00:00:00 2001 From: r4mek Date: Tue, 17 Feb 2026 15:58:50 +0530 Subject: [PATCH 4/5] addressing review comments --- pkg/controller/machineset.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index 95ed38a24..dd5fdc789 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -345,7 +345,7 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 machinesWithoutUpdateSuccessfulLabelDiff := len(machinesWithoutUpdateSuccessfulLabel) - int(machineSet.Spec.Replicas) // During in-place updates, ScaleUps are disabled in the oldMachineSet and - // in newMachineSet, its ReplicaCount would never increase before a machine from oldMahineSet is moved. + // in newMachineSet, its ReplicaCount would never increase before a machine from oldMachineSet is moved. // So no need for any special case during in-place updates. if allMachinesDiff < 0 { // If MachineSet is frozen and no deletion timestamp, don't process it @@ -451,11 +451,6 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 var staleMachines []*v1alpha1.Machine for _, m := range machinesWithoutUpdateSuccessfulLabel { - // Skip machines that are in the process of being updated. - if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] == v1alpha1.LabelValueNodeUpdateSuccessful { - continue - } - if machineutils.IsMachineFailed(m) { staleMachines = append(staleMachines, m) } From 01b87dd65847be0035282d4680565c539d95b389 Mon Sep 17 00:00:00 2001 From: r4mek Date: Tue, 17 Feb 2026 16:16:01 +0530 Subject: [PATCH 5/5] follow up for review comments --- pkg/controller/machineset.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller/machineset.go b/pkg/controller/machineset.go index dd5fdc789..acd41e80e 100644 --- a/pkg/controller/machineset.go +++ b/pkg/controller/machineset.go @@ -335,13 +335,13 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1 return nil } - allMachinesDiff := len(allMachines) - int(machineSet.Spec.Replicas) var machinesWithoutUpdateSuccessfulLabel []*v1alpha1.Machine for _, m := range allMachines { if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] != v1alpha1.LabelValueNodeUpdateSuccessful { machinesWithoutUpdateSuccessfulLabel = append(machinesWithoutUpdateSuccessfulLabel, m) } } + allMachinesDiff := len(allMachines) - int(machineSet.Spec.Replicas) machinesWithoutUpdateSuccessfulLabelDiff := len(machinesWithoutUpdateSuccessfulLabel) - int(machineSet.Spec.Replicas) // During in-place updates, ScaleUps are disabled in the oldMachineSet and