Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 43 additions & 57 deletions pkg/controller/machineset.go
Original file line number Diff line number Diff line change
Expand Up @@ -335,52 +335,19 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1
return nil
}

var activeMachines, staleMachines, machinesWithUpdateSuccessfulLabel []*v1alpha1.Machine
var machinesWithoutUpdateSuccessfulLabel []*v1alpha1.Machine
for _, m := range allMachines {
// Skip machines that are in the process of being updated.
if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] == v1alpha1.LabelValueNodeUpdateSuccessful {
klog.V(3).Infof("Ignoring machine %s moved to new machine set during inplace update", m.Name)
machinesWithUpdateSuccessfulLabel = append(machinesWithUpdateSuccessfulLabel, m)
continue
}

if machineutils.IsMachineFailed(m) || machineutils.IsMachineTriggeredForDeletion(m) {
staleMachines = append(staleMachines, m)
} else if machineutils.IsMachineActive(m) {
activeMachines = append(activeMachines, m)
if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] != v1alpha1.LabelValueNodeUpdateSuccessful {
machinesWithoutUpdateSuccessfulLabel = append(machinesWithoutUpdateSuccessfulLabel, m)
}
}
allMachinesDiff := len(allMachines) - int(machineSet.Spec.Replicas)
machinesWithoutUpdateSuccessfulLabelDiff := len(machinesWithoutUpdateSuccessfulLabel) - int(machineSet.Spec.Replicas)

if len(staleMachines) >= 1 {
klog.V(3).Infof("Deleting stale machines %s", getMachineKeys(staleMachines))
}
if err := c.terminateMachines(ctx, staleMachines, machineSet); err != nil {
// TODO: proper error handling needs to happen here
klog.Errorf("failed to terminate stale machines for machineset %s: %v", machineSet.Name, err)
}

diff := len(activeMachines) - int(machineSet.Spec.Replicas)
// Removing the "update successful" label from a machine and scaling up the MachineSet are two separate operations.
// If the MachineSet is scaled up first and the "update successful" label is removed later, this controller might
// incorrectly assume that additional machines need to be created, as it ignores machines with the "update successful" label.
// Conversely, if the "update successful" label is removed first and the MachineSet is scaled up afterward, this controller
// might incorrectly assume that more machines need to be deleted, as machines without the label will be counted as active.
// To address this, we first check if the difference between the current active replicas and the desired replicas is negative.
// If it is, we then check if the difference plus the number of machines with the "update successful" label is greater than or equal to 0.
// In such cases, we set the difference to 0 because it is unclear whether the machine deployment controller still needs to increase
// the replicas of the MachineSet. Once the machine deployment controller removes the "update successful" label, the ambiguity will
// be resolved, and normal operations can proceed. This approach ensures that the controller does not create additional machines
// if the MachineSet is scaled up before the "update successful" label is removed.
if diff < 0 {
if diff+len(machinesWithUpdateSuccessfulLabel) >= 0 {
return nil
}
diff += len(machinesWithUpdateSuccessfulLabel)
}

klog.V(4).Infof("Difference between current active replicas and desired replicas - %d", diff)

if diff < 0 {
// During in-place updates, ScaleUps are disabled in the oldMachineSet and
// in newMachineSet, its ReplicaCount would never increase before a machine from oldMachineSet is moved.
// So no need for any special case during in-place updates.
if allMachinesDiff < 0 {
// If MachineSet is frozen and no deletion timestamp, don't process it
if machineSet.Labels["freeze"] == "True" && machineSet.DeletionTimestamp == nil {
klog.V(2).Infof("MachineSet %q is frozen, and hence not processing", machineSet.Name)
Expand All @@ -394,20 +361,20 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1
return nil
}

diff *= -1
if diff > BurstReplicas {
diff = BurstReplicas
allMachinesDiff *= -1
if allMachinesDiff > BurstReplicas {
allMachinesDiff = BurstReplicas
}
// TODO: Track UIDs of creates just like deletes. The problem currently
// is we'd need to wait on the result of a create to record the machine's
// UID, which would require locking *across* the create, which will turn
// into a performance bottleneck. We should generate a UID for the machine
// beforehand and store it via ExpectCreations.
if err := c.expectations.ExpectCreations(machineSetKey, diff); err != nil {
if err := c.expectations.ExpectCreations(machineSetKey, allMachinesDiff); err != nil {
// TODO: proper error handling needs to happen here
klog.Errorf("failed expect creations for machineset %s: %v", machineSet.Name, err)
}
klog.V(2).Infof("Too few replicas for MachineSet %s, need %d, creating %d", machineSet.Name, (machineSet.Spec.Replicas), diff)
klog.V(2).Infof("Too few replicas for MachineSet %s, need %d, creating %d", machineSet.Name, (machineSet.Spec.Replicas), allMachinesDiff)
// Batch the machine creates. Batch sizes start at SlowStartInitialBatchSize
// and double with each successful iteration in a kind of "slow start".
// This handles attempts to start large numbers of machines that would
Expand All @@ -416,7 +383,7 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1
// prevented from spamming the API service with the machine create requests
// after one of its machines fails. Conveniently, this also prevents the
// event spam that those failures would generate.
successfulCreations, err := slowStartBatch(diff, SlowStartInitialBatchSize, func() error {
successfulCreations, err := slowStartBatch(allMachinesDiff, SlowStartInitialBatchSize, func() error {
boolPtr := func(b bool) *bool { return &b }
controllerRef := &metav1.OwnerReference{
APIVersion: controllerKindMachineSet.GroupVersion().String(), // #ToCheck
Expand All @@ -443,22 +410,26 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1
// Any skipped machines that we never attempted to start shouldn't be expected.
// The skipped machines will be retried later. The next controller resync will
// retry the slow start process.
if skippedMachines := diff - successfulCreations; skippedMachines > 0 {
if skippedMachines := allMachinesDiff - successfulCreations; skippedMachines > 0 {
klog.V(2).Infof("Slow-start failure. Skipping creation of %d machines, decrementing expectations for %v %v/%v", skippedMachines, machineSet.Kind, machineSet.Namespace, machineSet.Name)
for range skippedMachines {
// Decrement the expected number of creates because the informer won't observe this machine
c.expectations.CreationObserved(machineSetKey)
}
}
return err
} else if diff > 0 {
if diff > BurstReplicas {
diff = BurstReplicas
}
klog.V(2).Infof("Too many replicas for %v %s/%s, need %d, deleting %d", machineSet.Kind, machineSet.Namespace, machineSet.Name, (machineSet.Spec.Replicas), diff)

logMachinesWithPriority1(activeMachines)
machinesToDelete := getMachinesToDelete(activeMachines, diff)
} else if machinesWithoutUpdateSuccessfulLabelDiff > 0 {
// During in-place updates, in the newMachineSet, it can happen that a machine has come from the oldMachineSet
// but the ReplicaCount of newMachineSet has not increased yet.
// In such cases, we should not delete the machine immediately,
// otherwise it can cause unnecessary machine deletion and creation during in-place updates.
if machinesWithoutUpdateSuccessfulLabelDiff > BurstReplicas {
machinesWithoutUpdateSuccessfulLabelDiff = BurstReplicas
}
klog.V(2).Infof("Too many replicas for %v %s/%s, need %d, deleting %d", machineSet.Kind, machineSet.Namespace, machineSet.Name, (machineSet.Spec.Replicas), machinesWithoutUpdateSuccessfulLabelDiff)

logMachinesWithPriority1(machinesWithoutUpdateSuccessfulLabel)
machinesToDelete := getMachinesToDelete(machinesWithoutUpdateSuccessfulLabel, machinesWithoutUpdateSuccessfulLabelDiff)
logMachinesToDelete(machinesToDelete)

// Snapshot the UIDs (ns/name) of the machines we're expecting to see
Expand All @@ -478,6 +449,21 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1
}
}

var staleMachines []*v1alpha1.Machine
for _, m := range machinesWithoutUpdateSuccessfulLabel {
if machineutils.IsMachineFailed(m) {
staleMachines = append(staleMachines, m)
}
}

if len(staleMachines) >= 1 {
klog.V(3).Infof("Deleting stale machines %s", getMachineKeys(staleMachines))
if err := c.terminateMachines(ctx, staleMachines, machineSet); err != nil {
// TODO: proper error handling needs to happen here
klog.Errorf("failed to terminate stale machines for machineset %s: %v", machineSet.Name, err)
}
}

return nil
}

Expand Down
9 changes: 5 additions & 4 deletions pkg/controller/machineset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -844,23 +844,24 @@ var _ = Describe("machineset", func() {
Expect(Err).Should(BeNil())
})

It("should delete MachinePriority=1 machines and spawn replacement machine", func() {
It("should delete MachinePriority=1 machines", func() {
stop := make(chan struct{})
defer close(stop)
objects := []runtime.Object{}

staleMachine := testActiveMachine1.DeepCopy()
staleMachine.Annotations[machineutils.MachinePriority] = "1"
testActiveMachine4.Status.CurrentStatus.Phase = MachineRunning

objects = append(objects, testMachineSet, staleMachine, testActiveMachine2, testActiveMachine3)
objects = append(objects, testMachineSet, staleMachine, testActiveMachine2, testActiveMachine3, testActiveMachine4)
c, trackers := createController(stop, testNamespace, objects, nil, nil)
defer trackers.Stop()
waitForCacheSync(stop, c)

machines, _ := c.controlMachineClient.Machines(testNamespace).List(context.TODO(), metav1.ListOptions{})
Expect(len(machines.Items)).To(Equal(int(testMachineSet.Spec.Replicas)))
Expect(len(machines.Items)).To(Equal(int(testMachineSet.Spec.Replicas + 1)))

beforeMachines := []*machinev1.Machine{staleMachine, testActiveMachine2, testActiveMachine3}
beforeMachines := []*machinev1.Machine{staleMachine, testActiveMachine2, testActiveMachine3, testActiveMachine4}
err := c.manageReplicas(context.TODO(), beforeMachines, testMachineSet)
Expect(err).Should(BeNil())
waitForCacheSync(stop, c)
Expand Down
2 changes: 1 addition & 1 deletion pkg/util/provider/machineutils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,5 +123,5 @@ func IsMachineFailed(p *v1alpha1.Machine) bool {

// IsMachineTriggeredForDeletion checks if machine was triggered for deletion
func IsMachineTriggeredForDeletion(m *v1alpha1.Machine) bool {
return m.Annotations[MachinePriority] == "1" || m.Annotations[TriggerDeletionByMCM] == "true"
return m.Annotations[MachinePriority] == "1"
}
Loading