From 5d63693b0fd00624fc59f28f6459844bc39b16e0 Mon Sep 17 00:00:00 2001 From: Neeraj Krishna Gopalakrishna Date: Thu, 25 Jun 2026 07:39:19 +0530 Subject: [PATCH 1/2] Fix longrunning test case Exec timeout --- test/extended/imagepolicy/imagepolicy.go | 55 ++- test/extended/node/criocredentialprovider.go | 9 +- test/extended/node/image_volume.go | 4 +- .../node/kubelet_secret_pulled_images.go | 6 +- test/extended/node/kubeletconfig_tls.go | 2 +- .../node/node_e2e/container_runtime_config.go | 12 +- .../node/node_e2e/image_registry_config.go | 14 +- test/extended/node/node_e2e/initcontainer.go | 6 +- test/extended/node/node_e2e/node.go | 48 +-- test/extended/node/node_e2e/pdb_drain.go | 5 +- .../node/node_e2e/probe_termination.go | 4 +- test/extended/node/node_sizing.go | 12 +- test/extended/node/node_swap.go | 2 + test/extended/node/node_swap_cnv.go | 383 ++++++++++++------ test/extended/node/node_utils.go | 108 +++-- test/extended/node/system_compressible.go | 12 +- 16 files changed, 462 insertions(+), 220 deletions(-) diff --git a/test/extended/imagepolicy/imagepolicy.go b/test/extended/imagepolicy/imagepolicy.go index eff805a06057..65284f7e5567 100644 --- a/test/extended/imagepolicy/imagepolicy.go +++ b/test/extended/imagepolicy/imagepolicy.go @@ -216,8 +216,7 @@ func updateImageConfig(oc *exutil.CLI, allowedRegistries []string) { return err }) o.Expect(err).NotTo(o.HaveOccurred(), "error updating image config") - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func cleanupImageConfig(oc *exutil.CLI) error { @@ -238,8 +237,7 @@ func cleanupImageConfig(oc *exutil.CLI) error { return err }) o.Expect(err).NotTo(o.HaveOccurred(), "error cleaning up image config") - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, 
initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) return nil } @@ -285,8 +283,7 @@ func createClusterImagePolicy(oc *exutil.CLI, policy configv1.ClusterImagePolicy _, err := oc.AdminConfigClient().ConfigV1().ClusterImagePolicies().Create(context.TODO(), &policy, metav1.CreateOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func deleteClusterImagePolicy(oc *exutil.CLI, policyName string) error { @@ -296,8 +293,7 @@ func deleteClusterImagePolicy(oc *exutil.CLI, policyName string) error { if err := oc.AdminConfigClient().ConfigV1().ClusterImagePolicies().Delete(context.TODO(), policyName, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { return fmt.Errorf("failed to delete cluster image policy %s: %v", policyName, err) } - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) return nil } @@ -312,8 +308,7 @@ func createImagePolicy(oc *exutil.CLI, policy configv1.ImagePolicy, namespace st // Wait until each pool's Spec.Configuration.Name changes from the initial value // and the pool reports Updated=true - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func deleteImagePolicy(oc *exutil.CLI, policyName string, namespace string) error { @@ -323,8 +318,7 @@ func deleteImagePolicy(oc *exutil.CLI, policyName string, namespace string) erro if err := oc.AdminConfigClient().ConfigV1().ImagePolicies(namespace).Delete(context.TODO(), 
policyName, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { return fmt.Errorf("failed to delete image policy %s in namespace %s: %v", policyName, namespace, err) } - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) return nil } @@ -707,7 +701,42 @@ func WaitForMCPConfigSpecChangeAndUpdated(oc *exutil.CLI, pool string, initialSp return false } return machineconfighelper.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) - }, 20*time.Minute, 10*time.Second).Should(o.BeTrue()) + }, 15*time.Minute, 10*time.Second).Should(o.BeTrue()) +} + +func WaitForMCPsConfigSpecChangeAndUpdated(oc *exutil.CLI, workerInitialSpec, masterInitialSpec string) { + e2e.Logf("Waiting for worker and master pools to complete") + clientSet, err := machineconfigclient.NewForConfig(oc.KubeFramework().ClientConfig()) + o.Expect(err).NotTo(o.HaveOccurred()) + + o.Eventually(func() bool { + workerMCP, err := clientSet.MachineconfigurationV1().MachineConfigPools().Get(context.TODO(), "worker", metav1.GetOptions{}) + if err != nil { + return false + } + masterMCP, err := clientSet.MachineconfigurationV1().MachineConfigPools().Get(context.TODO(), "master", metav1.GetOptions{}) + if err != nil { + return false + } + + workerReady := workerMCP.Status.Configuration.Name != workerInitialSpec && + workerMCP.Spec.Configuration.Name == workerMCP.Status.Configuration.Name && + machineconfighelper.IsMachineConfigPoolConditionTrue(workerMCP.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) + + masterReady := masterMCP.Status.Configuration.Name != masterInitialSpec && + masterMCP.Spec.Configuration.Name == masterMCP.Status.Configuration.Name && + machineconfighelper.IsMachineConfigPoolConditionTrue(masterMCP.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) + + if !workerReady { 
+ e2e.Logf("Worker MCP not ready yet") + } + if !masterReady { + e2e.Logf("Master MCP not ready yet") + } + + return workerReady && masterReady + }, 15*time.Minute, 10*time.Second).Should(o.BeTrue()) + e2e.Logf("Both worker and master pools completed successfully") } func isDisconnectedCluster(oc *exutil.CLI) bool { diff --git a/test/extended/node/criocredentialprovider.go b/test/extended/node/criocredentialprovider.go index 9e3d72910dd6..3daf87c0d9ca 100644 --- a/test/extended/node/criocredentialprovider.go +++ b/test/extended/node/criocredentialprovider.go @@ -195,8 +195,7 @@ func updateCRIOCredentialProviderConfig(oc *exutil.CLI, matchImages []string, ex return } - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func getWorkerNodes(oc *exutil.CLI) ([]corev1.Node, error) { @@ -289,8 +288,7 @@ func createIDMSResources(oc *exutil.CLI) { e2e.Logf("Created ImageDigestMirrorSet %q", idms.Name) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func cleanupIDMSResources(oc *exutil.CLI) { @@ -302,8 +300,7 @@ func cleanupIDMSResources(oc *exutil.CLI) { e2e.Logf("Deleted ImageDigestMirrorSet %q", "digest-mirror") - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func createNamespaceRBAC(f *e2e.Framework, namespace string) { diff --git a/test/extended/node/image_volume.go b/test/extended/node/image_volume.go index 
dea55d4e1a2f..622bb03edc15 100644 --- a/test/extended/node/image_volume.go +++ b/test/extended/node/image_volume.go @@ -66,13 +66,15 @@ func describeImageVolumeTests(config imageVolumeTestConfig) bool { podName = config.frameworkName + "-test" ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { // Microshift doesn't inherit OCP feature gates, and ImageVolume won't work either isMicroshift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroshift { g.Skip("Not supported on Microshift") } + + EnsureNodesReady(ctx, oc) }) g.It("should succeed with pod and pull policy of Always", func(ctx context.Context) { diff --git a/test/extended/node/kubelet_secret_pulled_images.go b/test/extended/node/kubelet_secret_pulled_images.go index af1c74c9b527..a1b1e1f9145f 100644 --- a/test/extended/node/kubelet_secret_pulled_images.go +++ b/test/extended/node/kubelet_secret_pulled_images.go @@ -206,7 +206,7 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv g.DeferCleanup(func() { _ = deleteKC(oc, kcName) - _ = waitForMCP(ctx, mcClient, "worker", 30*time.Minute) + _ = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) }) g.By("Pre-caching private image on the node with a valid secret") @@ -215,7 +215,7 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv g.By("Applying NeverVerify policy and waiting for MCO rollout") credVerifyApplyPolicy(ctx, mcClient, kcName, `{"imagePullCredentialsVerificationPolicy":"NeverVerify"}`) credVerifyWaitForMCPUpdating(ctx, mcClient, "worker") - err = waitForMCP(ctx, mcClient, "worker", 30*time.Minute) + err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) o.Expect(err).NotTo(o.HaveOccurred()) g.By("Verifying NeverVerify policy allows pod without secret to use cached image") @@ -224,7 +224,7 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv g.By("Switching to AlwaysVerify policy and 
waiting for MCO rollout") credVerifyApplyPolicy(ctx, mcClient, kcName, `{"imagePullCredentialsVerificationPolicy":"AlwaysVerify"}`) credVerifyWaitForMCPUpdating(ctx, mcClient, "worker") - err = waitForMCP(ctx, mcClient, "worker", 30*time.Minute) + err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) o.Expect(err).NotTo(o.HaveOccurred()) // This pod also re-caches the image after MCO rollout since pull records are cleared diff --git a/test/extended/node/kubeletconfig_tls.go b/test/extended/node/kubeletconfig_tls.go index 369cccda386f..dcb992badcbb 100644 --- a/test/extended/node/kubeletconfig_tls.go +++ b/test/extended/node/kubeletconfig_tls.go @@ -220,7 +220,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv "Timed out waiting for MachineConfigPool %q to start updating", testMCPName) g.By(fmt.Sprintf("Waiting for MachineConfigPool %s to complete rollout", testMCPName)) - err = waitForMCP(ctx, mcClient, testMCPName, 30*time.Minute) + err = waitForMCP(ctx, mcClient, testMCPName, 15*time.Minute) o.Expect(err).NotTo(o.HaveOccurred(), "Error waiting for MachineConfigPool %q to become ready", testMCPName) framework.Logf("MachineConfigPool %s has completed rollout", testMCPName) diff --git a/test/extended/node/node_e2e/container_runtime_config.go b/test/extended/node/node_e2e/container_runtime_config.go index dc9c0d39fb1a..ff799d4825d3 100644 --- a/test/extended/node/node_e2e/container_runtime_config.go +++ b/test/extended/node/node_e2e/container_runtime_config.go @@ -28,12 +28,14 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv oc = exutil.NewCLIWithoutNamespace("ctrcfg") ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred(), "failed to detect MicroShift cluster") if isMicroShift { g.Skip("Skipping test on MicroShift cluster - MachineConfig resources are not available") } + + 
nodeutils.EnsureNodesReady(ctx, oc) }) // Validates that ContainerRuntimeConfig pidsLimit setting is correctly applied @@ -51,12 +53,12 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv workerNode := workers[0].Name g.By("Make a manual change to crio.conf on worker node") - _, err = nodeutils.ExecOnNodeWithChroot(oc, workerNode, + _, err = nodeutils.ExecOnNodeWithChroot(ctx, oc, workerNode, "/bin/bash", "-c", `sed -i '/^\[crio\.runtime\]/a log_level = "debug"' /etc/crio/crio.conf`) o.Expect(err).NotTo(o.HaveOccurred(), "failed to edit crio.conf on node %s", workerNode) g.By("Verify the manual crio.conf edit took effect") - editedConf, err := nodeutils.ExecOnNodeWithChroot(oc, workerNode, "cat", "/etc/crio/crio.conf") + editedConf, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, workerNode, "cat", "/etc/crio/crio.conf") o.Expect(err).NotTo(o.HaveOccurred(), "failed to read crio.conf on node %s", workerNode) o.Expect(editedConf).To(o.ContainSubstring(`log_level = "debug"`), "sed edit did not apply: expected log_level = debug in crio.conf") @@ -100,7 +102,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv var crioConfig string o.Eventually(func() error { var execErr error - crioConfig, execErr = nodeutils.ExecOnNodeWithChroot(oc, workerNode, + crioConfig, execErr = nodeutils.ExecOnNodeWithChroot(ctx, oc, workerNode, "/bin/bash", "-c", "crio config 2>/dev/null") return execErr }, 30*time.Second, 5*time.Second).Should(o.Succeed(), "failed to get crio config on node %s", workerNode) @@ -163,7 +165,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv e2e.Logf("Worker node rolled out successfully") g.By("Check overlaySize takes effect in storage.conf on worker node") - storageConf, err := nodeutils.ExecOnNodeWithChroot(oc, workerNode, + storageConf, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, workerNode, "/bin/bash", "-c", "head -n 7 /etc/containers/storage.conf | grep 
size") o.Expect(err).NotTo(o.HaveOccurred(), "failed to read storage.conf on node %s", workerNode) e2e.Logf("storage.conf size line: %s", storageConf) diff --git a/test/extended/node/node_e2e/image_registry_config.go b/test/extended/node/node_e2e/image_registry_config.go index ce3a02189813..14bc176f60e1 100644 --- a/test/extended/node/node_e2e/image_registry_config.go +++ b/test/extended/node/node_e2e/image_registry_config.go @@ -24,12 +24,14 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv oc = exutil.NewCLIWithoutNamespace("imgcfg") ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred(), "failed to detect cluster type") if isMicroShift { g.Skip("Skipping test on MicroShift cluster - MachineConfig resources are not available") } + + nodeutils.EnsureNodesReady(ctx, oc) }) // Verifies that updating image.config.openshift.io/cluster with a new search @@ -62,8 +64,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv cleanupWorkerSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "worker") cleanupMasterSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "master") - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "worker", cleanupWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "master", cleanupMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, cleanupWorkerSpec, cleanupMasterSpec) e2e.Logf("Cleanup: waiting for all cluster operators to settle") waitErr := operator.WaitForOperatorsToSettle(ctx, oc.AdminConfigClient(), 10) @@ -90,8 +91,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(err).NotTo(o.HaveOccurred(), "failed to update image.config.openshift.io/cluster") g.By("Wait for worker and master MCP rollout to complete") - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "worker", 
initialWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "master", initialMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) g.By("Verify search registries config on a worker node") workers, err := exutil.GetReadySchedulableWorkerNodes(ctx, oc.AdminKubeClient()) @@ -101,7 +101,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv var registriesConf string o.Eventually(func() error { var execErr error - registriesConf, execErr = nodeutils.ExecOnNodeWithChroot(oc, workers[0].Name, + registriesConf, execErr = nodeutils.ExecOnNodeWithChroot(ctx, oc, workers[0].Name, "cat", "/etc/containers/registries.conf.d/01-image-searchRegistries.conf") if execErr != nil { return execErr @@ -115,7 +115,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv e2e.Logf("Registries config on %s:\n%s", workers[0].Name, registriesConf) g.By("Verify policy.json is updated with allowed registries") - policyJSON, err := nodeutils.ExecOnNodeWithChroot(oc, workers[0].Name, + policyJSON, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, workers[0].Name, "cat", "/etc/containers/policy.json") o.Expect(err).NotTo(o.HaveOccurred(), "failed to read policy.json on node %s", workers[0].Name) e2e.Logf("policy.json on %s:\n%s", workers[0].Name, policyJSON) diff --git a/test/extended/node/node_e2e/initcontainer.go b/test/extended/node/node_e2e/initcontainer.go index 6e095fe222d7..9d539f722872 100644 --- a/test/extended/node/node_e2e/initcontainer.go +++ b/test/extended/node/node_e2e/initcontainer.go @@ -26,12 +26,14 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] NODE initContainer policy,vol ) // Skip all tests on MicroShift clusters as MachineConfig resources are not available - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroShift { 
g.Skip("Skipping test on MicroShift cluster - MachineConfig resources are not available") } + + nodeutils.EnsureNodesReady(ctx, oc) }) //author: bgudi@redhat.com @@ -127,7 +129,7 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] NODE initContainer policy,vol actualContainerID := matches[1] g.By("Delete init container from node") - output, err := nodeutils.ExecOnNodeWithChroot(oc, nodeName, "crictl", "rm", actualContainerID) + output, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, nodeName, "crictl", "rm", actualContainerID) o.Expect(err).NotTo(o.HaveOccurred(), "fail to delete container") e2e.Logf("Container deletion output: %s", output) diff --git a/test/extended/node/node_e2e/node.go b/test/extended/node/node_e2e/node.go index 5f43c93e20af..319396edbc82 100644 --- a/test/extended/node/node_e2e/node.go +++ b/test/extended/node/node_e2e/node.go @@ -29,16 +29,18 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", ) // Skip all tests on MicroShift clusters as MachineConfig resources are not available - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroShift { g.Skip("Skipping test on MicroShift cluster - MachineConfig resources are not available") } + + nodeutils.EnsureNodesReady(ctx, oc) }) //author: asahay@redhat.com - g.It("[OTP] validate KUBELET_LOG_LEVEL", func() { + g.It("[OTP] validate KUBELET_LOG_LEVEL", func(ctx context.Context) { var kubeservice string var kubelet string var err error @@ -59,11 +61,11 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", if nodeStatus == "True" { g.By("Checking KUBELET_LOG_LEVEL in kubelet.service on node " + node) - kubeservice, err = nodeutils.ExecOnNodeWithChroot(oc, node, "/bin/bash", "-c", "systemctl show kubelet.service | grep KUBELET_LOG_LEVEL") + kubeservice, err = nodeutils.ExecOnNodeWithChroot(ctx, oc, node, 
"/bin/bash", "-c", "systemctl show kubelet.service | grep KUBELET_LOG_LEVEL") o.Expect(err).NotTo(o.HaveOccurred()) g.By("Checking kubelet process for --v=2 flag on node " + node) - kubelet, err = nodeutils.ExecOnNodeWithChroot(oc, node, "/bin/bash", "-c", "ps aux | grep [k]ubelet") + kubelet, err = nodeutils.ExecOnNodeWithChroot(ctx, oc, node, "/bin/bash", "-c", "ps aux | grep [k]ubelet") o.Expect(err).NotTo(o.HaveOccurred()) g.By("Verifying KUBELET_LOG_LEVEL is set and kubelet is running with --v=2") @@ -89,7 +91,7 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", }) //author: cmaurya@redhat.com - g.It("[OTP] validate cgroupv2 is default [OCP-80983]", func() { + g.It("[OTP] validate cgroupv2 is default [OCP-80983]", func(ctx context.Context) { g.By("Check cgroup version on all Ready worker nodes") nodeNames, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("nodes", "-l", "node-role.kubernetes.io/worker", "-o=jsonpath={.items[*].metadata.name}").Output() o.Expect(err).NotTo(o.HaveOccurred()) @@ -103,7 +105,7 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", e2e.Logf("Skipping worker node %s (not Ready)", worker) continue } - cgroupV, err := nodeutils.ExecOnNodeWithChroot(oc, worker, "/bin/bash", "-c", "stat -c %T -f /sys/fs/cgroup") + cgroupV, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, worker, "/bin/bash", "-c", "stat -c %T -f /sys/fs/cgroup") o.Expect(err).NotTo(o.HaveOccurred()) e2e.Logf("cgroup version on node %s: [%v]", worker, cgroupV) o.Expect(cgroupV).To(o.ContainSubstring("cgroup2fs"), "Node %s does not have cgroupv2", worker) @@ -116,7 +118,7 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", }) //author: cmaurya@redhat.com - g.It("[OTP] Allow dev fuse by default in CRI-O [OCP-70987]", func() { + g.It("[OTP] Allow dev fuse by default in CRI-O [OCP-70987]", func(ctx context.Context) { podName := "pod-devfuse" ns := "devfuse-test" @@ -127,7 +129,7 
@@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", "nodes", "-l", "node-role.kubernetes.io/worker", "-o=jsonpath={.items[0].metadata.name}").Output() o.Expect(err).NotTo(o.HaveOccurred()) o.Expect(node).NotTo(o.BeEmpty()) - runtime, err := nodeutils.ExecOnNodeWithChroot(oc, node, "/bin/bash", "-c", + runtime, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, node, "/bin/bash", "-c", "crio status config 2>/dev/null | awk -F'\"' '/default_runtime/{print $2}'") o.Expect(err).NotTo(o.HaveOccurred()) if strings.TrimSpace(runtime) == "runc" { @@ -169,19 +171,20 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", // author: asahay@redhat.com var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptive][Serial] ImageTagMirrorSet and ImageDigestMirrorSet", func() { var ( - oc = exutil.NewCLIWithoutNamespace("image-mirror-set") - ctx = context.Background() + oc = exutil.NewCLIWithoutNamespace("image-mirror-set") ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroShift { g.Skip("Skipping test on MicroShift cluster - MachineConfig resources are not available") } + + nodeutils.EnsureNodesReady(ctx, oc) }) - g.It("[OTP] Create ImageDigestMirrorSet and ImageTagMirrorSet and verify registries.conf [OCP-57401]", func() { + g.It("[OTP] Create ImageDigestMirrorSet and ImageTagMirrorSet and verify registries.conf [OCP-57401]", func(ctx context.Context) { configClient := oc.AdminConfigClient().ConfigV1() suffix := utilrand.String(5) idmsName := fmt.Sprintf("digest-mirror-%s", suffix) @@ -220,21 +223,21 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv e2e.Logf("ImageDigestMirrorSet %q created successfully", createdIDMS.Name) g.DeferCleanup(func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer 
cancel() g.By("Cleanup: Delete IDMS and ITMS resources") - cleanupWorkerSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "worker") - cleanupMasterSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "master") - if delErr := configClient.ImageTagMirrorSets().Delete(ctx, itmsName, metav1.DeleteOptions{}); delErr != nil { + if delErr := configClient.ImageTagMirrorSets().Delete(cleanupCtx, itmsName, metav1.DeleteOptions{}); delErr != nil { e2e.Logf("Warning: failed to delete ImageTagMirrorSet: %v", delErr) } - if delErr := configClient.ImageDigestMirrorSets().Delete(ctx, idmsName, metav1.DeleteOptions{}); delErr != nil { + if delErr := configClient.ImageDigestMirrorSets().Delete(cleanupCtx, idmsName, metav1.DeleteOptions{}); delErr != nil { e2e.Logf("Warning: failed to delete ImageDigestMirrorSet: %v", delErr) } - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "worker", cleanupWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "master", cleanupMasterSpec) + cleanupWorkerSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "worker") + cleanupMasterSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "master") + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, cleanupWorkerSpec, cleanupMasterSpec) }) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "worker", initialWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "master", initialMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) e2e.Logf("IDMS MCP rollout complete") g.By("Step 2: Create an ImageTagMirrorSet") @@ -271,15 +274,14 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv e2e.Logf("ImageTagMirrorSet %q created successfully", createdITMS.Name) g.By("Step 3: Wait for all nodes to finish rolling out") - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "worker", itmsWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "master", itmsMasterSpec) + 
imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, itmsWorkerSpec, itmsMasterSpec) e2e.Logf("All MCPs have finished rolling out") g.By("Step 4: Verify /etc/containers/registries.conf on a worker node") workerNodeName := nodeutils.GetFirstReadyWorkerNode(oc) o.Expect(workerNodeName).NotTo(o.BeEmpty(), "no ready worker node found") - registriesConf, err := nodeutils.ExecOnNodeWithChroot(oc, workerNodeName, "cat", "/etc/containers/registries.conf") + registriesConf, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, workerNodeName, "cat", "/etc/containers/registries.conf") o.Expect(err).NotTo(o.HaveOccurred(), "failed to read registries.conf from node %s", workerNodeName) e2e.Logf("registries.conf content:\n%s", registriesConf) diff --git a/test/extended/node/node_e2e/pdb_drain.go b/test/extended/node/node_e2e/pdb_drain.go index dc1c95ce14d7..18856a1394bf 100644 --- a/test/extended/node/node_e2e/pdb_drain.go +++ b/test/extended/node/node_e2e/pdb_drain.go @@ -18,6 +18,7 @@ import ( e2e "k8s.io/kubernetes/test/e2e/framework" "k8s.io/utils/ptr" + nodeutils "github.com/openshift/origin/test/extended/node" exutil "github.com/openshift/origin/test/extended/util" "github.com/openshift/origin/test/extended/util/operator" ) @@ -27,12 +28,14 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv oc = exutil.NewCLIWithoutNamespace("pdb-drain") ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroShift { g.Skip("Skipping test on MicroShift cluster") } + + nodeutils.EnsureNodesReady(ctx, oc) }) //author: bgudi@redhat.com diff --git a/test/extended/node/node_e2e/probe_termination.go b/test/extended/node/node_e2e/probe_termination.go index 6114cc69494b..a5fb9c02058d 100644 --- a/test/extended/node/node_e2e/probe_termination.go +++ b/test/extended/node/node_e2e/probe_termination.go @@ -27,12 +27,14 @@ var _ = 
g.Describe("[sig-node] Probe configuration", func() { oc = exutil.NewCLIWithoutNamespace("probe-termination") ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroShift { g.Skip("Skipping test on MicroShift cluster") } + + nodeutils.EnsureNodesReady(ctx, oc) }) //author: bgudi@redhat.com diff --git a/test/extended/node/node_sizing.go b/test/extended/node/node_sizing.go index bfba9942473d..40d67ac46c74 100644 --- a/test/extended/node/node_sizing.go +++ b/test/extended/node/node_sizing.go @@ -31,6 +31,8 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv if isMicroShift { g.Skip("Skipping test on MicroShift cluster") } + + EnsureNodesReady(ctx, oc) }) g.It("should have NODE_SIZING_ENABLED=true by default and NODE_SIZING_ENABLED=false when KubeletConfig with autoSizingReserved=false is applied", func(ctx context.Context) { @@ -154,7 +156,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv err = waitForMCP(ctx, mcClient, testMCPName, 5*time.Minute) o.Expect(err).NotTo(o.HaveOccurred(), "Custom MachineConfigPool should become ready") - verifyNodeSizingEnabledFile(oc, nodeName, "true") + verifyNodeSizingEnabledFile(ctx, oc, nodeName, "true") // Now apply KubeletConfig and verify NODE_SIZING_ENABLED=false @@ -232,7 +234,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv err = waitForMCP(ctx, mcClient, testMCPName, 15*time.Minute) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("%s MCP should become ready with new configuration", testMCPName)) - verifyNodeSizingEnabledFile(oc, nodeName, "false") + verifyNodeSizingEnabledFile(ctx, oc, nodeName, "false") // Explicit cleanup on success; DeferCleanup ensures cleanup also runs on failure cleanupKubeletConfig() @@ -242,14 +244,14 @@ var _ = 
g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv }) // verifyNodeSizingEnabledFile verifies the NODE_SIZING_ENABLED value in the env file -func verifyNodeSizingEnabledFile(oc *exutil.CLI, nodeName, expectedValue string) { +func verifyNodeSizingEnabledFile(ctx context.Context, oc *exutil.CLI, nodeName, expectedValue string) { g.By("Verifying /etc/node-sizing-enabled.env file exists") - output, err := ExecOnNodeWithChroot(oc, nodeName, "test", "-f", "/etc/node-sizing-enabled.env") + output, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "test", "-f", "/etc/node-sizing-enabled.env") o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("File /etc/node-sizing-enabled.env should exist on node %s. Output: %s", nodeName, output)) g.By("Reading /etc/node-sizing-enabled.env file contents") - output, err = ExecOnNodeWithChroot(oc, nodeName, "cat", "/etc/node-sizing-enabled.env") + output, err = ExecOnNodeWithChroot(ctx, oc, nodeName, "cat", "/etc/node-sizing-enabled.env") o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to read /etc/node-sizing-enabled.env") framework.Logf("Contents of /etc/node-sizing-enabled.env:\n%s", output) diff --git a/test/extended/node/node_swap.go b/test/extended/node/node_swap.go index f3f0c151c0ce..80f194c9ea11 100644 --- a/test/extended/node/node_swap.go +++ b/test/extended/node/node_swap.go @@ -35,6 +35,8 @@ var _ = g.Describe("[Jira:Node][sig-node] Node non-cnv swap configuration", func if isMicroShift { g.Skip("Skipping test on MicroShift cluster") } + + EnsureNodesReady(ctx, oc) }) // This test validates that: diff --git a/test/extended/node/node_swap_cnv.go b/test/extended/node/node_swap_cnv.go index 2a5669addb25..a04e881befb4 100644 --- a/test/extended/node/node_swap_cnv.go +++ b/test/extended/node/node_swap_cnv.go @@ -91,9 +91,8 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } }) - // TC1: Verify drop-in directory exists on all nodes (created by MCO for kubelet config) - // Per 
MCO PR #6044: directory is mandatory on ALL nodes (masters, workers) - g.It("TC1: should verify drop-in directory exists on all nodes with correct ownership", func(ctx context.Context) { + // TC1: Verify silent creation and ownership of drop-in directory + g.It("TC1: should verify silent creation and ownership of drop-in directory on CNV nodes", func(ctx context.Context) { // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") @@ -116,7 +115,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking drop-in directory exists on ALL worker nodes") for _, workerNode := range workerNodeNames { framework.Logf("Running command: ls -ld %s on node %s", cnvDropInDir, workerNode) - output, err := ExecOnNodeWithChroot(oc, workerNode, "ls", "-ld", cnvDropInDir) + output, err := ExecOnNodeWithChroot(ctx, oc, workerNode, "ls", "-ld", cnvDropInDir) if err != nil { framework.Logf("Drop-in directory does not exist on worker node %s: %v", workerNode, err) e2eskipper.Skipf("Drop-in directory not present on worker node %s - CNV operator may not be installed", workerNode) @@ -129,7 +128,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking directory permissions on all worker nodes (should be 755 or stricter)") for _, workerNode := range workerNodeNames { framework.Logf("Running command: stat -c %%a %s on node %s", cnvDropInDir, workerNode) - output, err := ExecOnNodeWithChroot(oc, workerNode, "stat", "-c", "%a", cnvDropInDir) + output, err := ExecOnNodeWithChroot(ctx, oc, workerNode, "stat", "-c", "%a", cnvDropInDir) o.Expect(err).NotTo(o.HaveOccurred()) perms := strings.TrimSpace(output) framework.Logf("Output from node %s: permissions=%s", workerNode, perms) @@ -139,7 +138,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking SELinux context on 
worker nodes") framework.Logf("Running command: ls -ldZ %s on node %s", cnvDropInDir, cnvWorkerNode) - output, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "ls", "-ldZ", cnvDropInDir) + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-ldZ", cnvDropInDir) if err == nil { framework.Logf("Output: %s", output) } @@ -154,28 +153,27 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr o.Expect(lowerOutput).NotTo(o.ContainSubstring("failed to load kubelet config"), "Should not have kubelet config load failures") o.Expect(lowerOutput).NotTo(o.ContainSubstring("error reading drop-in"), "Should not have errors reading drop-in files") - // Verify drop-in directory also exists on control plane nodes + // Skip on Hypershift - MachineConfig API is not available controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc) o.Expect(err).NotTo(o.HaveOccurred()) if *controlPlaneTopology != configv1.ExternalTopologyMode { - g.By("Verifying drop-in directory EXISTS on control plane/master nodes") + g.By("Verifying drop-in directory does NOT exist on control plane/master nodes") controlPlaneNodes, err := getNodesByLabel(ctx, oc, "node-role.kubernetes.io/master") o.Expect(err).NotTo(o.HaveOccurred()) - o.Expect(controlPlaneNodes).NotTo(o.BeEmpty(), - "expected at least one control-plane/master node in non-external topology") framework.Logf("Found %d control plane/master nodes", len(controlPlaneNodes)) - // Drop-in directory SHOULD exist on control plane nodes (created by MCO for all nodes) + // Drop-in directory should NOT exist on control plane nodes for _, cpNode := range controlPlaneNodes { - output, err := ExecOnNodeWithChroot(oc, cpNode.Name, "ls", "-ld", cnvDropInDir) - o.Expect(err).NotTo(o.HaveOccurred(), "Drop-in directory should exist on control plane node %s", cpNode.Name) - framework.Logf("Control plane node %s has drop-in directory (expected): %s", cpNode.Name, strings.TrimSpace(output)) - - // Verify ownership - 
o.Expect(output).To(o.ContainSubstring("root root"), "Directory should be owned by root:root on control plane node %s", cpNode.Name) + _, err = ExecOnNodeWithChroot(ctx, oc, cpNode.Name, "ls", "-ld", cnvDropInDir) + if err == nil { + framework.Logf("ERROR: Drop-in directory exists on control plane node %s - this is unexpected", cpNode.Name) + o.Expect(err).To(o.HaveOccurred(), "Drop-in directory should NOT exist on control plane node %s", cpNode.Name) + } else { + framework.Logf("Drop-in directory does NOT exist on control plane node %s (expected)", cpNode.Name) + } } - framework.Logf("TC1 PASSED: Drop-in directory is present on all %d worker nodes and all %d control plane nodes", len(workerNodeNames), len(controlPlaneNodes)) + framework.Logf("TC1 PASSED: Drop-in directory is present on all %d worker nodes and NOT present on any control plane nodes", len(workerNodeNames)) } else { framework.Logf("TC1 PASSED: Drop-in directory is present on all %d worker nodes (skipped control plane validation on Hypershift)", len(workerNodeNames)) } @@ -190,14 +188,14 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("Using CNV worker node for tests: %s", cnvWorkerNode) g.By("Checking if drop-in directory exists and is empty") - output, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) if err != nil { e2eskipper.Skipf("Drop-in directory not present") } framework.Logf("Directory contents: %s", output) g.By("Verifying kubelet is running") - output, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "systemctl", "is-active", "kubelet") + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "systemctl", "is-active", "kubelet") o.Expect(err).NotTo(o.HaveOccurred()) o.Expect(strings.TrimSpace(output)).To(o.Equal("active"), "Kubelet should be active") @@ -241,19 +239,21 @@ var _ = 
g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Creating drop-in file with LimitedSwap configuration in /etc/openshift/kubelet.conf.d/") framework.Logf("Creating file: %s with content:\n%s", cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) - err = createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err = createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) g.By("Verifying drop-in file was created successfully") - output, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "cat", cnvDropInFilePath) + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "cat", cnvDropInFilePath) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Drop-in file content:\n%s", output) o.Expect(output).To(o.ContainSubstring("LimitedSwap"), "Drop-in file should contain LimitedSwap configuration") // Defer cleanup defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() g.By("Cleaning up - removing drop-in file and restarting kubelet") - cleanupDropInAndRestartKubelet(ctx, oc, cnvWorkerNode, cnvDropInFilePath) + cleanupDropInAndRestartKubelet(cleanupCtx, oc, cnvWorkerNode, cnvDropInFilePath) }() g.By("Restarting kubelet to load the new configuration") @@ -293,7 +293,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr if configInitial.MemorySwap.SwapBehavior != "LimitedSwap" { g.By("Creating drop-in file with LimitedSwap configuration") framework.Logf("Creating file: %s", cnvDropInFilePath) - err = createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err = createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) g.By("Restarting kubelet to apply LimitedSwap") @@ -323,26 +323,138 
@@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("=== TC4 PASSED ===") }) - // TC5: Validate security and permissions of drop-in directory - g.It("TC5: should validate security and permissions of drop-in directory", func(ctx context.Context) { + // TC5: Verify kubelet ignores drop-in configuration on ALL control plane nodes + g.It("TC5: should verify control plane kubelets ignore drop-in config", func(ctx context.Context) { + framework.Logf("=== TC5: Testing control plane ignores drop-in configuration ===") + + // skip these tests on hypershift platforms + if ok, _ := exutil.IsHypershift(ctx, oc.AdminConfigClient()); ok { + g.Skip("MachineConfigNodes is not supported on hypershift. Skipping tests.") + } + + // Get all control plane nodes + controlPlaneNodes, err := getControlPlaneNodes(ctx, oc) + o.Expect(err).NotTo(o.HaveOccurred()) + if len(controlPlaneNodes) == 0 { + e2eskipper.Skipf("No control plane nodes available") + } + framework.Logf("Found %d control plane nodes to test", len(controlPlaneNodes)) + + for i, cpNode := range controlPlaneNodes { + cpNodeName := cpNode.Name + framework.Logf("--- Testing control plane node %d/%d: %s ---", i+1, len(controlPlaneNodes), cpNodeName) + + g.By(fmt.Sprintf("Getting kubelet config BEFORE placing drop-in file on %s", cpNodeName)) + configBefore, err := getKubeletConfigFromNode(ctx, oc, cpNodeName) + o.Expect(err).NotTo(o.HaveOccurred()) + framework.Logf("Control plane %s swapBehavior BEFORE: '%s'", cpNodeName, configBefore.MemorySwap.SwapBehavior) + + g.By(fmt.Sprintf("Creating drop-in directory on %s if not exists", cpNodeName)) + _, _ = ExecOnNodeWithChroot(ctx, oc, cpNodeName, "mkdir", "-p", cnvDropInDir) + + g.By(fmt.Sprintf("Creating drop-in file on %s", cpNodeName)) + err = createDropInFile(ctx, oc, cpNodeName, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + o.Expect(err).NotTo(o.HaveOccurred()) + framework.Logf("Created drop-in file: %s on %s", 
cnvDropInFilePath, cpNodeName) + + g.By(fmt.Sprintf("Restarting kubelet on %s", cpNodeName)) + err = restartKubeletOnNode(ctx, oc, cpNodeName) + o.Expect(err).NotTo(o.HaveOccurred()) + waitForNodeToBeReady(ctx, oc, cpNodeName) + + g.By(fmt.Sprintf("Verifying %s did NOT apply LimitedSwap from drop-in", cpNodeName)) + configAfter, err := getKubeletConfigFromNode(ctx, oc, cpNodeName) + o.Expect(err).NotTo(o.HaveOccurred()) + framework.Logf("Control plane %s swapBehavior AFTER: '%s'", cpNodeName, configAfter.MemorySwap.SwapBehavior) + + // Control plane should not apply LimitedSwap from drop-in (config-dir not configured for control plane) + o.Expect(configAfter.MemorySwap.SwapBehavior).NotTo(o.Equal("LimitedSwap"), + fmt.Sprintf("Control plane %s should NOT apply LimitedSwap from drop-in", cpNodeName)) + + framework.Logf("Control plane %s ignored drop-in file as expected (swapBehavior: '%s' -> '%s')", + cpNodeName, configBefore.MemorySwap.SwapBehavior, configAfter.MemorySwap.SwapBehavior) + + g.By(fmt.Sprintf("Cleaning up %s", cpNodeName)) + removeDropInFile(ctx, oc, cpNodeName, cnvDropInFilePath) + // Also remove the drop-in directory we created on control plane + _, _ = ExecOnNodeWithChroot(ctx, oc, cpNodeName, "rmdir", cnvDropInDir) + framework.Logf("Removed drop-in directory from control plane node %s", cpNodeName) + } + + framework.Logf("=== TC5 PASSED ===") + framework.Logf("All %d control plane nodes ignored drop-in file as expected", len(controlPlaneNodes)) + }) + + // TC6: Verify directory is auto-recreated after deletion and kubelet restart + g.It("TC6: should verify drop-in directory is auto-recreated after deletion", func(ctx context.Context) { + skipOnSingleNodeTopology(oc) //skip this test for SNO + // Get a CNV worker node for tests + cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) + o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") + + framework.Logf("=== TC6: Testing drop-in directory auto-recreation ===") + 
framework.Logf("Executing on node: %s", cnvWorkerNode) + + g.By("Checking if directory exists before deletion") + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) + if err != nil { + framework.Logf("Directory does not exist") + } else { + framework.Logf("Output:\n%s", output) + } + + g.By("Deleting drop-in directory") + framework.Logf("Running: rm -rf %s", cnvDropInDir) + _, _ = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "rm", "-rf", cnvDropInDir) + framework.Logf("Directory deletion command executed") + + g.By("Verifying directory is deleted") + framework.Logf("Running: ls -la %s (expecting failure)", cnvDropInDir) + _, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) + o.Expect(err).To(o.HaveOccurred(), "Directory should not exist after deletion") + framework.Logf("Confirmed: Directory does not exist after deletion") + + g.By("Restarting kubelet") + err = restartKubeletOnNode(ctx, oc, cnvWorkerNode) + o.Expect(err).NotTo(o.HaveOccurred()) + + g.By("Waiting for node to be ready") + waitForNodeToBeReady(ctx, oc, cnvWorkerNode) + + g.By("Verifying directory was auto-recreated") + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) + o.Expect(err).NotTo(o.HaveOccurred(), "Directory should be auto-recreated after kubelet restart") + framework.Logf("Output:\n%s", output) + + g.By("Verifying kubelet is running") + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "systemctl", "is-active", "kubelet") + o.Expect(err).NotTo(o.HaveOccurred()) + framework.Logf("kubelet status: %s", strings.TrimSpace(output)) + o.Expect(strings.TrimSpace(output)).To(o.Equal("active")) + + framework.Logf("=== TC6 PASSED ===") + }) + + // TC7: Validate security and permissions of drop-in directory + g.It("TC7: should validate security and permissions of drop-in directory", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO // Get a CNV worker node for 
tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") - framework.Logf("=== TC5: Testing security and permissions of drop-in directory ===") + framework.Logf("=== TC7: Testing security and permissions of drop-in directory ===") framework.Logf("Executing on node: %s", cnvWorkerNode) framework.Logf("Drop-in directory: %s", cnvDropInDir) g.By("Ensuring drop-in directory exists") framework.Logf("Running: mkdir -p %s", cnvDropInDir) - _, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "mkdir", "-p", cnvDropInDir) + _, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "mkdir", "-p", cnvDropInDir) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Directory exists or created successfully") g.By("Verifying directory ownership is root:root") framework.Logf("Running: stat -c %%U:%%G %s", cnvDropInDir) - output, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "stat", "-c", "%U:%G", cnvDropInDir) + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "stat", "-c", "%U:%G", cnvDropInDir) o.Expect(err).NotTo(o.HaveOccurred()) ownership := strings.TrimSpace(output) framework.Logf("Directory ownership: %s", ownership) @@ -350,7 +462,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Verifying directory permissions") framework.Logf("Running: stat -c %%a %s", cnvDropInDir) - output, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "stat", "-c", "%a", cnvDropInDir) + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "stat", "-c", "%a", cnvDropInDir) o.Expect(err).NotTo(o.HaveOccurred()) perms := strings.TrimSpace(output) framework.Logf("Directory permissions: %s", perms) @@ -358,7 +470,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking SELinux context of directory") framework.Logf("Running: ls -ldZ %s", cnvDropInDir) - output, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "ls", "-ldZ", cnvDropInDir) + 
output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-ldZ", cnvDropInDir) if err == nil { framework.Logf("SELinux context: %s", strings.TrimSpace(output)) } @@ -367,27 +479,27 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr testFile := cnvDropInDir + "/test-permissions.conf" framework.Logf("Creating test file: %s", testFile) framework.Logf("File content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err = createDropInFile(oc, cnvWorkerNode, testFile, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err = createDropInFile(ctx, oc, cnvWorkerNode, testFile, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Test file created successfully") - defer removeDropInFile(oc, cnvWorkerNode, testFile) + defer removeDropInFile(ctx, oc, cnvWorkerNode, testFile) g.By("Verifying config file ownership") framework.Logf("Running: stat -c %%U:%%G %s", testFile) - output, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "stat", "-c", "%U:%G", testFile) + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "stat", "-c", "%U:%G", testFile) o.Expect(err).NotTo(o.HaveOccurred()) fileOwnership := strings.TrimSpace(output) framework.Logf("File ownership: %s", fileOwnership) g.By("Verifying config file permissions (should be 644 or 600)") framework.Logf("Running: stat -c %%a %s", testFile) - output, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "stat", "-c", "%a", testFile) + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "stat", "-c", "%a", testFile) o.Expect(err).NotTo(o.HaveOccurred()) filePerms := strings.TrimSpace(output) framework.Logf("File permissions: %s", filePerms) o.Expect(filePerms).To(o.Or(o.Equal("644"), o.Equal("600"))) - framework.Logf("=== TC5 PASSED ===") + framework.Logf("=== TC7 PASSED ===") framework.Logf("Security and permissions summary:") framework.Logf("- Directory: %s", cnvDropInDir) framework.Logf("- Directory ownership: %s (expected: 
root:root)", ownership) @@ -397,31 +509,33 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("- File permissions: %s (expected: 644/600)", filePerms) }) - // TC6: Validate cluster stability and performance - g.It("TC6: should verify cluster stability with LimitedSwap enabled", func(ctx context.Context) { + // TC8: Validate cluster stability and performance + g.It("TC8: should verify cluster stability with LimitedSwap enabled", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") - framework.Logf("=== TC6: Testing cluster stability with LimitedSwap enabled ===") + framework.Logf("=== TC8: Testing cluster stability with LimitedSwap enabled ===") framework.Logf("Executing on node: %s", cnvWorkerNode) g.By("Creating LimitedSwap configuration") framework.Logf("Creating drop-in file: %s", cnvDropInFilePath) framework.Logf("Drop-in file content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err := createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err := createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Drop-in file created successfully") // Verify file was created - output, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "cat", cnvDropInFilePath) + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "cat", cnvDropInFilePath) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Verified drop-in file content:\n%s", output) defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() g.By("Cleaning up") - cleanupDropInAndRestartKubelet(ctx, oc, cnvWorkerNode, cnvDropInFilePath) + 
cleanupDropInAndRestartKubelet(cleanupCtx, oc, cnvWorkerNode, cnvDropInFilePath) }() g.By("Restarting kubelet") @@ -473,7 +587,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } framework.Logf("✅ No memory pressure detected") - framework.Logf("=== TC6 PASSED ===") + framework.Logf("=== TC8 PASSED ===") framework.Logf("Cluster stability verification:") framework.Logf("- Node: %s", cnvWorkerNode) framework.Logf("- swapBehavior: LimitedSwap") @@ -482,9 +596,9 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("- Stability after 30 seconds: CONFIRMED") }) - // TC7: Validate non-CNV cluster unaffected - g.It("TC7: should verify non-CNV workers have no swap configuration", func(ctx context.Context) { - framework.Logf("=== TC7: Testing non-CNV workers have no swap configuration ===") + // TC9: Validate non-CNV cluster unaffected + g.It("TC9: should verify non-CNV workers have no swap configuration", func(ctx context.Context) { + framework.Logf("=== TC9: Testing non-CNV workers have no swap configuration ===") // Get a CNV worker node and temporarily remove its CNV label cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) @@ -526,13 +640,13 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking drop-in directory on non-CNV node") framework.Logf("Running: ls -ld %s on node %s", cnvDropInDir, nonCNVWorkerNode) - output, err = ExecOnNodeWithChroot(oc, nonCNVWorkerNode, "ls", "-ld", cnvDropInDir) + output, err = ExecOnNodeWithChroot(ctx, oc, nonCNVWorkerNode, "ls", "-ld", cnvDropInDir) if err == nil { framework.Logf("Drop-in directory exists: %s", strings.TrimSpace(output)) framework.Logf("Note: Directory exists because CNV was previously installed on this node") g.By("Checking directory contents") framework.Logf("Running: ls -la %s", cnvDropInDir) - dirOutput, _ := ExecOnNodeWithChroot(oc, nonCNVWorkerNode, "ls", "-la", cnvDropInDir) + dirOutput, _ := 
ExecOnNodeWithChroot(ctx, oc, nonCNVWorkerNode, "ls", "-la", cnvDropInDir) framework.Logf("Directory contents:\n%s", dirOutput) } else { framework.Logf("Drop-in directory does not exist on non-CNV node (expected for truly non-CNV nodes)") @@ -547,21 +661,21 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr o.Expect(config.MemorySwap.SwapBehavior).To(o.Or(o.BeEmpty(), o.Equal("NoSwap")), "swapBehavior should be empty or NoSwap on non-CNV node") - framework.Logf("=== TC7 PASSED ===") + framework.Logf("=== TC9 PASSED ===") framework.Logf("Non-CNV worker verification:") framework.Logf("- Node: %s", nonCNVWorkerNode) framework.Logf("- CNV label removed: YES") framework.Logf("- swapBehavior: %s (NoSwap/default)", config.MemorySwap.SwapBehavior) }) - // TC8: Validate behavior with multiple conflicting drop-in files - g.It("TC8: should apply correct precedence with multiple files", func(ctx context.Context) { + // TC10: Validate behavior with multiple conflicting drop-in files + g.It("TC10: should apply correct precedence with multiple files", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") - framework.Logf("=== TC8: Testing file precedence with multiple drop-in files ===") + framework.Logf("=== TC10: Testing file precedence with multiple drop-in files ===") framework.Logf("Executing on node: %s", cnvWorkerNode) framework.Logf("Drop-in directory: %s", cnvDropInDir) @@ -571,31 +685,37 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Creating 98-swap-disabled.conf with NoSwap") framework.Logf("Creating file: %s", file98) framework.Logf("Content:\n%s", loadConfigFromFile(cnvNoSwapConfigPath)) - err := createDropInFile(oc, cnvWorkerNode, file98, loadConfigFromFile(cnvNoSwapConfigPath)) + err := createDropInFile(ctx, 
oc, cnvWorkerNode, file98, loadConfigFromFile(cnvNoSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Created: %s (NoSwap)", file98) g.By("Creating 99-swap-limited.conf with LimitedSwap") framework.Logf("Creating file: %s", file99) framework.Logf("Content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err = createDropInFile(oc, cnvWorkerNode, file99, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err = createDropInFile(ctx, oc, cnvWorkerNode, file99, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Created: %s (LimitedSwap)", file99) g.By("Listing drop-in directory contents") framework.Logf("Running: ls -la %s", cnvDropInDir) - output, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) + output, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) framework.Logf("Directory contents:\n%s", output) defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() g.By("Cleaning up multiple config files") framework.Logf("Removing: %s", file98) - removeDropInFile(oc, cnvWorkerNode, file98) + if err := removeDropInFile(cleanupCtx, oc, cnvWorkerNode, file98); err != nil { + framework.Logf("Warning: failed to remove %s: %v", file98, err) + } framework.Logf("Removing: %s", file99) - removeDropInFile(oc, cnvWorkerNode, file99) + if err := removeDropInFile(cleanupCtx, oc, cnvWorkerNode, file99); err != nil { + framework.Logf("Warning: failed to remove %s: %v", file99, err) + } framework.Logf("Running: systemctl restart kubelet") - restartKubeletOnNode(ctx, oc, cnvWorkerNode) - waitForNodeToBeReady(ctx, oc, cnvWorkerNode) + restartKubeletOnNode(cleanupCtx, oc, cnvWorkerNode) + waitForNodeToBeReady(cleanupCtx, oc, cnvWorkerNode) framework.Logf("Cleanup completed") }() @@ -615,7 +735,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr 
o.Expect(config.MemorySwap.SwapBehavior).To(o.Equal("LimitedSwap"), "99-* file should take precedence over 98-* file") - framework.Logf("=== TC8 PASSED ===") + framework.Logf("=== TC10 PASSED ===") framework.Logf("File precedence verification:") framework.Logf("- File 1: 98-swap-disabled.conf (NoSwap)") framework.Logf("- File 2: 99-swap-limited.conf (LimitedSwap)") @@ -623,10 +743,10 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("- 99-* file correctly overrides 98-* file (lexicographic order)") }) - // TC9: Validate multi-node consistency and synchronization with checksum verification - g.It("TC9: should maintain consistent configuration with checksum verification across CNV nodes", func(ctx context.Context) { + // TC11: Validate multi-node consistency and synchronization with checksum verification + g.It("TC11: should maintain consistent configuration with checksum verification across CNV nodes", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO - framework.Logf("=== TC9: Testing multi-node consistency with checksum verification ===") + framework.Logf("=== TC11: Testing multi-node consistency with checksum verification ===") g.By("Getting all CNV worker nodes") // Get nodes with both worker role and CNV schedulable label @@ -655,22 +775,26 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("Content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) for _, node := range cnvNodes { framework.Logf("Creating drop-in file on node: %s", node) - err := createDropInFile(oc, node, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err := createDropInFile(ctx, oc, node, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf(" -> Created successfully on %s", node) } defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + 
defer cancel() g.By("Cleaning up all CNV nodes") for _, node := range cnvNodes { framework.Logf("Removing drop-in file from node: %s", node) - removeDropInFile(oc, node, cnvDropInFilePath) + if err := removeDropInFile(cleanupCtx, oc, node, cnvDropInFilePath); err != nil { + framework.Logf("Warning: failed to remove drop-in from %s: %v", node, err) + } framework.Logf("Restarting kubelet on node: %s", node) - restartKubeletOnNode(ctx, oc, node) + restartKubeletOnNode(cleanupCtx, oc, node) } for _, node := range cnvNodes { framework.Logf("Waiting for node %s to be ready...", node) - waitForNodeToBeReady(ctx, oc, node) + waitForNodeToBeReady(cleanupCtx, oc, node) } framework.Logf("Cleanup completed on all %d CNV nodes", len(cnvNodes)) }() @@ -679,7 +803,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr checksums := make(map[string]string) for _, node := range cnvNodes { framework.Logf("Running: md5sum %s on node %s", cnvDropInFilePath, node) - output, err := ExecOnNodeWithChroot(oc, node, "md5sum", cnvDropInFilePath) + output, err := ExecOnNodeWithChroot(ctx, oc, node, "md5sum", cnvDropInFilePath) o.Expect(err).NotTo(o.HaveOccurred()) // Extract checksum (first field) checksum := strings.Fields(strings.TrimSpace(output))[0] @@ -740,7 +864,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr driftDetected := false for _, node := range cnvNodes { framework.Logf("Running: md5sum %s on node %s (after wait)", cnvDropInFilePath, node) - output, err := ExecOnNodeWithChroot(oc, node, "md5sum", cnvDropInFilePath) + output, err := ExecOnNodeWithChroot(ctx, oc, node, "md5sum", cnvDropInFilePath) o.Expect(err).NotTo(o.HaveOccurred()) checksum := strings.Fields(strings.TrimSpace(output))[0] framework.Logf("Checksum for %s (after wait): %s", node, checksum) @@ -762,7 +886,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr "Node %s should still have LimitedSwap after wait", node) } 
- framework.Logf("=== TC9 PASSED ===") + framework.Logf("=== TC11 PASSED ===") framework.Logf("Multi-node consistency verification:") framework.Logf("- Total CNV nodes: %d", len(cnvNodes)) framework.Logf("- Configuration checksum: %s (identical across all nodes)", referenceChecksum) @@ -771,11 +895,11 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("- All nodes remain Ready: YES") }) - // TC10: Validate LimitedSwap config when OS-level swap is not enabled + // TC12: Validate LimitedSwap config when OS-level swap is not enabled // This test verifies kubelet gracefully handles LimitedSwap config even without OS swap - g.It("TC10: should handle LimitedSwap config gracefully when OS swap is disabled", func(ctx context.Context) { + g.It("TC12: should handle LimitedSwap config gracefully when OS swap is disabled", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO - framework.Logf("=== TC10: Testing LimitedSwap config when OS swap is disabled ===") + framework.Logf("=== TC12: Testing LimitedSwap config when OS swap is disabled ===") // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) @@ -784,7 +908,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking initial OS-level swap status") framework.Logf("Running: swapon -s") - initialSwapOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "swapon", "-s") + initialSwapOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "swapon", "-s") o.Expect(err).NotTo(o.HaveOccurred(), "Failed to check initial swap status on node %s: %v", cnvWorkerNode, err) framework.Logf("Initial swapon -s output:\n%s", initialSwapOutput) initialHasSwap := strings.TrimSpace(initialSwapOutput) != "" && initialSwapOutput != "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority" @@ -793,7 +917,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr if initialHasSwap { 
g.By("Disabling existing OS-level swap for test") framework.Logf("Running: swapoff -a") - swapoffOutput, swapoffErr := ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapoff", "-a") + swapoffOutput, swapoffErr := ExecOnNodeWithNsenter(ctx, oc, cnvWorkerNode, "swapoff", "-a") if swapoffErr != nil { framework.Failf("Failed to disable swap on node %s: %v (output: %s)", cnvWorkerNode, swapoffErr, swapoffOutput) } @@ -802,7 +926,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Verifying no OS-level swap is present") framework.Logf("Running: swapon -s") - swapOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "swapon", "-s") + swapOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "swapon", "-s") o.Expect(err).NotTo(o.HaveOccurred(), "Failed to verify swap status on node %s: %v", cnvWorkerNode, err) framework.Logf("swapon -s output:\n%s", swapOutput) hasOSSwap := strings.TrimSpace(swapOutput) != "" && swapOutput != "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority" @@ -818,26 +942,29 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Ensuring drop-in directory exists") framework.Logf("Running: mkdir -p %s", cnvDropInDir) - _, _ = ExecOnNodeWithChroot(oc, cnvWorkerNode, "mkdir", "-p", cnvDropInDir) + _, _ = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "mkdir", "-p", cnvDropInDir) g.By("Creating LimitedSwap drop-in configuration") framework.Logf("Creating drop-in file: %s", cnvDropInFilePath) framework.Logf("Content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err = createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err = createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Drop-in file created successfully") defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer 
cancel() g.By("Cleaning up") framework.Logf("Removing drop-in file: %s", cnvDropInFilePath) - removeDropInFile(oc, cnvWorkerNode, cnvDropInFilePath) - // Re-enable swap if it was initially present + if err := removeDropInFile(cleanupCtx, oc, cnvWorkerNode, cnvDropInFilePath); err != nil { + framework.Logf("Warning: failed to remove drop-in: %v", err) + } if initialHasSwap { framework.Logf("Note: OS swap was initially enabled, may need manual re-enable") } framework.Logf("Restarting kubelet on node: %s", cnvWorkerNode) - restartKubeletOnNode(ctx, oc, cnvWorkerNode) - waitForNodeToBeReady(ctx, oc, cnvWorkerNode) + restartKubeletOnNode(cleanupCtx, oc, cnvWorkerNode) + waitForNodeToBeReady(cleanupCtx, oc, cnvWorkerNode) }() g.By("Restarting kubelet with LimitedSwap config but no OS swap") @@ -902,7 +1029,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Verifying /proc/meminfo shows swap fields (even if 0)") framework.Logf("Running: grep -i swap /proc/meminfo") - meminfoOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") + meminfoOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Swap info from /proc/meminfo:\n%s", strings.TrimSpace(meminfoOutput)) o.Expect(meminfoOutput).To(o.ContainSubstring("SwapTotal")) @@ -910,7 +1037,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Verifying free -h shows swap status") framework.Logf("Running: free -h") - freeOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "free", "-h") + freeOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "free", "-h") framework.Logf("free -h output:\n%s", freeOutput) g.By("Verifying node has no memory pressure conditions") @@ -922,7 +1049,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } } - framework.Logf("=== TC10 PASSED ===") + 
framework.Logf("=== TC12 PASSED ===") framework.Logf("LimitedSwap config without OS swap verification:") framework.Logf("- Node: %s", cnvWorkerNode) framework.Logf("- OS swap: disabled/not present") @@ -933,16 +1060,16 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("- Kubelet handles LimitedSwap gracefully even without OS swap") }) - // TC11: Validate behavior with various swap sizes + // TC13: Validate behavior with various swap sizes // This test creates temporary swap files on the node for testing different sizes // It requires sufficient disk space and may take longer to complete - g.It("TC11: should work correctly with various swap sizes", func(ctx context.Context) { + g.It("TC13: should work correctly with various swap sizes", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") - framework.Logf("=== TC11: Testing LimitedSwap with various swap sizes ===") + framework.Logf("=== TC13: Testing LimitedSwap with various swap sizes ===") framework.Logf("Executing on node: %s", cnvWorkerNode) // Define swap sizes to test (in MB) @@ -960,22 +1087,28 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Creating LimitedSwap drop-in configuration") framework.Logf("Creating drop-in file: %s", cnvDropInFilePath) framework.Logf("Content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err := createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err := createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Drop-in file created successfully") defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer 
cancel() g.By("Final cleanup") - // Disable and remove any test swap file framework.Logf("Disabling test swap file if present") - ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapoff", swapFilePath) - ExecOnNodeWithChroot(oc, cnvWorkerNode, "rm", "-f", swapFilePath) - // Remove drop-in config + if _, err := ExecOnNodeWithNsenter(cleanupCtx, oc, cnvWorkerNode, "swapoff", swapFilePath); err != nil { + framework.Logf("Warning: failed to disable swap on %s: %v", cnvWorkerNode, err) + } + if _, err := ExecOnNodeWithChroot(cleanupCtx, oc, cnvWorkerNode, "rm", "-f", swapFilePath); err != nil { + framework.Logf("Warning: failed to remove swap file %s on %s: %v", swapFilePath, cnvWorkerNode, err) + } framework.Logf("Removing drop-in file: %s", cnvDropInFilePath) - removeDropInFile(oc, cnvWorkerNode, cnvDropInFilePath) + if err := removeDropInFile(cleanupCtx, oc, cnvWorkerNode, cnvDropInFilePath); err != nil { + framework.Logf("Warning: failed to remove drop-in %s on %s: %v", cnvDropInFilePath, cnvWorkerNode, err) + } framework.Logf("Restarting kubelet") - restartKubeletOnNode(ctx, oc, cnvWorkerNode) - waitForNodeToBeReady(ctx, oc, cnvWorkerNode) + restartKubeletOnNode(cleanupCtx, oc, cnvWorkerNode) + waitForNodeToBeReady(cleanupCtx, oc, cnvWorkerNode) framework.Logf("Final cleanup completed") }() @@ -999,19 +1132,19 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By(fmt.Sprintf("Disabling any existing swap for %s test", swapSize.name)) framework.Logf("Running: swapoff -a on node %s", cnvWorkerNode) - swapoffOutput, swapoffErr := ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapoff", "-a") + swapoffOutput, swapoffErr := ExecOnNodeWithNsenter(ctx, oc, cnvWorkerNode, "swapoff", "-a") if swapoffErr != nil { framework.Failf("Failed to disable swap on node %s for %s test: %v (output: %s)", cnvWorkerNode, swapSize.name, swapoffErr, swapoffOutput) } framework.Logf("Running: rm -f %s on node %s", swapFilePath, cnvWorkerNode) - rmOutput, rmErr := 
ExecOnNodeWithChroot(oc, cnvWorkerNode, "rm", "-f", swapFilePath) + rmOutput, rmErr := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "rm", "-f", swapFilePath) if rmErr != nil { framework.Failf("Failed to remove swap file %s on node %s for %s test: %v (output: %s)", swapFilePath, cnvWorkerNode, swapSize.name, rmErr, rmOutput) } g.By(fmt.Sprintf("Creating %dMB swap file", swapSize.sizeMB)) framework.Logf("Running: dd if=/dev/zero of=%s bs=1M count=%d", swapFilePath, swapSize.sizeMB) - _, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "dd", "if=/dev/zero", fmt.Sprintf("of=%s", swapFilePath), + _, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "dd", "if=/dev/zero", fmt.Sprintf("of=%s", swapFilePath), "bs=1M", fmt.Sprintf("count=%d", swapSize.sizeMB)) if err != nil { framework.Logf("Warning: Failed to create swap file: %v", err) @@ -1021,10 +1154,10 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } framework.Logf("Running: chmod 600 %s", swapFilePath) - ExecOnNodeWithChroot(oc, cnvWorkerNode, "chmod", "600", swapFilePath) + ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "chmod", "600", swapFilePath) framework.Logf("Running: mkswap %s", swapFilePath) - _, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "mkswap", swapFilePath) + _, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "mkswap", swapFilePath) if err != nil { framework.Logf("Warning: Failed to mkswap: %v", err) result.success = false @@ -1033,7 +1166,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } framework.Logf("Running: swapon %s", swapFilePath) - _, err = ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapon", swapFilePath) + _, err = ExecOnNodeWithNsenter(ctx, oc, cnvWorkerNode, "swapon", swapFilePath) if err != nil { framework.Logf("Warning: Failed to enable swap: %v", err) result.success = false @@ -1061,11 +1194,11 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By(fmt.Sprintf("Verifying swap 
metrics with %s swap", swapSize.name)) framework.Logf("Running: swapon -s") - swapOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "swapon", "-s") + swapOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "swapon", "-s") framework.Logf("swapon -s output:\n%s", swapOutput) framework.Logf("Running: grep -i swap /proc/meminfo") - meminfoOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") + meminfoOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") framework.Logf("Swap info from /proc/meminfo:\n%s", strings.TrimSpace(meminfoOutput)) // Parse SwapTotal @@ -1079,7 +1212,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } framework.Logf("Running: free -h") - freeOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "free", "-h") + freeOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "free", "-h") framework.Logf("free -h output:\n%s", freeOutput) // Verify swap size is approximately what we configured (within 10%) @@ -1102,7 +1235,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("--- %s swap (%dMB) test PASSED ---", swapSize.name, swapSize.sizeMB) } - framework.Logf("=== TC11 PASSED ===") + framework.Logf("=== TC13 PASSED ===") framework.Logf("Swap size verification results:") for _, r := range results { framework.Logf("- %s (%dMB): Success=%v, SwapTotal=%dMB, NodeReady=%v, ConfigOK=%v", @@ -1111,14 +1244,14 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("LimitedSwap works correctly with all tested swap sizes") }) - // TC12: Validate swap metrics and observability via Prometheus - g.It("TC12: should expose swap metrics correctly via Prometheus", func(ctx context.Context) { + // TC14: Validate swap metrics and observability via Prometheus + g.It("TC14: should expose swap metrics correctly via Prometheus", func(ctx context.Context) { 
skipOnSingleNodeTopology(oc) //skip this test for SNO // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") - framework.Logf("=== TC12: Testing swap metrics and observability via Prometheus ===") + framework.Logf("=== TC14: Testing swap metrics and observability via Prometheus ===") framework.Logf("Executing on node: %s", cnvWorkerNode) swapFilePath := "/var/swapfile" @@ -1127,7 +1260,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking OS-level swap status") framework.Logf("Running: swapon -s") - swapOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "swapon", "-s") + swapOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "swapon", "-s") framework.Logf("swapon -s output:\n%s", swapOutput) hasOSSwap := strings.TrimSpace(swapOutput) != "" && swapOutput != "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority" @@ -1138,49 +1271,55 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By(fmt.Sprintf("Creating %dMB swap file at %s", swapSizeMB, swapFilePath)) framework.Logf("Running: dd if=/dev/zero of=%s bs=1M count=%d", swapFilePath, swapSizeMB) - ddOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "dd", "if=/dev/zero", fmt.Sprintf("of=%s", swapFilePath), "bs=1M", fmt.Sprintf("count=%d", swapSizeMB)) + ddOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "dd", "if=/dev/zero", fmt.Sprintf("of=%s", swapFilePath), "bs=1M", fmt.Sprintf("count=%d", swapSizeMB)) if err != nil { framework.Logf("Warning: dd command returned error (may still have succeeded): %v", err) } framework.Logf("dd output: %s", ddOutput) framework.Logf("Running: chmod 600 %s", swapFilePath) - _, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "chmod", "600", swapFilePath) + _, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "chmod", "600", swapFilePath) o.Expect(err).NotTo(o.HaveOccurred()) 
framework.Logf("Running: mkswap %s", swapFilePath) - mkswapOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "mkswap", swapFilePath) + mkswapOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "mkswap", swapFilePath) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("mkswap output: %s", mkswapOutput) g.By("Enabling swap") framework.Logf("Running: swapon %s", swapFilePath) - _, err = ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapon", swapFilePath) + _, err = ExecOnNodeWithNsenter(ctx, oc, cnvWorkerNode, "swapon", swapFilePath) o.Expect(err).NotTo(o.HaveOccurred()) swapCreated = true // Verify swap is now enabled framework.Logf("Verifying swap is enabled...") - swapVerify, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "swapon", "-s") + swapVerify, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "swapon", "-s") framework.Logf("swapon -s after enabling:\n%s", swapVerify) hasOSSwap = true } defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() g.By("Cleaning up swap file and drop-in configuration") if swapCreated { framework.Logf("Disabling swap: swapoff %s", swapFilePath) - ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapoff", swapFilePath) + if _, err := ExecOnNodeWithNsenter(cleanupCtx, oc, cnvWorkerNode, "swapoff", swapFilePath); err != nil { + framework.Logf("Warning: failed to disable swap: %v", err) + } framework.Logf("Removing swap file: rm -f %s", swapFilePath) - ExecOnNodeWithChroot(oc, cnvWorkerNode, "rm", "-f", swapFilePath) + if _, err := ExecOnNodeWithChroot(cleanupCtx, oc, cnvWorkerNode, "rm", "-f", swapFilePath); err != nil { + framework.Logf("Warning: failed to remove swap file: %v", err) + } } - cleanupDropInAndRestartKubelet(ctx, oc, cnvWorkerNode, cnvDropInFilePath) + cleanupDropInAndRestartKubelet(cleanupCtx, oc, cnvWorkerNode, cnvDropInFilePath) }() g.By("Creating LimitedSwap configuration") framework.Logf("Creating drop-in file: %s", cnvDropInFilePath) 
framework.Logf("Content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err := createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err := createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Drop-in file created successfully") @@ -1200,7 +1339,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Getting swap metrics from /proc/meminfo (baseline)") framework.Logf("Running: grep -i swap /proc/meminfo") - meminfoOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") + meminfoOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Swap metrics from /proc/meminfo:\n%s", strings.TrimSpace(meminfoOutput)) @@ -1219,7 +1358,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking free -h output for swap") framework.Logf("Running: free -h") - freeOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "free", "-h") + freeOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "free", "-h") framework.Logf("free -h output:\n%s", freeOutput) g.By("Querying Prometheus for node swap metrics") @@ -1330,7 +1469,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } else if hasOSSwap { osSwapStatus = "enabled (pre-existing)" } - framework.Logf("=== TC12 PASSED ===") + framework.Logf("=== TC14 PASSED ===") framework.Logf("Swap metrics and observability verification:") framework.Logf("- Node: %s", cnvWorkerNode) framework.Logf("- OS swap: %s", osSwapStatus) diff --git a/test/extended/node/node_utils.go b/test/extended/node/node_utils.go index 13256656582e..cd9b91d1e8cc 100644 --- a/test/extended/node/node_utils.go +++ b/test/extended/node/node_utils.go @@ -157,34 +157,69 @@ 
func getCNVWorkerNodeName(ctx context.Context, oc *exutil.CLI) string { return nodes[rand.Intn(len(nodes))].Name } -// ExecOnNodeWithChroot runs a command on a node using oc debug with chroot /host -func ExecOnNodeWithChroot(oc *exutil.CLI, nodeName string, cmd ...string) (string, error) { +func execOnNodeWithDebug(ctx context.Context, oc *exutil.CLI, nodeName string, timeout time.Duration, args []string) (string, error) { + timeoutCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + execCmd, stdOutBuf, stdErrBuf, err := oc.AsAdmin().WithoutNamespace().Run("debug").Args(args...).Background() + if err != nil { + return "", err + } + + type result struct { + err error + } + resultCh := make(chan result, 1) + + go func() { + resultCh <- result{err: execCmd.Wait()} + }() + + select { + case res := <-resultCh: + stdOut := strings.TrimSpace(stdOutBuf.String()) + stdErr := strings.TrimSpace(stdErrBuf.String()) + if res.err != nil { + return stdOut, fmt.Errorf("oc debug failed: %w\nStdErr: %s", res.err, stdErr) + } + return stdOut, nil + case <-timeoutCtx.Done(): + var killErr error + if execCmd.Process != nil { + killErr = execCmd.Process.Kill() + } + if ctx.Err() != nil { + return "", fmt.Errorf("oc debug command canceled on node %s: %w", nodeName, ctx.Err()) + } + if killErr != nil { + return "", fmt.Errorf("oc debug command timed out after %v on node %s; failed to stop debug process: %w", timeout, nodeName, killErr) + } + return "", fmt.Errorf("oc debug command timed out after %v on node %s (cleanup likely hung)", timeout, nodeName) + } +} + +func ExecOnNodeWithChroot(ctx context.Context, oc *exutil.CLI, nodeName string, cmd ...string) (string, error) { args := append([]string{"node/" + nodeName, "-n" + DebugNamespace, "--", "chroot", "/host"}, cmd...) 
- stdOut, _, err := oc.AsAdmin().WithoutNamespace().Run("debug").Args(args...).Outputs() - return stdOut, err + return execOnNodeWithDebug(ctx, oc, nodeName, 2*time.Minute, args) } -// ExecOnNodeWithNsenter runs a command on a node using nsenter to access host namespaces -// This is needed for swap operations (swapon/swapoff) that require direct namespace access -func ExecOnNodeWithNsenter(oc *exutil.CLI, nodeName string, cmd ...string) (string, error) { +func ExecOnNodeWithNsenter(ctx context.Context, oc *exutil.CLI, nodeName string, cmd ...string) (string, error) { nsenterCmd := append([]string{"nsenter", "-a", "-t", "1"}, cmd...) args := append([]string{"node/" + nodeName, "-n" + DebugNamespace, "--"}, nsenterCmd...) - stdOut, _, err := oc.AsAdmin().WithoutNamespace().Run("debug").Args(args...).Outputs() - return stdOut, err + return execOnNodeWithDebug(ctx, oc, nodeName, 2*time.Minute, args) } // createDropInFile creates a drop-in configuration file on the specified node -func createDropInFile(oc *exutil.CLI, nodeName, filePath, content string) error { - // Escape content for shell +func createDropInFile(ctx context.Context, oc *exutil.CLI, nodeName, filePath, content string) error { escapedContent := strings.ReplaceAll(content, "'", "'\\''") cmd := fmt.Sprintf("echo '%s' > %s && chmod 644 %s", escapedContent, filePath, filePath) - _, err := ExecOnNodeWithChroot(oc, nodeName, "sh", "-c", cmd) + _, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "sh", "-c", cmd) return err } // removeDropInFile removes a drop-in configuration file from the specified node -func removeDropInFile(oc *exutil.CLI, nodeName, filePath string) error { - _, err := ExecOnNodeWithChroot(oc, nodeName, "rm", "-f", filePath) +func removeDropInFile(ctx context.Context, oc *exutil.CLI, nodeName, filePath string) error { + _, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "rm", "-f", filePath) return err } @@ -203,7 +238,7 @@ func restartKubeletOnNode(ctx context.Context, oc *exutil.CLI, nodeName 
string) const maxAttempts = 3 var lastErr error for attempt := 0; attempt < maxAttempts; attempt++ { - _, err := ExecOnNodeWithChroot(oc, nodeName, "systemctl", "restart", "kubelet") + _, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "systemctl", "restart", "kubelet") if err == nil { return nil } @@ -272,7 +307,7 @@ func isNodeInReadyState(node *corev1.Node) bool { // cleanupDropInAndRestartKubelet removes the drop-in file and restarts kubelet func cleanupDropInAndRestartKubelet(ctx context.Context, oc *exutil.CLI, nodeName, filePath string) { framework.Logf("Removing drop-in file: %s", filePath) - removeDropInFile(oc, nodeName, filePath) + removeDropInFile(ctx, oc, nodeName, filePath) framework.Logf("Restarting kubelet on node: %s", nodeName) restartKubeletOnNode(ctx, oc, nodeName) framework.Logf("Waiting for node to be ready...") @@ -443,7 +478,7 @@ func installCNVOperator(ctx context.Context, oc *exutil.CLI) error { return fmt.Errorf("failed to create MC client for MCP check: %w", err) } - err = waitForMCP(ctx, mcClient, "worker", 30*time.Minute) + err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) if err != nil { return fmt.Errorf("MCP rollout failed after CNV installation: %w", err) } @@ -486,7 +521,7 @@ func waitForCNVOperatorReady(ctx context.Context, oc *exutil.CLI) error { func waitForHyperConvergedReady(ctx context.Context, oc *exutil.CLI) error { dynamicClient := oc.AdminDynamicClient() - return wait.PollUntilContextTimeout(ctx, 15*time.Second, 20*time.Minute, true, func(ctx context.Context) (bool, error) { + return wait.PollUntilContextTimeout(ctx, 15*time.Second, 15*time.Minute, true, func(ctx context.Context) (bool, error) { hc, err := dynamicClient.Resource(hyperConvergedGVR).Namespace(cnvNamespace).Get(ctx, cnvHyperConverged, metav1.GetOptions{}) if err != nil { framework.Logf("Error getting HyperConverged: %v", err) @@ -716,7 +751,7 @@ func uninstallCNVOperator(ctx context.Context, oc *exutil.CLI) error { if err != nil { 
framework.Logf("Warning: failed to create MC client for MCP check: %v", err) } else { - err = waitForMCP(ctx, mcClient, "worker", 30*time.Minute) + err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) if err != nil { framework.Logf("Warning: MCP rollout check failed: %v", err) } @@ -734,7 +769,7 @@ func ensureDropInDirectoryExists(ctx context.Context, oc *exutil.CLI, dirPath st } for _, node := range nodes { - _, err := ExecOnNodeWithChroot(oc, node.Name, "mkdir", "-p", dirPath) + _, err := ExecOnNodeWithChroot(ctx, oc, node.Name, "mkdir", "-p", dirPath) if err != nil { framework.Logf("Warning: failed to create directory on node %s: %v", node.Name, err) } @@ -786,9 +821,9 @@ func CalculateEventTimeDiff(startEvent, endEvent *corev1.Event) time.Duration { // GetPodNetNs retrieves the network namespace path for a pod using crictl. // It uses crictl to get the sandbox ID and then inspects it to extract the NetNS path. // Returns the NetNS path and an error if not found. -func GetPodNetNs(oc *exutil.CLI, nodeName, podName string) (string, error) { +func GetPodNetNs(ctx context.Context, oc *exutil.CLI, nodeName, podName string) (string, error) { // Get sandbox ID using crictl - sandboxID, err := ExecOnNodeWithChroot(oc, nodeName, "crictl", "pods", "--name", podName, "-q") + sandboxID, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "crictl", "pods", "--name", podName, "-q") if err != nil || sandboxID == "" { framework.Logf("Failed to get sandbox ID for pod %s: %v", podName, err) return "", fmt.Errorf("failed to get sandbox ID for pod %s: %w", podName, err) @@ -797,7 +832,7 @@ func GetPodNetNs(oc *exutil.CLI, nodeName, podName string) (string, error) { framework.Logf("Found sandbox ID: %s", sandboxID) // Extract network namespace path from sandbox inspection - netNsStr, err := ExecOnNodeWithChroot(oc, nodeName, "sh", "-c", fmt.Sprintf("crictl inspectp %s | grep -i netns", sandboxID)) + netNsStr, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "sh", "-c", 
fmt.Sprintf("crictl inspectp %s | grep -i netns", sandboxID)) if err != nil { framework.Logf("Failed to get NetNS from crictl inspect: %v", err) return "", fmt.Errorf("failed to get NetNS from crictl inspect: %w", err) @@ -818,9 +853,9 @@ func GetPodNetNs(oc *exutil.CLI, nodeName, podName string) (string, error) { // CheckNetNsCleaned verifies that the network namespace file has been cleaned up. // It checks if the NetNS path no longer exists on the node. // Returns nil if the file is cleaned, error if it still exists. -func CheckNetNsCleaned(oc *exutil.CLI, nodeName, netNsPath string) error { +func CheckNetNsCleaned(ctx context.Context, oc *exutil.CLI, nodeName, netNsPath string) error { // Use test command which returns proper exit code - _, err := ExecOnNodeWithChroot(oc, nodeName, "test", "-e", netNsPath) + _, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "test", "-e", netNsPath) if err != nil { // Non-nil err: file absent (test exit 1) OR exec/debug failure. framework.Logf("NetNS file considered cleaned (test -e returned error: %v)", err) @@ -829,3 +864,26 @@ func CheckNetNsCleaned(oc *exutil.CLI, nodeName, netNsPath string) error { // No error means file still exists return fmt.Errorf("NetNS file still exists at %s", netNsPath) } + +func GetNotReadyNodes(ctx context.Context, oc *exutil.CLI) ([]string, error) { + nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, err + } + + var notReadyNodes []string + for _, node := range nodes.Items { + if !isNodeInReadyState(&node) { + notReadyNodes = append(notReadyNodes, node.Name) + } + } + + return notReadyNodes, nil +} + +func EnsureNodesReady(ctx context.Context, oc *exutil.CLI) { + notReadyNodes, err := GetNotReadyNodes(ctx, oc) + o.Expect(err).NotTo(o.HaveOccurred(), "failed to check node readiness") + o.Expect(notReadyNodes).To(o.BeEmpty(), + "Cannot start test: nodes not Ready: %v. 
Cluster may be recovering from previous test.", notReadyNodes) +} diff --git a/test/extended/node/system_compressible.go b/test/extended/node/system_compressible.go index 6eb0dcc6351d..1358ec46bb5a 100644 --- a/test/extended/node/system_compressible.go +++ b/test/extended/node/system_compressible.go @@ -34,6 +34,8 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv if isMicroShift { g.Skip("Skipping test on MicroShift cluster") } + + EnsureNodesReady(ctx, oc) }) g.It("should enforce system compressible CPU limit by default", func(ctx context.Context) { @@ -57,7 +59,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv // Read SYSTEM_RESERVED_CPU from /etc/node-sizing.env g.By("Reading SYSTEM_RESERVED_CPU from /etc/node-sizing.env") - nodeSizingOutput, err := ExecOnNodeWithChroot(oc, nodeName, "cat", "/etc/node-sizing.env") + nodeSizingOutput, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "cat", "/etc/node-sizing.env") o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to read /etc/node-sizing.env") framework.Logf("/etc/node-sizing.env contents:\n%s", nodeSizingOutput) @@ -81,7 +83,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv // Check cgroup cpu.weight configuration for system.slice g.By("Verifying system.slice cgroup CPU weight") - actualWeight, err := readCgroupCPUWeight(oc, nodeName, "system.slice") + actualWeight, err := readCgroupCPUWeight(ctx, oc, nodeName, "system.slice") o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to read cpu.weight for system.slice") framework.Logf("system.slice actual cpu.weight: %d", actualWeight) @@ -265,7 +267,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv // Check cgroup cpu.weight configuration for system.slice g.By("Verifying system.slice cgroup CPU weight when system compressible is disabled") - actualWeight, err := readCgroupCPUWeight(oc, nodeName, "system.slice") + 
actualWeight, err := readCgroupCPUWeight(ctx, oc, nodeName, "system.slice") o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to read cpu.weight for system.slice") framework.Logf("system.slice actual cpu.weight when disabled: %d", actualWeight) @@ -562,10 +564,10 @@ func selectTestNode(ctx context.Context, oc *exutil.CLI, minCPUs int) (string, i } // readCgroupCPUWeight reads cpu.weight file for a cgroup slice -func readCgroupCPUWeight(oc *exutil.CLI, nodeName, slicePath string) (uint64, error) { +func readCgroupCPUWeight(ctx context.Context, oc *exutil.CLI, nodeName, slicePath string) (uint64, error) { weightPath := fmt.Sprintf("/sys/fs/cgroup/%s/cpu.weight", slicePath) - output, err := ExecOnNodeWithChroot(oc, nodeName, "cat", weightPath) + output, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "cat", weightPath) if err != nil { return 0, fmt.Errorf("failed to read %s: %w", weightPath, err) } From fc7d1bac53eb8afba46fbc7ae2d9e124a094a661 Mon Sep 17 00:00:00 2001 From: Neeraj Krishna Gopalakrishna Date: Tue, 30 Jun 2026 15:26:47 +0530 Subject: [PATCH 2/2] Fix longrunning test case Exec timeout --- .../node/kubelet_secret_pulled_images.go | 10 +- test/extended/node/kubeletconfig_features.go | 110 +++--- test/extended/node/kubeletconfig_tls.go | 141 +------- .../node/node_e2e/container_runtime_config.go | 97 ++---- test/extended/node/node_e2e/netns_cleanup.go | 6 +- test/extended/node/node_kc_helpers.go | 141 ++++++++ test/extended/node/node_mcp_helpers.go | 207 +++++++++++ test/extended/node/node_sizing.go | 163 +-------- test/extended/node/node_swap.go | 5 +- test/extended/node/node_utils.go | 8 +- test/extended/node/system_compressible.go | 326 +++--------------- 11 files changed, 506 insertions(+), 708 deletions(-) create mode 100644 test/extended/node/node_kc_helpers.go create mode 100644 test/extended/node/node_mcp_helpers.go diff --git a/test/extended/node/kubelet_secret_pulled_images.go b/test/extended/node/kubelet_secret_pulled_images.go index 
a1b1e1f9145f..bf819c53aa7e 100644 --- a/test/extended/node/kubelet_secret_pulled_images.go +++ b/test/extended/node/kubelet_secret_pulled_images.go @@ -205,8 +205,8 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv credVerifyCreateSecret(ctx, oc, ns, "pull-secret", pullSecret) g.DeferCleanup(func() { - _ = deleteKC(oc, kcName) - _ = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) + cleanupCtx := context.Background() + _ = CleanupKubeletConfig(cleanupCtx, mcClient, kcName, "worker") }) g.By("Pre-caching private image on the node with a valid secret") @@ -215,7 +215,7 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv g.By("Applying NeverVerify policy and waiting for MCO rollout") credVerifyApplyPolicy(ctx, mcClient, kcName, `{"imagePullCredentialsVerificationPolicy":"NeverVerify"}`) credVerifyWaitForMCPUpdating(ctx, mcClient, "worker") - err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) + err = WaitForMCP(ctx, mcClient, "worker", 15*time.Minute) o.Expect(err).NotTo(o.HaveOccurred()) g.By("Verifying NeverVerify policy allows pod without secret to use cached image") @@ -224,7 +224,7 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv g.By("Switching to AlwaysVerify policy and waiting for MCO rollout") credVerifyApplyPolicy(ctx, mcClient, kcName, `{"imagePullCredentialsVerificationPolicy":"AlwaysVerify"}`) credVerifyWaitForMCPUpdating(ctx, mcClient, "worker") - err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) + err = WaitForMCP(ctx, mcClient, "worker", 15*time.Minute) o.Expect(err).NotTo(o.HaveOccurred()) // This pod also re-caches the image after MCO rollout since pull records are cleared @@ -413,7 +413,7 @@ func credVerifyApplyPolicy(ctx context.Context, mcClient *mcclient.Clientset, na existing, err := mcClient.MachineconfigurationV1().KubeletConfigs().Get(ctx, name, metav1.GetOptions{}) if apierrors.IsNotFound(err) { - _, err = 
mcClient.MachineconfigurationV1().KubeletConfigs().Create(ctx, kc, metav1.CreateOptions{}) + _, err = CreateKubeletConfig(ctx, mcClient, kc) o.Expect(err).NotTo(o.HaveOccurred()) return } diff --git a/test/extended/node/kubeletconfig_features.go b/test/extended/node/kubeletconfig_features.go index 5c7aa20f1fb4..1de3d209e42d 100644 --- a/test/extended/node/kubeletconfig_features.go +++ b/test/extended/node/kubeletconfig_features.go @@ -11,6 +11,7 @@ import ( o "github.com/onsi/gomega" osconfigv1 "github.com/openshift/api/config/v1" + machineconfigclient "github.com/openshift/client-go/machineconfiguration/clientset/versioned" exutil "github.com/openshift/origin/test/extended/util" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -32,96 +33,83 @@ var ( var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptive]", func() { defer g.GinkgoRecover() var ( - NodeMachineConfigPoolBaseDir = exutil.FixturePath("testdata", "node", "machineconfigpool") - NodeKubeletConfigBaseDir = exutil.FixturePath("testdata", "node", "kubeletconfig") - - customMCPFixture = filepath.Join(NodeMachineConfigPoolBaseDir, "customMCP.yaml") - customLoggingKCFixture = filepath.Join(NodeKubeletConfigBaseDir, "loggingKC.yaml") + NodeKubeletConfigBaseDir = exutil.FixturePath("testdata", "node", "kubeletconfig") + customLoggingKCFixture = filepath.Join(NodeKubeletConfigBaseDir, "loggingKC.yaml") oc = exutil.NewCLIWithoutNamespace("node-kubeletconfig") ) // This test is also considered `Slow` because it takes longer than 5 minutes to run. 
- g.It("[Slow]should apply KubeletConfig with logging verbosity to custom pool [apigroup:machineconfiguration.openshift.io]", func() { + g.It("[Slow]should apply KubeletConfig with logging verbosity to custom pool [apigroup:machineconfiguration.openshift.io]", func(ctx context.Context) { // Skip this test on single node and two-node platforms since custom MCPs are not supported // for clusters with only a master MCP skipOnSingleNodeTopology(oc) skipOnTwoNodeTopology(oc) - // Get the MCP and KubeletConfig fixtures needed for this test - mcpFixture := customMCPFixture + // Get the KubeletConfig fixture needed for this test kcFixture := customLoggingKCFixture // Create kube client for test kubeClient, err := kubernetes.NewForConfig(oc.KubeFramework().ClientConfig()) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error getting kube client: %v", err)) - // Create custom MCP - defer deleteMCP(oc, "custom") - err = oc.Run("apply").Args("-f", mcpFixture).Execute() - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error creating MCP `custom`: %v", err)) + // Get a worker node for testing + nodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{LabelSelector: labels.SelectorFromSet(labels.Set{"node-role.kubernetes.io/worker": ""}).String()}) + o.Expect(err).NotTo(o.HaveOccurred(), "Error getting worker nodes") + o.Expect(len(nodes.Items)).To(o.BeNumerically(">", 0), "No worker nodes found") + testNode := nodes.Items[0].Name + + // Create machine config client + mcClient, err := machineconfigclient.NewForConfig(oc.KubeFramework().ClientConfig()) + o.Expect(err).NotTo(o.HaveOccurred(), "Error creating machine config client") + + // Create custom MCP for the node + mcpConfig, err := CreateCustomMCPForNode(ctx, oc, mcClient, "custom", testNode) + o.Expect(err).NotTo(o.HaveOccurred(), "Error creating custom MCP") - // Add node to custom MCP & wait for the node to be ready in the MCP - optedNodes, err := addWorkerNodesToCustomPool(oc, kubeClient, 1, 
"custom") - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error adding node to `custom` MCP: %v", err)) - defer waitTillNodeReadyWithConfig(kubeClient, optedNodes[0], workerConfigPrefix) - defer unlabelNode(oc, optedNodes[0]) - framework.Logf("Waiting for `%v` node to be ready in `custom` MCP.", optedNodes[0]) - waitTillNodeReadyWithConfig(kubeClient, optedNodes[0], customConfigPrefix) + defer func() { + cleanupErr := CleanupCustomMCP(ctx, mcpConfig) + if cleanupErr != nil { + framework.Logf("Warning: cleanup had errors: %v", cleanupErr) + } + }() + + // Wait for the node to be ready in the custom MCP + framework.Logf("Waiting for node %s to be ready in custom MCP", testNode) + waitTillNodeReadyWithConfig(kubeClient, testNode, customConfigPrefix) // Get the current config before applying KubeletConfig - node, err := kubeClient.CoreV1().Nodes().Get(context.TODO(), optedNodes[0], metav1.GetOptions{}) + node, err := kubeClient.CoreV1().Nodes().Get(ctx, testNode, metav1.GetOptions{}) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error getting node: %v", err)) originalConfig := node.Annotations["machineconfiguration.openshift.io/currentConfig"] - framework.Logf("Node '%v' has original config: %v", optedNodes[0], originalConfig) + framework.Logf("Node %s has original config: %s", testNode, originalConfig) // Apply KubeletConfig with logging verbosity - defer deleteKC(oc, "custom-logging-config") + defer func() { + if err := CleanupKubeletConfig(ctx, mcClient, "custom-logging-config", ""); err != nil { + framework.Logf("Warning: KubeletConfig cleanup failed: %v", err) + } + }() err = oc.Run("apply").Args("-f", kcFixture).Execute() o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error applying KubeletConfig: %v", err)) // Wait for the node to reboot after applying KubeletConfig // KubeletConfig changes require a node reboot to take effect - framework.Logf("Waiting for node '%v' to reboot after applying KubeletConfig", optedNodes[0]) - waitForReboot(kubeClient, 
optedNodes[0]) + framework.Logf("Waiting for node %s to reboot after applying KubeletConfig", testNode) + waitForReboot(kubeClient, testNode) // Verify the node has been updated with new config - framework.Logf("Verifying node '%v' has updated config after reboot", optedNodes[0]) - node, err = kubeClient.CoreV1().Nodes().Get(context.TODO(), optedNodes[0], metav1.GetOptions{}) + framework.Logf("Verifying node %s has updated config after reboot", testNode) + node, err = kubeClient.CoreV1().Nodes().Get(ctx, testNode, metav1.GetOptions{}) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error getting node after update: %v", err)) o.Expect(node.Annotations["machineconfiguration.openshift.io/state"]).To(o.Equal("Done"), "Node should be in Done state after reboot") newConfig := node.Annotations["machineconfiguration.openshift.io/currentConfig"] - o.Expect(newConfig).NotTo(o.Equal(originalConfig), "Node config should have changed from %v to %v", originalConfig, newConfig) + o.Expect(newConfig).NotTo(o.Equal(originalConfig), "Node config should have changed from %s to %s", originalConfig, newConfig) - framework.Logf("Successfully applied KubeletConfig with logging verbosity to node '%v', config changed from '%v' to '%v'", optedNodes[0], originalConfig, newConfig) + framework.Logf("Successfully applied KubeletConfig with logging verbosity to node %s, config changed from %s to %s", testNode, originalConfig, newConfig) }) }) -// `addWorkerNodesToCustomPool` labels the desired number of worker nodes with the MCP role -// selector so that the nodes become part of the desired custom MCP -func addWorkerNodesToCustomPool(oc *exutil.CLI, kubeClient *kubernetes.Clientset, numberOfNodes int, customMCP string) ([]string, error) { - // Get the worker nodes - nodes, err := kubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: labels.SelectorFromSet(labels.Set{"node-role.kubernetes.io/worker": ""}).String()}) - if err != nil { - return nil, err - } - // 
Return an error if there are less worker nodes in the cluster than the desired number of nodes to add to the custom MCP - if len(nodes.Items) < numberOfNodes { - return nil, fmt.Errorf("Node in Worker MCP %d < Number of nodes needed in %d MCP", len(nodes.Items), numberOfNodes) - } - - // Label the nodes with the custom MCP role selector - var optedNodes []string - for node_i := 0; node_i < numberOfNodes; node_i++ { - err = oc.AsAdmin().Run("label").Args("node", nodes.Items[node_i].Name, fmt.Sprintf("node-role.kubernetes.io/%s=", customMCP)).Execute() - if err != nil { - return nil, err - } - optedNodes = append(optedNodes, nodes.Items[node_i].Name) - } - return optedNodes, nil -} - // `waitForReboot` waits for up to 5 minutes for the input node to start a reboot and then up to 15 // minutes for the node to complete its reboot. func waitForReboot(kubeClient *kubernetes.Clientset, nodeName string) { @@ -172,22 +160,6 @@ func waitTillNodeReadyWithConfig(kubeClient *kubernetes.Clientset, nodeName, cur }, 5*time.Minute, 10*time.Second).Should(o.BeTrue(), "Timed out waiting for Node '%s' to have rendered-worker config.", nodeName) } -// `unlabelNode` removes the `node-role.kubernetes.io/custom` label from the node with the input -// name. This triggers the node's removal from the custom MCP named `custom`. 
-func unlabelNode(oc *exutil.CLI, name string) error { - return oc.AsAdmin().Run("label").Args("node", name, "node-role.kubernetes.io/custom-").Execute() -} - -// `deleteKC` deletes the KubeletConfig with the input name -func deleteKC(oc *exutil.CLI, name string) error { - return oc.Run("delete").Args("kubeletconfig", name).Execute() -} - -// `deleteMCP` deletes the MachineConfigPool with the input name -func deleteMCP(oc *exutil.CLI, name string) error { - return oc.Run("delete").Args("mcp", name).Execute() -} - // `skipOnSingleNodeTopology` skips the test if the cluster is using single-node topology func skipOnSingleNodeTopology(oc *exutil.CLI) { infra, err := oc.AdminConfigClient().ConfigV1().Infrastructures().Get(context.Background(), "cluster", metav1.GetOptions{}) diff --git a/test/extended/node/kubeletconfig_tls.go b/test/extended/node/kubeletconfig_tls.go index dcb992badcbb..85c512aaf75f 100644 --- a/test/extended/node/kubeletconfig_tls.go +++ b/test/extended/node/kubeletconfig_tls.go @@ -3,7 +3,6 @@ package node import ( "context" "fmt" - "strings" "time" g "github.com/onsi/ginkgo/v2" @@ -13,12 +12,7 @@ import ( mcfgv1 "github.com/openshift/api/machineconfiguration/v1" machineconfigclient "github.com/openshift/client-go/machineconfiguration/clientset/versioned" exutil "github.com/openshift/origin/test/extended/util" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/kubernetes/test/e2e/framework" ) @@ -31,7 +25,6 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv oc = exutil.NewCLIWithoutNamespace("node-kubeletconfig-tls") kubeletConfigName = "tls13-kubelet-config" testMCPName = "kubelet-tls-test" - testNodeMCPLabel = fmt.Sprintf("node-role.kubernetes.io/%s", testMCPName) ) skipUnsupportedTopologies := func() { @@ -74,94 +67,19 @@ var _ = 
g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv framework.Failf("Unexpected default TLS version %q, expected VersionTLS12 or empty (cluster default)", defaultTLSVersion) } - g.By(fmt.Sprintf("Creating custom MachineConfigPool %s", testMCPName)) - testMCP := &mcfgv1.MachineConfigPool{ - ObjectMeta: metav1.ObjectMeta{ - Name: testMCPName, - Labels: map[string]string{ - "machineconfiguration.openshift.io/pool": testMCPName, - }, - }, - Spec: mcfgv1.MachineConfigPoolSpec{ - MachineConfigSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "machineconfiguration.openshift.io/role", - Operator: metav1.LabelSelectorOpIn, - Values: []string{"worker", testMCPName}, - }, - }, - }, - NodeSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - testNodeMCPLabel: "", - }, - }, - }, - } - - _, err = mcClient.MachineconfigurationV1().MachineConfigPools().Create(ctx, testMCP, metav1.CreateOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "Failed to create custom MachineConfigPool %s", testMCPName) + // Create custom MCP for the node + mcpConfig, err := CreateCustomMCPForNode(ctx, oc, mcClient, testMCPName, testNode) + o.Expect(err).NotTo(o.HaveOccurred(), "Should create custom MCP") cleanupMCP := func() { - framework.Logf("Cleanup: deleting MachineConfigPool %s", testMCPName) cleanupCtx := context.Background() - deleteErr := mcClient.MachineconfigurationV1().MachineConfigPools().Delete(cleanupCtx, testMCPName, metav1.DeleteOptions{}) - if apierrors.IsNotFound(deleteErr) { - return - } - if deleteErr != nil { - framework.Logf("Failed to delete MachineConfigPool %s: %v", testMCPName, deleteErr) + err := CleanupCustomMCP(cleanupCtx, mcpConfig) + if err != nil { + framework.Logf("Warning: cleanup had errors: %v", err) } } - // DeferCleanup runs in LIFO order. Registering MCP first ensures it - // is deleted last, after the node label is removed and the node has - // transitioned back to the worker pool. 
g.DeferCleanup(cleanupMCP) - g.By(fmt.Sprintf("Labeling node %s with %s", testNode, testNodeMCPLabel)) - patchData := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:""}}}`, testNodeMCPLabel)) - _, err = oc.AdminKubeClient().CoreV1().Nodes().Patch(ctx, testNode, types.MergePatchType, patchData, metav1.PatchOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "Failed to label node %s", testNode) - - cleanupNodeLabel := func() { - framework.Logf("Cleanup: removing label %s from node %s", testNodeMCPLabel, testNode) - cleanupCtx := context.Background() - removePatch := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:null}}}`, testNodeMCPLabel)) - _, patchErr := oc.AdminKubeClient().CoreV1().Nodes().Patch(cleanupCtx, testNode, types.MergePatchType, removePatch, metav1.PatchOptions{}) - if apierrors.IsNotFound(patchErr) { - return - } - if patchErr != nil { - framework.Logf("Failed to remove label from node %s: %v", testNode, patchErr) - return - } - - framework.Logf("Cleanup: waiting for node %s to transition back to worker pool", testNode) - o.Eventually(func() bool { - currentNode, getErr := oc.AdminKubeClient().CoreV1().Nodes().Get(cleanupCtx, testNode, metav1.GetOptions{}) - if getErr != nil { - framework.Logf("Error getting node: %v", getErr) - return false - } - currentConfig := currentNode.Annotations["machineconfiguration.openshift.io/currentConfig"] - desiredConfig := currentNode.Annotations["machineconfiguration.openshift.io/desiredConfig"] - isWorkerConfig := currentConfig != "" && !strings.Contains(currentConfig, testMCPName) && currentConfig == desiredConfig - if isWorkerConfig { - framework.Logf("Node %s transitioned back to worker config: %s", testNode, currentConfig) - } else { - framework.Logf("Node %s still transitioning: current=%s, desired=%s", testNode, currentConfig, desiredConfig) - } - return isWorkerConfig - }, 15*time.Minute, 15*time.Second).Should(o.BeTrue(), - "Node %s should transition back to worker pool", testNode) - } - 
g.DeferCleanup(cleanupNodeLabel) - - framework.Logf("Waiting for custom MachineConfigPool %s to be ready", testMCPName) - err = waitForMCP(ctx, mcClient, testMCPName, 10*time.Minute) - o.Expect(err).NotTo(o.HaveOccurred(), "Custom MachineConfigPool %s did not become ready", testMCPName) - g.By(fmt.Sprintf("Creating KubeletConfig with Modern TLS profile (TLS 1.3) targeting pool %s", testMCPName)) kubeletConfig := &mcfgv1.KubeletConfig{ ObjectMeta: metav1.ObjectMeta{ @@ -179,50 +97,15 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv }, } - cleanupKubeletConfig := func() { - framework.Logf("Cleanup: deleting KubeletConfig %s", kubeletConfigName) + g.DeferCleanup(func() { cleanupCtx := context.Background() - deleteErr := mcClient.MachineconfigurationV1().KubeletConfigs().Delete(cleanupCtx, kubeletConfigName, metav1.DeleteOptions{}) - if apierrors.IsNotFound(deleteErr) { - return - } - o.Expect(deleteErr).NotTo(o.HaveOccurred(), "Cleanup: failed to delete KubeletConfig %s", kubeletConfigName) - - framework.Logf("Cleanup: waiting for MCP %s to become ready after KubeletConfig deletion", testMCPName) - waitErr := waitForMCP(cleanupCtx, mcClient, testMCPName, 15*time.Minute) - if apierrors.IsNotFound(waitErr) { - return - } - o.Expect(waitErr).NotTo(o.HaveOccurred(), - "Cleanup: MCP %s did not become ready after KubeletConfig deletion", testMCPName) - } - g.DeferCleanup(cleanupKubeletConfig) - - _, err = mcClient.MachineconfigurationV1().KubeletConfigs().Create(ctx, kubeletConfig, metav1.CreateOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "Error creating KubeletConfig with TLS 1.3") - - g.By(fmt.Sprintf("Waiting for MachineConfigPool %s to begin updating", testMCPName)) - err = wait.PollUntilContextTimeout(ctx, 15*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) { - mcp, getErr := mcClient.MachineconfigurationV1().MachineConfigPools().Get(ctx, testMCPName, metav1.GetOptions{}) - if getErr != nil { - 
framework.Logf("Error getting MCP %s: %v", testMCPName, getErr) - return false, nil - } - for _, condition := range mcp.Status.Conditions { - if condition.Type == "Updating" && condition.Status == corev1.ConditionTrue { - framework.Logf("MCP %s has started updating", testMCPName) - return true, nil - } - } - return false, nil + err := CleanupKubeletConfig(cleanupCtx, mcClient, kubeletConfigName, testMCPName) + o.Expect(err).NotTo(o.HaveOccurred(), "Cleanup: failed to delete KubeletConfig %s", kubeletConfigName) }) - o.Expect(err).NotTo(o.HaveOccurred(), - "Timed out waiting for MachineConfigPool %q to start updating", testMCPName) - g.By(fmt.Sprintf("Waiting for MachineConfigPool %s to complete rollout", testMCPName)) - err = waitForMCP(ctx, mcClient, testMCPName, 15*time.Minute) - o.Expect(err).NotTo(o.HaveOccurred(), "Error waiting for MachineConfigPool %q to become ready", testMCPName) - framework.Logf("MachineConfigPool %s has completed rollout", testMCPName) + g.By("Applying KubeletConfig and waiting for MCP rollout") + err = ApplyKubeletConfigAndWaitForMCP(ctx, mcClient, kubeletConfig, testMCPName, 15*time.Minute) + o.Expect(err).NotTo(o.HaveOccurred(), "Should apply KubeletConfig and complete MCP rollout") g.By(fmt.Sprintf("Verifying node %s is Ready and Done after rollout", testNode)) updatedNode, err := oc.AdminKubeClient().CoreV1().Nodes().Get(ctx, testNode, metav1.GetOptions{}) diff --git a/test/extended/node/node_e2e/container_runtime_config.go b/test/extended/node/node_e2e/container_runtime_config.go index ff799d4825d3..df81db7d9f8a 100644 --- a/test/extended/node/node_e2e/container_runtime_config.go +++ b/test/extended/node/node_e2e/container_runtime_config.go @@ -2,7 +2,6 @@ package node import ( "context" - "fmt" "strings" "time" @@ -10,11 +9,11 @@ import ( o "github.com/onsi/gomega" mcfgv1 "github.com/openshift/api/machineconfiguration/v1" + machineconfigclient "github.com/openshift/client-go/machineconfiguration/clientset/versioned" 
"github.com/openshift/origin/test/extended/imagepolicy" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" e2e "k8s.io/kubernetes/test/e2e/framework" "k8s.io/utils/ptr" @@ -63,7 +62,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(editedConf).To(o.ContainSubstring(`log_level = "debug"`), "sed edit did not apply: expected log_level = debug in crio.conf") - createSingleNodeMCP(ctx, oc, mcpName, workerNode) + mcpConfig := createSingleNodeMCP(ctx, oc, mcpName, workerNode) g.DeferCleanup(func() { g.By("Cleanup: delete ContainerRuntimeConfig") @@ -73,7 +72,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(delErr).NotTo(o.HaveOccurred(), "cleanup failed: could not delete ContainerRuntimeConfig %s", ctrcfgName) } - cleanupSingleNodeMCP(ctx, oc, mcpName, workerNode) + cleanupSingleNodeMCP(ctx, mcpConfig) }) initialSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, mcpName) @@ -128,7 +127,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(workers).NotTo(o.BeEmpty(), "No Ready worker nodes found") workerNode := workers[0].Name - createSingleNodeMCP(ctx, oc, mcpName, workerNode) + mcpConfig := createSingleNodeMCP(ctx, oc, mcpName, workerNode) g.DeferCleanup(func() { g.By("Cleanup: delete ContainerRuntimeConfig") @@ -138,7 +137,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(delErr).NotTo(o.HaveOccurred(), "cleanup failed: could not delete ContainerRuntimeConfig %s", ctrcfgName) } - cleanupSingleNodeMCP(ctx, oc, mcpName, workerNode) + cleanupSingleNodeMCP(ctx, mcpConfig) }) initialSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, mcpName) @@ -209,81 +208,27 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv }) // 
createSingleNodeMCP creates a custom MachineConfigPool that targets exactly one worker node. -// It labels the node to move it into the custom pool and waits until the pool reports 1 node. -func createSingleNodeMCP(ctx context.Context, oc *exutil.CLI, mcpName, workerNode string) { - nodeLabel := "node-role.kubernetes.io/" + mcpName - - g.By("Create a custom MachineConfigPool targeting a single worker node") - mcp := &mcfgv1.MachineConfigPool{ - ObjectMeta: metav1.ObjectMeta{ - Name: mcpName, - Labels: map[string]string{"machineconfiguration.openshift.io/pool": mcpName}, - }, - Spec: mcfgv1.MachineConfigPoolSpec{ - MachineConfigSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "machineconfiguration.openshift.io/role", - Operator: metav1.LabelSelectorOpIn, - Values: []string{"worker", mcpName}, - }, - }, - }, - NodeSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{nodeLabel: ""}, - }, - }, - } - _, err := oc.MachineConfigurationClient().MachineconfigurationV1().MachineConfigPools().Create(ctx, mcp, metav1.CreateOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "failed to create custom MachineConfigPool %s", mcpName) - - g.By("Label worker node to move it into the custom MCP") - patch := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:""}}}`, nodeLabel)) - _, err = oc.AdminKubeClient().CoreV1().Nodes().Patch(ctx, workerNode, types.MergePatchType, patch, metav1.PatchOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "failed to label node %s", workerNode) - - g.By("Wait for the custom MCP to report the node") - o.Eventually(func() int { - pool, getErr := oc.MachineConfigurationClient().MachineconfigurationV1().MachineConfigPools().Get(ctx, mcpName, metav1.GetOptions{}) - if getErr != nil { - return 0 - } - return int(pool.Status.MachineCount) - }, 2*time.Minute, 10*time.Second).Should(o.Equal(1), "custom MCP %s should have 1 node", mcpName) +// It uses the shared helper from node_mcp_helpers.go and returns 
the config for cleanup. +func createSingleNodeMCP(ctx context.Context, oc *exutil.CLI, mcpName, workerNode string) *nodeutils.CustomMCPConfig { + mcClient, err := machineconfigclient.NewForConfig(oc.KubeFramework().ClientConfig()) + o.Expect(err).NotTo(o.HaveOccurred(), "failed to create machine config client") + + mcpConfig, err := nodeutils.CreateCustomMCPForNode(ctx, oc, mcClient, mcpName, workerNode) + o.Expect(err).NotTo(o.HaveOccurred(), "failed to create custom MCP") + + return mcpConfig } // cleanupSingleNodeMCP removes the node label, waits for the node to transition back to the // worker pool config, and then deletes the custom MCP. -func cleanupSingleNodeMCP(ctx context.Context, oc *exutil.CLI, mcpName, workerNode string) { - nodeLabel := "node-role.kubernetes.io/" + mcpName - - g.By("Cleanup: remove node label to move node back to worker pool") - patch := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:null}}}`, nodeLabel)) - _, err := oc.AdminKubeClient().CoreV1().Nodes().Patch(ctx, workerNode, types.MergePatchType, patch, metav1.PatchOptions{}) - if err != nil && !apierrors.IsNotFound(err) { - e2e.Logf("WARNING: failed to remove label from node %s: %v", workerNode, err) +// It uses the shared helper from node_mcp_helpers.go. 
+func cleanupSingleNodeMCP(ctx context.Context, mcpConfig *nodeutils.CustomMCPConfig) { + if mcpConfig == nil { + return } - g.By("Cleanup: wait for node to transition back to worker config") - o.Eventually(func() bool { - node, getErr := oc.AdminKubeClient().CoreV1().Nodes().Get(ctx, workerNode, metav1.GetOptions{}) - if getErr != nil { - e2e.Logf("Error getting node: %v", getErr) - return false - } - currentConfig := node.Annotations["machineconfiguration.openshift.io/currentConfig"] - desiredConfig := node.Annotations["machineconfiguration.openshift.io/desiredConfig"] - isWorkerConfig := currentConfig != "" && !strings.Contains(currentConfig, mcpName) && currentConfig == desiredConfig - if !isWorkerConfig { - e2e.Logf("Node %s still transitioning: current=%s, desired=%s", workerNode, currentConfig, desiredConfig) - } - return isWorkerConfig - }, 15*time.Minute, 15*time.Second).Should(o.BeTrue(), - "node %s should transition back to worker pool config", workerNode) - - g.By("Cleanup: delete custom MachineConfigPool") - delErr := oc.MachineConfigurationClient().MachineconfigurationV1().MachineConfigPools().Delete(ctx, mcpName, metav1.DeleteOptions{}) - if !apierrors.IsNotFound(delErr) && delErr != nil { - e2e.Logf("WARNING: failed to delete MachineConfigPool %s: %v", mcpName, delErr) + err := nodeutils.CleanupCustomMCP(ctx, mcpConfig) + if err != nil { + e2e.Logf("WARNING: cleanup had errors: %v", err) } } diff --git a/test/extended/node/node_e2e/netns_cleanup.go b/test/extended/node/node_e2e/netns_cleanup.go index 9fd3438ce2fa..a55d8d925a54 100644 --- a/test/extended/node/node_e2e/netns_cleanup.go +++ b/test/extended/node/node_e2e/netns_cleanup.go @@ -70,12 +70,12 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Network namespace cleanup", f e2e.Logf("Pod is running on node: %s", nodeName) g.By("Get pod's network namespace path") - netNsPath, err := nodeutils.GetPodNetNs(oc, nodeName, podName) + netNsPath, err := nodeutils.GetPodNetNs(ctx, oc, nodeName, 
podName) o.Expect(err).NotTo(o.HaveOccurred(), "failed to get pod NetNS") e2e.Logf("Pod NetNS path: %s", netNsPath) g.By("Verify NetNS file exists before pod deletion") - _, err = nodeutils.ExecOnNodeWithChroot(oc, nodeName, "test", "-e", netNsPath) + _, err = nodeutils.ExecOnNodeWithChroot(ctx, oc, nodeName, "test", "-e", netNsPath) o.Expect(err).NotTo(o.HaveOccurred(), "NetNS file does not exist before pod deletion") g.By("Delete the pod") @@ -99,7 +99,7 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Network namespace cleanup", f o.Expect(err).NotTo(o.HaveOccurred(), "pod was not deleted") g.By("Verify that the NetNS file has been cleaned up on the node") - err = nodeutils.CheckNetNsCleaned(oc, nodeName, netNsPath) + err = nodeutils.CheckNetNsCleaned(ctx, oc, nodeName, netNsPath) o.Expect(err).NotTo(o.HaveOccurred(), "NetNS file was not cleaned up") }) }) diff --git a/test/extended/node/node_kc_helpers.go b/test/extended/node/node_kc_helpers.go new file mode 100644 index 000000000000..4aaa77b6c555 --- /dev/null +++ b/test/extended/node/node_kc_helpers.go @@ -0,0 +1,141 @@ +package node + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/kubernetes/test/e2e/framework" + + machineconfigv1 "github.com/openshift/api/machineconfiguration/v1" + machineconfigclient "github.com/openshift/client-go/machineconfiguration/clientset/versioned" +) + +// CreateKubeletConfig creates a KubeletConfig resource. +// Returns the created KubeletConfig for reference. 
+// +// Example usage: +// +// kc, err := CreateKubeletConfig(ctx, mcClient, kubeletConfig) +// o.Expect(err).NotTo(o.HaveOccurred()) +// defer CleanupKubeletConfig(context.Background(), mcClient, kc.Name, "") +func CreateKubeletConfig(ctx context.Context, mcClient *machineconfigclient.Clientset, kubeletConfig *machineconfigv1.KubeletConfig) (*machineconfigv1.KubeletConfig, error) { + framework.Logf("Creating KubeletConfig %s", kubeletConfig.Name) + created, err := mcClient.MachineconfigurationV1().KubeletConfigs().Create(ctx, kubeletConfig, metav1.CreateOptions{}) + if err != nil { + return nil, err + } + return created, nil +} + +// CleanupKubeletConfig deletes a KubeletConfig and optionally waits for the associated MCP to stabilize. +// This function is idempotent and safe to call multiple times. +// +// Parameters: +// - ctx: Context for the operation +// - mcClient: Machine config client +// - kcName: Name of the KubeletConfig to delete +// - mcpName: Optional name of the MachineConfigPool to wait for after deletion. +// If empty, no MCP wait is performed. 
+// +// Example usage: +// +// // Simple cleanup without waiting for MCP +// err := CleanupKubeletConfig(ctx, mcClient, "my-kc", "") +// +// // Cleanup and wait for MCP to stabilize +// err := CleanupKubeletConfig(ctx, mcClient, "my-kc", "worker") +func CleanupKubeletConfig(ctx context.Context, mcClient *machineconfigclient.Clientset, kcName, mcpName string) error { + framework.Logf("Cleaning up KubeletConfig %s", kcName) + + // Delete the KubeletConfig + deleteErr := mcClient.MachineconfigurationV1().KubeletConfigs().Delete(ctx, kcName, metav1.DeleteOptions{}) + if deleteErr != nil && !apierrors.IsNotFound(deleteErr) { + return deleteErr + } + + // If MCP name is provided, wait for it to stabilize + if mcpName != "" && (deleteErr == nil || apierrors.IsNotFound(deleteErr)) { + framework.Logf("Waiting for MCP %s to become ready after KubeletConfig deletion", mcpName) + waitErr := WaitForMCP(ctx, mcClient, mcpName, 15*time.Minute) + if waitErr != nil && !apierrors.IsNotFound(waitErr) { + return waitErr + } + } + + framework.Logf("KubeletConfig %s cleaned up successfully", kcName) + return nil +} + +// ApplyKubeletConfigAndWaitForMCP creates a KubeletConfig and waits for the specified MCP to complete rollout. +// This is a common pattern in tests that apply KubeletConfig changes. 
+// +// Parameters: +// - ctx: Context for the operation +// - mcClient: Machine config client +// - kubeletConfig: The KubeletConfig to create +// - mcpName: Name of the MachineConfigPool to wait for +// - rolloutTimeout: How long to wait for the full rollout (default: 15 minutes) +// +// Example usage: +// +// err := ApplyKubeletConfigAndWaitForMCP(ctx, mcClient, kubeletConfig, "worker", 15*time.Minute) +// o.Expect(err).NotTo(o.HaveOccurred()) +func ApplyKubeletConfigAndWaitForMCP(ctx context.Context, mcClient *machineconfigclient.Clientset, kubeletConfig *machineconfigv1.KubeletConfig, mcpName string, rolloutTimeout time.Duration) error { + // Create the KubeletConfig + _, err := CreateKubeletConfig(ctx, mcClient, kubeletConfig) + if err != nil { + return err + } + + // Wait for MCP to start updating + framework.Logf("Waiting for MCP %s to start updating", mcpName) + err = WaitForMCPUpdating(ctx, mcClient, mcpName, 5*time.Minute) + if err != nil { + return err + } + + // Wait for MCP to complete rollout + framework.Logf("Waiting for MCP %s to complete rollout", mcpName) + return WaitForMCP(ctx, mcClient, mcpName, rolloutTimeout) +} + +// WaitForMCPUpdating waits for a MachineConfigPool to enter the "Updating" state. +// This is useful to confirm that a configuration change has been picked up by MCO. +// +// Parameters: +// - ctx: Context for the operation +// - mcClient: Machine config client +// - mcpName: Name of the MachineConfigPool to watch +// - timeout: How long to wait for the update to start +// +// Returns error if timeout expires or MCP not found. 
+func WaitForMCPUpdating(ctx context.Context, mcClient *machineconfigclient.Clientset, mcpName string, timeout time.Duration) error { + startTime := time.Now() + for { + mcp, err := mcClient.MachineconfigurationV1().MachineConfigPools().Get(ctx, mcpName, metav1.GetOptions{}) + if err != nil { + if time.Since(startTime) > timeout { + return err + } + framework.Logf("Error getting MCP %s: %v, retrying...", mcpName, err) + time.Sleep(10 * time.Second) + continue + } + + for _, condition := range mcp.Status.Conditions { + if condition.Type == "Updating" && condition.Status == "True" { + framework.Logf("MCP %s has started updating", mcpName) + return nil + } + } + + if time.Since(startTime) > timeout { + return fmt.Errorf("timeout waiting for MCP %s to start updating", mcpName) + } + + time.Sleep(10 * time.Second) + } +} diff --git a/test/extended/node/node_mcp_helpers.go b/test/extended/node/node_mcp_helpers.go new file mode 100644 index 000000000000..73ad73bafa9e --- /dev/null +++ b/test/extended/node/node_mcp_helpers.go @@ -0,0 +1,207 @@ +package node + +import ( + "context" + "fmt" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/kubernetes/test/e2e/framework" + + machineconfigv1 "github.com/openshift/api/machineconfiguration/v1" + machineconfigclient "github.com/openshift/client-go/machineconfiguration/clientset/versioned" + exutil "github.com/openshift/origin/test/extended/util" +) + +// CustomMCPConfig holds the configuration for a custom MachineConfigPool setup. +// This is returned by CreateCustomMCPForNode and should be passed to CleanupCustomMCP. 
+type CustomMCPConfig struct { + // Name is the name of the custom MachineConfigPool + Name string + // NodeName is the name of the node labeled and added to the MCP + NodeName string + // MCClient is the machine config client for MCP operations + MCClient *machineconfigclient.Clientset + // KubeClient is the Kubernetes client for node operations + KubeClient *exutil.CLI +} + +// CreateCustomMCPForNode creates a custom MachineConfigPool and labels a node to join it. +// It returns a CustomMCPConfig that should be passed to CleanupCustomMCP for cleanup. +// This is useful for tests that need to apply custom KubeletConfigs to specific nodes +// without affecting the entire worker pool. +// +// The function performs the following steps: +// 1. Labels the specified node with "node-role.kubernetes.io/" +// 2. Creates a custom MachineConfigPool that targets nodes with that label +// 3. Waits for the MCP to become ready (up to 5 minutes) +// +// Example usage: +// +// mcClient, err := machineconfigclient.NewForConfig(oc.KubeFramework().ClientConfig()) +// o.Expect(err).NotTo(o.HaveOccurred()) +// +// mcpConfig, err := CreateCustomMCPForNode(ctx, oc, mcClient, "my-test-pool", nodeName) +// o.Expect(err).NotTo(o.HaveOccurred()) +// defer CleanupCustomMCP(context.Background(), mcpConfig) +// +// // Now you can create KubeletConfigs targeting this MCP +// // The node will apply configs without affecting other workers +func CreateCustomMCPForNode(ctx context.Context, oc *exutil.CLI, mcClient *machineconfigclient.Clientset, mcpName, nodeName string) (*CustomMCPConfig, error) { + config := &CustomMCPConfig{ + Name: mcpName, + NodeName: nodeName, + MCClient: mcClient, + KubeClient: oc, + } + + nodeLabel := fmt.Sprintf("node-role.kubernetes.io/%s", mcpName) + + // Step 1: Label the node + framework.Logf("Labeling node %s with %s", nodeName, nodeLabel) + patchData := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:""}}}`, nodeLabel)) + _, err := 
oc.AdminKubeClient().CoreV1().Nodes().Patch(ctx, nodeName, types.MergePatchType, patchData, metav1.PatchOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to label node %s: %w", nodeName, err) + } + + // Step 2: Create custom MachineConfigPool + framework.Logf("Creating custom MachineConfigPool %s", mcpName) + mcp := &machineconfigv1.MachineConfigPool{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "machineconfiguration.openshift.io/v1", + Kind: "MachineConfigPool", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: mcpName, + Labels: map[string]string{ + "machineconfiguration.openshift.io/pool": mcpName, + }, + }, + Spec: machineconfigv1.MachineConfigPoolSpec{ + MachineConfigSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "machineconfiguration.openshift.io/role", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"worker", mcpName}, + }, + }, + }, + NodeSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + nodeLabel: "", + }, + }, + }, + } + + _, err = mcClient.MachineconfigurationV1().MachineConfigPools().Create(ctx, mcp, metav1.CreateOptions{}) + if err != nil { + // Cleanup the node label if MCP creation fails + framework.Logf("MCP creation failed, removing node label") + unlabelPatchData := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:null}}}`, nodeLabel)) + _, _ = oc.AdminKubeClient().CoreV1().Nodes().Patch(ctx, nodeName, types.MergePatchType, unlabelPatchData, metav1.PatchOptions{}) + return nil, fmt.Errorf("failed to create MachineConfigPool %s: %w", mcpName, err) + } + + // Step 3: Wait for MCP to become ready + framework.Logf("Waiting for custom MachineConfigPool %s to be ready", mcpName) + err = WaitForMCP(ctx, mcClient, mcpName, 5*time.Minute) + if err != nil { + return config, fmt.Errorf("MachineConfigPool %s did not become ready: %w", mcpName, err) + } + + framework.Logf("Custom MachineConfigPool %s created successfully", mcpName) + return config, nil +}
+
+// CleanupCustomMCP removes the custom MachineConfigPool and unlabels the node,
+// returning it to the worker pool. It should be called in a defer or cleanup
+// handler after CreateCustomMCPForNode.
+//
+// The function performs the following steps:
+//  1. Removes the custom node label from the node
+//  2. Waits for the node to transition back to the worker pool (up to 7 minutes)
+//  3. Deletes the custom MachineConfigPool
+//  4. Waits for the worker MCP to stabilize (up to 10 minutes)
+//
+// Cleanup is best-effort: a failing step is recorded and the later steps still
+// run, except that each wait is skipped when the action it follows failed. All
+// recorded failures are reported together in the returned error.
+//
+// A nil config is a no-op. NotFound responses for the node or the pool are
+// treated as "already cleaned up", so the function can be called more than
+// once (for example explicitly at the end of a test and again from a deferred
+// cleanup).
+//
+// Example usage:
+//
+//	mcpConfig, err := CreateCustomMCPForNode(ctx, oc, mcClient, "my-test-pool", nodeName)
+//	o.Expect(err).NotTo(o.HaveOccurred())
+//	defer func() {
+//		err := CleanupCustomMCP(context.Background(), mcpConfig)
+//		if err != nil {
+//			framework.Logf("Warning: cleanup had errors: %v", err)
+//		}
+//	}()
+func CleanupCustomMCP(ctx context.Context, config *CustomMCPConfig) error {
+	if config == nil {
+		return nil
+	}
+
+	nodeLabel := fmt.Sprintf("node-role.kubernetes.io/%s", config.Name)
+	var cleanupErrors []error
+
+	// Step 1: Remove node label. A merge patch with a null value deletes the key.
+	framework.Logf("Removing node label %s from node %s", nodeLabel, config.NodeName)
+	patchData := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:null}}}`, nodeLabel))
+	_, err := config.KubeClient.AdminKubeClient().CoreV1().Nodes().Patch(ctx, config.NodeName, types.MergePatchType, patchData, metav1.PatchOptions{})
+	if err != nil && !apierrors.IsNotFound(err) {
+		cleanupErrors = append(cleanupErrors, fmt.Errorf("failed to remove label from node %s: %w", config.NodeName, err))
+	}
+
+	// Step 2: Wait for node to transition back to worker pool.
+	// Skipped when the label could not be removed: the node would never leave
+	// the custom pool and the wait would only burn the full timeout.
+	if err == nil || apierrors.IsNotFound(err) {
+		framework.Logf("Waiting for node %s to transition back to worker pool", config.NodeName)
+		transitionErr := wait.PollUntilContextTimeout(ctx, 10*time.Second, 7*time.Minute, true, func(ctx context.Context) (bool, error) {
+			node, getErr := config.KubeClient.AdminKubeClient().CoreV1().Nodes().Get(ctx, config.NodeName, metav1.GetOptions{})
+			if apierrors.IsNotFound(getErr) {
+				// Node was deleted, consider it transitioned
+				return true, nil
+			}
+			if getErr != nil {
+				// Keep polling on other API errors, but log them so a
+				// timeout can be told apart from a persistent API failure.
+				framework.Logf("Error getting node %s, will retry: %v", config.NodeName, getErr)
+				return false, nil
+			}
+			currentConfig := node.Annotations["machineconfiguration.openshift.io/currentConfig"]
+			desiredConfig := node.Annotations["machineconfiguration.openshift.io/desiredConfig"]
+			// Done once the node has settled (current == desired) on a
+			// config whose name does not mention the custom pool.
+			isWorkerConfig := currentConfig != "" && !strings.Contains(currentConfig, config.Name) && currentConfig == desiredConfig
+			if !isWorkerConfig {
+				framework.Logf("Node %s still transitioning: current=%s, desired=%s", config.NodeName, currentConfig, desiredConfig)
+			}
+			return isWorkerConfig, nil
+		})
+		if transitionErr != nil {
+			cleanupErrors = append(cleanupErrors, fmt.Errorf("node %s did not transition back to worker pool: %w", config.NodeName, transitionErr))
+		}
+	}
+
+	// Step 3: Delete the custom MachineConfigPool
+	framework.Logf("Deleting custom MachineConfigPool %s", config.Name)
+	deleteErr := config.MCClient.MachineconfigurationV1().MachineConfigPools().Delete(ctx, config.Name, metav1.DeleteOptions{})
+	if deleteErr != nil && !apierrors.IsNotFound(deleteErr) {
+		cleanupErrors = append(cleanupErrors, fmt.Errorf("failed to delete MachineConfigPool %s: %w", config.Name, deleteErr))
+	}
+
+	// Step 4: Wait for worker MCP to stabilize
+	if deleteErr == nil || apierrors.IsNotFound(deleteErr) {
+		framework.Logf("Waiting for worker MCP to stabilize after custom MCP deletion")
+		waitErr := WaitForMCP(ctx, config.MCClient, "worker", 10*time.Minute)
+		if waitErr != nil && !apierrors.IsNotFound(waitErr) {
+			cleanupErrors = append(cleanupErrors, fmt.Errorf("worker MCP did not stabilize: %w", waitErr))
+		}
+	}
+
+	if len(cleanupErrors) > 0 {
+		return fmt.Errorf("cleanup completed with errors: %v", cleanupErrors)
+	}
+
+	framework.Logf("Custom MachineConfigPool %s cleaned up successfully", config.Name)
+	return nil
+}
diff --git a/test/extended/node/node_sizing.go b/test/extended/node/node_sizing.go index
40d67ac46c74..27a0b3baa4e6 100644 --- a/test/extended/node/node_sizing.go +++ b/test/extended/node/node_sizing.go @@ -8,10 +8,7 @@ import ( g "github.com/onsi/ginkgo/v2" o "github.com/onsi/gomega" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" "k8s.io/kubernetes/test/e2e/framework" mcfgv1 "github.com/openshift/api/machineconfiguration/v1" @@ -41,7 +38,6 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(err).NotTo(o.HaveOccurred(), "Error creating MCO client") testMCPName := "node-sizing-test" - testNodeMCPLabel := fmt.Sprintf("node-role.kubernetes.io/%s", testMCPName) kubeletConfigName := "auto-sizing-enabled" // Verify the default state (NODE_SIZING_ENABLED=false) @@ -58,104 +54,19 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv nodeName := nodes.Items[0].Name framework.Logf("Testing on node: %s", nodeName) - // Define cleanup function for node label before applying the label - cleanupNodeLabel := func() { - g.By(fmt.Sprintf("Removing node label %s from node %s", testNodeMCPLabel, nodeName)) - cleanupCtx := context.Background() - // Use JSON patch to remove the label atomically - patchData := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:null}}}`, testNodeMCPLabel)) - _, updateErr := oc.AdminKubeClient().CoreV1().Nodes().Patch(cleanupCtx, nodeName, types.MergePatchType, patchData, metav1.PatchOptions{}) - if apierrors.IsNotFound(updateErr) { - // Node already deleted, nothing to clean up - } else if updateErr != nil { - framework.Logf("Failed to remove label from node %s: %v", nodeName, updateErr) - return - } - - // Wait for the node to transition back to the worker pool configuration - g.By(fmt.Sprintf("Waiting for node %s to transition back to worker pool", nodeName)) - o.Eventually(func() bool { - currentNode, err := oc.AdminKubeClient().CoreV1().Nodes().Get(cleanupCtx, 
nodeName, metav1.GetOptions{}) - if err != nil { - framework.Logf("Error getting node: %v", err) - return false - } - currentConfig := currentNode.Annotations["machineconfiguration.openshift.io/currentConfig"] - desiredConfig := currentNode.Annotations["machineconfiguration.openshift.io/desiredConfig"] - - // Check if the node is using a worker config (not node-sizing-test config) - isWorkerConfig := currentConfig != "" && !strings.Contains(currentConfig, testMCPName) && currentConfig == desiredConfig - if isWorkerConfig { - framework.Logf("Node %s successfully transitioned to worker config: %s", nodeName, currentConfig) - } else { - framework.Logf("Node %s still transitioning: current=%s, desired=%s", nodeName, currentConfig, desiredConfig) - } - return isWorkerConfig - }, 7*time.Minute, 10*time.Second).Should(o.BeTrue(), fmt.Sprintf("Node %s should transition back to worker pool", nodeName)) - } - - g.By(fmt.Sprintf("Labeling node %s with %s", nodeName, testNodeMCPLabel)) - patchData := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:""}}}`, testNodeMCPLabel)) - _, err = oc.AdminKubeClient().CoreV1().Nodes().Patch(ctx, nodeName, types.MergePatchType, patchData, metav1.PatchOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to label node") - - // Register cleanup immediately after successful label application - g.DeferCleanup(cleanupNodeLabel) - - // Create custom MCP - g.By(fmt.Sprintf("Creating custom MachineConfigPool %s", testMCPName)) - testMCP := &mcfgv1.MachineConfigPool{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "machineconfiguration.openshift.io/v1", - Kind: "MachineConfigPool", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: testMCPName, - Labels: map[string]string{ - "machineconfiguration.openshift.io/pool": testMCPName, - }, - }, - Spec: mcfgv1.MachineConfigPoolSpec{ - MachineConfigSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "machineconfiguration.openshift.io/role", - Operator: 
metav1.LabelSelectorOpIn, - Values: []string{"worker", testMCPName}, - }, - }, - }, - NodeSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - testNodeMCPLabel: "", - }, - }, - }, - } - - _, err = mcClient.MachineconfigurationV1().MachineConfigPools().Create(ctx, testMCP, metav1.CreateOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to create custom MachineConfigPool") + // Create custom MCP for the node + mcpConfig, err := CreateCustomMCPForNode(ctx, oc, mcClient, testMCPName, nodeName) + o.Expect(err).NotTo(o.HaveOccurred(), "Should create custom MCP") cleanupMCP := func() { - g.By("Cleaning up custom MachineConfigPool") cleanupCtx := context.Background() - deleteErr := mcClient.MachineconfigurationV1().MachineConfigPools().Delete(cleanupCtx, testMCPName, metav1.DeleteOptions{}) - if apierrors.IsNotFound(deleteErr) { - // MachineConfigPool already deleted, nothing to clean up - } else if deleteErr != nil { - framework.Logf("Failed to delete MachineConfigPool %s: %v", testMCPName, deleteErr) + err := CleanupCustomMCP(cleanupCtx, mcpConfig) + if err != nil { + framework.Logf("Warning: cleanup had errors: %v", err) } } - - // Register DeferCleanup so cleanup happens even on test failure - // DeferCleanup runs in LIFO order: MCP deleted last (registered first) - // Note: cleanupNodeLabel already registered immediately after node labeling g.DeferCleanup(cleanupMCP) - g.By("Waiting for custom MachineConfigPool to be ready") - err = waitForMCP(ctx, mcClient, testMCPName, 5*time.Minute) - o.Expect(err).NotTo(o.HaveOccurred(), "Custom MachineConfigPool should become ready") - verifyNodeSizingEnabledFile(ctx, oc, nodeName, "true") // Now apply KubeletConfig and verify NODE_SIZING_ENABLED=false @@ -180,66 +91,28 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv }, } - _, err = mcClient.MachineconfigurationV1().KubeletConfigs().Create(ctx, kubeletConfig, metav1.CreateOptions{}) - 
o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to create KubeletConfig") - - cleanupKubeletConfig := func() { - g.By("Cleaning up KubeletConfig") + g.DeferCleanup(func() { cleanupCtx := context.Background() - deleteErr := mcClient.MachineconfigurationV1().KubeletConfigs().Delete(cleanupCtx, kubeletConfigName, metav1.DeleteOptions{}) - if apierrors.IsNotFound(deleteErr) { - // KubeletConfig already deleted, nothing to clean up - } else if deleteErr != nil { - framework.Logf("Failed to delete KubeletConfig %s: %v", kubeletConfigName, deleteErr) + if err := CleanupKubeletConfig(cleanupCtx, mcClient, kubeletConfigName, testMCPName); err != nil { + framework.Logf("Warning: KubeletConfig cleanup failed: %v", err) } + }) - // Wait for custom MCP to be ready after cleanup - g.By("Waiting for custom MCP to be ready after KubeletConfig deletion") - waitErr := waitForMCP(cleanupCtx, mcClient, testMCPName, 5*time.Minute) - if apierrors.IsNotFound(waitErr) { - // MachineConfigPool already deleted, nothing to wait for - } else if waitErr != nil { - framework.Logf("Failed to wait for custom MCP to be ready: %v", waitErr) - } - } - g.DeferCleanup(cleanupKubeletConfig) - - g.By("Waiting for KubeletConfig to be created") - var createdKC *mcfgv1.KubeletConfig - o.Eventually(func() error { - createdKC, err = mcClient.MachineconfigurationV1().KubeletConfigs().Get(ctx, kubeletConfigName, metav1.GetOptions{}) - return err - }, 30*time.Second, 5*time.Second).Should(o.Succeed(), "KubeletConfig should be created") + g.By("Applying KubeletConfig and waiting for MCP rollout") + err = ApplyKubeletConfigAndWaitForMCP(ctx, mcClient, kubeletConfig, testMCPName, 15*time.Minute) + o.Expect(err).NotTo(o.HaveOccurred(), "Should apply KubeletConfig and complete MCP rollout") + // Verify KubeletConfig was created with correct spec + createdKC, err := mcClient.MachineconfigurationV1().KubeletConfigs().Get(ctx, kubeletConfigName, metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred(), "Should 
be able to get KubeletConfig") o.Expect(createdKC.Spec.AutoSizingReserved).NotTo(o.BeNil(), "AutoSizingReserved should not be nil") o.Expect(*createdKC.Spec.AutoSizingReserved).To(o.BeFalse(), "AutoSizingReserved should be false") - g.By(fmt.Sprintf("Waiting for %s MCP to start updating", testMCPName)) - o.Eventually(func() bool { - mcp, err := mcClient.MachineconfigurationV1().MachineConfigPools().Get(ctx, testMCPName, metav1.GetOptions{}) - if err != nil { - framework.Logf("Error getting %s MCP: %v", testMCPName, err) - return false - } - // Check if MCP is updating (has conditions indicating update in progress) - for _, condition := range mcp.Status.Conditions { - if condition.Type == "Updating" && condition.Status == corev1.ConditionTrue { - return true - } - } - return false - }, 2*time.Minute, 10*time.Second).Should(o.BeTrue(), fmt.Sprintf("%s MCP should start updating", testMCPName)) - - g.By(fmt.Sprintf("Waiting for %s MCP to be ready with new configuration", testMCPName)) - err = waitForMCP(ctx, mcClient, testMCPName, 15*time.Minute) - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("%s MCP should become ready with new configuration", testMCPName)) - verifyNodeSizingEnabledFile(ctx, oc, nodeName, "false") // Explicit cleanup on success; DeferCleanup ensures cleanup also runs on failure - cleanupKubeletConfig() - cleanupNodeLabel() - cleanupMCP() + CleanupKubeletConfig(ctx, mcClient, kubeletConfigName, testMCPName) + CleanupCustomMCP(ctx, mcpConfig) }) }) diff --git a/test/extended/node/node_swap.go b/test/extended/node/node_swap.go index 80f194c9ea11..73aad3f91370 100644 --- a/test/extended/node/node_swap.go +++ b/test/extended/node/node_swap.go @@ -11,7 +11,6 @@ import ( ote "github.com/openshift-eng/openshift-tests-extension/pkg/ginkgo" corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/wait" @@ -140,12 +139,12 @@ var 
_ = g.Describe("[Jira:Node][sig-node] Node non-cnv swap configuration", func g.By("Attempting to apply the KubeletConfig") defer func() { - if err := mcClient.MachineconfigurationV1().KubeletConfigs().Delete(ctx, kcName, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) { + if err := CleanupKubeletConfig(ctx, mcClient, kcName, ""); err != nil { framework.Logf("cleanup failed for KubeletConfig %s: %v", kcName, err) } }() framework.Logf("Creating KubeletConfig with failSwapOn=true and swapBehavior=LimitedSwap") - _, err = mcClient.MachineconfigurationV1().KubeletConfigs().Create(ctx, kubeletConfig, metav1.CreateOptions{}) + _, err = CreateKubeletConfig(ctx, mcClient, kubeletConfig) o.Expect(err).NotTo(o.HaveOccurred(), "Failed to create KubeletConfig") g.By("Checking KubeletConfig status for expected error message") diff --git a/test/extended/node/node_utils.go b/test/extended/node/node_utils.go index cd9b91d1e8cc..0c131e09ca34 100644 --- a/test/extended/node/node_utils.go +++ b/test/extended/node/node_utils.go @@ -478,7 +478,7 @@ func installCNVOperator(ctx context.Context, oc *exutil.CLI) error { return fmt.Errorf("failed to create MC client for MCP check: %w", err) } - err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) + err = WaitForMCP(ctx, mcClient, "worker", 15*time.Minute) if err != nil { return fmt.Errorf("MCP rollout failed after CNV installation: %w", err) } @@ -552,9 +552,9 @@ func waitForHyperConvergedReady(ctx context.Context, oc *exutil.CLI) error { }) } -// waitForMCP waits for a MachineConfigPool to be ready (not updating, updated, and all machines ready) +// WaitForMCP waits for a MachineConfigPool to be ready (not updating, updated, and all machines ready) // Returns error immediately if the MCP becomes degraded -func waitForMCP(ctx context.Context, mcClient *machineconfigclient.Clientset, poolName string, timeout time.Duration) error { +func WaitForMCP(ctx context.Context, mcClient *machineconfigclient.Clientset, poolName 
string, timeout time.Duration) error { framework.Logf("Waiting for MCP %s to be ready (timeout: %v)...", poolName, timeout) return wait.PollUntilContextTimeout(ctx, 10*time.Second, timeout, true, func(ctx context.Context) (bool, error) { @@ -751,7 +751,7 @@ func uninstallCNVOperator(ctx context.Context, oc *exutil.CLI) error { if err != nil { framework.Logf("Warning: failed to create MC client for MCP check: %v", err) } else { - err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) + err = WaitForMCP(ctx, mcClient, "worker", 15*time.Minute) if err != nil { framework.Logf("Warning: MCP rollout check failed: %v", err) } diff --git a/test/extended/node/system_compressible.go b/test/extended/node/system_compressible.go index 1358ec46bb5a..dca8b403b6f0 100644 --- a/test/extended/node/system_compressible.go +++ b/test/extended/node/system_compressible.go @@ -10,12 +10,12 @@ import ( g "github.com/onsi/ginkgo/v2" o "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" kubeletconfigv1beta1 "k8s.io/kubelet/config/v1beta1" "k8s.io/kubernetes/test/e2e/framework" + "sigs.k8s.io/yaml" mcfgv1 "github.com/openshift/api/machineconfiguration/v1" machineconfigclient "github.com/openshift/client-go/machineconfiguration/clientset/versioned" @@ -57,24 +57,22 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(isSystemCompressibleEnabled(config)).To(o.BeTrue(), "System compressible should be enabled by default") - // Read SYSTEM_RESERVED_CPU from /etc/node-sizing.env - g.By("Reading SYSTEM_RESERVED_CPU from /etc/node-sizing.env") - nodeSizingOutput, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "cat", "/etc/node-sizing.env") - o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to read /etc/node-sizing.env") - 
framework.Logf("/etc/node-sizing.env contents:\n%s", nodeSizingOutput) - - // Parse SYSTEM_RESERVED_CPU value (e.g., "0.5" means 500m) - var systemReservedCPU float64 - for _, line := range strings.Split(nodeSizingOutput, "\n") { - if strings.HasPrefix(line, "SYSTEM_RESERVED_CPU=") { - cpuStr := strings.TrimPrefix(line, "SYSTEM_RESERVED_CPU=") - systemReservedCPU, err = strconv.ParseFloat(cpuStr, 64) - o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to parse SYSTEM_RESERVED_CPU value: %s", cpuStr) - break - } - } - o.Expect(systemReservedCPU).To(o.BeNumerically(">", 0), "SYSTEM_RESERVED_CPU should be set") - framework.Logf("SYSTEM_RESERVED_CPU: %.2f (%.0f millicores)", systemReservedCPU, systemReservedCPU*1000) + g.By("Reading systemReserved.cpu from /etc/openshift/kubelet.conf.d/20-auto-sizing.conf") + autoSizingOutput, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "cat", "/etc/openshift/kubelet.conf.d/20-auto-sizing.conf") + o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to read /etc/openshift/kubelet.conf.d/20-auto-sizing.conf") + framework.Logf("/etc/openshift/kubelet.conf.d/20-auto-sizing.conf contents:\n%s", autoSizingOutput) + + var autoSizingConfig kubeletconfigv1beta1.KubeletConfiguration + err = yaml.Unmarshal([]byte(autoSizingOutput), &autoSizingConfig) + o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to parse auto-sizing config") + + cpuQuantity, ok := autoSizingConfig.SystemReserved["cpu"] + o.Expect(ok).To(o.BeTrue(), "systemReserved.cpu should be set") + cpuResource, err := resource.ParseQuantity(cpuQuantity) + o.Expect(err).NotTo(o.HaveOccurred(), "systemReserved.cpu must be a valid resource quantity") + systemReservedCPU := float64(cpuResource.MilliValue()) / 1000.0 + o.Expect(systemReservedCPU).To(o.BeNumerically(">", 0), "systemReserved.cpu should be greater than 0") + framework.Logf("systemReserved.cpu: %.2f (%.0f millicores)", systemReservedCPU, systemReservedCPU*1000) // Convert to cpuShares: cpuShares = systemReservedCPU 
* 1024 cpuShares := uint64(systemReservedCPU * 1024) @@ -99,7 +97,6 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(err).NotTo(o.HaveOccurred(), "Error creating MCO client") testMCPName := "system-compressible-test" - testNodeMCPLabel := fmt.Sprintf("node-role.kubernetes.io/%s", testMCPName) kubeletConfigName := "system-compressible-override" // Select node @@ -107,111 +104,23 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(err).NotTo(o.HaveOccurred(), "Should find a node with at least 4 CPUs") framework.Logf("Testing on node: %s with %d CPUs", nodeName, cpuCount) - // Setup cleanup functions - cleanupNodeLabel := func() { - g.By(fmt.Sprintf("Removing node label %s from node %s", testNodeMCPLabel, nodeName)) - cleanupCtx := context.Background() - patchData := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:null}}}`, testNodeMCPLabel)) - _, updateErr := oc.AdminKubeClient().CoreV1().Nodes().Patch(cleanupCtx, nodeName, types.MergePatchType, patchData, metav1.PatchOptions{}) - if apierrors.IsNotFound(updateErr) { - // Node already deleted, nothing to clean up - return - } else if updateErr != nil { - framework.Failf("Failed to remove label from node %s: %v", nodeName, updateErr) - } - - g.By(fmt.Sprintf("Waiting for node %s to transition back to worker pool", nodeName)) - o.Eventually(func() bool { - currentNode, err := oc.AdminKubeClient().CoreV1().Nodes().Get(cleanupCtx, nodeName, metav1.GetOptions{}) - if err != nil { - return false - } - currentConfig := currentNode.Annotations["machineconfiguration.openshift.io/currentConfig"] - desiredConfig := currentNode.Annotations["machineconfiguration.openshift.io/desiredConfig"] - isWorkerConfig := currentConfig != "" && !strings.Contains(currentConfig, testMCPName) && currentConfig == desiredConfig - return isWorkerConfig - }, 7*time.Minute, 10*time.Second).Should(o.BeTrue()) - } + // Create custom MCP for the node + mcpConfig, err := 
CreateCustomMCPForNode(ctx, oc, mcClient, testMCPName, nodeName) + o.Expect(err).NotTo(o.HaveOccurred(), "Should create custom MCP") - cleanupKubeletConfig := func() { - g.By("Cleaning up KubeletConfig") + // Register cleanups in LIFO order + g.DeferCleanup(func() { cleanupCtx := context.Background() - deleteErr := mcClient.MachineconfigurationV1().KubeletConfigs().Delete(cleanupCtx, kubeletConfigName, metav1.DeleteOptions{}) - if apierrors.IsNotFound(deleteErr) { - // KubeletConfig already deleted, nothing to clean up - } else if deleteErr != nil { - framework.Failf("Failed to delete KubeletConfig %s: %v", kubeletConfigName, deleteErr) + if err := CleanupCustomMCP(cleanupCtx, mcpConfig); err != nil { + framework.Logf("Warning: MCP cleanup had errors: %v", err) } - } - - cleanupMCP := func() { - g.By("Cleaning up custom MachineConfigPool") + }) + g.DeferCleanup(func() { cleanupCtx := context.Background() - deleteErr := mcClient.MachineconfigurationV1().MachineConfigPools().Delete(cleanupCtx, testMCPName, metav1.DeleteOptions{}) - if apierrors.IsNotFound(deleteErr) { - // MachineConfigPool already deleted, nothing to clean up - } else if deleteErr != nil { - framework.Failf("Failed to delete MachineConfigPool %s: %v", testMCPName, deleteErr) - } - - // Wait for worker MCP to stabilize after custom MCP deletion - g.By("Waiting for worker MCP to stabilize after custom MCP deletion") - waitErr := waitForMCP(cleanupCtx, mcClient, "worker", 10*time.Minute) - if apierrors.IsNotFound(waitErr) { - // MachineConfigPool already deleted, nothing to wait for - } else if waitErr != nil { - framework.Failf("Worker MCP did not stabilize after custom MCP deletion: %v", waitErr) + if err := CleanupKubeletConfig(cleanupCtx, mcClient, kubeletConfigName, ""); err != nil { + framework.Logf("Warning: KubeletConfig cleanup failed: %v", err) } - } - - // Register cleanups in LIFO order - g.DeferCleanup(cleanupMCP) - g.DeferCleanup(cleanupKubeletConfig) - g.DeferCleanup(cleanupNodeLabel) - 
- // Label node - g.By(fmt.Sprintf("Labeling node %s with %s", nodeName, testNodeMCPLabel)) - patchData := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:""}}}`, testNodeMCPLabel)) - _, err = oc.AdminKubeClient().CoreV1().Nodes().Patch(ctx, nodeName, types.MergePatchType, patchData, metav1.PatchOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to label node") - - // Create custom MCP - g.By(fmt.Sprintf("Creating custom MachineConfigPool %s", testMCPName)) - testMCP := &mcfgv1.MachineConfigPool{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "machineconfiguration.openshift.io/v1", - Kind: "MachineConfigPool", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: testMCPName, - Labels: map[string]string{ - "machineconfiguration.openshift.io/pool": testMCPName, - }, - }, - Spec: mcfgv1.MachineConfigPoolSpec{ - MachineConfigSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "machineconfiguration.openshift.io/role", - Operator: metav1.LabelSelectorOpIn, - Values: []string{"worker", testMCPName}, - }, - }, - }, - NodeSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - testNodeMCPLabel: "", - }, - }, - }, - } - _, err = mcClient.MachineconfigurationV1().MachineConfigPools().Create(ctx, testMCP, metav1.CreateOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "Should create custom MachineConfigPool") - - // Wait for MCP ready - g.By("Waiting for custom MachineConfigPool to be ready") - err = waitForMCP(ctx, mcClient, testMCPName, 5*time.Minute) - o.Expect(err).NotTo(o.HaveOccurred(), "MCP should be ready") + }) // Create KubeletConfig to disable system compressible g.By("Creating KubeletConfig to disable system compressible") @@ -235,29 +144,9 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv }, } - _, err = mcClient.MachineconfigurationV1().KubeletConfigs().Create(ctx, kubeletConfig, metav1.CreateOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "Should create 
KubeletConfig") - - // Wait for MCP to start updating - g.By(fmt.Sprintf("Waiting for %s MCP to start updating", testMCPName)) - o.Eventually(func() bool { - mcp, err := mcClient.MachineconfigurationV1().MachineConfigPools().Get(ctx, testMCPName, metav1.GetOptions{}) - if err != nil { - framework.Logf("Error getting %s MCP: %v", testMCPName, err) - return false - } - for _, condition := range mcp.Status.Conditions { - if condition.Type == "Updating" && condition.Status == corev1.ConditionTrue { - return true - } - } - return false - }, 2*time.Minute, 10*time.Second).Should(o.BeTrue(), fmt.Sprintf("%s MCP should start updating", testMCPName)) - - // Wait for MCP to apply configuration - g.By("Waiting for MCP to update with new configuration") - err = waitForMCP(ctx, mcClient, testMCPName, 15*time.Minute) - o.Expect(err).NotTo(o.HaveOccurred(), "MCP should update successfully") + g.By("Applying KubeletConfig and waiting for MCP rollout") + err = ApplyKubeletConfigAndWaitForMCP(ctx, mcClient, kubeletConfig, testMCPName, 15*time.Minute) + o.Expect(err).NotTo(o.HaveOccurred(), "Should apply KubeletConfig and complete MCP rollout") // Verify system compressible is disabled config, err := getKubeletConfigFromNode(ctx, oc, nodeName) @@ -282,9 +171,8 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv framework.Logf("System compressible override verified successfully: cpu.weight is default value") // Cleanup explicitly before DeferCleanup - cleanupKubeletConfig() - cleanupNodeLabel() - cleanupMCP() + CleanupKubeletConfig(ctx, mcClient, kubeletConfigName, "") + CleanupCustomMCP(ctx, mcpConfig) }) g.It("should not enable system compressible when reserved CPU is configured", func(ctx context.Context) { @@ -292,7 +180,6 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(err).NotTo(o.HaveOccurred(), "Error creating MCO client") testMCPName := "reserved-cpu-test" - testNodeMCPLabel := 
fmt.Sprintf("node-role.kubernetes.io/%s", testMCPName) kubeletConfigName := "reserved-cpu-config" // Select node @@ -300,111 +187,23 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(err).NotTo(o.HaveOccurred(), "Should find a node with at least 4 CPUs") framework.Logf("Testing on node: %s with %d CPUs", nodeName, cpuCount) - // Setup cleanup functions - cleanupNodeLabel := func() { - g.By(fmt.Sprintf("Removing node label %s from node %s", testNodeMCPLabel, nodeName)) - cleanupCtx := context.Background() - patchData := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:null}}}`, testNodeMCPLabel)) - _, updateErr := oc.AdminKubeClient().CoreV1().Nodes().Patch(cleanupCtx, nodeName, types.MergePatchType, patchData, metav1.PatchOptions{}) - if apierrors.IsNotFound(updateErr) { - // Node already deleted, nothing to clean up - return - } else if updateErr != nil { - framework.Failf("Failed to remove label from node %s: %v", nodeName, updateErr) - } - - g.By(fmt.Sprintf("Waiting for node %s to transition back to worker pool", nodeName)) - o.Eventually(func() bool { - currentNode, err := oc.AdminKubeClient().CoreV1().Nodes().Get(cleanupCtx, nodeName, metav1.GetOptions{}) - if err != nil { - return false - } - currentConfig := currentNode.Annotations["machineconfiguration.openshift.io/currentConfig"] - desiredConfig := currentNode.Annotations["machineconfiguration.openshift.io/desiredConfig"] - isWorkerConfig := currentConfig != "" && !strings.Contains(currentConfig, testMCPName) && currentConfig == desiredConfig - return isWorkerConfig - }, 7*time.Minute, 10*time.Second).Should(o.BeTrue()) - } + // Create custom MCP for the node + mcpConfig, err := CreateCustomMCPForNode(ctx, oc, mcClient, testMCPName, nodeName) + o.Expect(err).NotTo(o.HaveOccurred(), "Should create custom MCP") - cleanupKubeletConfig := func() { - g.By("Cleaning up KubeletConfig") + // Register cleanups in LIFO order + g.DeferCleanup(func() { cleanupCtx := 
context.Background() - deleteErr := mcClient.MachineconfigurationV1().KubeletConfigs().Delete(cleanupCtx, kubeletConfigName, metav1.DeleteOptions{}) - if apierrors.IsNotFound(deleteErr) { - // KubeletConfig already deleted, nothing to clean up - } else if deleteErr != nil { - framework.Failf("Failed to delete KubeletConfig %s: %v", kubeletConfigName, deleteErr) + if err := CleanupCustomMCP(cleanupCtx, mcpConfig); err != nil { + framework.Logf("Warning: MCP cleanup had errors: %v", err) } - } - - cleanupMCP := func() { - g.By("Cleaning up custom MachineConfigPool") + }) + g.DeferCleanup(func() { cleanupCtx := context.Background() - deleteErr := mcClient.MachineconfigurationV1().MachineConfigPools().Delete(cleanupCtx, testMCPName, metav1.DeleteOptions{}) - if apierrors.IsNotFound(deleteErr) { - // MachineConfigPool already deleted, nothing to clean up - } else if deleteErr != nil { - framework.Failf("Failed to delete MachineConfigPool %s: %v", testMCPName, deleteErr) - } - - // Wait for worker MCP to stabilize after custom MCP deletion - g.By("Waiting for worker MCP to stabilize after custom MCP deletion") - waitErr := waitForMCP(cleanupCtx, mcClient, "worker", 10*time.Minute) - if apierrors.IsNotFound(waitErr) { - // MachineConfigPool already deleted, nothing to wait for - } else if waitErr != nil { - framework.Failf("Worker MCP did not stabilize after custom MCP deletion: %v", waitErr) + if err := CleanupKubeletConfig(cleanupCtx, mcClient, kubeletConfigName, ""); err != nil { + framework.Logf("Warning: KubeletConfig cleanup failed: %v", err) } - } - - // Register cleanups in LIFO order - g.DeferCleanup(cleanupMCP) - g.DeferCleanup(cleanupKubeletConfig) - g.DeferCleanup(cleanupNodeLabel) - - // Label node - g.By(fmt.Sprintf("Labeling node %s with %s", nodeName, testNodeMCPLabel)) - patchData := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q:""}}}`, testNodeMCPLabel)) - _, err = oc.AdminKubeClient().CoreV1().Nodes().Patch(ctx, nodeName, types.MergePatchType, 
patchData, metav1.PatchOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to label node") - - // Create custom MCP - g.By(fmt.Sprintf("Creating custom MachineConfigPool %s", testMCPName)) - testMCP := &mcfgv1.MachineConfigPool{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "machineconfiguration.openshift.io/v1", - Kind: "MachineConfigPool", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: testMCPName, - Labels: map[string]string{ - "machineconfiguration.openshift.io/pool": testMCPName, - }, - }, - Spec: mcfgv1.MachineConfigPoolSpec{ - MachineConfigSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "machineconfiguration.openshift.io/role", - Operator: metav1.LabelSelectorOpIn, - Values: []string{"worker", testMCPName}, - }, - }, - }, - NodeSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - testNodeMCPLabel: "", - }, - }, - }, - } - _, err = mcClient.MachineconfigurationV1().MachineConfigPools().Create(ctx, testMCP, metav1.CreateOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "Should create custom MachineConfigPool") - - // Wait for MCP ready - g.By("Waiting for custom MachineConfigPool to be ready") - err = waitForMCP(ctx, mcClient, testMCPName, 5*time.Minute) - o.Expect(err).NotTo(o.HaveOccurred(), "MCP should be ready") + }) // Configure static CPU manager with reserved CPUs g.By("Creating KubeletConfig with reserved CPU") @@ -428,29 +227,9 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv }, } - _, err = mcClient.MachineconfigurationV1().KubeletConfigs().Create(ctx, kubeletConfig, metav1.CreateOptions{}) - o.Expect(err).NotTo(o.HaveOccurred(), "Should create KubeletConfig") - - // Wait for MCP to start updating - g.By(fmt.Sprintf("Waiting for %s MCP to start updating", testMCPName)) - o.Eventually(func() bool { - mcp, err := mcClient.MachineconfigurationV1().MachineConfigPools().Get(ctx, testMCPName, metav1.GetOptions{}) - if err != nil { - 
framework.Logf("Error getting %s MCP: %v", testMCPName, err) - return false - } - for _, condition := range mcp.Status.Conditions { - if condition.Type == "Updating" && condition.Status == corev1.ConditionTrue { - return true - } - } - return false - }, 2*time.Minute, 10*time.Second).Should(o.BeTrue(), fmt.Sprintf("%s MCP should start updating", testMCPName)) - - // Wait for configuration - g.By("Waiting for MCP to update with reserved CPU configuration") - err = waitForMCP(ctx, mcClient, testMCPName, 15*time.Minute) - o.Expect(err).NotTo(o.HaveOccurred(), "MCP should update successfully") + g.By("Applying KubeletConfig and waiting for MCP rollout") + err = ApplyKubeletConfigAndWaitForMCP(ctx, mcClient, kubeletConfig, testMCPName, 15*time.Minute) + o.Expect(err).NotTo(o.HaveOccurred(), "Should apply KubeletConfig and complete MCP rollout") // Verify reserved CPU is enabled config, err := getKubeletConfigFromNode(ctx, oc, nodeName) @@ -465,9 +244,8 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv framework.Logf("Reserved CPU takes precedence over system compressible") // Cleanup explicitly before DeferCleanup - cleanupKubeletConfig() - cleanupNodeLabel() - cleanupMCP() + CleanupKubeletConfig(ctx, mcClient, kubeletConfigName, "") + CleanupCustomMCP(ctx, mcpConfig) }) })