From 984bfb243b3259d82c10cfeeca17eab31b1055b2 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Fri, 26 Sep 2025 17:16:42 -0400 Subject: [PATCH 01/59] initial update --- modules/python/crud/aws/node_pool_crud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/python/crud/aws/node_pool_crud.py b/modules/python/crud/aws/node_pool_crud.py index 261b5ec05b..cb51b489cb 100644 --- a/modules/python/crud/aws/node_pool_crud.py +++ b/modules/python/crud/aws/node_pool_crud.py @@ -2,8 +2,8 @@ AWS EKS Node Group CRUD Operations Module. This module provides a cloud-agnostic NodePoolCRUD class for Amazon Elastic Kubernetes Service (EKS) -node groups, including create, scale (up/down), and delete operations. It supports -both direct and progressive scaling operations and handles GPU-enabled node groups. +node groups, including create, scale (up/down), and delete operations. +It supports both direct and progressive scaling operations and handles GPU-enabled node groups. 
""" import logging From 032f59b3319d7279732685f6aaef984ae3d57362 Mon Sep 17 00:00:00 2001 From: Lokesh Keyan Date: Thu, 16 Oct 2025 16:26:09 -0400 Subject: [PATCH 02/59] wip: add create_deployment function to crud --- modules/python/crud/azure/node_pool_crud.py | 87 +++++++++++++++++++ modules/python/crud/main.py | 52 +++++++++++ .../crud/workload_templates/deployment.yml | 34 ++++++++ 3 files changed, 173 insertions(+) create mode 100644 modules/python/crud/workload_templates/deployment.yml diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index ed88bbce93..3e7efde687 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -8,6 +8,7 @@ import logging import time +import yaml from clients.aks_client import AKSClient from utils.logger_config import get_logger, setup_logging @@ -270,3 +271,89 @@ def all( logger.error(error_msg) errors.append(error_msg) return False + + def create_deployment( + self, + node_pool_name, + replicas=10, + manifest_dir=None, + number_of_deployments=1 + ): + """ + Create Kubernetes deployments after node pool operations. 
+ + Args: + node_pool_name: Name of the node pool to target + deployment_name: Base name for the deployments + namespace: Kubernetes namespace (default: "default") + replicas: Number of deployment replicas per deployment (default: 10) + manifest_dir: Directory containing Kubernetes manifest files + number_of_deployments: Number of deployments to create (default: 1) + + Returns: + True if all deployment creations were successful, False otherwise + """ + logger.info(f"Creating {number_of_deployments} deployment(s)") + logger.info(f"Target node pool: {node_pool_name}") + logger.info(f"Replicas per deployment: {replicas}") + logger.info(f"Using manifest directory: {manifest_dir}") + + try: + # Get Kubernetes client from AKS client + k8s_client = self.aks_client.k8s_client + + if not k8s_client: + logger.error("Kubernetes client not available") + return False + + successful_deployments = 0 + + # Loop through number of deployments + for deployment_index in range(1, number_of_deployments + 1): + logger.info(f"Creating deployment {deployment_index}/{number_of_deployments}") + + try: + if manifest_dir: + # Use the template path from manifest_dir + template_path = f"{manifest_dir}/deployment.yml" + else: + # Use default template path + template_path = "modules/python/crud/workload_templates/deployment.yml" + + # Create deployment template using k8s_client.create_template + deployment_template = k8s_client.create_template( + template_path, + { + "DEPLOYMENT_REPLICAS": replicas, + "NODE_POOL_NAME": node_pool_name, + "INDEX": deployment_index + } + ) + + # Apply the processed template + k8s_client.apply_manifest_from_file( + manifest_dict=yaml.safe_load_all(deployment_template) + ) + + logger.info(f"Successfully created deployment {deployment_index} using template") + successful_deployments += 1 + + except Exception as e: + logger.error(f"Failed to create deployment {deployment_index}: {str(e)}") + # Continue with next deployment instead of failing completely + continue + + # 
Check if all deployments were successful + if successful_deployments == number_of_deployments: + logger.info(f"Successfully created all {number_of_deployments} deployment(s)") + return True + elif successful_deployments > 0: + logger.warning(f"Created {successful_deployments}/{number_of_deployments} deployment(s)") + return False + else: + logger.error("Failed to create any deployments") + return False + + except Exception as e: + logger.error(f"Failed to create deployments: {str(e)}") + return False diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py index 082fd68682..21e03f7bb3 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -146,6 +146,32 @@ def handle_node_pool_operation(node_pool_crud, args): logger.error(f"Error during '{command}' operation: {str(e)}") return 1 +def handle_workload_operations(node_pool_crud, args): + """Handle workload operations (deployment, statefulset, jobs) based on the command""" + command = args.command + result = None + + try: + if command == "deployment": + # Prepare deploy arguments + deploy_kwargs = { + "node_pool_name": args.node_pool_name, + "deployment_name": args.deployment_name, + "namespace": args.namespace, + "replicas": args.replicas, + "manifest_dir": args.manifest_dir, + "number_of_deployments": args.number_of_deployments + } + + result = node_pool_crud.create_deployment(**deploy_kwargs) + # Check if the operation was successful + if result is False: + logger.error(f"Operation '{command}' failed") + return 1 + return 0 + except Exception as e: + logger.error(f"Error during '{command}' operation: {str(e)}") + return 1 def handle_node_pool_all(node_pool_crud, args): """Handle the all-in-one node pool operation command (create, scale up, scale down, delete)""" @@ -320,6 +346,32 @@ def main(): ) all_parser.set_defaults(func=handle_node_pool_operation) + # Deployment command - add after the "all" command parser + deployment_parser = subparsers.add_parser( + "deployment", 
parents=[common_parser], help="create deployments" + ) + deployment_parser.add_argument("--node-pool-name", required=True, help="Node pool name") + deployment_parser.add_argument("--deployment-name", required=True, help="Deployment name") + deployment_parser.add_argument( + "--number_of_deployments", + type=int, + default=1, + help="Number of deployments" + ) + deployment_parser.add_argument( + "--replicas", + type=int, + default=10, + help="Number of deployment replicas" + ) + deployment_parser.add_argument( + "--manifest-dir", + required=True, + help="Directory containing Kubernetes manifest files for the deployment" + ) + + deployment_parser.set_defaults(func=handle_workload_operations) + # Arguments provided, run node pool operations and collect benchmark results try: args = parser.parse_args() diff --git a/modules/python/crud/workload_templates/deployment.yml b/modules/python/crud/workload_templates/deployment.yml new file mode 100644 index 0000000000..ad19fa40e7 --- /dev/null +++ b/modules/python/crud/workload_templates/deployment.yml @@ -0,0 +1,34 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: myapp-{{NODE_POOL_NAME}}-{{INDEX}} + labels: + app: nginx-container +spec: + template: + metadata: + name: + labels: + app: nginx-container + spec: + containers: + - name: nginx-container + image: mcr.microsoft.com/oss/nginx/nginx:1.21.6 + ports: + - containerPort: 80 + replicas: {{DEPLOYMENT_REPLICAS}} + selector: + matchLabels: + app: nginx-container +--- +apiVersion: v1 +kind: Service +metadata: + name: myapp-{{NODE_POOL_NAME}}-{{INDEX}} +spec: + ports: + - port: 80 + name: myapp + clusterIP: None + selector: + app: nginx-container \ No newline at end of file From bb4bca8d9a7e3332b8c75997775fea662a2ce08f Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Wed, 5 Nov 2025 13:35:47 -0500 Subject: [PATCH 03/59] add import for handle_worload_operation function --- modules/python/tests/crud/test_main.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index c3ab848a83..7e5d194efc 100644 --- a/modules/python/tests/crud/test_main.py +++ b/modules/python/tests/crud/test_main.py @@ -13,6 +13,7 @@ from crud.main import ( get_node_pool_crud_class, handle_node_pool_operation, + handle_workload_operations, main, check_for_progressive_scaling, collect_benchmark_results, From 47caf3c04ed9a9387a52597a73f84b73f909c922 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Wed, 5 Nov 2025 13:37:28 -0500 Subject: [PATCH 04/59] add test for success --- modules/python/tests/crud/test_main.py | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index 7e5d194efc..2fe28a98b0 100644 --- a/modules/python/tests/crud/test_main.py +++ b/modules/python/tests/crud/test_main.py @@ -340,6 +340,36 @@ def test_main_collect_command_simple(self, mock_collect_func): mock_collect_func.assert_called_once() self.assertEqual(cm.exception.code, 0) + @mock.patch("crud.main.AzureNodePoolCRUD") + def test_handle_workload_operations_deployment_success(self, mock_azure_crud): + """Test handle_workload_operations for successful deployment creation""" + # Setup + mock_args = mock.MagicMock() + mock_args.command = "deployment" + mock_args.node_pool_name = "test-nodepool" + mock_args.deployment_name = "test-deployment" + mock_args.namespace = "default" + mock_args.replicas = 5 + mock_args.manifest_dir = "/path/to/manifests" + mock_args.number_of_deployments = 3 + + # Configure mock to return success + mock_azure_crud.create_deployment.return_value = True + + # Execute + result = handle_workload_operations(mock_azure_crud, mock_args) + + # Verify + self.assertEqual(result, 0) # 0 means success + mock_azure_crud.create_deployment.assert_called_once_with( + node_pool_name="test-nodepool", + deployment_name="test-deployment", + namespace="default", + replicas=5, + 
manifest_dir="/path/to/manifests", + number_of_deployments=3 + ) + class TestCollectBenchmarkResults(unittest.TestCase): """Tests for the collect_benchmark_results function""" From c435e3626fb8b2f061b3ca00c9445ffbf52cf8a0 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Wed, 5 Nov 2025 13:37:47 -0500 Subject: [PATCH 05/59] change operation name --- modules/python/crud/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py index 21e03f7bb3..fa1367e085 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -152,7 +152,7 @@ def handle_workload_operations(node_pool_crud, args): result = None try: - if command == "deployment": + if command == "create_pod": # Prepare deploy arguments deploy_kwargs = { "node_pool_name": args.node_pool_name, From 11be2fc29bfdb5ba64d094c386723980db4274f0 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Wed, 5 Nov 2025 13:38:51 -0500 Subject: [PATCH 06/59] update operation name in test --- modules/python/tests/crud/test_main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index 2fe28a98b0..46269aa18a 100644 --- a/modules/python/tests/crud/test_main.py +++ b/modules/python/tests/crud/test_main.py @@ -341,11 +341,11 @@ def test_main_collect_command_simple(self, mock_collect_func): self.assertEqual(cm.exception.code, 0) @mock.patch("crud.main.AzureNodePoolCRUD") - def test_handle_workload_operations_deployment_success(self, mock_azure_crud): - """Test handle_workload_operations for successful deployment creation""" + def test_handle_workload_operations_create_pod_success(self, mock_azure_crud): + """Test handle_workload_operations for successful pod creation""" # Setup mock_args = mock.MagicMock() - mock_args.command = "deployment" + mock_args.command = "create_pod" mock_args.node_pool_name = "test-nodepool" 
mock_args.deployment_name = "test-deployment" mock_args.namespace = "default" From 5e73464952d55f11b22fd464a6b60d69d16d6fd7 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Wed, 5 Nov 2025 13:45:06 -0500 Subject: [PATCH 07/59] add test for failure --- modules/python/tests/crud/test_main.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index 46269aa18a..1b75f0b6d6 100644 --- a/modules/python/tests/crud/test_main.py +++ b/modules/python/tests/crud/test_main.py @@ -370,6 +370,28 @@ def test_handle_workload_operations_create_pod_success(self, mock_azure_crud): number_of_deployments=3 ) + @mock.patch("crud.main.AzureNodePoolCRUD") + def test_handle_workload_operations_failure(self, mock_azure_crud): + """Test handle_workload_operations when operation fails""" + # Setup + mock_args = mock.MagicMock() + mock_args.command = "create_pod" + mock_args.node_pool_name = "test-nodepool" + mock_args.deployment_name = "test-deployment" + mock_args.namespace = "default" + mock_args.replicas = 5 + mock_args.manifest_dir = "/path/to/manifests" + mock_args.number_of_deployments = 3 + + # Configure mock to return failure + mock_azure_crud.create_deployment.return_value = False + + # Execute + result = handle_workload_operations(mock_azure_crud, mock_args) + + # Verify + self.assertEqual(result, 1) # 1 means failure + class TestCollectBenchmarkResults(unittest.TestCase): """Tests for the collect_benchmark_results function""" From 8530049cfd8958821d5ef5fad665df0f51ab8a6c Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Wed, 5 Nov 2025 13:47:51 -0500 Subject: [PATCH 08/59] add exception test --- modules/python/tests/crud/test_main.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index 1b75f0b6d6..3da3245c07 100644 --- a/modules/python/tests/crud/test_main.py +++ 
b/modules/python/tests/crud/test_main.py @@ -392,6 +392,32 @@ def test_handle_workload_operations_failure(self, mock_azure_crud): # Verify self.assertEqual(result, 1) # 1 means failure + @mock.patch("crud.main.logger") + @mock.patch("crud.main.AzureNodePoolCRUD") + def test_handle_workload_operations_exception(self, mock_azure_crud, mock_logger): + """Test handle_workload_operations with exception during operation""" + # Setup + mock_args = mock.MagicMock() + mock_args.command = "create_pod" + mock_args.node_pool_name = "test-nodepool" + mock_args.deployment_name = "test-deployment" + mock_args.namespace = "default" + mock_args.replicas = 5 + mock_args.manifest_dir = "/path/to/manifests" + mock_args.number_of_deployments = 3 + + # Configure mock to raise exception + mock_azure_crud.create_deployment.side_effect = ValueError("Test error") + + # Execute + result = handle_workload_operations(mock_azure_crud, mock_args) + + # Verify + self.assertEqual(result, 1) # 1 means error + mock_logger.error.assert_called_with( + "Error during 'create_pod' operation: Test error" + ) + class TestCollectBenchmarkResults(unittest.TestCase): """Tests for the collect_benchmark_results function""" From 4e7a73b2508291086e923457800c7129b595a211 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Wed, 5 Nov 2025 14:08:52 -0500 Subject: [PATCH 09/59] Linting error: removed elif and else --- modules/python/crud/azure/node_pool_crud.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 3e7efde687..e933eff7e6 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -347,12 +347,11 @@ def create_deployment( if successful_deployments == number_of_deployments: logger.info(f"Successfully created all {number_of_deployments} deployment(s)") return True - elif successful_deployments > 0: + if successful_deployments > 0: 
logger.warning(f"Created {successful_deployments}/{number_of_deployments} deployment(s)") return False - else: - logger.error("Failed to create any deployments") - return False + logger.error("Failed to create any deployments") + return False except Exception as e: logger.error(f"Failed to create deployments: {str(e)}") From 7a8dad4beeea044342f0d402ad7b7ab619ceda26 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Wed, 5 Nov 2025 14:15:29 -0500 Subject: [PATCH 10/59] fixed the spacing --- .../python/crud/workload_templates/deployment.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/python/crud/workload_templates/deployment.yml b/modules/python/crud/workload_templates/deployment.yml index ad19fa40e7..6e1e5f39f3 100644 --- a/modules/python/crud/workload_templates/deployment.yml +++ b/modules/python/crud/workload_templates/deployment.yml @@ -26,9 +26,9 @@ kind: Service metadata: name: myapp-{{NODE_POOL_NAME}}-{{INDEX}} spec: - ports: - - port: 80 - name: myapp - clusterIP: None - selector: - app: nginx-container \ No newline at end of file + ports: + - port: 80 + name: myapp + clusterIP: None + selector: + app: nginx-container \ No newline at end of file From 5e52b71c8b34977eee6dcf7e2bb789212de4b3fa Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Wed, 5 Nov 2025 14:20:54 -0500 Subject: [PATCH 11/59] removed extra spaces --- modules/python/crud/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py index fa1367e085..83379a1d7e 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -353,15 +353,15 @@ def main(): deployment_parser.add_argument("--node-pool-name", required=True, help="Node pool name") deployment_parser.add_argument("--deployment-name", required=True, help="Deployment name") deployment_parser.add_argument( - "--number_of_deployments", - type=int, - default=1, + "--number_of_deployments", + 
type=int, + default=1, help="Number of deployments" ) deployment_parser.add_argument( - "--replicas", - type=int, - default=10, + "--replicas", + type=int, + default=10, help="Number of deployment replicas" ) deployment_parser.add_argument( From 364c2647194b4c4fa8fb33e1443bde85959f49b1 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Thu, 6 Nov 2025 15:15:02 -0500 Subject: [PATCH 12/59] Add deployment_name for consistency and to reference later --- modules/python/crud/azure/node_pool_crud.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index e933eff7e6..96147f6f82 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -320,6 +320,9 @@ def create_deployment( # Use default template path template_path = "modules/python/crud/workload_templates/deployment.yml" + # Generate deployment name + deployment_name = f"myapp-{node_pool_name}-{deployment_index}" + # Create deployment template using k8s_client.create_template deployment_template = k8s_client.create_template( template_path, From 0bc8275c4da828fb00c73cc0a6847d80aa0f2fe0 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Thu, 6 Nov 2025 15:17:05 -0500 Subject: [PATCH 13/59] verify deployment using wait condition --- modules/python/crud/azure/node_pool_crud.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 96147f6f82..f52ad71452 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -338,9 +338,13 @@ def create_deployment( manifest_dict=yaml.safe_load_all(deployment_template) ) - logger.info(f"Successfully created deployment {deployment_index} using template") - successful_deployments += 1 - + deployment_ready = k8s_client.wait_for_condition( + resource_type="deployment", + 
wait_condition_type="available", + resource_name=deployment_name, + namespace="default", + timeout_seconds=300 # 5 minutes timeout + ) except Exception as e: logger.error(f"Failed to create deployment {deployment_index}: {str(e)}") # Continue with next deployment instead of failing completely From 6604ac0533455edc25a17a4cae3b8593270631da Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Thu, 6 Nov 2025 15:18:54 -0500 Subject: [PATCH 14/59] Add logging for maniest and to wait for deployment - debug --- modules/python/crud/azure/node_pool_crud.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index f52ad71452..2ec0ffe498 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -338,6 +338,10 @@ def create_deployment( manifest_dict=yaml.safe_load_all(deployment_template) ) + logger.info(f"Applied manifest for deployment {deployment_name}") + + # Wait for deployment to be available (successful deployment verification) + logger.info(f"Waiting for deployment {deployment_name} to become available...") deployment_ready = k8s_client.wait_for_condition( resource_type="deployment", wait_condition_type="available", From 3e6beea5bfed20799f0c118dc4386b2b2271abe6 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Thu, 6 Nov 2025 15:19:36 -0500 Subject: [PATCH 15/59] add logger for deployment success --- modules/python/crud/azure/node_pool_crud.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 2ec0ffe498..ceccde0712 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -349,6 +349,9 @@ def create_deployment( namespace="default", timeout_seconds=300 # 5 minutes timeout ) + + if deployment_ready: + logger.info(f"Deployment {deployment_name} is successfully available") 
except Exception as e: logger.error(f"Failed to create deployment {deployment_index}: {str(e)}") # Continue with next deployment instead of failing completely From ae99b54b2a6d2066ee8ec3d1949fdf4d6cc05549 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Thu, 6 Nov 2025 15:20:51 -0500 Subject: [PATCH 16/59] verify pods are available in deployment --- modules/python/crud/azure/node_pool_crud.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index ceccde0712..055971f5b5 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -352,6 +352,13 @@ def create_deployment( if deployment_ready: logger.info(f"Deployment {deployment_name} is successfully available") + logger.info(f"Waiting for pods of deployment {deployment_name} to be ready...") + k8s_client.wait_for_pods_ready( + operation_timeout_in_minutes=5, + namespace="default", + pod_count=replicas, + label_selector=f"app=nginx-container" + ) except Exception as e: logger.error(f"Failed to create deployment {deployment_index}: {str(e)}") # Continue with next deployment instead of failing completely From 6623cb283da84f49baa003921083269de002f111 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Thu, 6 Nov 2025 15:22:39 -0500 Subject: [PATCH 17/59] add failure count --- modules/python/crud/azure/node_pool_crud.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 055971f5b5..f6c3163236 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -352,6 +352,8 @@ def create_deployment( if deployment_ready: logger.info(f"Deployment {deployment_name} is successfully available") + + # Additionally wait for pods to be ready logger.info(f"Waiting for pods of deployment {deployment_name} to be ready...") 
k8s_client.wait_for_pods_ready( operation_timeout_in_minutes=5, @@ -359,6 +361,10 @@ def create_deployment( pod_count=replicas, label_selector=f"app=nginx-container" ) + else: + logger.error(f"Deployment {deployment_name} failed to become available within timeout") + continue + except Exception as e: logger.error(f"Failed to create deployment {deployment_index}: {str(e)}") # Continue with next deployment instead of failing completely From 8916a99bc47898e03d72690812dc17631f590963 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Thu, 6 Nov 2025 15:37:33 -0500 Subject: [PATCH 18/59] add logger to verify deployment --- modules/python/crud/azure/node_pool_crud.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index f6c3163236..1de9b6ec8a 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -361,6 +361,9 @@ def create_deployment( pod_count=replicas, label_selector=f"app=nginx-container" ) + + logger.info(f"Successfully created and verified deployment {deployment_index}") + successful_deployments += 1 else: logger.error(f"Deployment {deployment_name} failed to become available within timeout") continue From a4273ac192a5df4c960005d946559323823e80f1 Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Thu, 6 Nov 2025 16:23:54 -0500 Subject: [PATCH 19/59] add unit test for create_deployment method --- .../tests/crud/test_azure_node_pool_crud.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/modules/python/tests/crud/test_azure_node_pool_crud.py b/modules/python/tests/crud/test_azure_node_pool_crud.py index 7089426166..407867b9ed 100644 --- a/modules/python/tests/crud/test_azure_node_pool_crud.py +++ b/modules/python/tests/crud/test_azure_node_pool_crud.py @@ -222,6 +222,42 @@ def test_all_operations(self, mock_time): # Check time.sleep was called 3 times (between operations) 
self.assertEqual(mock_time.sleep.call_count, 3) + def test_create_deployment_success(self): + """Test successful deployment creation""" + # Setup + mock_k8s_client = mock.MagicMock() + self.mock_aks_client.k8s_client = mock_k8s_client + mock_k8s_client.wait_for_condition.return_value = True + + # Execute + result = self.node_pool_crud.create_deployment(node_pool_name="test-pool") + + # Verify + self.assertTrue(result) + + def test_create_deployment_failure(self): + """Test deployment creation failure""" + # Setup + mock_k8s_client = mock.MagicMock() + self.mock_aks_client.k8s_client = mock_k8s_client + mock_k8s_client.wait_for_condition.return_value = False + + # Execute + result = self.node_pool_crud.create_deployment(node_pool_name="test-pool") + + # Verify + self.assertFalse(result) + + def test_create_deployment_no_client(self): + """Test deployment creation with no Kubernetes client""" + # Setup + self.mock_aks_client.k8s_client = None + + # Execute + result = self.node_pool_crud.create_deployment(node_pool_name="test-pool") + + # Verify + self.assertFalse(result) if __name__ == "__main__": unittest.main() From bf6143a58066ebc4c2c97e159ca1cf3cb7c6fdfd Mon Sep 17 00:00:00 2001 From: diamond jorsling Date: Thu, 6 Nov 2025 16:34:27 -0500 Subject: [PATCH 20/59] ran lint --- modules/python/crud/azure/node_pool_crud.py | 36 ++++++++++----------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 1de9b6ec8a..6d3b90b05e 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -281,7 +281,7 @@ def create_deployment( ): """ Create Kubernetes deployments after node pool operations. 
- + Args: node_pool_name: Name of the node pool to target deployment_name: Base name for the deployments @@ -289,7 +289,7 @@ def create_deployment( replicas: Number of deployment replicas per deployment (default: 10) manifest_dir: Directory containing Kubernetes manifest files number_of_deployments: Number of deployments to create (default: 1) - + Returns: True if all deployment creations were successful, False otherwise """ @@ -297,21 +297,21 @@ def create_deployment( logger.info(f"Target node pool: {node_pool_name}") logger.info(f"Replicas per deployment: {replicas}") logger.info(f"Using manifest directory: {manifest_dir}") - + try: # Get Kubernetes client from AKS client k8s_client = self.aks_client.k8s_client - + if not k8s_client: logger.error("Kubernetes client not available") return False - + successful_deployments = 0 - + # Loop through number of deployments for deployment_index in range(1, number_of_deployments + 1): logger.info(f"Creating deployment {deployment_index}/{number_of_deployments}") - + try: if manifest_dir: # Use the template path from manifest_dir @@ -319,10 +319,10 @@ def create_deployment( else: # Use default template path template_path = "modules/python/crud/workload_templates/deployment.yml" - + # Generate deployment name deployment_name = f"myapp-{node_pool_name}-{deployment_index}" - + # Create deployment template using k8s_client.create_template deployment_template = k8s_client.create_template( template_path, @@ -332,14 +332,14 @@ def create_deployment( "INDEX": deployment_index } ) - + # Apply the processed template k8s_client.apply_manifest_from_file( manifest_dict=yaml.safe_load_all(deployment_template) ) - + logger.info(f"Applied manifest for deployment {deployment_name}") - + # Wait for deployment to be available (successful deployment verification) logger.info(f"Waiting for deployment {deployment_name} to become available...") deployment_ready = k8s_client.wait_for_condition( @@ -349,10 +349,10 @@ def create_deployment( 
namespace="default", timeout_seconds=300 # 5 minutes timeout ) - + if deployment_ready: logger.info(f"Deployment {deployment_name} is successfully available") - + # Additionally wait for pods to be ready logger.info(f"Waiting for pods of deployment {deployment_name} to be ready...") k8s_client.wait_for_pods_ready( @@ -361,18 +361,18 @@ def create_deployment( pod_count=replicas, label_selector=f"app=nginx-container" ) - + logger.info(f"Successfully created and verified deployment {deployment_index}") successful_deployments += 1 else: logger.error(f"Deployment {deployment_name} failed to become available within timeout") continue - + except Exception as e: logger.error(f"Failed to create deployment {deployment_index}: {str(e)}") # Continue with next deployment instead of failing completely continue - + # Check if all deployments were successful if successful_deployments == number_of_deployments: logger.info(f"Successfully created all {number_of_deployments} deployment(s)") @@ -382,7 +382,7 @@ def create_deployment( return False logger.error("Failed to create any deployments") return False - + except Exception as e: logger.error(f"Failed to create deployments: {str(e)}") return False From 712939e0fa17b2e93816748a48981aaf4ee243a4 Mon Sep 17 00:00:00 2001 From: Diamond Powell Date: Mon, 2 Mar 2026 12:26:51 -0500 Subject: [PATCH 21/59] Add test for deployment partial sucess --- modules/python/tests/crud/test_main.py | 40 ++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index 3da3245c07..679afe01ab 100644 --- a/modules/python/tests/crud/test_main.py +++ b/modules/python/tests/crud/test_main.py @@ -418,6 +418,46 @@ def test_handle_workload_operations_exception(self, mock_azure_crud, mock_logger "Error during 'create_pod' operation: Test error" ) + @mock.patch("crud.main.logger") + @mock.patch("crud.main.AzureNodePoolCRUD") + def 
test_handle_workload_operations_partial_success(self, mock_azure_crud, mock_logger): + """Test handle_workload_operations when deployment returns partial success (False). + + The create_deployment method returns False when some deployments succeed but + not all of them (partial success). This tests that handle_workload_operations + correctly treats this as a failure and returns exit code 1. + """ + # Setup - simulate a partial success scenario where create_deployment + # returns False (e.g., 2 out of 3 deployments succeeded) + mock_args = mock.MagicMock() + mock_args.command = "create_pod" + mock_args.node_pool_name = "test-nodepool" + mock_args.deployment_name = "test-deployment" + mock_args.namespace = "default" + mock_args.replicas = 5 + mock_args.manifest_dir = "/path/to/manifests" + mock_args.number_of_deployments = 3 # Requesting 3 deployments + + # Configure mock to return False (partial success - some deployments + # succeeded but not all, which is still considered a failure) + mock_azure_crud.create_deployment.return_value = False + + # Execute + result = handle_workload_operations(mock_azure_crud, mock_args) + + # Verify + self.assertEqual(result, 1) # 1 means failure (partial success is still failure) + mock_azure_crud.create_deployment.assert_called_once_with( + node_pool_name="test-nodepool", + deployment_name="test-deployment", + namespace="default", + replicas=5, + manifest_dir="/path/to/manifests", + number_of_deployments=3 + ) + # Verify the error was logged for the failed operation + mock_logger.error.assert_called_with("Operation 'create_pod' failed") + class TestCollectBenchmarkResults(unittest.TestCase): """Tests for the collect_benchmark_results function""" From b6f248cec58b5ef7ea27283e284b81f2c0b0dedf Mon Sep 17 00:00:00 2001 From: Diamond Powell Date: Mon, 2 Mar 2026 12:38:53 -0500 Subject: [PATCH 22/59] Add test for multiple deployments --- modules/python/tests/crud/test_main.py | 35 ++++++++++++++++++++++++++ 1 file changed, 35 
insertions(+) diff --git a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index 679afe01ab..42e495e854 100644 --- a/modules/python/tests/crud/test_main.py +++ b/modules/python/tests/crud/test_main.py @@ -458,6 +458,41 @@ def test_handle_workload_operations_partial_success(self, mock_azure_crud, mock_ # Verify the error was logged for the failed operation mock_logger.error.assert_called_with("Operation 'create_pod' failed") + @mock.patch("crud.main.AzureNodePoolCRUD") + def test_handle_workload_operations_multiple_deployments_success(self, mock_azure_crud): + """Test handle_workload_operations with multiple deployments all succeeding. + + This test verifies that when create_deployment is called with multiple + deployments (number_of_deployments > 1) and all deployments succeed, + the function returns success (exit code 0). + """ + # Setup - configure for multiple deployments + mock_args = mock.MagicMock() + mock_args.command = "create_pod" + mock_args.node_pool_name = "test-nodepool" + mock_args.deployment_name = "test-deployment" + mock_args.namespace = "default" + mock_args.replicas = 10 + mock_args.manifest_dir = "/path/to/manifests" + mock_args.number_of_deployments = 5 # Multiple deployments + + # Configure mock to return True (all deployments succeeded) + mock_azure_crud.create_deployment.return_value = True + + # Execute + result = handle_workload_operations(mock_azure_crud, mock_args) + + # Verify + self.assertEqual(result, 0) # 0 means success + mock_azure_crud.create_deployment.assert_called_once_with( + node_pool_name="test-nodepool", + deployment_name="test-deployment", + namespace="default", + replicas=10, + manifest_dir="/path/to/manifests", + number_of_deployments=5 + ) + class TestCollectBenchmarkResults(unittest.TestCase): """Tests for the collect_benchmark_results function""" From e0d8037467c4af4c27b31aa350fbe44e95f494b5 Mon Sep 17 00:00:00 2001 From: Diamond Powell Date: Mon, 2 Mar 2026 13:02:28 -0500 Subject: 
[PATCH 23/59] Add test for progressive scaling failure --- modules/python/tests/crud/test_main.py | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index 42e495e854..cdb1f25ee3 100644 --- a/modules/python/tests/crud/test_main.py +++ b/modules/python/tests/crud/test_main.py @@ -146,6 +146,41 @@ def test_handle_node_pool_operation_scale_non_progressive(self, mock_azure_crud) gpu_node_pool=False, ) + @mock.patch("crud.main.logger") + @mock.patch("crud.main.AzureNodePoolCRUD") + def test_handle_node_pool_operation_scale_fails_returns_error( + self, mock_azure_crud, mock_logger + ): + """Test handle_node_pool_operation when scale up fails but continues execution. + + This test verifies that when scale_node_pool returns False (e.g., some nodes + failed to scale but the operation completed), the function correctly returns + exit code 1 to indicate failure while allowing the calling code to continue. 
+ """ + # Setup - progressive scaling where operation fails + mock_args = mock.MagicMock() + mock_args.command = "scale" + mock_args.node_pool_name = "test-np" + mock_args.target_count = 10 + mock_args.scale_step_size = 2 # Progressive scaling + mock_args.gpu_node_pool = False + + # Configure mock to return False (scale failed but didn't raise exception) + mock_azure_crud.scale_node_pool.return_value = False + + # Execute + result = handle_node_pool_operation(mock_azure_crud, mock_args) + + # Verify - operation failed but returned gracefully (no exception) + self.assertEqual(result, 1) # 1 means failure + mock_azure_crud.scale_node_pool.assert_called_once_with( + node_pool_name="test-np", + node_count=10, + progressive=True, + scale_step_size=2, + gpu_node_pool=False, + ) + @mock.patch("crud.main.AzureNodePoolCRUD") def test_handle_node_pool_operation_delete(self, mock_azure_crud): """Test handle_node_pool_operation for delete command""" From e6feced8323307fa8d095128d9e2464e9352ff2f Mon Sep 17 00:00:00 2001 From: Diamond Powell Date: Mon, 2 Mar 2026 13:59:40 -0500 Subject: [PATCH 24/59] Add test in node_pool_crud for returns false early exit --- .../tests/crud/test_azure_node_pool_crud.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/modules/python/tests/crud/test_azure_node_pool_crud.py b/modules/python/tests/crud/test_azure_node_pool_crud.py index 407867b9ed..8a2e4f894b 100644 --- a/modules/python/tests/crud/test_azure_node_pool_crud.py +++ b/modules/python/tests/crud/test_azure_node_pool_crud.py @@ -222,6 +222,36 @@ def test_all_operations(self, mock_time): # Check time.sleep was called 3 times (between operations) self.assertEqual(mock_time.sleep.call_count, 3) + @mock.patch("crud.azure.node_pool_crud.time") + def test_all_create_returns_false_early_exit(self, mock_time): + """Test that all() exits early when create returns False""" + # Setup - mock create to fail + self.node_pool_crud.create_node_pool = 
mock.MagicMock(return_value=False) + self.node_pool_crud.scale_node_pool = mock.MagicMock(return_value=True) + self.node_pool_crud.delete_node_pool = mock.MagicMock(return_value=True) + + # Execute + result = self.node_pool_crud.all( + node_pool_name="test-pool", + vm_size="Standard_DS2_v2", + node_count=1, + target_count=3, + progressive=True, + scale_step_size=1, + ) + + # Verify - should return False + self.assertFalse(result) + + # Verify create was called once + self.node_pool_crud.create_node_pool.assert_called_once() + + # Verify scale and delete were NOT called (early exit) + self.node_pool_crud.scale_node_pool.assert_not_called() + self.node_pool_crud.delete_node_pool.assert_not_called() + + # Verify time.sleep was NOT called (no operations after create) + mock_time.sleep.assert_not_called() def test_create_deployment_success(self): """Test successful deployment creation""" # Setup From e3d142d049a492be449099f8fb7c248f7fdd41ca Mon Sep 17 00:00:00 2001 From: Diamond Powell Date: Mon, 2 Mar 2026 14:04:08 -0500 Subject: [PATCH 25/59] Add test in node_pool_crud for scale up fails but continues to scale down --- .../tests/crud/test_azure_node_pool_crud.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/modules/python/tests/crud/test_azure_node_pool_crud.py b/modules/python/tests/crud/test_azure_node_pool_crud.py index 8a2e4f894b..53abcae715 100644 --- a/modules/python/tests/crud/test_azure_node_pool_crud.py +++ b/modules/python/tests/crud/test_azure_node_pool_crud.py @@ -252,6 +252,42 @@ def test_all_create_returns_false_early_exit(self, mock_time): # Verify time.sleep was NOT called (no operations after create) mock_time.sleep.assert_not_called() + + @mock.patch("crud.azure.node_pool_crud.time") + def test_all_scale_up_fails_continues(self, mock_time): + """Test that all() continues to scale down and delete when scale up fails""" + # Setup - create succeeds, scale_up fails, scale_down and delete succeed + 
self.node_pool_crud.create_node_pool = mock.MagicMock(return_value=True) + self.node_pool_crud.scale_node_pool = mock.MagicMock( + side_effect=[False, True] # scale_up fails, scale_down succeeds + ) + self.node_pool_crud.delete_node_pool = mock.MagicMock(return_value=True) + + # Execute + result = self.node_pool_crud.all( + node_pool_name="test-pool", + vm_size="Standard_DS2_v2", + node_count=1, + target_count=3, + progressive=True, + scale_step_size=1, + ) + + # Verify - should return False (scale_up failed) + self.assertFalse(result) + + # Verify create was called once + self.node_pool_crud.create_node_pool.assert_called_once() + + # Verify scale was called TWICE (scale_up failed, but scale_down still called) + self.assertEqual(self.node_pool_crud.scale_node_pool.call_count, 2) + + # Verify delete was still called (cleanup continues despite scale_up failure) + self.node_pool_crud.delete_node_pool.assert_called_once() + + # Verify time.sleep was called 3 times (between all operations) + self.assertEqual(mock_time.sleep.call_count, 3) + def test_create_deployment_success(self): """Test successful deployment creation""" # Setup From 769573ea1e1cc25b2dcee747529b9dec3b575804 Mon Sep 17 00:00:00 2001 From: Diamond Powell Date: Mon, 2 Mar 2026 14:11:18 -0500 Subject: [PATCH 26/59] Add test for node_pool_crud for scale down fails operation continues --- .../tests/crud/test_azure_node_pool_crud.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/modules/python/tests/crud/test_azure_node_pool_crud.py b/modules/python/tests/crud/test_azure_node_pool_crud.py index 53abcae715..40890f5f3f 100644 --- a/modules/python/tests/crud/test_azure_node_pool_crud.py +++ b/modules/python/tests/crud/test_azure_node_pool_crud.py @@ -288,6 +288,41 @@ def test_all_scale_up_fails_continues(self, mock_time): # Verify time.sleep was called 3 times (between all operations) self.assertEqual(mock_time.sleep.call_count, 3) + @mock.patch("crud.azure.node_pool_crud.time") + def 
test_all_scale_down_fails_continues(self, mock_time): + """Test that all() continues to delete when scale down fails""" + # Setup - create and scale_up succeed, scale_down fails, delete succeeds + self.node_pool_crud.create_node_pool = mock.MagicMock(return_value=True) + self.node_pool_crud.scale_node_pool = mock.MagicMock( + side_effect=[True, False] # scale_up succeeds, scale_down fails + ) + self.node_pool_crud.delete_node_pool = mock.MagicMock(return_value=True) + + # Execute + result = self.node_pool_crud.all( + node_pool_name="test-pool", + vm_size="Standard_DS2_v2", + node_count=1, + target_count=3, + progressive=True, + scale_step_size=1, + ) + + # Verify - should return False (scale_down failed) + self.assertFalse(result) + + # Verify create was called once + self.node_pool_crud.create_node_pool.assert_called_once() + + # Verify scale was called TWICE (scale_up succeeded, scale_down failed) + self.assertEqual(self.node_pool_crud.scale_node_pool.call_count, 2) + + # Verify delete was still called (cleanup continues despite scale_down failure) + self.node_pool_crud.delete_node_pool.assert_called_once() + + # Verify time.sleep was called 3 times (between all operations) + self.assertEqual(mock_time.sleep.call_count, 3) + def test_create_deployment_success(self): """Test successful deployment creation""" # Setup From ce8197ebc6c727422b54f647e312890d7bd96501 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Thu, 5 Mar 2026 13:07:15 -0500 Subject: [PATCH 27/59] Add test in node_pool_crud for deployment partial success --- .../tests/crud/test_azure_node_pool_crud.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/modules/python/tests/crud/test_azure_node_pool_crud.py b/modules/python/tests/crud/test_azure_node_pool_crud.py index 40890f5f3f..79af938be8 100644 --- a/modules/python/tests/crud/test_azure_node_pool_crud.py +++ b/modules/python/tests/crud/test_azure_node_pool_crud.py @@ -360,5 
+360,31 @@ def test_create_deployment_no_client(self): # Verify self.assertFalse(result) + def test_create_deployment_partial_success(self): + """Test deployment creation when some deployments succeed and others fail""" + # Setup + mock_k8s_client = mock.MagicMock() + self.mock_aks_client.k8s_client = mock_k8s_client + + # Simulate: deployment 1 succeeds, deployment 2 fails, deployment 3 succeeds + # wait_for_condition returns True/False for each deployment + mock_k8s_client.wait_for_condition.side_effect = [True, False, True] + + # Execute - request 3 deployments + result = self.node_pool_crud.create_deployment( + node_pool_name="test-pool", + number_of_deployments=3, + replicas=5 + ) + + # Verify - should return False (not all deployments succeeded) + self.assertFalse(result) + + # Verify wait_for_condition was called 3 times (once per deployment) + self.assertEqual(mock_k8s_client.wait_for_condition.call_count, 3) + + # Verify create_template was called 3 times (attempted all deployments) + self.assertEqual(mock_k8s_client.create_template.call_count, 3) + if __name__ == "__main__": unittest.main() From be9af7424ca4e4b9df1ce9f47d2854b53b5d045f Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:18:50 -0400 Subject: [PATCH 28/59] pipeline test --- pipelines/system/new-pipeline-test.yml | 55 ++++++++++++++++++-------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 63d55f02d9..78d0629c2f 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,25 +1,48 @@ +# trigger: none + +# variables: +# SCENARIO_TYPE: +# SCENARIO_NAME: + +# stages: +# - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) +# dependsOn: [] +# jobs: +# - template: /jobs/competitive-test.yml # must keep as is +# parameters: +# cloud: # e.g. 
azure, aws +# regions: # list of regions +# - region1 # e.g. eastus2 +# topology: # e.g. cluster-autoscaler +# engine: # e.g. clusterloader2 +# matrix: # list of test parameters to customize the provisioned resources +# : +# : +# : +# max_parallel: # required +# credential_type: service_connection # required +# ssh_key_enabled: false +# timeout_in_minutes: 60 # if not specified, default is 60 trigger: none variables: - SCENARIO_TYPE: - SCENARIO_NAME: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: k8s-gpu-cluster-crud stages: - - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) + - stage: azure_eastus2_node_pool_crud dependsOn: [] jobs: - - template: /jobs/competitive-test.yml # must keep as is + - template: /jobs/competitive-test.yml parameters: - cloud: # e.g. azure, aws - regions: # list of regions - - region1 # e.g. eastus2 - topology: # e.g. cluster-autoscaler - engine: # e.g. clusterloader2 - matrix: # list of test parameters to customize the provisioned resources - : - : - : - max_parallel: # required - credential_type: service_connection # required + cloud: azure + regions: + - eastus2 + topology: k8s-crud-gpu + engine: crud + matrix: + node_pool_crud_standard: {} + max_parallel: 1 + credential_type: service_connection ssh_key_enabled: false - timeout_in_minutes: 60 # if not specified, default is 60 + timeout_in_minutes: 60 \ No newline at end of file From af59e12c92d3f9bd539c917f08d18560042babee Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:55:04 -0400 Subject: [PATCH 29/59] linting --- modules/python/tests/crud/test_main.py | 1 + pipelines/system/new-pipeline-test.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index cdb1f25ee3..0e82d9a656 100644 --- a/modules/python/tests/crud/test_main.py +++ b/modules/python/tests/crud/test_main.py @@ -180,6 +180,7 @@ def 
test_handle_node_pool_operation_scale_fails_returns_error( scale_step_size=2, gpu_node_pool=False, ) + mock_logger.error.assert_called_with("Operation 'scale' failed") @mock.patch("crud.main.AzureNodePoolCRUD") def test_handle_node_pool_operation_delete(self, mock_azure_crud): diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 78d0629c2f..64c68b0e9a 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -45,4 +45,4 @@ stages: max_parallel: 1 credential_type: service_connection ssh_key_enabled: false - timeout_in_minutes: 60 \ No newline at end of file + timeout_in_minutes: 60 From ba865d5146c32e37d3aa61f6748c5a4a8a3e19f4 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:58:23 -0400 Subject: [PATCH 30/59] yaml lint --- pipelines/system/new-pipeline-test.yml | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 64c68b0e9a..4fbee4635e 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,28 +1,3 @@ -# trigger: none - -# variables: -# SCENARIO_TYPE: -# SCENARIO_NAME: - -# stages: -# - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) -# dependsOn: [] -# jobs: -# - template: /jobs/competitive-test.yml # must keep as is -# parameters: -# cloud: # e.g. azure, aws -# regions: # list of regions -# - region1 # e.g. eastus2 -# topology: # e.g. cluster-autoscaler -# engine: # e.g. 
clusterloader2 -# matrix: # list of test parameters to customize the provisioned resources -# : -# : -# : -# max_parallel: # required -# credential_type: service_connection # required -# ssh_key_enabled: false -# timeout_in_minutes: 60 # if not specified, default is 60 trigger: none variables: From 1a348f9c61332832d74ea155f96f04339b9929d4 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:06:59 -0400 Subject: [PATCH 31/59] add python security dependency --- modules/python/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/python/requirements.txt b/modules/python/requirements.txt index 4f576b1f37..2db036d22d 100644 --- a/modules/python/requirements.txt +++ b/modules/python/requirements.txt @@ -11,4 +11,5 @@ botocore==1.36.5 coverage==7.6.12 semver==3.0.4 requests==2.32.4 -pyyaml==6.0.2 \ No newline at end of file +pyyaml==6.0.2 +pyopenssl>=24.0.0 \ No newline at end of file From 193af954c3e7c086ee4c1bda6bc042e396d2f440 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:09:28 -0400 Subject: [PATCH 32/59] fix dependency --- modules/python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/requirements.txt b/modules/python/requirements.txt index 2db036d22d..efdd38b23b 100644 --- a/modules/python/requirements.txt +++ b/modules/python/requirements.txt @@ -12,4 +12,4 @@ coverage==7.6.12 semver==3.0.4 requests==2.32.4 pyyaml==6.0.2 -pyopenssl>=24.0.0 \ No newline at end of file +pyopenssl==24.0.0 \ No newline at end of file From 7598c61a05b56da71ea928b5ecd94b83f59bb74c Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:34:36 -0400 Subject: [PATCH 33/59] update --- modules/python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/modules/python/requirements.txt b/modules/python/requirements.txt index 163017c448..7101a2b416 100644 --- a/modules/python/requirements.txt +++ b/modules/python/requirements.txt @@ -12,4 +12,4 @@ coverage==7.6.12 semver==3.0.4 requests==2.32.4 pyyaml==6.0.2 -pyOpenSSL==24.0.0 +pyopenssl>=24.0.0 From a006d4c491bd4d6de3112d30dcbf35165a6bc106 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:50:48 -0400 Subject: [PATCH 34/59] added matrix variables to pipeline --- pipelines/system/new-pipeline-test.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 4fbee4635e..6d55159768 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -16,7 +16,15 @@ stages: topology: k8s-crud-gpu engine: crud matrix: - node_pool_crud_standard: {} + node_pool_crud_standard: + VM_SIZE: Standard_D2s_v3 + CREATE_NODE_COUNT: 1 + SCALE_NODE_COUNT: 3 + SCALE_STEP_SIZE: 1 + POOL_NAME: testpool + STEP_TIME_OUT: 600 + GPU_NODE_POOL: false + STEP_WAIT_TIME: 30 max_parallel: 1 credential_type: service_connection ssh_key_enabled: false From d1f5ebb843f051f63910ded6e3281801aea921e7 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 13:03:15 -0400 Subject: [PATCH 35/59] testing: set GPU_NODE_POOL to empty string --- pipelines/system/new-pipeline-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 6d55159768..00c9b93ee8 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -23,7 +23,7 @@ stages: SCALE_STEP_SIZE: 1 POOL_NAME: testpool STEP_TIME_OUT: 600 - GPU_NODE_POOL: false + GPU_NODE_POOL: "" STEP_WAIT_TIME: 30 max_parallel: 1 credential_type: 
service_connection From 2ca536b0a46bb7ab2fcc7c8881edec4e2a1ff1ee Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 13:38:59 -0400 Subject: [PATCH 36/59] update vm size --- pipelines/system/new-pipeline-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 00c9b93ee8..0ea4987931 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -17,7 +17,7 @@ stages: engine: crud matrix: node_pool_crud_standard: - VM_SIZE: Standard_D2s_v3 + VM_SIZE: Standard_NC6s_v3 CREATE_NODE_COUNT: 1 SCALE_NODE_COUNT: 3 SCALE_STEP_SIZE: 1 From 8a963564ad62be429b4823a8b8c9da6db4e5573b Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 14:09:06 -0400 Subject: [PATCH 37/59] testing change vm size with available quota --- pipelines/system/new-pipeline-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 0ea4987931..1d745edf69 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -17,7 +17,7 @@ stages: engine: crud matrix: node_pool_crud_standard: - VM_SIZE: Standard_NC6s_v3 + VM_SIZE: Standard_NC6_Promo CREATE_NODE_COUNT: 1 SCALE_NODE_COUNT: 3 SCALE_STEP_SIZE: 1 From a65546e1ed822a0570dfd55aff9f65556b52052d Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 14:48:58 -0400 Subject: [PATCH 38/59] update node count + vm size --- pipelines/system/new-pipeline-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 1d745edf69..0c099f6ac9 100644 --- 
a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -17,9 +17,9 @@ stages: engine: crud matrix: node_pool_crud_standard: - VM_SIZE: Standard_NC6_Promo + VM_SIZE: Standard_NC24ads_A100_v4 CREATE_NODE_COUNT: 1 - SCALE_NODE_COUNT: 3 + SCALE_NODE_COUNT: 2 SCALE_STEP_SIZE: 1 POOL_NAME: testpool STEP_TIME_OUT: 600 From 0c62bca7d8776336b549ee275b501e716c14b1a0 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Mon, 16 Mar 2026 17:16:06 -0400 Subject: [PATCH 39/59] update: topology selection --- pipelines/system/new-pipeline-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 0c099f6ac9..6839fad0de 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -13,7 +13,7 @@ stages: cloud: azure regions: - eastus2 - topology: k8s-crud-gpu + topology: create-delete-k8s-python engine: crud matrix: node_pool_crud_standard: From 227975350fed6be22bb4765ec26e6834efd54f0c Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Tue, 17 Mar 2026 15:00:12 -0400 Subject: [PATCH 40/59] add deployment step after scale-up operation --- steps/engine/crud/k8s/execute.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml index 354b67e828..dc9138e6fc 100644 --- a/steps/engine/crud/k8s/execute.yml +++ b/steps/engine/crud/k8s/execute.yml @@ -9,6 +9,10 @@ parameters: step_time_out: 600 step_wait_time: 30 gpu_node_pool: false + deployment_name: "" + number_of_deployments: 1 + replicas: 10 + manifest_dir: "" steps: - script: | @@ -37,6 +41,19 @@ steps: --step-wait-time "$STEP_WAIT_TIME" \ --step-timeout "$STEP_TIME_OUT" \ ${GPU_NODE_POOL:+--gpu-node-pool} + + # Deploy Workloads + PYTHONPATH=$PYTHONPATH:$(pwd) python3 
"$PYTHON_SCRIPT_FILE" deployment \ + --cloud "$CLOUD" \ + --run-id "$RUN_ID" \ + --result-dir "$RESULT_DIR" \ + --node-pool-name "$POOL_NAME" \ + --deployment-name "$DEPLOYMENT_NAME" \ + --number_of_deployments "$NUMBER_OF_DEPLOYMENTS" \ + --replicas "$REPLICAS" \ + --manifest-dir "$MANIFEST_DIR" \ + --step-timeout "$STEP_TIME_OUT" \ + ${GPU_NODE_POOL:+--gpu-node-pool} displayName: 'Execute K8s Create & Scale Up Operations for ${{ parameters.cloud }}' workingDirectory: modules/python env: @@ -51,6 +68,10 @@ steps: RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID) GPU_NODE_POOL: ${{ parameters.gpu_node_pool }} STEP_WAIT_TIME: ${{ parameters.step_wait_time }} + DEPLOYMENT_NAME: ${{ parameters.deployment_name }} + NUMBER_OF_DEPLOYMENTS: ${{ parameters.number_of_deployments }} + REPLICAS: ${{ parameters.replicas }} + MANIFEST_DIR: ${{ parameters.manifest_dir }} ${{ if eq(parameters.cloud, 'aws') }}: CAPACITY_TYPE: $(CAPACITY_TYPE) From 102886416dd2193967622f129af56b64e5a5f913 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Tue, 17 Mar 2026 15:00:59 -0400 Subject: [PATCH 41/59] wire deployment parameters through k8s-crud-gpu topology --- steps/topology/k8s-crud-gpu/execute-crud.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/steps/topology/k8s-crud-gpu/execute-crud.yml b/steps/topology/k8s-crud-gpu/execute-crud.yml index 166a123e38..847a623be4 100644 --- a/steps/topology/k8s-crud-gpu/execute-crud.yml +++ b/steps/topology/k8s-crud-gpu/execute-crud.yml @@ -22,3 +22,7 @@ steps: result_dir: $(System.DefaultWorkingDirectory)/$(RUN_ID) gpu_node_pool: $(GPU_NODE_POOL) step_wait_time: $(STEP_WAIT_TIME) + deployment_name: $(DEPLOYMENT_NAME) + number_of_deployments: $(NUMBER_OF_DEPLOYMENTS) + replicas: $(REPLICAS) + manifest_dir: $(MANIFEST_DIR) From 3ca87d7e45a0ab11ce85d7ee29c933eb95ac3ba7 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Tue, 17 Mar 2026 
15:01:52 -0400 Subject: [PATCH 42/59] correct deployment command routing and kwargs in handle_workload_operations --- modules/python/crud/main.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py index 83379a1d7e..a5381e121a 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -152,12 +152,10 @@ def handle_workload_operations(node_pool_crud, args): result = None try: - if command == "create_pod": + if command == "deployment": # Prepare deploy arguments deploy_kwargs = { "node_pool_name": args.node_pool_name, - "deployment_name": args.deployment_name, - "namespace": args.namespace, "replicas": args.replicas, "manifest_dir": args.manifest_dir, "number_of_deployments": args.number_of_deployments From 8ceb2562e009c2f281f8042e85433ce46b4b883b Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Tue, 17 Mar 2026 15:02:05 -0400 Subject: [PATCH 43/59] correct topology name and add deployment matrix variables --- pipelines/system/new-pipeline-test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 6839fad0de..9e2469a9a6 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -13,7 +13,7 @@ stages: cloud: azure regions: - eastus2 - topology: create-delete-k8s-python + topology: k8s-crud-gpu engine: crud matrix: node_pool_crud_standard: @@ -25,6 +25,10 @@ stages: STEP_TIME_OUT: 600 GPU_NODE_POOL: "" STEP_WAIT_TIME: 30 + DEPLOYMENT_NAME: testdeployment + NUMBER_OF_DEPLOYMENTS: 1 + REPLICAS: 10 + MANIFEST_DIR: $(Pipeline.Workspace)/s/modules/python/crud/workload_templates max_parallel: 1 credential_type: service_connection ssh_key_enabled: false From b8f876e76806419dea171b30d4a1d3a4d81aa88e Mon Sep 17 00:00:00 2001 From: Diamond Powell 
<32712461+engineeredcurlz@users.noreply.github.com> Date: Tue, 17 Mar 2026 15:35:30 -0400 Subject: [PATCH 44/59] update handle_workload_operations tests to match deployment command --- modules/python/tests/crud/test_main.py | 30 ++++++-------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index 0e82d9a656..8ad5a839e1 100644 --- a/modules/python/tests/crud/test_main.py +++ b/modules/python/tests/crud/test_main.py @@ -381,10 +381,8 @@ def test_handle_workload_operations_create_pod_success(self, mock_azure_crud): """Test handle_workload_operations for successful pod creation""" # Setup mock_args = mock.MagicMock() - mock_args.command = "create_pod" + mock_args.command = "deployment" mock_args.node_pool_name = "test-nodepool" - mock_args.deployment_name = "test-deployment" - mock_args.namespace = "default" mock_args.replicas = 5 mock_args.manifest_dir = "/path/to/manifests" mock_args.number_of_deployments = 3 @@ -399,8 +397,6 @@ def test_handle_workload_operations_create_pod_success(self, mock_azure_crud): self.assertEqual(result, 0) # 0 means success mock_azure_crud.create_deployment.assert_called_once_with( node_pool_name="test-nodepool", - deployment_name="test-deployment", - namespace="default", replicas=5, manifest_dir="/path/to/manifests", number_of_deployments=3 @@ -411,10 +407,8 @@ def test_handle_workload_operations_failure(self, mock_azure_crud): """Test handle_workload_operations when operation fails""" # Setup mock_args = mock.MagicMock() - mock_args.command = "create_pod" + mock_args.command = "deployment" mock_args.node_pool_name = "test-nodepool" - mock_args.deployment_name = "test-deployment" - mock_args.namespace = "default" mock_args.replicas = 5 mock_args.manifest_dir = "/path/to/manifests" mock_args.number_of_deployments = 3 @@ -434,10 +428,8 @@ def test_handle_workload_operations_exception(self, mock_azure_crud, mock_logger """Test 
handle_workload_operations with exception during operation""" # Setup mock_args = mock.MagicMock() - mock_args.command = "create_pod" + mock_args.command = "deployment" mock_args.node_pool_name = "test-nodepool" - mock_args.deployment_name = "test-deployment" - mock_args.namespace = "default" mock_args.replicas = 5 mock_args.manifest_dir = "/path/to/manifests" mock_args.number_of_deployments = 3 @@ -451,7 +443,7 @@ def test_handle_workload_operations_exception(self, mock_azure_crud, mock_logger # Verify self.assertEqual(result, 1) # 1 means error mock_logger.error.assert_called_with( - "Error during 'create_pod' operation: Test error" + "Error during 'deployment' operation: Test error" ) @mock.patch("crud.main.logger") @@ -466,10 +458,8 @@ def test_handle_workload_operations_partial_success(self, mock_azure_crud, mock_ # Setup - simulate a partial success scenario where create_deployment # returns False (e.g., 2 out of 3 deployments succeeded) mock_args = mock.MagicMock() - mock_args.command = "create_pod" + mock_args.command = "deployment" mock_args.node_pool_name = "test-nodepool" - mock_args.deployment_name = "test-deployment" - mock_args.namespace = "default" mock_args.replicas = 5 mock_args.manifest_dir = "/path/to/manifests" mock_args.number_of_deployments = 3 # Requesting 3 deployments @@ -485,14 +475,12 @@ def test_handle_workload_operations_partial_success(self, mock_azure_crud, mock_ self.assertEqual(result, 1) # 1 means failure (partial success is still failure) mock_azure_crud.create_deployment.assert_called_once_with( node_pool_name="test-nodepool", - deployment_name="test-deployment", - namespace="default", replicas=5, manifest_dir="/path/to/manifests", number_of_deployments=3 ) # Verify the error was logged for the failed operation - mock_logger.error.assert_called_with("Operation 'create_pod' failed") + mock_logger.error.assert_called_with("Operation 'deployment' failed") @mock.patch("crud.main.AzureNodePoolCRUD") def 
test_handle_workload_operations_multiple_deployments_success(self, mock_azure_crud): @@ -504,10 +492,8 @@ def test_handle_workload_operations_multiple_deployments_success(self, mock_azur """ # Setup - configure for multiple deployments mock_args = mock.MagicMock() - mock_args.command = "create_pod" + mock_args.command = "deployment" mock_args.node_pool_name = "test-nodepool" - mock_args.deployment_name = "test-deployment" - mock_args.namespace = "default" mock_args.replicas = 10 mock_args.manifest_dir = "/path/to/manifests" mock_args.number_of_deployments = 5 # Multiple deployments @@ -522,8 +508,6 @@ def test_handle_workload_operations_multiple_deployments_success(self, mock_azur self.assertEqual(result, 0) # 0 means success mock_azure_crud.create_deployment.assert_called_once_with( node_pool_name="test-nodepool", - deployment_name="test-deployment", - namespace="default", replicas=10, manifest_dir="/path/to/manifests", number_of_deployments=5 From 767f0cc81d3aba8cfcef6df59f02fa1461c38e80 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Tue, 17 Mar 2026 15:50:08 -0400 Subject: [PATCH 45/59] fix yamllint and pylint warnings --- modules/python/crud/azure/node_pool_crud.py | 2 +- .../python/crud/workload_templates/deployment.yml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 6d3b90b05e..26799f0903 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -359,7 +359,7 @@ def create_deployment( operation_timeout_in_minutes=5, namespace="default", pod_count=replicas, - label_selector=f"app=nginx-container" + label_selector="app=nginx-container" ) logger.info(f"Successfully created and verified deployment {deployment_index}") diff --git a/modules/python/crud/workload_templates/deployment.yml 
b/modules/python/crud/workload_templates/deployment.yml index 6e1e5f39f3..3901400041 100644 --- a/modules/python/crud/workload_templates/deployment.yml +++ b/modules/python/crud/workload_templates/deployment.yml @@ -26,9 +26,9 @@ kind: Service metadata: name: myapp-{{NODE_POOL_NAME}}-{{INDEX}} spec: - ports: - - port: 80 - name: myapp - clusterIP: None - selector: - app: nginx-container \ No newline at end of file + ports: + - port: 80 + name: myapp + clusterIP: None + selector: + app: nginx-container From d9ac29ff05a3382377da5aa4ad95649012fdfe70 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Tue, 17 Mar 2026 15:58:20 -0400 Subject: [PATCH 46/59] add correct indentation --- modules/python/crud/workload_templates/deployment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/python/crud/workload_templates/deployment.yml b/modules/python/crud/workload_templates/deployment.yml index 3901400041..d5d9d1b2af 100644 --- a/modules/python/crud/workload_templates/deployment.yml +++ b/modules/python/crud/workload_templates/deployment.yml @@ -27,8 +27,8 @@ metadata: name: myapp-{{NODE_POOL_NAME}}-{{INDEX}} spec: ports: - - port: 80 - name: myapp + - port: 80 + name: myapp clusterIP: None selector: app: nginx-container From e6ccf1ffb5041346c650b3710e32ac14c9d28562 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Tue, 17 Mar 2026 17:26:10 -0400 Subject: [PATCH 47/59] iterate multi-doc YAML generator when applying deployment manifests --- modules/python/crud/azure/node_pool_crud.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 26799f0903..01182544ea 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -333,10 +333,10 @@ def create_deployment( } ) - # Apply the 
processed template - k8s_client.apply_manifest_from_file( - manifest_dict=yaml.safe_load_all(deployment_template) - ) + # Apply each document in the rendered multi-doc template + for doc in yaml.safe_load_all(deployment_template): + if doc: + k8s_client.apply_manifest_from_file(manifest_dict=doc) logger.info(f"Applied manifest for deployment {deployment_name}") From eae04094bac80c2a20f3451583f37474cdc2fde9 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Wed, 18 Mar 2026 10:58:45 -0400 Subject: [PATCH 48/59] refactor: separate deploy workloads into its own pipeline step --- steps/engine/crud/k8s/execute.yml | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml index dc9138e6fc..4b578878d1 100644 --- a/steps/engine/crud/k8s/execute.yml +++ b/steps/engine/crud/k8s/execute.yml @@ -42,6 +42,26 @@ steps: --step-timeout "$STEP_TIME_OUT" \ ${GPU_NODE_POOL:+--gpu-node-pool} + displayName: 'Execute K8s Create & Scale Up Operations for ${{ parameters.cloud }}' + workingDirectory: modules/python + env: + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/crud/main.py + VM_SIZE: ${{ parameters.vm_size }} + CREATE_NODE_COUNT: ${{ parameters.create_node_count }} + SCALE_NODE_COUNT: ${{ parameters.scale_node_count }} + SCALE_STEP_SIZE: ${{ parameters.scale_step_up_count }} + POOL_NAME: ${{ parameters.pool_name }} + CLOUD: ${{ parameters.cloud }} + STEP_TIME_OUT: ${{ parameters.step_time_out }} + RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID) + GPU_NODE_POOL: ${{ parameters.gpu_node_pool }} + STEP_WAIT_TIME: ${{ parameters.step_wait_time }} + ${{ if eq(parameters.cloud, 'aws') }}: + CAPACITY_TYPE: $(CAPACITY_TYPE) + +- script: | + set -eo pipefail + # Deploy Workloads PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" deployment \ --cloud "$CLOUD" \ @@ -54,26 +74,19 @@ steps:
--manifest-dir "$MANIFEST_DIR" \ --step-timeout "$STEP_TIME_OUT" \ ${GPU_NODE_POOL:+--gpu-node-pool} - displayName: 'Execute K8s Create & Scale Up Operations for ${{ parameters.cloud }}' + displayName: 'Deploy Workloads for ${{ parameters.cloud }}' workingDirectory: modules/python env: PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/crud/main.py - VM_SIZE: ${{ parameters.vm_size }} - CREATE_NODE_COUNT: ${{ parameters.create_node_count }} - SCALE_NODE_COUNT: ${{ parameters.scale_node_count }} - SCALE_STEP_SIZE: ${{ parameters.scale_step_up_count }} POOL_NAME: ${{ parameters.pool_name }} CLOUD: ${{ parameters.cloud }} STEP_TIME_OUT: ${{ parameters.step_time_out }} RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID) GPU_NODE_POOL: ${{ parameters.gpu_node_pool }} - STEP_WAIT_TIME: ${{ parameters.step_wait_time }} DEPLOYMENT_NAME: ${{ parameters.deployment_name }} NUMBER_OF_DEPLOYMENTS: ${{ parameters.number_of_deployments }} REPLICAS: ${{ parameters.replicas }} MANIFEST_DIR: ${{ parameters.manifest_dir }} - ${{ if eq(parameters.cloud, 'aws') }}: - CAPACITY_TYPE: $(CAPACITY_TYPE) - script: | set -eo pipefail From 3a6a9b08f65a23dd5f497415761c5801ae3cd5c4 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Wed, 18 Mar 2026 13:48:44 -0400 Subject: [PATCH 49/59] fix: execute k8s workload operations displayname --- steps/engine/crud/k8s/execute.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml index 4b578878d1..1abdaef8f0 100644 --- a/steps/engine/crud/k8s/execute.yml +++ b/steps/engine/crud/k8s/execute.yml @@ -74,7 +74,7 @@ steps: --manifest-dir "$MANIFEST_DIR" \ --step-timeout "$STEP_TIME_OUT" \ ${GPU_NODE_POOL:+--gpu-node-pool} - displayName: 'Deploy Workloads for ${{ parameters.cloud }}' + displayName: 'Execute K8s Workload operations for ${{ parameters.cloud }}' workingDirectory: modules/python env: 
PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/crud/main.py From 7a79129140a0259cb54fe87d19657aa65566b4eb Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Wed, 18 Mar 2026 13:59:42 -0400 Subject: [PATCH 50/59] fix: prevent infinite loop in azure node pool deployment tests yaml.safe_load_all() enters an infinite loop when passed a MagicMock object because PyYAML detects the .read attribute and treats it as a file-like stream, then loops forever waiting to buffer enough bytes (len(MagicMock()) returns 0 by default). Fix by setting create_template.return_value to a valid YAML string in the three create_deployment tests, so yaml.safe_load_all receives a real string and parses it via the non-blocking code path. Affected tests: - test_create_deployment_success - test_create_deployment_failure - test_create_deployment_partial_success --- modules/python/tests/crud/test_azure_node_pool_crud.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/modules/python/tests/crud/test_azure_node_pool_crud.py b/modules/python/tests/crud/test_azure_node_pool_crud.py index 79af938be8..9064a5af4a 100644 --- a/modules/python/tests/crud/test_azure_node_pool_crud.py +++ b/modules/python/tests/crud/test_azure_node_pool_crud.py @@ -328,6 +328,8 @@ def test_create_deployment_success(self): # Setup mock_k8s_client = mock.MagicMock() self.mock_aks_client.k8s_client = mock_k8s_client + # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop + mock_k8s_client.create_template.return_value = "apiVersion: apps/v1\nkind: Deployment\n" mock_k8s_client.wait_for_condition.return_value = True # Execute @@ -341,6 +343,8 @@ def test_create_deployment_failure(self): # Setup mock_k8s_client = mock.MagicMock() self.mock_aks_client.k8s_client = mock_k8s_client + # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop + mock_k8s_client.create_template.return_value = "apiVersion: 
apps/v1\nkind: Deployment\n" mock_k8s_client.wait_for_condition.return_value = False # Execute @@ -366,6 +370,9 @@ def test_create_deployment_partial_success(self): mock_k8s_client = mock.MagicMock() self.mock_aks_client.k8s_client = mock_k8s_client + # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop + mock_k8s_client.create_template.return_value = "apiVersion: apps/v1\nkind: Deployment\n" + # Simulate: deployment 1 succeeds, deployment 2 fails, deployment 3 succeeds # wait_for_condition returns True/False for each deployment mock_k8s_client.wait_for_condition.side_effect = [True, False, True] From 3fa62954fa3857954f7892ab2691864ea287895a Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Wed, 18 Mar 2026 18:48:00 -0400 Subject: [PATCH 51/59] fix: await Azure LRO poller to prevent scale race condition begin_create_or_update() returns an LROPoller that was being discarded, allowing execution to continue while Azure still had an operation in-progress. Subsequent scale/delete calls were then rejected with OperationNotAllowed. Fix by calling poller.result() in scale_node_pool and _progressive_scale to block until Azure fully completes each operation before proceeding. 
--- modules/python/clients/aks_client.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/python/clients/aks_client.py b/modules/python/clients/aks_client.py index 3eb5c4179e..9af5302821 100644 --- a/modules/python/clients/aks_client.py +++ b/modules/python/clients/aks_client.py @@ -464,12 +464,13 @@ def scale_node_pool( node_pool.count = node_count logger.info(f"Scaling node pool {node_pool_name} to {node_count} nodes") - self.aks_client.agent_pools.begin_create_or_update( + poller = self.aks_client.agent_pools.begin_create_or_update( resource_group_name=self.resource_group, resource_name=cluster_name, agent_pool_name=node_pool_name, parameters=node_pool, ) + poller.result() # Wait for Azure control plane to finish before proceeding logger.info( f"Waiting for {node_count} nodes in pool {node_pool_name} to be ready..." @@ -676,12 +677,13 @@ def _progressive_scale( "cluster_info", self.get_cluster_data(cluster_name) ) node_pool.count = step # Update node count in the node pool object - result = self.aks_client.agent_pools.begin_create_or_update( + poller = self.aks_client.agent_pools.begin_create_or_update( resource_group_name=self.resource_group, resource_name=cluster_name, agent_pool_name=node_pool_name, parameters=node_pool, ) + result = poller.result() # Wait for Azure control plane to finish before proceeding # Use agentpool=node_pool_name as default label if not specified label_selector = f"agentpool={node_pool_name}" From 6b7a448a082c1591a358a72e3400e0d9a3594411 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Thu, 26 Mar 2026 14:58:59 -0400 Subject: [PATCH 52/59] fix: replace hardcoded timeout with self.step_timeout in create_deployment --- modules/python/crud/azure/node_pool_crud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 01182544ea..cc48f18451 100644 
--- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -347,7 +347,7 @@ def create_deployment( wait_condition_type="available", resource_name=deployment_name, namespace="default", - timeout_seconds=300 # 5 minutes timeout + timeout_seconds=self.step_timeout ) if deployment_ready: From 8e3445cbe39084e335762ff4a4fd6a201520ee38 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:00:17 -0400 Subject: [PATCH 53/59] refactor: convert f-string logger calls to %-style in create_deployment --- modules/python/crud/azure/node_pool_crud.py | 30 ++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index cc48f18451..c79b9cfe55 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -293,10 +293,10 @@ def create_deployment( Returns: True if all deployment creations were successful, False otherwise """ - logger.info(f"Creating {number_of_deployments} deployment(s)") - logger.info(f"Target node pool: {node_pool_name}") - logger.info(f"Replicas per deployment: {replicas}") - logger.info(f"Using manifest directory: {manifest_dir}") + logger.info("Creating %d deployment(s)", number_of_deployments) + logger.info("Target node pool: %s", node_pool_name) + logger.info("Replicas per deployment: %d", replicas) + logger.info("Using manifest directory: %s", manifest_dir) try: # Get Kubernetes client from AKS client @@ -310,7 +310,7 @@ def create_deployment( # Loop through number of deployments for deployment_index in range(1, number_of_deployments + 1): - logger.info(f"Creating deployment {deployment_index}/{number_of_deployments}") + logger.info("Creating deployment %d/%d", deployment_index, number_of_deployments) try: if manifest_dir: @@ -338,10 +338,10 @@ def create_deployment( if doc: 
k8s_client.apply_manifest_from_file(manifest_dict=doc) - logger.info(f"Applied manifest for deployment {deployment_name}") + logger.info("Applied manifest for deployment %s", deployment_name) # Wait for deployment to be available (successful deployment verification) - logger.info(f"Waiting for deployment {deployment_name} to become available...") + logger.info("Waiting for deployment %s to become available...", deployment_name) deployment_ready = k8s_client.wait_for_condition( resource_type="deployment", wait_condition_type="available", @@ -351,10 +351,10 @@ def create_deployment( ) if deployment_ready: - logger.info(f"Deployment {deployment_name} is successfully available") + logger.info("Deployment %s is successfully available", deployment_name) # Additionally wait for pods to be ready - logger.info(f"Waiting for pods of deployment {deployment_name} to be ready...") + logger.info("Waiting for pods of deployment %s to be ready...", deployment_name) k8s_client.wait_for_pods_ready( operation_timeout_in_minutes=5, namespace="default", @@ -362,27 +362,27 @@ def create_deployment( label_selector="app=nginx-container" ) - logger.info(f"Successfully created and verified deployment {deployment_index}") + logger.info("Successfully created and verified deployment %d", deployment_index) successful_deployments += 1 else: - logger.error(f"Deployment {deployment_name} failed to become available within timeout") + logger.error("Deployment %s failed to become available within timeout", deployment_name) continue except Exception as e: - logger.error(f"Failed to create deployment {deployment_index}: {str(e)}") + logger.error("Failed to create deployment %d: %s", deployment_index, e) # Continue with next deployment instead of failing completely continue # Check if all deployments were successful if successful_deployments == number_of_deployments: - logger.info(f"Successfully created all {number_of_deployments} deployment(s)") + logger.info("Successfully created all %d 
deployment(s)", number_of_deployments) return True if successful_deployments > 0: - logger.warning(f"Created {successful_deployments}/{number_of_deployments} deployment(s)") + logger.warning("Created %d/%d deployment(s)", successful_deployments, number_of_deployments) return False logger.error("Failed to create any deployments") return False except Exception as e: - logger.error(f"Failed to create deployments: {str(e)}") + logger.error("Failed to create deployments: %s", e) return False From 2aac24d4b15babad4662b5b3e35441d582ecb9eb Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:19:12 -0400 Subject: [PATCH 54/59] feat: make label_selector derive from parameter nginx-container was hardcoded in deployment template and in create deployment method - add label_selector to parameters - replace nginx-container in deployment.yml (label_value) - derive label_value from selector - pass label_selector directly --- modules/python/crud/azure/node_pool_crud.py | 8 +++++--- modules/python/crud/workload_templates/deployment.yml | 10 +++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index c79b9cfe55..8df9c34323 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -277,7 +277,8 @@ def create_deployment( node_pool_name, replicas=10, manifest_dir=None, - number_of_deployments=1 + number_of_deployments=1, + label_selector="app=nginx-container" ): """ Create Kubernetes deployments after node pool operations.
@@ -329,7 +330,8 @@ def create_deployment( { "DEPLOYMENT_REPLICAS": replicas, "NODE_POOL_NAME": node_pool_name, - "INDEX": deployment_index + "INDEX": deployment_index, + "LABEL_VALUE": label_selector.split("=", 1)[-1], } ) @@ -359,7 +361,7 @@ def create_deployment( operation_timeout_in_minutes=5, namespace="default", pod_count=replicas, - label_selector="app=nginx-container" + label_selector=label_selector ) logger.info("Successfully created and verified deployment %d", deployment_index) diff --git a/modules/python/crud/workload_templates/deployment.yml b/modules/python/crud/workload_templates/deployment.yml index d5d9d1b2af..0d23751682 100644 --- a/modules/python/crud/workload_templates/deployment.yml +++ b/modules/python/crud/workload_templates/deployment.yml @@ -3,23 +3,23 @@ kind: Deployment metadata: name: myapp-{{NODE_POOL_NAME}}-{{INDEX}} labels: - app: nginx-container + app: {{LABEL_VALUE}} spec: template: metadata: name: labels: - app: nginx-container + app: {{LABEL_VALUE}} spec: containers: - - name: nginx-container + - name: {{LABEL_VALUE}} image: mcr.microsoft.com/oss/nginx/nginx:1.21.6 ports: - containerPort: 80 replicas: {{DEPLOYMENT_REPLICAS}} selector: matchLabels: - app: nginx-container + app: {{LABEL_VALUE}} --- apiVersion: v1 kind: Service @@ -31,4 +31,4 @@ spec: name: myapp clusterIP: None selector: - app: nginx-container + app: {{LABEL_VALUE}} From 7522f1ad18c3cf75ccf177b314919e3d0f5c7d23 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:28:03 -0400 Subject: [PATCH 55/59] feat: remove hardcoding add namespace parameter --- modules/python/crud/azure/node_pool_crud.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 8df9c34323..2c1ae71928 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -278,7 
+278,8 @@ def create_deployment( replicas=10, manifest_dir=None, number_of_deployments=1, - label_selector="app=nginx-container" + label_selector="app=nginx-container", + namespace="default" ): """ Create Kubernetes deployments after node pool operations. @@ -348,7 +349,7 @@ def create_deployment( resource_type="deployment", wait_condition_type="available", resource_name=deployment_name, - namespace="default", + namespace=namespace, timeout_seconds=self.step_timeout ) @@ -359,7 +360,7 @@ def create_deployment( logger.info("Waiting for pods of deployment %s to be ready...", deployment_name) k8s_client.wait_for_pods_ready( operation_timeout_in_minutes=5, - namespace="default", + namespace=namespace, pod_count=replicas, label_selector=label_selector ) From 65fdd004ede702506cf7b9fefb503772942b5560 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:56:46 -0400 Subject: [PATCH 56/59] fix: remove --deployment-name CLI --- modules/python/crud/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py index a5381e121a..a0dcafbebb 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -349,7 +349,6 @@ def main(): "deployment", parents=[common_parser], help="create deployments" ) deployment_parser.add_argument("--node-pool-name", required=True, help="Node pool name") - deployment_parser.add_argument("--deployment-name", required=True, help="Deployment name") deployment_parser.add_argument( "--number_of_deployments", type=int, From b0be1b1656d4b67876c5f2ed9915c941bdc487ab Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:02:03 -0400 Subject: [PATCH 57/59] fix: use hyphen for --number-of-deployments --- modules/python/crud/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py 
index a0dcafbebb..4b3b51a521 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -350,7 +350,7 @@ def main(): ) deployment_parser.add_argument("--node-pool-name", required=True, help="Node pool name") deployment_parser.add_argument( - "--number_of_deployments", + "--number-of-deployments", type=int, default=1, help="Number of deployments" From c5d01bef9aa91b9ccb1564bbc22d3b80a2941224 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:15:50 -0400 Subject: [PATCH 58/59] fix: return error on unknown workload command --- modules/python/crud/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py index 4b3b51a521..5e90d6564d 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -162,6 +162,9 @@ def handle_workload_operations(node_pool_crud, args): } result = node_pool_crud.create_deployment(**deploy_kwargs) + else: + logger.error("Unknown workload command: '%s'", command) + return 1 # Check if the operation was successful if result is False: logger.error(f"Operation '{command}' failed") From 946dea9d1c639960e5376e8e6e077fd32fc21ef3 Mon Sep 17 00:00:00 2001 From: Diamond Powell <32712461+engineeredcurlz@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:17:49 -0400 Subject: [PATCH 59/59] revert: restore original docstring line wrapping --- modules/python/crud/aws/node_pool_crud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/python/crud/aws/node_pool_crud.py b/modules/python/crud/aws/node_pool_crud.py index cb51b489cb..261b5ec05b 100644 --- a/modules/python/crud/aws/node_pool_crud.py +++ b/modules/python/crud/aws/node_pool_crud.py @@ -2,8 +2,8 @@ AWS EKS Node Group CRUD Operations Module. 
This module provides a cloud-agnostic NodePoolCRUD class for Amazon Elastic Kubernetes Service (EKS) -node groups, including create, scale (up/down), and delete operations. -It supports both direct and progressive scaling operations and handles GPU-enabled node groups. +node groups, including create, scale (up/down), and delete operations. It supports +both direct and progressive scaling operations and handles GPU-enabled node groups. """ import logging