diff --git a/modules/python/clients/aks_client.py b/modules/python/clients/aks_client.py index 3eb5c4179e..9af5302821 100644 --- a/modules/python/clients/aks_client.py +++ b/modules/python/clients/aks_client.py @@ -464,12 +464,13 @@ def scale_node_pool( node_pool.count = node_count logger.info(f"Scaling node pool {node_pool_name} to {node_count} nodes") - self.aks_client.agent_pools.begin_create_or_update( + poller = self.aks_client.agent_pools.begin_create_or_update( resource_group_name=self.resource_group, resource_name=cluster_name, agent_pool_name=node_pool_name, parameters=node_pool, ) + poller.result() # Wait for Azure control plane to finish before proceeding logger.info( f"Waiting for {node_count} nodes in pool {node_pool_name} to be ready..." @@ -676,12 +677,13 @@ def _progressive_scale( "cluster_info", self.get_cluster_data(cluster_name) ) node_pool.count = step # Update node count in the node pool object - result = self.aks_client.agent_pools.begin_create_or_update( + poller = self.aks_client.agent_pools.begin_create_or_update( resource_group_name=self.resource_group, resource_name=cluster_name, agent_pool_name=node_pool_name, parameters=node_pool, ) + result = poller.result() # Wait for Azure control plane to finish before proceeding # Use agentpool=node_pool_name as default label if not specified label_selector = f"agentpool={node_pool_name}" diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index ed88bbce93..2c1ae71928 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -8,6 +8,7 @@ import logging import time +import yaml from clients.aks_client import AKSClient from utils.logger_config import get_logger, setup_logging @@ -270,3 +271,121 @@ def all( logger.error(error_msg) errors.append(error_msg) return False + + def create_deployment( + self, + node_pool_name, + replicas=10, + manifest_dir=None, + number_of_deployments=1, + 
label_selector="app=nginx-container", + namespace="default" + ): + """ + Create Kubernetes deployments after node pool operations. + + Args: + node_pool_name: Name of the node pool to target + label_selector: Pod label selector used for readiness checks (default: "app=nginx-container") + namespace: Kubernetes namespace (default: "default") + replicas: Number of deployment replicas per deployment (default: 10) + manifest_dir: Directory containing Kubernetes manifest files + number_of_deployments: Number of deployments to create (default: 1) + + Returns: + True if all deployment creations were successful, False otherwise + """ + logger.info("Creating %d deployment(s)", number_of_deployments) + logger.info("Target node pool: %s", node_pool_name) + logger.info("Replicas per deployment: %d", replicas) + logger.info("Using manifest directory: %s", manifest_dir) + + try: + # Get Kubernetes client from AKS client + k8s_client = self.aks_client.k8s_client + + if not k8s_client: + logger.error("Kubernetes client not available") + return False + + successful_deployments = 0 + + # Loop through number of deployments + for deployment_index in range(1, number_of_deployments + 1): + logger.info("Creating deployment %d/%d", deployment_index, number_of_deployments) + + try: + if manifest_dir: + # Use the template path from manifest_dir + template_path = f"{manifest_dir}/deployment.yml" + else: + # Use default template path + template_path = "modules/python/crud/workload_templates/deployment.yml" + + # Generate deployment name + deployment_name = f"myapp-{node_pool_name}-{deployment_index}" + + # Create deployment template using k8s_client.create_template + deployment_template = k8s_client.create_template( + template_path, + { + "DEPLOYMENT_REPLICAS": replicas, + "NODE_POOL_NAME": node_pool_name, + "INDEX": deployment_index, + "LABEL_VALUE": label_selector.split("=", 1)[-1], + } + ) + + # Apply each document in the rendered multi-doc template + for doc in yaml.safe_load_all(deployment_template): + if doc: + 
k8s_client.apply_manifest_from_file(manifest_dict=doc) + + logger.info("Applied manifest for deployment %s", deployment_name) + + # Wait for deployment to be available (successful deployment verification) + logger.info("Waiting for deployment %s to become available...", deployment_name) + deployment_ready = k8s_client.wait_for_condition( + resource_type="deployment", + wait_condition_type="available", + resource_name=deployment_name, + namespace=namespace, + timeout_seconds=self.step_timeout + ) + + if deployment_ready: + logger.info("Deployment %s is successfully available", deployment_name) + + # Additionally wait for pods to be ready + logger.info("Waiting for pods of deployment %s to be ready...", deployment_name) + k8s_client.wait_for_pods_ready( + operation_timeout_in_minutes=5, + namespace=namespace, + pod_count=replicas, + label_selector=label_selector + ) + + logger.info("Successfully created and verified deployment %d", deployment_index) + successful_deployments += 1 + else: + logger.error("Deployment %s failed to become available within timeout", deployment_name) + continue + + except Exception as e: + logger.error("Failed to create deployment %d: %s", deployment_index, e) + # Continue with next deployment instead of failing completely + continue + + # Check if all deployments were successful + if successful_deployments == number_of_deployments: + logger.info("Successfully created all %d deployment(s)", number_of_deployments) + return True + if successful_deployments > 0: + logger.warning("Created %d/%d deployment(s)", successful_deployments, number_of_deployments) + return False + logger.error("Failed to create any deployments") + return False + + except Exception as e: + logger.error("Failed to create deployments: %s", e) + return False diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py index 082fd68682..5e90d6564d 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -146,6 +146,33 @@ def 
handle_node_pool_operation(node_pool_crud, args): logger.error(f"Error during '{command}' operation: {str(e)}") return 1 +def handle_workload_operations(node_pool_crud, args): + """Handle workload operations (deployment, statefulset, jobs) based on the command""" + command = args.command + result = None + + try: + if command == "deployment": + # Prepare deploy arguments + deploy_kwargs = { + "node_pool_name": args.node_pool_name, + "replicas": args.replicas, + "manifest_dir": args.manifest_dir, + "number_of_deployments": args.number_of_deployments + } + + result = node_pool_crud.create_deployment(**deploy_kwargs) + else: + logger.error("Unknown workload command: '%s'", command) + return 1 + # Check if the operation was successful + if result is False: + logger.error(f"Operation '{command}' failed") + return 1 + return 0 + except Exception as e: + logger.error(f"Error during '{command}' operation: {str(e)}") + return 1 def handle_node_pool_all(node_pool_crud, args): """Handle the all-in-one node pool operation command (create, scale up, scale down, delete)""" @@ -320,6 +347,31 @@ def main(): ) all_parser.set_defaults(func=handle_node_pool_operation) + # Deployment command - add after the "all" command parser + deployment_parser = subparsers.add_parser( + "deployment", parents=[common_parser], help="create deployments" + ) + deployment_parser.add_argument("--node-pool-name", required=True, help="Node pool name") + deployment_parser.add_argument( + "--number-of-deployments", + type=int, + default=1, + help="Number of deployments" + ) + deployment_parser.add_argument( + "--replicas", + type=int, + default=10, + help="Number of deployment replicas" + ) + deployment_parser.add_argument( + "--manifest-dir", + required=True, + help="Directory containing Kubernetes manifest files for the deployment" + ) + + deployment_parser.set_defaults(func=handle_workload_operations) + # Arguments provided, run node pool operations and collect benchmark results try: args = 
parser.parse_args() diff --git a/modules/python/crud/workload_templates/deployment.yml b/modules/python/crud/workload_templates/deployment.yml new file mode 100644 index 0000000000..0d23751682 --- /dev/null +++ b/modules/python/crud/workload_templates/deployment.yml @@ -0,0 +1,34 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: myapp-{{NODE_POOL_NAME}}-{{INDEX}} + labels: + app: {{LABEL_VALUE}} +spec: + template: + metadata: + name: + labels: + app: {{LABEL_VALUE}} + spec: + containers: + - name: {{LABEL_VALUE}} + image: mcr.microsoft.com/oss/nginx/nginx:1.21.6 + ports: + - containerPort: 80 + replicas: {{DEPLOYMENT_REPLICAS}} + selector: + matchLabels: + app: {{LABEL_VALUE}} +--- +apiVersion: v1 +kind: Service +metadata: + name: myapp-{{NODE_POOL_NAME}}-{{INDEX}} +spec: + ports: + - port: 80 + name: myapp + clusterIP: None + selector: + app: {{LABEL_VALUE}} diff --git a/modules/python/requirements.txt b/modules/python/requirements.txt index 3ec9b9f8cc..7101a2b416 100644 --- a/modules/python/requirements.txt +++ b/modules/python/requirements.txt @@ -12,4 +12,4 @@ coverage==7.6.12 semver==3.0.4 requests==2.32.4 pyyaml==6.0.2 -pyOpenSSL==24.0.0 \ No newline at end of file +pyopenssl>=24.0.0 diff --git a/modules/python/tests/crud/test_azure_node_pool_crud.py b/modules/python/tests/crud/test_azure_node_pool_crud.py index 7089426166..9064a5af4a 100644 --- a/modules/python/tests/crud/test_azure_node_pool_crud.py +++ b/modules/python/tests/crud/test_azure_node_pool_crud.py @@ -222,6 +222,176 @@ def test_all_operations(self, mock_time): # Check time.sleep was called 3 times (between operations) self.assertEqual(mock_time.sleep.call_count, 3) + @mock.patch("crud.azure.node_pool_crud.time") + def test_all_create_returns_false_early_exit(self, mock_time): + """Test that all() exits early when create returns False""" + # Setup - mock create to fail + self.node_pool_crud.create_node_pool = mock.MagicMock(return_value=False) + self.node_pool_crud.scale_node_pool = 
mock.MagicMock(return_value=True) + self.node_pool_crud.delete_node_pool = mock.MagicMock(return_value=True) + + # Execute + result = self.node_pool_crud.all( + node_pool_name="test-pool", + vm_size="Standard_DS2_v2", + node_count=1, + target_count=3, + progressive=True, + scale_step_size=1, + ) + + # Verify - should return False + self.assertFalse(result) + + # Verify create was called once + self.node_pool_crud.create_node_pool.assert_called_once() + + # Verify scale and delete were NOT called (early exit) + self.node_pool_crud.scale_node_pool.assert_not_called() + self.node_pool_crud.delete_node_pool.assert_not_called() + + # Verify time.sleep was NOT called (no operations after create) + mock_time.sleep.assert_not_called() + + @mock.patch("crud.azure.node_pool_crud.time") + def test_all_scale_up_fails_continues(self, mock_time): + """Test that all() continues to scale down and delete when scale up fails""" + # Setup - create succeeds, scale_up fails, scale_down and delete succeed + self.node_pool_crud.create_node_pool = mock.MagicMock(return_value=True) + self.node_pool_crud.scale_node_pool = mock.MagicMock( + side_effect=[False, True] # scale_up fails, scale_down succeeds + ) + self.node_pool_crud.delete_node_pool = mock.MagicMock(return_value=True) + + # Execute + result = self.node_pool_crud.all( + node_pool_name="test-pool", + vm_size="Standard_DS2_v2", + node_count=1, + target_count=3, + progressive=True, + scale_step_size=1, + ) + + # Verify - should return False (scale_up failed) + self.assertFalse(result) + + # Verify create was called once + self.node_pool_crud.create_node_pool.assert_called_once() + + # Verify scale was called TWICE (scale_up failed, but scale_down still called) + self.assertEqual(self.node_pool_crud.scale_node_pool.call_count, 2) + + # Verify delete was still called (cleanup continues despite scale_up failure) + self.node_pool_crud.delete_node_pool.assert_called_once() + + # Verify time.sleep was called 3 times (between all 
operations) + self.assertEqual(mock_time.sleep.call_count, 3) + + @mock.patch("crud.azure.node_pool_crud.time") + def test_all_scale_down_fails_continues(self, mock_time): + """Test that all() continues to delete when scale down fails""" + # Setup - create and scale_up succeed, scale_down fails, delete succeeds + self.node_pool_crud.create_node_pool = mock.MagicMock(return_value=True) + self.node_pool_crud.scale_node_pool = mock.MagicMock( + side_effect=[True, False] # scale_up succeeds, scale_down fails + ) + self.node_pool_crud.delete_node_pool = mock.MagicMock(return_value=True) + + # Execute + result = self.node_pool_crud.all( + node_pool_name="test-pool", + vm_size="Standard_DS2_v2", + node_count=1, + target_count=3, + progressive=True, + scale_step_size=1, + ) + + # Verify - should return False (scale_down failed) + self.assertFalse(result) + + # Verify create was called once + self.node_pool_crud.create_node_pool.assert_called_once() + + # Verify scale was called TWICE (scale_up succeeded, scale_down failed) + self.assertEqual(self.node_pool_crud.scale_node_pool.call_count, 2) + + # Verify delete was still called (cleanup continues despite scale_down failure) + self.node_pool_crud.delete_node_pool.assert_called_once() + + # Verify time.sleep was called 3 times (between all operations) + self.assertEqual(mock_time.sleep.call_count, 3) + + def test_create_deployment_success(self): + """Test successful deployment creation""" + # Setup + mock_k8s_client = mock.MagicMock() + self.mock_aks_client.k8s_client = mock_k8s_client + # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop + mock_k8s_client.create_template.return_value = "apiVersion: apps/v1\nkind: Deployment\n" + mock_k8s_client.wait_for_condition.return_value = True + + # Execute + result = self.node_pool_crud.create_deployment(node_pool_name="test-pool") + + # Verify + self.assertTrue(result) + + def test_create_deployment_failure(self): + """Test deployment creation 
failure""" + # Setup + mock_k8s_client = mock.MagicMock() + self.mock_aks_client.k8s_client = mock_k8s_client + # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop + mock_k8s_client.create_template.return_value = "apiVersion: apps/v1\nkind: Deployment\n" + mock_k8s_client.wait_for_condition.return_value = False + + # Execute + result = self.node_pool_crud.create_deployment(node_pool_name="test-pool") + + # Verify + self.assertFalse(result) + + def test_create_deployment_no_client(self): + """Test deployment creation with no Kubernetes client""" + # Setup + self.mock_aks_client.k8s_client = None + + # Execute + result = self.node_pool_crud.create_deployment(node_pool_name="test-pool") + + # Verify + self.assertFalse(result) + + def test_create_deployment_partial_success(self): + """Test deployment creation when some deployments succeed and others fail""" + # Setup + mock_k8s_client = mock.MagicMock() + self.mock_aks_client.k8s_client = mock_k8s_client + + # Must return a real string - yaml.safe_load_all(MagicMock()) causes an infinite loop + mock_k8s_client.create_template.return_value = "apiVersion: apps/v1\nkind: Deployment\n" + + # Simulate: deployment 1 succeeds, deployment 2 fails, deployment 3 succeeds + # wait_for_condition returns True/False for each deployment + mock_k8s_client.wait_for_condition.side_effect = [True, False, True] + + # Execute - request 3 deployments + result = self.node_pool_crud.create_deployment( + node_pool_name="test-pool", + number_of_deployments=3, + replicas=5 + ) + + # Verify - should return False (not all deployments succeeded) + self.assertFalse(result) + + # Verify wait_for_condition was called 3 times (once per deployment) + self.assertEqual(mock_k8s_client.wait_for_condition.call_count, 3) + + # Verify create_template was called 3 times (attempted all deployments) + self.assertEqual(mock_k8s_client.create_template.call_count, 3) if __name__ == "__main__": unittest.main() diff --git 
a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index c3ab848a83..8ad5a839e1 100644 --- a/modules/python/tests/crud/test_main.py +++ b/modules/python/tests/crud/test_main.py @@ -13,6 +13,7 @@ from crud.main import ( get_node_pool_crud_class, handle_node_pool_operation, + handle_workload_operations, main, check_for_progressive_scaling, collect_benchmark_results, @@ -145,6 +146,42 @@ def test_handle_node_pool_operation_scale_non_progressive(self, mock_azure_crud) gpu_node_pool=False, ) + @mock.patch("crud.main.logger") + @mock.patch("crud.main.AzureNodePoolCRUD") + def test_handle_node_pool_operation_scale_fails_returns_error( + self, mock_azure_crud, mock_logger + ): + """Test handle_node_pool_operation when scale up fails but continues execution. + + This test verifies that when scale_node_pool returns False (e.g., some nodes + failed to scale but the operation completed), the function correctly returns + exit code 1 to indicate failure while allowing the calling code to continue. 
+ """ + # Setup - progressive scaling where operation fails + mock_args = mock.MagicMock() + mock_args.command = "scale" + mock_args.node_pool_name = "test-np" + mock_args.target_count = 10 + mock_args.scale_step_size = 2 # Progressive scaling + mock_args.gpu_node_pool = False + + # Configure mock to return False (scale failed but didn't raise exception) + mock_azure_crud.scale_node_pool.return_value = False + + # Execute + result = handle_node_pool_operation(mock_azure_crud, mock_args) + + # Verify - operation failed but returned gracefully (no exception) + self.assertEqual(result, 1) # 1 means failure + mock_azure_crud.scale_node_pool.assert_called_once_with( + node_pool_name="test-np", + node_count=10, + progressive=True, + scale_step_size=2, + gpu_node_pool=False, + ) + mock_logger.error.assert_called_with("Operation 'scale' failed") + @mock.patch("crud.main.AzureNodePoolCRUD") def test_handle_node_pool_operation_delete(self, mock_azure_crud): """Test handle_node_pool_operation for delete command""" @@ -339,6 +376,143 @@ def test_main_collect_command_simple(self, mock_collect_func): mock_collect_func.assert_called_once() self.assertEqual(cm.exception.code, 0) + @mock.patch("crud.main.AzureNodePoolCRUD") + def test_handle_workload_operations_create_pod_success(self, mock_azure_crud): + """Test handle_workload_operations for successful pod creation""" + # Setup + mock_args = mock.MagicMock() + mock_args.command = "deployment" + mock_args.node_pool_name = "test-nodepool" + mock_args.replicas = 5 + mock_args.manifest_dir = "/path/to/manifests" + mock_args.number_of_deployments = 3 + + # Configure mock to return success + mock_azure_crud.create_deployment.return_value = True + + # Execute + result = handle_workload_operations(mock_azure_crud, mock_args) + + # Verify + self.assertEqual(result, 0) # 0 means success + mock_azure_crud.create_deployment.assert_called_once_with( + node_pool_name="test-nodepool", + replicas=5, + manifest_dir="/path/to/manifests", + 
number_of_deployments=3 + ) + + @mock.patch("crud.main.AzureNodePoolCRUD") + def test_handle_workload_operations_failure(self, mock_azure_crud): + """Test handle_workload_operations when operation fails""" + # Setup + mock_args = mock.MagicMock() + mock_args.command = "deployment" + mock_args.node_pool_name = "test-nodepool" + mock_args.replicas = 5 + mock_args.manifest_dir = "/path/to/manifests" + mock_args.number_of_deployments = 3 + + # Configure mock to return failure + mock_azure_crud.create_deployment.return_value = False + + # Execute + result = handle_workload_operations(mock_azure_crud, mock_args) + + # Verify + self.assertEqual(result, 1) # 1 means failure + + @mock.patch("crud.main.logger") + @mock.patch("crud.main.AzureNodePoolCRUD") + def test_handle_workload_operations_exception(self, mock_azure_crud, mock_logger): + """Test handle_workload_operations with exception during operation""" + # Setup + mock_args = mock.MagicMock() + mock_args.command = "deployment" + mock_args.node_pool_name = "test-nodepool" + mock_args.replicas = 5 + mock_args.manifest_dir = "/path/to/manifests" + mock_args.number_of_deployments = 3 + + # Configure mock to raise exception + mock_azure_crud.create_deployment.side_effect = ValueError("Test error") + + # Execute + result = handle_workload_operations(mock_azure_crud, mock_args) + + # Verify + self.assertEqual(result, 1) # 1 means error + mock_logger.error.assert_called_with( + "Error during 'deployment' operation: Test error" + ) + + @mock.patch("crud.main.logger") + @mock.patch("crud.main.AzureNodePoolCRUD") + def test_handle_workload_operations_partial_success(self, mock_azure_crud, mock_logger): + """Test handle_workload_operations when deployment returns partial success (False). + + The create_deployment method returns False when some deployments succeed but + not all of them (partial success). This tests that handle_workload_operations + correctly treats this as a failure and returns exit code 1. 
+ """ + # Setup - simulate a partial success scenario where create_deployment + # returns False (e.g., 2 out of 3 deployments succeeded) + mock_args = mock.MagicMock() + mock_args.command = "deployment" + mock_args.node_pool_name = "test-nodepool" + mock_args.replicas = 5 + mock_args.manifest_dir = "/path/to/manifests" + mock_args.number_of_deployments = 3 # Requesting 3 deployments + + # Configure mock to return False (partial success - some deployments + # succeeded but not all, which is still considered a failure) + mock_azure_crud.create_deployment.return_value = False + + # Execute + result = handle_workload_operations(mock_azure_crud, mock_args) + + # Verify + self.assertEqual(result, 1) # 1 means failure (partial success is still failure) + mock_azure_crud.create_deployment.assert_called_once_with( + node_pool_name="test-nodepool", + replicas=5, + manifest_dir="/path/to/manifests", + number_of_deployments=3 + ) + # Verify the error was logged for the failed operation + mock_logger.error.assert_called_with("Operation 'deployment' failed") + + @mock.patch("crud.main.AzureNodePoolCRUD") + def test_handle_workload_operations_multiple_deployments_success(self, mock_azure_crud): + """Test handle_workload_operations with multiple deployments all succeeding. + + This test verifies that when create_deployment is called with multiple + deployments (number_of_deployments > 1) and all deployments succeed, + the function returns success (exit code 0). 
+ """ + # Setup - configure for multiple deployments + mock_args = mock.MagicMock() + mock_args.command = "deployment" + mock_args.node_pool_name = "test-nodepool" + mock_args.replicas = 10 + mock_args.manifest_dir = "/path/to/manifests" + mock_args.number_of_deployments = 5 # Multiple deployments + + # Configure mock to return True (all deployments succeeded) + mock_azure_crud.create_deployment.return_value = True + + # Execute + result = handle_workload_operations(mock_azure_crud, mock_args) + + # Verify + self.assertEqual(result, 0) # 0 means success + mock_azure_crud.create_deployment.assert_called_once_with( + node_pool_name="test-nodepool", + replicas=10, + manifest_dir="/path/to/manifests", + number_of_deployments=5 + ) + class TestCollectBenchmarkResults(unittest.TestCase): """Tests for the collect_benchmark_results function""" diff --git a/pipelines/system/new-pipeline-test.yml b/pipelines/system/new-pipeline-test.yml index 63d55f02d9..9e2469a9a6 100644 --- a/pipelines/system/new-pipeline-test.yml +++ b/pipelines/system/new-pipeline-test.yml @@ -1,25 +1,35 @@ trigger: none variables: - SCENARIO_TYPE: - SCENARIO_NAME: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: k8s-gpu-cluster-crud stages: - - stage: # format: [_]+ (e.g. azure_eastus2, aws_eastus_westus) + - stage: azure_eastus2_node_pool_crud dependsOn: [] jobs: - - template: /jobs/competitive-test.yml # must keep as is + - template: /jobs/competitive-test.yml parameters: - cloud: # e.g. azure, aws - regions: # list of regions - - region1 # e.g. eastus2 - topology: # e.g. cluster-autoscaler - engine: # e.g. 
clusterloader2 - matrix: # list of test parameters to customize the provisioned resources - : - : - : - max_parallel: # required - credential_type: service_connection # required + cloud: azure + regions: + - eastus2 + topology: k8s-crud-gpu + engine: crud + matrix: + node_pool_crud_standard: + VM_SIZE: Standard_NC24ads_A100_v4 + CREATE_NODE_COUNT: 1 + SCALE_NODE_COUNT: 2 + SCALE_STEP_SIZE: 1 + POOL_NAME: testpool + STEP_TIME_OUT: 600 + GPU_NODE_POOL: "" + STEP_WAIT_TIME: 30 + DEPLOYMENT_NAME: testdeployment + NUMBER_OF_DEPLOYMENTS: 1 + REPLICAS: 10 + MANIFEST_DIR: $(Pipeline.Workspace)/s/modules/python/crud/workload_templates + max_parallel: 1 + credential_type: service_connection ssh_key_enabled: false - timeout_in_minutes: 60 # if not specified, default is 60 + timeout_in_minutes: 60 diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml index 354b67e828..1abdaef8f0 100644 --- a/steps/engine/crud/k8s/execute.yml +++ b/steps/engine/crud/k8s/execute.yml @@ -9,6 +9,10 @@ parameters: step_time_out: 600 step_wait_time: 30 gpu_node_pool: false + deployment_name: "" + number_of_deployments: 1 + replicas: 10 + manifest_dir: "" steps: - script: | @@ -37,6 +41,7 @@ steps: --step-wait-time "$STEP_WAIT_TIME" \ --step-timeout "$STEP_TIME_OUT" \ ${GPU_NODE_POOL:+--gpu-node-pool} + displayName: 'Execute K8s Create & Scale Up Operations for ${{ parameters.cloud }}' workingDirectory: modules/python env: @@ -54,6 +59,34 @@ steps: ${{ if eq(parameters.cloud, 'aws') }}: CAPACITY_TYPE: $(CAPACITY_TYPE) +- script: | + set -eo pipefail + + # Deploy Workloads + PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" deployment \ + --cloud "$CLOUD" \ + --run-id "$RUN_ID" \ + --result-dir "$RESULT_DIR" \ + --node-pool-name "$POOL_NAME" \ + --number-of-deployments "$NUMBER_OF_DEPLOYMENTS" \ + --replicas "$REPLICAS" \ + --manifest-dir "$MANIFEST_DIR" \ + --step-timeout "$STEP_TIME_OUT" \ + 
${GPU_NODE_POOL:+--gpu-node-pool} + displayName: 'Execute K8s Workload operations for ${{ parameters.cloud }}' + workingDirectory: modules/python + env: + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/crud/main.py + POOL_NAME: ${{ parameters.pool_name }} + CLOUD: ${{ parameters.cloud }} + STEP_TIME_OUT: ${{ parameters.step_time_out }} + RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID) + GPU_NODE_POOL: ${{ parameters.gpu_node_pool }} + DEPLOYMENT_NAME: ${{ parameters.deployment_name }} + NUMBER_OF_DEPLOYMENTS: ${{ parameters.number_of_deployments }} + REPLICAS: ${{ parameters.replicas }} + MANIFEST_DIR: ${{ parameters.manifest_dir }} + - script: | set -eo pipefail diff --git a/steps/topology/k8s-crud-gpu/execute-crud.yml b/steps/topology/k8s-crud-gpu/execute-crud.yml index 166a123e38..847a623be4 100644 --- a/steps/topology/k8s-crud-gpu/execute-crud.yml +++ b/steps/topology/k8s-crud-gpu/execute-crud.yml @@ -22,3 +22,7 @@ steps: result_dir: $(System.DefaultWorkingDirectory)/$(RUN_ID) gpu_node_pool: $(GPU_NODE_POOL) step_wait_time: $(STEP_WAIT_TIME) + deployment_name: $(DEPLOYMENT_NAME) + number_of_deployments: $(NUMBER_OF_DEPLOYMENTS) + replicas: $(REPLICAS) + manifest_dir: $(MANIFEST_DIR)