diff --git a/src/aks-preview/HISTORY.rst b/src/aks-preview/HISTORY.rst index d96ada3863a..222307e1904 100644 --- a/src/aks-preview/HISTORY.rst +++ b/src/aks-preview/HISTORY.rst @@ -11,6 +11,7 @@ To release a new version, please select a new version number (usually plus 1 to Pending +++++++ +* `az aks nodepool add/update`: Add `--gpu-mig-strategy` option to set the MIG (Multi-Instance GPU) strategy of a node pool. * `az aks create/update`: Add `--outbound-type managedNATGatewayV2` support using Azure NAT Gateway Standard V2 SKU with IPv6, user-provided IPs, and IP prefixes. * Fix monitoring addon key casing compatibility with azure-cli/acs diff --git a/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json b/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json index a9138382806..abfd26e419c 100644 --- a/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json +++ b/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json @@ -24,7 +24,8 @@ "test_aks_nodepool_add_with_gpu_instance_profile", "test_aks_gpu_driver_type", "test_aks_nodepool_add_with_enable_managed_gpu", - "test_aks_nodepool_update_with_enable_managed_gpu" + "test_aks_nodepool_update_with_enable_managed_gpu", + "test_aks_nodepool_add_with_gpu_mig_strategy" ], "pod ip allocation mode static block, missing feature registration": [ "test_aks_create_with_pod_ip_allocation_mode_static_block" diff --git a/src/aks-preview/azext_aks_preview/_consts.py b/src/aks-preview/azext_aks_preview/_consts.py index ad3f5e6ec43..6a07919e3f0 100644 --- a/src/aks-preview/azext_aks_preview/_consts.py +++ b/src/aks-preview/azext_aks_preview/_consts.py @@ -406,6 +406,10 @@ CONST_GPU_DRIVER_TYPE_CUDA = "CUDA" CONST_GPU_DRIVER_TYPE_GRID = "GRID" +# GPU MIG Strategy Consts +CONST_GPU_MIG_STRATEGY_SINGLE = "Single" +CONST_GPU_MIG_STRATEGY_MIXED = "Mixed" + # k8s extension constants CONST_K8S_EXTENSION_CUSTOM_MOD_NAME = "azext_k8s_extension.custom" 
CONST_K8S_EXTENSION_CLIENT_FACTORY_MOD_NAME = "azext_k8s_extension._client_factory" diff --git a/src/aks-preview/azext_aks_preview/_help.py b/src/aks-preview/azext_aks_preview/_help.py index f71fc9ed92d..cb0e32e7d8a 100644 --- a/src/aks-preview/azext_aks_preview/_help.py +++ b/src/aks-preview/azext_aks_preview/_help.py @@ -2230,6 +2230,9 @@ - name: --driver-type type: string short-summary: Specify the type of GPU driver to install when creating Windows agent pools. Valid values are "GRID" and "CUDA". If not provided, AKS selects the driver based on system compatibility. This option cannot be changed once the AgentPool has been created. The default is system selected. + - name: --gpu-mig-strategy + type: string + short-summary: Specify the MIG (Multi-Instance GPU) strategy for managed MIG support. Valid values are "Single" and "Mixed". When not specified, managed MIG is disabled. - name: --ssh-access type: string short-summary: Configure SSH setting for the node pool. Use "disabled" to disable SSH access, "localuser" to enable SSH access using private key. 
diff --git a/src/aks-preview/azext_aks_preview/_params.py b/src/aks-preview/azext_aks_preview/_params.py index 22de352806d..4092076b468 100644 --- a/src/aks-preview/azext_aks_preview/_params.py +++ b/src/aks-preview/azext_aks_preview/_params.py @@ -158,6 +158,8 @@ CONST_APP_ROUTING_NONE_NGINX, CONST_GPU_DRIVER_TYPE_CUDA, CONST_GPU_DRIVER_TYPE_GRID, + CONST_GPU_MIG_STRATEGY_SINGLE, + CONST_GPU_MIG_STRATEGY_MIXED, CONST_ADVANCED_NETWORKPOLICIES_NONE, CONST_ADVANCED_NETWORKPOLICIES_FQDN, CONST_ADVANCED_NETWORKPOLICIES_L7, @@ -548,6 +550,11 @@ CONST_GPU_DRIVER_TYPE_GRID, ] +gpu_mig_strategies = [ + CONST_GPU_MIG_STRATEGY_SINGLE, + CONST_GPU_MIG_STRATEGY_MIXED, +] + upgrade_strategies = [ CONST_UPGRADE_STRATEGY_ROLLING, CONST_UPGRADE_STRATEGY_BLUE_GREEN, @@ -2114,7 +2121,7 @@ def load_arguments(self, _): ) c.argument( "enable_managed_gpu", - action="store_true", + arg_type=get_three_state_flag(), is_preview=True, help="Enable the Managed GPU experience.", ) @@ -2143,6 +2150,12 @@ def load_arguments(self, _): arg_type=get_enum_type(gpu_driver_types), is_preview=True, ) + c.argument( + "gpu_mig_strategy", + arg_type=get_enum_type(gpu_mig_strategies), + is_preview=True, + help="Specify the GPU Multi-Instance GPU (MIG) strategy. Allowed values: Single, Mixed.", + ) # in creation scenario, use "localuser" as default c.argument( 'ssh_access', @@ -2229,9 +2242,9 @@ def load_arguments(self, _): ) c.argument( "enable_managed_gpu", - action="store_true", + arg_type=get_three_state_flag(), is_preview=True, - help="Enable the Managed GPU experience.", + help="Enable or disable the Managed GPU experience.", ) c.argument( "os_sku", @@ -2286,6 +2299,12 @@ def load_arguments(self, _): "gpu_driver", arg_type=get_enum_type(gpu_driver_install_modes) ) + c.argument( + "gpu_mig_strategy", + arg_type=get_enum_type(gpu_mig_strategies), + is_preview=True, + help="Specify the GPU Multi-Instance GPU (MIG) strategy. 
Allowed values: Single, Mixed.", + ) with self.argument_context("aks nodepool upgrade") as c: # upgrade strategy diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index 6446ac61edc..35655a08320 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -752,6 +752,25 @@ def get_driver_type(self) -> Union[str, None]: return driver_type + def get_gpu_mig_strategy(self) -> Union[str, None]: + """Obtain the value of gpu_mig_strategy. + :return: str or None + """ + # read the original value passed by the command + gpu_mig_strategy = self.raw_param.get("gpu_mig_strategy") + + # In create mode, try to read the property value corresponding to the parameter from the `agentpool` object + if self.decorator_mode == DecoratorMode.CREATE: + if ( + self.agentpool and + self.agentpool.gpu_profile is not None and + self.agentpool.gpu_profile.nvidia is not None and + self.agentpool.gpu_profile.nvidia.mig_strategy is not None + ): + gpu_mig_strategy = self.agentpool.gpu_profile.nvidia.mig_strategy + + return gpu_mig_strategy + def get_enable_secure_boot(self) -> bool: """Obtain the value of enable_secure_boot. 
:return: bool @@ -1365,6 +1384,20 @@ def set_up_driver_type(self, agentpool: AgentPool) -> AgentPool: agentpool.gpu_profile.driver_type = driver_type return agentpool + def set_up_gpu_mig_strategy(self, agentpool: AgentPool) -> AgentPool: + """Set up gpu mig strategy property for the AgentPool object.""" + self._ensure_agentpool(agentpool) + + gpu_mig_strategy = self.context.get_gpu_mig_strategy() + if gpu_mig_strategy is not None: + if agentpool.gpu_profile is None: + agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member + if agentpool.gpu_profile.nvidia is None: + agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member + agentpool.gpu_profile.nvidia.mig_strategy = gpu_mig_strategy + agentpool.gpu_profile.driver = CONST_GPU_DRIVER_INSTALL + return agentpool + def set_up_pod_ip_allocation_mode(self, agentpool: AgentPool) -> AgentPool: """Set up pod ip allocation mode for the AgentPool object.""" self._ensure_agentpool(agentpool) @@ -1557,6 +1590,8 @@ def construct_agentpool_profile_preview(self) -> AgentPool: agentpool = self.set_up_gpu_profile(agentpool) # set up driver_type agentpool = self.set_up_driver_type(agentpool) + # set up gpu_mig_strategy + agentpool = self.set_up_gpu_mig_strategy(agentpool) # set up agentpool ssh access agentpool = self.set_up_ssh_access(agentpool) # set up agentpool pod ip allocation mode @@ -1733,6 +1768,20 @@ def update_gpu_profile(self, agentpool: AgentPool) -> AgentPool: agentpool.gpu_profile.driver = gpu_driver return agentpool + def update_gpu_mig_strategy(self, agentpool: AgentPool) -> AgentPool: + """Update gpu mig strategy property for the AgentPool object.""" + self._ensure_agentpool(agentpool) + + gpu_mig_strategy = self.context.get_gpu_mig_strategy() + if gpu_mig_strategy is not None: + if agentpool.gpu_profile is None: + agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member + if agentpool.gpu_profile.nvidia is None: + 
agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member + agentpool.gpu_profile.nvidia.mig_strategy = gpu_mig_strategy + agentpool.gpu_profile.driver = CONST_GPU_DRIVER_INSTALL + return agentpool + def update_artifact_streaming(self, agentpool: AgentPool) -> AgentPool: """Update artifact streaming property for the AgentPool object. :return: the AgentPool object @@ -1760,10 +1809,16 @@ def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool: agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member if agentpool.gpu_profile.nvidia is None: agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member + # Check if already set to the desired value to avoid API error + if agentpool.gpu_profile.nvidia.management_mode == CONST_GPU_MANAGEMENT_MODE_MANAGED: + return agentpool agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED agentpool.gpu_profile.driver = CONST_GPU_DRIVER_INSTALL else: if agentpool.gpu_profile and agentpool.gpu_profile.nvidia: + # Check if already set to the desired value to avoid API error + if agentpool.gpu_profile.nvidia.management_mode == CONST_GPU_MANAGEMENT_MODE_UNMANAGED: + return agentpool agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED return agentpool @@ -1925,6 +1980,9 @@ def update_agentpool_profile_preview(self, agentpools: List[AgentPool] = None) - # update gpu profile agentpool = self.update_gpu_profile(agentpool) + # update gpu mig strategy + agentpool = self.update_gpu_mig_strategy(agentpool) + return agentpool def update_auto_scaler_properties(self, agentpool: AgentPool) -> AgentPool: diff --git a/src/aks-preview/azext_aks_preview/custom.py b/src/aks-preview/azext_aks_preview/custom.py index ab273f365dd..d7e11b622b7 100644 --- a/src/aks-preview/azext_aks_preview/custom.py +++ b/src/aks-preview/azext_aks_preview/custom.py @@ -1934,6 +1934,7 @@ def aks_agentpool_add( 
skip_gpu_driver_install=False, gpu_driver=None, driver_type=None, + gpu_mig_strategy=None, ssh_access=CONST_SSH_ACCESS_LOCALUSER, # trusted launch enable_secure_boot=False, @@ -2022,6 +2023,7 @@ def aks_agentpool_update( localdns_config=None, node_vm_size=None, gpu_driver=None, + gpu_mig_strategy=None, ): # DO NOT MOVE: get all the original parameters and save them as a dictionary raw_parameters = locals() diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py index 8bbe8fe9028..2797f99ea4b 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py @@ -434,6 +434,66 @@ def common_get_driver_type(self): ctx_0.attach_agentpool(agentpool_0) self.assertEqual(ctx_0.get_driver_type(), "CUDA") + def common_get_gpu_mig_strategy(self): + # default + ctx_1 = AKSPreviewAgentPoolContext( + self.cmd, + AKSAgentPoolParamDict({"gpu_mig_strategy": None}), + self.models, + DecoratorMode.CREATE, + self.agentpool_decorator_mode, + ) + self.assertEqual(ctx_1.get_gpu_mig_strategy(), None) + agentpool_1 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + mig_strategy="Single" + ) + ) + ) + + ctx_1.attach_agentpool(agentpool_1) + self.assertEqual(ctx_1.get_gpu_mig_strategy(), "Single") + + # default + ctx_2 = AKSPreviewAgentPoolContext( + self.cmd, + AKSAgentPoolParamDict({"gpu_mig_strategy": None}), + self.models, + DecoratorMode.CREATE, + self.agentpool_decorator_mode, + ) + self.assertEqual(ctx_2.get_gpu_mig_strategy(), None) + agentpool_2 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + mig_strategy="Mixed" + ) + ) + ) + ctx_2.attach_agentpool(agentpool_2) + self.assertEqual(ctx_2.get_gpu_mig_strategy(), "Mixed") + + # 
custom + ctx_0 = AKSPreviewAgentPoolContext( + self.cmd, + AKSAgentPoolParamDict({"gpu_mig_strategy": "Single"}), + self.models, + DecoratorMode.CREATE, + self.agentpool_decorator_mode, + ) + self.assertEqual(ctx_0.get_gpu_mig_strategy(), "Single") + agentpool_0 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + mig_strategy=None + ) + ) + ) + + ctx_0.attach_agentpool(agentpool_0) + self.assertEqual(ctx_0.get_gpu_mig_strategy(), "Single") + def common_get_os_sku(self): # default ctx_1 = AKSPreviewAgentPoolContext( @@ -1095,6 +1155,9 @@ def test_get_gpu_driver(self): def test_get_driver_type(self): self.common_get_driver_type() + def test_get_gpu_mig_strategy(self): + self.common_get_gpu_mig_strategy() + def test_get_enable_secure_boot(self): self.common_get_enable_secure_boot() diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py index a0f8bf45967..01b2a6a56ce 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py @@ -5586,6 +5586,64 @@ def test_aks_nodepool_add_with_gpu_instance_profile( checks=[self.is_empty()], ) + @live_only() + @AllowLargeResponse() + @AKSCustomResourceGroupPreparer( + random_name_length=17, name_prefix="clitest", location="westus3" + ) + def test_aks_nodepool_add_with_gpu_mig_strategy( + self, resource_group, resource_group_location + ): + aks_name = self.create_random_name("cliakstest", 16) + node_pool_name = self.create_random_name("c", 6) + node_pool_name_second = self.create_random_name("c", 6) + self.kwargs.update( + { + "resource_group": resource_group, + "name": aks_name, + "node_pool_name": node_pool_name, + "node_pool_name_second": node_pool_name_second, + "ssh_key_value": self.generate_ssh_keys(), + } + ) + + create_cmd = ( + "aks create 
--resource-group={resource_group} --name={name} " + "--nodepool-name {node_pool_name} -c 1 " + "--ssh-key-value={ssh_key_value}" + ) + self.cmd( + create_cmd, + checks=[ + self.check("provisioningState", "Succeeded"), + ], + ) + + # nodepool add with gpu-mig-strategy + self.cmd( + "aks nodepool add " + "--resource-group={resource_group} " + "--cluster-name={name} " + "--name={node_pool_name_second} " + "--enable-managed-gpu=true " + "--gpu-instance-profile=MIG3g " + "--gpu-mig-strategy=Single " + "-c 1 " + "--aks-custom-headers UseGPUDedicatedVHD=true " + "--node-vm-size=Standard_NC24ads_A100_v4", + checks=[ + self.check("provisioningState", "Succeeded"), + self.check("gpuInstanceProfile", "MIG3g"), + self.check("gpuProfile.nvidia.migStrategy", "Single"), + ], + ) + + # delete + self.cmd( + "aks delete -g {resource_group} -n {name} --yes --no-wait", + checks=[self.is_empty()], + ) + @live_only() # live only due to workspace is not mocked correctly and role assignment is not mocked @AllowLargeResponse() @AKSCustomResourceGroupPreparer( @@ -6978,7 +7036,7 @@ def test_aks_nodepool_add_with_enable_managed_gpu( self.cmd( "aks nodepool add --resource-group={resource_group} --cluster-name={name} --name={node_pool_name} " "--node-vm-size={node_vm_size} --node-count 1 " - " --enable-managed-gpu", + "--enable-managed-gpu=true ", checks=[ self.check("provisioningState", "Succeeded"), self.check("gpuProfile.driver", "Install"), @@ -16715,7 +16773,7 @@ def test_aks_nodepool_update_with_enable_managed_gpu( "--resource-group={resource_group} " "--cluster-name={name} " "--name={node_pool_name} " - "--enable-managed-gpu", + "--enable-managed-gpu=true ", checks=[ self.check("provisioningState", "Succeeded"), self.check("gpuProfile.driver", "Install"), diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py b/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py index a9d5f9548ab..d10f5f6c9bf 100644 --- 
a/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py @@ -134,6 +134,7 @@ def test_update_agentpool_profile_preview_default_behavior(self): decorator.update_upgrade_strategy = Mock(return_value=agentpool) decorator.update_blue_green_upgrade_settings = Mock(return_value=agentpool) decorator.update_gpu_profile = Mock(return_value=agentpool) + decorator.update_gpu_mig_strategy = Mock(return_value=agentpool) # Act result = decorator.update_agentpool_profile_preview() @@ -158,6 +159,7 @@ def test_update_agentpool_profile_preview_default_behavior(self): decorator.update_upgrade_strategy.assert_called_once_with(agentpool) decorator.update_blue_green_upgrade_settings.assert_called_once_with(agentpool) decorator.update_gpu_profile.assert_called_once_with(agentpool) + decorator.update_gpu_mig_strategy.assert_called_once_with(agentpool) def test_update_agentpool_profile_preview_with_agentpools_parameter(self): """Test update_agentpool_profile_preview with agentpools parameter.""" @@ -200,6 +202,7 @@ def test_update_agentpool_profile_preview_with_agentpools_parameter(self): decorator.update_upgrade_strategy = Mock(return_value=agentpool) decorator.update_blue_green_upgrade_settings = Mock(return_value=agentpool) decorator.update_gpu_profile = Mock(return_value=agentpool) + decorator.update_gpu_mig_strategy = Mock(return_value=agentpool) # Act result = decorator.update_agentpool_profile_preview(agentpools) @@ -361,6 +364,7 @@ def test_update_agentpool_profile_preview_system_mode_regular_flow(self): decorator.update_upgrade_strategy = Mock(return_value=agentpool) decorator.update_blue_green_upgrade_settings = Mock(return_value=agentpool) decorator.update_gpu_profile = Mock(return_value=agentpool) + decorator.update_gpu_mig_strategy = Mock(return_value=agentpool) # Act result = decorator.update_agentpool_profile_preview() @@ -383,6 +387,7 @@ def 
test_update_agentpool_profile_preview_system_mode_regular_flow(self): decorator.update_upgrade_strategy.assert_called_once_with(agentpool) decorator.update_blue_green_upgrade_settings.assert_called_once_with(agentpool) decorator.update_gpu_profile.assert_called_once_with(agentpool) + decorator.update_gpu_mig_strategy.assert_called_once_with(agentpool) def test_update_agentpool_profile_preview_execution_order(self): """Test that update methods are called in the correct order.""" @@ -430,6 +435,7 @@ def mock_method(pool): decorator.update_upgrade_strategy = create_mock_update_method("update_upgrade_strategy") decorator.update_blue_green_upgrade_settings = create_mock_update_method("update_blue_green_upgrade_settings") decorator.update_gpu_profile = create_mock_update_method("update_gpu_profile") + decorator.update_gpu_mig_strategy = create_mock_update_method("update_gpu_mig_strategy") # Act decorator.update_agentpool_profile_preview() @@ -449,6 +455,7 @@ def mock_method(pool): "update_upgrade_strategy", "update_blue_green_upgrade_settings", "update_gpu_profile", + "update_gpu_mig_strategy", ] self.assertEqual(call_order, expected_order) @@ -498,6 +505,7 @@ def track_and_return(pool): decorator.update_upgrade_strategy = create_tracking_mock("update_upgrade_strategy") decorator.update_blue_green_upgrade_settings = create_tracking_mock("update_blue_green_upgrade_settings") decorator.update_gpu_profile = create_tracking_mock("update_gpu_profile") + decorator.update_gpu_mig_strategy = create_tracking_mock("update_gpu_mig_strategy") # Act result = decorator.update_agentpool_profile_preview() @@ -560,7 +568,8 @@ def test_update_agentpool_profile_preview_mixed_modes_scenario(self): 'update_network_profile', 'update_artifact_streaming', 'update_managed_gpu', 'update_secure_boot', 'update_vtpm', 'update_os_sku', 'update_fips_image', 'update_ssh_access', 'update_localdns_profile', 'update_auto_scaler_properties_vms', - 'update_upgrade_strategy', 
'update_blue_green_upgrade_settings', 'update_gpu_profile' + 'update_upgrade_strategy', 'update_blue_green_upgrade_settings', 'update_gpu_profile', + 'update_gpu_mig_strategy' ] for method_name in update_methods: @@ -634,6 +643,7 @@ def test_update_agentpool_profile_preview_managed_cluster_mode(self): decorator.update_upgrade_strategy = Mock(return_value=agentpool) decorator.update_blue_green_upgrade_settings = Mock(return_value=agentpool) decorator.update_gpu_profile = Mock(return_value=agentpool) + decorator.update_gpu_mig_strategy = Mock(return_value=agentpool) # Act result = decorator.update_agentpool_profile_preview(agentpools)