From 147cf5e2fa1a0b721e0f102fff86ecdd40c20888 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Wed, 25 Mar 2026 22:13:39 +0000 Subject: [PATCH 01/11] mig-strategy --- src/aks-preview/azext_aks_preview/_consts.py | 4 ++ src/aks-preview/azext_aks_preview/_help.py | 3 + src/aks-preview/azext_aks_preview/_params.py | 20 +++++- .../azext_aks_preview/agentpool_decorator.py | 58 ++++++++++++++++- src/aks-preview/azext_aks_preview/custom.py | 2 + .../tests/latest/test_agentpool_decorator.py | 63 +++++++++++++++++++ 6 files changed, 147 insertions(+), 3 deletions(-) diff --git a/src/aks-preview/azext_aks_preview/_consts.py b/src/aks-preview/azext_aks_preview/_consts.py index 10c430a9ce6..5fda7ab2204 100644 --- a/src/aks-preview/azext_aks_preview/_consts.py +++ b/src/aks-preview/azext_aks_preview/_consts.py @@ -405,6 +405,10 @@ CONST_GPU_DRIVER_TYPE_CUDA = "CUDA" CONST_GPU_DRIVER_TYPE_GRID = "GRID" +# GPU MIG Strategy Consts +CONST_GPU_MIG_STRATEGY_SINGLE = "Single" +CONST_GPU_MIG_STRATEGY_MIXED = "Mixed" + # k8s extension constants CONST_K8S_EXTENSION_CUSTOM_MOD_NAME = "azext_k8s_extension.custom" CONST_K8S_EXTENSION_CLIENT_FACTORY_MOD_NAME = "azext_k8s_extension._client_factory" diff --git a/src/aks-preview/azext_aks_preview/_help.py b/src/aks-preview/azext_aks_preview/_help.py index cac2a156f5c..43f304cc658 100644 --- a/src/aks-preview/azext_aks_preview/_help.py +++ b/src/aks-preview/azext_aks_preview/_help.py @@ -2230,6 +2230,9 @@ - name: --driver-type type: string short-summary: Specify the type of GPU driver to install when creating Windows agent pools. Valid values are "GRID" and "CUDA". If not provided, AKS selects the driver based on system compatibility. This option cannot be changed once the AgentPool has been created. The default is system selected. + - name: --gpu-mig-strategy + type: string + short-summary: Specify the MIG (Multi-Instance GPU) strategy for managed MIG support. Valid values are "Single", "Mixed", and "None". When not specified, the default is None. - name: --ssh-access type: string short-summary: Configure SSH setting for the node pool. Use "disabled" to disable SSH access, "localuser" to enable SSH access using private key. diff --git a/src/aks-preview/azext_aks_preview/_params.py b/src/aks-preview/azext_aks_preview/_params.py index b1c272c44d9..983db6927f5 100644 --- a/src/aks-preview/azext_aks_preview/_params.py +++ b/src/aks-preview/azext_aks_preview/_params.py @@ -153,6 +153,9 @@ CONST_APP_ROUTING_NONE_NGINX, CONST_GPU_DRIVER_TYPE_CUDA, CONST_GPU_DRIVER_TYPE_GRID, + CONST_GPU_MIG_STRATEGY_SINGLE, + CONST_GPU_MIG_STRATEGY_MIXED, + CONST_GPU_MIG_STRATEGY_NONE, CONST_ADVANCED_NETWORKPOLICIES_NONE, CONST_ADVANCED_NETWORKPOLICIES_FQDN, CONST_ADVANCED_NETWORKPOLICIES_L7, @@ -542,6 +545,11 @@ CONST_GPU_DRIVER_TYPE_GRID, ] +gpu_mig_strategies = [ + CONST_GPU_MIG_STRATEGY_SINGLE, + CONST_GPU_MIG_STRATEGY_MIXED, +] + upgrade_strategies = [ CONST_UPGRADE_STRATEGY_ROLLING, CONST_UPGRADE_STRATEGY_BLUE_GREEN, @@ -2038,7 +2046,6 @@ def load_arguments(self, _): ) c.argument( "enable_managed_gpu", - action="store_true", is_preview=True, help="Enable the Managed GPU experience.", ) @@ -2067,6 +2074,11 @@ def load_arguments(self, _): arg_type=get_enum_type(gpu_driver_types), is_preview=True, ) + c.argument( + "gpu_mig_strategy", + arg_type=get_enum_type(gpu_mig_strategies), + is_preview=True, + ) # in creation scenario, use "localuser" as default c.argument( 'ssh_access', @@ -2153,7 +2165,6 @@ def load_arguments(self, _): ) c.argument( "enable_managed_gpu", - action="store_true", is_preview=True, help="Enable the Managed GPU experience.", ) @@ -2210,6 +2221,11 @@ def load_arguments(self, _): "gpu_driver", arg_type=get_enum_type(gpu_driver_install_modes) ) + c.argument( + "gpu_mig_strategy", + arg_type=get_enum_type(gpu_mig_strategies), + is_preview=True, + ) with self.argument_context("aks nodepool upgrade") as c: # upgrade strategy diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index 6446ac61edc..ce228e0d3fc 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -752,6 +752,25 @@ def get_driver_type(self) -> Union[str, None]: return driver_type + def get_gpu_mig_strategy(self) -> Union[str, None]: + """Obtain the value of gpu_mig_strategy. + :return: str or None + """ + # read the original value passed by the command + gpu_mig_strategy = self.raw_param.get("gpu_mig_strategy") + + # In create mode, try to read the property value corresponding to the parameter from the `agentpool` object + if self.decorator_mode == DecoratorMode.CREATE: + if ( + self.agentpool and + self.agentpool.gpu_profile is not None and + self.agentpool.gpu_profile.nvidia is not None and + self.agentpool.gpu_profile.nvidia.mig_strategy is not None + ): + gpu_mig_strategy = self.agentpool.gpu_profile.nvidia.mig_strategy + + return gpu_mig_strategy + def get_enable_secure_boot(self) -> bool: """Obtain the value of enable_secure_boot. :return: bool @@ -1365,6 +1384,19 @@ def set_up_driver_type(self, agentpool: AgentPool) -> AgentPool: agentpool.gpu_profile.driver_type = driver_type return agentpool + def set_up_gpu_mig_strategy(self, agentpool: AgentPool) -> AgentPool: + """Set up gpu mig strategy property for the AgentPool object.""" + self._ensure_agentpool(agentpool) + + gpu_mig_strategy = self.context.get_gpu_mig_strategy() + if gpu_mig_strategy is not None: + if agentpool.gpu_profile is None: + agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member + if agentpool.gpu_profile.nvidia is None: + agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member + agentpool.gpu_profile.nvidia.mig_strategy = gpu_mig_strategy + return agentpool + def set_up_pod_ip_allocation_mode(self, agentpool: AgentPool) -> AgentPool: """Set up pod ip allocation mode for the AgentPool object.""" self._ensure_agentpool(agentpool) @@ -1557,6 +1589,8 @@ def construct_agentpool_profile_preview(self) -> AgentPool: agentpool = self.set_up_gpu_profile(agentpool) # set up driver_type agentpool = self.set_up_driver_type(agentpool) + # set up gpu_mig_strategy + agentpool = self.set_up_gpu_mig_strategy(agentpool) # set up agentpool ssh access agentpool = self.set_up_ssh_access(agentpool) # set up agentpool pod ip allocation mode @@ -1733,6 +1767,19 @@ def update_gpu_profile(self, agentpool: AgentPool) -> AgentPool: agentpool.gpu_profile.driver = gpu_driver return agentpool + def update_gpu_mig_strategy(self, agentpool: AgentPool) -> AgentPool: + """Update gpu mig strategy property for the AgentPool object.""" + self._ensure_agentpool(agentpool) + + gpu_mig_strategy = self.context.get_gpu_mig_strategy() + if gpu_mig_strategy is not None: + if agentpool.gpu_profile is None: + agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member + if agentpool.gpu_profile.nvidia is None: + agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member + agentpool.gpu_profile.nvidia.mig_strategy = gpu_mig_strategy + return agentpool + def update_artifact_streaming(self, agentpool: AgentPool) -> AgentPool: """Update artifact streaming property for the AgentPool object. :return: the AgentPool object @@ -1760,11 +1807,17 @@ def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool: agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member if agentpool.gpu_profile.nvidia is None: agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member + # Check if already set to the desired value to avoid API error + if agentpool.gpu_profile.nvidia.management_mode == CONST_GPU_MANAGEMENT_MODE_MANAGED: + return agentpool agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED agentpool.gpu_profile.driver = CONST_GPU_DRIVER_INSTALL else: if agentpool.gpu_profile and agentpool.gpu_profile.nvidia: - agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED + # Check if already set to the desired value to avoid API error + if agentpool.gpu_profile.nvidia.management_mode == CONST_GPU_MANAGEMENT_MODE_UNMANAGED: + return agentpool + agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED return agentpool @@ -1925,6 +1978,9 @@ def update_agentpool_profile_preview(self, agentpools: List[AgentPool] = None) - # update gpu profile agentpool = self.update_gpu_profile(agentpool) + # update gpu mig strategy + agentpool = self.update_gpu_mig_strategy(agentpool) + return agentpool def update_auto_scaler_properties(self, agentpool: AgentPool) -> AgentPool: diff --git a/src/aks-preview/azext_aks_preview/custom.py b/src/aks-preview/azext_aks_preview/custom.py index acc7311ccdf..f3bb93cf579 100644 --- a/src/aks-preview/azext_aks_preview/custom.py +++ b/src/aks-preview/azext_aks_preview/custom.py @@ -1928,6 +1928,7 @@ def aks_agentpool_add( skip_gpu_driver_install=False, gpu_driver=None, driver_type=None, + gpu_mig_strategy=None, ssh_access=CONST_SSH_ACCESS_LOCALUSER, # trusted launch enable_secure_boot=False, @@ -2016,6 +2017,7 @@ def aks_agentpool_update( localdns_config=None, node_vm_size=None, gpu_driver=None, + gpu_mig_strategy=None, ): # DO NOT MOVE: get all the original parameters and save them as a dictionary raw_parameters = locals() diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py index 8bbe8fe9028..2797f99ea4b 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_agentpool_decorator.py @@ -434,6 +434,66 @@ def common_get_driver_type(self): ctx_0.attach_agentpool(agentpool_0) self.assertEqual(ctx_0.get_driver_type(), "CUDA") + def common_get_gpu_mig_strategy(self): + # default + ctx_1 = AKSPreviewAgentPoolContext( + self.cmd, + AKSAgentPoolParamDict({"gpu_mig_strategy": None}), + self.models, + DecoratorMode.CREATE, + self.agentpool_decorator_mode, + ) + self.assertEqual(ctx_1.get_gpu_mig_strategy(), None) + agentpool_1 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + mig_strategy="Single" + ) + ) + ) + + ctx_1.attach_agentpool(agentpool_1) + self.assertEqual(ctx_1.get_gpu_mig_strategy(), "Single") + + # default + ctx_2 = AKSPreviewAgentPoolContext( + self.cmd, + AKSAgentPoolParamDict({"gpu_mig_strategy": None}), + self.models, + DecoratorMode.CREATE, + self.agentpool_decorator_mode, + ) + self.assertEqual(ctx_2.get_gpu_mig_strategy(), None) + agentpool_2 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + mig_strategy="Mixed" + ) + ) + ) + ctx_2.attach_agentpool(agentpool_2) + self.assertEqual(ctx_2.get_gpu_mig_strategy(), "Mixed") + + # custom + ctx_0 = AKSPreviewAgentPoolContext( + self.cmd, + AKSAgentPoolParamDict({"gpu_mig_strategy": "Single"}), + self.models, + DecoratorMode.CREATE, + self.agentpool_decorator_mode, + ) + self.assertEqual(ctx_0.get_gpu_mig_strategy(), "Single") + agentpool_0 = self.create_initialized_agentpool_instance( + gpu_profile=self.models.GPUProfile( + nvidia=self.models.NvidiaGPUProfile( + mig_strategy=None + ) + ) + ) + + ctx_0.attach_agentpool(agentpool_0) + self.assertEqual(ctx_0.get_gpu_mig_strategy(), "Single") + def common_get_os_sku(self): # default ctx_1 = AKSPreviewAgentPoolContext( @@ -1095,6 +1155,9 @@ def test_get_gpu_driver(self): def test_get_driver_type(self): self.common_get_driver_type() + def test_get_gpu_mig_strategy(self): + self.common_get_gpu_mig_strategy() + def test_get_enable_secure_boot(self): self.common_get_enable_secure_boot() From 8e1713e94d44b9a4738e9200ddd60bb2bb7c5ac0 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Wed, 25 Mar 2026 22:21:33 +0000 Subject: [PATCH 02/11] mig --- src/aks-preview/azext_aks_preview/_params.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aks-preview/azext_aks_preview/_params.py b/src/aks-preview/azext_aks_preview/_params.py index 983db6927f5..56dadc451ff 100644 --- a/src/aks-preview/azext_aks_preview/_params.py +++ b/src/aks-preview/azext_aks_preview/_params.py @@ -155,7 +155,6 @@ CONST_GPU_DRIVER_TYPE_GRID, CONST_GPU_MIG_STRATEGY_SINGLE, CONST_GPU_MIG_STRATEGY_MIXED, - CONST_GPU_MIG_STRATEGY_NONE, CONST_ADVANCED_NETWORKPOLICIES_NONE, CONST_ADVANCED_NETWORKPOLICIES_FQDN, CONST_ADVANCED_NETWORKPOLICIES_L7, From 887984b57824b851272ff91ef83bc028d0d05037 Mon Sep 17 00:00:00 2001 From: Runzhen Date: Wed, 25 Mar 2026 15:35:04 -0700 Subject: [PATCH 03/11] Update src/aks-preview/azext_aks_preview/_help.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/aks-preview/azext_aks_preview/_help.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aks-preview/azext_aks_preview/_help.py b/src/aks-preview/azext_aks_preview/_help.py index 43f304cc658..11a757bd6c0 100644 --- a/src/aks-preview/azext_aks_preview/_help.py +++ b/src/aks-preview/azext_aks_preview/_help.py @@ -2232,7 +2232,7 @@ short-summary: Specify the type of GPU driver to install when creating Windows agent pools. Valid values are "GRID" and "CUDA". If not provided, AKS selects the driver based on system compatibility. This option cannot be changed once the AgentPool has been created. The default is system selected. - name: --gpu-mig-strategy type: string - short-summary: Specify the MIG (Multi-Instance GPU) strategy for managed MIG support. Valid values are "Single", "Mixed", and "None". When not specified, the default is None. + short-summary: Specify the MIG (Multi-Instance GPU) strategy for managed MIG support. Valid values are "Single" and "Mixed". When not specified, managed MIG is disabled. - name: --ssh-access type: string short-summary: Configure SSH setting for the node pool. Use "disabled" to disable SSH access, "localuser" to enable SSH access using private key. From cb627191c69c03704dfc334d29aed6a4db621a51 Mon Sep 17 00:00:00 2001 From: Runzhen Date: Wed, 25 Mar 2026 15:35:20 -0700 Subject: [PATCH 04/11] Update src/aks-preview/azext_aks_preview/agentpool_decorator.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/aks-preview/azext_aks_preview/agentpool_decorator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index ce228e0d3fc..b1b8305ca7a 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -1817,7 +1817,7 @@ def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool: # Check if already set to the desired value to avoid API error if agentpool.gpu_profile.nvidia.management_mode == CONST_GPU_MANAGEMENT_MODE_UNMANAGED: return agentpool - agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED + agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED return agentpool From af3cab6a228ce2196e9df0501384708adc6a7703 Mon Sep 17 00:00:00 2001 From: Runzhen Date: Wed, 25 Mar 2026 15:35:42 -0700 Subject: [PATCH 05/11] Update src/aks-preview/azext_aks_preview/_params.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/aks-preview/azext_aks_preview/_params.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/aks-preview/azext_aks_preview/_params.py b/src/aks-preview/azext_aks_preview/_params.py index 56dadc451ff..ff35d3304b7 100644 --- a/src/aks-preview/azext_aks_preview/_params.py +++ b/src/aks-preview/azext_aks_preview/_params.py @@ -2164,8 +2164,9 @@ def load_arguments(self, _): ) c.argument( "enable_managed_gpu", + arg_type=get_three_state_flag(), is_preview=True, - help="Enable the Managed GPU experience.", + help="Enable or disable the Managed GPU experience.", ) c.argument( "os_sku", From 265543ef85dec9c2bab40443ee76f6805d3f5307 Mon Sep 17 00:00:00 2001 From: Runzhen Date: Wed, 25 Mar 2026 15:35:52 -0700 Subject: [PATCH 06/11] Update src/aks-preview/azext_aks_preview/_params.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/aks-preview/azext_aks_preview/_params.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/aks-preview/azext_aks_preview/_params.py b/src/aks-preview/azext_aks_preview/_params.py index ff35d3304b7..2f70860c9f5 100644 --- a/src/aks-preview/azext_aks_preview/_params.py +++ b/src/aks-preview/azext_aks_preview/_params.py @@ -2045,6 +2045,7 @@ def load_arguments(self, _): ) c.argument( "enable_managed_gpu", + arg_type=get_three_state_flag(), is_preview=True, help="Enable the Managed GPU experience.", ) From 677b06c36d5fb76ada5d8ae6b836d5544b97ca81 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Wed, 25 Mar 2026 22:37:20 +0000 Subject: [PATCH 07/11] mig --- src/aks-preview/HISTORY.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/src/aks-preview/HISTORY.rst b/src/aks-preview/HISTORY.rst index e2b97761e0f..04b684cae4c 100644 --- a/src/aks-preview/HISTORY.rst +++ b/src/aks-preview/HISTORY.rst @@ -16,6 +16,7 @@ Pending * `az aks create/update`: Add `--enable-service-account-image-pull`, `--disable-service-account-image-pull`, and `--service-account-image-pull-default-managed-identity-id` parameters to manage service account based image pull settings. * `az aks list-vm-skus`: New command to list available VM SKUs for AKS clusters in a given region. * Add managed GPU enablement option to node pool property in `az aks nodepool add` and `az aks nodepool update`. +* Add MIG (Multi-Instance GPU) strategy option to node pool property in `az aks nodepool add` and `az aks nodepool update`. 19.0.0b27 +++++++ From 18e32fc5f3295eec5ce9666ff819be06c304f83f Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Wed, 25 Mar 2026 23:20:52 +0000 Subject: [PATCH 08/11] gpu --- .../latest/test_update_agentpool_profile_preview.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py b/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py index a9d5f9548ab..d10f5f6c9bf 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_update_agentpool_profile_preview.py @@ -134,6 +134,7 @@ def test_update_agentpool_profile_preview_default_behavior(self): decorator.update_upgrade_strategy = Mock(return_value=agentpool) decorator.update_blue_green_upgrade_settings = Mock(return_value=agentpool) decorator.update_gpu_profile = Mock(return_value=agentpool) + decorator.update_gpu_mig_strategy = Mock(return_value=agentpool) # Act result = decorator.update_agentpool_profile_preview() @@ -158,6 +159,7 @@ def test_update_agentpool_profile_preview_default_behavior(self): decorator.update_upgrade_strategy.assert_called_once_with(agentpool) decorator.update_blue_green_upgrade_settings.assert_called_once_with(agentpool) decorator.update_gpu_profile.assert_called_once_with(agentpool) + decorator.update_gpu_mig_strategy.assert_called_once_with(agentpool) def test_update_agentpool_profile_preview_with_agentpools_parameter(self): """Test update_agentpool_profile_preview with agentpools parameter.""" @@ -200,6 +202,7 @@ def test_update_agentpool_profile_preview_with_agentpools_parameter(self): decorator.update_upgrade_strategy = Mock(return_value=agentpool) decorator.update_blue_green_upgrade_settings = Mock(return_value=agentpool) decorator.update_gpu_profile = Mock(return_value=agentpool) + decorator.update_gpu_mig_strategy = Mock(return_value=agentpool) # Act result = decorator.update_agentpool_profile_preview(agentpools) @@ -361,6 +364,7 @@ def test_update_agentpool_profile_preview_system_mode_regular_flow(self): decorator.update_upgrade_strategy = Mock(return_value=agentpool) decorator.update_blue_green_upgrade_settings = Mock(return_value=agentpool) decorator.update_gpu_profile = Mock(return_value=agentpool) + decorator.update_gpu_mig_strategy = Mock(return_value=agentpool) # Act result = decorator.update_agentpool_profile_preview() @@ -383,6 +387,7 @@ def test_update_agentpool_profile_preview_system_mode_regular_flow(self): decorator.update_upgrade_strategy.assert_called_once_with(agentpool) decorator.update_blue_green_upgrade_settings.assert_called_once_with(agentpool) decorator.update_gpu_profile.assert_called_once_with(agentpool) + decorator.update_gpu_mig_strategy.assert_called_once_with(agentpool) def test_update_agentpool_profile_preview_execution_order(self): """Test that update methods are called in the correct order.""" @@ -430,6 +435,7 @@ def mock_method(pool): decorator.update_upgrade_strategy = create_mock_update_method("update_upgrade_strategy") decorator.update_blue_green_upgrade_settings = create_mock_update_method("update_blue_green_upgrade_settings") decorator.update_gpu_profile = create_mock_update_method("update_gpu_profile") + decorator.update_gpu_mig_strategy = create_mock_update_method("update_gpu_mig_strategy") # Act decorator.update_agentpool_profile_preview() @@ -449,6 +455,7 @@ def mock_method(pool): "update_upgrade_strategy", "update_blue_green_upgrade_settings", "update_gpu_profile", + "update_gpu_mig_strategy", ] self.assertEqual(call_order, expected_order) @@ -498,6 +505,7 @@ def track_and_return(pool): decorator.update_upgrade_strategy = create_tracking_mock("update_upgrade_strategy") decorator.update_blue_green_upgrade_settings = create_tracking_mock("update_blue_green_upgrade_settings") decorator.update_gpu_profile = create_tracking_mock("update_gpu_profile") + decorator.update_gpu_mig_strategy = create_tracking_mock("update_gpu_mig_strategy") # Act result = decorator.update_agentpool_profile_preview() @@ -560,7 +568,8 @@ def test_update_agentpool_profile_preview_mixed_modes_scenario(self): 'update_network_profile', 'update_artifact_streaming', 'update_managed_gpu', 'update_secure_boot', 'update_vtpm', 'update_os_sku', 'update_fips_image', 'update_ssh_access', 'update_localdns_profile', 'update_auto_scaler_properties_vms', - 'update_upgrade_strategy', 'update_blue_green_upgrade_settings', 'update_gpu_profile' + 'update_upgrade_strategy', 'update_blue_green_upgrade_settings', 'update_gpu_profile', + 'update_gpu_mig_strategy' ] for method_name in update_methods: @@ -634,6 +643,7 @@ def test_update_agentpool_profile_preview_managed_cluster_mode(self): decorator.update_upgrade_strategy = Mock(return_value=agentpool) decorator.update_blue_green_upgrade_settings = Mock(return_value=agentpool) decorator.update_gpu_profile = Mock(return_value=agentpool) + decorator.update_gpu_mig_strategy = Mock(return_value=agentpool) # Act result = decorator.update_agentpool_profile_preview(agentpools) From 7be5e1bee6c1dd50e4901fba75b7176f3175b084 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Wed, 25 Mar 2026 23:45:27 +0000 Subject: [PATCH 09/11] mig --- src/aks-preview/azext_aks_preview/_params.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/aks-preview/azext_aks_preview/_params.py b/src/aks-preview/azext_aks_preview/_params.py index 2f70860c9f5..74117e84cf1 100644 --- a/src/aks-preview/azext_aks_preview/_params.py +++ b/src/aks-preview/azext_aks_preview/_params.py @@ -2078,6 +2078,7 @@ def load_arguments(self, _): "gpu_mig_strategy", arg_type=get_enum_type(gpu_mig_strategies), is_preview=True, + help="Specify the GPU Multi-Instance GPU (MIG) strategy. Allowed values: Single, Mixed.", ) # in creation scenario, use "localuser" as default c.argument( @@ -2226,6 +2227,7 @@ def load_arguments(self, _): "gpu_mig_strategy", arg_type=get_enum_type(gpu_mig_strategies), is_preview=True, + help="Specify the GPU Multi-Instance GPU (MIG) strategy. Allowed values: Single, Mixed.", ) with self.argument_context("aks nodepool upgrade") as c: From 4b4f6d096a87ffbd58cd845571fdc93d758c62e4 Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Thu, 26 Mar 2026 04:29:51 +0000 Subject: [PATCH 10/11] mig --- .../configs/ext_matrix_default.json | 3 +- .../azext_aks_preview/agentpool_decorator.py | 2 + .../tests/latest/test_aks_commands.py | 62 ++++++++++++++++++- 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json b/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json index e3e54bb1540..35dd08fe84f 100644 --- a/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json +++ b/src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json @@ -24,7 +24,8 @@ "test_aks_nodepool_add_with_gpu_instance_profile", "test_aks_gpu_driver_type", "test_aks_nodepool_add_with_enable_managed_gpu", - "test_aks_nodepool_update_with_enable_managed_gpu" + "test_aks_nodepool_update_with_enable_managed_gpu", + "test_aks_nodepool_add_with_gpu_mig_strategy" ], "pod ip allocation mode static block, missing feature registration": [ "test_aks_create_with_pod_ip_allocation_mode_static_block" diff --git a/src/aks-preview/azext_aks_preview/agentpool_decorator.py b/src/aks-preview/azext_aks_preview/agentpool_decorator.py index b1b8305ca7a..35655a08320 100644 --- a/src/aks-preview/azext_aks_preview/agentpool_decorator.py +++ b/src/aks-preview/azext_aks_preview/agentpool_decorator.py @@ -1395,6 +1395,7 @@ def set_up_gpu_mig_strategy(self, agentpool: AgentPool) -> AgentPool: if agentpool.gpu_profile.nvidia is None: agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member agentpool.gpu_profile.nvidia.mig_strategy = gpu_mig_strategy + agentpool.gpu_profile.driver = CONST_GPU_DRIVER_INSTALL return agentpool def set_up_pod_ip_allocation_mode(self, agentpool: AgentPool) -> AgentPool: @@ -1778,6 +1779,7 @@ def update_gpu_mig_strategy(self, agentpool: AgentPool) -> AgentPool: if agentpool.gpu_profile.nvidia is None: agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member agentpool.gpu_profile.nvidia.mig_strategy = gpu_mig_strategy + agentpool.gpu_profile.driver = CONST_GPU_DRIVER_INSTALL return agentpool def update_artifact_streaming(self, agentpool: AgentPool) -> AgentPool: diff --git a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py index a21bbbed4aa..718dfea9bb2 100644 --- a/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py +++ b/src/aks-preview/azext_aks_preview/tests/latest/test_aks_commands.py @@ -5515,6 +5515,64 @@ def test_aks_nodepool_add_with_gpu_instance_profile( checks=[self.is_empty()], ) + @live_only() + @AllowLargeResponse() + @AKSCustomResourceGroupPreparer( + random_name_length=17, name_prefix="clitest", location="westus3" + ) + def test_aks_nodepool_add_with_gpu_mig_strategy( + self, resource_group, resource_group_location + ): + aks_name = self.create_random_name("cliakstest", 16) + node_pool_name = self.create_random_name("c", 6) + node_pool_name_second = self.create_random_name("c", 6) + self.kwargs.update( + { + "resource_group": resource_group, + "name": aks_name, + "node_pool_name": node_pool_name, + "node_pool_name_second": node_pool_name_second, + "ssh_key_value": self.generate_ssh_keys(), + } + ) + + create_cmd = ( + "aks create --resource-group={resource_group} --name={name} " + "--nodepool-name {node_pool_name} -c 1 " + "--ssh-key-value={ssh_key_value}" + ) + self.cmd( + create_cmd, + checks=[ + self.check("provisioningState", "Succeeded"), + ], + ) + + # nodepool add with gpu-mig-strategy + self.cmd( + "aks nodepool add " + "--resource-group={resource_group} " + "--cluster-name={name} " + "--name={node_pool_name_second} " + "--enable-managed-gpu=true " + "--gpu-instance-profile=MIG3g " + "--gpu-mig-strategy=Single " + "-c 1 " + "--aks-custom-headers UseGPUDedicatedVHD=true " + "--node-vm-size=Standard_NC24ads_A100_v4", + checks=[ + self.check("provisioningState", "Succeeded"), + self.check("gpuInstanceProfile", "MIG3g"), + self.check("gpuProfile.nvidia.migStrategy", "Single"), + ], + ) + + # delete + self.cmd( + "aks delete -g {resource_group} -n {name} --yes --no-wait", + checks=[self.is_empty()], + ) + @live_only() # live only due to workspace is not mocked correctly and role assignment is not mocked @AllowLargeResponse() @AKSCustomResourceGroupPreparer( @@ -6907,7 +6965,7 @@ def test_aks_nodepool_add_with_enable_managed_gpu( self.cmd( "aks nodepool add --resource-group={resource_group} --cluster-name={name} --name={node_pool_name} " "--node-vm-size={node_vm_size} --node-count 1 " - " --enable-managed-gpu", + "--enable-managed-gpu=true ", checks=[ self.check("provisioningState", "Succeeded"), self.check("gpuProfile.driver", "Install"), @@ -16644,7 +16702,7 @@ def test_aks_nodepool_update_with_enable_managed_gpu( "--resource-group={resource_group} " "--cluster-name={name} " "--name={node_pool_name} " - "--enable-managed-gpu", + "--enable-managed-gpu=true ", checks=[ self.check("provisioningState", "Succeeded"), self.check("gpuProfile.driver", "Install"), From 611e24075d4140c66a7dc5714175fa9a18489ffb Mon Sep 17 00:00:00 2001 From: Runzhen Date: Wed, 25 Mar 2026 22:10:24 -0700 Subject: [PATCH 11/11] Add MIG strategy option to node pool properties --- src/aks-preview/HISTORY.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aks-preview/HISTORY.rst b/src/aks-preview/HISTORY.rst index 2a1d66b5fee..58e01adc320 100644 --- a/src/aks-preview/HISTORY.rst +++ b/src/aks-preview/HISTORY.rst @@ -11,6 +11,7 @@ To release a new version, please select a new version number (usually plus 1 to Pending +++++++ +* Add MIG (Multi-Instance GPU) strategy option to node pool property in `az aks nodepool add` and `az aks nodepool update`. 19.0.0b28 +++++++ @@ -20,7 +21,6 @@ Pending * `az aks list-vm-skus`: New command to list available VM SKUs for AKS clusters in a given region. * Add managed GPU enablement option to node pool property in `az aks nodepool add` and `az aks nodepool update`. * `az aks namespace update`: Fix location should use existing namespace location. -* Add MIG (Multi-Instance GPU) strategy option to node pool property in `az aks nodepool add` and `az aks nodepool update`. 19.0.0b27 +++++++