Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/aks-preview/HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ To release a new version, please select a new version number (usually plus 1 to

Pending
+++++++
* `az aks nodepool add/update`: Add `--gpu-mig-strategy` option to set the MIG (Multi-Instance GPU) strategy (`Single` or `Mixed`) of a node pool.
* `az aks create/update`: Add `--outbound-type managedNATGatewayV2` support using Azure NAT Gateway Standard V2 SKU with IPv6, user-provided IPs, and IP prefixes.
* Fix monitoring addon key casing compatibility with azure-cli/acs

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
"test_aks_nodepool_add_with_gpu_instance_profile",
"test_aks_gpu_driver_type",
"test_aks_nodepool_add_with_enable_managed_gpu",
"test_aks_nodepool_update_with_enable_managed_gpu"
"test_aks_nodepool_update_with_enable_managed_gpu",
"test_aks_nodepool_add_with_gpu_mig_strategy"
],
"pod ip allocation mode static block, missing feature registration": [
"test_aks_create_with_pod_ip_allocation_mode_static_block"
Expand Down
4 changes: 4 additions & 0 deletions src/aks-preview/azext_aks_preview/_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,10 @@
CONST_GPU_DRIVER_TYPE_CUDA = "CUDA"
CONST_GPU_DRIVER_TYPE_GRID = "GRID"

# GPU MIG (Multi-Instance GPU) strategy constants.
# These are the two values accepted by the `--gpu-mig-strategy` option.
CONST_GPU_MIG_STRATEGY_SINGLE = "Single"
CONST_GPU_MIG_STRATEGY_MIXED = "Mixed"

# k8s extension constants
CONST_K8S_EXTENSION_CUSTOM_MOD_NAME = "azext_k8s_extension.custom"
CONST_K8S_EXTENSION_CLIENT_FACTORY_MOD_NAME = "azext_k8s_extension._client_factory"
Expand Down
3 changes: 3 additions & 0 deletions src/aks-preview/azext_aks_preview/_help.py
Original file line number Diff line number Diff line change
Expand Up @@ -2230,6 +2230,9 @@
- name: --driver-type
type: string
short-summary: Specify the type of GPU driver to install when creating Windows agent pools. Valid values are "GRID" and "CUDA". If not provided, AKS selects the driver based on system compatibility. This option cannot be changed once the AgentPool has been created. The default is system selected.
- name: --gpu-mig-strategy
type: string
short-summary: Specify the MIG (Multi-Instance GPU) strategy for managed MIG support. Valid values are "Single" and "Mixed". When not specified, managed MIG is disabled.
- name: --ssh-access
type: string
short-summary: Configure SSH setting for the node pool. Use "disabled" to disable SSH access, "localuser" to enable SSH access using private key.
Expand Down
25 changes: 22 additions & 3 deletions src/aks-preview/azext_aks_preview/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@
CONST_APP_ROUTING_NONE_NGINX,
CONST_GPU_DRIVER_TYPE_CUDA,
CONST_GPU_DRIVER_TYPE_GRID,
CONST_GPU_MIG_STRATEGY_SINGLE,
CONST_GPU_MIG_STRATEGY_MIXED,
CONST_ADVANCED_NETWORKPOLICIES_NONE,
CONST_ADVANCED_NETWORKPOLICIES_FQDN,
CONST_ADVANCED_NETWORKPOLICIES_L7,
Expand Down Expand Up @@ -548,6 +550,11 @@
CONST_GPU_DRIVER_TYPE_GRID,
]

# Accepted values for the GPU MIG (Multi-Instance GPU) strategy argument;
# wrapped with get_enum_type where the `gpu_mig_strategy` argument is registered.
gpu_mig_strategies = [
    CONST_GPU_MIG_STRATEGY_SINGLE,
    CONST_GPU_MIG_STRATEGY_MIXED,
]

upgrade_strategies = [
CONST_UPGRADE_STRATEGY_ROLLING,
CONST_UPGRADE_STRATEGY_BLUE_GREEN,
Expand Down Expand Up @@ -2114,7 +2121,7 @@ def load_arguments(self, _):
)
c.argument(
"enable_managed_gpu",
action="store_true",
arg_type=get_three_state_flag(),
is_preview=True,
help="Enable the Managed GPU experience.",
)
Expand Down Expand Up @@ -2143,6 +2150,12 @@ def load_arguments(self, _):
arg_type=get_enum_type(gpu_driver_types),
is_preview=True,
)
c.argument(
    "gpu_mig_strategy",
    arg_type=get_enum_type(gpu_mig_strategies),
    is_preview=True,
    # The enum arg type already makes the help renderer list the allowed
    # values, so they are not repeated in the text (and cannot drift from
    # `gpu_mig_strategies`).
    help="Specify the Multi-Instance GPU (MIG) strategy for the node pool.",
)
# in creation scenario, use "localuser" as default
c.argument(
'ssh_access',
Expand Down Expand Up @@ -2229,9 +2242,9 @@ def load_arguments(self, _):
)
c.argument(
"enable_managed_gpu",
action="store_true",
arg_type=get_three_state_flag(),
is_preview=True,
help="Enable the Managed GPU experience.",
help="Enable or disable the Managed GPU experience.",
)
c.argument(
"os_sku",
Expand Down Expand Up @@ -2286,6 +2299,12 @@ def load_arguments(self, _):
"gpu_driver",
arg_type=get_enum_type(gpu_driver_install_modes)
)
c.argument(
    "gpu_mig_strategy",
    arg_type=get_enum_type(gpu_mig_strategies),
    is_preview=True,
    # The enum arg type already makes the help renderer list the allowed
    # values, so they are not repeated in the text (and cannot drift from
    # `gpu_mig_strategies`).
    help="Specify the Multi-Instance GPU (MIG) strategy for the node pool.",
)

with self.argument_context("aks nodepool upgrade") as c:
# upgrade strategy
Expand Down
58 changes: 58 additions & 0 deletions src/aks-preview/azext_aks_preview/agentpool_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,6 +752,25 @@ def get_driver_type(self) -> Union[str, None]:

return driver_type

def get_gpu_mig_strategy(self) -> Union[str, None]:
    """Obtain the value of gpu_mig_strategy.

    The value passed by the command is used by default. In create mode, a
    non-None `gpu_profile.nvidia.mig_strategy` found on the attached
    `agentpool` object takes precedence over the raw parameter.

    :return: str or None
    """
    # start from the raw value supplied by the command
    mig_strategy = self.raw_param.get("gpu_mig_strategy")

    # only create mode consults the attached `agentpool` object
    if self.decorator_mode == DecoratorMode.CREATE and self.agentpool:
        gpu_profile = self.agentpool.gpu_profile
        nvidia_profile = gpu_profile.nvidia if gpu_profile is not None else None
        attached_value = nvidia_profile.mig_strategy if nvidia_profile is not None else None
        if attached_value is not None:
            mig_strategy = attached_value

    return mig_strategy

def get_enable_secure_boot(self) -> bool:
"""Obtain the value of enable_secure_boot.
:return: bool
Expand Down Expand Up @@ -1365,6 +1384,20 @@ def set_up_driver_type(self, agentpool: AgentPool) -> AgentPool:
agentpool.gpu_profile.driver_type = driver_type
return agentpool

def set_up_gpu_mig_strategy(self, agentpool: AgentPool) -> AgentPool:
    """Set up gpu mig strategy property for the AgentPool object.

    When the context yields a strategy, the `gpu_profile` and
    `gpu_profile.nvidia` sub-objects are created if missing, the strategy is
    stored in `gpu_profile.nvidia.mig_strategy` and `gpu_profile.driver` is set
    to CONST_GPU_DRIVER_INSTALL. When the context yields None the agentpool is
    returned untouched.

    :return: the AgentPool object
    """
    self._ensure_agentpool(agentpool)

    mig_strategy = self.context.get_gpu_mig_strategy()
    if mig_strategy is None:
        # nothing requested, leave the GPU profile as it is
        return agentpool

    if agentpool.gpu_profile is None:
        agentpool.gpu_profile = self.models.GPUProfile()  # pylint: disable=no-member
    gpu_profile = agentpool.gpu_profile
    if gpu_profile.nvidia is None:
        gpu_profile.nvidia = self.models.NvidiaGPUProfile()  # pylint: disable=no-member

    gpu_profile.nvidia.mig_strategy = mig_strategy
    gpu_profile.driver = CONST_GPU_DRIVER_INSTALL
    return agentpool

def set_up_pod_ip_allocation_mode(self, agentpool: AgentPool) -> AgentPool:
"""Set up pod ip allocation mode for the AgentPool object."""
self._ensure_agentpool(agentpool)
Expand Down Expand Up @@ -1557,6 +1590,8 @@ def construct_agentpool_profile_preview(self) -> AgentPool:
agentpool = self.set_up_gpu_profile(agentpool)
# set up driver_type
agentpool = self.set_up_driver_type(agentpool)
# set up gpu_mig_strategy
agentpool = self.set_up_gpu_mig_strategy(agentpool)
# set up agentpool ssh access
agentpool = self.set_up_ssh_access(agentpool)
# set up agentpool pod ip allocation mode
Expand Down Expand Up @@ -1733,6 +1768,20 @@ def update_gpu_profile(self, agentpool: AgentPool) -> AgentPool:
agentpool.gpu_profile.driver = gpu_driver
return agentpool

def update_gpu_mig_strategy(self, agentpool: AgentPool) -> AgentPool:
    """Update gpu mig strategy property for the AgentPool object.

    If the context returns no strategy (None), the agentpool is returned
    unchanged. Otherwise missing `gpu_profile` / `gpu_profile.nvidia` objects
    are created, `gpu_profile.nvidia.mig_strategy` is set to the requested
    value and `gpu_profile.driver` is set to CONST_GPU_DRIVER_INSTALL.

    :return: the AgentPool object
    """
    self._ensure_agentpool(agentpool)

    requested_strategy = self.context.get_gpu_mig_strategy()
    if requested_strategy is None:
        return agentpool

    # make sure the nested profile objects exist before writing into them
    if agentpool.gpu_profile is None:
        agentpool.gpu_profile = self.models.GPUProfile()  # pylint: disable=no-member
    profile = agentpool.gpu_profile
    if profile.nvidia is None:
        profile.nvidia = self.models.NvidiaGPUProfile()  # pylint: disable=no-member

    profile.nvidia.mig_strategy = requested_strategy
    profile.driver = CONST_GPU_DRIVER_INSTALL
    return agentpool

def update_artifact_streaming(self, agentpool: AgentPool) -> AgentPool:
"""Update artifact streaming property for the AgentPool object.
:return: the AgentPool object
Expand Down Expand Up @@ -1760,10 +1809,16 @@ def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool:
agentpool.gpu_profile = self.models.GPUProfile() # pylint: disable=no-member
if agentpool.gpu_profile.nvidia is None:
agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile() # pylint: disable=no-member
# Check if already set to the desired value to avoid API error
if agentpool.gpu_profile.nvidia.management_mode == CONST_GPU_MANAGEMENT_MODE_MANAGED:
return agentpool
agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED
agentpool.gpu_profile.driver = CONST_GPU_DRIVER_INSTALL
else:
if agentpool.gpu_profile and agentpool.gpu_profile.nvidia:
# Check if already set to the desired value to avoid API error
if agentpool.gpu_profile.nvidia.management_mode == CONST_GPU_MANAGEMENT_MODE_UNMANAGED:
return agentpool
agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED

return agentpool
Expand Down Expand Up @@ -1925,6 +1980,9 @@ def update_agentpool_profile_preview(self, agentpools: List[AgentPool] = None) -
# update gpu profile
agentpool = self.update_gpu_profile(agentpool)

# update gpu mig strategy
agentpool = self.update_gpu_mig_strategy(agentpool)

return agentpool

def update_auto_scaler_properties(self, agentpool: AgentPool) -> AgentPool:
Expand Down
2 changes: 2 additions & 0 deletions src/aks-preview/azext_aks_preview/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -1934,6 +1934,7 @@ def aks_agentpool_add(
skip_gpu_driver_install=False,
gpu_driver=None,
driver_type=None,
gpu_mig_strategy=None,
ssh_access=CONST_SSH_ACCESS_LOCALUSER,
# trusted launch
enable_secure_boot=False,
Expand Down Expand Up @@ -2022,6 +2023,7 @@ def aks_agentpool_update(
localdns_config=None,
node_vm_size=None,
gpu_driver=None,
gpu_mig_strategy=None,
):
# DO NOT MOVE: get all the original parameters and save them as a dictionary
raw_parameters = locals()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,66 @@ def common_get_driver_type(self):
ctx_0.attach_agentpool(agentpool_0)
self.assertEqual(ctx_0.get_driver_type(), "CUDA")

def common_get_gpu_mig_strategy(self):
    """Exercise `get_gpu_mig_strategy` on create-mode contexts.

    Covers two situations: no raw parameter with a strategy present on the
    attached agentpool ("Single" and "Mixed"), and a raw parameter of "Single"
    with an attached agentpool whose `mig_strategy` is None.
    """

    def build_context(raw_value):
        # create-mode context whose only raw parameter is gpu_mig_strategy
        return AKSPreviewAgentPoolContext(
            self.cmd,
            AKSAgentPoolParamDict({"gpu_mig_strategy": raw_value}),
            self.models,
            DecoratorMode.CREATE,
            self.agentpool_decorator_mode,
        )

    def build_agentpool(attached_value):
        # agentpool carrying the given strategy under gpu_profile.nvidia
        return self.create_initialized_agentpool_instance(
            gpu_profile=self.models.GPUProfile(
                nvidia=self.models.NvidiaGPUProfile(
                    mig_strategy=attached_value
                )
            )
        )

    # default: nothing passed on the command line, so the getter returns None
    # until an agentpool with a strategy is attached
    for attached_strategy in ("Single", "Mixed"):
        ctx = build_context(None)
        self.assertEqual(ctx.get_gpu_mig_strategy(), None)
        ctx.attach_agentpool(build_agentpool(attached_strategy))
        self.assertEqual(ctx.get_gpu_mig_strategy(), attached_strategy)

    # custom: the raw value is kept when the attached agentpool has no strategy
    ctx = build_context("Single")
    self.assertEqual(ctx.get_gpu_mig_strategy(), "Single")
    ctx.attach_agentpool(build_agentpool(None))
    self.assertEqual(ctx.get_gpu_mig_strategy(), "Single")

def common_get_os_sku(self):
# default
ctx_1 = AKSPreviewAgentPoolContext(
Expand Down Expand Up @@ -1095,6 +1155,9 @@ def test_get_gpu_driver(self):
def test_get_driver_type(self):
self.common_get_driver_type()

def test_get_gpu_mig_strategy(self):
    """Run the shared gpu_mig_strategy getter checks."""
    self.common_get_gpu_mig_strategy()

def test_get_enable_secure_boot(self):
self.common_get_enable_secure_boot()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5586,6 +5586,64 @@ def test_aks_nodepool_add_with_gpu_instance_profile(
checks=[self.is_empty()],
)

@live_only()
@AllowLargeResponse()
@AKSCustomResourceGroupPreparer(
    random_name_length=17, name_prefix="clitest", location="westus3"
)
def test_aks_nodepool_add_with_gpu_mig_strategy(
    self, resource_group, resource_group_location
):
    """Add a node pool with `--gpu-mig-strategy` and check the reported profile.

    Creates a cluster, adds a second node pool with `--enable-managed-gpu=true`,
    `--gpu-instance-profile=MIG3g` and `--gpu-mig-strategy=Single`, checks that
    the response reports `gpuInstanceProfile` "MIG3g" and
    `gpuProfile.nvidia.migStrategy` "Single", then deletes the cluster.
    """
    cluster_name = self.create_random_name("cliakstest", 16)
    first_pool_name = self.create_random_name("c", 6)
    second_pool_name = self.create_random_name("c", 6)

    # placeholders substituted into the command strings below
    command_kwargs = {
        "resource_group": resource_group,
        "name": cluster_name,
        "node_pool_name": first_pool_name,
        "node_pool_name_second": second_pool_name,
        "ssh_key_value": self.generate_ssh_keys(),
    }
    self.kwargs.update(command_kwargs)

    # create the cluster with a single-node default pool
    self.cmd(
        "aks create --resource-group={resource_group} --name={name} "
        "--nodepool-name {node_pool_name} -c 1 "
        "--ssh-key-value={ssh_key_value}",
        checks=[
            self.check("provisioningState", "Succeeded"),
        ],
    )

    # nodepool add with gpu-mig-strategy
    add_nodepool_checks = [
        self.check("provisioningState", "Succeeded"),
        self.check("gpuInstanceProfile", "MIG3g"),
        self.check("gpuProfile.nvidia.migStrategy", "Single"),
    ]
    self.cmd(
        "aks nodepool add "
        "--resource-group={resource_group} "
        "--cluster-name={name} "
        "--name={node_pool_name_second} "
        "--enable-managed-gpu=true "
        "--gpu-instance-profile=MIG3g "
        "--gpu-mig-strategy=Single "
        "-c 1 "
        "--aks-custom-headers UseGPUDedicatedVHD=true "
        "--node-vm-size=Standard_NC24ads_A100_v4",
        checks=add_nodepool_checks,
    )

    # delete
    self.cmd(
        "aks delete -g {resource_group} -n {name} --yes --no-wait",
        checks=[self.is_empty()],
    )

@live_only() # live only due to workspace is not mocked correctly and role assignment is not mocked
@AllowLargeResponse()
@AKSCustomResourceGroupPreparer(
Expand Down Expand Up @@ -6978,7 +7036,7 @@ def test_aks_nodepool_add_with_enable_managed_gpu(
self.cmd(
"aks nodepool add --resource-group={resource_group} --cluster-name={name} --name={node_pool_name} "
"--node-vm-size={node_vm_size} --node-count 1 "
" --enable-managed-gpu",
"--enable-managed-gpu=true ",
checks=[
self.check("provisioningState", "Succeeded"),
self.check("gpuProfile.driver", "Install"),
Expand Down Expand Up @@ -16715,7 +16773,7 @@ def test_aks_nodepool_update_with_enable_managed_gpu(
"--resource-group={resource_group} "
"--cluster-name={name} "
"--name={node_pool_name} "
"--enable-managed-gpu",
"--enable-managed-gpu=true ",
checks=[
self.check("provisioningState", "Succeeded"),
self.check("gpuProfile.driver", "Install"),
Expand Down
Loading
Loading