From 916faa034a32a33084b03fe33266e254aca9f5f0 Mon Sep 17 00:00:00 2001 From: Marcin Maciaszczyk Date: Fri, 12 Jun 2026 14:49:27 +0200 Subject: [PATCH 1/4] fix azure node upgrades --- terraform/clouds/azure/aks.tf | 11 ++++++----- terraform/clouds/azure/locals.tf | 2 ++ terraform/modules/clusters/azure/aks.tf | 11 +++++++++-- terraform/modules/clusters/azure/locals.tf | 7 +++++-- terraform/modules/clusters/azure/variables.tf | 5 +++++ 5 files changed, 27 insertions(+), 9 deletions(-) diff --git a/terraform/clouds/azure/aks.tf b/terraform/clouds/azure/aks.tf index bc0e4db..4fe7f57 100644 --- a/terraform/clouds/azure/aks.tf +++ b/terraform/clouds/azure/aks.tf @@ -1,15 +1,15 @@ locals { node_pool_add = { (local.active_node_group) = { - orchestrator_version = var.kubernetes_version, - node_taints = local.upgrading ? ["platform.plural.sh/draining=true:NoSchedule"] : [], + orchestrator_version = local.node_orchestrator_version, + node_taints = local.upgrading ? ["platform.plural.sh/draining=true:NoSchedule"] : [], }, - (local.drain_node_group) = { + (local.drain_node_group) = { orchestrator_version = var.next_kubernetes_version, } } - full_node_pools = {for k, v in var.node_pools: k => merge(v, try(lookup(local.node_pool_add, k), {})) if k != local.drain_node_group || local.upgrading == true} + full_node_pools = { for k, v in var.node_pools : k => merge(v, try(lookup(local.node_pool_add, k), {})) if k != local.drain_node_group || local.upgrading == true } } @@ -18,6 +18,7 @@ module "aks" { version = "9.2.0" kubernetes_version = var.next_kubernetes_version + orchestrator_version = local.node_orchestrator_version cluster_name = var.cluster_name resource_group_name = local.resource_group.name prefix = var.cluster_name @@ -25,7 +26,7 @@ module "aks" { sku_tier = "Standard" rbac_aad = false vnet_subnet_id = azurerm_subnet.network.id - node_pools = {for name, pool in local.full_node_pools : name => merge(pool, {name = name, vnet_subnet_id = azurerm_subnet.network.id})} + 
node_pools = { for name, pool in local.full_node_pools : name => merge(pool, { name = name, vnet_subnet_id = azurerm_subnet.network.id }) } ebpf_data_plane = "cilium" network_plugin_mode = "overlay" diff --git a/terraform/clouds/azure/locals.tf b/terraform/clouds/azure/locals.tf index eb4b036..6580f34 100644 --- a/terraform/clouds/azure/locals.tf +++ b/terraform/clouds/azure/locals.tf @@ -12,4 +12,6 @@ locals { vsn_even = ((tonumber(local.split_vsn[0]) * 100 + tonumber(local.split_vsn[1])) % 2) == 0 active_node_group = local.vsn_even ? "blue" : "green" drain_node_group = local.vsn_even ? "green" : "blue" + # AKS cannot upgrade CP and node pools in one apply when both versions change. + node_orchestrator_version = local.upgrading ? var.kubernetes_version : var.next_kubernetes_version } \ No newline at end of file diff --git a/terraform/modules/clusters/azure/aks.tf b/terraform/modules/clusters/azure/aks.tf index 8107819..86f40ff 100644 --- a/terraform/modules/clusters/azure/aks.tf +++ b/terraform/modules/clusters/azure/aks.tf @@ -2,7 +2,8 @@ module "aks" { source = "Azure/aks/azurerm" version = "9.2.0" - kubernetes_version = var.kubernetes_version + kubernetes_version = var.next_kubernetes_version + orchestrator_version = local.node_orchestrator_version cluster_name = var.cluster resource_group_name = data.azurerm_resource_group.default.name prefix = var.cluster @@ -10,7 +11,13 @@ module "aks" { sku_tier = "Standard" rbac_aad = false vnet_subnet_id = local.network.sn_subnet_id - node_pools = {for name, pool in var.node_pools : name => merge(pool, {name = name, vnet_subnet_id = local.network.sn_subnet_id})} + node_pools = { + for name, pool in var.node_pools : name => merge(pool, { + name = name + vnet_subnet_id = local.network.sn_subnet_id + orchestrator_version = local.node_orchestrator_version + }) + } ebpf_data_plane = "cilium" network_plugin_mode = "overlay" diff --git a/terraform/modules/clusters/azure/locals.tf b/terraform/modules/clusters/azure/locals.tf 
index febd546..bc79358 100644 --- a/terraform/modules/clusters/azure/locals.tf +++ b/terraform/modules/clusters/azure/locals.tf @@ -1,4 +1,7 @@ locals { - identity = jsondecode(data.plural_service_context.identity.configuration) - network = jsondecode(data.plural_service_context.network.configuration) + identity = jsondecode(data.plural_service_context.identity.configuration) + network = jsondecode(data.plural_service_context.network.configuration) + upgrading = var.kubernetes_version != var.next_kubernetes_version + # AKS upgrades control plane and node pools in separate applies; see clouds/azure/aks.tf. + node_orchestrator_version = local.upgrading ? var.kubernetes_version : var.next_kubernetes_version } diff --git a/terraform/modules/clusters/azure/variables.tf b/terraform/modules/clusters/azure/variables.tf index a032c50..e8c82b9 100644 --- a/terraform/modules/clusters/azure/variables.tf +++ b/terraform/modules/clusters/azure/variables.tf @@ -16,6 +16,11 @@ variable "kubernetes_version" { default = "1.34" } +variable "next_kubernetes_version" { + type = string + default = "1.34" +} + variable "resource_group_name" { type = string default = "plural" From ef8d19c48031aea06e82e785767e38130e0abe9e Mon Sep 17 00:00:00 2001 From: Marcin Maciaszczyk Date: Mon, 15 Jun 2026 12:56:35 +0200 Subject: [PATCH 2/4] add version defaulting --- terraform/clouds/azure/aks.tf | 4 ++-- terraform/clouds/azure/locals.tf | 5 +++-- terraform/clouds/azure/variables.tf | 5 +++-- terraform/modules/clusters/azure/aks.tf | 2 +- terraform/modules/clusters/azure/locals.tf | 6 ++++-- terraform/modules/clusters/azure/variables.tf | 5 +++-- 6 files changed, 16 insertions(+), 11 deletions(-) diff --git a/terraform/clouds/azure/aks.tf b/terraform/clouds/azure/aks.tf index 4fe7f57..5ee1fb9 100644 --- a/terraform/clouds/azure/aks.tf +++ b/terraform/clouds/azure/aks.tf @@ -5,7 +5,7 @@ locals { node_taints = local.upgrading ? 
["platform.plural.sh/draining=true:NoSchedule"] : [], }, (local.drain_node_group) = { - orchestrator_version = var.next_kubernetes_version, + orchestrator_version = local.next_kubernetes_version, } } @@ -17,7 +17,7 @@ module "aks" { source = "Azure/aks/azurerm" version = "9.2.0" - kubernetes_version = var.next_kubernetes_version + kubernetes_version = local.next_kubernetes_version orchestrator_version = local.node_orchestrator_version cluster_name = var.cluster_name resource_group_name = local.resource_group.name diff --git a/terraform/clouds/azure/locals.tf b/terraform/clouds/azure/locals.tf index 6580f34..416b2c1 100644 --- a/terraform/clouds/azure/locals.tf +++ b/terraform/clouds/azure/locals.tf @@ -7,11 +7,12 @@ locals { rg = var.create_resource_group ? azurerm_resource_group.main[0] : data.azurerm_resource_group.main[0] db_url = format("postgresql://console:%s@%s:5432/console", random_password.password.result, try(azurerm_postgresql_flexible_server.postgres[0].fqdn, "")) - upgrading = var.kubernetes_version != var.next_kubernetes_version + next_kubernetes_version = var.next_kubernetes_version != "" ? var.next_kubernetes_version : var.kubernetes_version + upgrading = var.kubernetes_version != local.next_kubernetes_version split_vsn = [ for i in split(".", var.kubernetes_version): tonumber(i) ] vsn_even = ((tonumber(local.split_vsn[0]) * 100 + tonumber(local.split_vsn[1])) % 2) == 0 active_node_group = local.vsn_even ? "blue" : "green" drain_node_group = local.vsn_even ? "green" : "blue" # AKS cannot upgrade CP and node pools in one apply when both versions change. - node_orchestrator_version = local.upgrading ? var.kubernetes_version : var.next_kubernetes_version + node_orchestrator_version = local.upgrading ? 
var.kubernetes_version : local.next_kubernetes_version } \ No newline at end of file diff --git a/terraform/clouds/azure/variables.tf b/terraform/clouds/azure/variables.tf index 524785d..a79ae8e 100644 --- a/terraform/clouds/azure/variables.tf +++ b/terraform/clouds/azure/variables.tf @@ -19,8 +19,9 @@ variable "kubernetes_version" { } variable "next_kubernetes_version" { - type = string - default = "1.34" + type = string + default = "" + description = "AKS control plane target; leave empty to match kubernetes_version." } variable "create_resource_group" { diff --git a/terraform/modules/clusters/azure/aks.tf b/terraform/modules/clusters/azure/aks.tf index 86f40ff..1679238 100644 --- a/terraform/modules/clusters/azure/aks.tf +++ b/terraform/modules/clusters/azure/aks.tf @@ -2,7 +2,7 @@ module "aks" { source = "Azure/aks/azurerm" version = "9.2.0" - kubernetes_version = var.next_kubernetes_version + kubernetes_version = local.next_kubernetes_version orchestrator_version = local.node_orchestrator_version cluster_name = var.cluster resource_group_name = data.azurerm_resource_group.default.name diff --git a/terraform/modules/clusters/azure/locals.tf b/terraform/modules/clusters/azure/locals.tf index bc79358..6c1dc6b 100644 --- a/terraform/modules/clusters/azure/locals.tf +++ b/terraform/modules/clusters/azure/locals.tf @@ -1,7 +1,9 @@ locals { identity = jsondecode(data.plural_service_context.identity.configuration) network = jsondecode(data.plural_service_context.network.configuration) - upgrading = var.kubernetes_version != var.next_kubernetes_version + # An empty next_kubernetes_version falls back to kubernetes_version (safe until scaffolds passes both variables). + next_kubernetes_version = var.next_kubernetes_version != "" ? var.next_kubernetes_version : var.kubernetes_version + upgrading = var.kubernetes_version != local.next_kubernetes_version # AKS upgrades control plane and node pools in separate applies; see clouds/azure/aks.tf. 
- node_orchestrator_version = local.upgrading ? var.kubernetes_version : var.next_kubernetes_version + node_orchestrator_version = local.upgrading ? var.kubernetes_version : local.next_kubernetes_version } diff --git a/terraform/modules/clusters/azure/variables.tf b/terraform/modules/clusters/azure/variables.tf index e8c82b9..6feda40 100644 --- a/terraform/modules/clusters/azure/variables.tf +++ b/terraform/modules/clusters/azure/variables.tf @@ -17,8 +17,9 @@ variable "kubernetes_version" { } variable "next_kubernetes_version" { - type = string - default = "1.34" + type = string + default = "" + description = "AKS control plane target; leave empty to match kubernetes_version." } variable "resource_group_name" { From ba4ff6f0e3e374c756079b7bc3be9570c7fc1785 Mon Sep 17 00:00:00 2001 From: Marcin Maciaszczyk Date: Mon, 15 Jun 2026 17:23:37 +0200 Subject: [PATCH 3/4] add stack-runner label --- setup/stacks/mgmt.yaml | 2 ++ terraform/clouds/azure/aks.tf | 11 ++++++++--- terraform/clouds/azure/variables.tf | 4 ++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/setup/stacks/mgmt.yaml b/setup/stacks/mgmt.yaml index 1bf2abb..ddfa2c1 100644 --- a/setup/stacks/mgmt.yaml +++ b/setup/stacks/mgmt.yaml @@ -36,6 +36,8 @@ spec: labels: azure.workload.identity/use: "true" serviceAccount: "stacks" + nodeSelector: + platform.plural.sh/stack-runner: "true" [[ end ]] git: ref: main diff --git a/terraform/clouds/azure/aks.tf b/terraform/clouds/azure/aks.tf index 5ee1fb9..436084c 100644 --- a/terraform/clouds/azure/aks.tf +++ b/terraform/clouds/azure/aks.tf @@ -3,16 +3,21 @@ locals { (local.active_node_group) = { orchestrator_version = local.node_orchestrator_version, node_taints = local.upgrading ? ["platform.plural.sh/draining=true:NoSchedule"] : [], + node_labels = local.upgrading ? {} : { + "platform.plural.sh/stack-runner" = "true" + }, }, (local.drain_node_group) = { orchestrator_version = local.next_kubernetes_version, + node_labels = local.upgrading ? 
{ + "platform.plural.sh/stack-runner" = "true" + } : {}, } } full_node_pools = { for k, v in var.node_pools : k => merge(v, try(lookup(local.node_pool_add, k), {})) if k != local.drain_node_group || local.upgrading == true } } - module "aks" { source = "Azure/aks/azurerm" version = "9.2.0" @@ -27,7 +32,7 @@ module "aks" { rbac_aad = false vnet_subnet_id = azurerm_subnet.network.id node_pools = { for name, pool in local.full_node_pools : name => merge(pool, { name = name, vnet_subnet_id = azurerm_subnet.network.id }) } - + ebpf_data_plane = "cilium" network_plugin_mode = "overlay" network_plugin = "azure" @@ -36,4 +41,4 @@ module "aks" { workload_identity_enabled = var.workload_identity_enabled oidc_issuer_enabled = var.workload_identity_enabled -} \ No newline at end of file +} diff --git a/terraform/clouds/azure/variables.tf b/terraform/clouds/azure/variables.tf index a79ae8e..3e64861 100644 --- a/terraform/clouds/azure/variables.tf +++ b/terraform/clouds/azure/variables.tf @@ -96,14 +96,14 @@ variable "node_pools" { blue = { vm_size = "Standard_D2s_v3" node_count = 3 - min_count = 1 + min_count = 0 max_count = 20 enable_auto_scaling = true } green = { vm_size = "Standard_D2s_v3" node_count = 3 - min_count = 1 + min_count = 0 max_count = 20 enable_auto_scaling = true } From 7159bca4cc29bec730cd7385c51cabb180a20802 Mon Sep 17 00:00:00 2001 From: Marcin Maciaszczyk Date: Tue, 16 Jun 2026 12:13:06 +0200 Subject: [PATCH 4/4] revert min count --- terraform/clouds/azure/variables.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/clouds/azure/variables.tf b/terraform/clouds/azure/variables.tf index 3e64861..a79ae8e 100644 --- a/terraform/clouds/azure/variables.tf +++ b/terraform/clouds/azure/variables.tf @@ -96,14 +96,14 @@ variable "node_pools" { blue = { vm_size = "Standard_D2s_v3" node_count = 3 - min_count = 0 + min_count = 1 max_count = 20 enable_auto_scaling = true } green = { vm_size = "Standard_D2s_v3" node_count = 3 - min_count = 
0 + min_count = 1 max_count = 20 enable_auto_scaling = true }