From 7c097d09934409dd5ff0521ab1200e93d0397503 Mon Sep 17 00:00:00 2001 From: Agustin Celentano <12614595+agustincelentano@users.noreply.github.com> Date: Fri, 17 Apr 2026 18:42:13 -0300 Subject: [PATCH 1/3] feat(istio): expose istiod_replicas to guarantee HA for node drains Single-replica istiod + PDB minAvailable=1 (chart default) yields disruptionsAllowed=0, which blocks every EKS node rolling update with 'PodEvictionFailure: Reached max retries'. Expose a new istiod_replicas variable (default 2) and wire it into both pilot.replicaCount and pilot.autoscaleMin on the helm_release. Setting only replicaCount is insufficient because the chart enables the HPA by default with autoscaleMin=1, and the HPA would scale back to 1 replica shortly after install. --- infrastructure/commons/istio/main.tf | 14 ++++++++++++++ infrastructure/commons/istio/variables.tf | 11 +++++++++++ 2 files changed, 25 insertions(+) diff --git a/infrastructure/commons/istio/main.tf b/infrastructure/commons/istio/main.tf index fd69f839..fc09c2f5 100644 --- a/infrastructure/commons/istio/main.tf +++ b/infrastructure/commons/istio/main.tf @@ -45,6 +45,20 @@ resource "helm_release" "istiod" { reuse_values = false dependency_update = true max_history = 10 + + # Enforce HA on istiod. The chart's HPA is enabled by default with + # autoscaleMin=1, so setting only replicaCount is not enough — the HPA + # would scale it back to 1 and re-block any node drain (istiod PDB has + # minAvailable=1). Setting autoscaleMin locks in the floor. 
+ set { + name = "pilot.replicaCount" + value = var.istiod_replicas + } + + set { + name = "pilot.autoscaleMin" + value = var.istiod_replicas + } } # Setup Istio Gateway using Helm diff --git a/infrastructure/commons/istio/variables.tf b/infrastructure/commons/istio/variables.tf index 657b1f0a..dbcb51e0 100644 --- a/infrastructure/commons/istio/variables.tf +++ b/infrastructure/commons/istio/variables.tf @@ -20,6 +20,17 @@ variable "istiod_version" { default = "1.27.1" } +variable "istiod_replicas" { + description = "Number of istiod replicas. Defaults to 2 so the pilot deployment can tolerate node drains — the istiod chart installs a PodDisruptionBudget with minAvailable=1, and a single-replica istiod therefore blocks EKS node rolling updates. This value is applied to both pilot.replicaCount and pilot.autoscaleMin; without the autoscaleMin override, the HPA (enabled by default with autoscaleMin=1) would immediately scale back to 1 replica." + type = number + default = 2 + + validation { + condition = var.istiod_replicas >= 1 + error_message = "istiod_replicas must be at least 1." + } +} + ############################################################################### # SERVICE CONFIGURATION ############################################################################### From c4158a4ed2f04f57bb82fc1cb16d4cfac4c86786 Mon Sep 17 00:00:00 2001 From: Agustin Celentano <12614595+agustincelentano@users.noreply.github.com> Date: Fri, 17 Apr 2026 18:48:14 -0300 Subject: [PATCH 2/3] fix(istio): use list syntax for helm set (provider v3) The hashicorp/helm v3 provider replaced the 'set {}' block with a 'set' attribute taking a list of objects. 
--- infrastructure/commons/istio/main.tf | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/infrastructure/commons/istio/main.tf b/infrastructure/commons/istio/main.tf index fc09c2f5..3f8c27c5 100644 --- a/infrastructure/commons/istio/main.tf +++ b/infrastructure/commons/istio/main.tf @@ -50,15 +50,16 @@ resource "helm_release" "istiod" { # autoscaleMin=1, so setting only replicaCount is not enough — the HPA # would scale it back to 1 and re-block any node drain (istiod PDB has # minAvailable=1). Setting autoscaleMin locks in the floor. - set { - name = "pilot.replicaCount" - value = var.istiod_replicas - } - - set { - name = "pilot.autoscaleMin" - value = var.istiod_replicas - } + set = [ + { + name = "pilot.replicaCount" + value = var.istiod_replicas + }, + { + name = "pilot.autoscaleMin" + value = var.istiod_replicas + }, + ] } # Setup Istio Gateway using Helm From ddaf0f3bdc2b103c6f853a599845fd730263b31b Mon Sep 17 00:00:00 2001 From: Agustin Celentano <12614595+agustincelentano@users.noreply.github.com> Date: Fri, 17 Apr 2026 20:45:18 -0300 Subject: [PATCH 3/3] refactor(istio): keep istiod_replicas default at 1 for backwards compatibility Flip the default from 2 to 1 so existing consumers of this module see no behavior change after upgrading. Callers that need HA (recommended for clusters doing node rolling updates) opt in explicitly with istiod_replicas = 2. --- infrastructure/commons/istio/variables.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/infrastructure/commons/istio/variables.tf b/infrastructure/commons/istio/variables.tf index dbcb51e0..875cc3b1 100644 --- a/infrastructure/commons/istio/variables.tf +++ b/infrastructure/commons/istio/variables.tf @@ -21,9 +21,9 @@ variable "istiod_version" { } variable "istiod_replicas" { - description = "Number of istiod replicas. 
Defaults to 2 so the pilot deployment can tolerate node drains — the istiod chart installs a PodDisruptionBudget with minAvailable=1, and a single-replica istiod therefore blocks EKS node rolling updates. This value is applied to both pilot.replicaCount and pilot.autoscaleMin; without the autoscaleMin override, the HPA (enabled by default with autoscaleMin=1) would immediately scale back to 1 replica." + description = "Number of istiod replicas. Default is 1 to preserve the previous behavior of this module for existing consumers; set to 2 (recommended) to let the pilot deployment tolerate node drains — the istiod chart installs a PodDisruptionBudget with minAvailable=1, and a single-replica istiod therefore blocks node rolling updates (e.g. EKS AMI bumps). This value is applied to both pilot.replicaCount and pilot.autoscaleMin; without the autoscaleMin override, the HPA (enabled by default with autoscaleMin=1) would scale back to 1 replica shortly after install. Must be a whole number and must not exceed pilot.autoscaleMax (chart default: 5), because an HPA whose minimum exceeds its maximum is rejected by Kubernetes." type = number - default = 2 + default = 1 validation { condition = var.istiod_replicas >= 1