From 949ec0eb122659735baac71b59b75f8f17482f5d Mon Sep 17 00:00:00 2001
From: Pablo Vilas
Date: Wed, 15 Apr 2026 14:03:42 -0300
Subject: [PATCH 1/5] fix(alb): improve capacity checks and add ingress rollback on reconciliation failure

- Fix rule count to check HTTPS (443) listener only instead of summing all listeners
- Add estimation of rules/TGs this scope will add before comparing against thresholds
- Lower default ALB_MAX_TARGET_GROUPS from 98 to 90 for blue-green safety margin
- Add rule capacity check to deployment workflow (was only at scope creation)
- Add rollback_failed_ingress script that deletes broken ingresses on reconciliation failure to prevent sync poisoning of the entire ALB group
- Wire rollback into verify_ingress_reconciliation at cert error, event error, and timeout
- Add ALB_ROLLBACK_ON_RECONCILIATION_FAILURE config (default: true)
---
 k8s/deployment/rollback_failed_ingress | 69 +++++++
 .../tests/rollback_failed_ingress.bats | 194 ++++++++++++++++++
 .../validate_alb_target_group_capacity.bats | 41 ++--
 .../validate_alb_target_group_capacity | 20 +-
 k8s/deployment/verify_ingress_reconciliation | 12 +-
 k8s/deployment/workflows/initial.yaml | 3 +
 k8s/scope/build_context | 7 +
 k8s/scope/tests/validate_alb_capacity.bats | 48 ++---
 k8s/scope/validate_alb_capacity | 102 ++++-----
 k8s/values.yaml | 3 +-
 10 files changed, 397 insertions(+), 102 deletions(-)
 create mode 100755 k8s/deployment/rollback_failed_ingress
 create mode 100644 k8s/deployment/tests/rollback_failed_ingress.bats

diff --git a/k8s/deployment/rollback_failed_ingress b/k8s/deployment/rollback_failed_ingress
new file mode 100755
index 00000000..820eb7d5
--- /dev/null
+++ b/k8s/deployment/rollback_failed_ingress
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Rolls back a failed ingress to prevent sync poisoning of the ALB group.
+# When a broken ingress is applied, the ALB Ingress Controller fails to
+# reconcile ALL ingresses in the same group.name. By deleting the bad
+# ingress, the rest of the group can resume normal reconciliation.
+#
+# Only deletes ingresses — deployments/services/secrets are left in place
+# so that retries don't need to re-create them.
+#
+# Meant to be sourced (it uses `return`). Best-effort: kubectl failures are
+# logged as warnings and never fail the caller.
+#
+# Reads: CONTEXT (.scope.slug, .alb_name), SCOPE_ID, INGRESS_VISIBILITY,
+# K8S_NAMESPACE, DNS_TYPE, ALB_ROLLBACK_ON_RECONCILIATION_FAILURE (default: true).
+
+ALB_ROLLBACK_ON_RECONCILIATION_FAILURE="${ALB_ROLLBACK_ON_RECONCILIATION_FAILURE:-true}"
+
+if [[ "$ALB_ROLLBACK_ON_RECONCILIATION_FAILURE" != "true" ]]; then
+  log debug "📋 Ingress rollback disabled (ALB_ROLLBACK_ON_RECONCILIATION_FAILURE=$ALB_ROLLBACK_ON_RECONCILIATION_FAILURE), skipping"
+  return 0
+fi
+
+if [[ "$DNS_TYPE" != "route53" ]]; then
+  log debug "📋 DNS type is '$DNS_TYPE', ingress rollback only applies to route53, skipping"
+  return 0
+fi
+
+# `// empty` turns a missing slug into "" instead of the literal string "null"
+SCOPE_SLUG=$(echo "$CONTEXT" | jq -r '.scope.slug // empty')
+ALB_NAME=$(echo "$CONTEXT" | jq -r .alb_name)
+
+# Never build a delete target or label selector from missing identifiers:
+# an empty namespace or scope id would point kubectl at the wrong objects.
+if [[ -z "$SCOPE_SLUG" || -z "$SCOPE_ID" || -z "$INGRESS_VISIBILITY" || -z "$K8S_NAMESPACE" ]]; then
+  log warn "⚠️ Cannot roll back ingress: scope slug, SCOPE_ID, INGRESS_VISIBILITY or K8S_NAMESPACE is empty, skipping"
+  return 0
+fi
+
+INGRESS_NAME="k-8-s-$SCOPE_SLUG-$SCOPE_ID-$INGRESS_VISIBILITY"
+
+log warn "🔄 Rolling back ingress [$INGRESS_NAME] to prevent ALB sync poisoning..."
+log warn "📋 ALB group: $ALB_NAME | Namespace: $K8S_NAMESPACE"
+
+# Delete the main ingress; kubectl output is captured so a failure can be explained
+if ROLLBACK_OUTPUT=$(kubectl delete ingress "$INGRESS_NAME" -n "$K8S_NAMESPACE" --ignore-not-found=true 2>&1); then
+  log info " ✅ Deleted ingress: $INGRESS_NAME"
+else
+  log warn " ⚠️ Could not delete ingress: $INGRESS_NAME (${ROLLBACK_OUTPUT:-no kubectl output})"
+fi
+
+# Delete additional port ingresses for this scope (they share the same scope_id label)
+if ! ADDITIONAL_INGRESSES=$(kubectl get ingress -n "$K8S_NAMESPACE" -l "scope_id=$SCOPE_ID" \
+  -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); then
+  log warn " ⚠️ Could not list additional port ingresses for scope_id=$SCOPE_ID"
+  ADDITIONAL_INGRESSES=""
+fi
+
+# Intentionally unquoted: the jsonpath output is a space-separated list of names
+for ing_name in $ADDITIONAL_INGRESSES; do
+  if [[ "$ing_name" != "$INGRESS_NAME" ]]; then
+    if ROLLBACK_OUTPUT=$(kubectl delete ingress "$ing_name" -n "$K8S_NAMESPACE" --ignore-not-found=true 2>&1); then
+      log info " ✅ Deleted additional port ingress: $ing_name"
+    else
+      log warn " ⚠️ Could not delete additional port ingress: $ing_name (${ROLLBACK_OUTPUT:-no kubectl output})"
+    fi
+  fi
+done
+
+log warn "🔄 Rollback complete — other scopes on ALB group '$ALB_NAME' should resume normal reconciliation"
diff --git a/k8s/deployment/tests/rollback_failed_ingress.bats
b/k8s/deployment/tests/rollback_failed_ingress.bats new file mode 100644 index 00000000..4d8f9f7f --- /dev/null +++ b/k8s/deployment/tests/rollback_failed_ingress.bats @@ -0,0 +1,194 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for deployment/rollback_failed_ingress - ingress rollback on failure +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + log() { if [ "$1" = "error" ]; then echo "$2" >&2; else echo "$2"; fi; } + export -f log + + export K8S_NAMESPACE="test-namespace" + export SCOPE_ID="scope-123" + export INGRESS_VISIBILITY="internet-facing" + export DNS_TYPE="route53" + export ALB_ROLLBACK_ON_RECONCILIATION_FAILURE="true" + + export CONTEXT='{ + "scope": { + "slug": "my-app" + }, + "alb_name": "k8s-test-alb" + }' +} + +teardown() { + unset CONTEXT +} + +# ============================================================================= +# Success Cases +# ============================================================================= +@test "rollback_failed_ingress: deletes main ingress" { + run bash -c " + DELETED_INGRESSES=() + kubectl() { + case \"\$1\" in + delete) + DELETED_INGRESSES+=(\"\$3\") + return 0 + ;; + get) + echo '' + return 0 + ;; + esac + return 0 + } + export -f kubectl + export K8S_NAMESPACE='$K8S_NAMESPACE' SCOPE_ID='$SCOPE_ID' INGRESS_VISIBILITY='$INGRESS_VISIBILITY' + export DNS_TYPE='$DNS_TYPE' ALB_ROLLBACK_ON_RECONCILIATION_FAILURE='$ALB_ROLLBACK_ON_RECONCILIATION_FAILURE' + export CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../rollback_failed_ingress' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "Rolling back ingress" + assert_contains "$output" "k-8-s-my-app-scope-123-internet-facing" + assert_contains "$output" "Deleted ingress" + assert_contains "$output" "Rollback complete" +} + +@test 
"rollback_failed_ingress: deletes additional port ingresses" { + run bash -c " + kubectl() { + case \"\$1\" in + delete) + echo \"deleted \$3\" + return 0 + ;; + get) + if [[ \"\$*\" == *\"-l\"* ]]; then + echo 'k-8-s-my-app-scope-123-http-8081-internet-facing k-8-s-my-app-scope-123-grpc-9090-internet-facing' + return 0 + fi + echo '{\"metadata\": {\"resourceVersion\": \"12345\"}}' + return 0 + ;; + esac + return 0 + } + export -f kubectl + export K8S_NAMESPACE='$K8S_NAMESPACE' SCOPE_ID='$SCOPE_ID' INGRESS_VISIBILITY='$INGRESS_VISIBILITY' + export DNS_TYPE='$DNS_TYPE' ALB_ROLLBACK_ON_RECONCILIATION_FAILURE='$ALB_ROLLBACK_ON_RECONCILIATION_FAILURE' + export CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../rollback_failed_ingress' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "Deleted ingress: k-8-s-my-app-scope-123-internet-facing" + assert_contains "$output" "Deleted additional port ingress: k-8-s-my-app-scope-123-http-8081-internet-facing" + assert_contains "$output" "Deleted additional port ingress: k-8-s-my-app-scope-123-grpc-9090-internet-facing" +} + +# ============================================================================= +# Skip Cases +# ============================================================================= +@test "rollback_failed_ingress: skips when disabled" { + run bash -c " + kubectl() { echo 'should not be called'; return 1; } + export -f kubectl + export K8S_NAMESPACE='$K8S_NAMESPACE' SCOPE_ID='$SCOPE_ID' INGRESS_VISIBILITY='$INGRESS_VISIBILITY' + export DNS_TYPE='$DNS_TYPE' ALB_ROLLBACK_ON_RECONCILIATION_FAILURE='false' + export CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../rollback_failed_ingress' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "Ingress rollback disabled" + [[ "$output" != *"Rolling back ingress"* ]] +} + +@test "rollback_failed_ingress: skips for non-route53 DNS types" { + run bash -c " + kubectl() { echo 'should not be called'; return 1; } + export -f kubectl + export 
K8S_NAMESPACE='$K8S_NAMESPACE' SCOPE_ID='$SCOPE_ID' INGRESS_VISIBILITY='$INGRESS_VISIBILITY' + export DNS_TYPE='azure' ALB_ROLLBACK_ON_RECONCILIATION_FAILURE='true' + export CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../rollback_failed_ingress' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "ingress rollback only applies to route53" + [[ "$output" != *"Rolling back ingress"* ]] +} + +@test "rollback_failed_ingress: skips for external_dns DNS type" { + run bash -c " + kubectl() { echo 'should not be called'; return 1; } + export -f kubectl + export K8S_NAMESPACE='$K8S_NAMESPACE' SCOPE_ID='$SCOPE_ID' INGRESS_VISIBILITY='$INGRESS_VISIBILITY' + export DNS_TYPE='external_dns' ALB_ROLLBACK_ON_RECONCILIATION_FAILURE='true' + export CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../rollback_failed_ingress' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "ingress rollback only applies to route53" +} + +# ============================================================================= +# Resilience Cases +# ============================================================================= +@test "rollback_failed_ingress: handles missing ingress gracefully" { + run bash -c " + kubectl() { + case \"\$1\" in + delete) + return 0 + ;; + get) + echo '' + return 0 + ;; + esac + return 0 + } + export -f kubectl + export K8S_NAMESPACE='$K8S_NAMESPACE' SCOPE_ID='$SCOPE_ID' INGRESS_VISIBILITY='$INGRESS_VISIBILITY' + export DNS_TYPE='$DNS_TYPE' ALB_ROLLBACK_ON_RECONCILIATION_FAILURE='$ALB_ROLLBACK_ON_RECONCILIATION_FAILURE' + export CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../rollback_failed_ingress' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "Rollback complete" +} + +@test "rollback_failed_ingress: continues when kubectl delete fails" { + run bash -c " + kubectl() { + case \"\$1\" in + delete) + return 1 + ;; + get) + echo 'extra-ingress' + return 0 + ;; + esac + return 0 + } + export -f kubectl + export K8S_NAMESPACE='$K8S_NAMESPACE' 
SCOPE_ID='$SCOPE_ID' INGRESS_VISIBILITY='$INGRESS_VISIBILITY' + export DNS_TYPE='$DNS_TYPE' ALB_ROLLBACK_ON_RECONCILIATION_FAILURE='$ALB_ROLLBACK_ON_RECONCILIATION_FAILURE' + export CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../rollback_failed_ingress' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "Could not delete ingress" + assert_contains "$output" "Rollback complete" +} diff --git a/k8s/deployment/tests/validate_alb_target_group_capacity.bats b/k8s/deployment/tests/validate_alb_target_group_capacity.bats index 08d1f28c..86f0eca4 100644 --- a/k8s/deployment/tests/validate_alb_target_group_capacity.bats +++ b/k8s/deployment/tests/validate_alb_target_group_capacity.bats @@ -19,7 +19,8 @@ setup() { # Base CONTEXT export CONTEXT='{ - "providers": {} + "providers": {}, + "deployment": {"strategy": "rolling"} }' # Mock aws - default: ALB with 40 target groups @@ -50,8 +51,8 @@ teardown() { assert_equal "$status" "0" assert_contains "$output" "🔍 Validating ALB target group capacity for 'k8s-nullplatform-internet-facing'..." 
- assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups (max: 98)" - assert_contains "$output" "✅ ALB target group capacity validated: 40/98" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups, this deployment would add ~1 (projected: 41, max: 98)" + assert_contains "$output" "✅ ALB target group capacity validated: 41/98 (current: 40, new: ~1)" } @test "validate_alb_target_group_capacity: displays debug info" { @@ -85,7 +86,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached target group capacity: 98/98" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed target group capacity: 98 current + 1 new = 99/98" assert_contains "$output" "💡 Possible causes:" assert_contains "$output" "Too many services or deployments are attached to this ALB" assert_contains "$output" "🔧 How to fix:" @@ -113,7 +114,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached target group capacity: 100/98" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed target group capacity: 100 current + 1 new = 101/98" } # ============================================================================= @@ -125,7 +126,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups (max: 98)" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups, this deployment would add ~1 (projected: 41, max: 98)" } @test "validate_alb_target_group_capacity: ALB_MAX_TARGET_GROUPS from env var" { @@ -134,47 +135,47 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 
'k8s-nullplatform-internet-facing' has reached target group capacity: 40/30" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed target group capacity: 40 current + 1 new = 41/30" } @test "validate_alb_target_group_capacity: ALB_MAX_TARGET_GROUPS from scope-configurations provider" { - export CONTEXT='{"providers":{"scope-configurations":{"networking":{"alb_max_target_groups":"30"}}}}' + export CONTEXT='{"providers":{"scope-configurations":{"networking":{"alb_max_target_groups":"30"}}},"deployment":{"strategy":"rolling"}}' export ALB_MAX_TARGET_GROUPS="98" run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached target group capacity: 40/30" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed target group capacity: 40 current + 1 new = 41/30" } @test "validate_alb_target_group_capacity: ALB_MAX_TARGET_GROUPS from container-orchestration provider" { - export CONTEXT='{"providers":{"container-orchestration":{"balancer":{"alb_max_target_groups":"30"}}}}' + export CONTEXT='{"providers":{"container-orchestration":{"balancer":{"alb_max_target_groups":"30"}}},"deployment":{"strategy":"rolling"}}' export ALB_MAX_TARGET_GROUPS="98" run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached target group capacity: 40/30" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed target group capacity: 40 current + 1 new = 41/30" } @test "validate_alb_target_group_capacity: scope-configurations takes priority over container-orchestration" { - export CONTEXT='{"providers":{"scope-configurations":{"networking":{"alb_max_target_groups":"100"}},"container-orchestration":{"balancer":{"alb_max_target_groups":"30"}}}}' + export 
CONTEXT='{"providers":{"scope-configurations":{"networking":{"alb_max_target_groups":"100"}},"container-orchestration":{"balancer":{"alb_max_target_groups":"30"}}},"deployment":{"strategy":"rolling"}}' run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups (max: 100)" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups, this deployment would add ~1 (projected: 41, max: 100)" } @test "validate_alb_target_group_capacity: provider takes priority over env var" { - export CONTEXT='{"providers":{"scope-configurations":{"networking":{"alb_max_target_groups":"100"}}}}' + export CONTEXT='{"providers":{"scope-configurations":{"networking":{"alb_max_target_groups":"100"}}},"deployment":{"strategy":"rolling"}}' export ALB_MAX_TARGET_GROUPS="30" run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups (max: 100)" - assert_contains "$output" "✅ ALB target group capacity validated: 40/100" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups, this deployment would add ~1 (projected: 41, max: 100)" + assert_contains "$output" "✅ ALB target group capacity validated: 41/100 (current: 40, new: ~1)" } # ============================================================================= @@ -265,8 +266,8 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 0 target groups (max: 98)" - assert_contains "$output" "✅ ALB target group capacity validated: 0/98" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 0 target groups, this deployment would add ~1 (projected: 1, max: 98)" + assert_contains "$output" "✅ ALB target group capacity validated: 1/98 (current: 0, new: ~1)" } @test "validate_alb_target_group_capacity: 
passes at exactly one below capacity" { @@ -277,7 +278,7 @@ teardown() { return 0 ;; *"describe-target-groups"*) - echo "97" + echo "96" return 0 ;; esac @@ -287,7 +288,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "✅ ALB target group capacity validated: 97/98" + assert_contains "$output" "✅ ALB target group capacity validated: 97/98 (current: 96, new: ~1)" } @test "validate_alb_target_group_capacity: fails when target group count is non-numeric" { diff --git a/k8s/deployment/validate_alb_target_group_capacity b/k8s/deployment/validate_alb_target_group_capacity index 9b3fc8de..8c3b8dcd 100755 --- a/k8s/deployment/validate_alb_target_group_capacity +++ b/k8s/deployment/validate_alb_target_group_capacity @@ -94,10 +94,22 @@ if ! [[ "$TARGET_GROUP_COUNT" =~ ^[0-9]+$ ]]; then exit 1 fi -log info "📋 ALB '$ALB_NAME' has $TARGET_GROUP_COUNT target groups (max: $ALB_MAX_TARGET_GROUPS)" +# Estimate target groups this deployment will add +ADDITIONAL_PORT_COUNT=$(echo "$CONTEXT" | jq '.scope.capabilities.additional_ports // [] | length') +DEPLOYMENT_STRATEGY=$(echo "$CONTEXT" | jq -r '.deployment.strategy // "initial"') +if [[ "$DEPLOYMENT_STRATEGY" == "blue_green" ]]; then + # Blue-green creates TGs for both blue and green backends + ESTIMATED_NEW_TGS=$(( (1 + ADDITIONAL_PORT_COUNT) * 2 )) +else + ESTIMATED_NEW_TGS=$(( 1 + ADDITIONAL_PORT_COUNT )) +fi + +PROJECTED_TOTAL=$((TARGET_GROUP_COUNT + ESTIMATED_NEW_TGS)) + +log info "📋 ALB '$ALB_NAME' has $TARGET_GROUP_COUNT target groups, this deployment would add ~$ESTIMATED_NEW_TGS (projected: $PROJECTED_TOTAL, max: $ALB_MAX_TARGET_GROUPS)" -if [[ "$TARGET_GROUP_COUNT" -ge "$ALB_MAX_TARGET_GROUPS" ]]; then - log error "❌ ALB '$ALB_NAME' has reached target group capacity: $TARGET_GROUP_COUNT/$ALB_MAX_TARGET_GROUPS" +if [[ "$PROJECTED_TOTAL" -ge "$ALB_MAX_TARGET_GROUPS" ]]; then + log error "❌ ALB '$ALB_NAME' would exceed target group capacity: $TARGET_GROUP_COUNT current + 
$ESTIMATED_NEW_TGS new = $PROJECTED_TOTAL/$ALB_MAX_TARGET_GROUPS" log error "" log error "💡 Possible causes:" log error " Too many services or deployments are attached to this ALB" @@ -111,4 +123,4 @@ if [[ "$TARGET_GROUP_COUNT" -ge "$ALB_MAX_TARGET_GROUPS" ]]; then exit 1 fi -log info "✅ ALB target group capacity validated: $TARGET_GROUP_COUNT/$ALB_MAX_TARGET_GROUPS" +log info "✅ ALB target group capacity validated: $PROJECTED_TOTAL/$ALB_MAX_TARGET_GROUPS (current: $TARGET_GROUP_COUNT, new: ~$ESTIMATED_NEW_TGS)" diff --git a/k8s/deployment/verify_ingress_reconciliation b/k8s/deployment/verify_ingress_reconciliation index ee9f3221..35930b24 100644 --- a/k8s/deployment/verify_ingress_reconciliation +++ b/k8s/deployment/verify_ingress_reconciliation @@ -1,5 +1,11 @@ #!/bin/bash +handle_reconciliation_failure() { + if [[ -f "$SERVICE_PATH/deployment/rollback_failed_ingress" ]]; then + source "$SERVICE_PATH/deployment/rollback_failed_ingress" + fi + exit 1 +} SCOPE_SLUG=$(echo "$CONTEXT" | jq -r .scope.slug) ALB_NAME=$(echo "$CONTEXT" | jq -r .alb_name) @@ -246,14 +252,14 @@ while [ $elapsed -lt $MAX_WAIT_SECONDS ]; do log error "🔧 How to fix:" log error " - Verify hostname matches certificate in ACM" log error " - Ensure certificate includes exact hostname or matching wildcard" - exit 1 + handle_reconciliation_failure fi if [ "$event_type" == "Error" ]; then log error "❌ Ingress reconciliation failed" log error "💡 Error messages:" echo "$relevant_events" | jq -r '.[] | " - \(.message)"' >&2 - exit 1 + handle_reconciliation_failure fi if [ "$event_type" == "Warning" ]; then @@ -281,4 +287,4 @@ events_json=$(kubectl get events -n "$K8S_NAMESPACE" \ -o json) echo "$events_json" | jq -r '.items | sort_by(.lastTimestamp) | .[] | " [\(.type)] \(.reason): \(.message)"' | tail -10 -exit 1 +handle_reconciliation_failure diff --git a/k8s/deployment/workflows/initial.yaml b/k8s/deployment/workflows/initial.yaml index b7bc8134..ad76c3f8 100644 --- 
a/k8s/deployment/workflows/initial.yaml +++ b/k8s/deployment/workflows/initial.yaml @@ -28,6 +28,9 @@ steps: type: environment - name: BLUE_DEPLOYMENT_ID type: environment + - name: validate alb capacity + type: script + file: "$SERVICE_PATH/scope/validate_alb_capacity" - name: validate alb target group capacity type: script file: "$SERVICE_PATH/deployment/validate_alb_target_group_capacity" diff --git a/k8s/scope/build_context b/k8s/scope/build_context index 8328eab6..211a226b 100755 --- a/k8s/scope/build_context +++ b/k8s/scope/build_context @@ -49,6 +49,12 @@ ALB_RECONCILIATION_ENABLED=$(get_config_value \ --default "false" ) +ALB_ROLLBACK_ON_RECONCILIATION_FAILURE=$(get_config_value \ + --env ALB_ROLLBACK_ON_RECONCILIATION_FAILURE \ + --provider '.providers["scope-configurations"].networking.alb_rollback_on_reconciliation_failure' \ + --default "true" +) + DEPLOYMENT_MAX_WAIT_IN_SECONDS=$(get_config_value \ --env DEPLOYMENT_MAX_WAIT_IN_SECONDS \ --provider '.providers["scope-configurations"].deployment.deployment_max_wait_seconds' \ @@ -102,6 +108,7 @@ export HOSTED_ZONE_RG export AZURE_SUBSCRIPTION_ID export RESOURCE_GROUP export ALB_RECONCILIATION_ENABLED +export ALB_ROLLBACK_ON_RECONCILIATION_FAILURE export DEPLOYMENT_MAX_WAIT_IN_SECONDS export MANIFEST_BACKUP export VAULT_ADDR diff --git a/k8s/scope/tests/validate_alb_capacity.bats b/k8s/scope/tests/validate_alb_capacity.bats index af08defd..7fac9009 100644 --- a/k8s/scope/tests/validate_alb_capacity.bats +++ b/k8s/scope/tests/validate_alb_capacity.bats @@ -22,7 +22,7 @@ setup() { "providers": {} }' - # Mock aws - default: ALB with 2 listeners, 30 rules each + # Mock aws - default: ALB with HTTPS (443) listener, 50 rules aws() { case "$*" in *"describe-load-balancers"*) @@ -30,11 +30,11 @@ setup() { return 0 ;; *"describe-listeners"*) - echo "arn:aws:elasticloadbalancing:us-east-1:123456789:listener/app/k8s-nullplatform-internet-facing/abc123/listener1 
arn:aws:elasticloadbalancing:us-east-1:123456789:listener/app/k8s-nullplatform-internet-facing/abc123/listener2" + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:listener/app/k8s-nullplatform-internet-facing/abc123/listener1" return 0 ;; *"describe-rules"*) - echo "30" + echo "50" return 0 ;; esac @@ -54,8 +54,8 @@ teardown() { assert_equal "$status" "0" assert_contains "$output" "🔍 Validating ALB capacity for 'k8s-nullplatform-internet-facing'..." - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 60 rules (max capacity: 75)" - assert_contains "$output" "✅ ALB capacity validated: 60/75 rules" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' HTTPS listener has 50 rules, this scope would add ~1 (projected: 51, max: 75)" + assert_contains "$output" "✅ ALB capacity validated: 51/75 rules (current: 50, new: ~1)" } @test "validate_alb_capacity: displays debug info" { @@ -90,8 +90,8 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 10 rules (max capacity: 75)" - assert_contains "$output" "✅ ALB capacity validated: 10/75 rules" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' HTTPS listener has 10 rules, this scope would add ~1 (projected: 11, max: 75)" + assert_contains "$output" "✅ ALB capacity validated: 11/75 rules (current: 10, new: ~1)" } # ============================================================================= @@ -119,12 +119,12 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached capacity: 75/75 rules" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed capacity: 75 current + 1 new = 76/75 rules" assert_contains "$output" "💡 Possible causes:" assert_contains "$output" "Too many scopes or ingress rules are configured on this ALB" assert_contains "$output" "🔧 
How to fix:" assert_contains "$output" "Remove unused scopes or ingress rules from the ALB" - assert_contains "$output" "Increase ALB_MAX_CAPACITY in values.yaml or container-orchestration provider (AWS limit is 100 per listener)" + assert_contains "$output" "Increase ALB_MAX_CAPACITY in values.yaml or scope-configurations provider (AWS limit is 100 per listener)" assert_contains "$output" "Request an AWS service quota increase for rules per ALB listener" assert_contains "$output" "Consider using a separate ALB for additional scopes" } @@ -151,7 +151,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached capacity: 90/75 rules" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed capacity: 90 current + 1 new = 91/75 rules" } # ============================================================================= @@ -163,7 +163,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 60 rules (max capacity: 75)" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' HTTPS listener has 50 rules, this scope would add ~1 (projected: 51, max: 75)" } @test "validate_alb_capacity: ALB_MAX_CAPACITY from env var" { @@ -172,7 +172,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached capacity: 60/50 rules" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed capacity: 50 current + 1 new = 51/50 rules" } @test "validate_alb_capacity: ALB_MAX_CAPACITY from scope-configurations provider" { @@ -182,7 +182,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached capacity: 60/50 rules" + assert_contains "$output" "❌ ALB 
'k8s-nullplatform-internet-facing' would exceed capacity: 50 current + 1 new = 51/50 rules" } @test "validate_alb_capacity: provider takes priority over env var" { @@ -192,8 +192,8 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 60 rules (max capacity: 100)" - assert_contains "$output" "✅ ALB capacity validated: 60/100 rules" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' HTTPS listener has 50 rules, this scope would add ~1 (projected: 51, max: 100)" + assert_contains "$output" "✅ ALB capacity validated: 51/100 rules (current: 50, new: ~1)" } @test "validate_alb_capacity: ALB_MAX_CAPACITY from container-orchestration provider" { @@ -203,7 +203,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' has reached capacity: 60/50 rules" + assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed capacity: 50 current + 1 new = 51/50 rules" } @test "validate_alb_capacity: scope-configurations takes priority over container-orchestration" { @@ -212,7 +212,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 60 rules (max capacity: 100)" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' HTTPS listener has 50 rules, this scope would add ~1 (projected: 51, max: 100)" } # ============================================================================= @@ -305,7 +305,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "⚠️ No listeners found on ALB 'k8s-nullplatform-internet-facing', skipping capacity check" + assert_contains "$output" "⚠️ No HTTPS (443) listener found on ALB 'k8s-nullplatform-internet-facing', skipping capacity check" } @test "validate_alb_capacity: fails when 
describe-rules fails" { @@ -330,7 +330,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ Failed to describe rules for listener" + assert_contains "$output" "❌ Failed to describe rules for HTTPS listener" assert_contains "$output" "📋 Listener ARN: arn:aws:elasticloadbalancing:us-east-1:123456789:listener/app/alb/abc123/listener1" assert_contains "$output" "💡 Possible causes:" assert_contains "$output" "The agent may lack permissions to describe rules" @@ -363,8 +363,8 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 0 rules (max capacity: 75)" - assert_contains "$output" "✅ ALB capacity validated: 0/75 rules" + assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' HTTPS listener has 0 rules, this scope would add ~1 (projected: 1, max: 75)" + assert_contains "$output" "✅ ALB capacity validated: 1/75 rules (current: 0, new: ~1)" } @test "validate_alb_capacity: passes at exactly one below capacity" { @@ -379,7 +379,7 @@ teardown() { return 0 ;; *"describe-rules"*) - echo "74" + echo "73" return 0 ;; esac @@ -389,7 +389,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "0" - assert_contains "$output" "✅ ALB capacity validated: 74/75 rules" + assert_contains "$output" "✅ ALB capacity validated: 74/75 rules (current: 73, new: ~1)" } @test "validate_alb_capacity: fails when rule count is non-numeric" { @@ -414,7 +414,7 @@ teardown() { run bash -c 'source "$SCRIPT"' assert_equal "$status" "1" - assert_contains "$output" "❌ Unexpected non-numeric rule count from listener" + assert_contains "$output" "❌ Unexpected non-numeric rule count from HTTPS listener" assert_contains "$output" "📋 Listener ARN: arn:aws:elasticloadbalancing:us-east-1:123456789:listener/app/alb/abc123/listener1" assert_contains "$output" "📋 Received value: WARNING: something unexpected" assert_contains "$output" "💡 
Possible causes:" diff --git a/k8s/scope/validate_alb_capacity b/k8s/scope/validate_alb_capacity index fc5eb50e..b1551f89 100755 --- a/k8s/scope/validate_alb_capacity +++ b/k8s/scope/validate_alb_capacity @@ -60,10 +60,11 @@ fi log debug "📋 ALB ARN: $ALB_ARN" -LISTENER_ARNS=$(aws elbv2 describe-listeners \ +# Count rules on the HTTPS (443) listener only — AWS limit is per-listener, not per-ALB +HTTPS_LISTENER_ARN=$(aws elbv2 describe-listeners \ --load-balancer-arn "$ALB_ARN" \ --region "$REGION" \ - --query 'Listeners[].ListenerArn' \ + --query 'Listeners[?Port==`443`].ListenerArn | [0]' \ --output text \ --no-paginate 2>&1) || { log error "❌ Failed to describe listeners for ALB '$ALB_NAME'" @@ -77,66 +78,67 @@ LISTENER_ARNS=$(aws elbv2 describe-listeners \ exit 1 } -if [[ -z "$LISTENER_ARNS" ]] || [[ "$LISTENER_ARNS" == "None" ]]; then - log warn "⚠️ No listeners found on ALB '$ALB_NAME', skipping capacity check" +if [[ -z "$HTTPS_LISTENER_ARN" ]] || [[ "$HTTPS_LISTENER_ARN" == "None" ]]; then + log warn "⚠️ No HTTPS (443) listener found on ALB '$ALB_NAME', skipping capacity check" return 0 fi -# Count rules across all listeners (excluding default rules) -TOTAL_RULES=0 - -for LISTENER_ARN in $LISTENER_ARNS; do - RULE_COUNT=$(aws elbv2 describe-rules \ - --listener-arn "$LISTENER_ARN" \ - --region "$REGION" \ - --query 'length(Rules[?!IsDefault])' \ - --output text \ - --no-paginate 2>&1) || { - log error "❌ Failed to describe rules for listener" - log error "📋 Listener ARN: $LISTENER_ARN" - log error "" - log error "💡 Possible causes:" - log error " The agent may lack permissions to describe rules" - log error "" - log error "🔧 How to fix:" - log error " • Check IAM permissions for elbv2:DescribeRules" - log error "" - exit 1 - } - - if ! 
[[ "$RULE_COUNT" =~ ^[0-9]+$ ]]; then - log error "❌ Unexpected non-numeric rule count from listener" - log error "📋 Listener ARN: $LISTENER_ARN" - log error "📋 Received value: $RULE_COUNT" - log error "" - log error "💡 Possible causes:" - log error " The AWS CLI returned an unexpected response format" - log error "" - log error "🔧 How to fix:" - log error " • Verify AWS CLI version and credentials are correct" - log error " • Run manually: aws elbv2 describe-rules --listener-arn $LISTENER_ARN --region $REGION --query 'length(Rules[?!IsDefault])'" - log error "" - exit 1 - fi - - TOTAL_RULES=$((TOTAL_RULES + RULE_COUNT)) -done - -log info "📋 ALB '$ALB_NAME' has $TOTAL_RULES rules (max capacity: $ALB_MAX_CAPACITY)" - -if [[ "$TOTAL_RULES" -ge "$ALB_MAX_CAPACITY" ]]; then - log error "❌ ALB '$ALB_NAME' has reached capacity: $TOTAL_RULES/$ALB_MAX_CAPACITY rules" +TOTAL_RULES=$(aws elbv2 describe-rules \ + --listener-arn "$HTTPS_LISTENER_ARN" \ + --region "$REGION" \ + --query 'length(Rules[?!IsDefault])' \ + --output text \ + --no-paginate 2>&1) || { + log error "❌ Failed to describe rules for HTTPS listener" + log error "📋 Listener ARN: $HTTPS_LISTENER_ARN" + log error "" + log error "💡 Possible causes:" + log error " The agent may lack permissions to describe rules" + log error "" + log error "🔧 How to fix:" + log error " • Check IAM permissions for elbv2:DescribeRules" + log error "" + exit 1 +} + +if ! 
[[ "$TOTAL_RULES" =~ ^[0-9]+$ ]]; then + log error "❌ Unexpected non-numeric rule count from HTTPS listener" + log error "📋 Listener ARN: $HTTPS_LISTENER_ARN" + log error "📋 Received value: $TOTAL_RULES" + log error "" + log error "💡 Possible causes:" + log error " The AWS CLI returned an unexpected response format" + log error "" + log error "🔧 How to fix:" + log error " • Verify AWS CLI version and credentials are correct" + log error " • Run manually: aws elbv2 describe-rules --listener-arn $HTTPS_LISTENER_ARN --region $REGION --query 'length(Rules[?!IsDefault])'" + log error "" + exit 1 +fi + +# Estimate rules this scope will add +DOMAIN_COUNT=$(echo "$CONTEXT" | jq '[.scope.domain] + [.scope.domains[]?.name // empty] | length') +ADDITIONAL_PORT_COUNT=$(echo "$CONTEXT" | jq '.scope.capabilities.additional_ports // [] | length') +# Each domain creates 1 rule on the primary ingress; additional ports create separate ingresses with the same domain rules +ESTIMATED_NEW_RULES=$(( DOMAIN_COUNT * (1 + ADDITIONAL_PORT_COUNT) )) + +PROJECTED_TOTAL=$((TOTAL_RULES + ESTIMATED_NEW_RULES)) + +log info "📋 ALB '$ALB_NAME' HTTPS listener has $TOTAL_RULES rules, this scope would add ~$ESTIMATED_NEW_RULES (projected: $PROJECTED_TOTAL, max: $ALB_MAX_CAPACITY)" + +if [[ "$PROJECTED_TOTAL" -ge "$ALB_MAX_CAPACITY" ]]; then + log error "❌ ALB '$ALB_NAME' would exceed capacity: $TOTAL_RULES current + $ESTIMATED_NEW_RULES new = $PROJECTED_TOTAL/$ALB_MAX_CAPACITY rules" log error "" log error "💡 Possible causes:" log error " Too many scopes or ingress rules are configured on this ALB" log error "" log error "🔧 How to fix:" log error " • Remove unused scopes or ingress rules from the ALB" - log error " • Increase ALB_MAX_CAPACITY in values.yaml or container-orchestration provider (AWS limit is 100 per listener)" + log error " • Increase ALB_MAX_CAPACITY in values.yaml or scope-configurations provider (AWS limit is 100 per listener)" log error " • Request an AWS service quota increase for 
rules per ALB listener" log error " • Consider using a separate ALB for additional scopes" log error "" exit 1 fi -log info "✅ ALB capacity validated: $TOTAL_RULES/$ALB_MAX_CAPACITY rules" +log info "✅ ALB capacity validated: $PROJECTED_TOTAL/$ALB_MAX_CAPACITY rules (current: $TOTAL_RULES, new: ~$ESTIMATED_NEW_RULES)" diff --git a/k8s/values.yaml b/k8s/values.yaml index 020b6059..c2d3da15 100644 --- a/k8s/values.yaml +++ b/k8s/values.yaml @@ -11,7 +11,8 @@ configuration: DNS_TYPE: route53 # Available values route53 | azure | external_dns ALB_RECONCILIATION_ENABLED: false ALB_MAX_CAPACITY: 75 - ALB_MAX_TARGET_GROUPS: 98 + ALB_MAX_TARGET_GROUPS: 90 + ALB_ROLLBACK_ON_RECONCILIATION_FAILURE: true ALB_METRICS_PUBLISH_ENABLED: false # ALB_METRICS_PUBLISH_TARGET: cloudwatch # Available values: cloudwatch | datadog DEPLOYMENT_MAX_WAIT_IN_SECONDS: 600 From f32bc11804b971368559569974080f8570d9f0d5 Mon Sep 17 00:00:00 2001 From: Pablo Vilas Date: Wed, 15 Apr 2026 14:14:46 -0300 Subject: [PATCH 2/5] refactor(alb): combine rule and TG capacity checks into single deployment step Merge validate_alb_capacity and validate_alb_target_group_capacity into a single deployment/validate_alb_capacity script that checks both in one pass, sharing the ALB ARN lookup and DNS_TYPE guard. The scope-level validate_alb_capacity (rules only) remains for create.yaml where no deployment context exists. 
--- .../tests/validate_alb_capacity.bats | 288 ++++++++++++++++++ k8s/deployment/validate_alb_capacity | 228 ++++++++++++++ k8s/deployment/workflows/initial.yaml | 5 +- 3 files changed, 517 insertions(+), 4 deletions(-) create mode 100644 k8s/deployment/tests/validate_alb_capacity.bats create mode 100755 k8s/deployment/validate_alb_capacity diff --git a/k8s/deployment/tests/validate_alb_capacity.bats b/k8s/deployment/tests/validate_alb_capacity.bats new file mode 100644 index 00000000..3401f922 --- /dev/null +++ b/k8s/deployment/tests/validate_alb_capacity.bats @@ -0,0 +1,288 @@ +#!/usr/bin/env bats +# ============================================================================= +# Unit tests for deployment/validate_alb_capacity - combined rule + TG check +# ============================================================================= + +setup() { + export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." && pwd)" + source "$PROJECT_ROOT/testing/assertions.sh" + log() { if [ "$1" = "error" ]; then echo "$2" >&2; else echo "$2"; fi; } + export -f log + + export ALB_NAME="k8s-nullplatform-internet-facing" + export REGION="us-east-1" + export DNS_TYPE="route53" + + export CONTEXT='{ + "scope": { + "slug": "my-app", + "domain": "app.example.com", + "domains": [], + "capabilities": {} + }, + "alb_name": "k8s-nullplatform-internet-facing", + "deployment": { + "strategy": "rolling" + } + }' + + # Default mock: ALB with HTTPS listener, 50 rules, 40 TGs + get_config_value() { + local default_val="" + while [[ $# -gt 0 ]]; do + case "$1" in + --default) default_val="$2"; shift 2 ;; + *) shift 2 ;; + esac + done + echo "$default_val" + } + export -f get_config_value + + aws() { + case "$*" in + *describe-load-balancers*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" + return 0 + ;; + *describe-listeners*) + echo "arn:aws:elasticloadbalancing:us-east-1:123456789:listener/app/alb/abc123/listener1" + return 0 + ;; + *describe-rules*) + echo 
"50" + return 0 + ;; + *describe-target-groups*) + echo "40" + return 0 + ;; + esac + return 0 + } + export -f aws +} + +teardown() { + unset CONTEXT +} + +# ============================================================================= +# Success Cases +# ============================================================================= +@test "validate_alb_capacity: passes when both rules and TGs are under capacity" { + run bash -c " + $(declare -f aws get_config_value log) + export ALB_NAME='$ALB_NAME' REGION='$REGION' DNS_TYPE='$DNS_TYPE' CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../validate_alb_capacity' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "Rule capacity OK" + assert_contains "$output" "Target group capacity OK" +} + +@test "validate_alb_capacity: skips for non-route53 DNS types" { + run bash -c " + $(declare -f aws get_config_value log) + export ALB_NAME='$ALB_NAME' REGION='$REGION' DNS_TYPE='azure' CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../validate_alb_capacity' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "ALB capacity validation only applies to route53" +} + +# ============================================================================= +# Rule capacity failures +# ============================================================================= +@test "validate_alb_capacity: fails when rules would exceed capacity" { + run bash -c " + aws() { + case \"\$*\" in + *describe-load-balancers*) echo 'arn:aws:elbv2:alb/abc'; return 0 ;; + *describe-listeners*) echo 'arn:aws:elbv2:listener/1'; return 0 ;; + *describe-rules*) echo '74'; return 0 ;; + *describe-target-groups*) echo '40'; return 0 ;; + esac + } + export -f aws + $(declare -f get_config_value log) + export ALB_NAME='$ALB_NAME' REGION='$REGION' DNS_TYPE='$DNS_TYPE' CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../validate_alb_capacity' + " + + [ "$status" -eq 1 ] + assert_contains "$output" "would exceed rule capacity: 74 current + 1 new = 75/75" +} + +@test 
"validate_alb_capacity: estimates rules from domains and additional ports" { + local ctx='{"scope":{"slug":"app","domain":"a.com","domains":[{"name":"b.com"},{"name":"c.com"}],"capabilities":{"additional_ports":[{"type":"HTTP","port":8081}]}},"alb_name":"alb","deployment":{"strategy":"rolling"}}' + + run bash -c " + aws() { + case \"\$*\" in + *describe-load-balancers*) echo 'arn:alb'; return 0 ;; + *describe-listeners*) echo 'arn:listener'; return 0 ;; + *describe-rules*) echo '40'; return 0 ;; + *describe-target-groups*) echo '40'; return 0 ;; + esac + } + export -f aws + $(declare -f get_config_value log) + export ALB_NAME='$ALB_NAME' REGION='$REGION' DNS_TYPE='$DNS_TYPE' CONTEXT='$ctx' + source '$BATS_TEST_DIRNAME/../validate_alb_capacity' + " + + [ "$status" -eq 0 ] + # 3 domains * (1 + 1 additional port) = 6 rules + assert_contains "$output" "this scope would add ~6" +} + +# ============================================================================= +# Target group capacity failures +# ============================================================================= +@test "validate_alb_capacity: fails when TGs would exceed capacity" { + run bash -c " + aws() { + case \"\$*\" in + *describe-load-balancers*) echo 'arn:alb'; return 0 ;; + *describe-listeners*) echo 'arn:listener'; return 0 ;; + *describe-rules*) echo '10'; return 0 ;; + *describe-target-groups*) echo '89'; return 0 ;; + esac + } + export -f aws + $(declare -f get_config_value log) + export ALB_NAME='$ALB_NAME' REGION='$REGION' DNS_TYPE='$DNS_TYPE' CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../validate_alb_capacity' + " + + [ "$status" -eq 1 ] + assert_contains "$output" "Rule capacity OK" + assert_contains "$output" "would exceed target group capacity: 89 current + 1 new = 90/90" +} + +@test "validate_alb_capacity: estimates 2x TGs for blue-green strategy" { + local 
ctx='{"scope":{"slug":"app","domain":"a.com","domains":[],"capabilities":{"additional_ports":[{"type":"HTTP","port":8081}]}},"alb_name":"alb","deployment":{"strategy":"blue_green"}}' + + run bash -c " + aws() { + case \"\$*\" in + *describe-load-balancers*) echo 'arn:alb'; return 0 ;; + *describe-listeners*) echo 'arn:listener'; return 0 ;; + *describe-rules*) echo '10'; return 0 ;; + *describe-target-groups*) echo '10'; return 0 ;; + esac + } + export -f aws + $(declare -f get_config_value log) + export ALB_NAME='$ALB_NAME' REGION='$REGION' DNS_TYPE='$DNS_TYPE' CONTEXT='$ctx' + source '$BATS_TEST_DIRNAME/../validate_alb_capacity' + " + + [ "$status" -eq 0 ] + # (1 + 1 additional port) * 2 = 4 TGs for blue-green + assert_contains "$output" "this deployment would add ~4" +} + +# ============================================================================= +# AWS API failures +# ============================================================================= +@test "validate_alb_capacity: fails when describe-load-balancers fails" { + run bash -c " + aws() { return 1; } + export -f aws + $(declare -f get_config_value log) + export ALB_NAME='$ALB_NAME' REGION='$REGION' DNS_TYPE='$DNS_TYPE' CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../validate_alb_capacity' + " + + [ "$status" -eq 1 ] + assert_contains "$output" "Failed to find load balancer" +} + +@test "validate_alb_capacity: fails when ALB ARN is None" { + run bash -c " + aws() { echo 'None'; return 0; } + export -f aws + $(declare -f get_config_value log) + export ALB_NAME='$ALB_NAME' REGION='$REGION' DNS_TYPE='$DNS_TYPE' CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../validate_alb_capacity' + " + + [ "$status" -eq 1 ] + assert_contains "$output" "not found in region" +} + +@test "validate_alb_capacity: skips rule check when no HTTPS listener found" { + run bash -c " + aws() { + case \"\$*\" in + *describe-load-balancers*) echo 'arn:alb'; return 0 ;; + *describe-listeners*) echo 'None'; return 0 ;; + 
*describe-target-groups*) echo '10'; return 0 ;; + esac + } + export -f aws + $(declare -f get_config_value log) + export ALB_NAME='$ALB_NAME' REGION='$REGION' DNS_TYPE='$DNS_TYPE' CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../validate_alb_capacity' + " + + [ "$status" -eq 0 ] + assert_contains "$output" "No HTTPS (443) listener found" + assert_contains "$output" "Target group capacity OK" +} + +@test "validate_alb_capacity: fails when describe-target-groups fails" { + run bash -c " + aws() { + case \"\$*\" in + *describe-load-balancers*) echo 'arn:alb'; return 0 ;; + *describe-listeners*) echo 'arn:listener'; return 0 ;; + *describe-rules*) echo '10'; return 0 ;; + *describe-target-groups*) return 1 ;; + esac + } + export -f aws + $(declare -f get_config_value log) + export ALB_NAME='$ALB_NAME' REGION='$REGION' DNS_TYPE='$DNS_TYPE' CONTEXT='$CONTEXT' + source '$BATS_TEST_DIRNAME/../validate_alb_capacity' + " + + [ "$status" -eq 1 ] + assert_contains "$output" "Rule capacity OK" + assert_contains "$output" "Failed to describe target groups" +} + +# ============================================================================= +# Config validation +# ============================================================================= +@test "validate_alb_capacity: fails when ALB_MAX_CAPACITY is non-numeric" { + run bash -c " + get_config_value() { + local default_val='' + while [[ \$# -gt 0 ]]; do + case \"\$1\" in + --default) default_val=\"\$2\"; shift 2 ;; + --env) + if [[ \"\$2\" == \"ALB_MAX_CAPACITY\" ]]; then + echo 'abc'; return + fi + shift 2 ;; + *) shift 2 ;; + esac + done + echo \"\$default_val\" + } + export -f get_config_value + $(declare -f aws log) + export ALB_NAME='$ALB_NAME' REGION='$REGION' DNS_TYPE='$DNS_TYPE' CONTEXT='$CONTEXT' + export ALB_MAX_CAPACITY='abc' + source '$BATS_TEST_DIRNAME/../validate_alb_capacity' + " + + [ "$status" -eq 1 ] + assert_contains "$output" "ALB_MAX_CAPACITY must be a numeric value" +} diff --git 
a/k8s/deployment/validate_alb_capacity b/k8s/deployment/validate_alb_capacity new file mode 100755 index 00000000..82a22299 --- /dev/null +++ b/k8s/deployment/validate_alb_capacity @@ -0,0 +1,228 @@ +#!/bin/bash +# Validates ALB capacity (rules + target groups) before applying deployment resources. +# Combines rule and target group checks in a single pass to avoid duplicate AWS API calls. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/../utils/get_config_value" + +if [[ "$DNS_TYPE" != "route53" ]]; then + log debug "📋 DNS type is '$DNS_TYPE', ALB capacity validation only applies to route53, skipping" + return 0 +fi + +# --- Load thresholds --- + +ALB_MAX_CAPACITY=$(get_config_value \ + --env ALB_MAX_CAPACITY \ + --provider '.providers["scope-configurations"].networking.alb_max_capacity' \ + --provider '.providers["container-orchestration"].balancer.alb_capacity_threshold' \ + --default "75" +) + +ALB_MAX_TARGET_GROUPS=$(get_config_value \ + --env ALB_MAX_TARGET_GROUPS \ + --provider '.providers["scope-configurations"].networking.alb_max_target_groups' \ + --provider '.providers["container-orchestration"].balancer.alb_max_target_groups' \ + --default "90" +) + +if ! [[ "$ALB_MAX_CAPACITY" =~ ^[0-9]+$ ]]; then + log error "❌ ALB_MAX_CAPACITY must be a numeric value, got: '$ALB_MAX_CAPACITY'" + log error "" + log error "🔧 How to fix:" + log error " • Set a numeric value in values.yaml or scope-configurations provider" + log error "" + exit 1 +fi + +if ! [[ "$ALB_MAX_TARGET_GROUPS" =~ ^[0-9]+$ ]]; then + log error "❌ ALB_MAX_TARGET_GROUPS must be a numeric value, got: '$ALB_MAX_TARGET_GROUPS'" + log error "" + log error "🔧 How to fix:" + log error " • Set a numeric value in values.yaml or scope-configurations provider" + log error "" + exit 1 +fi + +log info "🔍 Validating ALB capacity for '$ALB_NAME'..." 
+log debug "📋 ALB: $ALB_NAME | Region: $REGION | Max rules: $ALB_MAX_CAPACITY | Max target groups: $ALB_MAX_TARGET_GROUPS" + +# --- Resolve ALB ARN (single API call for both checks) --- + +ALB_ARN=$(aws elbv2 describe-load-balancers \ + --names "$ALB_NAME" \ + --region "$REGION" \ + --query 'LoadBalancers[0].LoadBalancerArn' \ + --output text \ + --no-paginate 2>&1) || { + log error "❌ Failed to find load balancer '$ALB_NAME' in region '$REGION'" + log error "" + log error "💡 Possible causes:" + log error " The load balancer may not exist or the agent lacks permissions" + log error "" + log error "🔧 How to fix:" + log error " • Verify the ALB exists: aws elbv2 describe-load-balancers --names $ALB_NAME --region $REGION" + log error " • Check IAM permissions for elbv2:DescribeLoadBalancers" + log error "" + exit 1 +} + +if [[ -z "$ALB_ARN" ]] || [[ "$ALB_ARN" == "None" ]]; then + log error "❌ Load balancer '$ALB_NAME' not found in region '$REGION'" + log error "" + log error "💡 Possible causes:" + log error " The load balancer name may be incorrect or it was deleted" + log error "" + log error "🔧 How to fix:" + log error " • List available ALBs: aws elbv2 describe-load-balancers --region $REGION" + log error " • Check the balancer name in values.yaml or scope-configurations provider" + log error "" + exit 1 +fi + +log debug "📋 ALB ARN: $ALB_ARN" + +# --- Check 1: Rule capacity (HTTPS 443 listener) --- + +HTTPS_LISTENER_ARN=$(aws elbv2 describe-listeners \ + --load-balancer-arn "$ALB_ARN" \ + --region "$REGION" \ + --query 'Listeners[?Port==`443`].ListenerArn | [0]' \ + --output text \ + --no-paginate 2>&1) || { + log error "❌ Failed to describe listeners for ALB '$ALB_NAME'" + log error "" + log error "💡 Possible causes:" + log error " The agent may lack permissions to describe listeners" + log error "" + log error "🔧 How to fix:" + log error " • Check IAM permissions for elbv2:DescribeListeners" + log error "" + exit 1 +} + +if [[ -z "$HTTPS_LISTENER_ARN" ]] || [[ 
"$HTTPS_LISTENER_ARN" == "None" ]]; then + log warn "⚠️ No HTTPS (443) listener found on ALB '$ALB_NAME', skipping rule capacity check" +else + TOTAL_RULES=$(aws elbv2 describe-rules \ + --listener-arn "$HTTPS_LISTENER_ARN" \ + --region "$REGION" \ + --query 'length(Rules[?!IsDefault])' \ + --output text \ + --no-paginate 2>&1) || { + log error "❌ Failed to describe rules for HTTPS listener" + log error "📋 Listener ARN: $HTTPS_LISTENER_ARN" + log error "" + log error "💡 Possible causes:" + log error " The agent may lack permissions to describe rules" + log error "" + log error "🔧 How to fix:" + log error " • Check IAM permissions for elbv2:DescribeRules" + log error "" + exit 1 + } + + if ! [[ "$TOTAL_RULES" =~ ^[0-9]+$ ]]; then + log error "❌ Unexpected non-numeric rule count from HTTPS listener" + log error "📋 Listener ARN: $HTTPS_LISTENER_ARN" + log error "📋 Received value: $TOTAL_RULES" + log error "" + log error "💡 Possible causes:" + log error " The AWS CLI returned an unexpected response format" + log error "" + log error "🔧 How to fix:" + log error " • Verify AWS CLI version and credentials are correct" + log error " • Run manually: aws elbv2 describe-rules --listener-arn $HTTPS_LISTENER_ARN --region $REGION --query 'length(Rules[?!IsDefault])'" + log error "" + exit 1 + fi + + # Estimate rules this scope will add + DOMAIN_COUNT=$(echo "$CONTEXT" | jq '[.scope.domain] + [.scope.domains[]?.name // empty] | length') + ADDITIONAL_PORT_COUNT=$(echo "$CONTEXT" | jq '.scope.capabilities.additional_ports // [] | length') + ESTIMATED_NEW_RULES=$(( DOMAIN_COUNT * (1 + ADDITIONAL_PORT_COUNT) )) + PROJECTED_RULES=$((TOTAL_RULES + ESTIMATED_NEW_RULES)) + + log info "📋 ALB '$ALB_NAME' HTTPS listener has $TOTAL_RULES rules, this scope would add ~$ESTIMATED_NEW_RULES (projected: $PROJECTED_RULES, max: $ALB_MAX_CAPACITY)" + + if [[ "$PROJECTED_RULES" -ge "$ALB_MAX_CAPACITY" ]]; then + log error "❌ ALB '$ALB_NAME' would exceed rule capacity: $TOTAL_RULES current + 
$ESTIMATED_NEW_RULES new = $PROJECTED_RULES/$ALB_MAX_CAPACITY rules" + log error "" + log error "💡 Possible causes:" + log error " Too many scopes or ingress rules are configured on this ALB" + log error "" + log error "🔧 How to fix:" + log error " • Remove unused scopes or ingress rules from the ALB" + log error " • Increase ALB_MAX_CAPACITY in values.yaml or scope-configurations provider (AWS limit is 100 per listener)" + log error " • Request an AWS service quota increase for rules per ALB listener" + log error " • Consider using a separate ALB for additional scopes" + log error "" + exit 1 + fi + + log info "✅ Rule capacity OK: $PROJECTED_RULES/$ALB_MAX_CAPACITY (current: $TOTAL_RULES, new: ~$ESTIMATED_NEW_RULES)" +fi + +# --- Check 2: Target group capacity --- + +TARGET_GROUP_COUNT=$(aws elbv2 describe-target-groups \ + --load-balancer-arn "$ALB_ARN" \ + --region "$REGION" \ + --query 'length(TargetGroups)' \ + --output text \ + --no-paginate 2>&1) || { + log error "❌ Failed to describe target groups for ALB '$ALB_NAME'" + log error "" + log error "💡 Possible causes:" + log error " The agent may lack permissions to describe target groups" + log error "" + log error "🔧 How to fix:" + log error " • Check IAM permissions for elbv2:DescribeTargetGroups" + log error "" + exit 1 +} + +if ! 
[[ "$TARGET_GROUP_COUNT" =~ ^[0-9]+$ ]]; then + log error "❌ Unexpected non-numeric target group count from ALB" + log error "📋 ALB ARN: $ALB_ARN" + log error "📋 Received value: $TARGET_GROUP_COUNT" + log error "" + log error "💡 Possible causes:" + log error " The AWS CLI returned an unexpected response format" + log error "" + log error "🔧 How to fix:" + log error " • Verify AWS CLI version and credentials are correct" + log error " • Run manually: aws elbv2 describe-target-groups --load-balancer-arn $ALB_ARN --region $REGION --query 'length(TargetGroups)'" + log error "" + exit 1 +fi + +# Estimate target groups this deployment will add +ADDITIONAL_PORT_COUNT=$(echo "$CONTEXT" | jq '.scope.capabilities.additional_ports // [] | length') +DEPLOYMENT_STRATEGY=$(echo "$CONTEXT" | jq -r '.deployment.strategy // "initial"') +if [[ "$DEPLOYMENT_STRATEGY" == "blue_green" ]]; then + ESTIMATED_NEW_TGS=$(( (1 + ADDITIONAL_PORT_COUNT) * 2 )) +else + ESTIMATED_NEW_TGS=$(( 1 + ADDITIONAL_PORT_COUNT )) +fi + +PROJECTED_TGS=$((TARGET_GROUP_COUNT + ESTIMATED_NEW_TGS)) + +log info "📋 ALB '$ALB_NAME' has $TARGET_GROUP_COUNT target groups, this deployment would add ~$ESTIMATED_NEW_TGS (projected: $PROJECTED_TGS, max: $ALB_MAX_TARGET_GROUPS)" + +if [[ "$PROJECTED_TGS" -ge "$ALB_MAX_TARGET_GROUPS" ]]; then + log error "❌ ALB '$ALB_NAME' would exceed target group capacity: $TARGET_GROUP_COUNT current + $ESTIMATED_NEW_TGS new = $PROJECTED_TGS/$ALB_MAX_TARGET_GROUPS" + log error "" + log error "💡 Possible causes:" + log error " Too many services or deployments are attached to this ALB" + log error "" + log error "🔧 How to fix:" + log error " • Remove unused deployments or services from the ALB" + log error " • Increase ALB_MAX_TARGET_GROUPS in values.yaml or scope-configurations provider (AWS limit is 100)" + log error " • Request an AWS service quota increase for target groups per ALB" + log error " • Consider using a separate ALB for additional deployments" + log error "" + exit 1 +fi + 
+log info "✅ Target group capacity OK: $PROJECTED_TGS/$ALB_MAX_TARGET_GROUPS (current: $TARGET_GROUP_COUNT, new: ~$ESTIMATED_NEW_TGS)" diff --git a/k8s/deployment/workflows/initial.yaml b/k8s/deployment/workflows/initial.yaml index ad76c3f8..28056a17 100644 --- a/k8s/deployment/workflows/initial.yaml +++ b/k8s/deployment/workflows/initial.yaml @@ -30,10 +30,7 @@ steps: type: environment - name: validate alb capacity type: script - file: "$SERVICE_PATH/scope/validate_alb_capacity" - - name: validate alb target group capacity - type: script - file: "$SERVICE_PATH/deployment/validate_alb_target_group_capacity" + file: "$SERVICE_PATH/deployment/validate_alb_capacity" - name: route traffic type: script file: "$SERVICE_PATH/deployment/networking/gateway/route_traffic" From 60fdea48ffacf587517c814eccfa67d67ccb9d97 Mon Sep 17 00:00:00 2001 From: Pablo Vilas Date: Wed, 15 Apr 2026 16:06:03 -0300 Subject: [PATCH 3/5] fix(alb): run rollback_failed_ingress in subshell to prevent return 0 escape When rollback_failed_ingress is sourced inside handle_reconciliation_failure, its `return 0` (from skip guards) would exit the enclosing function, skipping the critical `exit 1`. Running in a subshell isolates the return boundary. 
--- k8s/deployment/verify_ingress_reconciliation | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/k8s/deployment/verify_ingress_reconciliation b/k8s/deployment/verify_ingress_reconciliation index 35930b24..edefb1f2 100644 --- a/k8s/deployment/verify_ingress_reconciliation +++ b/k8s/deployment/verify_ingress_reconciliation @@ -2,7 +2,9 @@ handle_reconciliation_failure() { if [[ -f "$SERVICE_PATH/deployment/rollback_failed_ingress" ]]; then - source "$SERVICE_PATH/deployment/rollback_failed_ingress" + # Run in subshell so `return 0` in the sourced script cannot escape + # this function and skip the `exit 1` below + ( source "$SERVICE_PATH/deployment/rollback_failed_ingress" ) fi exit 1 } From 82facbd38201e7414d9e7e68f4c4645c12d276c5 Mon Sep 17 00:00:00 2001 From: Pablo Vilas Date: Wed, 15 Apr 2026 16:09:50 -0300 Subject: [PATCH 4/5] chore(alb): remove dead standalone TG capacity script and document new config vars - Remove validate_alb_target_group_capacity and its tests (superseded by the combined deployment/validate_alb_capacity script) - Document ALB_MAX_CAPACITY, ALB_MAX_TARGET_GROUPS, and ALB_ROLLBACK_ON_RECONCILIATION_FAILURE in k8s/README.md --- k8s/README.md | 3 + .../validate_alb_target_group_capacity.bats | 385 ------------------ .../validate_alb_target_group_capacity | 126 ------ 3 files changed, 3 insertions(+), 511 deletions(-) delete mode 100644 k8s/deployment/tests/validate_alb_target_group_capacity.bats delete mode 100755 k8s/deployment/validate_alb_target_group_capacity diff --git a/k8s/README.md b/k8s/README.md index 59d19980..950fb32d 100644 --- a/k8s/README.md +++ b/k8s/README.md @@ -51,6 +51,9 @@ Configuration specific to AWS Route53 DNS provider. 
Visible only when `dns_type` | **ALB_NAME** (public) | Public Application Load Balancer name | `networking.balancer_public_name` | | **ALB_NAME** (private) | Private Application Load Balancer name | `networking.balancer_private_name` | | **ALB_RECONCILIATION_ENABLED** | Whether ALB reconciliation is enabled | `networking.alb_reconciliation_enabled` | +| **ALB_MAX_CAPACITY** | Maximum number of rules allowed on the HTTPS listener before blocking new scopes (default: 75, AWS limit: 100) | `networking.alb_max_capacity` | +| **ALB_MAX_TARGET_GROUPS** | Maximum number of target groups allowed on the ALB before blocking new deployments (default: 90, AWS limit: 100) | `networking.alb_max_target_groups` | +| **ALB_ROLLBACK_ON_RECONCILIATION_FAILURE** | Whether to automatically delete a broken ingress when ALB reconciliation fails, preventing sync poisoning of the entire ALB group (default: true) | `networking.alb_rollback_on_reconciliation_failure` | #### Azure DNS diff --git a/k8s/deployment/tests/validate_alb_target_group_capacity.bats b/k8s/deployment/tests/validate_alb_target_group_capacity.bats deleted file mode 100644 index 86f0eca4..00000000 --- a/k8s/deployment/tests/validate_alb_target_group_capacity.bats +++ /dev/null @@ -1,385 +0,0 @@ -#!/usr/bin/env bats -# ============================================================================= -# Unit tests for validate_alb_target_group_capacity -# ============================================================================= - -setup() { - export PROJECT_ROOT="$(cd "$BATS_TEST_DIRNAME/../../.." 
&& pwd)" - source "$PROJECT_ROOT/testing/assertions.sh" - log() { if [ "$1" = "error" ]; then echo "$2" >&2; else echo "$2"; fi; } - export -f log - source "$PROJECT_ROOT/k8s/utils/get_config_value" - - export SCRIPT="$PROJECT_ROOT/k8s/deployment/validate_alb_target_group_capacity" - - export ALB_NAME="k8s-nullplatform-internet-facing" - export REGION="us-east-1" - export ALB_MAX_TARGET_GROUPS="98" - export DNS_TYPE="route53" - - # Base CONTEXT - export CONTEXT='{ - "providers": {}, - "deployment": {"strategy": "rolling"} - }' - - # Mock aws - default: ALB with 40 target groups - aws() { - case "$*" in - *"describe-load-balancers"*) - echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/k8s-nullplatform-internet-facing/abc123" - return 0 - ;; - *"describe-target-groups"*) - echo "40" - return 0 - ;; - esac - } - export -f aws -} - -teardown() { - unset -f aws -} - -# ============================================================================= -# Success flow -# ============================================================================= -@test "validate_alb_target_group_capacity: success when under capacity" { - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "0" - assert_contains "$output" "🔍 Validating ALB target group capacity for 'k8s-nullplatform-internet-facing'..." 
- assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups, this deployment would add ~1 (projected: 41, max: 98)" - assert_contains "$output" "✅ ALB target group capacity validated: 41/98 (current: 40, new: ~1)" -} - -@test "validate_alb_target_group_capacity: displays debug info" { - export LOG_LEVEL="debug" - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "0" - assert_contains "$output" "📋 ALB: k8s-nullplatform-internet-facing | Region: us-east-1 | Max target groups: 98" - assert_contains "$output" "📋 ALB ARN: arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/k8s-nullplatform-internet-facing/abc123" -} - -# ============================================================================= -# Capacity exceeded -# ============================================================================= -@test "validate_alb_target_group_capacity: fails when at capacity" { - aws() { - case "$*" in - *"describe-load-balancers"*) - echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" - return 0 - ;; - *"describe-target-groups"*) - echo "98" - return 0 - ;; - esac - } - export -f aws - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed target group capacity: 98 current + 1 new = 99/98" - assert_contains "$output" "💡 Possible causes:" - assert_contains "$output" "Too many services or deployments are attached to this ALB" - assert_contains "$output" "🔧 How to fix:" - assert_contains "$output" "Remove unused deployments or services from the ALB" - assert_contains "$output" "Increase ALB_MAX_TARGET_GROUPS in values.yaml or scope-configurations provider (AWS limit is 100)" - assert_contains "$output" "Request an AWS service quota increase for target groups per ALB" - assert_contains "$output" "Consider using a separate ALB for additional deployments" -} - -@test "validate_alb_target_group_capacity: fails 
when over capacity" { - aws() { - case "$*" in - *"describe-load-balancers"*) - echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" - return 0 - ;; - *"describe-target-groups"*) - echo "100" - return 0 - ;; - esac - } - export -f aws - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed target group capacity: 100 current + 1 new = 101/98" -} - -# ============================================================================= -# Configuration via get_config_value -# ============================================================================= -@test "validate_alb_target_group_capacity: uses default ALB_MAX_TARGET_GROUPS of 98" { - unset ALB_MAX_TARGET_GROUPS - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups, this deployment would add ~1 (projected: 41, max: 98)" -} - -@test "validate_alb_target_group_capacity: ALB_MAX_TARGET_GROUPS from env var" { - export ALB_MAX_TARGET_GROUPS="30" - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed target group capacity: 40 current + 1 new = 41/30" -} - -@test "validate_alb_target_group_capacity: ALB_MAX_TARGET_GROUPS from scope-configurations provider" { - export CONTEXT='{"providers":{"scope-configurations":{"networking":{"alb_max_target_groups":"30"}}},"deployment":{"strategy":"rolling"}}' - export ALB_MAX_TARGET_GROUPS="98" - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed target group capacity: 40 current + 1 new = 41/30" -} - -@test "validate_alb_target_group_capacity: ALB_MAX_TARGET_GROUPS from container-orchestration provider" { - export 
CONTEXT='{"providers":{"container-orchestration":{"balancer":{"alb_max_target_groups":"30"}}},"deployment":{"strategy":"rolling"}}' - export ALB_MAX_TARGET_GROUPS="98" - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "1" - assert_contains "$output" "❌ ALB 'k8s-nullplatform-internet-facing' would exceed target group capacity: 40 current + 1 new = 41/30" -} - -@test "validate_alb_target_group_capacity: scope-configurations takes priority over container-orchestration" { - export CONTEXT='{"providers":{"scope-configurations":{"networking":{"alb_max_target_groups":"100"}},"container-orchestration":{"balancer":{"alb_max_target_groups":"30"}}},"deployment":{"strategy":"rolling"}}' - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups, this deployment would add ~1 (projected: 41, max: 100)" -} - -@test "validate_alb_target_group_capacity: provider takes priority over env var" { - export CONTEXT='{"providers":{"scope-configurations":{"networking":{"alb_max_target_groups":"100"}}},"deployment":{"strategy":"rolling"}}' - export ALB_MAX_TARGET_GROUPS="30" - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 40 target groups, this deployment would add ~1 (projected: 41, max: 100)" - assert_contains "$output" "✅ ALB target group capacity validated: 41/100 (current: 40, new: ~1)" -} - -# ============================================================================= -# AWS API errors -# ============================================================================= -@test "validate_alb_target_group_capacity: fails when describe-load-balancers fails" { - aws() { - case "$*" in - *"describe-load-balancers"*) - echo "An error occurred (LoadBalancerNotFound)" >&2 - return 1 - ;; - esac - } - export -f aws - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "1" - assert_contains 
"$output" "❌ Failed to find load balancer 'k8s-nullplatform-internet-facing' in region 'us-east-1'" - assert_contains "$output" "💡 Possible causes:" - assert_contains "$output" "The load balancer may not exist or the agent lacks permissions" - assert_contains "$output" "🔧 How to fix:" - assert_contains "$output" "Verify the ALB exists: aws elbv2 describe-load-balancers --names k8s-nullplatform-internet-facing --region us-east-1" - assert_contains "$output" "Check IAM permissions for elbv2:DescribeLoadBalancers" -} - -@test "validate_alb_target_group_capacity: fails when ALB ARN is None" { - aws() { - case "$*" in - *"describe-load-balancers"*) - echo "None" - return 0 - ;; - esac - } - export -f aws - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "1" - assert_contains "$output" "❌ Load balancer 'k8s-nullplatform-internet-facing' not found in region 'us-east-1'" -} - -@test "validate_alb_target_group_capacity: fails when describe-target-groups fails" { - aws() { - case "$*" in - *"describe-load-balancers"*) - echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" - return 0 - ;; - *"describe-target-groups"*) - echo "Access Denied" >&2 - return 1 - ;; - esac - } - export -f aws - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "1" - assert_contains "$output" "❌ Failed to describe target groups for ALB 'k8s-nullplatform-internet-facing'" - assert_contains "$output" "💡 Possible causes:" - assert_contains "$output" "The agent may lack permissions to describe target groups" - assert_contains "$output" "🔧 How to fix:" - assert_contains "$output" "Check IAM permissions for elbv2:DescribeTargetGroups" -} - -# ============================================================================= -# Edge cases -# ============================================================================= -@test "validate_alb_target_group_capacity: handles zero target groups" { - aws() { - case "$*" in - *"describe-load-balancers"*) - echo 
"arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" - return 0 - ;; - *"describe-target-groups"*) - echo "0" - return 0 - ;; - esac - } - export -f aws - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "0" - assert_contains "$output" "📋 ALB 'k8s-nullplatform-internet-facing' has 0 target groups, this deployment would add ~1 (projected: 1, max: 98)" - assert_contains "$output" "✅ ALB target group capacity validated: 1/98 (current: 0, new: ~1)" -} - -@test "validate_alb_target_group_capacity: passes at exactly one below capacity" { - aws() { - case "$*" in - *"describe-load-balancers"*) - echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" - return 0 - ;; - *"describe-target-groups"*) - echo "96" - return 0 - ;; - esac - } - export -f aws - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "0" - assert_contains "$output" "✅ ALB target group capacity validated: 97/98 (current: 96, new: ~1)" -} - -@test "validate_alb_target_group_capacity: fails when target group count is non-numeric" { - aws() { - case "$*" in - *"describe-load-balancers"*) - echo "arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" - return 0 - ;; - *"describe-target-groups"*) - echo "WARNING: something unexpected" - return 0 - ;; - esac - } - export -f aws - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "1" - assert_contains "$output" "❌ Unexpected non-numeric target group count from ALB" - assert_contains "$output" "📋 ALB ARN: arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/alb/abc123" - assert_contains "$output" "📋 Received value: WARNING: something unexpected" - assert_contains "$output" "💡 Possible causes:" - assert_contains "$output" "The AWS CLI returned an unexpected response format" -} - -@test "validate_alb_target_group_capacity: fails when ALB_MAX_TARGET_GROUPS is non-numeric" { - export ALB_MAX_TARGET_GROUPS="abc" - - run bash -c 'source "$SCRIPT"' - - 
assert_equal "$status" "1" - assert_contains "$output" "❌ ALB_MAX_TARGET_GROUPS must be a numeric value, got: 'abc'" - assert_contains "$output" "🔧 How to fix:" - assert_contains "$output" "Set a numeric value in values.yaml or scope-configurations provider" -} - -@test "validate_alb_target_group_capacity: empty ALB ARN response triggers error" { - aws() { - case "$*" in - *"describe-load-balancers"*) - echo "" - return 0 - ;; - esac - } - export -f aws - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "1" - assert_contains "$output" "❌ Load balancer 'k8s-nullplatform-internet-facing' not found in region 'us-east-1'" -} - -# ============================================================================= -# DNS_TYPE guard -# ============================================================================= -@test "validate_alb_target_group_capacity: skips when DNS_TYPE is external_dns" { - export DNS_TYPE="external_dns" - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "0" - [[ "$output" != *"🔍 Validating ALB target group capacity"* ]] -} - -@test "validate_alb_target_group_capacity: skips when DNS_TYPE is azure" { - export DNS_TYPE="azure" - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "0" - [[ "$output" != *"🔍 Validating ALB target group capacity"* ]] -} - -@test "validate_alb_target_group_capacity: skips with debug message for non-route53 DNS" { - export DNS_TYPE="external_dns" - export LOG_LEVEL="debug" - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "0" - assert_contains "$output" "DNS type is 'external_dns', ALB target group validation only applies to route53, skipping" -} - -@test "validate_alb_target_group_capacity: runs when DNS_TYPE is route53" { - export DNS_TYPE="route53" - - run bash -c 'source "$SCRIPT"' - - assert_equal "$status" "0" - assert_contains "$output" "🔍 Validating ALB target group capacity for 'k8s-nullplatform-internet-facing'..." 
-} diff --git a/k8s/deployment/validate_alb_target_group_capacity b/k8s/deployment/validate_alb_target_group_capacity deleted file mode 100755 index 8c3b8dcd..00000000 --- a/k8s/deployment/validate_alb_target_group_capacity +++ /dev/null @@ -1,126 +0,0 @@ -#!/bin/bash - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -source "$SCRIPT_DIR/../utils/get_config_value" - -if [[ "$DNS_TYPE" != "route53" ]]; then - log debug "📋 DNS type is '$DNS_TYPE', ALB target group validation only applies to route53, skipping" - return 0 -fi - -ALB_MAX_TARGET_GROUPS=$(get_config_value \ - --env ALB_MAX_TARGET_GROUPS \ - --provider '.providers["scope-configurations"].networking.alb_max_target_groups' \ - --provider '.providers["container-orchestration"].balancer.alb_max_target_groups' \ - --default "98" -) - -if ! [[ "$ALB_MAX_TARGET_GROUPS" =~ ^[0-9]+$ ]]; then - log error "❌ ALB_MAX_TARGET_GROUPS must be a numeric value, got: '$ALB_MAX_TARGET_GROUPS'" - log error "" - log error "🔧 How to fix:" - log error " • Set a numeric value in values.yaml or scope-configurations provider" - log error "" - exit 1 -fi - -log info "🔍 Validating ALB target group capacity for '$ALB_NAME'..." 
-log debug "📋 ALB: $ALB_NAME | Region: $REGION | Max target groups: $ALB_MAX_TARGET_GROUPS" - -# Get the ALB ARN -ALB_ARN=$(aws elbv2 describe-load-balancers \ - --names "$ALB_NAME" \ - --region "$REGION" \ - --query 'LoadBalancers[0].LoadBalancerArn' \ - --output text \ - --no-paginate 2>&1) || { - log error "❌ Failed to find load balancer '$ALB_NAME' in region '$REGION'" - log error "" - log error "💡 Possible causes:" - log error " The load balancer may not exist or the agent lacks permissions" - log error "" - log error "🔧 How to fix:" - log error " • Verify the ALB exists: aws elbv2 describe-load-balancers --names $ALB_NAME --region $REGION" - log error " • Check IAM permissions for elbv2:DescribeLoadBalancers" - log error "" - exit 1 -} - -if [[ -z "$ALB_ARN" ]] || [[ "$ALB_ARN" == "None" ]]; then - log error "❌ Load balancer '$ALB_NAME' not found in region '$REGION'" - log error "" - log error "💡 Possible causes:" - log error " The load balancer name may be incorrect or it was deleted" - log error "" - log error "🔧 How to fix:" - log error " • List available ALBs: aws elbv2 describe-load-balancers --region $REGION" - log error " • Check the balancer name in values.yaml or scope-configurations provider" - log error "" - exit 1 -fi - -log debug "📋 ALB ARN: $ALB_ARN" - -# Count target groups attached to this ALB -TARGET_GROUP_COUNT=$(aws elbv2 describe-target-groups \ - --load-balancer-arn "$ALB_ARN" \ - --region "$REGION" \ - --query 'length(TargetGroups)' \ - --output text \ - --no-paginate 2>&1) || { - log error "❌ Failed to describe target groups for ALB '$ALB_NAME'" - log error "" - log error "💡 Possible causes:" - log error " The agent may lack permissions to describe target groups" - log error "" - log error "🔧 How to fix:" - log error " • Check IAM permissions for elbv2:DescribeTargetGroups" - log error "" - exit 1 -} - -if ! 
[[ "$TARGET_GROUP_COUNT" =~ ^[0-9]+$ ]]; then - log error "❌ Unexpected non-numeric target group count from ALB" - log error "📋 ALB ARN: $ALB_ARN" - log error "📋 Received value: $TARGET_GROUP_COUNT" - log error "" - log error "💡 Possible causes:" - log error " The AWS CLI returned an unexpected response format" - log error "" - log error "🔧 How to fix:" - log error " • Verify AWS CLI version and credentials are correct" - log error " • Run manually: aws elbv2 describe-target-groups --load-balancer-arn $ALB_ARN --region $REGION --query 'length(TargetGroups)'" - log error "" - exit 1 -fi - -# Estimate target groups this deployment will add -ADDITIONAL_PORT_COUNT=$(echo "$CONTEXT" | jq '.scope.capabilities.additional_ports // [] | length') -DEPLOYMENT_STRATEGY=$(echo "$CONTEXT" | jq -r '.deployment.strategy // "initial"') -if [[ "$DEPLOYMENT_STRATEGY" == "blue_green" ]]; then - # Blue-green creates TGs for both blue and green backends - ESTIMATED_NEW_TGS=$(( (1 + ADDITIONAL_PORT_COUNT) * 2 )) -else - ESTIMATED_NEW_TGS=$(( 1 + ADDITIONAL_PORT_COUNT )) -fi - -PROJECTED_TOTAL=$((TARGET_GROUP_COUNT + ESTIMATED_NEW_TGS)) - -log info "📋 ALB '$ALB_NAME' has $TARGET_GROUP_COUNT target groups, this deployment would add ~$ESTIMATED_NEW_TGS (projected: $PROJECTED_TOTAL, max: $ALB_MAX_TARGET_GROUPS)" - -if [[ "$PROJECTED_TOTAL" -ge "$ALB_MAX_TARGET_GROUPS" ]]; then - log error "❌ ALB '$ALB_NAME' would exceed target group capacity: $TARGET_GROUP_COUNT current + $ESTIMATED_NEW_TGS new = $PROJECTED_TOTAL/$ALB_MAX_TARGET_GROUPS" - log error "" - log error "💡 Possible causes:" - log error " Too many services or deployments are attached to this ALB" - log error "" - log error "🔧 How to fix:" - log error " • Remove unused deployments or services from the ALB" - log error " • Increase ALB_MAX_TARGET_GROUPS in values.yaml or scope-configurations provider (AWS limit is 100)" - log error " • Request an AWS service quota increase for target groups per ALB" - log error " • Consider using a 
separate ALB for additional deployments" - log error "" - exit 1 -fi - -log info "✅ ALB target group capacity validated: $PROJECTED_TOTAL/$ALB_MAX_TARGET_GROUPS (current: $TARGET_GROUP_COUNT, new: ~$ESTIMATED_NEW_TGS)" From a605aa86ac6cbb1a5a85a9c3ecc53959b1269145 Mon Sep 17 00:00:00 2001 From: Pablo Vilas Date: Wed, 15 Apr 2026 16:34:13 -0300 Subject: [PATCH 5/5] docs(alb): add clarifying comments for estimation edge cases - Document that scope-level domain estimation works correctly when .scope.domain is null during scope creation ([null] has jq length 1) - Document that blue-green TG estimation intentionally overcounts (existing blue TGs are already in TARGET_GROUP_COUNT) - Document that ADDITIONAL_PORT_COUNT is re-parsed independently of the rule check which may have been skipped --- k8s/deployment/validate_alb_capacity | 7 ++++++- k8s/scope/validate_alb_capacity | 4 +++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/k8s/deployment/validate_alb_capacity b/k8s/deployment/validate_alb_capacity index 82a22299..40695fb4 100755 --- a/k8s/deployment/validate_alb_capacity +++ b/k8s/deployment/validate_alb_capacity @@ -197,7 +197,12 @@ if ! [[ "$TARGET_GROUP_COUNT" =~ ^[0-9]+$ ]]; then exit 1 fi -# Estimate target groups this deployment will add +# Estimate target groups this deployment will add. +# ADDITIONAL_PORT_COUNT is re-parsed here, independently of the rule check +# above, which may have been skipped if no HTTPS listener was found. +# Note: for blue-green re-deployments, existing blue TGs are already in +# TARGET_GROUP_COUNT, so this intentionally overcounts. Conservative is safer +# than allowing the AWS hard limit (100) to be hit.
ADDITIONAL_PORT_COUNT=$(echo "$CONTEXT" | jq '.scope.capabilities.additional_ports // [] | length') DEPLOYMENT_STRATEGY=$(echo "$CONTEXT" | jq -r '.deployment.strategy // "initial"') if [[ "$DEPLOYMENT_STRATEGY" == "blue_green" ]]; then diff --git a/k8s/scope/validate_alb_capacity b/k8s/scope/validate_alb_capacity index b1551f89..7d218c97 100755 --- a/k8s/scope/validate_alb_capacity +++ b/k8s/scope/validate_alb_capacity @@ -116,7 +116,9 @@ if ! [[ "$TOTAL_RULES" =~ ^[0-9]+$ ]]; then exit 1 fi -# Estimate rules this scope will add +# Estimate rules this scope will add. +# During scope creation, .scope.domain may be null (generated later in the workflow). +# [null] has jq length 1, which is the correct estimate (the scope will get one domain). DOMAIN_COUNT=$(echo "$CONTEXT" | jq '[.scope.domain] + [.scope.domains[]?.name // empty] | length') ADDITIONAL_PORT_COUNT=$(echo "$CONTEXT" | jq '.scope.capabilities.additional_ports // [] | length') # Each domain creates 1 rule on the primary ingress; additional ports create separate ingresses with the same domain rules