From fc8861d96e2239406901e13b28a0e302ac33cae8 Mon Sep 17 00:00:00 2001
From: Kumar Pratyush <kpratyush@splunk.com>
Date: Tue, 16 Jun 2026 09:03:19 +0530
Subject: [PATCH 1/6] fix: update documentation links in installer output

K0S_QUICKSTART.md was removed during doc consolidation. Replace it with
DEPLOYMENT_GUIDE.md and TROUBLESHOOTING.md, which are the current docs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tools/cluster_setup/k0s_cluster_with_stack.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
index 7fb87726..e78f6029 100755
--- a/tools/cluster_setup/k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -5008,9 +5008,9 @@ show_platform_access_info() {
   log "============================================"
   log "📚 Documentation:"
   log "  Setup Guide: ./tools/cluster_setup/K0S_README.md"
-  log "  Setup Guide (Concise version): ./tools/cluster_setup/K0S_QUICKSTART.md"
+  log "  Deployment Guide: ./tools/cluster_setup/DEPLOYMENT_GUIDE.md"
+  log "  Troubleshooting: ./tools/cluster_setup/TROUBLESHOOTING.md"
   log "  Custom Resources: ./docs/CustomResources.md"
-  log "  Troubleshooting: Check operator logs and events above"
   log "============================================"
   log ""
 

From 668001ffeab68c644e657e6c42a57578b2c7cbe1 Mon Sep 17 00:00:00 2001
From: Kumar Pratyush <kpratyush@splunk.com>
Date: Tue, 16 Jun 2026 09:06:58 +0530
Subject: [PATCH 2/6] fix: separate root-cause pods from downstream
 PodInitializing in health summary

The post-install banner was listing PodInitializing workers alongside actual
failures (e.g. ImagePullBackOff on the head pod), making it hard to identify
what to fix. Now root-cause pods and downstream initializing pods are printed
in separate sections with a clear tip to fix root causes first.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tools/cluster_setup/k0s_cluster_with_stack.sh | 106 ++++++++++--------
 1 file changed, 58 insertions(+), 48 deletions(-)

diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
index e78f6029..f04334ae 100755
--- a/tools/cluster_setup/k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -4818,12 +4818,11 @@ _print_unhealthy_pod_summary() {
     }
   fi
 
+  local -a root_cause_lines=() downstream_lines=()
   for line in "${POD_LINES[@]}"; do
     [[ -z "${line}" ]] && continue
     IFS="${_POD_FS}" read -r ns name phase ready reason message owner_kind owner_name waiting terminated restarts created <<<"${line}"
     if ! _pod_is_healthy "${phase}" "${ready}" "${waiting}" "${terminated}" "${reason}"; then
-      # Build a compact "[Phase ready/total, reason]" suffix. We omit empty
-      # reason fields rather than printing literal "[Pending 0/1, ]".
       local suffix="[${phase} ${ready}"
       if [[ -n "${reason}" ]]; then
         suffix+=", ${reason}"
@@ -4833,61 +4832,73 @@ _print_unhealthy_pod_summary() {
         suffix+=", ${terminated}"
       fi
       suffix+="]"
-      unhealthy_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}")
-      total=$((total + 1))
+      # PodInitializing is a transient downstream effect — another pod's failure
+      # is blocking this one. Separate it so the operator focuses on root causes.
+      if [[ "${waiting}" == "PodInitializing" || "${reason}" == "PodInitializing" ]]; then
+        downstream_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}")
+      else
+        root_cause_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}")
+        total=$((total + 1))
+      fi
     fi
   done
 
-  if (( total == 0 )); then
+  local downstream_count=${#downstream_lines[@]}
+  total=$(( total ))  # root-cause count only
+
+  if (( total == 0 && downstream_count == 0 )); then
     log "✅ All pods are healthy at banner time."
     return 0
   fi
 
-  # Bucket by namespace so the banner is easy to skim. We stick to plain
-  # arrays (bash 3.2 has no associative arrays) by collecting unique
-  # namespaces in encounter order and counting occurrences in a parallel
-  # array.
-  local -a ns_keys=() ns_counts=()
-  local i found_idx
-  for line in "${unhealthy_lines[@]}"; do
-    IFS="${_POD_FS}" read -r ns _name _suffix <<<"${line}"
-    found_idx=-1
-    for (( i=0; i < ${#ns_keys[@]}; i++ )); do
-      if [[ "${ns_keys[$i]}" == "${ns}" ]]; then
-        found_idx="$i"
-        break
+  # Helper: print namespace-bucketed pod list from an array
+  _print_pod_section() {
+    local -a lines=("$@")
+    local -a ns_keys=() ns_counts=()
+    local i found_idx line ns _name _suffix pn _pname _psuffix
+    for line in "${lines[@]}"; do
+      IFS="${_POD_FS}" read -r ns _name _suffix <<<"${line}"
+      found_idx=-1
+      for (( i=0; i < ${#ns_keys[@]}; i++ )); do
+        [[ "${ns_keys[$i]}" == "${ns}" ]] && { found_idx=$i; break; }
+      done
+      if (( found_idx == -1 )); then
+        ns_keys+=("${ns}"); ns_counts+=(1)
+      else
+        ns_counts[$found_idx]=$(( ns_counts[found_idx] + 1 ))
       fi
     done
-    if (( found_idx == -1 )); then
-      ns_keys+=("${ns}")
-      ns_counts+=(1)
-    else
-      ns_counts[$found_idx]=$(( ns_counts[found_idx] + 1 ))
-    fi
-  done
-
-  warn "${total} unhealthy pod(s) across ${#ns_keys[@]} namespace(s):"
-  for (( i=0; i < ${#ns_keys[@]}; i++ )); do
-    warn "  • ${ns_keys[$i]} (${ns_counts[$i]}):"
-    local printed=0
-    local max_per_ns=5  # avoid 200-line banners on truly broken clusters
-    local pn _pname _psuffix
-    for line in "${unhealthy_lines[@]}"; do
-      IFS="${_POD_FS}" read -r pn _pname _psuffix <<<"${line}"
-      [[ "${pn}" != "${ns_keys[$i]}" ]] && continue
-      warn "      - ${_pname} ${_psuffix}"
-      printed=$(( printed + 1 ))
-      if (( printed >= max_per_ns )); then
-        local remaining=$(( ns_counts[i] - printed ))
-        if (( remaining > 0 )); then
-          warn "      … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})"
+    for (( i=0; i < ${#ns_keys[@]}; i++ )); do
+      warn "  • ${ns_keys[$i]} (${ns_counts[$i]}):"
+      local printed=0
+      for line in "${lines[@]}"; do
+        IFS="${_POD_FS}" read -r pn _pname _psuffix <<<"${line}"
+        [[ "${pn}" != "${ns_keys[$i]}" ]] && continue
+        warn "      - ${_pname} ${_psuffix}"
+        printed=$(( printed + 1 ))
+        if (( printed >= 5 )); then
+          local remaining=$(( ns_counts[i] - printed ))
+          (( remaining > 0 )) && warn "      … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})"
+          break
         fi
-        break
-      fi
+      done
     done
-  done
-  warn ""
-  warn "Tip: scroll up to see per-pod logs, events, and recommended fixes."
+  }
+
+  if (( total > 0 )); then
+    warn "${total} root-cause pod(s) need attention:"
+    _print_pod_section "${root_cause_lines[@]}"
+    warn ""
+  fi
+
+  if (( downstream_count > 0 )); then
+    warn "${downstream_count} pod(s) are still initializing (waiting on root-cause pod(s) above to become ready):"
+    _print_pod_section "${downstream_lines[@]}"
+    warn ""
+  fi
+
+  warn "Tip: fix the root-cause pod(s) first — initializing pods will recover automatically."
+  warn "     Scroll up to see per-pod logs, events, and recommended fixes."
 }
 
 # ====== SHOW PLATFORM ACCESS INFORMATION ======
@@ -5031,8 +5042,7 @@ show_platform_access_info() {
     warn "    Or re-run just the verifier (no install steps):"
     warn "       CONFIG_FILE=${CONFIG_FILE:-<your-config>} ${0} verify-pods"
   else
-    warn "⚠️  Your AI Platform is NOT ready to use yet: ${verify_rc} pod(s)"
-    warn "    are unhealthy. Summary:"
+    warn "⚠️  Your AI Platform is NOT ready to use yet. Summary:"
     log ""
     _print_unhealthy_pod_summary
     warn "    Re-run the verifier after fixing the issues above:"

From 6992544169643e6aa57a37056d0380920f142c42 Mon Sep 17 00:00:00 2001
From: Kumar Pratyush <kpratyush@splunk.com>
Date: Tue, 16 Jun 2026 09:11:02 +0530
Subject: [PATCH 3/6] fix: improve wording when only PodInitializing pods are
 present
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When there are no root-cause pods (e.g. the head pod has recovered but
workers are still initializing), the "still initializing" header no longer
refers to "root-cause pod(s) above" when nothing is listed above, and the
tip now reads "pods are still starting up — re-run the verifier in a few
minutes" instead of telling the operator to fix root-cause pods first.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tools/cluster_setup/k0s_cluster_with_stack.sh | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
index f04334ae..f0a4bfe9 100755
--- a/tools/cluster_setup/k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -4892,12 +4892,20 @@ _print_unhealthy_pod_summary() {
   fi
 
   if (( downstream_count > 0 )); then
-    warn "${downstream_count} pod(s) are still initializing (waiting on root-cause pod(s) above to become ready):"
+    if (( total > 0 )); then
+      warn "${downstream_count} pod(s) are still initializing (will recover once root-cause pod(s) above are healthy):"
+    else
+      warn "${downstream_count} pod(s) are still initializing:"
+    fi
     _print_pod_section "${downstream_lines[@]}"
     warn ""
   fi
 
-  warn "Tip: fix the root-cause pod(s) first — initializing pods will recover automatically."
+  if (( total > 0 )); then
+    warn "Tip: fix the root-cause pod(s) first — initializing pods will recover automatically."
+  else
+    warn "Tip: pods are still starting up — re-run the verifier in a few minutes."
+  fi
   warn "     Scroll up to see per-pod logs, events, and recommended fixes."
 }
 

From fe51bc70f86356cb98b666214d03d460312441d4 Mon Sep 17 00:00:00 2001
From: Kumar Pratyush <kpratyush@splunk.com>
Date: Tue, 16 Jun 2026 09:14:10 +0530
Subject: [PATCH 4/6] test: add unit test script for k0s_cluster_with_stack.sh
 pure-logic functions

Covers build_image_url, object_store_auth_looks_like_placeholder,
_pod_is_healthy, _classify_pod_failure, and _print_unhealthy_pod_summary
(4 scenarios: mixed root-cause/PodInitializing, only-initializing,
only-root-cause multi-namespace, and truncation). No cluster or network needed.

Run: ./tools/cluster_setup/test_k0s_cluster_with_stack.sh [-v] [filter]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../test_k0s_cluster_with_stack.sh            | 265 ++++++++++++++++++
 1 file changed, 265 insertions(+)
 create mode 100755 tools/cluster_setup/test_k0s_cluster_with_stack.sh

diff --git a/tools/cluster_setup/test_k0s_cluster_with_stack.sh b/tools/cluster_setup/test_k0s_cluster_with_stack.sh
new file mode 100755
index 00000000..1e73efc8
--- /dev/null
+++ b/tools/cluster_setup/test_k0s_cluster_with_stack.sh
@@ -0,0 +1,265 @@
+#!/usr/bin/env bash
+# test_k0s_cluster_with_stack.sh
+# Unit tests for pure-logic functions in k0s_cluster_with_stack.sh.
+# No cluster, SSH, kubectl, or network access required.
+#
+# Usage:
+#   ./test_k0s_cluster_with_stack.sh          # run all tests
+#   ./test_k0s_cluster_with_stack.sh -v        # verbose (show each assertion)
+#   ./test_k0s_cluster_with_stack.sh pod       # run only tests matching "pod"
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT="${SCRIPT_DIR}/k0s_cluster_with_stack.sh"
+
+VERBOSE=0
+FILTER="${1:-}"
+if [[ "${FILTER}" == "-v" ]]; then VERBOSE=1; FILTER="${2:-}"; fi
+
+# ── Test framework ─────────────────────────────────────────────────────────────
+
+PASS=0; FAIL=0; SKIP=0
+_current_suite=""
+
+suite() { _current_suite="$1"; }
+
+assert_eq() {
+  local desc="$1" expected="$2" actual="$3"  # label, expected string, actual string
+  if [[ -n "${FILTER}" && "${_current_suite} ${desc}" != *"${FILTER}"* ]]; then
+    SKIP=$(( SKIP + 1 )); return
+  fi
+  if [[ "${expected}" == "${actual}" ]]; then
+    PASS=$(( PASS + 1 ))
+    if [[ "${VERBOSE}" == "1" ]]; then echo "  ✅ ${desc}"; fi  # not '[[ ]] && echo': that returns 1 when quiet and trips set -e
+  else
+    FAIL=$(( FAIL + 1 ))
+    echo "  ❌ ${desc}"
+    echo "       expected: $(printf '%q' "${expected}")"
+    echo "       actual  : $(printf '%q' "${actual}")"
+  fi
+}
+
+assert_rc() {
+  local desc="$1" expected_rc="$2"  # label, expected exit status; remaining args are the command to run
+  shift 2
+  if [[ -n "${FILTER}" && "${_current_suite} ${desc}" != *"${FILTER}"* ]]; then
+    SKIP=$(( SKIP + 1 )); return
+  fi
+  local actual_rc=0
+  "$@" >/dev/null 2>&1 || actual_rc=$?  # '|| …' records the status without tripping errexit
+  if [[ "${expected_rc}" == "${actual_rc}" ]]; then
+    PASS=$(( PASS + 1 ))
+    if [[ "${VERBOSE}" == "1" ]]; then echo "  ✅ ${desc}"; fi  # not '[[ ]] && echo': that returns 1 when quiet and trips set -e
+  else
+    FAIL=$(( FAIL + 1 ))
+    echo "  ❌ ${desc} (expected rc=${expected_rc}, got rc=${actual_rc})"
+  fi
+}
+
+# Source only the pure-logic functions we want to test.
+# We stub out everything that touches the filesystem, network, or cluster.
+_load_functions() {
+  # Minimal stubs so sourcing individual functions doesn't blow up
+  log()  { :; }
+  warn() { :; }
+  err()  { echo "ERROR: $*" >&2; exit 1; }
+  pf_ok()   { :; }
+  pf_warn() { :; }
+  pf_fail() { :; }
+
+  # Extract and eval each function by line range
+  eval "$(sed -n '569,581p'   "${SCRIPT}")"   # build_image_url
+  eval "$(sed -n '739,745p'   "${SCRIPT}")"   # object_store_auth_looks_like_placeholder
+  eval "$(sed -n '4385,4421p' "${SCRIPT}")"   # _pod_is_healthy
+  eval "$(sed -n '4425,4454p' "${SCRIPT}")"   # _classify_pod_failure
+
+  # _POD_FS and summary helpers
+  eval "$(sed -n '4531p'      "${SCRIPT}")"   # _POD_FS=$'\x1f'
+
+  # _print_unhealthy_pod_summary — find its actual end line dynamically
+  local start end
+  start=$(grep -n '^_print_unhealthy_pod_summary()' "${SCRIPT}" | cut -d: -f1)
+  end=$(awk -v s="${start}" 'NR>s && /^}$/{print NR; exit}' "${SCRIPT}")
+  eval "$(sed -n "${start},${end}p" "${SCRIPT}")"
+}
+
+_load_functions
+
+# ── Tests: build_image_url ─────────────────────────────────────────────────────
+
+suite "build_image_url"
+echo "▶ build_image_url"
+
+assert_eq "prepends registry to bare path" \
+  "my.registry.io/splunk/operator:1.0" \
+  "$(build_image_url "my.registry.io" "splunk/operator:1.0")"
+
+assert_eq "skips registry when image already has a host" \
+  "ghcr.io/splunk/operator:1.0" \
+  "$(build_image_url "my.registry.io" "ghcr.io/splunk/operator:1.0")"
+
+assert_eq "skips registry when image has IP host" \
+  "10.0.0.1:5000/operator:1.0" \
+  "$(build_image_url "my.registry.io" "10.0.0.1:5000/operator:1.0")"
+
+assert_eq "returns bare path when registry is empty" \
+  "splunk/operator:1.0" \
+  "$(build_image_url "" "splunk/operator:1.0")"
+
+assert_eq "returns bare path when registry is null" \
+  "splunk/operator:1.0" \
+  "$(build_image_url "null" "splunk/operator:1.0")"
+
+# ── Tests: object_store_auth_looks_like_placeholder ───────────────────────────
+
+suite "object_store_auth_looks_like_placeholder"
+echo "▶ object_store_auth_looks_like_placeholder"
+
+_run_placeholder_check() {
+  MINIO_ROOT_USER="$1" MINIO_ROOT_PASSWORD="$2" \
+    object_store_auth_looks_like_placeholder
+}
+
+assert_rc "detects <CHANGE_ME> angle bracket placeholder" 0 \
+  bash -c "$(declare -f object_store_auth_looks_like_placeholder); MINIO_ROOT_USER='<user>' MINIO_ROOT_PASSWORD='secret' object_store_auth_looks_like_placeholder"
+
+assert_rc "detects CHANGEME keyword in password" 0 \
+  bash -c "$(declare -f object_store_auth_looks_like_placeholder); MINIO_ROOT_USER='admin' MINIO_ROOT_PASSWORD='CHANGEME' object_store_auth_looks_like_placeholder"
+
+assert_rc "detects changeme (lowercase)" 0 \
+  bash -c "$(declare -f object_store_auth_looks_like_placeholder); MINIO_ROOT_USER='admin' MINIO_ROOT_PASSWORD='changeme' object_store_auth_looks_like_placeholder"
+
+assert_rc "accepts real credentials (returns 1)" 1 \
+  bash -c "$(declare -f object_store_auth_looks_like_placeholder); MINIO_ROOT_USER='admin' MINIO_ROOT_PASSWORD='s3cr3t!' object_store_auth_looks_like_placeholder"
+
+# ── Tests: _pod_is_healthy ─────────────────────────────────────────────────────
+
+suite "_pod_is_healthy"
+echo "▶ _pod_is_healthy"
+
+# args: phase ready waiting terminated reason
+assert_rc "Running 2/2 is healthy"              0 _pod_is_healthy Running  "2/2" ""                ""      ""
+assert_rc "Running 1/2 is unhealthy"            1 _pod_is_healthy Running  "1/2" ""                ""      ""
+assert_rc "Succeeded is healthy"                0 _pod_is_healthy Succeeded "" ""                 ""      ""
+assert_rc "Pending is unhealthy"                1 _pod_is_healthy Pending  "0/1" ""                ""      ""
+assert_rc "Failed is unhealthy"                 1 _pod_is_healthy Failed   "0/1" ""                ""      ""
+assert_rc "Unknown is unhealthy"                1 _pod_is_healthy Unknown  "0/1" ""                ""      ""
+assert_rc "CrashLoopBackOff is unhealthy"       1 _pod_is_healthy Running  "0/1" "CrashLoopBackOff" ""     ""
+assert_rc "ImagePullBackOff is unhealthy"       1 _pod_is_healthy Running  "0/1" "ImagePullBackOff" ""     ""
+assert_rc "ErrImagePull is unhealthy"           1 _pod_is_healthy Running  "0/1" "ErrImagePull"    ""      ""
+assert_rc "OOMKilled terminated is unhealthy"   1 _pod_is_healthy Running  "1/1" ""                "OOMKilled" ""
+assert_rc "Error terminated is unhealthy"       1 _pod_is_healthy Running  "1/1" ""                "Error" ""
+assert_rc "NodeLost reason is unhealthy"        1 _pod_is_healthy Running  "1/1" ""                ""      "NodeLost"
+assert_rc "Evicted reason is unhealthy"         1 _pod_is_healthy Running  "1/1" ""                ""      "Evicted"
+assert_rc "PodInitializing waiting is unhealthy" 1 _pod_is_healthy Pending "0/1" "PodInitializing"  ""     ""
+
+# ── Tests: _classify_pod_failure ──────────────────────────────────────────────
+
+suite "_classify_pod_failure"
+echo "▶ _classify_pod_failure"
+
+# args: phase reason waiting terminated message
+assert_eq "ImagePullBackOff → image-pull" \
+  "image-pull" "$(  _classify_pod_failure Pending ""       "ImagePullBackOff" ""         "")"
+assert_eq "ErrImagePull → image-pull" \
+  "image-pull" "$(  _classify_pod_failure Running ""       "ErrImagePull"     ""         "")"
+assert_eq "CrashLoopBackOff → crashloop" \
+  "crashloop"  "$(  _classify_pod_failure Running ""       "CrashLoopBackOff" ""         "")"
+assert_eq "OOMKilled → oom" \
+  "oom"        "$(  _classify_pod_failure Running ""       ""                 "OOMKilled" "")"
+assert_eq "Evicted → evicted" \
+  "evicted"    "$(  _classify_pod_failure Running "Evicted" ""                ""         "")"
+assert_eq "Pending with no signal → pending-long" \
+  "pending-long" "$(_classify_pod_failure Pending ""       ""                 ""         "")"
+assert_eq "Failed with no signal → failed" \
+  "failed"     "$( _classify_pod_failure Failed  ""       ""                 ""         "")"
+
+# ── Tests: _print_unhealthy_pod_summary ───────────────────────────────────────
+
+suite "_print_unhealthy_pod_summary pod"
+echo "▶ _print_unhealthy_pod_summary"
+
+# Override warn/log to capture output
+_captured=""
+warn() { _captured+="WARN: $*"$'\n'; }
+log()  { _captured+="LOG: $*"$'\n'; }
+
+mk_pod_line() {
+  # ns name phase ready waiting
+  local ns=$1 name=$2 phase=$3 ready=$4 waiting=$5
+  printf "%s" "${ns}${_POD_FS}${name}${_POD_FS}${phase}${_POD_FS}${ready}${_POD_FS}${_POD_FS}${_POD_FS}RS${_POD_FS}owner${_POD_FS}${waiting}${_POD_FS}${_POD_FS}0${_POD_FS}2026-06-16"
+}
+
+# Scenario 1: mixed ImagePullBackOff (root cause) + PodInitializing (downstream)
+_captured=""
+declare -a POD_LINES=(
+  "$(mk_pod_line ai-platform head-pod      Pending "0/2" "ImagePullBackOff")"
+  "$(mk_pod_line ai-platform gpu-worker-1  Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-worker-2  Pending "0/1" "PodInitializing")"
+)
+_print_unhealthy_pod_summary
+assert_eq "scenario 1: shows root-cause section header" \
+  "1" "$(echo "${_captured}" | grep -c "^WARN: 1 root-cause pod")"
+assert_eq "scenario 1: shows downstream section" \
+  "1" "$(echo "${_captured}" | grep -c "still initializing")"
+assert_eq "scenario 1: head pod appears in root-cause output" \
+  "1" "$(echo "${_captured}" | grep -c "head-pod")"
+assert_eq "scenario 1: tip mentions fixing root cause" \
+  "1" "$(echo "${_captured}" | grep -c "fix the root-cause")"
+assert_eq "scenario 1: downstream count is 2" \
+  "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing")"
+
+# Scenario 2: only PodInitializing — no root cause
+_captured=""
+POD_LINES=(
+  "$(mk_pod_line ai-platform gpu-worker-1 Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-worker-2 Pending "0/1" "PodInitializing")"
+)
+_print_unhealthy_pod_summary
+assert_eq "scenario 2: no root-cause section shown" \
+  "0" "$(echo "${_captured}" | grep -c "root-cause pod")"
+assert_eq "scenario 2: shows 2 still initializing" \
+  "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing")"
+assert_eq "scenario 2: tip says re-run verifier" \
+  "1" "$(echo "${_captured}" | grep -c "re-run the verifier")"
+
+# Scenario 3: only root causes, no PodInitializing — multi-namespace
+_captured=""
+POD_LINES=(
+  "$(mk_pod_line ai-platform  head-pod     Pending "0/2" "ImagePullBackOff")"
+  "$(mk_pod_line kube-system  calico-node  Pending "0/1" "CrashLoopBackOff")"
+)
+_print_unhealthy_pod_summary
+assert_eq "scenario 3: 2 root-cause pods" \
+  "1" "$(echo "${_captured}" | grep -c "^WARN: 2 root-cause pod")"
+assert_eq "scenario 3: no downstream section" \
+  "0" "$(echo "${_captured}" | grep -c "still initializing")"
+assert_eq "scenario 3: both namespaces shown" \
+  "1" "$(echo "${_captured}" | grep -c "ai-platform")"
+assert_eq "scenario 3: kube-system also shown" \
+  "1" "$(echo "${_captured}" | grep -c "kube-system")"
+
+# Scenario 4: max_per_ns truncation (>5 pods in one namespace)
+_captured=""
+POD_LINES=(
+  "$(mk_pod_line ai-platform head-pod     Pending "0/2" "ImagePullBackOff")"
+  "$(mk_pod_line ai-platform gpu-w-1      Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-w-2      Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-w-3      Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-w-4      Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-w-5      Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-w-6      Pending "0/1" "PodInitializing")"
+)
+_print_unhealthy_pod_summary
+assert_eq "scenario 4: truncation ellipsis shown for downstream" \
+  "1" "$(echo "${_captured}" | grep -c "… and")"
+
+# ── Summary ───────────────────────────────────────────────────────────────────
+
+echo ""
+echo "Results: ${PASS} passed, ${FAIL} failed, ${SKIP} skipped"
+echo ""
+if (( FAIL > 0 )); then
+  exit 1
+fi

From 6b644234cc05a7e5fdea87e807ff5fbe687216bc Mon Sep 17 00:00:00 2001
From: Kumar Pratyush <kpratyush@splunk.com>
Date: Tue, 16 Jun 2026 09:40:52 +0530
Subject: [PATCH 5/6] fix: address PR #110 review comments on pod summary and
 test script

k0s_cluster_with_stack.sh:
- Remove no-op total=$(( total )) assignment
- Restore max_per_ns named variable in _print_pod_section
- Move _print_pod_section to a top-level helper (was re-defined as a
  global on every call to _print_unhealthy_pod_summary)

test_k0s_cluster_with_stack.sh:
- Drop set -e (grep -c returns exit 1 on zero matches, causing early exit)
- Add || true to all grep -c expressions to make them non-fatal
- Extract functions by name via _extract_fn() instead of brittle hard-coded
  line ranges; robust to unrelated edits shifting line numbers

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tools/cluster_setup/k0s_cluster_with_stack.sh |  72 ++++-----
 .../test_k0s_cluster_with_stack.sh            | 137 +++++++++---------
 2 files changed, 107 insertions(+), 102 deletions(-)

diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
index f0a4bfe9..4d3df33e 100755
--- a/tools/cluster_setup/k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -4805,6 +4805,43 @@ _collect_pod_summary() {
 # We deliberately do NOT re-run kubectl by default: the diagnostics above
 # the banner already exhausted the freshest information; re-querying here
 # would add latency and risk a different snapshot, confusing the operator.
+
+# Helper: print a namespace-bucketed pod list from a delimited-line array.
+# Called by _print_unhealthy_pod_summary for both root-cause and downstream sections.
+_print_pod_section() {
+  local -a lines=("$@")
+  local -a ns_keys=() ns_counts=()
+  local i found_idx line ns _name _suffix pn _pname _psuffix
+  for line in "${lines[@]}"; do
+    IFS="${_POD_FS}" read -r ns _name _suffix <<<"${line}"
+    found_idx=-1
+    for (( i=0; i < ${#ns_keys[@]}; i++ )); do
+      [[ "${ns_keys[$i]}" == "${ns}" ]] && { found_idx=$i; break; }
+    done
+    if (( found_idx == -1 )); then
+      ns_keys+=("${ns}"); ns_counts+=(1)
+    else
+      ns_counts[$found_idx]=$(( ns_counts[found_idx] + 1 ))
+    fi
+  done
+  for (( i=0; i < ${#ns_keys[@]}; i++ )); do
+    warn "  • ${ns_keys[$i]} (${ns_counts[$i]}):"
+    local printed=0
+    local max_per_ns=5  # avoid 200-line banners on truly broken clusters
+    for line in "${lines[@]}"; do
+      IFS="${_POD_FS}" read -r pn _pname _psuffix <<<"${line}"
+      [[ "${pn}" != "${ns_keys[$i]}" ]] && continue
+      warn "      - ${_pname} ${_psuffix}"
+      printed=$(( printed + 1 ))
+      if (( printed >= max_per_ns )); then
+        local remaining=$(( ns_counts[i] - printed ))
+        (( remaining > 0 )) && warn "      … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})"
+        break
+      fi
+    done
+  done
+}
+
 _print_unhealthy_pod_summary() {
   local total=0
   local line ns name phase ready reason message owner_kind owner_name waiting terminated restarts created
@@ -4844,47 +4881,12 @@ _print_unhealthy_pod_summary() {
   done
 
   local downstream_count=${#downstream_lines[@]}
-  total=$(( total ))  # root-cause count only
 
   if (( total == 0 && downstream_count == 0 )); then
     log "✅ All pods are healthy at banner time."
     return 0
   fi
 
-  # Helper: print namespace-bucketed pod list from an array
-  _print_pod_section() {
-    local -a lines=("$@")
-    local -a ns_keys=() ns_counts=()
-    local i found_idx line ns _name _suffix pn _pname _psuffix
-    for line in "${lines[@]}"; do
-      IFS="${_POD_FS}" read -r ns _name _suffix <<<"${line}"
-      found_idx=-1
-      for (( i=0; i < ${#ns_keys[@]}; i++ )); do
-        [[ "${ns_keys[$i]}" == "${ns}" ]] && { found_idx=$i; break; }
-      done
-      if (( found_idx == -1 )); then
-        ns_keys+=("${ns}"); ns_counts+=(1)
-      else
-        ns_counts[$found_idx]=$(( ns_counts[found_idx] + 1 ))
-      fi
-    done
-    for (( i=0; i < ${#ns_keys[@]}; i++ )); do
-      warn "  • ${ns_keys[$i]} (${ns_counts[$i]}):"
-      local printed=0
-      for line in "${lines[@]}"; do
-        IFS="${_POD_FS}" read -r pn _pname _psuffix <<<"${line}"
-        [[ "${pn}" != "${ns_keys[$i]}" ]] && continue
-        warn "      - ${_pname} ${_psuffix}"
-        printed=$(( printed + 1 ))
-        if (( printed >= 5 )); then
-          local remaining=$(( ns_counts[i] - printed ))
-          (( remaining > 0 )) && warn "      … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})"
-          break
-        fi
-      done
-    done
-  }
-
   if (( total > 0 )); then
     warn "${total} root-cause pod(s) need attention:"
     _print_pod_section "${root_cause_lines[@]}"
diff --git a/tools/cluster_setup/test_k0s_cluster_with_stack.sh b/tools/cluster_setup/test_k0s_cluster_with_stack.sh
index 1e73efc8..6cbeaaa1 100755
--- a/tools/cluster_setup/test_k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/test_k0s_cluster_with_stack.sh
@@ -8,7 +8,9 @@
 #   ./test_k0s_cluster_with_stack.sh -v        # verbose (show each assertion)
 #   ./test_k0s_cluster_with_stack.sh pod       # run only tests matching "pod"
 
-set -euo pipefail
+# Intentionally no set -e: grep -c returns exit code 1 on zero matches, which
+# would cause the harness to exit early on legitimate "0 occurrences" assertions.
+set -uo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 SCRIPT="${SCRIPT_DIR}/k0s_cluster_with_stack.sh"
@@ -57,10 +59,23 @@ assert_rc() {
   fi
 }
 
-# Source only the pure-logic functions we want to test.
-# We stub out everything that touches the filesystem, network, or cluster.
+# ── Function loader ────────────────────────────────────────────────────────────
+# Extract a named bash function (and its closing brace) from the script by
+# name — robust to line-number shifts caused by unrelated edits.
+
+_extract_fn() {
+  local name="$1" start end=""
+  # -m1: use only the first definition so start is always a single line number.
+  start=$(grep -n -m1 "^${name}()" "${SCRIPT}" | cut -d: -f1)
+  [[ -n "${start}" ]] && end=$(awk -v s="${start}" 'NR>s && /^}$/{print NR; exit}' "${SCRIPT}")
+  if [[ -z "${start}" || -z "${end}" ]]; then  # no header found, or no column-0 closing brace after it
+    echo "ERROR: function '${name}' not found or unterminated in ${SCRIPT}" >&2
+    return 1
+  fi
+  sed -n "${start},${end}p" "${SCRIPT}"
+}
+
 _load_functions() {
-  # Minimal stubs so sourcing individual functions doesn't blow up
   log()  { :; }
   warn() { :; }
   err()  { echo "ERROR: $*" >&2; exit 1; }
@@ -68,20 +83,15 @@ _load_functions() {
   pf_warn() { :; }
   pf_fail() { :; }
 
-  # Extract and eval each function by line range
-  eval "$(sed -n '569,581p'   "${SCRIPT}")"   # build_image_url
-  eval "$(sed -n '739,745p'   "${SCRIPT}")"   # object_store_auth_looks_like_placeholder
-  eval "$(sed -n '4385,4421p' "${SCRIPT}")"   # _pod_is_healthy
-  eval "$(sed -n '4425,4454p' "${SCRIPT}")"   # _classify_pod_failure
+  # Extract _POD_FS assignment (single line, not a function)
+  eval "$(grep '^_POD_FS=' "${SCRIPT}")"
 
-  # _POD_FS and summary helpers
-  eval "$(sed -n '4531p'      "${SCRIPT}")"   # _POD_FS=$'\x1f'
-
-  # _print_unhealthy_pod_summary — find its actual end line dynamically
-  local start end
-  start=$(grep -n '^_print_unhealthy_pod_summary()' "${SCRIPT}" | cut -d: -f1)
-  end=$(awk -v s="${start}" 'NR>s && /^}$/{print NR; exit}' "${SCRIPT}")
-  eval "$(sed -n "${start},${end}p" "${SCRIPT}")"
+  eval "$(_extract_fn build_image_url)"
+  eval "$(_extract_fn object_store_auth_looks_like_placeholder)"
+  eval "$(_extract_fn _pod_is_healthy)"
+  eval "$(_extract_fn _classify_pod_failure)"
+  eval "$(_extract_fn _print_pod_section)"
+  eval "$(_extract_fn _print_unhealthy_pod_summary)"
 }
 
 _load_functions
@@ -116,11 +126,6 @@ assert_eq "returns bare path when registry is null" \
 suite "object_store_auth_looks_like_placeholder"
 echo "▶ object_store_auth_looks_like_placeholder"
 
-_run_placeholder_check() {
-  MINIO_ROOT_USER="$1" MINIO_ROOT_PASSWORD="$2" \
-    object_store_auth_looks_like_placeholder
-}
-
 assert_rc "detects <CHANGE_ME> angle bracket placeholder" 0 \
   bash -c "$(declare -f object_store_auth_looks_like_placeholder); MINIO_ROOT_USER='<user>' MINIO_ROOT_PASSWORD='secret' object_store_auth_looks_like_placeholder"
 
@@ -139,20 +144,20 @@ suite "_pod_is_healthy"
 echo "▶ _pod_is_healthy"
 
 # args: phase ready waiting terminated reason
-assert_rc "Running 2/2 is healthy"              0 _pod_is_healthy Running  "2/2" ""                ""      ""
-assert_rc "Running 1/2 is unhealthy"            1 _pod_is_healthy Running  "1/2" ""                ""      ""
-assert_rc "Succeeded is healthy"                0 _pod_is_healthy Succeeded "" ""                 ""      ""
-assert_rc "Pending is unhealthy"                1 _pod_is_healthy Pending  "0/1" ""                ""      ""
-assert_rc "Failed is unhealthy"                 1 _pod_is_healthy Failed   "0/1" ""                ""      ""
-assert_rc "Unknown is unhealthy"                1 _pod_is_healthy Unknown  "0/1" ""                ""      ""
-assert_rc "CrashLoopBackOff is unhealthy"       1 _pod_is_healthy Running  "0/1" "CrashLoopBackOff" ""     ""
-assert_rc "ImagePullBackOff is unhealthy"       1 _pod_is_healthy Running  "0/1" "ImagePullBackOff" ""     ""
-assert_rc "ErrImagePull is unhealthy"           1 _pod_is_healthy Running  "0/1" "ErrImagePull"    ""      ""
-assert_rc "OOMKilled terminated is unhealthy"   1 _pod_is_healthy Running  "1/1" ""                "OOMKilled" ""
-assert_rc "Error terminated is unhealthy"       1 _pod_is_healthy Running  "1/1" ""                "Error" ""
-assert_rc "NodeLost reason is unhealthy"        1 _pod_is_healthy Running  "1/1" ""                ""      "NodeLost"
-assert_rc "Evicted reason is unhealthy"         1 _pod_is_healthy Running  "1/1" ""                ""      "Evicted"
-assert_rc "PodInitializing waiting is unhealthy" 1 _pod_is_healthy Pending "0/1" "PodInitializing"  ""     ""
+assert_rc "Running 2/2 is healthy"               0 _pod_is_healthy Running   "2/2" ""                 ""          ""
+assert_rc "Running 1/2 is unhealthy"             1 _pod_is_healthy Running   "1/2" ""                 ""          ""
+assert_rc "Succeeded is healthy"                 0 _pod_is_healthy Succeeded ""    ""                 ""          ""
+assert_rc "Pending is unhealthy"                 1 _pod_is_healthy Pending   "0/1" ""                 ""          ""
+assert_rc "Failed is unhealthy"                  1 _pod_is_healthy Failed    "0/1" ""                 ""          ""
+assert_rc "Unknown is unhealthy"                 1 _pod_is_healthy Unknown   "0/1" ""                 ""          ""
+assert_rc "CrashLoopBackOff is unhealthy"        1 _pod_is_healthy Running   "0/1" "CrashLoopBackOff" ""          ""
+assert_rc "ImagePullBackOff is unhealthy"        1 _pod_is_healthy Running   "0/1" "ImagePullBackOff" ""          ""
+assert_rc "ErrImagePull is unhealthy"            1 _pod_is_healthy Running   "0/1" "ErrImagePull"     ""          ""
+assert_rc "OOMKilled terminated is unhealthy"    1 _pod_is_healthy Running   "1/1" ""                 "OOMKilled" ""
+assert_rc "Error terminated is unhealthy"        1 _pod_is_healthy Running   "1/1" ""                 "Error"     ""
+assert_rc "NodeLost reason is unhealthy"         1 _pod_is_healthy Running   "1/1" ""                 ""          "NodeLost"
+assert_rc "Evicted reason is unhealthy"          1 _pod_is_healthy Running   "1/1" ""                 ""          "Evicted"
+assert_rc "PodInitializing waiting is unhealthy" 1 _pod_is_healthy Pending   "0/1" "PodInitializing"  ""          ""
 
 # ── Tests: _classify_pod_failure ──────────────────────────────────────────────
 
@@ -161,32 +166,30 @@ echo "▶ _classify_pod_failure"
 
 # args: phase reason waiting terminated message
 assert_eq "ImagePullBackOff → image-pull" \
-  "image-pull" "$(  _classify_pod_failure Pending ""       "ImagePullBackOff" ""         "")"
+  "image-pull"   "$(_classify_pod_failure Pending ""        "ImagePullBackOff" ""          "")"
 assert_eq "ErrImagePull → image-pull" \
-  "image-pull" "$(  _classify_pod_failure Running ""       "ErrImagePull"     ""         "")"
+  "image-pull"   "$(_classify_pod_failure Running ""        "ErrImagePull"     ""          "")"
 assert_eq "CrashLoopBackOff → crashloop" \
-  "crashloop"  "$(  _classify_pod_failure Running ""       "CrashLoopBackOff" ""         "")"
+  "crashloop"    "$(_classify_pod_failure Running ""        "CrashLoopBackOff" ""          "")"
 assert_eq "OOMKilled → oom" \
-  "oom"        "$(  _classify_pod_failure Running ""       ""                 "OOMKilled" "")"
+  "oom"          "$(_classify_pod_failure Running ""        ""                 "OOMKilled" "")"
 assert_eq "Evicted → evicted" \
-  "evicted"    "$(  _classify_pod_failure Running "Evicted" ""                ""         "")"
+  "evicted"      "$(_classify_pod_failure Running "Evicted" ""                ""           "")"
 assert_eq "Pending with no signal → pending-long" \
-  "pending-long" "$(_classify_pod_failure Pending ""       ""                 ""         "")"
+  "pending-long" "$(_classify_pod_failure Pending ""        ""                 ""          "")"
 assert_eq "Failed with no signal → failed" \
-  "failed"     "$( _classify_pod_failure Failed  ""       ""                 ""         "")"
+  "failed"       "$(_classify_pod_failure Failed  ""        ""                 ""          "")"
 
 # ── Tests: _print_unhealthy_pod_summary ───────────────────────────────────────
 
 suite "_print_unhealthy_pod_summary pod"
 echo "▶ _print_unhealthy_pod_summary"
 
-# Override warn/log to capture output
 _captured=""
 warn() { _captured+="WARN: $*"$'\n'; }
 log()  { _captured+="LOG: $*"$'\n'; }
 
 mk_pod_line() {
-  # ns name phase ready waiting
   local ns=$1 name=$2 phase=$3 ready=$4 waiting=$5
   printf "%s" "${ns}${_POD_FS}${name}${_POD_FS}${phase}${_POD_FS}${ready}${_POD_FS}${_POD_FS}${_POD_FS}RS${_POD_FS}owner${_POD_FS}${waiting}${_POD_FS}${_POD_FS}0${_POD_FS}2026-06-16"
 }
@@ -200,15 +203,15 @@ declare -a POD_LINES=(
 )
 _print_unhealthy_pod_summary
 assert_eq "scenario 1: shows root-cause section header" \
-  "1" "$(echo "${_captured}" | grep -c "^WARN: 1 root-cause pod")"
+  "1" "$(echo "${_captured}" | grep -c "^WARN: 1 root-cause pod" || true)"
 assert_eq "scenario 1: shows downstream section" \
-  "1" "$(echo "${_captured}" | grep -c "still initializing")"
+  "1" "$(echo "${_captured}" | grep -c "still initializing" || true)"
 assert_eq "scenario 1: head pod appears in root-cause output" \
-  "1" "$(echo "${_captured}" | grep -c "head-pod")"
+  "1" "$(echo "${_captured}" | grep -c "head-pod" || true)"
 assert_eq "scenario 1: tip mentions fixing root cause" \
-  "1" "$(echo "${_captured}" | grep -c "fix the root-cause")"
+  "1" "$(echo "${_captured}" | grep -c "fix the root-cause" || true)"
 assert_eq "scenario 1: downstream count is 2" \
-  "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing")"
+  "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing" || true)"
 
 # Scenario 2: only PodInitializing — no root cause
 _captured=""
@@ -218,11 +221,11 @@ POD_LINES=(
 )
 _print_unhealthy_pod_summary
 assert_eq "scenario 2: no root-cause section shown" \
-  "0" "$(echo "${_captured}" | grep -c "root-cause pod")"
+  "0" "$(echo "${_captured}" | grep -c "root-cause pod" || true)"
 assert_eq "scenario 2: shows 2 still initializing" \
-  "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing")"
+  "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing" || true)"
 assert_eq "scenario 2: tip says re-run verifier" \
-  "1" "$(echo "${_captured}" | grep -c "re-run the verifier")"
+  "1" "$(echo "${_captured}" | grep -c "re-run the verifier" || true)"
 
 # Scenario 3: only root causes, no PodInitializing — multi-namespace
 _captured=""
@@ -232,28 +235,28 @@ POD_LINES=(
 )
 _print_unhealthy_pod_summary
 assert_eq "scenario 3: 2 root-cause pods" \
-  "1" "$(echo "${_captured}" | grep -c "^WARN: 2 root-cause pod")"
+  "1" "$(echo "${_captured}" | grep -c "^WARN: 2 root-cause pod" || true)"
 assert_eq "scenario 3: no downstream section" \
-  "0" "$(echo "${_captured}" | grep -c "still initializing")"
-assert_eq "scenario 3: both namespaces shown" \
-  "1" "$(echo "${_captured}" | grep -c "ai-platform")"
-assert_eq "scenario 3: kube-system also shown" \
-  "1" "$(echo "${_captured}" | grep -c "kube-system")"
+  "0" "$(echo "${_captured}" | grep -c "still initializing" || true)"
+assert_eq "scenario 3: ai-platform namespace shown" \
+  "1" "$(echo "${_captured}" | grep -c "ai-platform" || true)"
+assert_eq "scenario 3: kube-system namespace shown" \
+  "1" "$(echo "${_captured}" | grep -c "kube-system" || true)"
 
 # Scenario 4: max_per_ns truncation (>5 pods in one namespace)
 _captured=""
 POD_LINES=(
-  "$(mk_pod_line ai-platform head-pod     Pending "0/2" "ImagePullBackOff")"
-  "$(mk_pod_line ai-platform gpu-w-1      Pending "0/1" "PodInitializing")"
-  "$(mk_pod_line ai-platform gpu-w-2      Pending "0/1" "PodInitializing")"
-  "$(mk_pod_line ai-platform gpu-w-3      Pending "0/1" "PodInitializing")"
-  "$(mk_pod_line ai-platform gpu-w-4      Pending "0/1" "PodInitializing")"
-  "$(mk_pod_line ai-platform gpu-w-5      Pending "0/1" "PodInitializing")"
-  "$(mk_pod_line ai-platform gpu-w-6      Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform head-pod  Pending "0/2" "ImagePullBackOff")"
+  "$(mk_pod_line ai-platform gpu-w-1   Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-w-2   Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-w-3   Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-w-4   Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-w-5   Pending "0/1" "PodInitializing")"
+  "$(mk_pod_line ai-platform gpu-w-6   Pending "0/1" "PodInitializing")"
 )
 _print_unhealthy_pod_summary
 assert_eq "scenario 4: truncation ellipsis shown for downstream" \
-  "1" "$(echo "${_captured}" | grep -c "… and")"
+  "1" "$(echo "${_captured}" | grep -c "… and" || true)"
 
 # ── Summary ───────────────────────────────────────────────────────────────────
 

From 5d97865290b2f29a7e94a4a2a4d73b61e93be621 Mon Sep 17 00:00:00 2001
From: Kumar Pratyush <kpratyush@splunk.com>
Date: Tue, 16 Jun 2026 09:51:07 +0530
Subject: [PATCH 6/6] fix: update banner comment to match new output format;
 remove unused local

- Output format comment now shows the root-cause / still-initializing
  section headers instead of the old single-section format
- Remove unused unhealthy_lines local (superseded by root_cause_lines
  and downstream_lines)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tools/cluster_setup/k0s_cluster_with_stack.sh | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
index 4d3df33e..29c72bb9 100755
--- a/tools/cluster_setup/k0s_cluster_with_stack.sh
+++ b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -4796,11 +4796,15 @@ _collect_pod_summary() {
 # (e.g. verify ran a long time ago, or kubectl was unavailable) we fall back
 # to a single-line `kubectl get pods -A` query so the banner is never silent.
 #
-# Output format:
-#   ⚠️  3 unhealthy pod(s) across 2 namespace(s):
-#         ai-platform        (1) airgap-cluster-l40s-…-l-worker-w957f [Running 1/2]
-#         kube-system        (2) calico-node-f5qk7 [Pending 0/1, BackOff]
-#                                konnectivity-agent-nkgrs [Pending 0/1]
+# Output format (illustrative; root-cause pods listed first, transient PodInitializing second):
+#   1 root-cause pod(s) need attention:
+#     • ai-platform (1):
+#         - head-pod [Pending 0/2, ImagePullBackOff]
+#
+#   5 pod(s) are still initializing (will recover once root-cause pod(s) above are healthy):
+#     • ai-platform (5):
+#         - gpu-worker-1 [Pending 0/1, PodInitializing]
+#         … and 4 more in ai-platform (run: kubectl get pods -n ai-platform)
 #
 # We deliberately do NOT re-run kubectl by default: the diagnostics above
 # the banner already exhausted the freshest information; re-querying here
@@ -4845,7 +4849,6 @@ _print_pod_section() {
 _print_unhealthy_pod_summary() {
   local total=0
   local line ns name phase ready reason message owner_kind owner_name waiting terminated restarts created
-  local -a unhealthy_lines=()
 
   # Use cached POD_LINES if available; otherwise refresh once.
   if (( ${#POD_LINES[@]} == 0 )); then