splunk · kupratyu-splunk · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh
@@ -4796,19 +4796,59 @@ _collect_pod_summary() {
 # (e.g. verify ran a long time ago, or kubectl was unavailable) we fall back
 # to a single-line `kubectl get pods -A` query so the banner is never silent.
 #
-# Output format:
-#   ⚠️  3 unhealthy pod(s) across 2 namespace(s):
-#         ai-platform        (1) airgap-cluster-l40s-…-l-worker-w957f [Running 1/2]
-#         kube-system        (2) calico-node-f5qk7 [Pending 0/1, BackOff]
-#                                konnectivity-agent-nkgrs [Pending 0/1]
+# Output format (root-cause pods listed first, transient PodInitializing second):
+#   1 root-cause pod(s) need attention:
+#     • ai-platform (1):
+#         - head-pod [Pending 0/2, ImagePullBackOff]
+#
+#   9 pod(s) are still initializing (will recover once root-cause pod(s) above are healthy):
+#     • ai-platform (9):
+#         - gpu-worker-1 [Pending 0/1, PodInitializing]   (example abbreviated: up to 5 pods are listed per namespace)
+#         … and 4 more in ai-platform (run: kubectl get pods -n ai-platform)
 #
 # We deliberately do NOT re-run kubectl by default: the diagnostics above
 # the banner already exhausted the freshest information; re-querying here
 # would add latency and risk a different snapshot, confusing the operator.
+
+# Helper: print a namespace-bucketed pod list from a delimited-line array.
+# Called by _print_unhealthy_pod_summary for both root-cause and downstream sections.
+_print_pod_section() {
+  # Args: "$@" - records shaped "<namespace><_POD_FS><pod name><_POD_FS><suffix>". Emits via warn,
+  # per namespace in first-seen order, a "• ns (count):" header and up to max_per_ns of its pods.
+  (( $# > 0 )) || return 0  # no records: print nothing (empty-array expansion is fatal under set -u before bash 4.4)
+  local -a lines=("$@") ns_keys=() ns_counts=()
+  local -r max_per_ns=5  # avoid 200-line banners on truly broken clusters
+  local i slot line ns pod_name pod_suffix printed remaining
+  # Pass 1: tally pods per namespace in parallel arrays (keeps first-seen order).
+  for line in "${lines[@]}"; do
+    IFS="${_POD_FS}" read -r ns _ <<<"${line}"
+    slot=${#ns_keys[@]}  # next free index, unless the namespace is already known
+    for (( i=0; i < ${#ns_keys[@]}; i++ )); do
+      [[ "${ns_keys[$i]}" == "${ns}" ]] && { slot=$i; break; }
+    done
+    ns_keys[slot]=${ns}
+    ns_counts[slot]=$(( ${ns_counts[slot]:-0} + 1 ))
+  done
+  # Pass 2: list each namespace's pods, summarizing whatever exceeds the cap.
+  for (( i=0; i < ${#ns_keys[@]}; i++ )); do
+    warn "  • ${ns_keys[$i]} (${ns_counts[$i]}):"
+    printed=0
+    for line in "${lines[@]}"; do
+      IFS="${_POD_FS}" read -r ns pod_name pod_suffix <<<"${line}"
+      [[ "${ns}" != "${ns_keys[$i]}" ]] && continue
+      warn "      - ${pod_name} ${pod_suffix}"
+      printed=$(( printed + 1 ))
+      (( printed < max_per_ns )) && continue
+      remaining=$(( ns_counts[i] - printed ))
+      (( remaining > 0 )) && warn "      … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})"
+      break
+    done
+  done
+}
+
 _print_unhealthy_pod_summary() {
   local total=0
   local line ns name phase ready reason message owner_kind owner_name waiting terminated restarts created
-  local -a unhealthy_lines=()
 
   # Use cached POD_LINES if available; otherwise refresh once.
   if (( ${#POD_LINES[@]} == 0 )); then
@@ -4818,12 +4858,11 @@ _print_unhealthy_pod_summary() {
     }
   fi
 
+  local -a root_cause_lines=() downstream_lines=()
   for line in "${POD_LINES[@]}"; do
     [[ -z "${line}" ]] && continue
     IFS="${_POD_FS}" read -r ns name phase ready reason message owner_kind owner_name waiting terminated restarts created <<<"${line}"
     if ! _pod_is_healthy "${phase}" "${ready}" "${waiting}" "${terminated}" "${reason}"; then
-      # Build a compact "[Phase ready/total, reason]" suffix. We omit empty
-      # reason fields rather than printing literal "[Pending 0/1, ]".
       local suffix="[${phase} ${ready}"
       if [[ -n "${reason}" ]]; then
         suffix+=", ${reason}"
@@ -4833,61 +4872,46 @@ _print_unhealthy_pod_summary() {
         suffix+=", ${terminated}"
       fi
       suffix+="]"
-      unhealthy_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}")
-      total=$((total + 1))
+      # PodInitializing is a transient downstream effect — another pod's failure
+      # is blocking this one. Separate it so the operator focuses on root causes.
+      if [[ "${waiting}" == "PodInitializing" || "${reason}" == "PodInitializing" ]]; then
+        downstream_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}")
+      else
+        root_cause_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}")
+        total=$((total + 1))
+      fi
     fi
   done
 
-  if (( total == 0 )); then
+  local downstream_count=${#downstream_lines[@]}
+
+  if (( total == 0 && downstream_count == 0 )); then
     log "✅ All pods are healthy at banner time."
     return 0
   fi
 
-  # Bucket by namespace so the banner is easy to skim. We stick to plain
-  # arrays (bash 3.2 has no associative arrays) by collecting unique
-  # namespaces in encounter order and counting occurrences in a parallel
-  # array.
-  local -a ns_keys=() ns_counts=()
-  local i found_idx
-  for line in "${unhealthy_lines[@]}"; do
-    IFS="${_POD_FS}" read -r ns _name _suffix <<<"${line}"
-    found_idx=-1
-    for (( i=0; i < ${#ns_keys[@]}; i++ )); do
-      if [[ "${ns_keys[$i]}" == "${ns}" ]]; then
-        found_idx="$i"
-        break
-      fi
-    done
-    if (( found_idx == -1 )); then
-      ns_keys+=("${ns}")
-      ns_counts+=(1)
+  if (( total > 0 )); then
+    warn "${total} root-cause pod(s) need attention:"
+    _print_pod_section "${root_cause_lines[@]}"
+    warn ""
+  fi
+
+  if (( downstream_count > 0 )); then
+    if (( total > 0 )); then
+      warn "${downstream_count} pod(s) are still initializing (will recover once root-cause pod(s) above are healthy):"
     else
-      ns_counts[$found_idx]=$(( ns_counts[found_idx] + 1 ))
+      warn "${downstream_count} pod(s) are still initializing:"
     fi
-  done
+    _print_pod_section "${downstream_lines[@]}"
+    warn ""
+  fi
 
-  warn "${total} unhealthy pod(s) across ${#ns_keys[@]} namespace(s):"
-  for (( i=0; i < ${#ns_keys[@]}; i++ )); do
-    warn "  • ${ns_keys[$i]} (${ns_counts[$i]}):"
-    local printed=0
-    local max_per_ns=5  # avoid 200-line banners on truly broken clusters
-    local pn _pname _psuffix
-    for line in "${unhealthy_lines[@]}"; do
-      IFS="${_POD_FS}" read -r pn _pname _psuffix <<<"${line}"
-      [[ "${pn}" != "${ns_keys[$i]}" ]] && continue
-      warn "      - ${_pname} ${_psuffix}"
-      printed=$(( printed + 1 ))
-      if (( printed >= max_per_ns )); then
-        local remaining=$(( ns_counts[i] - printed ))
-        if (( remaining > 0 )); then
-          warn "      … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})"
-        fi
-        break
-      fi
-    done
-  done
-  warn ""
-  warn "Tip: scroll up to see per-pod logs, events, and recommended fixes."
+  if (( total > 0 )); then
+    warn "Tip: fix the root-cause pod(s) first — initializing pods will recover automatically."
+  else
+    warn "Tip: pods are still starting up — re-run the verifier in a few minutes."
+  fi
+  warn "     Scroll up to see per-pod logs, events, and recommended fixes."
 }
 
 # ====== SHOW PLATFORM ACCESS INFORMATION ======
@@ -5008,9 +5032,9 @@ show_platform_access_info() {
   log "============================================"
   log "📚 Documentation:"
   log "  Setup Guide: ./tools/cluster_setup/K0S_README.md"
-  log "  Setup Guide (Concise version): ./tools/cluster_setup/K0S_QUICKSTART.md"
+  log "  Deployment Guide: ./tools/cluster_setup/DEPLOYMENT_GUIDE.md"
+  log "  Troubleshooting: ./tools/cluster_setup/TROUBLESHOOTING.md"
   log "  Custom Resources: ./docs/CustomResources.md"
-  log "  Troubleshooting: Check operator logs and events above"
   log "============================================"
   log ""
 
@@ -5031,8 +5055,7 @@ show_platform_access_info() {
     warn "    Or re-run just the verifier (no install steps):"
     warn "       CONFIG_FILE=${CONFIG_FILE:-<your-config>} ${0} verify-pods"
   else
-    warn "⚠️  Your AI Platform is NOT ready to use yet: ${verify_rc} pod(s)"
-    warn "    are unhealthy. Summary:"
+    warn "⚠️  Your AI Platform is NOT ready to use yet. Summary:"
     log ""
     _print_unhealthy_pod_summary
     warn "    Re-run the verifier after fixing the issues above:"