From fc8861d96e2239406901e13b28a0e302ac33cae8 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 16 Jun 2026 09:03:19 +0530 Subject: [PATCH 1/6] fix: update documentation links in installer output K0S_QUICKSTART.md was removed during doc consolidation. Replace with DEPLOYMENT_GUIDE.md and TROUBLESHOOTING.md which are the current docs. Co-Authored-By: Claude Sonnet 4.6 --- tools/cluster_setup/k0s_cluster_with_stack.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 7fb87726..e78f6029 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -5008,9 +5008,9 @@ show_platform_access_info() { log "============================================" log "📚 Documentation:" log " Setup Guide: ./tools/cluster_setup/K0S_README.md" - log " Setup Guide (Concise version): ./tools/cluster_setup/K0S_QUICKSTART.md" + log " Deployment Guide: ./tools/cluster_setup/DEPLOYMENT_GUIDE.md" + log " Troubleshooting: ./tools/cluster_setup/TROUBLESHOOTING.md" log " Custom Resources: ./docs/CustomResources.md" - log " Troubleshooting: Check operator logs and events above" log "============================================" log "" From 668001ffeab68c644e657e6c42a57578b2c7cbe1 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 16 Jun 2026 09:06:58 +0530 Subject: [PATCH 2/6] fix: separate root-cause pods from downstream PodInitializing in health summary The post-install banner was listing PodInitializing workers alongside actual failures (e.g. ImagePullBackOff on the head pod), making it hard to identify what to fix. Now root-cause pods and downstream initializing pods are printed in separate sections with a clear tip to fix root causes first. Co-Authored-By: Claude Sonnet 4.6 --- tools/cluster_setup/k0s_cluster_with_stack.sh | 106 ++++++++++-------- 1 file changed, 58 insertions(+), 48 deletions(-) diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index e78f6029..f04334ae 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -4818,12 +4818,11 @@ _print_unhealthy_pod_summary() { } fi + local -a root_cause_lines=() downstream_lines=() for line in "${POD_LINES[@]}"; do [[ -z "${line}" ]] && continue IFS="${_POD_FS}" read -r ns name phase ready reason message owner_kind owner_name waiting terminated restarts created <<<"${line}" if ! _pod_is_healthy "${phase}" "${ready}" "${waiting}" "${terminated}" "${reason}"; then - # Build a compact "[Phase ready/total, reason]" suffix. We omit empty - # reason fields rather than printing literal "[Pending 0/1, ]". local suffix="[${phase} ${ready}" if [[ -n "${reason}" ]]; then suffix+=", ${reason}" @@ -4833,61 +4832,73 @@ _print_unhealthy_pod_summary() { suffix+=", ${terminated}" fi suffix+="]" - unhealthy_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}") - total=$((total + 1)) + # PodInitializing is a transient downstream effect — another pod's failure + # is blocking this one. Separate it so the operator focuses on root causes. + if [[ "${waiting}" == "PodInitializing" || "${reason}" == "PodInitializing" ]]; then + downstream_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}") + else + root_cause_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}") + total=$((total + 1)) + fi fi done - if (( total == 0 )); then + local downstream_count=${#downstream_lines[@]} + total=$(( total )) # root-cause count only + + if (( total == 0 && downstream_count == 0 )); then log "✅ All pods are healthy at banner time." return 0 fi - # Bucket by namespace so the banner is easy to skim. We stick to plain - # arrays (bash 3.2 has no associative arrays) by collecting unique - # namespaces in encounter order and counting occurrences in a parallel - # array. - local -a ns_keys=() ns_counts=() - local i found_idx - for line in "${unhealthy_lines[@]}"; do - IFS="${_POD_FS}" read -r ns _name _suffix <<<"${line}" - found_idx=-1 - for (( i=0; i < ${#ns_keys[@]}; i++ )); do - if [[ "${ns_keys[$i]}" == "${ns}" ]]; then - found_idx="$i" - break + # Helper: print namespace-bucketed pod list from an array + _print_pod_section() { + local -a lines=("$@") + local -a ns_keys=() ns_counts=() + local i found_idx line ns _name _suffix pn _pname _psuffix + for line in "${lines[@]}"; do + IFS="${_POD_FS}" read -r ns _name _suffix <<<"${line}" + found_idx=-1 + for (( i=0; i < ${#ns_keys[@]}; i++ )); do + [[ "${ns_keys[$i]}" == "${ns}" ]] && { found_idx=$i; break; } + done + if (( found_idx == -1 )); then + ns_keys+=("${ns}"); ns_counts+=(1) + else + ns_counts[$found_idx]=$(( ns_counts[found_idx] + 1 )) fi done - if (( found_idx == -1 )); then - ns_keys+=("${ns}") - ns_counts+=(1) - else - ns_counts[$found_idx]=$(( ns_counts[found_idx] + 1 )) - fi - done - - warn "${total} unhealthy pod(s) across ${#ns_keys[@]} namespace(s):" - for (( i=0; i < ${#ns_keys[@]}; i++ )); do - warn " • ${ns_keys[$i]} (${ns_counts[$i]}):" - local printed=0 - local max_per_ns=5 # avoid 200-line banners on truly broken clusters - local pn _pname _psuffix - for line in "${unhealthy_lines[@]}"; do - IFS="${_POD_FS}" read -r pn _pname _psuffix <<<"${line}" - [[ "${pn}" != "${ns_keys[$i]}" ]] && continue - warn " - ${_pname} ${_psuffix}" - printed=$(( printed + 1 )) - if (( printed >= max_per_ns )); then - local remaining=$(( ns_counts[i] - printed )) - if (( remaining > 0 )); then - warn " … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})" + for (( i=0; i < ${#ns_keys[@]}; i++ )); do + warn " • ${ns_keys[$i]} (${ns_counts[$i]}):" + local printed=0 + for line in "${lines[@]}"; do + IFS="${_POD_FS}" read -r pn _pname _psuffix <<<"${line}" + [[ "${pn}" != "${ns_keys[$i]}" ]] && continue + warn " - ${_pname} ${_psuffix}" + printed=$(( printed + 1 )) + if (( printed >= 5 )); then + local remaining=$(( ns_counts[i] - printed )) + (( remaining > 0 )) && warn " … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})" + break fi - break - fi + done done - done - warn "" - warn "Tip: scroll up to see per-pod logs, events, and recommended fixes." + } + + if (( total > 0 )); then + warn "${total} root-cause pod(s) need attention:" + _print_pod_section "${root_cause_lines[@]}" + warn "" + fi + + if (( downstream_count > 0 )); then + warn "${downstream_count} pod(s) are still initializing (waiting on root-cause pod(s) above to become ready):" + _print_pod_section "${downstream_lines[@]}" + warn "" + fi + + warn "Tip: fix the root-cause pod(s) first — initializing pods will recover automatically." + warn " Scroll up to see per-pod logs, events, and recommended fixes." } # ====== SHOW PLATFORM ACCESS INFORMATION ====== @@ -5031,8 +5042,7 @@ show_platform_access_info() { warn " Or re-run just the verifier (no install steps):" warn " CONFIG_FILE=${CONFIG_FILE:-} ${0} verify-pods" else - warn "⚠️ Your AI Platform is NOT ready to use yet: ${verify_rc} pod(s)" - warn " are unhealthy. Summary:" + warn "⚠️ Your AI Platform is NOT ready to use yet. Summary:" log "" _print_unhealthy_pod_summary warn " Re-run the verifier after fixing the issues above:" From 6992544169643e6aa57a37056d0380920f142c42 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 16 Jun 2026 09:11:02 +0530 Subject: [PATCH 3/6] fix: improve wording when only PodInitializing pods are present MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When there are no root-cause pods (e.g. head pod recovered but workers still initializing), the tip no longer says "waiting on root-cause pod(s) above" where there is nothing above. Instead shows "still starting up — re-run verifier in a few minutes". Co-Authored-By: Claude Sonnet 4.6 --- tools/cluster_setup/k0s_cluster_with_stack.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index f04334ae..f0a4bfe9 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -4892,12 +4892,20 @@ _print_unhealthy_pod_summary() { fi if (( downstream_count > 0 )); then - warn "${downstream_count} pod(s) are still initializing (waiting on root-cause pod(s) above to become ready):" + if (( total > 0 )); then + warn "${downstream_count} pod(s) are still initializing (will recover once root-cause pod(s) above are healthy):" + else + warn "${downstream_count} pod(s) are still initializing:" + fi _print_pod_section "${downstream_lines[@]}" warn "" fi - warn "Tip: fix the root-cause pod(s) first — initializing pods will recover automatically." + if (( total > 0 )); then + warn "Tip: fix the root-cause pod(s) first — initializing pods will recover automatically." + else + warn "Tip: pods are still starting up — re-run the verifier in a few minutes." + fi warn " Scroll up to see per-pod logs, events, and recommended fixes." } From fe51bc70f86356cb98b666214d03d460312441d4 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 16 Jun 2026 09:14:10 +0530 Subject: [PATCH 4/6] test: add unit test script for k0s_cluster_with_stack.sh pure-logic functions Covers build_image_url, object_store_auth_looks_like_placeholder, _pod_is_healthy, _classify_pod_failure, and _print_unhealthy_pod_summary (4 scenarios including mixed root-cause/PodInitializing, only-initializing, only-root-cause multi-namespace, and truncation). No cluster or network needed. Run: ./tools/cluster_setup/test_k0s_cluster_with_stack.sh [-v] [filter] Co-Authored-By: Claude Sonnet 4.6 --- .../test_k0s_cluster_with_stack.sh | 265 ++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100755 tools/cluster_setup/test_k0s_cluster_with_stack.sh diff --git a/tools/cluster_setup/test_k0s_cluster_with_stack.sh b/tools/cluster_setup/test_k0s_cluster_with_stack.sh new file mode 100755 index 00000000..1e73efc8 --- /dev/null +++ b/tools/cluster_setup/test_k0s_cluster_with_stack.sh @@ -0,0 +1,265 @@ +#!/usr/bin/env bash +# test_k0s_cluster_with_stack.sh +# Unit tests for pure-logic functions in k0s_cluster_with_stack.sh. +# No cluster, SSH, kubectl, or network access required. +# +# Usage: +# ./test_k0s_cluster_with_stack.sh # run all tests +# ./test_k0s_cluster_with_stack.sh -v # verbose (show each assertion) +# ./test_k0s_cluster_with_stack.sh pod # run only tests matching "pod" + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT="${SCRIPT_DIR}/k0s_cluster_with_stack.sh" + +VERBOSE=0 +FILTER="${1:-}" +if [[ "${FILTER}" == "-v" ]]; then VERBOSE=1; FILTER="${2:-}"; fi + +# ── Test framework ───────────────────────────────────────────────────────────── + +PASS=0; FAIL=0; SKIP=0 +_current_suite="" + +suite() { _current_suite="$1"; } + +assert_eq() { + local desc="$1" expected="$2" actual="$3" + if [[ -n "${FILTER}" && "${_current_suite} ${desc}" != *"${FILTER}"* ]]; then + SKIP=$(( SKIP + 1 )); return + fi + if [[ "${expected}" == "${actual}" ]]; then + PASS=$(( PASS + 1 )) + [[ "${VERBOSE}" == "1" ]] && echo " ✅ ${desc}" + else + FAIL=$(( FAIL + 1 )) + echo " ❌ ${desc}" + echo " expected: $(printf '%q' "${expected}")" + echo " actual : $(printf '%q' "${actual}")" + fi +} + +assert_rc() { + local desc="$1" expected_rc="$2" + shift 2 + if [[ -n "${FILTER}" && "${_current_suite} ${desc}" != *"${FILTER}"* ]]; then + SKIP=$(( SKIP + 1 )); return + fi + local actual_rc=0 + "$@" >/dev/null 2>&1 || actual_rc=$? + if [[ "${expected_rc}" == "${actual_rc}" ]]; then + PASS=$(( PASS + 1 )) + [[ "${VERBOSE}" == "1" ]] && echo " ✅ ${desc}" + else + FAIL=$(( FAIL + 1 )) + echo " ❌ ${desc} (expected rc=${expected_rc}, got rc=${actual_rc})" + fi +} + +# Source only the pure-logic functions we want to test. +# We stub out everything that touches the filesystem, network, or cluster. +_load_functions() { + # Minimal stubs so sourcing individual functions doesn't blow up + log() { :; } + warn() { :; } + err() { echo "ERROR: $*" >&2; exit 1; } + pf_ok() { :; } + pf_warn() { :; } + pf_fail() { :; } + + # Extract and eval each function by line range + eval "$(sed -n '569,581p' "${SCRIPT}")" # build_image_url + eval "$(sed -n '739,745p' "${SCRIPT}")" # object_store_auth_looks_like_placeholder + eval "$(sed -n '4385,4421p' "${SCRIPT}")" # _pod_is_healthy + eval "$(sed -n '4425,4454p' "${SCRIPT}")" # _classify_pod_failure + + # _POD_FS and summary helpers + eval "$(sed -n '4531p' "${SCRIPT}")" # _POD_FS=$'\x1f' + + # _print_unhealthy_pod_summary — find its actual end line dynamically + local start end + start=$(grep -n '^_print_unhealthy_pod_summary()' "${SCRIPT}" | cut -d: -f1) + end=$(awk -v s="${start}" 'NR>s && /^}$/{print NR; exit}' "${SCRIPT}") + eval "$(sed -n "${start},${end}p" "${SCRIPT}")" +} + +_load_functions + +# ── Tests: build_image_url ───────────────────────────────────────────────────── + +suite "build_image_url" +echo "▶ build_image_url" + +assert_eq "prepends registry to bare path" \ + "my.registry.io/splunk/operator:1.0" \ + "$(build_image_url "my.registry.io" "splunk/operator:1.0")" + +assert_eq "skips registry when image already has a host" \ + "ghcr.io/splunk/operator:1.0" \ + "$(build_image_url "my.registry.io" "ghcr.io/splunk/operator:1.0")" + +assert_eq "skips registry when image has IP host" \ + "10.0.0.1:5000/operator:1.0" \ + "$(build_image_url "my.registry.io" "10.0.0.1:5000/operator:1.0")" + +assert_eq "returns bare path when registry is empty" \ + "splunk/operator:1.0" \ + "$(build_image_url "" "splunk/operator:1.0")" + +assert_eq "returns bare path when registry is null" \ + "splunk/operator:1.0" \ + "$(build_image_url "null" "splunk/operator:1.0")" + +# ── Tests: object_store_auth_looks_like_placeholder ─────────────────────────── + +suite "object_store_auth_looks_like_placeholder" +echo "▶ object_store_auth_looks_like_placeholder" + +_run_placeholder_check() { + MINIO_ROOT_USER="$1" MINIO_ROOT_PASSWORD="$2" \ + object_store_auth_looks_like_placeholder +} + +assert_rc "detects angle bracket placeholder" 0 \ + bash -c "$(declare -f object_store_auth_looks_like_placeholder); MINIO_ROOT_USER='' MINIO_ROOT_PASSWORD='secret' object_store_auth_looks_like_placeholder" + +assert_rc "detects CHANGEME keyword in password" 0 \ + bash -c "$(declare -f object_store_auth_looks_like_placeholder); MINIO_ROOT_USER='admin' MINIO_ROOT_PASSWORD='CHANGEME' object_store_auth_looks_like_placeholder" + +assert_rc "detects changeme (lowercase)" 0 \ + bash -c "$(declare -f object_store_auth_looks_like_placeholder); MINIO_ROOT_USER='admin' MINIO_ROOT_PASSWORD='changeme' object_store_auth_looks_like_placeholder" + +assert_rc "accepts real credentials (returns 1)" 1 \ + bash -c "$(declare -f object_store_auth_looks_like_placeholder); MINIO_ROOT_USER='admin' MINIO_ROOT_PASSWORD='s3cr3t!' object_store_auth_looks_like_placeholder" + +# ── Tests: _pod_is_healthy ───────────────────────────────────────────────────── + +suite "_pod_is_healthy" +echo "▶ _pod_is_healthy" + +# args: phase ready waiting terminated reason +assert_rc "Running 2/2 is healthy" 0 _pod_is_healthy Running "2/2" "" "" "" +assert_rc "Running 1/2 is unhealthy" 1 _pod_is_healthy Running "1/2" "" "" "" +assert_rc "Succeeded is healthy" 0 _pod_is_healthy Succeeded "" "" "" "" +assert_rc "Pending is unhealthy" 1 _pod_is_healthy Pending "0/1" "" "" "" +assert_rc "Failed is unhealthy" 1 _pod_is_healthy Failed "0/1" "" "" "" +assert_rc "Unknown is unhealthy" 1 _pod_is_healthy Unknown "0/1" "" "" "" +assert_rc "CrashLoopBackOff is unhealthy" 1 _pod_is_healthy Running "0/1" "CrashLoopBackOff" "" "" +assert_rc "ImagePullBackOff is unhealthy" 1 _pod_is_healthy Running "0/1" "ImagePullBackOff" "" "" +assert_rc "ErrImagePull is unhealthy" 1 _pod_is_healthy Running "0/1" "ErrImagePull" "" "" +assert_rc "OOMKilled terminated is unhealthy" 1 _pod_is_healthy Running "1/1" "" "OOMKilled" "" +assert_rc "Error terminated is unhealthy" 1 _pod_is_healthy Running "1/1" "" "Error" "" +assert_rc "NodeLost reason is unhealthy" 1 _pod_is_healthy Running "1/1" "" "" "NodeLost" +assert_rc "Evicted reason is unhealthy" 1 _pod_is_healthy Running "1/1" "" "" "Evicted" +assert_rc "PodInitializing waiting is unhealthy" 1 _pod_is_healthy Pending "0/1" "PodInitializing" "" "" + +# ── Tests: _classify_pod_failure ────────────────────────────────────────────── + +suite "_classify_pod_failure" +echo "▶ _classify_pod_failure" + +# args: phase reason waiting terminated message +assert_eq "ImagePullBackOff → image-pull" \ + "image-pull" "$( _classify_pod_failure Pending "" "ImagePullBackOff" "" "")" +assert_eq "ErrImagePull → image-pull" \ + "image-pull" "$( _classify_pod_failure Running "" "ErrImagePull" "" "")" +assert_eq "CrashLoopBackOff → crashloop" \ + "crashloop" "$( _classify_pod_failure Running "" "CrashLoopBackOff" "" "")" +assert_eq "OOMKilled → oom" \ + "oom" "$( _classify_pod_failure Running "" "" "OOMKilled" "")" +assert_eq "Evicted → evicted" \ + "evicted" "$( _classify_pod_failure Running "Evicted" "" "" "")" +assert_eq "Pending with no signal → pending-long" \ + "pending-long" "$(_classify_pod_failure Pending "" "" "" "")" +assert_eq "Failed with no signal → failed" \ + "failed" "$( _classify_pod_failure Failed "" "" "" "")" + +# ── Tests: _print_unhealthy_pod_summary ─────────────────────────────────────── + +suite "_print_unhealthy_pod_summary pod" +echo "▶ _print_unhealthy_pod_summary" + +# Override warn/log to capture output +_captured="" +warn() { _captured+="WARN: $*"$'\n'; } +log() { _captured+="LOG: $*"$'\n'; } + +mk_pod_line() { + # ns name phase ready waiting + local ns=$1 name=$2 phase=$3 ready=$4 waiting=$5 + printf "%s" "${ns}${_POD_FS}${name}${_POD_FS}${phase}${_POD_FS}${ready}${_POD_FS}${_POD_FS}${_POD_FS}RS${_POD_FS}owner${_POD_FS}${waiting}${_POD_FS}${_POD_FS}0${_POD_FS}2026-06-16" +} + +# Scenario 1: mixed ImagePullBackOff (root cause) + PodInitializing (downstream) +_captured="" +declare -a POD_LINES=( + "$(mk_pod_line ai-platform head-pod Pending "0/2" "ImagePullBackOff")" + "$(mk_pod_line ai-platform gpu-worker-1 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-worker-2 Pending "0/1" "PodInitializing")" +) +_print_unhealthy_pod_summary +assert_eq "scenario 1: shows root-cause section header" \ + "1" "$(echo "${_captured}" | grep -c "^WARN: 1 root-cause pod")" +assert_eq "scenario 1: shows downstream section" \ + "1" "$(echo "${_captured}" | grep -c "still initializing")" +assert_eq "scenario 1: head pod appears in root-cause output" \ + "1" "$(echo "${_captured}" | grep -c "head-pod")" +assert_eq "scenario 1: tip mentions fixing root cause" \ + "1" "$(echo "${_captured}" | grep -c "fix the root-cause")" +assert_eq "scenario 1: downstream count is 2" \ + "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing")" + +# Scenario 2: only PodInitializing — no root cause +_captured="" +POD_LINES=( + "$(mk_pod_line ai-platform gpu-worker-1 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-worker-2 Pending "0/1" "PodInitializing")" +) +_print_unhealthy_pod_summary +assert_eq "scenario 2: no root-cause section shown" \ + "0" "$(echo "${_captured}" | grep -c "root-cause pod")" +assert_eq "scenario 2: shows 2 still initializing" \ + "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing")" +assert_eq "scenario 2: tip says re-run verifier" \ + "1" "$(echo "${_captured}" | grep -c "re-run the verifier")" + +# Scenario 3: only root causes, no PodInitializing — multi-namespace +_captured="" +POD_LINES=( + "$(mk_pod_line ai-platform head-pod Pending "0/2" "ImagePullBackOff")" + "$(mk_pod_line kube-system calico-node Pending "0/1" "CrashLoopBackOff")" +) +_print_unhealthy_pod_summary +assert_eq "scenario 3: 2 root-cause pods" \ + "1" "$(echo "${_captured}" | grep -c "^WARN: 2 root-cause pod")" +assert_eq "scenario 3: no downstream section" \ + "0" "$(echo "${_captured}" | grep -c "still initializing")" +assert_eq "scenario 3: both namespaces shown" \ + "1" "$(echo "${_captured}" | grep -c "ai-platform")" +assert_eq "scenario 3: kube-system also shown" \ + "1" "$(echo "${_captured}" | grep -c "kube-system")" + +# Scenario 4: max_per_ns truncation (>5 pods in one namespace) +_captured="" +POD_LINES=( + "$(mk_pod_line ai-platform head-pod Pending "0/2" "ImagePullBackOff")" + "$(mk_pod_line ai-platform gpu-w-1 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-w-2 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-w-3 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-w-4 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-w-5 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-w-6 Pending "0/1" "PodInitializing")" +) +_print_unhealthy_pod_summary +assert_eq "scenario 4: truncation ellipsis shown for downstream" \ + "1" "$(echo "${_captured}" | grep -c "… and")" + +# ── Summary ─────────────────────────────────────────────────────────────────── + +echo "" +echo "Results: ${PASS} passed, ${FAIL} failed, ${SKIP} skipped" +echo "" +if (( FAIL > 0 )); then + exit 1 +fi From 6b644234cc05a7e5fdea87e807ff5fbe687216bc Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 16 Jun 2026 09:40:52 +0530 Subject: [PATCH 5/6] fix: address PR #110 review comments on pod summary and test script k0s_cluster_with_stack.sh: - Remove no-op total=$(( total )) assignment - Restore max_per_ns named variable in _print_pod_section - Move _print_pod_section to a top-level helper (was re-defined as a global on every call to _print_unhealthy_pod_summary) test_k0s_cluster_with_stack.sh: - Drop set -e (grep -c returns exit 1 on zero matches, causing early exit) - Add || true to all grep -c expressions to make them non-fatal - Extract functions by name via _extract_fn() instead of brittle hard-coded line ranges; robust to unrelated edits shifting line numbers Co-Authored-By: Claude Sonnet 4.6 --- tools/cluster_setup/k0s_cluster_with_stack.sh | 72 ++++----- .../test_k0s_cluster_with_stack.sh | 137 +++++++++--------- 2 files changed, 107 insertions(+), 102 deletions(-) diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index f0a4bfe9..4d3df33e 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -4805,6 +4805,43 @@ _collect_pod_summary() { # We deliberately do NOT re-run kubectl by default: the diagnostics above # the banner already exhausted the freshest information; re-querying here # would add latency and risk a different snapshot, confusing the operator. + +# Helper: print a namespace-bucketed pod list from a delimited-line array. +# Called by _print_unhealthy_pod_summary for both root-cause and downstream sections. +_print_pod_section() { + local -a lines=("$@") + local -a ns_keys=() ns_counts=() + local i found_idx line ns _name _suffix pn _pname _psuffix + for line in "${lines[@]}"; do + IFS="${_POD_FS}" read -r ns _name _suffix <<<"${line}" + found_idx=-1 + for (( i=0; i < ${#ns_keys[@]}; i++ )); do + [[ "${ns_keys[$i]}" == "${ns}" ]] && { found_idx=$i; break; } + done + if (( found_idx == -1 )); then + ns_keys+=("${ns}"); ns_counts+=(1) + else + ns_counts[$found_idx]=$(( ns_counts[found_idx] + 1 )) + fi + done + for (( i=0; i < ${#ns_keys[@]}; i++ )); do + warn " • ${ns_keys[$i]} (${ns_counts[$i]}):" + local printed=0 + local max_per_ns=5 # avoid 200-line banners on truly broken clusters + for line in "${lines[@]}"; do + IFS="${_POD_FS}" read -r pn _pname _psuffix <<<"${line}" + [[ "${pn}" != "${ns_keys[$i]}" ]] && continue + warn " - ${_pname} ${_psuffix}" + printed=$(( printed + 1 )) + if (( printed >= max_per_ns )); then + local remaining=$(( ns_counts[i] - printed )) + (( remaining > 0 )) && warn " … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})" + break + fi + done + done +} + _print_unhealthy_pod_summary() { local total=0 local line ns name phase ready reason message owner_kind owner_name waiting terminated restarts created @@ -4844,47 +4881,12 @@ _print_unhealthy_pod_summary() { done local downstream_count=${#downstream_lines[@]} - total=$(( total )) # root-cause count only if (( total == 0 && downstream_count == 0 )); then log "✅ All pods are healthy at banner time." return 0 fi - # Helper: print namespace-bucketed pod list from an array - _print_pod_section() { - local -a lines=("$@") - local -a ns_keys=() ns_counts=() - local i found_idx line ns _name _suffix pn _pname _psuffix - for line in "${lines[@]}"; do - IFS="${_POD_FS}" read -r ns _name _suffix <<<"${line}" - found_idx=-1 - for (( i=0; i < ${#ns_keys[@]}; i++ )); do - [[ "${ns_keys[$i]}" == "${ns}" ]] && { found_idx=$i; break; } - done - if (( found_idx == -1 )); then - ns_keys+=("${ns}"); ns_counts+=(1) - else - ns_counts[$found_idx]=$(( ns_counts[found_idx] + 1 )) - fi - done - for (( i=0; i < ${#ns_keys[@]}; i++ )); do - warn " • ${ns_keys[$i]} (${ns_counts[$i]}):" - local printed=0 - for line in "${lines[@]}"; do - IFS="${_POD_FS}" read -r pn _pname _psuffix <<<"${line}" - [[ "${pn}" != "${ns_keys[$i]}" ]] && continue - warn " - ${_pname} ${_psuffix}" - printed=$(( printed + 1 )) - if (( printed >= 5 )); then - local remaining=$(( ns_counts[i] - printed )) - (( remaining > 0 )) && warn " … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})" - break - fi - done - done - } - if (( total > 0 )); then warn "${total} root-cause pod(s) need attention:" _print_pod_section "${root_cause_lines[@]}" diff --git a/tools/cluster_setup/test_k0s_cluster_with_stack.sh b/tools/cluster_setup/test_k0s_cluster_with_stack.sh index 1e73efc8..6cbeaaa1 100755 --- a/tools/cluster_setup/test_k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/test_k0s_cluster_with_stack.sh @@ -8,7 +8,9 @@ # ./test_k0s_cluster_with_stack.sh -v # verbose (show each assertion) # ./test_k0s_cluster_with_stack.sh pod # run only tests matching "pod" -set -euo pipefail +# Intentionally no set -e: grep -c returns exit code 1 on zero matches, which +# would cause the harness to exit early on legitimate "0 occurrences" assertions. +set -uo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT="${SCRIPT_DIR}/k0s_cluster_with_stack.sh" @@ -57,10 +59,23 @@ assert_rc() { fi } -# Source only the pure-logic functions we want to test. -# We stub out everything that touches the filesystem, network, or cluster. +# ── Function loader ──────────────────────────────────────────────────────────── +# Extract a named bash function (and its closing brace) from the script by +# name — robust to line-number shifts caused by unrelated edits. + +_extract_fn() { + local name="$1" + local start end + start=$(grep -n "^${name}()" "${SCRIPT}" | cut -d: -f1) + if [[ -z "${start}" ]]; then + echo "ERROR: function '${name}' not found in ${SCRIPT}" >&2 + return 1 + fi + end=$(awk -v s="${start}" 'NR>s && /^}$/{print NR; exit}' "${SCRIPT}") + sed -n "${start},${end}p" "${SCRIPT}" +} + _load_functions() { - # Minimal stubs so sourcing individual functions doesn't blow up log() { :; } warn() { :; } err() { echo "ERROR: $*" >&2; exit 1; } @@ -68,20 +83,15 @@ _load_functions() { pf_warn() { :; } pf_fail() { :; } - # Extract and eval each function by line range - eval "$(sed -n '569,581p' "${SCRIPT}")" # build_image_url - eval "$(sed -n '739,745p' "${SCRIPT}")" # object_store_auth_looks_like_placeholder - eval "$(sed -n '4385,4421p' "${SCRIPT}")" # _pod_is_healthy - eval "$(sed -n '4425,4454p' "${SCRIPT}")" # _classify_pod_failure + # Extract _POD_FS assignment (single line, not a function) + eval "$(grep '^_POD_FS=' "${SCRIPT}")" - # _POD_FS and summary helpers - eval "$(sed -n '4531p' "${SCRIPT}")" # _POD_FS=$'\x1f' - - # _print_unhealthy_pod_summary — find its actual end line dynamically - local start end - start=$(grep -n '^_print_unhealthy_pod_summary()' "${SCRIPT}" | cut -d: -f1) - end=$(awk -v s="${start}" 'NR>s && /^}$/{print NR; exit}' "${SCRIPT}") - eval "$(sed -n "${start},${end}p" "${SCRIPT}")" + eval "$(_extract_fn build_image_url)" + eval "$(_extract_fn object_store_auth_looks_like_placeholder)" + eval "$(_extract_fn _pod_is_healthy)" + eval "$(_extract_fn _classify_pod_failure)" + eval "$(_extract_fn _print_pod_section)" + eval "$(_extract_fn _print_unhealthy_pod_summary)" } _load_functions @@ -116,11 +126,6 @@ assert_eq "returns bare path when registry is null" \ suite "object_store_auth_looks_like_placeholder" echo "▶ object_store_auth_looks_like_placeholder" -_run_placeholder_check() { - MINIO_ROOT_USER="$1" MINIO_ROOT_PASSWORD="$2" \ - object_store_auth_looks_like_placeholder -} - assert_rc "detects angle bracket placeholder" 0 \ bash -c "$(declare -f object_store_auth_looks_like_placeholder); MINIO_ROOT_USER='' MINIO_ROOT_PASSWORD='secret' object_store_auth_looks_like_placeholder" @@ -139,20 +144,20 @@ suite "_pod_is_healthy" echo "▶ _pod_is_healthy" # args: phase ready waiting terminated reason -assert_rc "Running 2/2 is healthy" 0 _pod_is_healthy Running "2/2" "" "" "" -assert_rc "Running 1/2 is unhealthy" 1 _pod_is_healthy Running "1/2" "" "" "" -assert_rc "Succeeded is healthy" 0 _pod_is_healthy Succeeded "" "" "" "" -assert_rc "Pending is unhealthy" 1 _pod_is_healthy Pending "0/1" "" "" "" -assert_rc "Failed is unhealthy" 1 _pod_is_healthy Failed "0/1" "" "" "" -assert_rc "Unknown is unhealthy" 1 _pod_is_healthy Unknown "0/1" "" "" "" -assert_rc "CrashLoopBackOff is unhealthy" 1 _pod_is_healthy Running "0/1" "CrashLoopBackOff" "" "" -assert_rc "ImagePullBackOff is unhealthy" 1 _pod_is_healthy Running "0/1" "ImagePullBackOff" "" "" -assert_rc "ErrImagePull is unhealthy" 1 _pod_is_healthy Running "0/1" "ErrImagePull" "" "" -assert_rc "OOMKilled terminated is unhealthy" 1 _pod_is_healthy Running "1/1" "" "OOMKilled" "" -assert_rc "Error terminated is unhealthy" 1 _pod_is_healthy Running "1/1" "" "Error" "" -assert_rc "NodeLost reason is unhealthy" 1 _pod_is_healthy Running "1/1" "" "" "NodeLost" -assert_rc "Evicted reason is unhealthy" 1 _pod_is_healthy Running "1/1" "" "" "Evicted" -assert_rc "PodInitializing waiting is unhealthy" 1 _pod_is_healthy Pending "0/1" "PodInitializing" "" "" +assert_rc "Running 2/2 is healthy" 0 _pod_is_healthy Running "2/2" "" "" "" +assert_rc "Running 1/2 is unhealthy" 1 _pod_is_healthy Running "1/2" "" "" "" +assert_rc "Succeeded is healthy" 0 _pod_is_healthy Succeeded "" "" "" "" +assert_rc "Pending is unhealthy" 1 _pod_is_healthy Pending "0/1" "" "" "" +assert_rc "Failed is unhealthy" 1 _pod_is_healthy Failed "0/1" "" "" "" +assert_rc "Unknown is unhealthy" 1 _pod_is_healthy Unknown "0/1" "" "" "" +assert_rc "CrashLoopBackOff is unhealthy" 1 _pod_is_healthy Running "0/1" "CrashLoopBackOff" "" "" +assert_rc "ImagePullBackOff is unhealthy" 1 _pod_is_healthy Running "0/1" "ImagePullBackOff" "" "" +assert_rc "ErrImagePull is unhealthy" 1 _pod_is_healthy Running "0/1" "ErrImagePull" "" "" +assert_rc "OOMKilled terminated is unhealthy" 1 _pod_is_healthy Running "1/1" "" "OOMKilled" "" +assert_rc "Error terminated is unhealthy" 1 _pod_is_healthy Running "1/1" "" "Error" "" +assert_rc "NodeLost reason is unhealthy" 1 _pod_is_healthy Running "1/1" "" "" "NodeLost" +assert_rc "Evicted reason is unhealthy" 1 _pod_is_healthy Running "1/1" "" "" "Evicted" +assert_rc "PodInitializing waiting is unhealthy" 1 _pod_is_healthy Pending "0/1" "PodInitializing" "" "" # ── Tests: _classify_pod_failure ────────────────────────────────────────────── @@ -161,32 +166,30 @@ echo "▶ _classify_pod_failure" # args: phase reason waiting terminated message assert_eq "ImagePullBackOff → image-pull" \ - "image-pull" "$( _classify_pod_failure Pending "" "ImagePullBackOff" "" "")" + "image-pull" "$(_classify_pod_failure Pending "" "ImagePullBackOff" "" "")" assert_eq "ErrImagePull → image-pull" \ - "image-pull" "$( _classify_pod_failure Running "" "ErrImagePull" "" "")" + "image-pull" "$(_classify_pod_failure Running "" "ErrImagePull" "" "")" assert_eq "CrashLoopBackOff → crashloop" \ - "crashloop" "$( _classify_pod_failure Running "" "CrashLoopBackOff" "" "")" + "crashloop" "$(_classify_pod_failure Running "" "CrashLoopBackOff" "" "")" assert_eq "OOMKilled → oom" \ - "oom" "$( _classify_pod_failure Running "" "" "OOMKilled" "")" + "oom" "$(_classify_pod_failure Running "" "" "OOMKilled" "")" assert_eq "Evicted → evicted" \ - "evicted" "$( _classify_pod_failure Running "Evicted" "" "" "")" + "evicted" "$(_classify_pod_failure Running "Evicted" "" "" "")" assert_eq "Pending with no signal → pending-long" \ - "pending-long" "$(_classify_pod_failure Pending "" "" "" "")" + "pending-long" "$(_classify_pod_failure Pending "" "" "" "")" assert_eq "Failed with no signal → failed" \ - "failed" "$( _classify_pod_failure Failed "" "" "" "")" + "failed" "$(_classify_pod_failure Failed "" "" "" "")" # ── Tests: _print_unhealthy_pod_summary ─────────────────────────────────────── suite "_print_unhealthy_pod_summary pod" echo "▶ _print_unhealthy_pod_summary" -# Override warn/log to capture output _captured="" warn() { _captured+="WARN: $*"$'\n'; } log() { _captured+="LOG: $*"$'\n'; } mk_pod_line() { - # ns name phase ready waiting local ns=$1 name=$2 phase=$3 ready=$4 waiting=$5 printf "%s" "${ns}${_POD_FS}${name}${_POD_FS}${phase}${_POD_FS}${ready}${_POD_FS}${_POD_FS}${_POD_FS}RS${_POD_FS}owner${_POD_FS}${waiting}${_POD_FS}${_POD_FS}0${_POD_FS}2026-06-16" } @@ -200,15 +203,15 @@ declare -a POD_LINES=( ) _print_unhealthy_pod_summary assert_eq "scenario 1: shows root-cause section header" \ - "1" "$(echo "${_captured}" | grep -c "^WARN: 1 root-cause pod")" + "1" "$(echo "${_captured}" | grep -c "^WARN: 1 root-cause pod" || true)" assert_eq "scenario 1: shows downstream section" \ - "1" "$(echo "${_captured}" | grep -c "still initializing")" + "1" "$(echo "${_captured}" | grep -c "still initializing" || true)" assert_eq "scenario 1: head pod appears in root-cause output" \ - "1" "$(echo "${_captured}" | grep -c "head-pod")" + "1" "$(echo "${_captured}" | grep -c "head-pod" || true)" assert_eq "scenario 1: tip mentions fixing root cause" \ - "1" "$(echo "${_captured}" | grep -c "fix the root-cause")" + "1" "$(echo "${_captured}" | grep -c "fix the root-cause" || true)" assert_eq "scenario 1: downstream count is 2" \ - "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing")" + "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing" || true)" # Scenario 2: only PodInitializing — no root cause _captured="" @@ -218,11 +221,11 @@ POD_LINES=( ) _print_unhealthy_pod_summary assert_eq "scenario 2: no root-cause section shown" \ - "0" "$(echo "${_captured}" | grep -c "root-cause pod")" + "0" "$(echo "${_captured}" | grep -c "root-cause pod" || true)" assert_eq "scenario 2: shows 2 still initializing" \ - "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing")" + "1" "$(echo "${_captured}" | grep -c "^WARN: 2 pod(s) are still initializing" || true)" assert_eq "scenario 2: tip says re-run verifier" \ - "1" "$(echo "${_captured}" | grep -c "re-run the verifier")" + "1" "$(echo "${_captured}" | grep -c "re-run the verifier" || true)" # Scenario 3: only root causes, no PodInitializing — multi-namespace _captured="" @@ -232,28 +235,28 @@ POD_LINES=( ) _print_unhealthy_pod_summary assert_eq "scenario 3: 2 root-cause pods" \ - "1" "$(echo "${_captured}" | grep -c "^WARN: 2 root-cause pod")" + "1" "$(echo "${_captured}" | grep -c "^WARN: 2 root-cause pod" || true)" assert_eq "scenario 3: no downstream section" \ - "0" "$(echo "${_captured}" | grep -c "still initializing")" -assert_eq "scenario 3: both namespaces shown" \ - "1" "$(echo "${_captured}" | grep -c "ai-platform")" -assert_eq "scenario 3: kube-system also shown" \ - "1" "$(echo "${_captured}" | grep -c "kube-system")" + "0" "$(echo "${_captured}" | grep -c "still initializing" || true)" +assert_eq "scenario 3: ai-platform namespace shown" \ + "1" "$(echo "${_captured}" | grep -c "ai-platform" || true)" +assert_eq "scenario 3: kube-system namespace shown" \ + "1" "$(echo "${_captured}" | grep -c "kube-system" || true)" # Scenario 4: max_per_ns truncation (>5 pods in one namespace) _captured="" POD_LINES=( - "$(mk_pod_line ai-platform head-pod Pending "0/2" "ImagePullBackOff")" - "$(mk_pod_line ai-platform gpu-w-1 Pending "0/1" "PodInitializing")" - "$(mk_pod_line ai-platform gpu-w-2 Pending "0/1" "PodInitializing")" - "$(mk_pod_line ai-platform gpu-w-3 Pending "0/1" "PodInitializing")" - "$(mk_pod_line ai-platform gpu-w-4 Pending "0/1" "PodInitializing")" - "$(mk_pod_line ai-platform gpu-w-5 Pending "0/1" "PodInitializing")" - "$(mk_pod_line ai-platform gpu-w-6 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform head-pod Pending "0/2" "ImagePullBackOff")" + "$(mk_pod_line ai-platform gpu-w-1 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-w-2 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-w-3 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-w-4 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-w-5 Pending "0/1" "PodInitializing")" + "$(mk_pod_line ai-platform gpu-w-6 Pending "0/1" "PodInitializing")" ) _print_unhealthy_pod_summary assert_eq "scenario 4: truncation ellipsis shown for downstream" \ - "1" "$(echo "${_captured}" | grep -c "… and")" + "1" "$(echo "${_captured}" | grep -c "… and" || true)" # ── Summary ─────────────────────────────────────────────────────────────────── From 5d97865290b2f29a7e94a4a2a4d73b61e93be621 Mon Sep 17 00:00:00 2001 From: Kumar Pratyush Date: Tue, 16 Jun 2026 09:51:07 +0530 Subject: [PATCH 6/6] fix: update banner comment to match new output format; remove unused local - Output format comment now shows the root-cause / still-initializing section headers instead of the old single-section format - Remove unused unhealthy_lines local (superseded by root_cause_lines and downstream_lines) Co-Authored-By: Claude Sonnet 4.6 --- tools/cluster_setup/k0s_cluster_with_stack.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 4d3df33e..29c72bb9 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -4796,11 +4796,15 @@ _collect_pod_summary() { # (e.g. verify ran a long time ago, or kubectl was unavailable) we fall back # to a single-line `kubectl get pods -A` query so the banner is never silent. # -# Output format: -# ⚠️ 3 unhealthy pod(s) across 2 namespace(s): -# ai-platform (1) airgap-cluster-l40s-…-l-worker-w957f [Running 1/2] -# kube-system (2) calico-node-f5qk7 [Pending 0/1, BackOff] -# konnectivity-agent-nkgrs [Pending 0/1] +# Output format (root-cause pods listed first, transient PodInitializing second): +# 1 root-cause pod(s) need attention: +# • ai-platform (1): +# - head-pod [Pending 0/2, ImagePullBackOff] +# +# 5 pod(s) are still initializing (will recover once root-cause pod(s) above are healthy): +# • ai-platform (5): +# - gpu-worker-1 [Pending 0/1, PodInitializing] +# … and 4 more in ai-platform (run: kubectl get pods -n ai-platform) # # We deliberately do NOT re-run kubectl by default: the diagnostics above # the banner already exhausted the freshest information; re-querying here @@ -4845,7 +4849,6 @@ _print_pod_section() { _print_unhealthy_pod_summary() { local total=0 local line ns name phase ready reason message owner_kind owner_name waiting terminated restarts created - local -a unhealthy_lines=() # Use cached POD_LINES if available; otherwise refresh once. if (( ${#POD_LINES[@]} == 0 )); then