diff --git a/tools/cluster_setup/k0s_cluster_with_stack.sh b/tools/cluster_setup/k0s_cluster_with_stack.sh index 7fb87726..29c72bb9 100755 --- a/tools/cluster_setup/k0s_cluster_with_stack.sh +++ b/tools/cluster_setup/k0s_cluster_with_stack.sh @@ -4796,19 +4796,59 @@ _collect_pod_summary() { # (e.g. verify ran a long time ago, or kubectl was unavailable) we fall back # to a single-line `kubectl get pods -A` query so the banner is never silent. # -# Output format: -# ⚠️ 3 unhealthy pod(s) across 2 namespace(s): -# ai-platform (1) airgap-cluster-l40s-…-l-worker-w957f [Running 1/2] -# kube-system (2) calico-node-f5qk7 [Pending 0/1, BackOff] -# konnectivity-agent-nkgrs [Pending 0/1] +# Output format (root-cause pods listed first, transient PodInitializing second): +# 1 root-cause pod(s) need attention: +# • ai-platform (1): +# - head-pod [Pending 0/2, ImagePullBackOff] +# +# 5 pod(s) are still initializing (will recover once root-cause pod(s) above are healthy): +# • ai-platform (5): +# - gpu-worker-1 [Pending 0/1, PodInitializing] +# … and 4 more in ai-platform (run: kubectl get pods -n ai-platform) # # We deliberately do NOT re-run kubectl by default: the diagnostics above # the banner already exhausted the freshest information; re-querying here # would add latency and risk a different snapshot, confusing the operator. + +# Helper: print a namespace-bucketed pod list from a delimited-line array. +# Called by _print_unhealthy_pod_summary for both root-cause and downstream sections. 
+_print_pod_section() { + local -a lines=("$@") + local -a ns_keys=() ns_counts=() + local i found_idx line ns _name _suffix pn _pname _psuffix + for line in "${lines[@]}"; do + IFS="${_POD_FS}" read -r ns _name _suffix <<<"${line}" + found_idx=-1 + for (( i=0; i < ${#ns_keys[@]}; i++ )); do + [[ "${ns_keys[$i]}" == "${ns}" ]] && { found_idx=$i; break; } + done + if (( found_idx == -1 )); then + ns_keys+=("${ns}"); ns_counts+=(1) + else + ns_counts[$found_idx]=$(( ns_counts[found_idx] + 1 )) + fi + done + for (( i=0; i < ${#ns_keys[@]}; i++ )); do + warn " • ${ns_keys[$i]} (${ns_counts[$i]}):" + local printed=0 + local max_per_ns=5 # avoid 200-line banners on truly broken clusters + for line in "${lines[@]}"; do + IFS="${_POD_FS}" read -r pn _pname _psuffix <<<"${line}" + [[ "${pn}" != "${ns_keys[$i]}" ]] && continue + warn " - ${_pname} ${_psuffix}" + printed=$(( printed + 1 )) + if (( printed >= max_per_ns )); then + local remaining=$(( ns_counts[i] - printed )) + (( remaining > 0 )) && warn " … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})" + break + fi + done + done +} + _print_unhealthy_pod_summary() { local total=0 local line ns name phase ready reason message owner_kind owner_name waiting terminated restarts created - local -a unhealthy_lines=() # Use cached POD_LINES if available; otherwise refresh once. if (( ${#POD_LINES[@]} == 0 )); then @@ -4818,12 +4858,11 @@ _print_unhealthy_pod_summary() { } fi + local -a root_cause_lines=() downstream_lines=() for line in "${POD_LINES[@]}"; do [[ -z "${line}" ]] && continue IFS="${_POD_FS}" read -r ns name phase ready reason message owner_kind owner_name waiting terminated restarts created <<<"${line}" if ! _pod_is_healthy "${phase}" "${ready}" "${waiting}" "${terminated}" "${reason}"; then - # Build a compact "[Phase ready/total, reason]" suffix. We omit empty - # reason fields rather than printing literal "[Pending 0/1, ]". 
local suffix="[${phase} ${ready}" if [[ -n "${reason}" ]]; then suffix+=", ${reason}" @@ -4833,61 +4872,46 @@ _print_unhealthy_pod_summary() { suffix+=", ${terminated}" fi suffix+="]" - unhealthy_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}") - total=$((total + 1)) + # PodInitializing is a transient downstream effect — another pod's failure + # is blocking this one. Separate it so the operator focuses on root causes. + if [[ "${waiting}" == "PodInitializing" || "${reason}" == "PodInitializing" ]]; then + downstream_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}") + else + root_cause_lines+=("${ns}${_POD_FS}${name}${_POD_FS}${suffix}") + total=$((total + 1)) + fi fi done - if (( total == 0 )); then + local downstream_count=${#downstream_lines[@]} + + if (( total == 0 && downstream_count == 0 )); then log "✅ All pods are healthy at banner time." return 0 fi - # Bucket by namespace so the banner is easy to skim. We stick to plain - # arrays (bash 3.2 has no associative arrays) by collecting unique - # namespaces in encounter order and counting occurrences in a parallel - # array. 
- local -a ns_keys=() ns_counts=() - local i found_idx - for line in "${unhealthy_lines[@]}"; do - IFS="${_POD_FS}" read -r ns _name _suffix <<<"${line}" - found_idx=-1 - for (( i=0; i < ${#ns_keys[@]}; i++ )); do - if [[ "${ns_keys[$i]}" == "${ns}" ]]; then - found_idx="$i" - break - fi - done - if (( found_idx == -1 )); then - ns_keys+=("${ns}") - ns_counts+=(1) + if (( total > 0 )); then + warn "${total} root-cause pod(s) need attention:" + _print_pod_section "${root_cause_lines[@]}" + warn "" + fi + + if (( downstream_count > 0 )); then + if (( total > 0 )); then + warn "${downstream_count} pod(s) are still initializing (will recover once root-cause pod(s) above are healthy):" else - ns_counts[$found_idx]=$(( ns_counts[found_idx] + 1 )) + warn "${downstream_count} pod(s) are still initializing:" fi - done + _print_pod_section "${downstream_lines[@]}" + warn "" + fi - warn "${total} unhealthy pod(s) across ${#ns_keys[@]} namespace(s):" - for (( i=0; i < ${#ns_keys[@]}; i++ )); do - warn " • ${ns_keys[$i]} (${ns_counts[$i]}):" - local printed=0 - local max_per_ns=5 # avoid 200-line banners on truly broken clusters - local pn _pname _psuffix - for line in "${unhealthy_lines[@]}"; do - IFS="${_POD_FS}" read -r pn _pname _psuffix <<<"${line}" - [[ "${pn}" != "${ns_keys[$i]}" ]] && continue - warn " - ${_pname} ${_psuffix}" - printed=$(( printed + 1 )) - if (( printed >= max_per_ns )); then - local remaining=$(( ns_counts[i] - printed )) - if (( remaining > 0 )); then - warn " … and ${remaining} more in ${ns_keys[$i]} (run: kubectl get pods -n ${ns_keys[$i]})" - fi - break - fi - done - done - warn "" - warn "Tip: scroll up to see per-pod logs, events, and recommended fixes." + if (( total > 0 )); then + warn "Tip: fix the root-cause pod(s) first — initializing pods will recover automatically." + else + warn "Tip: pods are still starting up — re-run the verifier in a few minutes." + fi + warn " Scroll up to see per-pod logs, events, and recommended fixes." 
} # ====== SHOW PLATFORM ACCESS INFORMATION ====== @@ -5008,9 +5032,9 @@ show_platform_access_info() { log "============================================" log "📚 Documentation:" log " Setup Guide: ./tools/cluster_setup/K0S_README.md" - log " Setup Guide (Concise version): ./tools/cluster_setup/K0S_QUICKSTART.md" + log " Deployment Guide: ./tools/cluster_setup/DEPLOYMENT_GUIDE.md" + log " Troubleshooting: ./tools/cluster_setup/TROUBLESHOOTING.md" log " Custom Resources: ./docs/CustomResources.md" - log " Troubleshooting: Check operator logs and events above" log "============================================" log "" @@ -5031,8 +5055,7 @@ show_platform_access_info() { warn " Or re-run just the verifier (no install steps):" warn " CONFIG_FILE=${CONFIG_FILE:-} ${0} verify-pods" else - warn "⚠️ Your AI Platform is NOT ready to use yet: ${verify_rc} pod(s)" - warn " are unhealthy. Summary:" + warn "⚠️ Your AI Platform is NOT ready to use yet. Summary:" log "" _print_unhealthy_pod_summary warn " Re-run the verifier after fixing the issues above:" diff --git a/tools/cluster_setup/test_k0s_cluster_with_stack.sh b/tools/cluster_setup/test_k0s_cluster_with_stack.sh new file mode 100755 index 00000000..6cbeaaa1 --- /dev/null +++ b/tools/cluster_setup/test_k0s_cluster_with_stack.sh @@ -0,0 +1,268 @@ +#!/usr/bin/env bash +# test_k0s_cluster_with_stack.sh +# Unit tests for pure-logic functions in k0s_cluster_with_stack.sh. +# No cluster, SSH, kubectl, or network access required. +# +# Usage: +# ./test_k0s_cluster_with_stack.sh # run all tests +# ./test_k0s_cluster_with_stack.sh -v # verbose (show each assertion) +# ./test_k0s_cluster_with_stack.sh pod # run only tests matching "pod" + +# Intentionally no set -e: grep -c returns exit code 1 on zero matches, which +# would cause the harness to exit early on legitimate "0 occurrences" assertions. 
set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPT="${SCRIPT_DIR}/k0s_cluster_with_stack.sh"

# Command line: `-v` enables verbose output; the first non-flag argument is a
# substring filter matched against "<suite> <description>". The two may be
# given in either order (`-v pod` and `pod -v` are equivalent).
VERBOSE=0
FILTER=""
for _arg in "$@"; do
  if [[ "${_arg}" == "-v" ]]; then
    VERBOSE=1
  elif [[ -z "${FILTER}" ]]; then
    FILTER="${_arg}"
  fi
done
unset _arg

# ── Test framework ─────────────────────────────────────────────────────────────

# Global counters updated by every assertion; read by the final summary.
PASS=0; FAIL=0; SKIP=0
_current_suite=""

# suite NAME — record the name of the suite the following assertions belong to.
suite() { _current_suite="$1"; }

# _assert_skipped DESC
# Returns 0 when a FILTER is set and "<suite> <DESC>" does not contain it,
# i.e. the assertion must be counted as skipped instead of evaluated.
_assert_skipped() {
  [[ -n "${FILTER}" && "${_current_suite} $1" != *"${FILTER}"* ]]
}

# assert_eq DESC EXPECTED ACTUAL
# Compares two strings literally. Increments PASS, FAIL or SKIP and prints a
# diagnostic on mismatch (values are %q-quoted so whitespace is visible).
# Always returns 0: the outcome lives in the counters, so an assertion can
# never leak a misleading status to its caller or trip `set -e` / an ERR trap.
assert_eq() {
  local desc="$1" expected="$2" actual="$3"
  if _assert_skipped "${desc}"; then
    SKIP=$(( SKIP + 1 ))
    return 0
  fi
  if [[ "${expected}" == "${actual}" ]]; then
    PASS=$(( PASS + 1 ))
    if [[ "${VERBOSE}" == "1" ]]; then
      printf '  ✅ %s\n' "${desc}"
    fi
  else
    FAIL=$(( FAIL + 1 ))
    printf '  ❌ %s\n' "${desc}"
    printf '     expected: %q\n' "${expected}"
    printf '     actual  : %q\n' "${actual}"
  fi
  return 0
}

# assert_rc DESC EXPECTED_RC COMMAND [ARGS...]
# Runs COMMAND with its output discarded and compares its exit status with
# EXPECTED_RC. Counter and return-status behaviour matches assert_eq.
assert_rc() {
  local desc="$1" expected_rc="$2"
  shift 2
  if _assert_skipped "${desc}"; then
    SKIP=$(( SKIP + 1 ))
    return 0
  fi
  local actual_rc=0
  # `|| actual_rc=$?` captures the status without aborting the harness.
  "$@" >/dev/null 2>&1 || actual_rc=$?
  if [[ "${expected_rc}" == "${actual_rc}" ]]; then
    PASS=$(( PASS + 1 ))
    if [[ "${VERBOSE}" == "1" ]]; then
      printf '  ✅ %s\n' "${desc}"
    fi
  else
    FAIL=$(( FAIL + 1 ))
    printf '  ❌ %s (expected rc=%s, got rc=%s)\n' "${desc}" "${expected_rc}" "${actual_rc}"
  fi
  return 0
}

# ── Function loader ────────────────────────────────────────────────────────────
# Extract a named bash function (and its closing brace) from the script by
# name — robust to line-number shifts caused by unrelated edits.
# _extract_fn NAME
# Prints the source text of function NAME from ${SCRIPT}: every line from the
# one that starts with "NAME()" through the first following line that is
# exactly "}". Returns 1 (with a message on stderr) when either boundary is
# missing, so callers never eval a truncated or empty definition.
_extract_fn() {
  local name="$1"
  local start end
  # index() does a literal prefix match, so NAME is never interpreted as a
  # regex; `exit` after the first hit keeps `start` a single line number even
  # if the name appears more than once.
  start=$(awk -v fn="${name}()" 'index($0, fn) == 1 { print NR; exit }' "${SCRIPT}")
  if [[ -z "${start}" ]]; then
    echo "ERROR: function '${name}' not found in ${SCRIPT}" >&2
    return 1
  fi
  end=$(awk -v s="${start}" 'NR > s && /^}$/ { print NR; exit }' "${SCRIPT}")
  if [[ -z "${end}" ]]; then
    echo "ERROR: no closing brace found for '${name}' in ${SCRIPT}" >&2
    return 1
  fi
  sed -n "${start},${end}p" "${SCRIPT}"
}

# _load_functions
# Defines no-op stand-ins for the script's logging helpers, then evals the
# _POD_FS assignment and the functions under test straight out of ${SCRIPT}.
# Aborts the run (via err) if anything cannot be located, instead of letting
# later assertions fail with unrelated "command not found" noise.
_load_functions() {
  log()     { :; }
  warn()    { :; }
  err()     { echo "ERROR: $*" >&2; exit 1; }
  pf_ok()   { :; }
  pf_warn() { :; }
  pf_fail() { :; }

  # Extract _POD_FS assignment (single line, not a function). It is the field
  # separator used to split POD_LINES records.
  local pod_fs_line
  pod_fs_line=$(grep -m 1 '^_POD_FS=' "${SCRIPT}") \
    || err "_POD_FS assignment not found in ${SCRIPT}"
  eval "${pod_fs_line}"

  local fn src
  for fn in \
    build_image_url \
    object_store_auth_looks_like_placeholder \
    _pod_is_healthy \
    _classify_pod_failure \
    _print_pod_section \
    _print_unhealthy_pod_summary; do
    src=$(_extract_fn "${fn}") || err "cannot load '${fn}' from ${SCRIPT}"
    eval "${src}"
  done
}

_load_functions

# ── Tests: build_image_url ─────────────────────────────────────────────────────

suite "build_image_url"
echo "▶ build_image_url"

assert_eq "prepends registry to bare path" \
  "my.registry.io/splunk/operator:1.0" \
  "$(build_image_url "my.registry.io" "splunk/operator:1.0")"

assert_eq "skips registry when image already has a host" \
  "ghcr.io/splunk/operator:1.0" \
  "$(build_image_url "my.registry.io" "ghcr.io/splunk/operator:1.0")"

assert_eq "skips registry when image has IP host" \
  "10.0.0.1:5000/operator:1.0" \
  "$(build_image_url "my.registry.io" "10.0.0.1:5000/operator:1.0")"

assert_eq "returns bare path when registry is empty" \
  "splunk/operator:1.0" \
  "$(build_image_url "" "splunk/operator:1.0")"

assert_eq "returns bare path when registry is null" \
  "splunk/operator:1.0" \
  "$(build_image_url "null" "splunk/operator:1.0")"

# ── Tests: object_store_auth_looks_like_placeholder ───────────────────────────

suite "object_store_auth_looks_like_placeholder"
echo "▶ object_store_auth_looks_like_placeholder"

# _placeholder_probe USER PASSWORD
# Runs the function in a fresh bash whose environment carries only the given
# credentials, so state from this harness cannot influence the result and the
# credentials are never spliced into the command string.
_placeholder_probe() {
  MINIO_ROOT_USER="$1" MINIO_ROOT_PASSWORD="$2" \
    bash -c "$(declare -f object_store_auth_looks_like_placeholder); object_store_auth_looks_like_placeholder"
}

# NOTE(review): the description says "angle bracket placeholder" but the user
# name passed is empty — the intended value (something like '<...>') may have
# been lost; confirm against the function and restore it.
assert_rc "detects angle bracket placeholder" 0 \
  _placeholder_probe '' 'secret'

assert_rc "detects CHANGEME keyword in password" 0 \
  _placeholder_probe 'admin' 'CHANGEME'

assert_rc "detects changeme (lowercase)" 0 \
  _placeholder_probe 'admin' 'changeme'

assert_rc "accepts real credentials (returns 1)" 1 \
  _placeholder_probe 'admin' 's3cr3t!'

# ── Tests: _pod_is_healthy ─────────────────────────────────────────────────────

suite "_pod_is_healthy"
echo "▶ _pod_is_healthy"

# args: phase ready waiting terminated reason
assert_rc "Running 2/2 is healthy"               0 _pod_is_healthy Running   "2/2" "" "" ""
assert_rc "Running 1/2 is unhealthy"             1 _pod_is_healthy Running   "1/2" "" "" ""
assert_rc "Succeeded is healthy"                 0 _pod_is_healthy Succeeded ""    "" "" ""
assert_rc "Pending is unhealthy"                 1 _pod_is_healthy Pending   "0/1" "" "" ""
assert_rc "Failed is unhealthy"                  1 _pod_is_healthy Failed    "0/1" "" "" ""
assert_rc "Unknown is unhealthy"                 1 _pod_is_healthy Unknown   "0/1" "" "" ""
assert_rc "CrashLoopBackOff is unhealthy"        1 _pod_is_healthy Running   "0/1" "CrashLoopBackOff" "" ""
assert_rc "ImagePullBackOff is unhealthy"        1 _pod_is_healthy Running   "0/1" "ImagePullBackOff" "" ""
assert_rc "ErrImagePull is unhealthy"            1 _pod_is_healthy Running   "0/1" "ErrImagePull" "" ""
assert_rc "OOMKilled terminated is unhealthy"    1 _pod_is_healthy Running   "1/1" "" "OOMKilled" ""
assert_rc "Error terminated is unhealthy"        1 _pod_is_healthy Running   "1/1" "" "Error" ""
assert_rc "NodeLost reason is unhealthy"         1 _pod_is_healthy Running   "1/1" "" "" "NodeLost"
assert_rc "Evicted reason is unhealthy"          1 _pod_is_healthy Running   "1/1" "" "" "Evicted"
assert_rc "PodInitializing waiting is unhealthy" 1 _pod_is_healthy Pending   "0/1" "PodInitializing" "" ""

# ── Tests: _classify_pod_failure ──────────────────────────────────────────────

suite "_classify_pod_failure"
echo "▶ _classify_pod_failure"

# args: phase reason waiting terminated message
assert_eq "ImagePullBackOff → image-pull" \
  "image-pull" "$(_classify_pod_failure Pending "" "ImagePullBackOff" "" "")"
assert_eq "ErrImagePull → image-pull" \
  "image-pull" "$(_classify_pod_failure Running "" "ErrImagePull" "" "")"
assert_eq "CrashLoopBackOff → crashloop" \
  "crashloop" "$(_classify_pod_failure Running "" "CrashLoopBackOff" "" "")"
assert_eq "OOMKilled → oom" \
  "oom" "$(_classify_pod_failure Running "" "" "OOMKilled" "")"
assert_eq "Evicted → evicted" \
  "evicted" "$(_classify_pod_failure Running "Evicted" "" "" "")"
assert_eq "Pending with no signal → pending-long" \
  "pending-long" "$(_classify_pod_failure Pending "" "" "" "")"
assert_eq "Failed with no signal → failed" \
  "failed" "$(_classify_pod_failure Failed "" "" "" "")"

# ── Tests: _print_unhealthy_pod_summary ───────────────────────────────────────

suite "_print_unhealthy_pod_summary pod"
echo "▶ _print_unhealthy_pod_summary"

# Replace the silent logging stubs with recorders so the banner text can be
# inspected line by line.
_captured=""
warn() { _captured+="WARN: $*"$'\n'; }
log()  { _captured+="LOG: $*"$'\n'; }

# _count_captured PATTERN
# Prints how many captured banner lines match PATTERN (a grep basic regex).
# `|| true` absorbs grep's exit status 1 on zero matches; the "0" it prints is
# the value the assertions compare against.
_count_captured() {
  grep -c -- "$1" <<<"${_captured}" || true
}

# mk_pod_line NS NAME PHASE READY WAITING
# Builds one 12-field POD_LINES record in the column order the summary reads:
# ns name phase ready reason message owner_kind owner_name waiting terminated
# restarts created. Fields not supplied by the caller are empty or fixed.
mk_pod_line() {
  local ns=$1 name=$2 phase=$3 ready=$4 waiting=$5
  printf "%s" "${ns}${_POD_FS}${name}${_POD_FS}${phase}${_POD_FS}${ready}${_POD_FS}${_POD_FS}${_POD_FS}RS${_POD_FS}owner${_POD_FS}${waiting}${_POD_FS}${_POD_FS}0${_POD_FS}2026-06-16"
}

# Scenario 1: mixed ImagePullBackOff (root cause) + PodInitializing (downstream)
_captured=""
declare -a POD_LINES=(
  "$(mk_pod_line ai-platform head-pod     Pending "0/2" "ImagePullBackOff")"
  "$(mk_pod_line ai-platform gpu-worker-1 Pending "0/1" "PodInitializing")"
  "$(mk_pod_line ai-platform gpu-worker-2 Pending "0/1" "PodInitializing")"
)
_print_unhealthy_pod_summary
assert_eq "scenario 1: shows root-cause section header" \
  "1" "$(_count_captured "^WARN: 1 root-cause pod")"
assert_eq "scenario 1: shows downstream section" \
  "1" "$(_count_captured "still initializing")"
assert_eq "scenario 1: head pod appears in root-cause output" \
  "1" "$(_count_captured "head-pod")"
assert_eq "scenario 1: tip mentions fixing root cause" \
  "1" "$(_count_captured "fix the root-cause")"
assert_eq "scenario 1: downstream count is 2" \
  "1" "$(_count_captured "^WARN: 2 pod(s) are still initializing")"

# Scenario 2: only PodInitializing — no root cause
_captured=""
POD_LINES=(
  "$(mk_pod_line ai-platform gpu-worker-1 Pending "0/1" "PodInitializing")"
  "$(mk_pod_line ai-platform gpu-worker-2 Pending "0/1" "PodInitializing")"
)
_print_unhealthy_pod_summary
assert_eq "scenario 2: no root-cause section shown" \
  "0" "$(_count_captured "root-cause pod")"
assert_eq "scenario 2: shows 2 still initializing" \
  "1" "$(_count_captured "^WARN: 2 pod(s) are still initializing")"
assert_eq "scenario 2: tip says re-run verifier" \
  "1" "$(_count_captured "re-run the verifier")"

# Scenario 3: only root causes, no PodInitializing — multi-namespace
_captured=""
POD_LINES=(
  "$(mk_pod_line ai-platform head-pod    Pending "0/2" "ImagePullBackOff")"
  "$(mk_pod_line kube-system calico-node Pending "0/1" "CrashLoopBackOff")"
)
_print_unhealthy_pod_summary
assert_eq "scenario 3: 2 root-cause pods" \
  "1" "$(_count_captured "^WARN: 2 root-cause pod")"
assert_eq "scenario 3: no downstream section" \
  "0" "$(_count_captured "still initializing")"
assert_eq "scenario 3: ai-platform namespace shown" \
  "1" "$(_count_captured "ai-platform")"
assert_eq "scenario 3: kube-system namespace shown" \
  "1" "$(_count_captured "kube-system")"

# Scenario 4: max_per_ns truncation (>5 pods in one namespace)
_captured=""
POD_LINES=(
  "$(mk_pod_line ai-platform head-pod Pending "0/2" "ImagePullBackOff")"
  "$(mk_pod_line ai-platform gpu-w-1  Pending "0/1" "PodInitializing")"
  "$(mk_pod_line ai-platform gpu-w-2  Pending "0/1" "PodInitializing")"
  "$(mk_pod_line ai-platform gpu-w-3  Pending "0/1" "PodInitializing")"
  "$(mk_pod_line ai-platform gpu-w-4  Pending "0/1" "PodInitializing")"
  "$(mk_pod_line ai-platform gpu-w-5  Pending "0/1" "PodInitializing")"
  "$(mk_pod_line ai-platform gpu-w-6  Pending "0/1" "PodInitializing")"
)
_print_unhealthy_pod_summary
assert_eq "scenario 4: truncation ellipsis shown for downstream" \
  "1" "$(_count_captured "… and")"

# ── Summary ───────────────────────────────────────────────────────────────────

echo ""
echo "Results: ${PASS} passed, ${FAIL} failed, ${SKIP} skipped"
echo ""
# A filter that matches nothing would otherwise look like a green run.
if [[ -n "${FILTER}" ]] && (( PASS + FAIL == 0 )); then
  echo "WARNING: filter '${FILTER}' matched no assertions" >&2
fi
if (( FAIL > 0 )); then
  exit 1
fi