From 46a6cb1ce712e4f05095130759818c4de2366c6b Mon Sep 17 00:00:00 2001 From: "rw-codebundle-agent[bot]" Date: Thu, 25 Jun 2026 15:39:46 +0000 Subject: [PATCH] Add vast-k8s-csi-health CodeBundle for VAST CSI monitoring. Monitors VAST CSI driver health, NFS xprt metrics, PVC-to-view tracing, workload mount status, StorageClass config, and optional VMS correlation. Co-authored-by: Cursor --- .../generation-rules/vast-k8s-csi-health.yaml | 54 +++ .../templates/vast-k8s-csi-health-sli.yaml | 52 +++ .../templates/vast-k8s-csi-health-slx.yaml | 27 ++ .../vast-k8s-csi-health-taskset.yaml | 51 +++ .../vast-k8s-csi-health/.test/README.md | 18 + .../vast-k8s-csi-health/.test/Taskfile.yaml | 132 ++++++ .../.test/kubernetes/manifest.yaml | 63 +++ codebundles/vast-k8s-csi-health/README.md | 69 ++++ .../vast-k8s-csi-health/check-csi-metrics.sh | 139 +++++++ .../check-csi-pod-health.sh | 132 ++++++ .../check-nfs-xprt-health.sh | 124 ++++++ .../check-pod-mount-health.sh | 112 +++++ .../check-vast-storageclass-config.sh | 101 +++++ .../correlate-k8s-vast-events.sh | 117 ++++++ codebundles/vast-k8s-csi-health/runbook.robot | 382 ++++++++++++++++++ .../sli-vast-csi-health-score.sh | 95 +++++ codebundles/vast-k8s-csi-health/sli.robot | 103 +++++ .../vast-k8s-csi-health/trace-pvc-to-vast.sh | 119 ++++++ .../vast-k8s-csi-health/vast-csi-common.sh | 122 ++++++ 19 files changed, 2012 insertions(+) create mode 100644 codebundles/vast-k8s-csi-health/.runwhen/generation-rules/vast-k8s-csi-health.yaml create mode 100644 codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-sli.yaml create mode 100644 codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-slx.yaml create mode 100644 codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-taskset.yaml create mode 100644 codebundles/vast-k8s-csi-health/.test/README.md create mode 100644 codebundles/vast-k8s-csi-health/.test/Taskfile.yaml create mode 100644 
codebundles/vast-k8s-csi-health/.test/kubernetes/manifest.yaml create mode 100644 codebundles/vast-k8s-csi-health/README.md create mode 100755 codebundles/vast-k8s-csi-health/check-csi-metrics.sh create mode 100755 codebundles/vast-k8s-csi-health/check-csi-pod-health.sh create mode 100755 codebundles/vast-k8s-csi-health/check-nfs-xprt-health.sh create mode 100755 codebundles/vast-k8s-csi-health/check-pod-mount-health.sh create mode 100755 codebundles/vast-k8s-csi-health/check-vast-storageclass-config.sh create mode 100755 codebundles/vast-k8s-csi-health/correlate-k8s-vast-events.sh create mode 100644 codebundles/vast-k8s-csi-health/runbook.robot create mode 100755 codebundles/vast-k8s-csi-health/sli-vast-csi-health-score.sh create mode 100644 codebundles/vast-k8s-csi-health/sli.robot create mode 100755 codebundles/vast-k8s-csi-health/trace-pvc-to-vast.sh create mode 100755 codebundles/vast-k8s-csi-health/vast-csi-common.sh diff --git a/codebundles/vast-k8s-csi-health/.runwhen/generation-rules/vast-k8s-csi-health.yaml b/codebundles/vast-k8s-csi-health/.runwhen/generation-rules/vast-k8s-csi-health.yaml new file mode 100644 index 00000000..2a7a29c7 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/.runwhen/generation-rules/vast-k8s-csi-health.yaml @@ -0,0 +1,54 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + generationRules: + # One SLX per namespace that has at least one VAST-backed PVC. 
+ - resourceTypes: + - persistentvolumeclaim + matchRules: + - type: pattern + pattern: ".+" + properties: [name] + mode: substring + - type: or + matches: + - type: pattern + pattern: "vast" + properties: [spec/storageClassName] + mode: substring + - type: pattern + pattern: "vast" + properties: [metadata/annotations] + mode: substring + slxs: + - baseName: vast-k8s-csi-health + shortenedBaseName: vast-csi-hlth + qualifiers: ["namespace", "cluster"] + baseTemplateName: vast-k8s-csi-health + levelOfDetail: basic + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: vast-k8s-csi-health-taskset.yaml + + # Optional cluster-level SLX for the CSI driver install namespace when + # operators want driver health monitoring before workload namespaces exist. + - resourceTypes: + - namespace + matchRules: + - type: pattern + pattern: "vast-csi" + properties: [name] + mode: substring + slxs: + - baseName: vast-csi-driver + shortenedBaseName: vast-csi-drv + qualifiers: ["namespace", "cluster"] + baseTemplateName: vast-k8s-csi-health + levelOfDetail: basic + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: vast-k8s-csi-health-taskset.yaml diff --git a/codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-sli.yaml b/codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-sli.yaml new file mode 100644 index 00000000..74d64095 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-sli.yaml @@ -0,0 +1,52 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + displayUnitsLong: OK + displayUnitsShort: ok + locations: + - {{default_location}} + description: Measures VAST CSI health using CSI pod readiness, PVC binding, mount success, and NFS xprt congestion. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/vast-k8s-csi-health/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 300 + configProvided: + - name: CONTEXT + value: "{{context}}" + - name: NAMESPACE + value: "{{match_resource.resource.metadata.namespace | default(namespace.name)}}" + - name: CSI_NAMESPACE + value: "{{ custom.vast_csi_namespace | default('vast-csi') }}" + - name: KUBERNETES_DISTRIBUTION_BINARY + value: "{{ custom.kubernetes_distribution_binary | default('kubectl') }}" + - name: XPRT_PENDING_THRESHOLD + value: "{{ custom.xprt_pending_threshold | default('100') }}" + - name: RPC_ERROR_RATE_THRESHOLD + value: "{{ custom.rpc_error_rate_threshold | default('5') }}" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{ custom.kubeconfig_secret_name | default("kubeconfig") }} + {% endif %} + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m diff --git a/codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-slx.yaml b/codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-slx.yaml new file mode 100644 index 00000000..20630f35 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-slx.yaml @@ -0,0 +1,27 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/pvc.svg + alias: {{namespace.name}} VAST CSI Health + asMeasuredBy: Aggregate score from CSI pod readiness, PVC binding, mount health, and NFS xprt metrics. 
+ configProvided: + - name: OBJECT_NAME + value: {{match_resource.resource.metadata.name}} + owners: + - {{workspace.owner_email}} + statement: VAST CSI-backed storage in this namespace should have healthy driver pods, bound PVCs, and successful workload mounts. + additionalContext: + {% include "kubernetes-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name }}" + tags: + {% include "kubernetes-tags.yaml" ignore missing %} + - name: access + value: read-only + - name: storage + value: vast-csi diff --git a/codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-taskset.yaml b/codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-taskset.yaml new file mode 100644 index 00000000..04f740b9 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/.runwhen/templates/vast-k8s-csi-health-taskset.yaml @@ -0,0 +1,51 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Monitors VAST CSI driver health and traces workload storage for the namespace. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/vast-k8s-csi-health/runbook.robot + configProvided: + - name: CONTEXT + value: "{{context}}" + - name: NAMESPACE + value: "{{match_resource.resource.metadata.namespace | default(namespace.name)}}" + - name: CSI_NAMESPACE + value: "{{ custom.vast_csi_namespace | default('vast-csi') }}" + - name: KUBERNETES_DISTRIBUTION_BINARY + value: "{{ custom.kubernetes_distribution_binary | default('kubectl') }}" + - name: VAST_VMS_ENDPOINT + value: "{{ custom.vast_vms_endpoint | default('') }}" + - name: VAST_CLUSTER_NAME + value: "{{ custom.vast_cluster_name | default('') }}" + - name: XPRT_PENDING_THRESHOLD + value: "{{ custom.xprt_pending_threshold | default('100') }}" + - name: RPC_ERROR_RATE_THRESHOLD + value: "{{ custom.rpc_error_rate_threshold | default('5') }}" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{ custom.kubeconfig_secret_name | default("kubeconfig") }} + {% endif %} + {% if custom.vast_vms_credentials_secret_name %} + - name: vast_vms_credentials + workspaceKey: {{ custom.vast_vms_credentials_secret_name }} + {% endif %} diff --git a/codebundles/vast-k8s-csi-health/.test/README.md b/codebundles/vast-k8s-csi-health/.test/README.md new file mode 100644 index 00000000..fd0974a5 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/.test/README.md @@ -0,0 +1,18 @@ +# Test infrastructure for vast-k8s-csi-health + +Static Kubernetes manifests under `kubernetes/manifest.yaml` create: + +- Namespace `test-vast-csi-health` +- StorageClass `vast-test-sc` with provisioner `csi.vastdata.com` +- PVC and Deployment referencing VAST storage + +## Usage + +```bash +task build-infra # kubectl apply manifests +task 
validate-generation-rules +task default # requires pushed commits + RunWhen Local +task clean +``` + +The PVC will remain Pending without a real VAST CSI driver; generation rules still match the StorageClass name and annotations for SLX discovery testing. diff --git a/codebundles/vast-k8s-csi-health/.test/Taskfile.yaml b/codebundles/vast-k8s-csi-health/.test/Taskfile.yaml new file mode 100644 index 00000000..735d8679 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/.test/Taskfile.yaml @@ -0,0 +1,132 @@ +version: "3" + +tasks: + default: + desc: "Run/refresh config" + cmds: + - task: check-unpushed-commits + - task: generate-rwl-config + - task: run-rwl-discovery + + clean: + desc: "Run cleanup tasks" + cmds: + - task: remove-kubernetes-objects + - task: delete-slxs + - task: clean-rwl-discovery + + build-infra: + desc: "Build test infrastructure" + cmds: + - task: create-kubernetes-objects + + create-kubernetes-objects: + desc: "Apply manifests from kubernetes directory using kubectl" + cmds: + - kubectl apply -f kubernetes/ + silent: true + + remove-kubernetes-objects: + desc: "Delete kubernetes objects" + cmds: + - kubectl delete -f kubernetes/ --ignore-not-found + silent: true + + check-unpushed-commits: + desc: Check if outstanding commits or file updates need to be pushed before testing. + vars: + BASE_DIR: "../" + cmds: + - | + echo "Checking for uncommitted changes in $BASE_DIR and $BASE_DIR.runwhen, excluding '.test'..." + UNCOMMITTED_FILES=$(git diff --name-only HEAD | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true) + if [ -n "$UNCOMMITTED_FILES" ]; then + echo "Uncommitted changes found:" + echo "$UNCOMMITTED_FILES" + exit 1 + fi + - | + echo "Checking for unpushed commits..." 
+ git fetch origin + UNPUSHED_FILES=$(git diff --name-only origin/$(git rev-parse --abbrev-ref HEAD) HEAD | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true) + if [ -n "$UNPUSHED_FILES" ]; then + echo "Unpushed commits found:" + echo "$UNPUSHED_FILES" + exit 1 + fi + silent: true + + generate-rwl-config: + desc: "Generate RunWhen Local configuration (workspaceInfo.yaml)" + env: + RW_WORKSPACE: '{{.RW_WORKSPACE | default "my-workspace"}}' + cmds: + - | + repo_url=$(git config --get remote.origin.url) + branch_name=$(git rev-parse --abbrev-ref HEAD) + codebundle=$(basename "$(dirname "$PWD")") + namespace=$(yq e 'select(.kind == "Namespace") | .metadata.name' kubernetes/manifest.yaml -N) + cat <<EOF > workspaceInfo.yaml + workspaceName: "$RW_WORKSPACE" + workspaceOwnerEmail: authors@runwhen.com + defaultLocation: location-01 + defaultLOD: none + cloudConfig: + kubernetes: + kubeconfigFile: /shared/kubeconfig + namespaceLODs: + $namespace: detailed + namespaces: + - $namespace + codeCollections: + - repoURL: "$repo_url" + branch: "$branch_name" + codeBundles: ["$codebundle"] + custom: + kubeconfig_secret_name: "kubeconfig" + kubernetes_distribution_binary: kubectl + vast_csi_namespace: vast-csi + EOF + silent: true + + run-rwl-discovery: + desc: "Run RunWhen Local Discovery on test infrastructure" + cmds: + - | + CONTAINER_NAME="RunWhenLocal" + docker rm -f $CONTAINER_NAME 2>/dev/null || true + sudo rm -rf output || true + mkdir -p output && chmod 777 output + kubeconfig=$(echo "$RW_FROM_FILE" | jq -r .kubeconfig) + docker run --name $CONTAINER_NAME -p 8081:8081 \ + -v "$(pwd)":/shared -v "$kubeconfig":/shared/kubeconfig \ + -d ghcr.io/runwhen-contrib/runwhen-local:latest + docker exec -w /workspace-builder $CONTAINER_NAME ./run.sh --verbose + silent: true + + validate-generation-rules: + desc: "Validate YAML files in .runwhen/generation-rules" + cmds: + - | + temp_dir=$(mktemp -d) + curl -s -o "$temp_dir/generation-rule-schema.json" \ + 
https://raw.githubusercontent.com/runwhen-contrib/runwhen-local/refs/heads/main/src/generation-rule-schema.json + for yaml_file in ../.runwhen/generation-rules/*.yaml; do + json_file="$temp_dir/$(basename "${yaml_file%.*}.json")" + yq -o=json "$yaml_file" > "$json_file" + ajv validate -s "$temp_dir/generation-rule-schema.json" -d "$json_file" --spec=draft2020 --strict=false + done + rm -rf "$temp_dir" + + clean-rwl-discovery: + desc: "Clean up RunWhen Local discovery output" + cmds: + - sudo rm -rf output + - rm -f workspaceInfo.yaml + silent: true + + delete-slxs: + desc: "Placeholder for platform SLX deletion (requires RW API credentials)" + cmds: + - echo "Configure RW_WORKSPACE, RW_API, and RW_PAT to delete SLXs from the platform." + silent: true diff --git a/codebundles/vast-k8s-csi-health/.test/kubernetes/manifest.yaml b/codebundles/vast-k8s-csi-health/.test/kubernetes/manifest.yaml new file mode 100644 index 00000000..619aab6d --- /dev/null +++ b/codebundles/vast-k8s-csi-health/.test/kubernetes/manifest.yaml @@ -0,0 +1,63 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: test-vast-csi-health + labels: + app.kubernetes.io/part-of: vast-k8s-csi-health-test + +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: vast-test-sc +provisioner: csi.vastdata.com +parameters: + endpoint: "192.0.2.10" + view_policy: "default" + tenant: "test-tenant" +reclaimPolicy: Delete +volumeBindingMode: Immediate + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: test-vast-pvc + namespace: test-vast-csi-health + annotations: + volume.kubernetes.io/storage-provisioner: csi.vastdata.com +spec: + accessModes: + - ReadWriteOnce + storageClassName: vast-test-sc + resources: + requests: + storage: 1Gi + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: test-vast-consumer + namespace: test-vast-csi-health +spec: + replicas: 1 + selector: + matchLabels: + app: test-vast-consumer + template: + metadata: + labels: + app: 
test-vast-consumer + spec: + containers: + - name: app + image: busybox:1.36 + command: ["/bin/sh", "-c", "while true; do sleep 30; done"] + volumeMounts: + - name: data + mountPath: /data + volumes: + - name: data + persistentVolumeClaim: + claimName: test-vast-pvc diff --git a/codebundles/vast-k8s-csi-health/README.md b/codebundles/vast-k8s-csi-health/README.md new file mode 100644 index 00000000..9af0a243 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/README.md @@ -0,0 +1,69 @@ +# VAST Data Kubernetes CSI Health + +Monitor the VAST CSI driver in Kubernetes and trace application storage from PVC/PV through to VAST views. Detects CSI driver failures, NFS transport congestion, mount issues, and optionally correlates in-cluster storage symptoms with VAST backend health. + +## Overview + +- **CSI driver health**: Controller and node pod readiness, CrashLoopBackOff, and restart counts in the CSI install namespace +- **CSI metrics**: RPC failure rates and slow operations from Prometheus `/metrics` on ports 9090 (node) and 9091 (controller) +- **NFS transport**: `csi_node_nfs_xprt_*` congestion, unhealthy VIPs, and pending request thresholds +- **PVC tracing**: Maps PVC → PV → StorageClass to VAST view path, tenant, and VIP identifiers +- **Workload mounts**: Pod mount failures, warning events, and VolumeAttachment issues for VAST volumes +- **StorageClass validation**: Endpoint, view policy, tenant, mount options, and expansion settings +- **VMS correlation**: Optional cross-reference of failing PVCs with VMS tenant capacity/QoS metrics + +## Configuration + +### Required Variables + +- `CONTEXT`: Kubernetes context name +- `NAMESPACE`: Kubernetes namespace for workload PVC tracing and mount checks + +### Optional Variables + +- `CSI_NAMESPACE`: Namespace where the VAST CSI driver is installed (default: `vast-csi`) +- `KUBERNETES_DISTRIBUTION_BINARY`: Kubernetes CLI binary (default: `kubectl`) +- `VAST_VMS_ENDPOINT`: Optional VMS REST base URL for backend 
correlation (e.g. `https://vms.example.com`) +- `VAST_CLUSTER_NAME`: Optional VAST cluster name used in correlation titles +- `XPRT_PENDING_THRESHOLD`: `csi_node_nfs_xprt_pending_requests` count that triggers an issue (default: `100`) +- `RPC_ERROR_RATE_THRESHOLD`: CSI RPC error rate percent threshold (default: `5`) + +### Secrets + +- `kubeconfig`: Standard kubeconfig YAML for Kubernetes cluster access +- `vast_vms_credentials` (optional): JSON object with `USERNAME` and `PASSWORD`, or `API_TOKEN`, for VMS API access when `VAST_VMS_ENDPOINT` is set + +## Tasks Overview + +### Check VAST CSI Driver Pod Health +Verifies CSI controller and node pods are Running/Ready; detects CrashLoopBackOff, not-Ready pods, high restarts, and replica gaps. + +### Check CSI Node and Controller Metrics for RPC Failures +Scrapes `/metrics` from CSI pods or headless metrics Services; flags elevated `csi_plugin_operations` error rates and slow RPC durations. + +### Check NFS Transport Health on CSI Nodes +Analyzes `csi_node_nfs_xprt_unhealthy`, `csi_node_nfs_xprt_congested_state`, and pending request metrics for VIP connectivity and congestion. + +### Trace Kubernetes PVCs to VAST Views +Produces a trace report linking each VAST-backed PVC to PV volumeHandle, StorageClass parameters, view path, tenant, and VIP. + +### Check End-to-End Pod Mount Health +Finds pods using VAST PVCs that are not Ready, plus mount-related warning events and VolumeAttachment failures. + +### Check VAST StorageClass Configuration +Validates VAST StorageClass parameters (endpoint, view policy, tenant, mount options) for misconfigurations. + +### Correlate Kubernetes Storage Events with VAST Tenant Metrics +When `VAST_VMS_ENDPOINT` is configured, fetches `/api/prometheusmetrics/tenants` and correlates unbound or failing PVCs with tenant signals. Skips gracefully with an informational report when the endpoint is unset. 
+ +## Platform Notes + +- VAST CSI metrics are exposed at `GET /metrics` on node port **9090** and controller port **9091** (override via Helm `metrics.port`) +- Enable metrics in the Helm chart: `metrics.enabled=true` +- StorageClass provisioner ID: `csi.vastdata.com` +- See [VAST CSI metrics documentation](https://kb.vastdata.com/documentation/docs/exporting-vast-csi-driver-metrics-to-prometheus) + +## Related Bundles + +- `k8s-pvc-healthcheck`: General PVC health; this bundle adds VAST-specific CSI metrics and tracing +- `vast-tenant-storage-health`: Backend tenant QoS and capacity (complements this Kubernetes front-end view) diff --git a/codebundles/vast-k8s-csi-health/check-csi-metrics.sh b/codebundles/vast-k8s-csi-health/check-csi-metrics.sh new file mode 100755 index 00000000..94d6dc5d --- /dev/null +++ b/codebundles/vast-k8s-csi-health/check-csi-metrics.sh @@ -0,0 +1,139 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# REQUIRED ENV VARS: CONTEXT, CSI_NAMESPACE +# OPTIONAL: RPC_ERROR_RATE_THRESHOLD (default 5) +# Scrapes CSI node (9090) and controller (9091) /metrics for RPC failures. 
+# Writes JSON array to csi_metrics_issues.json +# ----------------------------------------------------------------------------- +: "${CONTEXT:?Must set CONTEXT}" +: "${CSI_NAMESPACE:?Must set CSI_NAMESPACE}" + +OUTPUT_FILE="csi_metrics_issues.json" +RPC_ERROR_RATE_THRESHOLD="${RPC_ERROR_RATE_THRESHOLD:-5}" +NODE_METRICS_PORT="${NODE_METRICS_PORT:-9090}" +CONTROLLER_METRICS_PORT="${CONTROLLER_METRICS_PORT:-9091}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=vast-csi-common.sh +source "${SCRIPT_DIR}/vast-csi-common.sh" + +issues_json='[]' +metrics_body="" + +print_report() { + { set +x; } 2>/dev/null || true + echo + echo "=== VAST CSI metrics probe (context '${CONTEXT}', namespace '${CSI_NAMESPACE}') ===" + echo "RPC error rate threshold: ${RPC_ERROR_RATE_THRESHOLD}%" + if [[ -n "$metrics_body" ]]; then + echo "$metrics_body" | head -n 40 + echo "... (truncated)" + else + echo " No metrics payload retrieved." + fi + echo + if [[ -s "$OUTPUT_FILE" ]]; then + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true + fi +} +trap print_report EXIT + +fetch_metrics() { + local role="$1" + local port="$2" + local pods_json svc_json body + + pods_json=$([ "$role" == "node" ] && find_csi_node_pods || find_csi_controller_pods) + local pod + pod=$(echo "$pods_json" | jq -r '.items[0].metadata.name // empty') + if [[ -n "$pod" ]]; then + body=$(curl_pod_metrics "$pod" "${CSI_NAMESPACE}" "$port") + if [[ -n "$body" ]]; then + echo "$body" + return 0 + fi + fi + + svc_json=$(find_metrics_services) + local svc + svc=$(echo "$svc_json" | jq -r --arg role "$role" ' + [.[] | select(.name | test($role; "i")) | .name][0] // empty + ') + if [[ -z "$svc" ]]; then + svc=$(echo "$svc_json" | jq -r '.[0].name // empty') + fi + if [[ -n "$svc" ]]; then + body=$(curl_service_metrics "$svc" "${CSI_NAMESPACE}" "$port") + if [[ -n "$body" ]]; then + echo "$body" + return 0 + fi + fi + return 1 +} + +analyze_rpc_metrics() { + 
local role="$1" + local body="$2" + [[ -z "$body" ]] && return + + local total failed rate slow_ops + total=$(echo "$body" | awk '/^csi_plugin_operations_total\{/{sum+=$NF} END{print sum+0}') + failed=$(echo "$body" | awk '/^csi_plugin_operations_total\{[^}]*grpc_code="(Internal|Unknown|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|FailedPrecondition)"/{sum+=$NF} END{print sum+0}') + if [[ "${total:-0}" -gt 0 ]]; then + rate=$(awk "BEGIN {printf \"%.2f\", (${failed:-0}/${total})*100}") + if awk "BEGIN {exit !(${rate} > ${RPC_ERROR_RATE_THRESHOLD})}"; then + issues_json=$(append_issue "$issues_json" \ + "Elevated CSI RPC error rate on ${role} metrics (context \`${CONTEXT}\`)" \ + "csi_plugin_operations_total failures=${failed} of ${total} (${rate}% > threshold ${RPC_ERROR_RATE_THRESHOLD}%)." \ + 3 \ + "Inspect ${role} pod logs in ${CSI_NAMESPACE}. Correlate with VMS health and NFS xprt congestion metrics.") + fi + fi + + slow_ops=$(echo "$body" | awk '/^csi_plugin_operations_seconds\{/{if ($NF > 5) c++} END{print c+0}') + if [[ "${slow_ops:-0}" -gt 0 ]]; then + issues_json=$(append_issue "$issues_json" \ + "Slow CSI RPC operations detected on ${role} metrics" \ + "Found ${slow_ops} csi_plugin_operations_seconds samples exceeding 5s in ${CSI_NAMESPACE}." \ + 3 \ + "Check VMS latency, network path to VIPs, and node CPU pressure on CSI ${role} pods.") + fi + + if ! echo "$body" | grep -q '^csi_plugin_operations_total'; then + issues_json=$(append_issue "$issues_json" \ + "CSI plugin operation metrics missing from ${role} endpoint" \ + "Metrics endpoint responded but csi_plugin_operations_total was not present; metrics may be disabled." 
\ + 4 \ + "Enable metrics in the VAST CSI Helm chart (metrics.enabled=true) and verify ServiceMonitor or headless metrics Services.") + fi +} + +node_metrics="" +controller_metrics="" + +if node_metrics=$(fetch_metrics "node" "$NODE_METRICS_PORT"); then + metrics_body+=$'\n'"# Node metrics (port ${NODE_METRICS_PORT})"$'\n'"${node_metrics}" + analyze_rpc_metrics "node" "$node_metrics" +else + issues_json=$(append_issue "$issues_json" \ + "Unable to scrape VAST CSI node metrics in namespace \`${CSI_NAMESPACE}\`" \ + "Could not reach /metrics on node pods (port ${NODE_METRICS_PORT}) or metrics Services." \ + 3 \ + "Enable node metrics in Helm values. Verify pod exec/network access from the RunWhen execution environment.") +fi + +if controller_metrics=$(fetch_metrics "controller" "$CONTROLLER_METRICS_PORT"); then + metrics_body+=$'\n'"# Controller metrics (port ${CONTROLLER_METRICS_PORT})"$'\n'"${controller_metrics}" + analyze_rpc_metrics "controller" "$controller_metrics" +else + issues_json=$(append_issue "$issues_json" \ + "Unable to scrape VAST CSI controller metrics in namespace \`${CSI_NAMESPACE}\`" \ + "Could not reach /metrics on controller pods (port ${CONTROLLER_METRICS_PORT}) or metrics Services." \ + 3 \ + "Enable controller metrics in Helm values and confirm the controller metrics Service has endpoints.") +fi + +write_issues "$OUTPUT_FILE" "$issues_json" diff --git a/codebundles/vast-k8s-csi-health/check-csi-pod-health.sh b/codebundles/vast-k8s-csi-health/check-csi-pod-health.sh new file mode 100755 index 00000000..accbb3dc --- /dev/null +++ b/codebundles/vast-k8s-csi-health/check-csi-pod-health.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# REQUIRED ENV VARS: CONTEXT, CSI_NAMESPACE +# Checks VAST CSI controller and node pods for readiness and restart issues. 
+# Writes JSON array to csi_pod_health_issues.json +# ----------------------------------------------------------------------------- +: "${CONTEXT:?Must set CONTEXT}" +: "${CSI_NAMESPACE:?Must set CSI_NAMESPACE}" + +OUTPUT_FILE="csi_pod_health_issues.json" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=vast-csi-common.sh +source "${SCRIPT_DIR}/vast-csi-common.sh" + +issues_json='[]' + +print_report() { + { set +x; } 2>/dev/null || true + echo + echo "=== VAST CSI pods in namespace '${CSI_NAMESPACE}' (context '${CONTEXT}') ===" + k8s get pods -n "${CSI_NAMESPACE}" -o wide 2>/dev/null || echo " (unable to list pods)" + echo + if [[ -s "$OUTPUT_FILE" ]]; then + local ic + ic=$(jq 'length' "$OUTPUT_FILE" 2>/dev/null || echo 0) + echo "=== Findings (${ic}) ===" + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true + fi +} +trap print_report EXIT + +if ! k8s get ns "${CSI_NAMESPACE}" -o name &>/dev/null; then + issues_json=$(append_issue "$issues_json" \ + "VAST CSI namespace \`${CSI_NAMESPACE}\` not found in context \`${CONTEXT}\`" \ + "The configured CSI_NAMESPACE does not exist; driver health cannot be assessed." \ + 3 \ + "Verify CSI_NAMESPACE (default: vast-csi) and confirm the VAST CSI Helm release is installed.") + write_issues "$OUTPUT_FILE" "$issues_json" + exit 0 +fi + +check_pods() { + local role="$1" + local pods_json="$2" + local count + count=$(echo "$pods_json" | jq '.items | length') + if [[ "$count" -eq 0 ]]; then + issues_json=$(append_issue "$issues_json" \ + "No VAST CSI ${role} pods found in namespace \`${CSI_NAMESPACE}\`" \ + "Expected ${role} DaemonSet/Deployment pods for the VAST CSI driver were not discovered." \ + 2 \ + "Confirm the Helm release installed node/controller components. 
Check labels and pod selectors in ${CSI_NAMESPACE}.") + return + fi + + while IFS= read -r line; do + [[ -z "$line" ]] && continue + local name phase ready restarts crash + name=$(echo "$line" | jq -r '.name') + phase=$(echo "$line" | jq -r '.phase') + ready=$(echo "$line" | jq -r '.ready') + restarts=$(echo "$line" | jq -r '.restarts') + crash=$(echo "$line" | jq -r '.crash') + + if [[ "$crash" == "true" ]]; then + issues_json=$(append_issue "$issues_json" \ + "VAST CSI ${role} pod \`${name}\` is in CrashLoopBackOff" \ + "Pod phase=${phase}, ready=${ready}, restarts=${restarts} in namespace ${CSI_NAMESPACE}." \ + 2 \ + "Inspect logs: ${KUBECTL} logs -n ${CSI_NAMESPACE} ${name} --context ${CONTEXT}. Check VMS connectivity and mount permissions.") + elif [[ "$ready" != "True" ]]; then + issues_json=$(append_issue "$issues_json" \ + "VAST CSI ${role} pod \`${name}\` is not Ready" \ + "Pod phase=${phase}, restarts=${restarts} in namespace ${CSI_NAMESPACE}." \ + 2 \ + "Describe pod: ${KUBECTL} describe pod -n ${CSI_NAMESPACE} ${name} --context ${CONTEXT}.") + fi + + if [[ "${restarts}" =~ ^[0-9]+$ ]] && [[ "$restarts" -gt 5 ]]; then + issues_json=$(append_issue "$issues_json" \ + "Elevated restarts on VAST CSI ${role} pod \`${name}\`" \ + "Total container restarts: ${restarts} within namespace ${CSI_NAMESPACE}." \ + 2 \ + "Review recent logs and node NFS transport metrics; check for OOM or VMS endpoint instability.") + fi + done < <(echo "$pods_json" | jq -c '.items[] | { + name: .metadata.name, + phase: (.status.phase // "Unknown"), + ready: ((.status.conditions // []) | map(select(.type=="Ready")) | .[0].status // "False"), + restarts: ([.status.containerStatuses[]? | .restartCount // 0] | add // 0), + crash: ([.status.containerStatuses[]? | .state.waiting.reason? // empty] | map(select(. 
== "CrashLoopBackOff")) | length > 0) + }') +} + +node_pods=$(find_csi_node_pods) +controller_pods=$(find_csi_controller_pods) + +# Fallback: all pods in namespace if selectors miss custom installs +if [[ $(echo "$node_pods" | jq '.items | length') -eq 0 && $(echo "$controller_pods" | jq '.items | length') -eq 0 ]]; then + all_pods=$(k8s get pods -n "${CSI_NAMESPACE}" -o json 2>/dev/null || echo '{"items":[]}') + node_pods=$(echo "$all_pods" | jq '{items: [.items[] | select(.metadata.name | test("node"; "i"))]}') + controller_pods=$(echo "$all_pods" | jq '{items: [.items[] | select(.metadata.name | test("controller"; "i"))]}') +fi + +check_pods "node" "$node_pods" +check_pods "controller" "$controller_pods" + +# Deployment / DaemonSet replica alignment +for kind in deploy statefulset daemonset; do + resources=$(k8s get "$kind" -n "${CSI_NAMESPACE}" -o json 2>/dev/null || echo '{"items":[]}') + while IFS= read -r dline; do + [[ -z "$dline" ]] && continue + dname=$(echo "$dline" | jq -r '.name') + want=$(echo "$dline" | jq -r '.desired') + have=$(echo "$dline" | jq -r '.ready') + if [[ "$want" =~ ^[0-9]+$ ]] && [[ "$have" =~ ^[0-9]+$ ]] && [[ "$want" -gt 0 ]] && [[ "$have" -lt "$want" ]]; then + issues_json=$(append_issue "$issues_json" \ + "VAST CSI ${kind} \`${dname}\` is not fully Ready" \ + "readyReplicas=${have}, desired=${want} in namespace ${CSI_NAMESPACE}." 
\
+        2 \
+        "${KUBECTL} describe ${kind} -n ${CSI_NAMESPACE} ${dname} --context ${CONTEXT}")
+    fi
+  done < <(echo "$resources" | jq -c '.items[] | select(.metadata.name | test("vast|csi"; "i")) | {
+    name: .metadata.name,
+    desired: (.spec.replicas // (.status.desiredNumberScheduled // 0)),
+    ready: (.status.readyReplicas // (.status.numberReady // 0))
+  }')
+done
+
+write_issues "$OUTPUT_FILE" "$issues_json"
diff --git a/codebundles/vast-k8s-csi-health/check-nfs-xprt-health.sh b/codebundles/vast-k8s-csi-health/check-nfs-xprt-health.sh
new file mode 100755
index 00000000..8ef0b8a0
--- /dev/null
+++ b/codebundles/vast-k8s-csi-health/check-nfs-xprt-health.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# REQUIRED ENV VARS: CONTEXT, CSI_NAMESPACE
+# OPTIONAL: XPRT_PENDING_THRESHOLD (default 100)
+# Analyzes csi_node_nfs_xprt_* metrics for congestion and unhealthy VIPs.
+# Writes JSON array to nfs_xprt_issues.json
+#
+# NOTE: the metrics payload is always handed to grep/awk/jq through a
+# here-string instead of `echo "$var" | ...`. A reader that stops early
+# (grep -q, awk `exit`) can SIGPIPE the echo on large payloads; with
+# `pipefail` that turns a successful match into a failed pipeline, and with
+# `set -e` a plain assignment would abort the whole script.
+# -----------------------------------------------------------------------------
+: "${CONTEXT:?Must set CONTEXT}"
+: "${CSI_NAMESPACE:?Must set CSI_NAMESPACE}"
+
+OUTPUT_FILE="nfs_xprt_issues.json"
+XPRT_PENDING_THRESHOLD="${XPRT_PENDING_THRESHOLD:-100}"
+NODE_METRICS_PORT="${NODE_METRICS_PORT:-9090}"
+
+# The threshold is used in integer comparisons below; a non-numeric value would
+# make `[[ ... -gt ... ]]` fail, so fall back to the documented default.
+if ! [[ "$XPRT_PENDING_THRESHOLD" =~ ^[0-9]+$ ]]; then
+  echo "Invalid XPRT_PENDING_THRESHOLD='${XPRT_PENDING_THRESHOLD}'; using 100" >&2
+  XPRT_PENDING_THRESHOLD=100
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=vast-csi-common.sh
+source "${SCRIPT_DIR}/vast-csi-common.sh"
+
+issues_json='[]'
+metrics_body=""
+
+# EXIT trap: prints a human-readable excerpt (first 30 xprt metric lines) for
+# the task report, regardless of how the script terminated.
+print_report() {
+  { set +x; } 2>/dev/null || true
+  echo
+  echo "=== NFS xprt metrics (namespace '${CSI_NAMESPACE}', threshold pending=${XPRT_PENDING_THRESHOLD}) ==="
+  if [[ -n "$metrics_body" ]]; then
+    # grep -m stops after 30 matches itself, so no `| head` (and no SIGPIPE) is needed.
+    grep -m 30 -E '^csi_node_nfs_xprt' <<< "$metrics_body" || echo "  (no csi_node_nfs_xprt_* lines found)"
+  else
+    echo "  No metrics retrieved."
+  fi
+}
+trap print_report EXIT
+
+# Prints one metrics payload on stdout and returns 0, or returns 1 if none found.
+# Tries every CSI node pod first and accepts the first payload that contains
+# csi_node_nfs_xprt lines; otherwise falls back to the first metrics Service
+# whose name contains "node" (that payload is accepted if non-empty).
+fetch_node_metrics() {
+  local pods_json pod body svc
+  pods_json=$(find_csi_node_pods)
+  while IFS= read -r pod; do
+    [[ -z "$pod" ]] && continue
+    body=$(curl_pod_metrics "$pod" "${CSI_NAMESPACE}" "$NODE_METRICS_PORT")
+    if [[ -n "$body" ]] && grep -q 'csi_node_nfs_xprt' <<< "$body"; then
+      echo "$body"
+      return 0
+    fi
+  done < <(jq -r '.items[].metadata.name // empty' <<< "$pods_json")
+
+  svc=$(find_metrics_services | jq -r '[.[] | select(.name | test("node"; "i")) | .name][0] // empty')
+  if [[ -n "$svc" ]]; then
+    body=$(curl_service_metrics "$svc" "${CSI_NAMESPACE}" "$NODE_METRICS_PORT")
+    [[ -n "$body" ]] && echo "$body" && return 0
+  fi
+  return 1
+}
+
+if ! metrics_body=$(fetch_node_metrics); then
+  issues_json=$(append_issue "$issues_json" \
+    "NFS transport metrics unavailable from VAST CSI node pods" \
+    "Could not retrieve csi_node_nfs_xprt_* metrics from ${CSI_NAMESPACE} on context ${CONTEXT}." \
+    3 \
+    "Enable node metrics and ensure VIP connections are established. Metrics export only while VIPs are connected.")
+  write_issues "$OUTPUT_FILE" "$issues_json"
+  exit 0
+fi
+
+# Unhealthy transports: one issue per csi_node_nfs_xprt_unhealthy sample with value >= 1.
+while IFS= read -r line; do
+  [[ -z "$line" ]] && continue
+  dest=$(echo "$line" | sed -n 's/.*destination="\([^"]*\)".*/\1/p')
+  issues_json=$(append_issue "$issues_json" \
+    "Unhealthy NFS transport to VIP \`${dest:-unknown}\` on CSI node" \
+    "Metric line: ${line}" \
+    3 \
+    "Verify VIP reachability from worker nodes, check network ACLs, and inspect VMS cluster health for the destination VIP.")
+done < <(awk '/^csi_node_nfs_xprt_unhealthy\{/{if ($NF >= 1) print}' <<< "$metrics_body" || true)
+
+# Congested state: one issue per csi_node_nfs_xprt_congested_state sample >= 1,
+# enriched with the pending-request count for the same destination when that
+# count is at or above the threshold.
+while IFS= read -r line; do
+  [[ -z "$line" ]] && continue
+  dest=$(echo "$line" | sed -n 's/.*destination="\([^"]*\)".*/\1/p')
+  # Threshold is passed with -v (not spliced into the program text) and the
+  # destination is matched literally with index(), so "." in an IP is not a
+  # regex wildcard.
+  pending=$(awk -v d="$dest" -v t="$XPRT_PENDING_THRESHOLD" '
+    /^csi_node_nfs_xprt_pending_requests\{/ {
+      if (index($0, "destination=\"" d "\"") && ($NF + 0) >= (t + 0)) { print $NF; exit }
+    }' <<< "$metrics_body")
+  details="Congested transport detected. Line: ${line}"
+  if [[ -n "${pending:-}" ]]; then
+    details="${details} pending_requests=${pending} (threshold ${XPRT_PENDING_THRESHOLD})."
+  fi
+  issues_json=$(append_issue "$issues_json" \
+    "NFS transport congestion toward VIP \`${dest:-unknown}\`" \
+    "$details" \
+    3 \
+    "Investigate network congestion between workers and VAST VIPs. Review tenant QoS limits and workload I/O patterns.")
+done < <(awk '/^csi_node_nfs_xprt_congested_state\{/{if ($NF >= 1) print}' <<< "$metrics_body" || true)
+
+# Pending requests above threshold for destinations that have no issue yet.
+while IFS= read -r line; do
+  [[ -z "$line" ]] && continue
+  val=$(echo "$line" | awk '{print $NF}')
+  dest=$(echo "$line" | sed -n 's/.*destination="\([^"]*\)".*/\1/p')
+  # Compare on the integer part only; bash cannot compare floats.
+  if [[ "${val%%.*}" =~ ^[0-9]+$ ]] && [[ "${val%%.*}" -gt "${XPRT_PENDING_THRESHOLD}" ]]; then
+    # Every title in this script wraps the destination in backticks. Matching
+    # the delimited form keeps 10.0.0.10 from masking 10.0.0.1.
+    if ! jq -e --arg d "${dest:-unknown}" '.[] | select(.title | contains("`" + $d + "`"))' <<< "$issues_json" >/dev/null 2>&1; then
+      issues_json=$(append_issue "$issues_json" \
+        "High pending NFS requests toward VIP \`${dest:-unknown}\`" \
+        "csi_node_nfs_xprt_pending_requests=${val} exceeds threshold ${XPRT_PENDING_THRESHOLD}. Line: ${line}" \
+        3 \
+        "Check for slow VMS responses or network latency. Consider scaling tenant QoS or reducing concurrent mount pressure.")
+    fi
+  fi
+done < <(awk '/^csi_node_nfs_xprt_pending_requests\{/{print}' <<< "$metrics_body" || true)
+
+# No transports registered at all (unlabelled csi_node_nfs_xprt_total sample is zero).
+xprt_total=$(awk '/^csi_node_nfs_xprt_total /{print $NF; exit}' <<< "$metrics_body")
+xprt_connected=$(awk '/^csi_node_nfs_xprt_connected /{print $NF; exit}' <<< "$metrics_body")
+if [[ "${xprt_total:-1}" == "0.0" || "${xprt_total:-1}" == "0" ]]; then
+  issues_json=$(append_issue "$issues_json" \
+    "No NFS transports registered on VAST CSI node metrics" \
+    "csi_node_nfs_xprt_total=${xprt_total:-0} indicates no active VIP connections (csi_node_nfs_xprt_connected=${xprt_connected:-n/a})." \
+    2 \
+    "Confirm StorageClass endpoint/VIP configuration and that workloads have attempted mounts on this node.")
+fi
+
+write_issues "$OUTPUT_FILE" "$issues_json"
diff --git a/codebundles/vast-k8s-csi-health/check-pod-mount-health.sh b/codebundles/vast-k8s-csi-health/check-pod-mount-health.sh
new file mode 100755
index 00000000..0e82977a
--- /dev/null
+++ b/codebundles/vast-k8s-csi-health/check-pod-mount-health.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# REQUIRED ENV VARS: CONTEXT, NAMESPACE
+# Finds pods using VAST CSI volumes with mount / VolumeAttachment failures.
+# Writes JSON array to pod_mount_issues.json +# ----------------------------------------------------------------------------- +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="pod_mount_issues.json" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=vast-csi-common.sh +source "${SCRIPT_DIR}/vast-csi-common.sh" + +issues_json='[]' + +print_report() { + { set +x; } 2>/dev/null || true + echo + echo "=== Pod mount health for VAST volumes in '${NAMESPACE}' ===" + k8s get pods -n "${NAMESPACE}" -o wide 2>/dev/null || true + echo + if [[ -s "$OUTPUT_FILE" ]]; then + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true + fi +} +trap print_report EXIT + +vast_pvcs=$(list_vast_pvcs_json "${NAMESPACE}") +pvc_names=$(echo "$vast_pvcs" | jq -r '.items[].metadata.name // empty') + +if [[ -z "$pvc_names" ]]; then + all_pvcs=$(k8s get pvc -n "${NAMESPACE}" -o json 2>/dev/null || echo '{"items":[]}') + pvc_names=$(while IFS= read -r line; do + [[ -z "$line" ]] && continue + is_vast_pvc_json "$line" && echo "$line" | jq -r '.metadata.name' + done < <(echo "$all_pvcs" | jq -c '.items[]?')) +fi + +if [[ -z "$pvc_names" ]]; then + write_issues "$OUTPUT_FILE" "$issues_json" + exit 0 +fi + +while IFS= read -r pvc; do + [[ -z "$pvc" ]] && continue + pods_using=$(k8s get pods -n "${NAMESPACE}" -o json 2>/dev/null | jq -r --arg pvc "$pvc" ' + .items[] | select(.spec.volumes[]?.persistentVolumeClaim.claimName == $pvc) | .metadata.name + ') + while IFS= read -r pod; do + [[ -z "$pod" ]] && continue + pod_json=$(k8s get pod "$pod" -n "${NAMESPACE}" -o json 2>/dev/null || echo '{}') + phase=$(echo "$pod_json" | jq -r '.status.phase // "Unknown"') + mount_fail=$(echo "$pod_json" | jq -r ' + [.status.containerStatuses[]?.state.waiting.reason? // empty, + .status.initContainerStatuses[]?.state.waiting.reason? // empty] | + map(select(. == "ContainerCreating" or . 
== "CreateContainerError")) | length + ') + + not_ready=$(echo "$pod_json" | jq -r ' + ([.status.conditions[]? | select(.type=="Ready") | .status][0] // "False") + ') + + if [[ "$phase" == "Pending" || "$not_ready" == "False" ]]; then + issues_json=$(append_issue "$issues_json" \ + "Pod \`${pod}\` using VAST PVC \`${pvc}\` is not running/ready" \ + "Pod phase=${phase}, ready=${not_ready}, mount-related waits=${mount_fail} in namespace ${NAMESPACE}." \ + 3 \ + "Describe pod ${pod} and check for FailedMount / FailedAttachVolume events. Inspect CSI node logs on the scheduled node.") + fi + + events=$(k8s get events -n "${NAMESPACE}" --field-selector "involvedObject.name=${pod}" -o json 2>/dev/null || echo '{"items":[]}') + while IFS= read -r ev; do + [[ -z "$ev" ]] && continue + msg=$(echo "$ev" | jq -r '.message') + reason=$(echo "$ev" | jq -r '.reason') + if echo "$msg $reason" | grep -qiE 'mount|publish|attach|volume|nfs|vast|csi'; then + issues_json=$(append_issue "$issues_json" \ + "Mount-related event for pod \`${pod}\` (PVC \`${pvc}\`)" \ + "Event reason=${reason}: ${msg}" \ + 3 \ + "Review VolumeAttachment objects and CSI node logs. Correlate with NFS xprt metrics if mounts hang.") + fi + done < <(echo "$events" | jq -c '.items[]? | select(.type == "Warning")') + done <<< "$pods_using" +done <<< "$pvc_names" + +# VolumeAttachment issues for VAST PVs in this namespace +vas=$(k8s get volumeattachment -o json 2>/dev/null || echo '{"items":[]}') +while IFS= read -r va; do + [[ -z "$va" ]] && continue + attached=$(echo "$va" | jq -r '.status.attached // false') + err=$(echo "$va" | jq -r '.status.attachError.message // empty') + pv=$(echo "$va" | jq -r '.spec.source.persistentVolumeName // empty') + pod_ref=$(echo "$va" | jq -r '.spec.source.inlineVolumeSpec.claimRef.name // empty') + driver=$(echo "$va" | jq -r '.spec.attacher // empty') + + [[ "$driver" != "csi.vastdata.com" ]] && continue + [[ -n "$pod_ref" ]] && ! 
echo "$pvc_names" | grep -qx "$pod_ref" && continue + + if [[ "$attached" != "true" || -n "$err" ]]; then + issues_json=$(append_issue "$issues_json" \ + "VolumeAttachment failure for VAST PV \`${pv:-unknown}\`" \ + "attached=${attached}, error=${err:-none}, claimRef=${pod_ref:-n/a}." \ + 2 \ + "Describe volumeattachment and verify node driver registrar health. Check for stale attachments after node drains.") + fi +done < <(echo "$vas" | jq -c '.items[]?') + +write_issues "$OUTPUT_FILE" "$issues_json" diff --git a/codebundles/vast-k8s-csi-health/check-vast-storageclass-config.sh b/codebundles/vast-k8s-csi-health/check-vast-storageclass-config.sh new file mode 100755 index 00000000..8b944825 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/check-vast-storageclass-config.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# REQUIRED ENV VARS: CONTEXT +# Validates VAST StorageClass parameters for common misconfigurations. 
+# Writes JSON array to storageclass_config_issues.json
+# -----------------------------------------------------------------------------
+: "${CONTEXT:?Must set CONTEXT}"
+
+OUTPUT_FILE="storageclass_config_issues.json"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=vast-csi-common.sh
+source "${SCRIPT_DIR}/vast-csi-common.sh"
+
+issues_json='[]'
+
+# EXIT trap: prints the name/provisioner table, reduced to the header row and
+# rows mentioning "vast" or "csi.vastdata".
+print_report() {
+  { set +x; } 2>/dev/null || true
+  echo
+  echo "=== VAST StorageClasses in context '${CONTEXT}' ==="
+  k8s get storageclass -o custom-columns=NAME:.metadata.name,PROVISIONER:.provisioner 2>/dev/null \
+    | awk 'NR==1 || /vast|csi\.vastdata/' || true
+}
+trap print_report EXIT
+
+# A StorageClass counts as VAST when its provisioner is one of the two known
+# IDs or when its name contains "vast" (case-insensitive).
+scs=$(k8s get storageclass -o json 2>/dev/null || echo '{"items":[]}')
+vast_scs=$(jq -c '[.items[] | select(
+  (.provisioner == "csi.vastdata.com") or
+  (.provisioner == "kubernetes.io/csi/csi.vastdata.com") or
+  (.metadata.name | test("vast"; "i"))
+)]' <<< "$scs")
+
+count=$(jq 'length' <<< "$vast_scs")
+if [[ "$count" -eq 0 ]]; then
+  issues_json=$(append_issue "$issues_json" \
+    "No VAST CSI StorageClasses found in context \`${CONTEXT}\`" \
+    "No StorageClass uses provisioner csi.vastdata.com." \
+    3 \
+    "Install or register a VAST StorageClass via the CSI Helm chart. Confirm provisioner ID csi.vastdata.com.")
+  write_issues "$OUTPUT_FILE" "$issues_json"
+  exit 0
+fi
+
+while IFS= read -r sc; do
+  [[ -z "$sc" ]] && continue
+  name=$(jq -r '.metadata.name' <<< "$sc")
+  params=$(jq -r '.parameters // {}' <<< "$sc")
+  # mountOptions flattened to a comma-separated list, e.g. "vers=4.1,async".
+  mount_opts=$(jq -r '.mountOptions // [] | join(",")' <<< "$sc")
+  reclaim=$(jq -r '.reclaimPolicy // "Delete"' <<< "$sc")
+  vol_expansion=$(jq -r '.allowVolumeExpansion // false' <<< "$sc")
+
+  # Each logical setting is looked up under several alternative parameter keys.
+  endpoint=$(jq -r '.endpoint // .vip_pool // .vip // empty' <<< "$params")
+  view_policy=$(jq -r '.view_policy // .view // .root_export // empty' <<< "$params")
+  tenant=$(jq -r '.tenant // .tenant_name // empty' <<< "$params")
+  qos=$(jq -r '.qos_policy // .qos // empty' <<< "$params")
+
+  echo "StorageClass ${name}: endpoint=${endpoint:-missing}, view=${view_policy:-missing}, tenant=${tenant:-missing}, qos=${qos:-n/a}, mountOptions=${mount_opts:-none}"
+
+  if [[ -z "$endpoint" ]]; then
+    issues_json=$(append_issue "$issues_json" \
+      "VAST StorageClass \`${name}\` missing endpoint/VIP parameter" \
+      "parameters.endpoint (or vip_pool/vip) is not set; dynamic provisioning may fail or use incorrect VIPs." \
+      3 \
+      "Set endpoint to a reachable VAST VIP or DNS name in the StorageClass parameters.")
+  fi
+
+  if [[ -z "$view_policy" ]]; then
+    issues_json=$(append_issue "$issues_json" \
+      "VAST StorageClass \`${name}\` missing view policy parameter" \
+      "No view_policy/view/root_export parameter found; view creation defaults may not match tenant layout." \
+      4 \
+      "Align view_policy with VMS view templates for the target tenant ${tenant:-unknown}.")
+  fi
+
+  if [[ -z "$tenant" ]]; then
+    issues_json=$(append_issue "$issues_json" \
+      "VAST StorageClass \`${name}\` has no explicit tenant parameter" \
+      "Tenant is not specified; volumes may land in an unexpected tenant context." \
+      4 \
+      "Set tenant or tenant_name to the intended VMS tenant for capacity and QoS tracking.")
+  fi
+
+  if [[ "$reclaim" == "Retain" && "$vol_expansion" != "true" ]]; then
+    issues_json=$(append_issue "$issues_json" \
+      "VAST StorageClass \`${name}\` retains PVs without volume expansion enabled" \
+      "reclaimPolicy=Retain with allowVolumeExpansion=false can block operational growth for stateful workloads." \
+      4 \
+      "Enable allowVolumeExpansion or document manual expansion procedures for Retain volumes.")
+  fi
+
+  # Match "sync" as a complete option only. A plain substring search also hits
+  # "async" - the opposite setting - and would report it as strict sync.
+  if grep -qiE '(^|,)sync(,|$)' <<< "$mount_opts" && ! grep -qi 'noatime' <<< "$mount_opts"; then
+    issues_json=$(append_issue "$issues_json" \
+      "VAST StorageClass \`${name}\` uses strict sync mount options" \
+      "mountOptions=${mount_opts} may increase latency-sensitive workload impact on NFS." \
+      4 \
+      "Review mountOptions (mountUmountTimeout, resolveMountSymlinks) against workload latency requirements.")
+  fi
+done < <(jq -c '.[]' <<< "$vast_scs")
+
+write_issues "$OUTPUT_FILE" "$issues_json"
diff --git a/codebundles/vast-k8s-csi-health/correlate-k8s-vast-events.sh b/codebundles/vast-k8s-csi-health/correlate-k8s-vast-events.sh
new file mode 100755
index 00000000..a63f02de
--- /dev/null
+++ b/codebundles/vast-k8s-csi-health/correlate-k8s-vast-events.sh
@@ -0,0 +1,117 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# REQUIRED ENV VARS: CONTEXT, NAMESPACE
+# OPTIONAL: VAST_VMS_ENDPOINT, VAST_CLUSTER_NAME, vast_vms_credentials secret
+# Cross-references failing PVCs with VMS tenant metrics when configured.
+# Writes JSON array to vast_correlation_issues.json +# ----------------------------------------------------------------------------- +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="vast_correlation_issues.json" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=vast-csi-common.sh +source "${SCRIPT_DIR}/vast-csi-common.sh" + +issues_json='[]' +info_report="" + +print_report() { + { set +x; } 2>/dev/null || true + echo + echo "=== Kubernetes/VMS correlation for namespace '${NAMESPACE}' ===" + echo "${info_report:- (no correlation output)}" +} +trap print_report EXIT + +if [[ -z "${VAST_VMS_ENDPOINT:-}" ]]; then + info_report="VAST_VMS_ENDPOINT is not configured; skipping backend correlation (informational only)." + issues_json=$(append_issue "$issues_json" \ + "VMS backend correlation skipped for namespace \`${NAMESPACE}\`" \ + "Set VAST_VMS_ENDPOINT and optional vast_vms_credentials to cross-reference tenant capacity/QoS with Kubernetes storage events." \ + 4 \ + "Configure VAST_VMS_ENDPOINT to the VMS REST base URL (e.g. 
https://vms.example.com) and provide API credentials.") + write_issues "$OUTPUT_FILE" "$issues_json" + exit 0 +fi + +# Parse optional credentials from env (injected by platform from secret) +VMS_USER="${VMS_USERNAME:-${USERNAME:-}}" +VMS_PASS="${VMS_PASSWORD:-${PASSWORD:-}}" +VMS_TOKEN="${VMS_API_TOKEN:-${API_TOKEN:-}}" +if [[ -n "${vast_vms_credentials:-}" ]]; then + VMS_USER="${VMS_USER:-$(echo "$vast_vms_credentials" | jq -r '.USERNAME // .username // empty')}" + VMS_PASS="${VMS_PASS:-$(echo "$vast_vms_credentials" | jq -r '.PASSWORD // .password // empty')}" + VMS_TOKEN="${VMS_TOKEN:-$(echo "$vast_vms_credentials" | jq -r '.API_TOKEN // .api_token // empty')}" +fi + +fetch_vms_metrics() { + local path="$1" + local url="${VAST_VMS_ENDPOINT%/}${path}" + if [[ -n "$VMS_TOKEN" ]]; then + curl -sf -H "Authorization: Bearer ${VMS_TOKEN}" "$url" 2>/dev/null || true + elif [[ -n "$VMS_USER" && -n "$VMS_PASS" ]]; then + curl -sf -u "${VMS_USER}:${VMS_PASS}" "$url" 2>/dev/null || true + else + curl -sf "$url" 2>/dev/null || true + fi +} + +metrics=$(fetch_vms_metrics "/api/prometheusmetrics/tenants") +if [[ -z "$metrics" ]]; then + issues_json=$(append_issue "$issues_json" \ + "Unable to fetch VMS tenant metrics from \`${VAST_VMS_ENDPOINT}\`" \ + "Prometheus-format tenant metrics were not returned; verify credentials and network access." \ + 3 \ + "Confirm vast_vms_credentials (USERNAME/PASSWORD or API_TOKEN) and VMS API reachability from the execution environment.") + write_issues "$OUTPUT_FILE" "$issues_json" + exit 0 +fi + +info_report+="VMS tenant metrics sample (first 20 lines):"$'\n' +info_report+=$(echo "$metrics" | head -n 20) + +# Collect failing / pressured PVCs in namespace +failing_pvcs=$(k8s get pvc -n "${NAMESPACE}" -o json 2>/dev/null | jq -c ' + [.items[] | select(.status.phase != "Bound" or (.metadata.annotations["volume.kubernetes.io/storage-provisioner"]? 
// "" | test("vast"; "i")))] +') + +while IFS= read -r pvc_line; do + [[ -z "$pvc_line" ]] && continue + is_vast_pvc_json "$pvc_line" || continue + pvc_name=$(echo "$pvc_line" | jq -r '.metadata.name') + phase=$(echo "$pvc_line" | jq -r '.status.phase') + sc=$(echo "$pvc_line" | jq -r '.spec.storageClassName // empty') + sc_json=$(k8s get storageclass "$sc" -o json 2>/dev/null || echo '{}') + tenant=$(echo "$sc_json" | jq -r '.parameters.tenant // .parameters.tenant_name // empty') + + [[ -z "$tenant" ]] && tenant="unknown" + tenant_pattern=$(echo "$tenant" | sed 's/[][\/.^$*+?{}|()-]/\\&/g') + + cap_line=$(echo "$metrics" | grep -i "tenant.*${tenant_pattern}.*capacity" | head -n 1 || true) + qos_line=$(echo "$metrics" | grep -i "tenant.*${tenant_pattern}.*qos" | head -n 1 || true) + + if [[ "$phase" != "Bound" ]]; then + details="PVC ${pvc_name} phase=${phase}, tenant=${tenant}." + [[ -n "$cap_line" ]] && details+=" VMS capacity hint: ${cap_line}" + [[ -n "$qos_line" ]] && details+=" VMS QoS hint: ${qos_line}" + cluster_label="${VAST_CLUSTER_NAME:-${CONTEXT}}" + issues_json=$(append_issue "$issues_json" \ + "Kubernetes PVC \`${pvc_name}\` failures may correlate with VMS tenant \`${tenant}\` on cluster \`${cluster_label}\`" \ + "$details" \ + 3 \ + "Compare CSI driver logs with VMS tenant capacity/QoS dashboards. Expand tenant quota or resolve QoS throttling if backend pressure is confirmed.") + fi +done < <(echo "$failing_pvcs" | jq -c '.[]?') + +if [[ $(echo "$issues_json" | jq 'length') -eq 0 ]]; then + issues_json=$(append_issue "$issues_json" \ + "VMS correlation completed for namespace \`${NAMESPACE}\`" \ + "No failing VAST PVCs required backend correlation. VMS endpoint ${VAST_VMS_ENDPOINT} responded successfully." 
\ + 4 \ + "Re-run when PVC mount or binding failures occur to distinguish driver vs backend pressure.") +fi + +write_issues "$OUTPUT_FILE" "$issues_json" diff --git a/codebundles/vast-k8s-csi-health/runbook.robot b/codebundles/vast-k8s-csi-health/runbook.robot new file mode 100644 index 00000000..d220a9c2 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/runbook.robot @@ -0,0 +1,382 @@ +*** Settings *** +Documentation Monitors the VAST CSI driver and traces Kubernetes workload storage from PVCs through to VAST views, detecting driver failures, NFS congestion, and mount issues. +Metadata Author rw-codebundle-agent +Metadata Display Name VAST Data Kubernetes CSI Health +Metadata Supports Kubernetes VAST CSI NFS storage persistentvolumeclaim + +Force Tags Kubernetes VAST CSI storage health + +Library String +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform +Library RW.K8sHelper + +Suite Setup Suite Initialization + + +*** Tasks *** +Check VAST CSI Driver Pod Health in Namespace `${CSI_NAMESPACE}` on Cluster `${CONTEXT}` + [Documentation] Verifies CSI controller Deployment/StatefulSet and node DaemonSet pods are Running/Ready; checks for CrashLoopBackOff and recent restarts. + [Tags] Kubernetes VAST CSI access:read-only data:logs-config + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-csi-pod-health.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=CONTEXT="${CONTEXT}" CSI_NAMESPACE="${CSI_NAMESPACE}" ./check-csi-pod-health.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat csi_pod_health_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for CSI pod health task. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... 
severity=${issue['severity']}
+            ...    expected=VAST CSI controller and node pods should be Ready in namespace `${CSI_NAMESPACE}`
+            ...    actual=Unhealthy CSI pod signals detected on context `${CONTEXT}`
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    VAST CSI pod health analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+# Every task below follows the same four steps:
+#   1. run one bash script, which writes a JSON array of issues to a file;
+#   2. read that file back with `cat`;
+#   3. parse it - a parse failure is logged and treated as "no issues";
+#   4. raise one issue per array element and append the script stdout to the report.
+Check CSI Node and Controller Metrics for RPC Failures in Namespace `${CSI_NAMESPACE}`
+    [Documentation]    Scrapes /metrics from CSI node and controller endpoints; detects elevated csi_plugin_operations failures and slow RPC durations.
+    [Tags]    Kubernetes    VAST    CSI    metrics    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-csi-metrics.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=CONTEXT="${CONTEXT}" CSI_NAMESPACE="${CSI_NAMESPACE}" RPC_ERROR_RATE_THRESHOLD="${RPC_ERROR_RATE_THRESHOLD}" ./check-csi-metrics.sh
+
+    ${raw}=    RW.CLI.Run Cli
+    ...    cmd=cat csi_metrics_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${raw.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for CSI metrics task.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=CSI RPC error rates should remain below `${RPC_ERROR_RATE_THRESHOLD}` percent
+            ...    actual=CSI metrics analysis reported issues in namespace `${CSI_NAMESPACE}`
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    VAST CSI metrics analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Check NFS Transport Health on CSI Nodes in Namespace `${CSI_NAMESPACE}`
+    [Documentation]    Analyzes csi_node_nfs_xprt metrics for network congestion and unhealthy VIP connections on CSI node pods.
+    [Tags]    Kubernetes    VAST    NFS    metrics    access:read-only    data:metrics
+
+    # Issues are read from nfs_xprt_issues.json, written by the script.
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-nfs-xprt-health.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=CONTEXT="${CONTEXT}" CSI_NAMESPACE="${CSI_NAMESPACE}" XPRT_PENDING_THRESHOLD="${XPRT_PENDING_THRESHOLD}" ./check-nfs-xprt-health.sh
+
+    ${raw}=    RW.CLI.Run Cli
+    ...    cmd=cat nfs_xprt_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${raw.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for NFS xprt task.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=NFS transports to VAST VIPs should be healthy with pending requests below `${XPRT_PENDING_THRESHOLD}`
+            ...    actual=NFS xprt congestion or unhealthy VIP signals detected on context `${CONTEXT}`
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    NFS transport (xprt) analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Trace Kubernetes PVCs to VAST Views for Namespace `${NAMESPACE}`
+    [Documentation]    Maps PVC to PV to StorageClass parameters and produces a trace report linking workload storage to VAST view, tenant, and VIP identifiers.
+    [Tags]    Kubernetes    VAST    PVC    trace    access:read-only    data:config
+
+    # Issues are read from pvc_trace_issues.json, written by the script.
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=trace-pvc-to-vast.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" ./trace-pvc-to-vast.sh
+
+    ${raw}=    RW.CLI.Run Cli
+    ...    cmd=cat pvc_trace_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${raw.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for PVC trace task.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=VAST-backed PVCs in `${NAMESPACE}` should bind and expose traceable VAST identifiers
+            ...    actual=PVC trace analysis completed for namespace `${NAMESPACE}`
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    PVC to VAST trace report:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Check End-to-End Pod Mount Health for VAST Storage in Namespace `${NAMESPACE}`
+    [Documentation]    Identifies pods using VAST CSI volumes with mount failures, VolumeAttachment issues, or NodePublishVolume errors in events.
+    [Tags]    Kubernetes    VAST    pod    mount    access:read-only    data:logs-config
+
+    # Issues are read from pod_mount_issues.json, written by the script.
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-pod-mount-health.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" ./check-pod-mount-health.sh
+
+    ${raw}=    RW.CLI.Run Cli
+    ...    cmd=cat pod_mount_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${raw.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for pod mount health task.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Pods using VAST PVCs in `${NAMESPACE}` should mount successfully and reach Ready state
+            ...    actual=Mount or attachment issues detected for VAST storage workloads
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Pod mount health analysis:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Check VAST StorageClass Configuration for Cluster `${CONTEXT}`
+    [Documentation]    Validates StorageClass parameters such as endpoint, view policy, mount options, and QoS settings for misconfigurations that limit workloads.
+    [Tags]    Kubernetes    VAST    StorageClass    access:read-only    data:config
+
+    # Issues are read from storageclass_config_issues.json, written by the script.
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-vast-storageclass-config.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=CONTEXT="${CONTEXT}" ./check-vast-storageclass-config.sh
+
+    ${raw}=    RW.CLI.Run Cli
+    ...    cmd=cat storageclass_config_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${raw.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for StorageClass config task.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=VAST StorageClasses should define endpoint, view policy, and tenant parameters correctly
+            ...    actual=StorageClass configuration review found gaps on context `${CONTEXT}`
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    VAST StorageClass configuration review:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Correlate Kubernetes Storage Events with VAST Tenant Metrics for Namespace `${NAMESPACE}`
+    [Documentation]    When VAST_VMS_ENDPOINT is configured, cross-references failing PVCs with tenant capacity and QoS metrics from VMS to distinguish driver vs backend issues.
+    [Tags]    Kubernetes    VAST    VMS    correlation    access:read-only    data:metrics
+
+    # The only task that passes the optional VMS credentials secret to its script.
+    # ${VMS_CREDENTIALS} is ${EMPTY} when the secret could not be imported
+    # (see Suite Initialization).
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=correlate-k8s-vast-events.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    secret__vast_vms_credentials=${VMS_CREDENTIALS}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" VAST_VMS_ENDPOINT="${VAST_VMS_ENDPOINT}" ./correlate-k8s-vast-events.sh
+
+    ${raw}=    RW.CLI.Run Cli
+    ...    cmd=cat vast_correlation_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${raw.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for VMS correlation task.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Kubernetes storage symptoms should be explainable by VMS tenant health when backend correlation is enabled
+            ...    actual=VMS correlation analysis completed for namespace `${NAMESPACE}`
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Kubernetes/VMS correlation results:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+
+*** Keywords ***
+# Suite setup: imports secrets and user variables, publishes them as suite
+# variables, builds the ${env} JSON string handed to every script, and finally
+# verifies cluster connectivity.
+Suite Initialization
+    ${kubeconfig}=    RW.Core.Import Secret
+    ...    kubeconfig
+    ...    type=string
+    ...    description=Kubernetes kubeconfig for cluster access.
+    ...    pattern=\w*
+
+    # The VMS credentials secret is optional: if the import raises, the suite
+    # continues with an empty ${VMS_CREDENTIALS} and only logs a warning.
+    TRY
+        ${vms_credentials}=    RW.Core.Import Secret
+        ...    vast_vms_credentials
+        ...    type=string
+        ...    description=Optional VMS credentials JSON with USERNAME, PASSWORD or API_TOKEN.
+        ...    pattern=\w*
+        Set Suite Variable    ${VMS_CREDENTIALS}    ${vms_credentials}
+    EXCEPT
+        Log    vast_vms_credentials not found; VMS correlation will skip unless endpoint allows anonymous access.    WARN
+        Set Suite Variable    ${VMS_CREDENTIALS}    ${EMPTY}
+    END
+
+    ${CONTEXT}=    RW.Core.Import User Variable    CONTEXT
+    ...    type=string
+    ...    description=Kubernetes context name.
+    ...    pattern=\w*
+    ${NAMESPACE}=    RW.Core.Import User Variable    NAMESPACE
+    ...    type=string
+    ...    description=Kubernetes namespace for workload PVC tracing.
+    ...    pattern=\w*
+    ${CSI_NAMESPACE}=    RW.Core.Import User Variable    CSI_NAMESPACE
+    ...    type=string
+    ...    description=Namespace where the VAST CSI driver is installed.
+    ...    pattern=\w*
+    ...    default=vast-csi
+    ${KUBERNETES_DISTRIBUTION_BINARY}=    RW.Core.Import User Variable    KUBERNETES_DISTRIBUTION_BINARY
+    ...    type=string
+    ...    description=Kubernetes CLI binary (kubectl or oc).
+    ...    enum=[kubectl,oc]
+    ...    default=kubectl
+    ${VAST_VMS_ENDPOINT}=    RW.Core.Import User Variable    VAST_VMS_ENDPOINT
+    ...    type=string
+    ...    description=Optional VMS endpoint for backend correlation task.
+    ...    pattern=.*
+    ...    default=
+    ${VAST_CLUSTER_NAME}=    RW.Core.Import User Variable    VAST_CLUSTER_NAME
+    ...    type=string
+    ...    description=Optional VAST cluster name for correlation task titles.
+    ...    pattern=.*
+    ...    default=
+    ${XPRT_PENDING_THRESHOLD}=    RW.Core.Import User Variable    XPRT_PENDING_THRESHOLD
+    ...    type=string
+    ...    description=csi_node_nfs_xprt_pending_requests count that triggers an issue.
+    ...    pattern=^\d+$
+    ...    default=100
+    ${RPC_ERROR_RATE_THRESHOLD}=    RW.Core.Import User Variable    RPC_ERROR_RATE_THRESHOLD
+    ...    type=string
+    ...    description=CSI RPC error rate percent threshold.
+    ...    pattern=^\d+$
+    ...    default=5
+
+    Set Suite Variable    ${kubeconfig}    ${kubeconfig}
+    Set Suite Variable    ${CONTEXT}    ${CONTEXT}
+    Set Suite Variable    ${NAMESPACE}    ${NAMESPACE}
+    Set Suite Variable    ${CSI_NAMESPACE}    ${CSI_NAMESPACE}
+    Set Suite Variable    ${KUBERNETES_DISTRIBUTION_BINARY}    ${KUBERNETES_DISTRIBUTION_BINARY}
+    Set Suite Variable    ${VAST_VMS_ENDPOINT}    ${VAST_VMS_ENDPOINT}
+    Set Suite Variable    ${VAST_CLUSTER_NAME}    ${VAST_CLUSTER_NAME}
+    Set Suite Variable    ${XPRT_PENDING_THRESHOLD}    ${XPRT_PENDING_THRESHOLD}
+    Set Suite Variable    ${RPC_ERROR_RATE_THRESHOLD}    ${RPC_ERROR_RATE_THRESHOLD}
+
+    # ${env} is a JSON object literal built by string interpolation; KUBECONFIG
+    # is a path relative to the working directory, derived from the secret's key.
+    Set Suite Variable
+    ...    ${env}
+    ...    {"KUBECONFIG":"./${kubeconfig.key}","CONTEXT":"${CONTEXT}","NAMESPACE":"${NAMESPACE}","CSI_NAMESPACE":"${CSI_NAMESPACE}","KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}","VAST_VMS_ENDPOINT":"${VAST_VMS_ENDPOINT}","VAST_CLUSTER_NAME":"${VAST_CLUSTER_NAME}","XPRT_PENDING_THRESHOLD":"${XPRT_PENDING_THRESHOLD}","RPC_ERROR_RATE_THRESHOLD":"${RPC_ERROR_RATE_THRESHOLD}"}
+
+    RW.K8sHelper.Verify Cluster Connectivity
+    ...    binary=${KUBERNETES_DISTRIBUTION_BINARY}
+    ...    context=${CONTEXT}
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
diff --git a/codebundles/vast-k8s-csi-health/sli-vast-csi-health-score.sh b/codebundles/vast-k8s-csi-health/sli-vast-csi-health-score.sh
new file mode 100755
index 00000000..7091e1d5
--- /dev/null
+++ b/codebundles/vast-k8s-csi-health/sli-vast-csi-health-score.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Lightweight SLI dimensions for VAST CSI health (stdout JSON object).
: "${CONTEXT:?Must set CONTEXT}"
: "${NAMESPACE:?Must set NAMESPACE}"
: "${CSI_NAMESPACE:?Must set CSI_NAMESPACE}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=vast-csi-common.sh
source "${SCRIPT_DIR}/vast-csi-common.sh"

XPRT_PENDING_THRESHOLD="${XPRT_PENDING_THRESHOLD:-100}"
# Not consumed by any check below; kept so the environment contract matches the runbook.
RPC_ERROR_RATE_THRESHOLD="${RPC_ERROR_RATE_THRESHOLD:-5}"

# Every dimension is binary: 1 = healthy, 0 = failing. All start healthy and
# are only ever lowered by the checks below.
csi_pod_score=1
pvc_bound_score=1
mount_score=1
xprt_score=1

# --- CSI controller/node readiness -------------------------------------------
# A missing CSI namespace fails the dimension outright.
if k8s get ns "${CSI_NAMESPACE}" -o name &>/dev/null; then
  node_pods=$(find_csi_node_pods)
  controller_pods=$(find_csi_controller_pods)
  pods=$(jq -n --argjson n "$node_pods" --argjson c "$controller_pods" '{items: ($n.items + $c.items)}')
  # Pods whose Ready condition is absent or anything other than "True".
  not_ready=$(jq '[.items[] | select(
    ((.status.conditions // []) | map(select(.type=="Ready")) | .[0].status // "False") != "True"
  )] | length' <<<"$pods")
  # Pods with at least one container waiting in CrashLoopBackOff.
  crash=$(jq '[.items[] | select(
    ([.status.containerStatuses[]? | .state.waiting.reason? // empty] | index("CrashLoopBackOff"))
  )] | length' <<<"$pods")
  if [[ "${not_ready:-0}" -gt 0 || "${crash:-0}" -gt 0 ]]; then
    csi_pod_score=0
  fi
else
  csi_pod_score=0
fi

# --- VAST PVC binding in the workload namespace ------------------------------
# First try the cheap name/annotation match. If that finds nothing, fall back to
# resolving every PVC's StorageClass provisioner / PV driver. The fallback result
# is stored back into $pvcs so the mount check below sees the same PVC set
# (previously fallback-discovered PVCs were scored for binding but never for mounts).
pvcs=$(list_vast_pvcs_json "${NAMESPACE}")
if [[ $(jq '.items | length' <<<"$pvcs") -eq 0 ]]; then
  all=$(k8s get pvc -n "${NAMESPACE}" -o json 2>/dev/null || echo '{"items":[]}')
  matched='[]'
  while IFS= read -r line; do
    [[ -z "$line" ]] && continue
    is_vast_pvc_json "$line" || continue
    matched=$(jq -c --argjson item "$line" '. + [$item]' <<<"$matched")
  done < <(jq -c '.items[]?' <<<"$all")
  pvcs=$(jq -c '{items: .}' <<<"$matched")
fi

unbound=$(jq '[.items[] | select(.status.phase != "Bound")] | length' <<<"$pvcs")
if [[ "${unbound:-0}" -gt 0 ]]; then
  pvc_bound_score=0
fi

# --- Mount health: pods using VAST PVCs that are not Ready -------------------
# One pod listing is evaluated in jq instead of two `get pod` calls per pod.
# Pods in phase Succeeded are skipped: a completed pod is never Ready, which
# would otherwise pin this dimension at 0 until the pod is garbage-collected.
mount_problems=0
vast_pvc_names=$(jq -c '[.items[].metadata.name]' <<<"$pvcs")
if [[ "$vast_pvc_names" != "[]" ]]; then
  pods_json=$(k8s get pods -n "${NAMESPACE}" -o json 2>/dev/null || echo '{"items":[]}')
  mount_problems=$(jq --argjson names "$vast_pvc_names" '
    [ .items[]?
      | select(.status.phase != "Succeeded")
      | select(any(.spec.volumes[]?
                   | .persistentVolumeClaim.claimName
                   | select(. != null);
                   . as $claim | $names | index($claim) != null))
      | select(
          (((.status.conditions // []) | map(select(.type == "Ready")) | .[0].status) // "False") != "True"
          or .status.phase == "Pending"
        )
    ] | length' <<<"$pods_json")
fi

if [[ "${mount_problems:-0}" -gt 0 ]]; then
  mount_score=0
fi

# --- NFS xprt congestion (best effort) ---------------------------------------
# Only the first discovered node pod is probed. An empty metrics body simply
# leaves the score at 1.
# Here-strings are used instead of `echo | grep -q`: under `pipefail` an early
# reader exit can surface as SIGPIPE (141) and silently skip the check.
node_pod=$(find_csi_node_pods | jq -r '.items[0].metadata.name // empty')
if [[ -n "$node_pod" ]]; then
  body=$(curl_pod_metrics "$node_pod" "${CSI_NAMESPACE}" "${NODE_METRICS_PORT:-9090}")
  if awk '/^csi_node_nfs_xprt_congested_state\{/{if ($NF >= 1) found=1} END{exit !found}' <<<"$body"; then
    xprt_score=0
  fi
  # `-v` requires the var=value form; the previous `-v th "$X"` made awk abort,
  # so this threshold could never trigger.
  if awk -v th="$XPRT_PENDING_THRESHOLD" '/^csi_node_nfs_xprt_pending_requests\{/{if ($NF + 0 > th + 0) found=1} END{exit !found}' <<<"$body"; then
    xprt_score=0
  fi
  if awk '/^csi_node_nfs_xprt_unhealthy\{/{if ($NF >= 1) found=1} END{exit !found}' <<<"$body"; then
    xprt_score=0
  fi
fi

# Emit the four dimensions as a single JSON object on stdout.
jq -n \
  --argjson c "$csi_pod_score" \
  --argjson p "$pvc_bound_score" \
  --argjson m "$mount_score" \
  --argjson x "$xprt_score" \
  '{csi_pods: $c, pvc_bound: $p, mounts: $m, nfs_xprt: $x}'
diff --git a/codebundles/vast-k8s-csi-health/sli.robot b/codebundles/vast-k8s-csi-health/sli.robot new file mode 100644 index 00000000..e75c10b2 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/sli.robot @@ -0,0 +1,103 @@ +*** Settings *** +Documentation Measures VAST CSI health by scoring CSI pod readiness, PVC
binding, workload mounts, and NFS transport metrics. Produces a value between 0 (failing) and 1 (healthy). +Metadata Author rw-codebundle-agent +Metadata Display Name VAST Data Kubernetes CSI Health +Metadata Supports Kubernetes VAST CSI NFS storage + +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform +Library Collections + +Suite Setup Suite Initialization + + +*** Keywords *** +Suite Initialization + ${kubeconfig}= RW.Core.Import Secret + ... kubeconfig + ... type=string + ... description=Kubernetes kubeconfig for cluster access. + ... pattern=\w* + ${CONTEXT}= RW.Core.Import User Variable CONTEXT + ... type=string + ... description=Kubernetes context name. + ... pattern=\w* + ${NAMESPACE}= RW.Core.Import User Variable NAMESPACE + ... type=string + ... description=Kubernetes namespace for workload PVC tracing. + ... pattern=\w* + ${CSI_NAMESPACE}= RW.Core.Import User Variable CSI_NAMESPACE + ... type=string + ... description=Namespace where the VAST CSI driver is installed. + ... pattern=\w* + ... default=vast-csi + ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY + ... type=string + ... description=Kubernetes CLI binary (kubectl or oc). + ... enum=[kubectl,oc] + ... default=kubectl + ${XPRT_PENDING_THRESHOLD}= RW.Core.Import User Variable XPRT_PENDING_THRESHOLD + ... type=string + ... description=csi_node_nfs_xprt_pending_requests count that triggers a failing NFS score. + ... pattern=^\d+$ + ... default=100 + ${RPC_ERROR_RATE_THRESHOLD}= RW.Core.Import User Variable RPC_ERROR_RATE_THRESHOLD + ... type=string + ... description=CSI RPC error rate percent threshold (reserved for future SLI expansion). + ... pattern=^\d+$ + ... 
default=5 + + Set Suite Variable ${kubeconfig} ${kubeconfig} + Set Suite Variable ${CONTEXT} ${CONTEXT} + Set Suite Variable ${NAMESPACE} ${NAMESPACE} + Set Suite Variable ${CSI_NAMESPACE} ${CSI_NAMESPACE} + Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} + Set Suite Variable ${XPRT_PENDING_THRESHOLD} ${XPRT_PENDING_THRESHOLD} + Set Suite Variable ${RPC_ERROR_RATE_THRESHOLD} ${RPC_ERROR_RATE_THRESHOLD} + Set Suite Variable + ... ${env} + ... {"KUBECONFIG":"./${kubeconfig.key}","CONTEXT":"${CONTEXT}","NAMESPACE":"${NAMESPACE}","CSI_NAMESPACE":"${CSI_NAMESPACE}","KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}","XPRT_PENDING_THRESHOLD":"${XPRT_PENDING_THRESHOLD}","RPC_ERROR_RATE_THRESHOLD":"${RPC_ERROR_RATE_THRESHOLD}"} + + +*** Tasks *** +Score VAST CSI Health Dimensions for Namespace `${NAMESPACE}` + [Documentation] Runs a compact probe returning binary scores for CSI pods, PVC binding, mounts, and NFS xprt health. + [Tags] access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=sli-vast-csi-health-score.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=120 + ... include_in_history=false + ... cmd_override=./sli-vast-csi-health-score.sh + + TRY + ${dims}= Evaluate json.loads(r'''${result.stdout}''') json + ${csi}= Get From Dictionary ${dims} csi_pods + ${pvc}= Get From Dictionary ${dims} pvc_bound + ${mount}= Get From Dictionary ${dims} mounts + ${xprt}= Get From Dictionary ${dims} nfs_xprt + ${csi}= Convert To Integer ${csi} + ${pvc}= Convert To Integer ${pvc} + ${mount}= Convert To Integer ${mount} + ${xprt}= Convert To Integer ${xprt} + EXCEPT + Log SLI JSON parse failed; reporting zero health. 
WARN + ${csi}= Convert To Integer 0 + ${pvc}= Convert To Integer 0 + ${mount}= Convert To Integer 0 + ${xprt}= Convert To Integer 0 + END + + RW.Core.Push Metric ${csi} sub_name=csi_pods + RW.Core.Push Metric ${pvc} sub_name=pvc_bound + RW.Core.Push Metric ${mount} sub_name=mounts + RW.Core.Push Metric ${xprt} sub_name=nfs_xprt + + ${health_score}= Evaluate (${csi} + ${pvc} + ${mount} + ${xprt}) / 4.0 + ${health_score}= Convert to Number ${health_score} 2 + RW.Core.Add to Report Health Score: ${health_score} + RW.Core.Push Metric ${health_score} diff --git a/codebundles/vast-k8s-csi-health/trace-pvc-to-vast.sh b/codebundles/vast-k8s-csi-health/trace-pvc-to-vast.sh new file mode 100755 index 00000000..750c24e7 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/trace-pvc-to-vast.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# REQUIRED ENV VARS: CONTEXT, NAMESPACE +# Maps PVC -> PV -> StorageClass -> VAST identifiers. Informational (severity 4). 
# Writes JSON array to pvc_trace_issues.json
# -----------------------------------------------------------------------------
: "${CONTEXT:?Must set CONTEXT}"
: "${NAMESPACE:?Must set NAMESPACE}"

OUTPUT_FILE="pvc_trace_issues.json"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=vast-csi-common.sh
source "${SCRIPT_DIR}/vast-csi-common.sh"

issues_json='[]'
trace_report=""

# EXIT handler: turns command tracing off and prints the accumulated
# human-readable trace (or a placeholder when nothing was traced).
print_report() {
  { set +x; } 2>/dev/null || true
  echo
  echo "=== VAST PVC trace for namespace '${NAMESPACE}' (context '${CONTEXT}') ==="
  echo "${trace_report:- No VAST-backed PVCs found.}"
}
trap print_report EXIT

# field JSON FILTER -> raw string result of applying the jq FILTER to JSON.
field() {
  jq -r "$2" <<<"$1"
}

# Discovery pass 1: name/annotation based match.
pvcs_json=$(list_vast_pvcs_json "${NAMESPACE}")
pvc_count=$(jq '.items | length' <<<"$pvcs_json")

# Discovery pass 2 (only when pass 1 is empty): inspect every PVC and keep the
# ones whose StorageClass provisioner or bound PV driver identifies VAST.
if [[ "$pvc_count" -eq 0 ]]; then
  all_pvcs=$(k8s get pvc -n "${NAMESPACE}" -o json 2>/dev/null || echo '{"items":[]}')
  pvcs_json=$(jq -c '{items: []}' <<<"$all_pvcs")
  while IFS= read -r candidate; do
    [[ -n "$candidate" ]] || continue
    if is_vast_pvc_json "$candidate"; then
      pvcs_json=$(jq -c --argjson item "$candidate" '.items += [$item]' <<<"$pvcs_json")
    fi
  done < <(jq -c '.items[]?' <<<"$all_pvcs")
  pvc_count=$(jq '.items | length' <<<"$pvcs_json")
fi

# Nothing to trace: record a single informational issue and stop.
if [[ "$pvc_count" -eq 0 ]]; then
  issues_json=$(append_issue "$issues_json" \
    "No VAST CSI-backed PVCs found in namespace \`${NAMESPACE}\`" \
    "No PersistentVolumeClaims using csi.vastdata.com (or VAST-named StorageClasses) were discovered." \
    4 \
    "Confirm workloads in this namespace use a VAST StorageClass. Adjust generation rules if this namespace should not be monitored.")
  write_issues "$OUTPUT_FILE" "$issues_json"
  exit 0
fi

while IFS= read -r claim; do
  [[ -n "$claim" ]] || continue

  # PVC-level fields.
  pvc_name=$(field "$claim" '.metadata.name')
  sc_name=$(field "$claim" '.spec.storageClassName // "default"')
  pv_name=$(field "$claim" '.spec.volumeName // empty')
  phase=$(field "$claim" '.status.phase // "Unknown"')

  # StorageClass lookup; an unreadable class degrades to an empty object.
  sc_json=$(k8s get storageclass "$sc_name" -o json 2>/dev/null || echo '{}')
  sc_params=$(jq -c '.parameters // {}' <<<"$sc_json")
  provisioner=$(field "$sc_json" '.provisioner // "unknown"')

  # PV-level identifiers are only available once a volume is assigned.
  pv_json='{}'
  volume_handle=""
  driver=""
  view_path=""
  tenant=""
  vip=""
  if [[ -n "$pv_name" ]]; then
    pv_json=$(k8s get pv "$pv_name" -o json 2>/dev/null || echo '{}')
    volume_handle=$(field "$pv_json" '.spec.csi.volumeHandle // empty')
    driver=$(field "$pv_json" '.spec.csi.driver // empty')
    view_path=$(field "$pv_json" '.spec.csi.volumeAttributes.view_path // .spec.csi.volumeAttributes.root_export // empty')
    tenant=$(field "$pv_json" '.spec.csi.volumeAttributes.tenant // .spec.csi.volumeAttributes.tenant_name // empty')
    vip=$(field "$pv_json" '.spec.csi.volumeAttributes.vip // .spec.csi.volumeAttributes.endpoint // empty')
  fi

  # Anything the PV did not provide is filled in from StorageClass parameters.
  [[ -n "$view_path" ]] || view_path=$(field "$sc_params" '.view_policy // .root_export // .view // empty')
  [[ -n "$tenant" ]] || tenant=$(field "$sc_params" '.tenant // .tenant_name // empty')
  [[ -n "$vip" ]] || vip=$(field "$sc_params" '.endpoint // .vip_pool // .vip // empty')

  # Human-readable section for this PVC (printed by the EXIT handler).
  trace_report+=$'\n'"--- PVC: ${pvc_name} (phase=${phase})"
  trace_report+=$'\n'" StorageClass: ${sc_name} (provisioner=${provisioner})"
  trace_report+=$'\n'" PV: ${pv_name:-unbound} (driver=${driver:-n/a})"
  trace_report+=$'\n'" volumeHandle: ${volume_handle:-n/a}"
  trace_report+=$'\n'" VAST view/path: ${view_path:-unknown}"
  trace_report+=$'\n'" tenant: ${tenant:-unknown}"
  trace_report+=$'\n'" VIP/endpoint: ${vip:-unknown}"

  # Exactly one issue per PVC: unbound (sev 3), missing CSI metadata (sev 4),
  # or the plain informational mapping (sev 4).
  if [[ "$phase" != "Bound" ]]; then
    issue_title="VAST PVC \`${pvc_name}\` is not Bound in namespace \`${NAMESPACE}\`"
    issue_details="PVC phase=${phase}. StorageClass=${sc_name}, PV=${pv_name:-none}. Trace: view=${view_path:-?}, tenant=${tenant:-?}, vip=${vip:-?}."
    issue_severity=3
    issue_next="Inspect PVC events and controller logs in ${CSI_NAMESPACE:-vast-csi}. Verify VMS view policy and quota for tenant ${tenant:-unknown}."
  elif [[ -z "$volume_handle" && "$driver" != "csi.vastdata.com" ]]; then
    issue_title="VAST PVC \`${pvc_name}\` missing CSI volumeHandle metadata"
    issue_details="Bound PVC ${pvc_name} lacks parseable VAST identifiers in PV ${pv_name}."
    issue_severity=4
    issue_next="Describe PV ${pv_name} and confirm the VAST CSI driver populated volumeHandle and volumeAttributes."
  else
    issue_title="VAST storage trace for PVC \`${pvc_name}\` in namespace \`${NAMESPACE}\`"
    issue_details="PVC ${pvc_name} -> PV ${pv_name} -> SC ${sc_name}. view=${view_path:-unknown}, tenant=${tenant:-unknown}, vip=${vip:-unknown}, volumeHandle=${volume_handle:-n/a}."
    issue_severity=4
    issue_next="Use this mapping when correlating workload symptoms with VMS tenant/view metrics."
  fi
  issues_json=$(append_issue "$issues_json" "$issue_title" "$issue_details" "$issue_severity" "$issue_next")
done < <(jq -c '.items[]' <<<"$pvcs_json")

write_issues "$OUTPUT_FILE" "$issues_json"
diff --git a/codebundles/vast-k8s-csi-health/vast-csi-common.sh b/codebundles/vast-k8s-csi-health/vast-csi-common.sh new file mode 100755 index 00000000..1c823f90 --- /dev/null +++ b/codebundles/vast-k8s-csi-health/vast-csi-common.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +# Shared helpers for VAST CSI health scripts.
set -euo pipefail

# CLI binary and provisioner names; each can be overridden from the environment.
KUBECTL="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}"
VAST_CSI_PROVISIONER="${VAST_CSI_PROVISIONER:-csi.vastdata.com}"
VAST_CSI_PROVISIONER_LEGACY="${VAST_CSI_PROVISIONER_LEGACY:-kubernetes.io/csi/csi.vastdata.com}"

# k8s ARGS...
# Runs the configured Kubernetes CLI against ${CONTEXT}.
# --context is passed BEFORE the caller's arguments. Appending it last put it
# after the `--` separator in `exec ... -- cmd` / `run ... --command -- cmd`,
# where it was handed to the in-pod command instead of the CLI (so the CLI
# silently used the current context, and e.g. curl received an unknown option).
k8s() {
  "${KUBECTL}" --context "${CONTEXT}" "$@"
}

# is_vast_storage_class NAME
# Succeeds when the StorageClass provisioner equals the VAST provisioner
# (current or legacy value), or when the class name itself contains "vast"
# in any letter case. Empty or "null" names fail.
is_vast_storage_class() {
  local sc="$1"
  [[ -z "$sc" || "$sc" == "null" ]] && return 1
  local prov
  # Lookup errors are tolerated; an empty provisioner falls through to the name match.
  prov=$(k8s get storageclass "$sc" -o jsonpath='{.provisioner}' 2>/dev/null || true)
  [[ "$prov" == "$VAST_CSI_PROVISIONER" || "$prov" == "$VAST_CSI_PROVISIONER_LEGACY" ]] && return 0
  [[ "$sc" =~ [Vv][Aa][Ss][Tt] ]] && return 0
  return 1
}

# is_vast_pv NAME
# Succeeds when the PersistentVolume's .spec.csi.driver equals the VAST provisioner.
is_vast_pv() {
  local pv="$1"
  [[ -z "$pv" || "$pv" == "null" ]] && return 1
  local driver
  driver=$(k8s get pv "$pv" -o jsonpath='{.spec.csi.driver}' 2>/dev/null || true)
  [[ "$driver" == "$VAST_CSI_PROVISIONER" ]] && return 0
  return 1
}

# is_vast_pvc_json PVC_JSON
# Succeeds when the PVC object's StorageClass or bound PV is VAST-backed.
is_vast_pvc_json() {
  local pvc_json="$1"
  local sc pv
  sc=$(echo "$pvc_json" | jq -r '.spec.storageClassName // empty')
  pv=$(echo "$pvc_json" | jq -r '.spec.volumeName // empty')
  if is_vast_storage_class "$sc"; then
    return 0
  fi
  if is_vast_pv "$pv"; then
    return 0
  fi
  return 1
}

# list_vast_pvcs_json NAMESPACE
# Prints {"items":[...]} holding the PVCs whose storageClassName or
# storage-provisioner annotation contains "vast" (case-insensitive).
# Prints an empty list when the pipeline fails.
list_vast_pvcs_json() {
  local ns="${1:?namespace required}"
  k8s get pvc -n "$ns" -o json 2>/dev/null | jq -c '
    .items // [] | map(select(
      (.spec.storageClassName // "" | test("vast"; "i")) or
      (.metadata.annotations["volume.beta.kubernetes.io/storage-provisioner"]? // "" | test("vast"; "i"))
    )) | {items: .}
  ' || echo '{"items":[]}'
}

# find_csi_node_pods
# Prints {"items":[...]} with pods in ${CSI_NAMESPACE} whose component label,
# app label or name contains "node" (case-insensitive).
find_csi_node_pods() {
  local ns="${CSI_NAMESPACE:?Must set CSI_NAMESPACE}"
  k8s get pods -n "$ns" -o json 2>/dev/null | jq -c '
    {items: [.items[] | select(
      (.metadata.labels["app.kubernetes.io/component"]? // "" | test("node"; "i")) or
      (.metadata.labels["app"]? // "" | test("vast.*node|node"; "i")) or
      (.metadata.name | test("vast.*node|node"; "i"))
    )]}
  ' || echo '{"items":[]}'
}

# find_csi_controller_pods
# Same as find_csi_node_pods, matching "controller" instead of "node".
find_csi_controller_pods() {
  local ns="${CSI_NAMESPACE:?Must set CSI_NAMESPACE}"
  k8s get pods -n "$ns" -o json 2>/dev/null | jq -c '
    {items: [.items[] | select(
      (.metadata.labels["app.kubernetes.io/component"]? // "" | test("controller"; "i")) or
      (.metadata.labels["app"]? // "" | test("vast.*controller|controller"; "i")) or
      (.metadata.name | test("vast.*controller|controller"; "i"))
    )]}
  ' || echo '{"items":[]}'
}

# curl_pod_metrics POD NAMESPACE PORT
# Prints the /metrics body fetched from inside POD on 127.0.0.1:PORT, trying
# wget first and curl second. Best effort: prints nothing and still succeeds
# when the exec or both fetchers fail.
curl_pod_metrics() {
  local pod="$1"
  local ns="$2"
  local port="${3:?port required}"
  k8s exec -n "$ns" "$pod" -- sh -c "wget -qO- http://127.0.0.1:${port}/metrics 2>/dev/null || curl -sf http://127.0.0.1:${port}/metrics 2>/dev/null" 2>/dev/null || true
}

# curl_service_metrics SERVICE NAMESPACE PORT
# Fetches /metrics from a Service through a throwaway curl pod in NAMESPACE
# (pod name is suffixed with this shell's PID). Best effort, never fails.
curl_service_metrics() {
  local svc="$1"
  local ns="$2"
  local port="$3"
  k8s run "vast-metrics-probe-$$" -n "$ns" --rm -i --restart=Never \
    --image=curlimages/curl:8.5.0 --command -- \
    curl -sf --max-time 15 "http://${svc}.${ns}.svc.cluster.local:${port}/metrics" 2>/dev/null || true
}

# find_metrics_services
# Prints a JSON array of {name, ports:[{name, port}]} for Services in
# ${CSI_NAMESPACE} whose name contains "metrics" or "vast" (case-insensitive).
find_metrics_services() {
  local ns="${CSI_NAMESPACE:?Must set CSI_NAMESPACE}"
  k8s get svc -n "$ns" -o json 2>/dev/null | jq -c '
    [.items[] | select(.metadata.name | test("metrics|vast"; "i")) | {
      name: .metadata.name,
      ports: [.spec.ports[]? | {name: (.name // ""), port: .port}]
    }]
  ' || echo '[]'
}

# append_issue ISSUES_JSON TITLE DETAILS SEVERITY NEXT_STEPS
# Prints ISSUES_JSON (a JSON array) with one more issue object appended.
# SEVERITY is parsed as JSON (--argjson), so it must be a bare number.
append_issue() {
  local issues_json="$1"
  local title="$2"
  local details="$3"
  local severity="$4"
  local next_steps="$5"
  echo "$issues_json" | jq \
    --arg title "$title" \
    --arg details "$details" \
    --argjson severity "$severity" \
    --arg next_steps "$next_steps" \
    '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]'
}

# write_issues FILE ISSUES_JSON
# Overwrites FILE with ISSUES_JSON.
write_issues() {
  local file="$1"
  local issues_json="$2"
  echo "$issues_json" >"$file"
}