From adfdf628623316d0cc90873356e53aca1bd4b02a Mon Sep 17 00:00:00 2001 From: "rw-codebundle-agent[bot]" Date: Thu, 25 Jun 2026 15:40:38 +0000 Subject: [PATCH] Add vast-cluster-health CodeBundle for VAST cluster monitoring. Implements VMS REST and Prometheus-based health checks with SLI scoring, generation rules, and mock fixture tests for issue #129. Co-authored-by: Cursor --- .../generation-rules/vast-cluster-health.yaml | 22 ++ .../templates/vast-cluster-health-sli.yaml | 48 +++ .../templates/vast-cluster-health-slx.yaml | 32 ++ .../vast-cluster-health-taskset.yaml | 41 +++ .../vast-cluster-health/.test/README.md | 16 + .../vast-cluster-health/.test/Taskfile.yaml | 23 ++ .../capacity_pressure/api_api_clusters_ | 15 + .../capacity_pressure/api_api_cnodes_ | 3 + .../capacity_pressure/api_api_dnodes_ | 3 + .../capacity_pressure/prometheus_alarms | 2 + .../capacity_pressure/prometheus_vms_state | 2 + .../.test/fixtures/degraded/api_api_clusters_ | 13 + .../.test/fixtures/degraded/api_api_cnodes_ | 3 + .../.test/fixtures/degraded/api_api_dnodes_ | 4 + .../.test/fixtures/degraded/prometheus_alarms | 2 + .../fixtures/degraded/prometheus_vms_state | 2 + .../.test/fixtures/healthy/api_api_clusters_ | 15 + .../.test/fixtures/healthy/api_api_cnodes_ | 4 + .../.test/fixtures/healthy/api_api_dnodes_ | 4 + .../.test/fixtures/healthy/api_health_ | 1 + .../.test/fixtures/healthy/prometheus_alarms | 2 + .../fixtures/healthy/prometheus_vms_state | 2 + .../.test/run-mock-scenarios.sh | 49 +++ .../.test/validate-vast-bundle-structure.sh | 32 ++ codebundles/vast-cluster-health/README.md | 83 +++++ .../analyze-cluster-performance.sh | 126 ++++++++ .../check-cluster-capacity.sh | 102 ++++++ .../check-degraded-components.sh | 102 ++++++ .../check-node-hardware-health.sh | 81 +++++ .../check-replication-status.sh | 108 +++++++ .../check-vms-cluster-health.sh | 102 ++++++ codebundles/vast-cluster-health/runbook.robot | 295 ++++++++++++++++++ .../sli-vast-cluster-health-score.sh | 90 
++++++ codebundles/vast-cluster-health/sli.robot | 128 ++++++++ .../vast-cluster-health/vast-vms-common.sh | 149 +++++++++ 35 files changed, 1706 insertions(+) create mode 100644 codebundles/vast-cluster-health/.runwhen/generation-rules/vast-cluster-health.yaml create mode 100644 codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-sli.yaml create mode 100644 codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-slx.yaml create mode 100644 codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-taskset.yaml create mode 100644 codebundles/vast-cluster-health/.test/README.md create mode 100644 codebundles/vast-cluster-health/.test/Taskfile.yaml create mode 100644 codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_clusters_ create mode 100644 codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_cnodes_ create mode 100644 codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_dnodes_ create mode 100644 codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_alarms create mode 100644 codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_vms_state create mode 100644 codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_clusters_ create mode 100644 codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_cnodes_ create mode 100644 codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_dnodes_ create mode 100644 codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_alarms create mode 100644 codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_vms_state create mode 100644 codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_clusters_ create mode 100644 codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_cnodes_ create mode 100644 codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_dnodes_ create mode 100644 
codebundles/vast-cluster-health/.test/fixtures/healthy/api_health_ create mode 100644 codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_alarms create mode 100644 codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_vms_state create mode 100755 codebundles/vast-cluster-health/.test/run-mock-scenarios.sh create mode 100755 codebundles/vast-cluster-health/.test/validate-vast-bundle-structure.sh create mode 100644 codebundles/vast-cluster-health/README.md create mode 100755 codebundles/vast-cluster-health/analyze-cluster-performance.sh create mode 100755 codebundles/vast-cluster-health/check-cluster-capacity.sh create mode 100755 codebundles/vast-cluster-health/check-degraded-components.sh create mode 100755 codebundles/vast-cluster-health/check-node-hardware-health.sh create mode 100755 codebundles/vast-cluster-health/check-replication-status.sh create mode 100755 codebundles/vast-cluster-health/check-vms-cluster-health.sh create mode 100644 codebundles/vast-cluster-health/runbook.robot create mode 100755 codebundles/vast-cluster-health/sli-vast-cluster-health-score.sh create mode 100644 codebundles/vast-cluster-health/sli.robot create mode 100755 codebundles/vast-cluster-health/vast-vms-common.sh diff --git a/codebundles/vast-cluster-health/.runwhen/generation-rules/vast-cluster-health.yaml b/codebundles/vast-cluster-health/.runwhen/generation-rules/vast-cluster-health.yaml new file mode 100644 index 00000000..d4d34202 --- /dev/null +++ b/codebundles/vast-cluster-health/.runwhen/generation-rules/vast-cluster-health.yaml @@ -0,0 +1,22 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + platform: vast_data + generationRules: + - resourceTypes: + - vast_data_cluster + matchRules: + - type: pattern + pattern: ".+" + properties: ["name"] + mode: substring + slxs: + - baseName: vast-cluster-health + qualifiers: ["vast_cluster_name", "vast_vms_endpoint"] + baseTemplateName: vast-cluster-health + levelOfDetail: basic + outputItems: + - 
type: slx + - type: sli + - type: runbook + templateName: vast-cluster-health-taskset.yaml diff --git a/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-sli.yaml b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-sli.yaml new file mode 100644 index 00000000..3113faa6 --- /dev/null +++ b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-sli.yaml @@ -0,0 +1,48 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{ slx_name }} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + displayUnitsLong: OK + displayUnitsShort: ok + locations: + - {{ default_location }} + description: Lightweight VAST cluster health score for {{ match_resource.name }} from VMS state, capacity, nodes, alarms, and replication. + codeBundle: + {% if repo_url %} + repoUrl: {{ repo_url }} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ ref }} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/vast-cluster-health/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 300 + configProvided: + - name: VAST_VMS_ENDPOINT + value: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint) }}" + - name: VAST_CLUSTER_NAME + value: "{{ match_resource.name }}" + - name: CAPACITY_THRESHOLD + value: "{{ custom.capacity_threshold | default('85') }}" + - name: CRITICAL_CAPACITY_THRESHOLD + value: "{{ custom.critical_capacity_threshold | default('95') }}" + secretsProvided: + {% if wb_version %} + {% include "vast_data-auth.yaml" ignore missing %} + {% else %} + - name: vast_vms_credentials + workspaceKey: AUTH DETAILS NOT FOUND + {% endif %} + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m diff --git a/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-slx.yaml 
b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-slx.yaml new file mode 100644 index 00000000..c2845132 --- /dev/null +++ b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-slx.yaml @@ -0,0 +1,32 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{ slx_name }} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/storage/storage.svg + alias: {{ match_resource.name }} VAST Cluster Health + asMeasuredBy: Composite 0-1 score from VMS state, capacity headroom, node health, alarms, and replication. + configProvided: + - name: SLX_PLACEHOLDER + value: SLX_PLACEHOLDER + owners: + - {{ workspace.owner_email }} + statement: VAST cluster {{ match_resource.name }} should remain CLUSTERED with healthy nodes, capacity headroom, and no active alarms. + additionalContext: + {% include "vast_data-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name }}" + vast_vms_endpoint: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint | default('')) }}" + tags: + {% include "vast_data-tags.yaml" ignore missing %} + - name: cloud + value: on-prem + - name: service + value: vast_data + - name: scope + value: cluster + - name: access + value: read-only diff --git a/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-taskset.yaml b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-taskset.yaml new file mode 100644 index 00000000..510a48d4 --- /dev/null +++ b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-taskset.yaml @@ -0,0 +1,41 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{ slx_name }} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{ default_location }} + description: Monitor VAST Data 
cluster-wide health via VMS REST and Prometheus metrics for {{ match_resource.name }}. + codeBundle: + {% if repo_url %} + repoUrl: {{ repo_url }} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ ref }} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/vast-cluster-health/runbook.robot + configProvided: + - name: VAST_VMS_ENDPOINT + value: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint) }}" + - name: VAST_CLUSTER_NAME + value: "{{ match_resource.name }}" + - name: RESOURCES + value: "{{ custom.resources | default('All') }}" + - name: CAPACITY_THRESHOLD + value: "{{ custom.capacity_threshold | default('85') }}" + - name: CRITICAL_CAPACITY_THRESHOLD + value: "{{ custom.critical_capacity_threshold | default('95') }}" + secretsProvided: + {% if wb_version %} + {% include "vast_data-auth.yaml" ignore missing %} + {% else %} + - name: vast_vms_credentials + workspaceKey: AUTH DETAILS NOT FOUND + {% endif %} diff --git a/codebundles/vast-cluster-health/.test/README.md b/codebundles/vast-cluster-health/.test/README.md new file mode 100644 index 00000000..5d0a2556 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/README.md @@ -0,0 +1,16 @@ +# Mock scenario fixtures for vast-cluster-health. + +Static JSON/Prometheus fixtures used when `VAST_MOCK_FIXTURE_DIR` is set (see `run-mock-scenarios.sh`). 
+ +| Scenario | Expected issues | Description | +|----------|-----------------|-------------| +| `healthy` | 0 | CLUSTERED state, capacity below threshold, all nodes healthy | +| `degraded` | 2+ | DEGRADED vms_state with offline DNode and active alarm | +| `capacity_pressure` | 1+ | Logical capacity above CAPACITY_THRESHOLD with no hardware faults | + +Run: + +```bash +cd .test +task +``` diff --git a/codebundles/vast-cluster-health/.test/Taskfile.yaml b/codebundles/vast-cluster-health/.test/Taskfile.yaml new file mode 100644 index 00000000..bec8e895 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/Taskfile.yaml @@ -0,0 +1,23 @@ +version: "3" + +tasks: + default: + desc: "Validate structure and run mock scenario tests" + cmds: + - task: validate-structure + - task: test-mock-scenarios + + validate-structure: + desc: "Run static checks for required files" + cmds: + - ./validate-vast-bundle-structure.sh + + test-mock-scenarios: + desc: "Run task scripts against fixture-backed mock VMS responses" + cmds: + - ./run-mock-scenarios.sh + + clean: + desc: "Remove local test outputs" + cmds: + - rm -f ../*_output.json ../*_report.txt perf_analysis.json diff --git a/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_clusters_ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_clusters_ new file mode 100644 index 00000000..bebed842 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_clusters_ @@ -0,0 +1,15 @@ +[ + { + "id": 1, + "name": "vast-lab-cluster", + "title": "vast-lab-cluster", + "state": "ONLINE", + "enabled": true, + "physical_space_in_use_percent": 88.5, + "logical_space_in_use_percent": 91.2, + "physical_space_in_use_tb": 250.0, + "logical_space_in_use_tb": 230.0, + "auxiliary_space_in_use_percent": 45.0, + "replication_enabled": true + } +] diff --git a/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_cnodes_ 
b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_cnodes_ new file mode 100644 index 00000000..8fcdaae3 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_cnodes_ @@ -0,0 +1,3 @@ +[ + {"id": 1, "name": "cnode-1", "state": "ACTIVE"} +] diff --git a/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_dnodes_ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_dnodes_ new file mode 100644 index 00000000..3f1c63f8 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_dnodes_ @@ -0,0 +1,3 @@ +[ + {"id": 1, "name": "dnode-1", "state": "ACTIVE"} +] diff --git a/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_alarms b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_alarms new file mode 100644 index 00000000..76838b5f --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_alarms @@ -0,0 +1,2 @@ +# TYPE vast_alarm_active gauge +vast_alarm_active 0 diff --git a/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_vms_state b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_vms_state new file mode 100644 index 00000000..289cb5a5 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_vms_state @@ -0,0 +1,2 @@ +# TYPE vms_state gauge +vms_state 1 diff --git a/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_clusters_ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_clusters_ new file mode 100644 index 00000000..70f49ca9 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_clusters_ @@ -0,0 +1,13 @@ +[ + { + "id": 1, + "name": "vast-lab-cluster", + "title": "vast-lab-cluster", + "state": "DEGRADED", + "enabled": true, + "physical_space_in_use_percent": 55.0, + 
"logical_space_in_use_percent": 52.0, + "auxiliary_space_in_use_percent": 20.0, + "replication_enabled": true + } +] diff --git a/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_cnodes_ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_cnodes_ new file mode 100644 index 00000000..8fcdaae3 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_cnodes_ @@ -0,0 +1,3 @@ +[ + {"id": 1, "name": "cnode-1", "state": "ACTIVE"} +] diff --git a/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_dnodes_ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_dnodes_ new file mode 100644 index 00000000..d7df40f5 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_dnodes_ @@ -0,0 +1,4 @@ +[ + {"id": 1, "name": "dnode-1", "state": "OFFLINE"}, + {"id": 2, "name": "dnode-2", "state": "ACTIVE"} +] diff --git a/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_alarms b/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_alarms new file mode 100644 index 00000000..901bd19f --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_alarms @@ -0,0 +1,2 @@ +# TYPE vast_alarm_active gauge +vast_alarm_active 1 diff --git a/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_vms_state b/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_vms_state new file mode 100644 index 00000000..11bac434 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_vms_state @@ -0,0 +1,2 @@ +# TYPE vms_state gauge +vms_state 0 diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_clusters_ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_clusters_ new file mode 100644 index 00000000..e9575a99 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_clusters_ @@ -0,0 +1,15 @@ +[ + { + "id": 1, + "name": 
"vast-lab-cluster", + "title": "vast-lab-cluster", + "state": "ONLINE", + "enabled": true, + "physical_space_in_use_percent": 42.5, + "logical_space_in_use_percent": 38.0, + "physical_space_in_use_tb": 120.5, + "logical_space_in_use_tb": 95.2, + "auxiliary_space_in_use_percent": 12.0, + "replication_enabled": true + } +] diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_cnodes_ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_cnodes_ new file mode 100644 index 00000000..bc3a8cb4 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_cnodes_ @@ -0,0 +1,4 @@ +[ + {"id": 1, "name": "cnode-1", "state": "ACTIVE"}, + {"id": 2, "name": "cnode-2", "state": "ACTIVE"} +] diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_dnodes_ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_dnodes_ new file mode 100644 index 00000000..e55a8e51 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_dnodes_ @@ -0,0 +1,4 @@ +[ + {"id": 1, "name": "dnode-1", "state": "ACTIVE"}, + {"id": 2, "name": "dnode-2", "state": "ACTIVE"} +] diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/api_health_ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_health_ new file mode 100644 index 00000000..2350ea93 --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_health_ @@ -0,0 +1 @@ +{"state": "CLUSTERED"} diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_alarms b/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_alarms new file mode 100644 index 00000000..76838b5f --- /dev/null +++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_alarms @@ -0,0 +1,2 @@ +# TYPE vast_alarm_active gauge +vast_alarm_active 0 diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_vms_state 
b/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_vms_state
new file mode 100644
index 00000000..289cb5a5
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_vms_state
@@ -0,0 +1,2 @@
+# TYPE vms_state gauge
+vms_state 1
diff --git a/codebundles/vast-cluster-health/.test/run-mock-scenarios.sh b/codebundles/vast-cluster-health/.test/run-mock-scenarios.sh
new file mode 100755
index 00000000..b6527138
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/run-mock-scenarios.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+export VAST_VMS_ENDPOINT="https://vms.mock.local"
+export VAST_CLUSTER_NAME="vast-lab-cluster"
+export CAPACITY_THRESHOLD="85"
+export CRITICAL_CAPACITY_THRESHOLD="95"
+export VAST_VMS_CREDENTIALS_JSON='{"USERNAME":"admin","PASSWORD":"mock"}'
+
+run_scenario() {  # usage: run_scenario <fixture-name> [min_issues] [max_issues]; exits 1 when the issue total is out of range
+  local name="$1"
+  local fixture_dir="$ROOT/.test/fixtures/${name}"
+  local expected_min="${2:-0}"
+  local expected_max="${3:-999}"
+
+  echo "=== Scenario: ${name} ==="
+  export VAST_MOCK_FIXTURE_DIR="$fixture_dir"
+
+  rm -f -- ./*_output.json  # drop results of the previous scenario; "./" keeps odd file names from being parsed as options
+  ./check-vms-cluster-health.sh >/dev/null
+  ./check-cluster-capacity.sh >/dev/null
+  ./check-node-hardware-health.sh >/dev/null
+  ./check-degraded-components.sh >/dev/null
+  ./check-replication-status.sh >/dev/null
+
+  local total_issues=0 f count sli_json  # keep per-scenario state out of the global scope
+  for f in vms_cluster_health_output.json cluster_capacity_output.json node_hardware_health_output.json degraded_components_output.json replication_status_output.json; do
+    count="$(jq 'length' "$f")"  # each task script writes a JSON array of issues
+    total_issues=$((total_issues + count))
+  done
+
+  echo "Total issues: ${total_issues} (expected between ${expected_min} and ${expected_max})"
+  if (( total_issues < expected_min || total_issues > expected_max )); then
+    echo "Scenario ${name} FAILED" >&2
+    exit 1
+  fi
+
+  sli_json="$(./sli-vast-cluster-health-score.sh)"
+  echo "SLI scores: ${sli_json}"
+}
+
+run_scenario healthy 0 0
+run_scenario degraded
2 10 +run_scenario capacity_pressure 1 3 + +echo "All mock scenarios passed" diff --git a/codebundles/vast-cluster-health/.test/validate-vast-bundle-structure.sh b/codebundles/vast-cluster-health/.test/validate-vast-bundle-structure.sh new file mode 100755 index 00000000..d8b5b28b --- /dev/null +++ b/codebundles/vast-cluster-health/.test/validate-vast-bundle-structure.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +cd "$ROOT" + +need=( + runbook.robot + sli.robot + README.md + vast-vms-common.sh + check-vms-cluster-health.sh + check-cluster-capacity.sh + check-node-hardware-health.sh + check-degraded-components.sh + analyze-cluster-performance.sh + check-replication-status.sh + sli-vast-cluster-health-score.sh + .runwhen/generation-rules/vast-cluster-health.yaml + .runwhen/templates/vast-cluster-health-slx.yaml + .runwhen/templates/vast-cluster-health-taskset.yaml + .runwhen/templates/vast-cluster-health-sli.yaml +) + +for f in "${need[@]}"; do + if [[ ! -e "$f" ]]; then + echo "missing: $f" >&2 + exit 1 + fi +done + +echo "vast-cluster-health structure OK" diff --git a/codebundles/vast-cluster-health/README.md b/codebundles/vast-cluster-health/README.md new file mode 100644 index 00000000..ec2769ae --- /dev/null +++ b/codebundles/vast-cluster-health/README.md @@ -0,0 +1,83 @@ +# VAST Data Cluster Health + +Monitor VAST Data cluster-wide health via the VMS REST API and Prometheus exporter endpoints. Detects degraded cluster state, capacity exhaustion, hardware failures on CNodes/DNodes, and cluster-level performance bottlenecks that affect all tenants and clients (Kubernetes, NFS, block, S3). 
+ +## Overview + +- **VMS cluster state**: Queries `/api/prometheusmetrics/vms_state`, `/health/`, and `/api/clusters/` for DEGRADED vs CLUSTERED/ONLINE state +- **Capacity utilization**: Evaluates physical and logical capacity from cluster REST and Prometheus metrics against configurable thresholds +- **Node hardware health**: Inspects CNode/DNode REST state and SSD/SCM indicators from `/api/prometheusmetrics/devices` +- **Degraded components**: Surfaces active alarms, degraded boxes, and offline nodes +- **Protocol performance**: Samples cluster-wide IOPS and latency metrics for NFS, block, and S3 from Prometheus exporters +- **Replication and protection**: Checks replication streams, protection groups, and auxiliary/snapshot capacity pressure + +## Configuration + +### Required Variables + +- `VAST_VMS_ENDPOINT`: VMS REST API base URL (e.g. `https://vms.example.com`) +- `VAST_CLUSTER_NAME`: VAST cluster display name for scoping and issue titles + +### Optional Variables + +- `RESOURCES`: Cluster name(s) or `All` for auto-discovery via VMS `/api/clusters/` (default: `All`) +- `CAPACITY_THRESHOLD`: Physical/logical capacity utilization percent that triggers a warning issue (default: `85`) +- `CRITICAL_CAPACITY_THRESHOLD`: Critical capacity threshold percent (default: `95`) + +### Secrets + +- `vast_vms_credentials`: VMS API authentication credentials as JSON: + - `USERNAME` and `PASSWORD` for basic auth, or + - `API_TOKEN` for bearer token auth (when supported by your VMS version) + +## Tasks Overview + +### Check VMS Cluster Health Status for Cluster + +Queries `/api/prometheusmetrics/vms_state` and VMS cluster status to detect DEGRADED (0) vs CLUSTERED (1) state and cluster-level health regressions. + +### Check Cluster Capacity Utilization for Cluster + +Evaluates physical and logical capacity utilization from `/api/clusters/` and Prometheus capacity metrics; raises issues when usage exceeds `CAPACITY_THRESHOLD` or `CRITICAL_CAPACITY_THRESHOLD`. 
+ +### Check CNode and DNode Hardware Health for Cluster + +Inspects CNode/DNode state from REST APIs and SSD/SCM health from Prometheus `/api/prometheusmetrics/devices`. + +### Check Cluster Degraded Components and Active Alerts for Cluster + +Lists degraded boxes, offline nodes, and active VMS alarms from `/api/prometheusmetrics/alarms` and related REST endpoints. + +### Analyze Cluster Protocol Performance for Cluster + +Reviews cluster-wide IOPS and latency by storage protocol (NFS, block, S3) from Prometheus base metrics to detect IO stalls or abnormal drops. + +### Check Replication and Protection Group Status for Cluster + +Verifies replication links, protection groups, and snapshot/auxiliary capacity pressure from REST and `/api/prometheusmetrics/replications`. + +## SLI + +The bundled `sli.robot` produces a 0–1 health score from five binary dimensions: + +1. VMS clustered state +2. Capacity headroom +3. Node hardware health +4. Active alarm clearance +5. Replication health + +## Platform Notes + +- Prometheus metrics are scraped directly from VMS REST paths such as `/api/prometheusmetrics/vms_state` and `/api/prometheusmetrics/all` — no local Prometheus server is required. +- Some endpoints (`/health/`, `/api/prometheusmetrics/replications`, `/api/protectiongroups/`) are unavailable on older VAST versions; tasks degrade gracefully and skip optional checks. 
+- API reference: [Exporting Metrics to Prometheus](https://kb.vastdata.com/documentation/docs/exporting-metrics-to-prometheus) +- VMS REST docs: `{VAST_VMS_ENDPOINT}/docs` + +## Testing + +Use mock fixtures under `.test/fixtures/` when a live VAST cluster is unavailable: + +```bash +cd codebundles/vast-cluster-health/.test +task +``` diff --git a/codebundles/vast-cluster-health/analyze-cluster-performance.sh b/codebundles/vast-cluster-health/analyze-cluster-performance.sh new file mode 100755 index 00000000..3330aa46 --- /dev/null +++ b/codebundles/vast-cluster-health/analyze-cluster-performance.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Reviews cluster-wide protocol performance (NFS, block, S3) for IO stalls. +# ----------------------------------------------------------------------------- + +OUTPUT_FILE="cluster_performance_output.json" +REPORT_FILE="cluster_performance_report.txt" + +source "$(dirname "$0")/vast-vms-common.sh" + +PERFORMANCE_DROP_THRESHOLD="${PERFORMANCE_DROP_THRESHOLD:-90}" +MIN_BASELINE_IOPS="${MIN_BASELINE_IOPS:-100}" + +issues_json="$(vast_init_issues)" +report="Cluster protocol performance for \`${VAST_CLUSTER_NAME}\`\n" + +if ! _vast_load_credentials; then + issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")" + echo "$issues_json" > "$OUTPUT_FILE" + echo -e "$report" > "$REPORT_FILE" + exit 0 +fi + +metrics_text="" +if ! 
metrics_text="$(vast_prometheus_get "basic_no_views" 2>metrics.err || vast_prometheus_get "" 2>>metrics.err)"; then
+  err_msg="$(cat metrics.err 2>/dev/null || echo unknown)"
+  report+="Warning: performance metrics unavailable: ${err_msg}\n"
+  echo "$issues_json" > "$OUTPUT_FILE"
+  echo -e "$report" > "$REPORT_FILE"
+  echo -e "$report"
+  exit 0
+fi
+rm -f metrics.err
+
+python3 - <<'PY' "$metrics_text" "$PERFORMANCE_DROP_THRESHOLD" "$MIN_BASELINE_IOPS" "$VAST_CLUSTER_NAME" > perf_analysis.json
+import json, re, sys
+
+metrics_text, drop_threshold, min_baseline, cluster = sys.argv[1:5]
+drop_threshold = float(drop_threshold)
+min_baseline = float(min_baseline)
+
+protocol_patterns = {
+    "NFS": re.compile(r"(nfs|NFS).*iops", re.I),
+    "Block": re.compile(r"(block|BLOCK).*iops", re.I),
+    "S3": re.compile(r"(s3|S3).*iops", re.I),
+}
+
+values = {}
+for line in metrics_text.splitlines():
+    if line.startswith("#") or not line.strip():
+        continue
+    parts = line.split()
+    if len(parts) < 2:
+        continue
+    name, val = parts[0], parts[1]
+    try:
+        num = float(val)
+    except ValueError:
+        continue
+    for proto, pat in protocol_patterns.items():
+        if pat.search(name):
+            values.setdefault(proto, []).append(num)  # one bucket of samples per protocol
+
+issues = []
+report_lines = []
+for proto, nums in values.items():
+    total = sum(nums)
+    report_lines.append(f"{proto} aggregate IOPS sample total={total:.0f} from {len(nums)} metric(s)")
+    if 0 < total < min_baseline * (drop_threshold / 100.0):  # active but under the floor; idle (0 IOPS) protocols are skipped. The old extra "total >= min_baseline" test made this unreachable.
+        issues.append({
+            "title": f"Abnormally Low {proto} IOPS on VAST Cluster `{cluster}`",
+            "details": f"{proto} aggregate IOPS ({total:.0f}) is below {drop_threshold}% of baseline threshold ({min_baseline:.0f}).",
+            "severity": 3,
+            "next_steps": f"Check {proto} client connectivity, VIP health, and recent cluster events; compare with historical dashboards",
+        })
+
+latency_hits = []
+for line in metrics_text.splitlines():
+    if line.startswith("#"):
+        continue
+    if re.search(r"latency", line, re.I):
+        parts = line.split()
+        if len(parts) >= 2:
+            try:
+                val = float(parts[1])
+            except ValueError:
+                continue
+            if val > 100:  # ms threshold for cluster-wide latency gauges
+                latency_hits.append(f"{parts[0]}={val}")
+
+if latency_hits:
+    issues.append({
+        "title": f"Elevated Cluster Protocol Latency on VAST Cluster `{cluster}`",
+        "details": "High latency metrics detected:\n" + "\n".join(latency_hits[:10]),
+        "severity": 3,
+        "next_steps": "Inspect network path, DNode load, and QoS policies; correlate with tenant-level metrics",
+    })
+    report_lines.append(f"High latency metrics: {len(latency_hits)}")
+
+if not values and not latency_hits:
+    report_lines.append("No recognizable protocol IOPS/latency metrics in exporter output (graceful skip).")
+
+print(json.dumps({"issues": issues, "report": report_lines}))
+PY
+
+mapfile -t report_lines < <(jq -r '.report[]' perf_analysis.json)
+for line in "${report_lines[@]:-}"; do
+  report+="${line}\n"
+done
+
+while IFS= read -r issue; do
+  [[ -z "$issue" || "$issue" == "null" ]] && continue
+  title="$(echo "$issue" | jq -r '.title')"
+  details="$(echo "$issue" | jq -r '.details')"
+  severity="$(echo "$issue" | jq -r '.severity')"
+  next_steps="$(echo "$issue" | jq -r '.next_steps')"
+  issues_json="$(vast_append_issue "$issues_json" "$title" "$details" "$severity" "$next_steps")"
+done < <(jq -c '.issues[]?' perf_analysis.json 2>/dev/null || true)
+
+rm -f perf_analysis.json
+echo "$issues_json" > "$OUTPUT_FILE"
+echo -e "$report" > "$REPORT_FILE"
+echo -e "$report"
+echo "Analysis completed.
Results saved to $OUTPUT_FILE" diff --git a/codebundles/vast-cluster-health/check-cluster-capacity.sh b/codebundles/vast-cluster-health/check-cluster-capacity.sh new file mode 100755 index 00000000..27acf977 --- /dev/null +++ b/codebundles/vast-cluster-health/check-cluster-capacity.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Evaluates physical and logical capacity utilization for the scoped cluster. +# ----------------------------------------------------------------------------- + +OUTPUT_FILE="cluster_capacity_output.json" +REPORT_FILE="cluster_capacity_report.txt" + +source "$(dirname "$0")/vast-vms-common.sh" + +issues_json="$(vast_init_issues)" +report="Capacity utilization for \`${VAST_CLUSTER_NAME}\` (threshold=${CAPACITY_THRESHOLD}%, critical=${CRITICAL_CAPACITY_THRESHOLD}%)\n" + +if ! _vast_load_credentials; then + issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")" + echo "$issues_json" > "$OUTPUT_FILE" + echo -e "$report" > "$REPORT_FILE" + exit 0 +fi + +physical_pct="" +logical_pct="" + +if clusters_json="$(vast_api_get "/api/clusters/" 2>clusters.err)"; then + cluster_obj="$(vast_find_cluster_json "$clusters_json" "$VAST_CLUSTER_NAME")" + if [[ -n "$cluster_obj" ]]; then + physical_pct="$(echo "$cluster_obj" | jq -r '.physical_space_in_use_percent // empty')" + logical_pct="$(echo "$cluster_obj" | jq -r '.logical_space_in_use_percent // empty')" + physical_tb="$(echo "$cluster_obj" | jq -r '.physical_space_in_use_tb // .physical_space_in_use // "n/a"')" + logical_tb="$(echo "$cluster_obj" | jq -r '.logical_space_in_use_tb // .logical_space_in_use // "n/a"')" + report+="REST capacity: physical=${physical_pct}% (${physical_tb} TB in use), logical=${logical_pct}% (${logical_tb} TB in use)\n" + fi +else + err_msg="$(cat clusters.err 2>/dev/null || echo unknown)" + issues_json="$(vast_api_error_issue 
"$issues_json" "cluster capacity" "$err_msg")" +fi +rm -f clusters.err + +if [[ -z "$physical_pct" || -z "$logical_pct" ]]; then + if metrics_text="$(vast_prometheus_get "basic_no_views" 2>metrics.err || vast_prometheus_get "" 2>metrics.err)"; then + physical_pct="$(vast_prometheus_gauge "$metrics_text" "physical_space_in_use_percent")" + logical_pct="$(vast_prometheus_gauge "$metrics_text" "logical_space_in_use_percent")" + if [[ -z "$physical_pct" ]]; then + physical_used="$(vast_prometheus_gauge "$metrics_text" "physical_space_in_use")" + physical_total="$(vast_prometheus_gauge "$metrics_text" "physical_space")" + if [[ -n "$physical_used" && -n "$physical_total" && "$physical_total" != "0" ]]; then + physical_pct="$(python3 - <= crit: + print("critical") +elif pct >= warn: + print("warning") +else: + print("ok") +PY +)" + if [[ "$cmp_critical" == "critical" ]]; then + issues_json="$(vast_append_issue "$issues_json" \ + "Critical ${kind^} Capacity for VAST Cluster \`${VAST_CLUSTER_NAME}\`" \ + "${kind^} capacity utilization is ${pct}% (critical threshold ${CRITICAL_CAPACITY_THRESHOLD}%)." \ + "2" \ + "Expedite capacity expansion, delete stale snapshots, or rebalance tenants before writes fail")" + elif [[ "$cmp_critical" == "warning" ]]; then + issues_json="$(vast_append_issue "$issues_json" \ + "Elevated ${kind^} Capacity for VAST Cluster \`${VAST_CLUSTER_NAME}\`" \ + "${kind^} capacity utilization is ${pct}% (warning threshold ${CAPACITY_THRESHOLD}%)." \ + "3" \ + "Plan capacity expansion and review snapshot/retention policies")" + fi +done + +echo "$issues_json" > "$OUTPUT_FILE" +echo -e "$report" > "$REPORT_FILE" +echo -e "$report" +echo "Analysis completed. 
Results saved to $OUTPUT_FILE" diff --git a/codebundles/vast-cluster-health/check-degraded-components.sh b/codebundles/vast-cluster-health/check-degraded-components.sh new file mode 100755 index 00000000..b3562ca2 --- /dev/null +++ b/codebundles/vast-cluster-health/check-degraded-components.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Lists degraded boxes, failed drives, offline nodes, and active VMS alarms. +# ----------------------------------------------------------------------------- + +OUTPUT_FILE="degraded_components_output.json" +REPORT_FILE="degraded_components_report.txt" + +source "$(dirname "$0")/vast-vms-common.sh" + +issues_json="$(vast_init_issues)" +alarm_count=0 +report="Degraded components and alerts for \`${VAST_CLUSTER_NAME}\`\n" + +if ! _vast_load_credentials; then + issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")" + echo "$issues_json" > "$OUTPUT_FILE" + echo -e "$report" > "$REPORT_FILE" + exit 0 +fi + +if alarms_text="$(vast_prometheus_get "alarms" 2>alarms.err)"; then + active_alarms="$(echo "$alarms_text" | awk ' + $0 !~ /^#/ && $2 != "0" && $2 != "0.0" { + print $0 + } + ' | head -30)" + alarm_count="$(echo "$active_alarms" | grep -c . 
|| true)" + report+="Active alarm metrics lines: ${alarm_count}\n" + if [[ "$alarm_count" -gt 0 ]]; then + issues_json="$(vast_append_issue "$issues_json" \ + "Active VMS Alarms on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \ + "Prometheus alarms exporter reports ${alarm_count} active alarm metric(s):\n${active_alarms}" \ + "1" \ + "Review Alarms panel in VMS and remediate highest-severity items first")" + fi +else + report+="Note: /api/prometheusmetrics/alarms unavailable; skipping alarm scrape.\n" +fi +rm -f alarms.err + +for path in "/api/boxes/" "/api/dboxes/"; do + if boxes_json="$(vast_api_get "$path" 2>/tmp/boxes.err)"; then + degraded="$(echo "$boxes_json" | jq -r ' + (if type == "array" then . elif .results then .results else [.] end) + | map(select((.state // .status // "ONLINE") | ascii_upcase | test("DEGRADED|FAILED|OFFLINE|ERROR"))) + | map("\(.name // .title // .id // "box") state=\(.state // .status)") + | .[] + ' 2>/dev/null || true)" + if [[ -n "$degraded" ]]; then + while IFS= read -r line; do + [[ -z "$line" ]] && continue + issues_json="$(vast_append_issue "$issues_json" \ + "Degraded Box on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \ + "Box from ${path} reports: ${line}" \ + "1" \ + "Inspect box hardware in VMS and engage VAST support if state persists")" + report+="Degraded box: ${line}\n" + done <<< "$degraded" + fi + fi +done +rm -f /tmp/boxes.err + +offline_nodes=0 +for entry in "CNode:/api/cnodes/" "DNode:/api/dnodes/"; do + label="${entry%%:*}" + api_path="${entry#*:}" + if nodes_json="$(vast_api_get "$api_path" 2>/tmp/node.err)"; then + count="$(echo "$nodes_json" | jq ' + (if type == "array" then . elif .results then .results else [.] end) + | map(select((.state // .status // "ACTIVE") | ascii_upcase | test("OFFLINE|FAILED|INACTIVE|DISABLED|ERROR"))) + | length + ' 2>/dev/null || echo 0)" + offline_nodes=$((offline_nodes + count)) + if [[ "$count" -gt 0 ]]; then + sample="$(echo "$nodes_json" | jq -r ' + (if type == "array" then . 
elif .results then .results else [.] end) + | map(select((.state // .status // "ACTIVE") | ascii_upcase | test("OFFLINE|FAILED|INACTIVE|DISABLED|ERROR"))) + | .[0] | "\(.name // .hostname // .id // "node") state=\(.state // .status)" + ' 2>/dev/null || echo unknown)" + issues_json="$(vast_append_issue "$issues_json" \ + "Offline ${label}(s) on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \ + "${count} ${label}(s) offline or failed (example: ${sample}). Partial cluster failure may impact all tenants." \ + "1" \ + "Restore offline nodes or replace failed hardware; verify cluster quorum in VMS")" + report+="${count} offline ${label}(s).\n" + fi + fi +done +rm -f /tmp/node.err + +if [[ "$offline_nodes" -eq 0 && "$alarm_count" -eq 0 ]] && jq -e 'length == 0' <<<"$issues_json" >/dev/null 2>&1; then # degraded boxes have no counter, so also require an empty issue list (assumes issues_json is a JSON array - confirm in vast-vms-common.sh) + report+="No degraded boxes, offline nodes, or active alarms detected.\n" +fi + +echo "$issues_json" > "$OUTPUT_FILE" +echo -e "$report" > "$REPORT_FILE" +echo -e "$report" +echo "Analysis completed. Results saved to $OUTPUT_FILE" diff --git a/codebundles/vast-cluster-health/check-node-hardware-health.sh b/codebundles/vast-cluster-health/check-node-hardware-health.sh new file mode 100755 index 00000000..dffb5df1 --- /dev/null +++ b/codebundles/vast-cluster-health/check-node-hardware-health.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Inspects CNode/DNode and SSD/SCM hardware health from REST and Prometheus. +# ----------------------------------------------------------------------------- + +OUTPUT_FILE="node_hardware_health_output.json" +REPORT_FILE="node_hardware_health_report.txt" + +source "$(dirname "$0")/vast-vms-common.sh" + +issues_json="$(vast_init_issues)" +report="Node hardware health for \`${VAST_CLUSTER_NAME}\`\n" + +if ! 
_vast_load_credentials; then + issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")" + echo "$issues_json" > "$OUTPUT_FILE" + echo -e "$report" > "$REPORT_FILE" + exit 0 +fi + +check_nodes() { + local kind="$1" + local path="$2" + local nodes_json="" + if ! nodes_json="$(vast_api_get "$path" 2>"/tmp/${kind}.err")"; then + report+="Warning: ${kind} REST API unavailable.\n" + return 0 + fi + local bad line count # keep the loop and scratch variables out of the global scope + bad="$(echo "$nodes_json" | jq -r ' + (if type == "array" then . elif .results then .results else [.] end) + | map(select((.state // .status // "ACTIVE") | ascii_upcase | test("OFFLINE|FAILED|INACTIVE|DISABLED|ERROR"))) + | map("\(.name // .hostname // .id // "unknown") state=\(.state // .status)") + | .[] + ' 2>/dev/null || true)" + if [[ -n "$bad" ]]; then + while IFS= read -r line; do + [[ -z "$line" ]] && continue + issues_json="$(vast_append_issue "$issues_json" \ + "Unhealthy ${kind} on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \ + "${kind} reports unhealthy state: ${line}" \ + "2" \ + "Inspect ${kind} in VMS, verify hardware LEDs/cabling, and follow VAST support guidance for replacement")" + report+="Unhealthy ${kind}: ${line}\n" + done <<< "$bad" + else + count="$(echo "$nodes_json" | jq 'if type == "array" then length elif .results then (.results|length) else 1 end' 2>/dev/null || echo 0)" + report+="All ${count} ${kind}(s) appear healthy via REST.\n" + fi +} + +check_nodes "CNode" "/api/cnodes/" +check_nodes "DNode" "/api/dnodes/" + +if devices_text="$(vast_prometheus_get "devices" 2>devices.err)"; then + failed_devices="$(echo "$devices_text" | awk ' + $0 !~ /^#/ && ($0 ~ /state/ || $0 ~ /status/) && ($0 ~ /failed|error|offline|inactive/ || $NF ~ /^0(\.0+)?$/) { # sample value must be exactly zero; a bare 0$ also matched 10, 100 and 1.0 (assumes the value is the last field - confirm the exporter emits no timestamps) + print $0 + } + ' | head -20)" + if [[ -n "$failed_devices" ]]; then + issues_json="$(vast_append_issue "$issues_json" \ + "SSD/SCM Hardware Faults on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \ + "Prometheus devices metrics indicate failed or unhealthy media:\n${failed_devices}" 
\ + "2" \ + "Review DBox device status in VMS and replace failed SSD/SCM modules")" + report+="Device metric faults detected (see issues).\n" + else + report+="No failed SSD/SCM indicators in /api/prometheusmetrics/devices.\n" + fi +else + report+="Note: /api/prometheusmetrics/devices unavailable on this VAST version.\n" +fi +rm -f devices.err /tmp/CNode.err /tmp/DNode.err + +echo "$issues_json" > "$OUTPUT_FILE" +echo -e "$report" > "$REPORT_FILE" +echo -e "$report" +echo "Analysis completed. Results saved to $OUTPUT_FILE" diff --git a/codebundles/vast-cluster-health/check-replication-status.sh b/codebundles/vast-cluster-health/check-replication-status.sh new file mode 100755 index 00000000..a187de08 --- /dev/null +++ b/codebundles/vast-cluster-health/check-replication-status.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Verifies replication links, protection groups, and snapshot pressure signals. +# ----------------------------------------------------------------------------- + +OUTPUT_FILE="replication_status_output.json" +REPORT_FILE="replication_status_report.txt" + +source "$(dirname "$0")/vast-vms-common.sh" + +issues_json="$(vast_init_issues)" +report="Replication and protection status for \`${VAST_CLUSTER_NAME}\`\n" + +if ! 
_vast_load_credentials; then + issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")" + echo "$issues_json" > "$OUTPUT_FILE" + echo -e "$report" > "$REPORT_FILE" + exit 0 +fi + +if clusters_json="$(vast_api_get "/api/clusters/" 2>clusters.err)"; then + cluster_obj="$(vast_find_cluster_json "$clusters_json" "$VAST_CLUSTER_NAME")" + if [[ -n "$cluster_obj" ]]; then + repl_enabled="$(echo "$cluster_obj" | jq -r 'if .replication_enabled != null then .replication_enabled elif .replication != null then .replication else empty end')" # explicit null tests: the jq // operator discards a literal false, which made the check below unreachable + aux_pct="$(echo "$cluster_obj" | jq -r '.auxiliary_space_in_use_percent // empty')" + if [[ "$repl_enabled" == "false" ]]; then + report+="Cluster replication_enabled=false (informational).\n" + fi + if [[ -n "$aux_pct" && "$aux_pct" != "null" ]]; then + report+="Auxiliary/snapshot space in use: ${aux_pct}%\n" + aux_cmp="$(python3 - <= float("${CRITICAL_CAPACITY_THRESHOLD}") else ("warn" if pct >= float("${CAPACITY_THRESHOLD}") else "ok")) +PY +)" + if [[ "$aux_cmp" == "high" ]]; then + issues_json="$(vast_append_issue "$issues_json" \ + "High Snapshot/Auxiliary Capacity on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \ + "Auxiliary space (snapshots/replication metadata) is ${aux_pct}% of capacity." \ + "2" \ + "Review snapshot retention, protection policies, and replication backlog")" + elif [[ "$aux_cmp" == "warn" ]]; then + issues_json="$(vast_append_issue "$issues_json" \ + "Elevated Snapshot/Auxiliary Capacity on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \ + "Auxiliary space is ${aux_pct}% (warning threshold ${CAPACITY_THRESHOLD}%)." 
\ + "3" \ + "Audit protection groups and snapshot schedules for capacity pressure")" + fi + fi + fi +else + err_msg="$(cat clusters.err 2>/dev/null || echo unknown)" + issues_json="$(vast_api_error_issue "$issues_json" "replication status" "$err_msg")" +fi +rm -f clusters.err + +if repl_text="$(vast_prometheus_get "replications" 2>repl.err)"; then + unhealthy="$(echo "$repl_text" | awk ' + $0 !~ /^#/ && ($0 ~ /state|status|lag|behind|failed|error/ || $0 ~ /replication/) && ($NF ~ /^0(\.0+)?$/ || tolower($0) ~ /failed|error|lag|behind|stalled/) { # tolower() gives the case-insensitive match (awk has no /re/i flag; the trailing i was string concatenation); the value test now requires an exact zero instead of any value ending in 0 + print $0 + } + ' | head -20)" + if [[ -n "$unhealthy" ]]; then + issues_json="$(vast_append_issue "$issues_json" \ + "Replication Stream Issues on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \ + "Prometheus replications metrics indicate unhealthy streams:\n${unhealthy}" \ + "2" \ + "Verify replication peer connectivity, bandwidth limits, and protection group health in VMS")" + report+="Unhealthy replication metric samples detected.\n" + else + report+="Replication Prometheus metrics show no obvious failures.\n" + fi +else + report+="Note: /api/prometheusmetrics/replications unavailable (requires VAST 5.2-sp10+).\n" +fi +rm -f repl.err + +if pg_json="$(vast_api_get "/api/protectiongroups/" 2>pg.err)"; then + bad_pg="$(echo "$pg_json" | jq -r ' + (if type == "array" then . elif .results then .results else [.] 
end) + | map(select((.state // .status // "OK") | ascii_upcase | test("FAILED|ERROR|DEGRADED|OFFLINE"))) + | map("\(.name // .title // .id // "pg") state=\(.state // .status)") + | .[] + ' 2>/dev/null || true)" + if [[ -n "$bad_pg" ]]; then + while IFS= read -r line; do + [[ -z "$line" ]] && continue + issues_json="$(vast_append_issue "$issues_json" \ + "Protection Group Issue on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \ + "Protection group reports: ${line}" \ + "2" \ + "Review protection group configuration and replication targets in VMS")" + report+="Protection group issue: ${line}\n" + done <<< "$bad_pg" + else + report+="Protection groups REST check: no failed groups found.\n" + fi +else + report+="Note: /api/protectiongroups/ unavailable on this VAST version.\n" +fi +rm -f pg.err + +echo "$issues_json" > "$OUTPUT_FILE" +echo -e "$report" > "$REPORT_FILE" +echo -e "$report" +echo "Analysis completed. Results saved to $OUTPUT_FILE" diff --git a/codebundles/vast-cluster-health/check-vms-cluster-health.sh b/codebundles/vast-cluster-health/check-vms-cluster-health.sh new file mode 100755 index 00000000..2b04303b --- /dev/null +++ b/codebundles/vast-cluster-health/check-vms-cluster-health.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# REQUIRED ENV VARS: +# VAST_VMS_ENDPOINT, VAST_CLUSTER_NAME +# OPTIONAL: +# CAPACITY_THRESHOLD, CRITICAL_CAPACITY_THRESHOLD +# VAST_VMS_CREDENTIALS_FILE / VAST_VMS_CREDENTIALS_JSON +# +# Queries /api/prometheusmetrics/vms_state and cluster REST status. +# ----------------------------------------------------------------------------- + +OUTPUT_FILE="vms_cluster_health_output.json" +REPORT_FILE="vms_cluster_health_report.txt" + +source "$(dirname "$0")/vast-vms-common.sh" + +issues_json="$(vast_init_issues)" +report="VMS cluster health check for \`${VAST_CLUSTER_NAME}\` at ${VAST_VMS_ENDPOINT}\n" + +if ! 
_vast_load_credentials; then + issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")" + echo "$issues_json" > "$OUTPUT_FILE" + echo -e "$report" > "$REPORT_FILE" + exit 0 +fi + +vms_state_text="" +if vms_state_text="$(vast_prometheus_get "vms_state" 2>vms_state.err)"; then + vms_state="$(vast_prometheus_gauge "$vms_state_text" "vms_state")" + if [[ -z "$vms_state" ]]; then + vms_state="$(vast_prometheus_gauge "$vms_state_text" "vast_vms_state")" + fi + report+="VMS state metric: ${vms_state:-unknown} (1=CLUSTERED, 0=DEGRADED)\n" + if [[ "$vms_state" == "0" ]]; then + issues_json="$(vast_append_issue "$issues_json" \ + "VAST Cluster \`${VAST_CLUSTER_NAME}\` VMS State is DEGRADED" \ + "Prometheus vms_state gauge reports 0 (DEGRADED). Cluster-wide operations may be impaired." \ + "1" \ + "Review VMS alarms and degraded components; check offline CNodes/DNodes in VMS UI")" + elif [[ -z "$vms_state" ]]; then + report+="Warning: vms_state metric not found in exporter response (endpoint may be unavailable on older VAST versions).\n" + fi +else + err_msg="$(cat vms_state.err 2>/dev/null || echo unknown)" + report+="Warning: /api/prometheusmetrics/vms_state unavailable: ${err_msg}\n" +fi +rm -f vms_state.err + +clusters_json="" +if clusters_json="$(vast_api_get "/api/clusters/" 2>clusters.err)"; then + cluster_obj="$(vast_find_cluster_json "$clusters_json" "$VAST_CLUSTER_NAME")" + if [[ -n "$cluster_obj" ]]; then + cluster_state="$(echo "$cluster_obj" | jq -r '.state // "UNKNOWN"')" + enabled="$(echo "$cluster_obj" | jq -r 'if .enabled == null then true else .enabled end')" # default to true only when the field is absent; the jq // operator also replaced a literal false, hiding disabled clusters + report+="Cluster REST state: ${cluster_state}, enabled=${enabled}\n" + if [[ "$cluster_state" != "ONLINE" && "$cluster_state" != "CLUSTERED" ]]; then + issues_json="$(vast_append_issue "$issues_json" \ + "VAST Cluster \`${VAST_CLUSTER_NAME}\` State is ${cluster_state}" \ + "Cluster REST API reports state=${cluster_state} (expected ONLINE/CLUSTERED)." 
\ + "2" \ + "Inspect cluster events in VMS and verify all boxes and nodes are online")" + fi + if [[ "$enabled" != "true" ]]; then + issues_json="$(vast_append_issue "$issues_json" \ + "VAST Cluster \`${VAST_CLUSTER_NAME}\` is Disabled" \ + "Cluster enabled flag is false in VMS REST API." \ + "2" \ + "Re-enable the cluster in VMS if this was not intentional maintenance")" + fi + else + report+="Warning: cluster \`${VAST_CLUSTER_NAME}\` not found in /api/clusters/ response.\n" + fi +else + err_msg="$(cat clusters.err 2>/dev/null || echo unknown)" + issues_json="$(vast_api_error_issue "$issues_json" "cluster status" "$err_msg")" +fi +rm -f clusters.err + +health_text="" +if health_text="$(vast_api_get "/health/" 2>health.err)"; then + health_state="$(echo "$health_text" | jq -r '.state // .status // empty' 2>/dev/null || true)" + if [[ -n "$health_state" ]]; then + report+="VMS /health/ status: ${health_state}\n" + if [[ "$health_state" =~ ^(DEGRADED|UNHEALTHY|ERROR|FAILED)$ ]]; then + issues_json="$(vast_append_issue "$issues_json" \ + "VAST Cluster \`${VAST_CLUSTER_NAME}\` VMS Health Endpoint Reports ${health_state}" \ + "GET /health/ returned state=${health_state}." \ + "1" \ + "Review VMS health dashboard and active alarms")" + fi + fi +else + report+="Note: /health/ endpoint unavailable (requires VAST 5.4.3+).\n" +fi +rm -f health.err + +echo "$issues_json" > "$OUTPUT_FILE" +echo -e "$report" > "$REPORT_FILE" +echo -e "$report" +echo "Analysis completed. Results saved to $OUTPUT_FILE" diff --git a/codebundles/vast-cluster-health/runbook.robot b/codebundles/vast-cluster-health/runbook.robot new file mode 100644 index 00000000..bd9c83ac --- /dev/null +++ b/codebundles/vast-cluster-health/runbook.robot @@ -0,0 +1,295 @@ +*** Settings *** +Documentation Monitor VAST Data cluster-wide health via VMS REST and Prometheus exporter endpoints for degraded state, capacity pressure, hardware faults, and protocol performance. 
+Metadata Author rw-codebundle-agent +Metadata Display Name VAST Data Cluster Health +Metadata Supports VAST vast_data cluster storage metrics + +Force Tags VAST vast_data cluster storage health + +Library String +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform + +Suite Setup Suite Initialization + + +*** Tasks *** +Check VMS Cluster Health Status for Cluster `${VAST_CLUSTER_NAME}` + [Documentation] Queries /api/prometheusmetrics/vms_state and VMS cluster status to detect DEGRADED (0) vs CLUSTERED (1) state and any active cluster alerts. + [Tags] VAST vast_data cluster health access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-vms-cluster-health.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=./check-vms-cluster-health.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat vms_cluster_health_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for task, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=VMS cluster health should report CLUSTERED/ONLINE state for cluster `${VAST_CLUSTER_NAME}` + ... actual=VMS cluster health check found degraded or unreachable cluster state + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report VMS Cluster Health Results:\n${result.stdout} + +Check Cluster Capacity Utilization for Cluster `${VAST_CLUSTER_NAME}` + [Documentation] Evaluates physical and logical capacity utilization from cluster REST and Prometheus metrics; raises issues when usage exceeds CAPACITY_THRESHOLD percent. 
+ [Tags] VAST vast_data cluster capacity access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-cluster-capacity.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=./check-cluster-capacity.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat cluster_capacity_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for task, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Cluster physical and logical capacity should remain below configured thresholds for cluster `${VAST_CLUSTER_NAME}` + ... actual=Cluster capacity utilization exceeds warning or critical thresholds + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Cluster Capacity Results:\n${result.stdout} + +Check CNode and DNode Hardware Health for Cluster `${VAST_CLUSTER_NAME}` + [Documentation] Inspects CNode/DNode state, SSD/SCM health, and hardware fault indicators from REST and Prometheus exporter metrics. + [Tags] VAST vast_data cnodes dnodes hardware access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-node-hardware-health.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... timeout_seconds=180 + ... include_in_history=false + ... cmd_override=./check-node-hardware-health.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat node_hardware_health_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for task, defaulting to empty list. 
WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=All CNodes, DNodes, and storage devices should be healthy for cluster `${VAST_CLUSTER_NAME}` + ... actual=Node or device hardware health issues were detected + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Node Hardware Health Results:\n${result.stdout} + +Check Cluster Degraded Components and Active Alerts for Cluster `${VAST_CLUSTER_NAME}` + [Documentation] Lists degraded boxes, failed drives, offline nodes, and active VMS alerts that indicate partial cluster failure. + [Tags] VAST vast_data cluster alerts access:read-only data:logs-config + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-degraded-components.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... timeout_seconds=180 + ... include_in_history=false + ... cmd_override=./check-degraded-components.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat degraded_components_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for task, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Cluster should have no degraded boxes, offline nodes, or active alarms for cluster `${VAST_CLUSTER_NAME}` + ... actual=Degraded components or active VMS alarms were found + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... 
next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Degraded Components Results:\n${result.stdout} + +Analyze Cluster Protocol Performance for Cluster `${VAST_CLUSTER_NAME}` + [Documentation] Reviews cluster-wide IOPS, bandwidth, and latency by storage protocol (NFS, block, S3) to detect IO stalls or abnormal drops in data flow. + [Tags] VAST vast_data cluster performance access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=analyze-cluster-performance.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... timeout_seconds=180 + ... include_in_history=false + ... cmd_override=./analyze-cluster-performance.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat cluster_performance_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for task, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Cluster protocol performance should remain within expected IO and latency bounds for cluster `${VAST_CLUSTER_NAME}` + ... actual=Cluster protocol performance anomalies were detected + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Cluster Performance Results:\n${result.stdout} + +Check Replication and Protection Group Status for Cluster `${VAST_CLUSTER_NAME}` + [Documentation] Verifies replication links, protection groups, and snapshot policies are healthy and not blocking writes or causing capacity pressure. + [Tags] VAST vast_data cluster replication access:read-only data:config + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-replication-status.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... 
timeout_seconds=180 + ... include_in_history=false + ... cmd_override=./check-replication-status.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat replication_status_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for task, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Replication streams and protection groups should be healthy with acceptable snapshot/auxiliary space for cluster `${VAST_CLUSTER_NAME}` + ... actual=Replication or protection group issues were detected + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Replication Status Results:\n${result.stdout} + + +*** Keywords *** +Suite Initialization + TRY + ${vast_vms_credentials}= RW.Core.Import Secret vast_vms_credentials + ... type=string + ... description=VMS API credentials JSON with USERNAME/PASSWORD or API_TOKEN + ... pattern=\w* + Set Suite Variable ${vast_vms_credentials} ${vast_vms_credentials} + EXCEPT + Log vast_vms_credentials secret not found; VMS API tasks will fail until configured. WARN + Set Suite Variable ${vast_vms_credentials} ${EMPTY} + END + + ${VAST_VMS_ENDPOINT}= RW.Core.Import User Variable VAST_VMS_ENDPOINT + ... type=string + ... description=VMS REST API base URL (e.g. https://vms.example.com) + ... pattern=\w* + ${VAST_CLUSTER_NAME}= RW.Core.Import User Variable VAST_CLUSTER_NAME + ... type=string + ... description=VAST cluster display name for scoping and issue titles + ... pattern=\w* + ${RESOURCES}= RW.Core.Import User Variable RESOURCES + ... type=string + ... description=Cluster name(s) or All for auto-discovery via VMS API + ... pattern=^[\w,\s-]*$ + ... 
default=All + ${CAPACITY_THRESHOLD}= RW.Core.Import User Variable CAPACITY_THRESHOLD + ... type=string + ... description=Physical/logical capacity utilization percent that triggers an issue + ... pattern=^\d+$ + ... default=85 + ${CRITICAL_CAPACITY_THRESHOLD}= RW.Core.Import User Variable CRITICAL_CAPACITY_THRESHOLD + ... type=string + ... description=Critical capacity threshold percent + ... pattern=^\d+$ + ... default=95 + + Set Suite Variable ${VAST_VMS_ENDPOINT} ${VAST_VMS_ENDPOINT} + Set Suite Variable ${VAST_CLUSTER_NAME} ${VAST_CLUSTER_NAME} + Set Suite Variable ${RESOURCES} ${RESOURCES} + Set Suite Variable ${CAPACITY_THRESHOLD} ${CAPACITY_THRESHOLD} + Set Suite Variable ${CRITICAL_CAPACITY_THRESHOLD} ${CRITICAL_CAPACITY_THRESHOLD} + + ${cred_path}= Set Variable If '${vast_vms_credentials}' != '' ./${vast_vms_credentials.key} ${EMPTY} + ${env_dict}= Create Dictionary + ... VAST_VMS_ENDPOINT=${VAST_VMS_ENDPOINT} + ... VAST_CLUSTER_NAME=${VAST_CLUSTER_NAME} + ... RESOURCES=${RESOURCES} + ... CAPACITY_THRESHOLD=${CAPACITY_THRESHOLD} + ... CRITICAL_CAPACITY_THRESHOLD=${CRITICAL_CAPACITY_THRESHOLD} + ... VAST_VMS_CREDENTIALS_FILE=${cred_path} + Set Suite Variable ${env} ${env_dict} diff --git a/codebundles/vast-cluster-health/sli-vast-cluster-health-score.sh b/codebundles/vast-cluster-health/sli-vast-cluster-health-score.sh new file mode 100755 index 00000000..090dca93 --- /dev/null +++ b/codebundles/vast-cluster-health/sli-vast-cluster-health-score.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +set -euo pipefail +# Lightweight SLI scoring script — outputs JSON with binary sub-scores. + +source "$(dirname "$0")/vast-vms-common.sh" + +if ! 
_vast_load_credentials; then + jq -n '{ + vms_clustered: 0, + capacity_ok: 0, + nodes_healthy: 0, + alarms_clear: 0, + replication_ok: 1, + details: {error: "missing credentials"} + }' + exit 0 +fi + +vms_clustered=0 +capacity_ok=1 +nodes_healthy=1 +alarms_clear=1 +replication_ok=1 +details='{}' + +if vms_state_text="$(vast_prometheus_get "vms_state" 2>/dev/null)"; then + vms_state="$(vast_prometheus_gauge "$vms_state_text" "vms_state")" + [[ -z "$vms_state" ]] && vms_state="$(vast_prometheus_gauge "$vms_state_text" "vast_vms_state")" + [[ "$vms_state" == "1" ]] && vms_clustered=1 + details="$(jq -n --arg s "${vms_state:-unknown}" '{vms_state: $s}')" +else + if clusters_json="$(vast_api_get "/api/clusters/" 2>/dev/null)"; then + cluster_obj="$(vast_find_cluster_json "$clusters_json" "$VAST_CLUSTER_NAME")" + state="$(echo "$cluster_obj" | jq -r '.state // "UNKNOWN"')" + [[ "$state" == "ONLINE" || "$state" == "CLUSTERED" ]] && vms_clustered=1 + details="$(jq -n --arg s "$state" '{cluster_state: $s}')" + fi +fi + +if clusters_json="$(vast_api_get "/api/clusters/" 2>/dev/null)"; then + cluster_obj="$(vast_find_cluster_json "$clusters_json" "$VAST_CLUSTER_NAME")" + if [[ -n "$cluster_obj" ]]; then + for pct_field in physical_space_in_use_percent logical_space_in_use_percent; do + pct="$(echo "$cluster_obj" | jq -r --arg f "$pct_field" '.[$f] // empty')" + [[ -z "$pct" || "$pct" == "null" ]] && continue + ok="$(python3 - </dev/null)"; then + bad="$(echo "$nodes_json" | jq ' + (if type == "array" then . elif .results then .results else [.] 
end) + | map(select((.state // .status // "ACTIVE") | ascii_upcase | test("OFFLINE|FAILED|INACTIVE|DISABLED|ERROR"))) + | length + ' 2>/dev/null || echo 0)" + [[ "$bad" -gt 0 ]] && nodes_healthy=0 + fi +done + +if alarms_text="$(vast_prometheus_get "alarms" 2>/dev/null)"; then + alarm_lines="$(echo "$alarms_text" | awk 'NF >= 2 && $0 !~ /^#/ && $2 != "0" && $2 != "0.0" {c++} END {print c+0}')" # NF >= 2 skips blank lines, whose empty $2 would otherwise count as an active alarm + [[ "$alarm_lines" -gt 0 ]] && alarms_clear=0 +fi + +if repl_text="$(vast_prometheus_get "replications" 2>/dev/null)"; then + bad="$(echo "$repl_text" | awk '$0 !~ /^#/ && (tolower($0) ~ /failed|error|stalled/) {c++} END {print c+0}')" # tolower() gives the case-insensitive match; awk has no /re/i flag and the trailing i made almost every sample line match + [[ "$bad" -gt 0 ]] && replication_ok=0 +fi + +jq -n \ + --argjson vms_clustered "$vms_clustered" \ + --argjson capacity_ok "$capacity_ok" \ + --argjson nodes_healthy "$nodes_healthy" \ + --argjson alarms_clear "$alarms_clear" \ + --argjson replication_ok "$replication_ok" \ + --argjson details "$details" \ + '{ + vms_clustered: $vms_clustered, + capacity_ok: $capacity_ok, + nodes_healthy: $nodes_healthy, + alarms_clear: $alarms_clear, + replication_ok: $replication_ok, + details: $details + }' diff --git a/codebundles/vast-cluster-health/sli.robot b/codebundles/vast-cluster-health/sli.robot new file mode 100644 index 00000000..efb2be23 --- /dev/null +++ b/codebundles/vast-cluster-health/sli.robot @@ -0,0 +1,128 @@ +*** Settings *** +Documentation Measures VAST cluster health across five binary dimensions (VMS clustered, capacity, nodes, alarms, replication) and averages them into a 0-1 score. +Metadata Author rw-codebundle-agent +Metadata Display Name VAST Data Cluster Health SLI +Metadata Supports VAST vast_data cluster storage metrics + +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform + +Suite Setup Suite Initialization + + +*** Tasks *** +Score VMS Cluster State + [Documentation] Binary score: 1 when vms_state=1 or cluster REST state is ONLINE/CLUSTERED, 0 otherwise. 
+ [Tags] VAST sli access:read-only data:metrics + ${score}= Set Variable ${score_vms} + RW.Core.Push Metric ${score} sub_name=vms_clustered + +Score Cluster Capacity Headroom + [Documentation] Binary score: 1 when physical and logical utilization are below CAPACITY_THRESHOLD. + [Tags] VAST sli access:read-only data:metrics + ${score}= Set Variable ${score_capacity} + RW.Core.Push Metric ${score} sub_name=capacity_ok + +Score Node Hardware Health + [Documentation] Binary score: 1 when no CNodes or DNodes report offline/failed states. + [Tags] VAST sli access:read-only data:metrics + ${score}= Set Variable ${score_nodes} + RW.Core.Push Metric ${score} sub_name=nodes_healthy + +Score Active Alarm Clearance + [Documentation] Binary score: 1 when Prometheus alarms exporter reports no active alarms. + [Tags] VAST sli access:read-only data:metrics + ${score}= Set Variable ${score_alarms} + RW.Core.Push Metric ${score} sub_name=alarms_clear + +Score Replication Health + [Documentation] Binary score: 1 when replication Prometheus metrics show no failed/stalled streams (defaults to 1 if endpoint unavailable). + [Tags] VAST sli access:read-only data:metrics + ${score}= Set Variable ${score_replication} + RW.Core.Push Metric ${score} sub_name=replication_ok + +Generate Aggregate VAST Cluster Health Score + [Documentation] Averages sub-scores into the primary 0-1 health metric. 
+ [Tags] VAST sli access:read-only data:metrics + ${total}= Evaluate int(${score_vms}) + int(${score_capacity}) + int(${score_nodes}) + int(${score_alarms}) + int(${score_replication}) + ${health_score}= Evaluate ${total} / 5.0 + ${health_score}= Convert To Number ${health_score} 2 + RW.Core.Add To Report VAST cluster health score: ${health_score} (vms=${score_vms}, capacity=${score_capacity}, nodes=${score_nodes}, alarms=${score_alarms}, replication=${score_replication}) + RW.Core.Push Metric ${health_score} + + +*** Keywords *** +Suite Initialization + TRY + ${vast_vms_credentials}= RW.Core.Import Secret vast_vms_credentials + ... type=string + ... description=VMS API credentials JSON with USERNAME/PASSWORD or API_TOKEN + ... pattern=\w* + Set Suite Variable ${vast_vms_credentials} ${vast_vms_credentials} + EXCEPT + Log vast_vms_credentials secret not found. WARN + Set Suite Variable ${vast_vms_credentials} ${EMPTY} + END + + ${VAST_VMS_ENDPOINT}= RW.Core.Import User Variable VAST_VMS_ENDPOINT + ... type=string + ... description=VMS REST API base URL + ... pattern=\w* + ${VAST_CLUSTER_NAME}= RW.Core.Import User Variable VAST_CLUSTER_NAME + ... type=string + ... description=VAST cluster display name + ... pattern=\w* + ${CAPACITY_THRESHOLD}= RW.Core.Import User Variable CAPACITY_THRESHOLD + ... type=string + ... description=Capacity warning threshold percent + ... pattern=^\d+$ + ... default=85 + ${CRITICAL_CAPACITY_THRESHOLD}= RW.Core.Import User Variable CRITICAL_CAPACITY_THRESHOLD + ... type=string + ... description=Critical capacity threshold percent + ... pattern=^\d+$ + ... 
default=95 + + Set Suite Variable ${VAST_VMS_ENDPOINT} ${VAST_VMS_ENDPOINT} + Set Suite Variable ${VAST_CLUSTER_NAME} ${VAST_CLUSTER_NAME} + Set Suite Variable ${CAPACITY_THRESHOLD} ${CAPACITY_THRESHOLD} + Set Suite Variable ${CRITICAL_CAPACITY_THRESHOLD} ${CRITICAL_CAPACITY_THRESHOLD} + + ${cred_path}= Set Variable If '${vast_vms_credentials}' != '' ./${vast_vms_credentials.key} ${EMPTY} + ${env_dict}= Create Dictionary + ... VAST_VMS_ENDPOINT=${VAST_VMS_ENDPOINT} + ... VAST_CLUSTER_NAME=${VAST_CLUSTER_NAME} + ... CAPACITY_THRESHOLD=${CAPACITY_THRESHOLD} + ... CRITICAL_CAPACITY_THRESHOLD=${CRITICAL_CAPACITY_THRESHOLD} + ... VAST_VMS_CREDENTIALS_FILE=${cred_path} + Set Suite Variable ${env} ${env_dict} + + Set Suite Variable ${score_vms} 0 + Set Suite Variable ${score_capacity} 0 + Set Suite Variable ${score_nodes} 0 + Set Suite Variable ${score_alarms} 0 + Set Suite Variable ${score_replication} 1 + + ${result}= RW.CLI.Run Bash File + ... bash_file=sli-vast-cluster-health-score.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... timeout_seconds=30 + ... include_in_history=false + TRY + ${data}= Evaluate json.loads(r'''${result.stdout}''') json + ${score_vms}= Set Variable ${data['vms_clustered']} + ${score_capacity}= Set Variable ${data['capacity_ok']} + ${score_nodes}= Set Variable ${data['nodes_healthy']} + ${score_alarms}= Set Variable ${data['alarms_clear']} + ${score_replication}= Set Variable ${data['replication_ok']} + EXCEPT + Log Failed to parse SLI score JSON; defaulting sub-scores to failure mode. 
#!/usr/bin/env bash
# Shared helpers for VAST VMS REST and Prometheus exporter access.
#
# Loading this file has side effects: it enables strict mode
# (set -euo pipefail) and aborts when a required variable is missing.
#
# Required environment:
#   VAST_VMS_ENDPOINT   base URL of the VMS; one trailing "/" is stripped
#   VAST_CLUSTER_NAME   name matched by vast_find_cluster_json callers
# Optional environment (defaults in parentheses):
#   CAPACITY_THRESHOLD (85), CRITICAL_CAPACITY_THRESHOLD (95)
#   VAST_TLS_INSECURE (true)   "true" makes curl skip certificate checks
#   VAST_CURL_TIMEOUT (60)     passed to curl --max-time
#   VAST_VMS_CREDENTIALS_FILE, VAST_VMS_CREDENTIALS_JSON   credential sources
#   VAST_MOCK_FIXTURE_DIR      directory of canned responses used instead of curl
# shellcheck disable=SC2034

set -euo pipefail

: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}"
: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}"

VAST_VMS_ENDPOINT="${VAST_VMS_ENDPOINT%/}"
CAPACITY_THRESHOLD="${CAPACITY_THRESHOLD:-85}"
CRITICAL_CAPACITY_THRESHOLD="${CRITICAL_CAPACITY_THRESHOLD:-95}"
# NOTE(review): TLS verification is off by default; confirm this is intended
# for production endpoints.
VAST_TLS_INSECURE="${VAST_TLS_INSECURE:-true}"
VAST_CURL_TIMEOUT="${VAST_CURL_TIMEOUT:-60}"

#######################################
# Load VMS credentials and export them for the curl helpers.
# Source precedence: $1, then the file named by VAST_VMS_CREDENTIALS_FILE,
# then VAST_VMS_CREDENTIALS_JSON.
# Globals:   VAST_API_USERNAME, VAST_API_PASSWORD, VAST_API_TOKEN (exported)
# Arguments: $1 - optional credentials JSON object
# Outputs:   diagnostics on stderr
# Returns:   0 on success; 1 when no credentials are configured or the
#            payload is not a JSON object
#######################################
_vast_load_credentials() {
  local creds_json="${1:-}"
  if [[ -z "$creds_json" && -n "${VAST_VMS_CREDENTIALS_FILE:-}" \
    && -f "${VAST_VMS_CREDENTIALS_FILE}" ]]; then
    # An unreadable file falls through to the next source instead of aborting.
    creds_json="$(cat -- "${VAST_VMS_CREDENTIALS_FILE}")" || creds_json=""
  fi
  if [[ -z "$creds_json" && -n "${VAST_VMS_CREDENTIALS_JSON:-}" ]]; then
    creds_json="${VAST_VMS_CREDENTIALS_JSON}"
  fi
  if [[ -z "$creds_json" ]]; then
    echo "VAST credentials not configured (set vast_vms_credentials secret with USERNAME/PASSWORD or API_TOKEN)" >&2
    return 1
  fi
  # Reject malformed payloads up front; otherwise every jq lookup below fails
  # and the caller would continue with empty credentials.
  if ! jq -e 'type == "object"' <<<"$creds_json" >/dev/null 2>&1; then
    echo "VAST credentials are not a valid JSON object" >&2
    return 1
  fi
  VAST_API_USERNAME="$(jq -r '.USERNAME // .username // empty' <<<"$creds_json")"
  VAST_API_PASSWORD="$(jq -r '.PASSWORD // .password // empty' <<<"$creds_json")"
  VAST_API_TOKEN="$(jq -r '.API_TOKEN // .api_token // .token // empty' <<<"$creds_json")"
  export VAST_API_USERNAME VAST_API_PASSWORD VAST_API_TOKEN
}

#######################################
# Resolve a mock fixture file.
# Arguments: $1 - fixture file name inside VAST_MOCK_FIXTURE_DIR
# Outputs:   the fixture path on stdout when it exists
# Returns:   0 when found; 1 when mocking is disabled or the file is absent
#######################################
_vast_fixture_path() {
  local kind="$1"
  local candidate
  if [[ -n "${VAST_MOCK_FIXTURE_DIR:-}" ]]; then
    candidate="${VAST_MOCK_FIXTURE_DIR}/${kind}"
    if [[ -f "$candidate" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  fi
  return 1
}

#######################################
# Emit the curl arguments shared by all requests, one argument per line
# (consumers read them back with `mapfile -t`).
# Globals:   VAST_CURL_TIMEOUT, VAST_TLS_INSECURE, VAST_API_TOKEN,
#            VAST_API_USERNAME, VAST_API_PASSWORD (read)
# Outputs:   argument list on stdout
# NOTE(review): credentials travel on curl's command line, and a value
# containing a newline would be split by the line-based format.
#######################################
_vast_curl_common_args() {
  # --fail turns HTTP >= 400 into a non-zero exit so callers that test the
  # exit status do not mistake an error page for a valid payload.
  local -a args=(-sS --fail --connect-timeout 10 --max-time "${VAST_CURL_TIMEOUT}")
  if [[ "${VAST_TLS_INSECURE}" == "true" ]]; then
    args+=(-k)
  fi
  if [[ -n "${VAST_API_TOKEN:-}" ]]; then
    args+=(-H "Authorization: Bearer ${VAST_API_TOKEN}")
  elif [[ -n "${VAST_API_USERNAME:-}" && -n "${VAST_API_PASSWORD:-}" ]]; then
    args+=(-u "${VAST_API_USERNAME}:${VAST_API_PASSWORD}")
  fi
  printf '%s\n' "${args[@]}"
}

#######################################
# GET a VMS REST path and print the response body.
# A fixture named "api" + path with "/" replaced by "_" takes precedence
# (for example /api/clusters/ -> api_api_clusters_).
# Arguments: $1 - request path beginning with "/"
# Outputs:   response body on stdout
# Returns:   0 on success; curl's exit status otherwise
#######################################
vast_api_get() {
  local path="$1"
  local fixture
  local -a curl_args
  if fixture="$(_vast_fixture_path "api${path//\//_}")"; then
    cat "$fixture"
    return 0
  fi
  mapfile -t curl_args < <(_vast_curl_common_args)
  curl "${curl_args[@]}" -H "Accept: application/json" "${VAST_VMS_ENDPOINT}${path}"
}

#######################################
# GET a Prometheus exporter endpoint under /api/prometheusmetrics/.
# A fixture named "prometheus_" + endpoint (with "/" -> "_") takes precedence.
# Arguments: $1 - exporter endpoint name, e.g. "alarms"
# Outputs:   metrics text on stdout
# Returns:   0 on success; curl's exit status otherwise
#######################################
vast_prometheus_get() {
  local endpoint="$1"
  local fixture
  local -a curl_args
  if fixture="$(_vast_fixture_path "prometheus_${endpoint//\//_}")"; then
    cat "$fixture"
    return 0
  fi
  mapfile -t curl_args < <(_vast_curl_common_args)
  curl "${curl_args[@]}" "${VAST_VMS_ENDPOINT}/api/prometheusmetrics/${endpoint}"
}

#######################################
# Print the value of the first non-comment sample whose first field matches
# the given pattern (awk regex match, not an exact comparison).
# Arguments: $1 - metrics text
#            $2 - metric name / awk regex
# Outputs:   the numeric value, or nothing when no sample matches
#######################################
vast_prometheus_gauge() {
  local metrics_text="$1"
  local metric_name="$2"
  # A here-string avoids a broken pipe when awk exits after the first hit.
  awk -v name="$metric_name" '
    $0 !~ /^#/ && $1 ~ name {
      # Drop the label set so label values containing spaces cannot shift
      # the sample value out of $2 (sub on $0 re-splits the fields).
      sub(/\{.*\}/, "")
      val = $2
      gsub(/[^0-9.eE+-]/, "", val)
      if (val != "") { print val; exit }
    }
  ' <<<"$metrics_text" 2>/dev/null || echo ""
}

#######################################
# Sum the values of all non-comment samples whose first field matches a regex.
# Arguments: $1 - metrics text
#            $2 - awk regex applied to the first field
# Outputs:   the sum rounded with "%.0f" (0 when nothing matches)
#######################################
vast_prometheus_metric_sum() {
  local metrics_text="$1"
  local metric_regex="$2"
  awk -v re="$metric_regex" '
    $0 !~ /^#/ && $1 ~ re {
      sub(/\{.*\}/, "")   # same label handling as vast_prometheus_gauge
      val = $2
      gsub(/[^0-9.eE+-]/, "", val)
      if (val != "") sum += val
    }
    END { printf "%.0f", sum+0 }
  ' <<<"$metrics_text"
}

#######################################
# Select one cluster object by case-insensitive name (falls back to .title).
# Accepts a bare array, an object wrapping the list in .results or .clusters,
# or a single cluster object. Scalar input yields no output instead of a jq
# error.
# Arguments: $1 - clusters JSON
#            $2 - cluster name
# Outputs:   the first matching object as compact JSON, or nothing
#######################################
vast_find_cluster_json() {
  local clusters_json="$1"
  local cluster_name="$2"
  jq -c --arg name "$cluster_name" '
    (if type == "array" then .
     elif type == "object" then (.results // .clusters // [.])
     else [] end)
    | map(select(
        type == "object"
        and (((.name // .title // "") | tostring | ascii_downcase)
             == ($name | ascii_downcase))
      ))
    | .[0] // empty
  ' <<<"$clusters_json"
}

#######################################
# Append an issue object to a JSON array of issues.
# Arguments: $1 - current issues JSON array
#            $2 - title
#            $3 - details
#            $4 - severity (must be numeric; converted with tonumber)
#            $5 - next steps
# Outputs:   the extended array on stdout
#######################################
vast_append_issue() {
  local issues_json="$1"
  local title="$2"
  local details="$3"
  local severity="$4"
  local next_steps="$5"
  jq \
    --arg title "$title" \
    --arg details "$details" \
    --arg severity "$severity" \
    --arg next_steps "$next_steps" \
    '. += [{
      "title": $title,
      "details": $details,
      "severity": ($severity | tonumber),
      "next_steps": $next_steps
    }]' <<<"$issues_json"
}

#######################################
# Append a severity-4 "cannot access cluster" issue.
# Globals:   VAST_CLUSTER_NAME (read)
# Arguments: $1 - current issues JSON array
#            $2 - short context label shown in the title
#            $3 - error message placed in the details
# Outputs:   the extended array on stdout
#######################################
vast_api_error_issue() {
  local issues_json="$1"
  local context="$2"
  local err_msg="$3"
  vast_append_issue "$issues_json" \
    "Cannot Access VAST Cluster \`${VAST_CLUSTER_NAME}\` (${context})" \
    "VMS API call failed: ${err_msg}" \
    "4" \
    "Verify VAST_VMS_ENDPOINT, network connectivity, and vast_vms_credentials permissions"
}

#######################################
# Print an empty issues array.
#######################################
vast_init_issues() {
  echo '[]'
}