From adfdf628623316d0cc90873356e53aca1bd4b02a Mon Sep 17 00:00:00 2001
From: "rw-codebundle-agent[bot]"
 <rw-codebundle-agent[bot]@users.noreply.github.com>
Date: Thu, 25 Jun 2026 15:40:38 +0000
Subject: [PATCH] Add vast-cluster-health CodeBundle for VAST cluster
 monitoring.

Implements VMS REST and Prometheus-based health checks with SLI scoring,
generation rules, and mock fixture tests for issue #129.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../generation-rules/vast-cluster-health.yaml |  22 ++
 .../templates/vast-cluster-health-sli.yaml    |  48 +++
 .../templates/vast-cluster-health-slx.yaml    |  32 ++
 .../vast-cluster-health-taskset.yaml          |  41 +++
 .../vast-cluster-health/.test/README.md       |  16 +
 .../vast-cluster-health/.test/Taskfile.yaml   |  23 ++
 .../capacity_pressure/api_api_clusters_       |  15 +
 .../capacity_pressure/api_api_cnodes_         |   3 +
 .../capacity_pressure/api_api_dnodes_         |   3 +
 .../capacity_pressure/prometheus_alarms       |   2 +
 .../capacity_pressure/prometheus_vms_state    |   2 +
 .../.test/fixtures/degraded/api_api_clusters_ |  13 +
 .../.test/fixtures/degraded/api_api_cnodes_   |   3 +
 .../.test/fixtures/degraded/api_api_dnodes_   |   4 +
 .../.test/fixtures/degraded/prometheus_alarms |   2 +
 .../fixtures/degraded/prometheus_vms_state    |   2 +
 .../.test/fixtures/healthy/api_api_clusters_  |  15 +
 .../.test/fixtures/healthy/api_api_cnodes_    |   4 +
 .../.test/fixtures/healthy/api_api_dnodes_    |   4 +
 .../.test/fixtures/healthy/api_health_        |   1 +
 .../.test/fixtures/healthy/prometheus_alarms  |   2 +
 .../fixtures/healthy/prometheus_vms_state     |   2 +
 .../.test/run-mock-scenarios.sh               |  49 +++
 .../.test/validate-vast-bundle-structure.sh   |  32 ++
 codebundles/vast-cluster-health/README.md     |  83 +++++
 .../analyze-cluster-performance.sh            | 126 ++++++++
 .../check-cluster-capacity.sh                 | 102 ++++++
 .../check-degraded-components.sh              | 102 ++++++
 .../check-node-hardware-health.sh             |  81 +++++
 .../check-replication-status.sh               | 108 +++++++
 .../check-vms-cluster-health.sh               | 102 ++++++
 codebundles/vast-cluster-health/runbook.robot | 295 ++++++++++++++++++
 .../sli-vast-cluster-health-score.sh          |  90 ++++++
 codebundles/vast-cluster-health/sli.robot     | 128 ++++++++
 .../vast-cluster-health/vast-vms-common.sh    | 149 +++++++++
 35 files changed, 1706 insertions(+)
 create mode 100644 codebundles/vast-cluster-health/.runwhen/generation-rules/vast-cluster-health.yaml
 create mode 100644 codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-sli.yaml
 create mode 100644 codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-slx.yaml
 create mode 100644 codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-taskset.yaml
 create mode 100644 codebundles/vast-cluster-health/.test/README.md
 create mode 100644 codebundles/vast-cluster-health/.test/Taskfile.yaml
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_clusters_
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_cnodes_
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_dnodes_
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_alarms
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_vms_state
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_clusters_
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_cnodes_
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_dnodes_
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_alarms
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_vms_state
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_clusters_
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_cnodes_
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_dnodes_
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/healthy/api_health_
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_alarms
 create mode 100644 codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_vms_state
 create mode 100755 codebundles/vast-cluster-health/.test/run-mock-scenarios.sh
 create mode 100755 codebundles/vast-cluster-health/.test/validate-vast-bundle-structure.sh
 create mode 100644 codebundles/vast-cluster-health/README.md
 create mode 100755 codebundles/vast-cluster-health/analyze-cluster-performance.sh
 create mode 100755 codebundles/vast-cluster-health/check-cluster-capacity.sh
 create mode 100755 codebundles/vast-cluster-health/check-degraded-components.sh
 create mode 100755 codebundles/vast-cluster-health/check-node-hardware-health.sh
 create mode 100755 codebundles/vast-cluster-health/check-replication-status.sh
 create mode 100755 codebundles/vast-cluster-health/check-vms-cluster-health.sh
 create mode 100644 codebundles/vast-cluster-health/runbook.robot
 create mode 100755 codebundles/vast-cluster-health/sli-vast-cluster-health-score.sh
 create mode 100644 codebundles/vast-cluster-health/sli.robot
 create mode 100755 codebundles/vast-cluster-health/vast-vms-common.sh

diff --git a/codebundles/vast-cluster-health/.runwhen/generation-rules/vast-cluster-health.yaml b/codebundles/vast-cluster-health/.runwhen/generation-rules/vast-cluster-health.yaml
new file mode 100644
index 00000000..d4d34202
--- /dev/null
+++ b/codebundles/vast-cluster-health/.runwhen/generation-rules/vast-cluster-health.yaml
@@ -0,0 +1,22 @@
+apiVersion: runwhen.com/v1
+kind: GenerationRules
+spec:
+  platform: vast_data
+  generationRules:
+    - resourceTypes:
+        - vast_data_cluster
+      matchRules:
+        - type: pattern
+          pattern: ".+"
+          properties: ["name"]
+          mode: substring
+      slxs:
+        - baseName: vast-cluster-health
+          qualifiers: ["vast_cluster_name", "vast_vms_endpoint"]
+          baseTemplateName: vast-cluster-health
+          levelOfDetail: basic
+          outputItems:
+            - type: slx
+            - type: sli
+            - type: runbook
+              templateName: vast-cluster-health-taskset.yaml
diff --git a/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-sli.yaml b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-sli.yaml
new file mode 100644
index 00000000..3113faa6
--- /dev/null
+++ b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-sli.yaml
@@ -0,0 +1,48 @@
+apiVersion: runwhen.com/v1
+kind: ServiceLevelIndicator
+metadata:
+  name: {{ slx_name }}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  displayUnitsLong: OK
+  displayUnitsShort: ok
+  locations:
+    - {{ default_location }}
+  description: Lightweight VAST cluster health score for {{ match_resource.name }} from VMS state, capacity, nodes, alarms, and replication.
+  codeBundle:
+    {% if repo_url %}
+    repoUrl: {{ repo_url }}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ ref }}
+    {% else %}
+    ref: main
+    {% endif %}
+    pathToRobot: codebundles/vast-cluster-health/sli.robot
+  intervalStrategy: intermezzo
+  intervalSeconds: 300
+  configProvided:
+    - name: VAST_VMS_ENDPOINT
+      value: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint) }}"
+    - name: VAST_CLUSTER_NAME
+      value: "{{ match_resource.name }}"
+    - name: CAPACITY_THRESHOLD
+      value: "{{ custom.capacity_threshold | default('85') }}"
+    - name: CRITICAL_CAPACITY_THRESHOLD
+      value: "{{ custom.critical_capacity_threshold | default('95') }}"
+  secretsProvided:
+  {% if wb_version %}
+    {% include "vast_data-auth.yaml" ignore missing %}
+  {% else %}
+    - name: vast_vms_credentials
+      workspaceKey: AUTH DETAILS NOT FOUND
+  {% endif %}
+  alertConfig:
+    tasks:
+      persona: eager-edgar
+      sessionTTL: 10m
diff --git a/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-slx.yaml b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-slx.yaml
new file mode 100644
index 00000000..c2845132
--- /dev/null
+++ b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-slx.yaml
@@ -0,0 +1,32 @@
+apiVersion: runwhen.com/v1
+kind: ServiceLevelX
+metadata:
+  name: {{ slx_name }}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/storage/storage.svg
+  alias: {{ match_resource.name }} VAST Cluster Health
+  asMeasuredBy: Composite 0-1 score from VMS state, capacity headroom, node health, alarms, and replication.
+  configProvided:
+    - name: SLX_PLACEHOLDER
+      value: SLX_PLACEHOLDER
+  owners:
+    - {{ workspace.owner_email }}
+  statement: VAST cluster {{ match_resource.name }} should remain CLUSTERED with healthy nodes, capacity headroom, and no active alarms.
+  additionalContext:
+    {% include "vast_data-hierarchy.yaml" ignore missing %}
+    qualified_name: "{{ match_resource.qualified_name }}"
+    vast_vms_endpoint: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint | default('')) }}"
+  tags:
+    {% include "vast_data-tags.yaml" ignore missing %}
+    - name: cloud
+      value: on-prem
+    - name: service
+      value: vast_data
+    - name: scope
+      value: cluster
+    - name: access
+      value: read-only
diff --git a/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-taskset.yaml b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-taskset.yaml
new file mode 100644
index 00000000..510a48d4
--- /dev/null
+++ b/codebundles/vast-cluster-health/.runwhen/templates/vast-cluster-health-taskset.yaml
@@ -0,0 +1,41 @@
+apiVersion: runwhen.com/v1
+kind: Runbook
+metadata:
+  name: {{ slx_name }}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  location: {{ default_location }}
+  description: Monitor VAST Data cluster-wide health via VMS REST and Prometheus metrics for {{ match_resource.name }}.
+  codeBundle:
+    {% if repo_url %}
+    repoUrl: {{ repo_url }}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ ref }}
+    {% else %}
+    ref: main
+    {% endif %}
+    pathToRobot: codebundles/vast-cluster-health/runbook.robot
+  configProvided:
+    - name: VAST_VMS_ENDPOINT
+      value: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint) }}"
+    - name: VAST_CLUSTER_NAME
+      value: "{{ match_resource.name }}"
+    - name: RESOURCES
+      value: "{{ custom.resources | default('All') }}"
+    - name: CAPACITY_THRESHOLD
+      value: "{{ custom.capacity_threshold | default('85') }}"
+    - name: CRITICAL_CAPACITY_THRESHOLD
+      value: "{{ custom.critical_capacity_threshold | default('95') }}"
+  secretsProvided:
+  {% if wb_version %}
+    {% include "vast_data-auth.yaml" ignore missing %}
+  {% else %}
+    - name: vast_vms_credentials
+      workspaceKey: AUTH DETAILS NOT FOUND
+  {% endif %}
diff --git a/codebundles/vast-cluster-health/.test/README.md b/codebundles/vast-cluster-health/.test/README.md
new file mode 100644
index 00000000..5d0a2556
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/README.md
@@ -0,0 +1,16 @@
+# Mock scenario fixtures for vast-cluster-health
+
+Static JSON/Prometheus fixtures used when `VAST_MOCK_FIXTURE_DIR` is set (see `run-mock-scenarios.sh`).
+
+| Scenario | Expected issues | Description |
+|----------|-----------------|-------------|
+| `healthy` | 0 | CLUSTERED state, capacity below threshold, all nodes healthy |
+| `degraded` | 2+ | DEGRADED vms_state with offline DNode and active alarm |
+| `capacity_pressure` | 1+ | Physical and logical capacity above `CAPACITY_THRESHOLD` (85%) with no hardware faults |
+
+Run:
+
+```bash
+cd .test
+task
+```
diff --git a/codebundles/vast-cluster-health/.test/Taskfile.yaml b/codebundles/vast-cluster-health/.test/Taskfile.yaml
new file mode 100644
index 00000000..bec8e895
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/Taskfile.yaml
@@ -0,0 +1,23 @@
+version: "3"
+
+tasks:
+  default:
+    desc: "Validate structure and run mock scenario tests"
+    cmds:
+      - task: validate-structure
+      - task: test-mock-scenarios
+
+  validate-structure:
+    desc: "Run static checks for required files"
+    cmds:
+      - ./validate-vast-bundle-structure.sh
+
+  test-mock-scenarios:
+    desc: "Run task scripts against fixture-backed mock VMS responses"
+    cmds:
+      - ./run-mock-scenarios.sh
+
+  clean:
+    desc: "Remove local test outputs (scripts write them in the bundle root, one level up)"
+    cmds:
+      - rm -f ../*_output.json ../*_report.txt ../*.err ../perf_analysis.json perf_analysis.json
diff --git a/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_clusters_ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_clusters_
new file mode 100644
index 00000000..bebed842
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_clusters_
@@ -0,0 +1,15 @@
+[
+  {
+    "id": 1,
+    "name": "vast-lab-cluster",
+    "title": "vast-lab-cluster",
+    "state": "ONLINE",
+    "enabled": true,
+    "physical_space_in_use_percent": 88.5,
+    "logical_space_in_use_percent": 91.2,
+    "physical_space_in_use_tb": 250.0,
+    "logical_space_in_use_tb": 230.0,
+    "auxiliary_space_in_use_percent": 45.0,
+    "replication_enabled": true
+  }
+]
diff --git a/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_cnodes_ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_cnodes_
new file mode 100644
index 00000000..8fcdaae3
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_cnodes_
@@ -0,0 +1,3 @@
+[
+  {"id": 1, "name": "cnode-1", "state": "ACTIVE"}
+]
diff --git a/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_dnodes_ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_dnodes_
new file mode 100644
index 00000000..3f1c63f8
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/api_api_dnodes_
@@ -0,0 +1,3 @@
+[
+  {"id": 1, "name": "dnode-1", "state": "ACTIVE"}
+]
diff --git a/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_alarms b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_alarms
new file mode 100644
index 00000000..76838b5f
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_alarms
@@ -0,0 +1,2 @@
+# TYPE vast_alarm_active gauge
+vast_alarm_active 0
diff --git a/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_vms_state b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_vms_state
new file mode 100644
index 00000000..289cb5a5
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/capacity_pressure/prometheus_vms_state
@@ -0,0 +1,2 @@
+# TYPE vms_state gauge
+vms_state 1
diff --git a/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_clusters_ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_clusters_
new file mode 100644
index 00000000..70f49ca9
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_clusters_
@@ -0,0 +1,13 @@
+[
+  {
+    "id": 1,
+    "name": "vast-lab-cluster",
+    "title": "vast-lab-cluster",
+    "state": "DEGRADED",
+    "enabled": true,
+    "physical_space_in_use_percent": 55.0,
+    "logical_space_in_use_percent": 52.0,
+    "auxiliary_space_in_use_percent": 20.0,
+    "replication_enabled": true
+  }
+]
diff --git a/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_cnodes_ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_cnodes_
new file mode 100644
index 00000000..8fcdaae3
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_cnodes_
@@ -0,0 +1,3 @@
+[
+  {"id": 1, "name": "cnode-1", "state": "ACTIVE"}
+]
diff --git a/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_dnodes_ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_dnodes_
new file mode 100644
index 00000000..d7df40f5
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/degraded/api_api_dnodes_
@@ -0,0 +1,4 @@
+[
+  {"id": 1, "name": "dnode-1", "state": "OFFLINE"},
+  {"id": 2, "name": "dnode-2", "state": "ACTIVE"}
+]
diff --git a/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_alarms b/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_alarms
new file mode 100644
index 00000000..901bd19f
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_alarms
@@ -0,0 +1,2 @@
+# TYPE vast_alarm_active gauge
+vast_alarm_active 1
diff --git a/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_vms_state b/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_vms_state
new file mode 100644
index 00000000..11bac434
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/degraded/prometheus_vms_state
@@ -0,0 +1,2 @@
+# TYPE vms_state gauge
+vms_state 0
diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_clusters_ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_clusters_
new file mode 100644
index 00000000..e9575a99
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_clusters_
@@ -0,0 +1,15 @@
+[
+  {
+    "id": 1,
+    "name": "vast-lab-cluster",
+    "title": "vast-lab-cluster",
+    "state": "ONLINE",
+    "enabled": true,
+    "physical_space_in_use_percent": 42.5,
+    "logical_space_in_use_percent": 38.0,
+    "physical_space_in_use_tb": 120.5,
+    "logical_space_in_use_tb": 95.2,
+    "auxiliary_space_in_use_percent": 12.0,
+    "replication_enabled": true
+  }
+]
diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_cnodes_ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_cnodes_
new file mode 100644
index 00000000..bc3a8cb4
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_cnodes_
@@ -0,0 +1,4 @@
+[
+  {"id": 1, "name": "cnode-1", "state": "ACTIVE"},
+  {"id": 2, "name": "cnode-2", "state": "ACTIVE"}
+]
diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_dnodes_ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_dnodes_
new file mode 100644
index 00000000..e55a8e51
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_api_dnodes_
@@ -0,0 +1,4 @@
+[
+  {"id": 1, "name": "dnode-1", "state": "ACTIVE"},
+  {"id": 2, "name": "dnode-2", "state": "ACTIVE"}
+]
diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/api_health_ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_health_
new file mode 100644
index 00000000..2350ea93
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/api_health_
@@ -0,0 +1 @@
+{"state": "CLUSTERED"}
diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_alarms b/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_alarms
new file mode 100644
index 00000000..76838b5f
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_alarms
@@ -0,0 +1,2 @@
+# TYPE vast_alarm_active gauge
+vast_alarm_active 0
diff --git a/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_vms_state b/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_vms_state
new file mode 100644
index 00000000..289cb5a5
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/fixtures/healthy/prometheus_vms_state
@@ -0,0 +1,2 @@
+# TYPE vms_state gauge
+vms_state 1
diff --git a/codebundles/vast-cluster-health/.test/run-mock-scenarios.sh b/codebundles/vast-cluster-health/.test/run-mock-scenarios.sh
new file mode 100755
index 00000000..b6527138
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/run-mock-scenarios.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+export VAST_VMS_ENDPOINT="https://vms.mock.local"
+export VAST_CLUSTER_NAME="vast-lab-cluster"
+export CAPACITY_THRESHOLD="85"
+export CRITICAL_CAPACITY_THRESHOLD="95"
+export VAST_VMS_CREDENTIALS_JSON='{"USERNAME":"admin","PASSWORD":"mock"}'
+
+run_scenario() {
+  local name="$1"
+  local fixture_dir="$ROOT/.test/fixtures/${name}"
+  local expected_min="${2:-0}"
+  local expected_max="${3:-999}"
+
+  echo "=== Scenario: ${name} ==="
+  export VAST_MOCK_FIXTURE_DIR="$fixture_dir"
+
+  rm -f *_output.json
+  ./check-vms-cluster-health.sh >/dev/null
+  ./check-cluster-capacity.sh >/dev/null
+  ./check-node-hardware-health.sh >/dev/null
+  ./check-degraded-components.sh >/dev/null
+  ./check-replication-status.sh >/dev/null
+
+  total_issues=0
+  for f in vms_cluster_health_output.json cluster_capacity_output.json node_hardware_health_output.json degraded_components_output.json replication_status_output.json; do
+    count="$(jq 'length' "$f")"
+    total_issues=$((total_issues + count))
+  done
+
+  echo "Total issues: ${total_issues} (expected between ${expected_min} and ${expected_max})"
+  if (( total_issues < expected_min || total_issues > expected_max )); then
+    echo "Scenario ${name} FAILED" >&2
+    exit 1
+  fi
+
+  sli_json="$(./sli-vast-cluster-health-score.sh)"
+  echo "SLI scores: ${sli_json}"
+}
+
+run_scenario healthy 0 0
+run_scenario degraded 2 10
+run_scenario capacity_pressure 1 3
+
+echo "All mock scenarios passed"
diff --git a/codebundles/vast-cluster-health/.test/validate-vast-bundle-structure.sh b/codebundles/vast-cluster-health/.test/validate-vast-bundle-structure.sh
new file mode 100755
index 00000000..d8b5b28b
--- /dev/null
+++ b/codebundles/vast-cluster-health/.test/validate-vast-bundle-structure.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Static check: every file the vast-cluster-health bundle requires must exist.
+# Prints the first missing path to stderr and exits 1; otherwise reports OK.
+set -euo pipefail
+
+cd "$(dirname "$0")/.."
+
+required=(
+  runbook.robot
+  sli.robot
+  README.md
+  vast-vms-common.sh
+  check-vms-cluster-health.sh
+  check-cluster-capacity.sh
+  check-node-hardware-health.sh
+  check-degraded-components.sh
+  analyze-cluster-performance.sh
+  check-replication-status.sh
+  sli-vast-cluster-health-score.sh
+  .runwhen/generation-rules/vast-cluster-health.yaml
+  .runwhen/templates/vast-cluster-health-slx.yaml
+  .runwhen/templates/vast-cluster-health-taskset.yaml
+  .runwhen/templates/vast-cluster-health-sli.yaml
+)
+
+for path in "${required[@]}"; do
+  [[ -e "$path" ]] && continue
+  echo "missing: $path" >&2
+  exit 1
+done
+
+echo "vast-cluster-health structure OK"
diff --git a/codebundles/vast-cluster-health/README.md b/codebundles/vast-cluster-health/README.md
new file mode 100644
index 00000000..ec2769ae
--- /dev/null
+++ b/codebundles/vast-cluster-health/README.md
@@ -0,0 +1,83 @@
+# VAST Data Cluster Health
+
+Monitor VAST Data cluster-wide health via the VMS REST API and Prometheus exporter endpoints. Detects degraded cluster state, capacity exhaustion, hardware failures on CNodes/DNodes, and cluster-level performance bottlenecks that affect all tenants and clients (Kubernetes, NFS, block, S3).
+
+## Overview
+
+- **VMS cluster state**: Queries `/api/prometheusmetrics/vms_state`, `/health/`, and `/api/clusters/` for DEGRADED vs CLUSTERED/ONLINE state
+- **Capacity utilization**: Evaluates physical and logical capacity from cluster REST and Prometheus metrics against configurable thresholds
+- **Node hardware health**: Inspects CNode/DNode REST state and SSD/SCM indicators from `/api/prometheusmetrics/devices`
+- **Degraded components**: Surfaces active alarms, degraded boxes, and offline nodes
+- **Protocol performance**: Samples cluster-wide IOPS and latency metrics for NFS, block, and S3 from Prometheus exporters
+- **Replication and protection**: Checks replication streams, protection groups, and auxiliary/snapshot capacity pressure
+
+## Configuration
+
+### Required Variables
+
+- `VAST_VMS_ENDPOINT`: VMS REST API base URL (e.g. `https://vms.example.com`)
+- `VAST_CLUSTER_NAME`: VAST cluster display name for scoping and issue titles
+
+### Optional Variables
+
+- `RESOURCES`: Cluster name(s) or `All` for auto-discovery via VMS `/api/clusters/` (default: `All`)
+- `CAPACITY_THRESHOLD`: Physical/logical capacity utilization percent that triggers a warning issue (default: `85`)
+- `CRITICAL_CAPACITY_THRESHOLD`: Critical capacity threshold percent (default: `95`)
+
+### Secrets
+
+- `vast_vms_credentials`: VMS API authentication credentials as JSON:
+  - `USERNAME` and `PASSWORD` for basic auth, or
+  - `API_TOKEN` for bearer token auth (when supported by your VMS version)
+
+## Tasks Overview
+
+### Check VMS Cluster Health Status for Cluster
+
+Queries `/api/prometheusmetrics/vms_state` and VMS cluster status to detect DEGRADED (0) vs CLUSTERED (1) state and cluster-level health regressions.
+
+### Check Cluster Capacity Utilization for Cluster
+
+Evaluates physical and logical capacity utilization from `/api/clusters/` and Prometheus capacity metrics; raises issues when usage exceeds `CAPACITY_THRESHOLD` or `CRITICAL_CAPACITY_THRESHOLD`.
+
+### Check CNode and DNode Hardware Health for Cluster
+
+Inspects CNode/DNode state from REST APIs and SSD/SCM health from Prometheus `/api/prometheusmetrics/devices`.
+
+### Check Cluster Degraded Components and Active Alerts for Cluster
+
+Lists degraded boxes, offline nodes, and active VMS alarms from `/api/prometheusmetrics/alarms` and related REST endpoints.
+
+### Analyze Cluster Protocol Performance for Cluster
+
+Reviews cluster-wide IOPS and latency by storage protocol (NFS, block, S3) from Prometheus base metrics to detect IO stalls or abnormal drops.
+
+### Check Replication and Protection Group Status for Cluster
+
+Verifies replication links, protection groups, and snapshot/auxiliary capacity pressure from REST and `/api/prometheusmetrics/replications`.
+
+## SLI
+
+The bundled `sli.robot` produces a 0–1 health score from five binary dimensions:
+
+1. VMS clustered state
+2. Capacity headroom
+3. Node hardware health
+4. Active alarm clearance
+5. Replication health
+
+## Platform Notes
+
+- Prometheus metrics are scraped directly from VMS REST paths such as `/api/prometheusmetrics/vms_state` and `/api/prometheusmetrics/all` — no local Prometheus server is required.
+- Some endpoints (`/health/`, `/api/prometheusmetrics/replications`, `/api/protectiongroups/`) are unavailable on older VAST versions; tasks degrade gracefully and skip optional checks.
+- API reference: [Exporting Metrics to Prometheus](https://kb.vastdata.com/documentation/docs/exporting-metrics-to-prometheus)
+- VMS REST docs: `{VAST_VMS_ENDPOINT}/docs`
+
+## Testing
+
+Use mock fixtures under `.test/fixtures/` when a live VAST cluster is unavailable:
+
+```bash
+cd codebundles/vast-cluster-health/.test
+task
+```
diff --git a/codebundles/vast-cluster-health/analyze-cluster-performance.sh b/codebundles/vast-cluster-health/analyze-cluster-performance.sh
new file mode 100755
index 00000000..3330aa46
--- /dev/null
+++ b/codebundles/vast-cluster-health/analyze-cluster-performance.sh
@@ -0,0 +1,126 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# Reviews cluster-wide protocol performance (NFS, block, S3) for IO stalls.
+# Env: PERFORMANCE_DROP_THRESHOLD (percent, default 90) of MIN_BASELINE_IOPS
+#      (default 100) sets the low-IOPS floor; VAST_CLUSTER_NAME labels issues.
+# Writes the issue list to $OUTPUT_FILE and the text report to $REPORT_FILE.
+# -----------------------------------------------------------------------------
+
+OUTPUT_FILE="cluster_performance_output.json"
+REPORT_FILE="cluster_performance_report.txt"
+
+source "$(dirname "$0")/vast-vms-common.sh"
+
+PERFORMANCE_DROP_THRESHOLD="${PERFORMANCE_DROP_THRESHOLD:-90}"
+MIN_BASELINE_IOPS="${MIN_BASELINE_IOPS:-100}"
+
+issues_json="$(vast_init_issues)"
+report="Cluster protocol performance for \`${VAST_CLUSTER_NAME}\`\n"
+
+if ! _vast_load_credentials; then
+  issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")"
+  echo "$issues_json" > "$OUTPUT_FILE"
+  echo -e "$report" > "$REPORT_FILE"
+  exit 0
+fi
+
+# Scratch files go in a private temp dir rather than fixed names in the cwd.
+work_dir="$(mktemp -d)"
+metrics_file="${work_dir}/metrics.prom"
+analysis_file="${work_dir}/perf_analysis.json"
+
+# Prefer the "basic_no_views" exporter; fall back to the default endpoint.
+if ! metrics_text="$(vast_prometheus_get "basic_no_views" 2>"${work_dir}/metrics.err" || vast_prometheus_get "" 2>>"${work_dir}/metrics.err")"; then
+  err_msg="$(cat "${work_dir}/metrics.err" 2>/dev/null || echo unknown)"
+  report+="Warning: performance metrics unavailable: ${err_msg}\n"
+  rm -rf -- "$work_dir"
+  echo "$issues_json" > "$OUTPUT_FILE"
+  echo -e "$report" > "$REPORT_FILE"
+  echo -e "$report"
+  exit 0
+fi
+
+# Pass the scrape to Python as a file: a large argv string can hit the exec limit.
+printf '%s\n' "$metrics_text" > "$metrics_file"
+python3 - <<'PY' "$metrics_file" "$PERFORMANCE_DROP_THRESHOLD" "$MIN_BASELINE_IOPS" "$VAST_CLUSTER_NAME" > "$analysis_file"
+import json, pathlib, re, sys
+
+metrics_path, cluster = sys.argv[1], sys.argv[4]
+drop_threshold, min_baseline = float(sys.argv[2]), float(sys.argv[3])
+metrics_text = pathlib.Path(metrics_path).read_text(encoding="utf-8", errors="replace")
+
+protocol_patterns = {
+    "NFS": re.compile(r"nfs.*iops", re.I),
+    "Block": re.compile(r"block.*iops", re.I),
+    "S3": re.compile(r"s3.*iops", re.I),
+}
+
+# Single pass over the exposition text; only "<name> <number>" samples count.
+values = {}
+latency_hits = []
+for line in metrics_text.splitlines():
+    if line.startswith("#"):
+        continue
+    parts = line.split()
+    if len(parts) < 2:
+        continue
+    try:
+        name, num = parts[0], float(parts[1])
+    except ValueError:
+        continue
+    for proto, pat in protocol_patterns.items():
+        if pat.search(name):
+            values.setdefault(proto, []).append(num)
+    if num > 100 and re.search(r"latency", line, re.I):  # ms threshold for cluster-wide latency gauges
+        latency_hits.append(f"{name}={num}")
+
+issues = []
+report_lines = []
+iops_floor = min_baseline * (drop_threshold / 100.0)
+for proto, nums in values.items():
+    total = sum(nums)
+    report_lines.append(f"{proto} aggregate IOPS sample total={total:.0f} from {len(nums)} metric(s)")
+    # A total of 0 is treated as an unused protocol rather than a stall.
+    if 0 < total < iops_floor:
+        issues.append({
+            "title": f"Abnormally Low {proto} IOPS on VAST Cluster `{cluster}`",
+            "details": f"{proto} aggregate IOPS ({total:.0f}) is below {drop_threshold}% of baseline threshold ({min_baseline:.0f}).",
+            "severity": 3,
+            "next_steps": f"Check {proto} client connectivity, VIP health, and recent cluster events; compare with historical dashboards",
+        })
+
+if latency_hits:
+    issues.append({
+        "title": f"Elevated Cluster Protocol Latency on VAST Cluster `{cluster}`",
+        "details": "High latency metrics detected:\n" + "\n".join(latency_hits[:10]),
+        "severity": 3,
+        "next_steps": "Inspect network path, DNode load, and QoS policies; correlate with tenant-level metrics",
+    })
+    report_lines.append(f"High latency metrics: {len(latency_hits)}")
+
+if not values and not latency_hits:
+    report_lines.append("No recognizable protocol IOPS/latency metrics in exporter output (graceful skip).")
+
+print(json.dumps({"issues": issues, "report": report_lines}))
+PY
+
+while IFS= read -r line; do
+  report+="${line}\n"
+done < <(jq -r '.report[]' "$analysis_file")
+
+while IFS= read -r issue; do
+  [[ -z "$issue" || "$issue" == "null" ]] && continue
+  title="$(echo "$issue" | jq -r '.title')"
+  details="$(echo "$issue" | jq -r '.details')"
+  severity="$(echo "$issue" | jq -r '.severity')"
+  next_steps="$(echo "$issue" | jq -r '.next_steps')"
+  issues_json="$(vast_append_issue "$issues_json" "$title" "$details" "$severity" "$next_steps")"
+done < <(jq -c '.issues[]?' "$analysis_file" 2>/dev/null || true)
+
+rm -rf -- "$work_dir"
+echo "$issues_json" > "$OUTPUT_FILE"
+echo -e "$report" > "$REPORT_FILE"
+echo -e "$report"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
diff --git a/codebundles/vast-cluster-health/check-cluster-capacity.sh b/codebundles/vast-cluster-health/check-cluster-capacity.sh
new file mode 100755
index 00000000..27acf977
--- /dev/null
+++ b/codebundles/vast-cluster-health/check-cluster-capacity.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# Evaluates physical and logical capacity utilization for the scoped cluster.
+# -----------------------------------------------------------------------------
+
+OUTPUT_FILE="cluster_capacity_output.json"
+REPORT_FILE="cluster_capacity_report.txt"
+
+source "$(dirname "$0")/vast-vms-common.sh"
+
+issues_json="$(vast_init_issues)"
+report="Capacity utilization for \`${VAST_CLUSTER_NAME}\` (threshold=${CAPACITY_THRESHOLD}%, critical=${CRITICAL_CAPACITY_THRESHOLD}%)\n"
+
+if ! _vast_load_credentials; then
+  issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")"
+  echo "$issues_json" > "$OUTPUT_FILE"
+  echo -e "$report" > "$REPORT_FILE"
+  exit 0
+fi
+
+physical_pct=""
+logical_pct=""
+
+if clusters_json="$(vast_api_get "/api/clusters/" 2>clusters.err)"; then
+  cluster_obj="$(vast_find_cluster_json "$clusters_json" "$VAST_CLUSTER_NAME")"
+  if [[ -n "$cluster_obj" ]]; then
+    physical_pct="$(echo "$cluster_obj" | jq -r '.physical_space_in_use_percent // empty')"
+    logical_pct="$(echo "$cluster_obj" | jq -r '.logical_space_in_use_percent // empty')"
+    physical_tb="$(echo "$cluster_obj" | jq -r '.physical_space_in_use_tb // .physical_space_in_use // "n/a"')"
+    logical_tb="$(echo "$cluster_obj" | jq -r '.logical_space_in_use_tb // .logical_space_in_use // "n/a"')"
+    report+="REST capacity: physical=${physical_pct}% (${physical_tb} TB in use), logical=${logical_pct}% (${logical_tb} TB in use)\n"
+  fi
+else
+  err_msg="$(cat clusters.err 2>/dev/null || echo unknown)"
+  issues_json="$(vast_api_error_issue "$issues_json" "cluster capacity" "$err_msg")"
+fi
+rm -f clusters.err
+
+if [[ -z "$physical_pct" || -z "$logical_pct" ]]; then
+  if metrics_text="$(vast_prometheus_get "basic_no_views" 2>metrics.err || vast_prometheus_get "" 2>metrics.err)"; then
+    physical_pct="$(vast_prometheus_gauge "$metrics_text" "physical_space_in_use_percent")"
+    logical_pct="$(vast_prometheus_gauge "$metrics_text" "logical_space_in_use_percent")"
+    if [[ -z "$physical_pct" ]]; then
+      physical_used="$(vast_prometheus_gauge "$metrics_text" "physical_space_in_use")"
+      physical_total="$(vast_prometheus_gauge "$metrics_text" "physical_space")"
+      if [[ -n "$physical_used" && -n "$physical_total" && "$physical_total" != "0" ]]; then
+        physical_pct="$(python3 - <<PY
+used=float("${physical_used}")
+total=float("${physical_total}")
+print(round(100.0 * used / total, 2))
+PY
+)"
+      fi
+    fi
+    report+="Prometheus fallback: physical=${physical_pct:-n/a}%, logical=${logical_pct:-n/a}%\n"
+  else
+    report+="Warning: could not fetch capacity from REST or Prometheus.\n"
+  fi
+  rm -f metrics.err
+fi
+
+for kind in physical logical; do
+  if [[ "$kind" == "physical" ]]; then
+    pct="$physical_pct"
+  else
+    pct="$logical_pct"
+  fi
+  [[ -z "$pct" || "$pct" == "null" ]] && continue
+  report+="${kind} utilization: ${pct}%\n"
+  cmp_critical="$(python3 - <<PY
+pct=float("${pct}")
+crit=float("${CRITICAL_CAPACITY_THRESHOLD}")
+warn=float("${CAPACITY_THRESHOLD}")
+if pct >= crit:
+    print("critical")
+elif pct >= warn:
+    print("warning")
+else:
+    print("ok")
+PY
+)"
+  if [[ "$cmp_critical" == "critical" ]]; then
+    issues_json="$(vast_append_issue "$issues_json" \
+      "Critical ${kind^} Capacity for VAST Cluster \`${VAST_CLUSTER_NAME}\`" \
+      "${kind^} capacity utilization is ${pct}% (critical threshold ${CRITICAL_CAPACITY_THRESHOLD}%)." \
+      "2" \
+      "Expedite capacity expansion, delete stale snapshots, or rebalance tenants before writes fail")"
+  elif [[ "$cmp_critical" == "warning" ]]; then
+    issues_json="$(vast_append_issue "$issues_json" \
+      "Elevated ${kind^} Capacity for VAST Cluster \`${VAST_CLUSTER_NAME}\`" \
+      "${kind^} capacity utilization is ${pct}% (warning threshold ${CAPACITY_THRESHOLD}%)." \
+      "3" \
+      "Plan capacity expansion and review snapshot/retention policies")"
+  fi
+done
+
+echo "$issues_json" > "$OUTPUT_FILE"
+echo -e "$report" > "$REPORT_FILE"
+echo -e "$report"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
diff --git a/codebundles/vast-cluster-health/check-degraded-components.sh b/codebundles/vast-cluster-health/check-degraded-components.sh
new file mode 100755
index 00000000..b3562ca2
--- /dev/null
+++ b/codebundles/vast-cluster-health/check-degraded-components.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# Lists degraded boxes, failed drives, offline nodes, and active VMS alarms.
+#
+# Writes a JSON issue list to OUTPUT_FILE and a text report to REPORT_FILE.
+# -----------------------------------------------------------------------------
+
+OUTPUT_FILE="degraded_components_output.json"
+REPORT_FILE="degraded_components_report.txt"
+
+source "$(dirname "$0")/vast-vms-common.sh"
+
+issues_json="$(vast_init_issues)"
+alarm_count=0
+degraded_boxes=0
+report="Degraded components and alerts for \`${VAST_CLUSTER_NAME}\`\n"
+
+if ! _vast_load_credentials; then
+  issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")"
+  echo "$issues_json" > "$OUTPUT_FILE"
+  echo -e "$report" > "$REPORT_FILE"
+  exit 0
+fi
+
+if alarms_text="$(vast_prometheus_get "alarms" 2>alarms.err)"; then
+  # Keep sample lines whose value field is non-zero. The 30-line cap is applied
+  # inside awk: piping into head can kill awk with SIGPIPE, which pipefail and
+  # set -e would turn into a script abort.
+  active_alarms="$(echo "$alarms_text" | awk '
+    $0 !~ /^#/ && NF >= 2 && $2 != "0" && $2 != "0.0" && shown < 30 {
+      print $0
+      shown++
+    }
+  ')"
+  alarm_count="$(echo "$active_alarms" | grep -c . || true)"
+  report+="Active alarm metrics lines: ${alarm_count}\n"
+  if [[ "$alarm_count" -gt 0 ]]; then
+    issues_json="$(vast_append_issue "$issues_json" \
+      "Active VMS Alarms on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \
+      "Prometheus alarms exporter reports ${alarm_count} active alarm metric(s):\n${active_alarms}" \
+      "1" \
+      "Review Alarms panel in VMS and remediate highest-severity items first")"
+  fi
+else
+  report+="Note: /api/prometheusmetrics/alarms unavailable; skipping alarm scrape.\n"
+fi
+rm -f alarms.err
+
+# stderr of the REST probes below is discarded: it was never read, and fixed
+# file names under /tmp collide between concurrent runs.
+for path in "/api/boxes/" "/api/dboxes/"; do
+  if boxes_json="$(vast_api_get "$path" 2>/dev/null)"; then
+    degraded="$(echo "$boxes_json" | jq -r '
+      (if type == "array" then . elif .results then .results else [.] end)
+      | map(select((.state // .status // "ONLINE") | ascii_upcase | test("DEGRADED|FAILED|OFFLINE|ERROR")))
+      | map("\(.name // .title // .id // "box") state=\(.state // .status)")
+      | .[]
+    ' 2>/dev/null || true)"
+    if [[ -n "$degraded" ]]; then
+      while IFS= read -r line; do
+        [[ -z "$line" ]] && continue
+        issues_json="$(vast_append_issue "$issues_json" \
+          "Degraded Box on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \
+          "Box from ${path} reports: ${line}" \
+          "1" \
+          "Inspect box hardware in VMS and engage VAST support if state persists")"
+        report+="Degraded box: ${line}\n"
+        degraded_boxes=$((degraded_boxes + 1))
+      done <<< "$degraded"
+    fi
+  fi
+done
+
+offline_nodes=0
+for entry in "CNode:/api/cnodes/" "DNode:/api/dnodes/"; do
+  label="${entry%%:*}"
+  api_path="${entry#*:}"
+  if nodes_json="$(vast_api_get "$api_path" 2>/dev/null)"; then
+    count="$(echo "$nodes_json" | jq '
+      (if type == "array" then . elif .results then .results else [.] end)
+      | map(select((.state // .status // "ACTIVE") | ascii_upcase | test("OFFLINE|FAILED|INACTIVE|DISABLED|ERROR")))
+      | length
+    ' 2>/dev/null || echo 0)"
+    offline_nodes=$((offline_nodes + count))
+    if [[ "$count" -gt 0 ]]; then
+      sample="$(echo "$nodes_json" | jq -r '
+        (if type == "array" then . elif .results then .results else [.] end)
+        | map(select((.state // .status // "ACTIVE") | ascii_upcase | test("OFFLINE|FAILED|INACTIVE|DISABLED|ERROR")))
+        | .[0] | "\(.name // .hostname // .id // "node") state=\(.state // .status)"
+      ' 2>/dev/null || echo unknown)"
+      issues_json="$(vast_append_issue "$issues_json" \
+        "Offline ${label}(s) on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \
+        "${count} ${label}(s) offline or failed (example: ${sample}). Partial cluster failure may impact all tenants." \
+        "1" \
+        "Restore offline nodes or replace failed hardware; verify cluster quorum in VMS")"
+      report+="${count} offline ${label}(s).\n"
+    fi
+  fi
+done
+
+# Only claim a clean result when every category checked above came back empty.
+if [[ "$offline_nodes" -eq 0 && "$alarm_count" -eq 0 && "$degraded_boxes" -eq 0 ]]; then
+  report+="No degraded boxes, offline nodes, or active alarms detected.\n"
+fi
+
+echo "$issues_json" > "$OUTPUT_FILE"
+echo -e "$report" > "$REPORT_FILE"
+echo -e "$report"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
diff --git a/codebundles/vast-cluster-health/check-node-hardware-health.sh b/codebundles/vast-cluster-health/check-node-hardware-health.sh
new file mode 100755
index 00000000..dffb5df1
--- /dev/null
+++ b/codebundles/vast-cluster-health/check-node-hardware-health.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# Inspects CNode/DNode and SSD/SCM hardware health from REST and Prometheus.
+#
+# Writes a JSON issue list to OUTPUT_FILE and a text report to REPORT_FILE.
+# -----------------------------------------------------------------------------
+
+OUTPUT_FILE="node_hardware_health_output.json"
+REPORT_FILE="node_hardware_health_report.txt"
+
+source "$(dirname "$0")/vast-vms-common.sh"
+
+issues_json="$(vast_init_issues)"
+report="Node hardware health for \`${VAST_CLUSTER_NAME}\`\n"
+
+if ! _vast_load_credentials; then
+  issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")"
+  echo "$issues_json" > "$OUTPUT_FILE"
+  echo -e "$report" > "$REPORT_FILE"
+  exit 0
+fi
+
+# Queries one node REST endpoint and records an issue per unhealthy node.
+# Globals:   issues_json, report (appended to)
+# Arguments: $1 - node kind used in messages (CNode/DNode); $2 - REST path
+# Returns:   0 always; an unreachable endpoint only adds a report warning
+check_nodes() {
+  local kind="$1"
+  local path="$2"
+  local nodes_json=""
+  local bad count line
+  # stderr is discarded: it was never read, and /tmp/<kind>.err was a
+  # predictable shared path.
+  if ! nodes_json="$(vast_api_get "$path" 2>/dev/null)"; then
+    report+="Warning: ${kind} REST API unavailable.\n"
+    return 0
+  fi
+  bad="$(echo "$nodes_json" | jq -r '
+    (if type == "array" then . elif .results then .results else [.] end)
+    | map(select((.state // .status // "ACTIVE") | ascii_upcase | test("OFFLINE|FAILED|INACTIVE|DISABLED|ERROR")))
+    | map("\(.name // .hostname // .id // "unknown") state=\(.state // .status)")
+    | .[]
+  ' 2>/dev/null || true)"
+  if [[ -n "$bad" ]]; then
+    while IFS= read -r line; do
+      [[ -z "$line" ]] && continue
+      issues_json="$(vast_append_issue "$issues_json" \
+        "Unhealthy ${kind} on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \
+        "${kind} reports unhealthy state: ${line}" \
+        "2" \
+        "Inspect ${kind} in VMS, verify hardware LEDs/cabling, and follow VAST support guidance for replacement")"
+      report+="Unhealthy ${kind}: ${line}\n"
+    done <<< "$bad"
+  else
+    count="$(echo "$nodes_json" | jq 'if type == "array" then length elif .results then (.results|length) else 1 end' 2>/dev/null || echo 0)"
+    report+="All ${count} ${kind}(s) appear healthy via REST.\n"
+  fi
+}
+
+check_nodes "CNode" "/api/cnodes/"
+check_nodes "DNode" "/api/dnodes/"
+
+if devices_text="$(vast_prometheus_get "devices" 2>devices.err)"; then
+  # Flag state/status samples that carry a failure keyword or whose value is
+  # exactly zero. Matching on a bare trailing "0" also caught healthy values
+  # such as 10 or 100. The 20-line cap lives in awk so head cannot SIGPIPE it.
+  failed_devices="$(echo "$devices_text" | awk '
+    $0 !~ /^#/ && ($0 ~ /state/ || $0 ~ /status/) &&
+    ($0 ~ /failed|error|offline|inactive/ || $NF ~ /^0(\.0+)?$/) && shown < 20 {
+      print $0
+      shown++
+    }
+  ')"
+  if [[ -n "$failed_devices" ]]; then
+    issues_json="$(vast_append_issue "$issues_json" \
+      "SSD/SCM Hardware Faults on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \
+      "Prometheus devices metrics indicate failed or unhealthy media:\n${failed_devices}" \
+      "2" \
+      "Review DBox device status in VMS and replace failed SSD/SCM modules")"
+    report+="Device metric faults detected (see issues).\n"
+  else
+    report+="No failed SSD/SCM indicators in /api/prometheusmetrics/devices.\n"
+  fi
+else
+  report+="Note: /api/prometheusmetrics/devices unavailable on this VAST version.\n"
+fi
+rm -f devices.err
+
+echo "$issues_json" > "$OUTPUT_FILE"
+echo -e "$report" > "$REPORT_FILE"
+echo -e "$report"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
diff --git a/codebundles/vast-cluster-health/check-replication-status.sh b/codebundles/vast-cluster-health/check-replication-status.sh
new file mode 100755
index 00000000..a187de08
--- /dev/null
+++ b/codebundles/vast-cluster-health/check-replication-status.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# Verifies replication links, protection groups, and snapshot pressure signals.
+#
+# Writes a JSON issue list to OUTPUT_FILE and a text report to REPORT_FILE.
+# -----------------------------------------------------------------------------
+
+OUTPUT_FILE="replication_status_output.json"
+REPORT_FILE="replication_status_report.txt"
+
+source "$(dirname "$0")/vast-vms-common.sh"
+
+issues_json="$(vast_init_issues)"
+report="Replication and protection status for \`${VAST_CLUSTER_NAME}\`\n"
+
+if ! _vast_load_credentials; then
+  issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")"
+  echo "$issues_json" > "$OUTPUT_FILE"
+  echo -e "$report" > "$REPORT_FILE"
+  exit 0
+fi
+
+if clusters_json="$(vast_api_get "/api/clusters/" 2>clusters.err)"; then
+  cluster_obj="$(vast_find_cluster_json "$clusters_json" "$VAST_CLUSTER_NAME")"
+  if [[ -n "$cluster_obj" ]]; then
+    # jq's // operator treats false like null, so an explicit false would be
+    # skipped by ".replication_enabled // ..."; test for null instead.
+    repl_enabled="$(echo "$cluster_obj" | jq -r '
+      if .replication_enabled != null then .replication_enabled
+      elif .replication != null then .replication
+      else empty end')"
+    aux_pct="$(echo "$cluster_obj" | jq -r '.auxiliary_space_in_use_percent // empty')"
+    if [[ "$repl_enabled" == "false" ]]; then
+      report+="Cluster replication_enabled=false (informational).\n"
+    fi
+    if [[ -n "$aux_pct" && "$aux_pct" != "null" ]]; then
+      report+="Auxiliary/snapshot space in use: ${aux_pct}%\n"
+      # Values go in as argv (not spliced into the Python source); a
+      # non-numeric value yields "invalid" instead of aborting under set -e.
+      aux_cmp="$(python3 - "$aux_pct" "$CRITICAL_CAPACITY_THRESHOLD" "$CAPACITY_THRESHOLD" <<'PY'
+import sys
+try:
+    pct, crit, warn = (float(v) for v in sys.argv[1:4])
+except ValueError:
+    print("invalid")
+    sys.exit(0)
+print("high" if pct >= crit else "warn" if pct >= warn else "ok")
+PY
+)"
+      if [[ "$aux_cmp" == "high" ]]; then
+        issues_json="$(vast_append_issue "$issues_json" \
+          "High Snapshot/Auxiliary Capacity on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \
+          "Auxiliary space (snapshots/replication metadata) is ${aux_pct}% of capacity." \
+          "2" \
+          "Review snapshot retention, protection policies, and replication backlog")"
+      elif [[ "$aux_cmp" == "warn" ]]; then
+        issues_json="$(vast_append_issue "$issues_json" \
+          "Elevated Snapshot/Auxiliary Capacity on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \
+          "Auxiliary space is ${aux_pct}% (warning threshold ${CAPACITY_THRESHOLD}%)." \
+          "3" \
+          "Audit protection groups and snapshot schedules for capacity pressure")"
+      elif [[ "$aux_cmp" == "invalid" ]]; then
+        report+="Warning: auxiliary space value '${aux_pct}' is not numeric; skipped.\n"
+      fi
+    fi
+  fi
+else
+  err_msg="$(cat clusters.err 2>/dev/null || echo unknown)"
+  issues_json="$(vast_api_error_issue "$issues_json" "replication status" "$err_msg")"
+fi
+rm -f clusters.err
+
+if repl_text="$(vast_prometheus_get "replications" 2>repl.err)"; then
+  # awk has no /regex/i flag: "/re/i" concatenates the match result with an
+  # empty variable, so the keyword test is done on a lower-cased copy instead.
+  # A value counts as zero only when the last field is exactly 0 (or 0.0), and
+  # the 20-line cap lives in awk so head cannot SIGPIPE it under pipefail.
+  unhealthy="$(echo "$repl_text" | awk '
+    $0 !~ /^#/ && ($0 ~ /state|status|lag|behind|failed|error/ || $0 ~ /replication/) &&
+    ($NF ~ /^0(\.0+)?$/ || tolower($0) ~ /failed|error|lag|behind|stalled/) && shown < 20 {
+      print $0
+      shown++
+    }
+  ')"
+  if [[ -n "$unhealthy" ]]; then
+    issues_json="$(vast_append_issue "$issues_json" \
+      "Replication Stream Issues on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \
+      "Prometheus replications metrics indicate unhealthy streams:\n${unhealthy}" \
+      "2" \
+      "Verify replication peer connectivity, bandwidth limits, and protection group health in VMS")"
+    report+="Unhealthy replication metric samples detected.\n"
+  else
+    report+="Replication Prometheus metrics show no obvious failures.\n"
+  fi
+else
+  report+="Note: /api/prometheusmetrics/replications unavailable (requires VAST 5.2-sp10+).\n"
+fi
+rm -f repl.err
+
+if pg_json="$(vast_api_get "/api/protectiongroups/" 2>pg.err)"; then
+  bad_pg="$(echo "$pg_json" | jq -r '
+    (if type == "array" then . elif .results then .results else [.] end)
+    | map(select((.state // .status // "OK") | ascii_upcase | test("FAILED|ERROR|DEGRADED|OFFLINE")))
+    | map("\(.name // .title // .id // "pg") state=\(.state // .status)")
+    | .[]
+  ' 2>/dev/null || true)"
+  if [[ -n "$bad_pg" ]]; then
+    while IFS= read -r line; do
+      [[ -z "$line" ]] && continue
+      issues_json="$(vast_append_issue "$issues_json" \
+        "Protection Group Issue on VAST Cluster \`${VAST_CLUSTER_NAME}\`" \
+        "Protection group reports: ${line}" \
+        "2" \
+        "Review protection group configuration and replication targets in VMS")"
+      report+="Protection group issue: ${line}\n"
+    done <<< "$bad_pg"
+  else
+    report+="Protection groups REST check: no failed groups found.\n"
+  fi
+else
+  report+="Note: /api/protectiongroups/ unavailable on this VAST version.\n"
+fi
+rm -f pg.err
+
+echo "$issues_json" > "$OUTPUT_FILE"
+echo -e "$report" > "$REPORT_FILE"
+echo -e "$report"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
diff --git a/codebundles/vast-cluster-health/check-vms-cluster-health.sh b/codebundles/vast-cluster-health/check-vms-cluster-health.sh
new file mode 100755
index 00000000..2b04303b
--- /dev/null
+++ b/codebundles/vast-cluster-health/check-vms-cluster-health.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# REQUIRED ENV VARS:
+#   VAST_VMS_ENDPOINT, VAST_CLUSTER_NAME
+# OPTIONAL:
+#   CAPACITY_THRESHOLD, CRITICAL_CAPACITY_THRESHOLD
+#   VAST_VMS_CREDENTIALS_FILE / VAST_VMS_CREDENTIALS_JSON
+#
+# Queries /api/prometheusmetrics/vms_state and cluster REST status.
+# Writes a JSON issue list to OUTPUT_FILE and a text report to REPORT_FILE.
+# -----------------------------------------------------------------------------
+
+OUTPUT_FILE="vms_cluster_health_output.json"
+REPORT_FILE="vms_cluster_health_report.txt"
+
+source "$(dirname "$0")/vast-vms-common.sh"
+
+issues_json="$(vast_init_issues)"
+report="VMS cluster health check for \`${VAST_CLUSTER_NAME}\` at ${VAST_VMS_ENDPOINT}\n"
+
+if ! _vast_load_credentials; then
+  issues_json="$(vast_api_error_issue "$issues_json" "credentials" "missing vast_vms_credentials")"
+  echo "$issues_json" > "$OUTPUT_FILE"
+  echo -e "$report" > "$REPORT_FILE"
+  exit 0
+fi
+
+vms_state_text=""
+if vms_state_text="$(vast_prometheus_get "vms_state" 2>vms_state.err)"; then
+  vms_state="$(vast_prometheus_gauge "$vms_state_text" "vms_state")"
+  if [[ -z "$vms_state" ]]; then
+    vms_state="$(vast_prometheus_gauge "$vms_state_text" "vast_vms_state")"
+  fi
+  report+="VMS state metric: ${vms_state:-unknown} (1=CLUSTERED, 0=DEGRADED)\n"
+  # Treat 0.0 the same as 0 so a float-rendered gauge is still recognised.
+  if [[ "$vms_state" =~ ^0(\.0+)?$ ]]; then
+    issues_json="$(vast_append_issue "$issues_json" \
+      "VAST Cluster \`${VAST_CLUSTER_NAME}\` VMS State is DEGRADED" \
+      "Prometheus vms_state gauge reports 0 (DEGRADED). Cluster-wide operations may be impaired." \
+      "1" \
+      "Review VMS alarms and degraded components; check offline CNodes/DNodes in VMS UI")"
+  elif [[ -z "$vms_state" ]]; then
+    report+="Warning: vms_state metric not found in exporter response (endpoint may be unavailable on older VAST versions).\n"
+  fi
+else
+  err_msg="$(cat vms_state.err 2>/dev/null || echo unknown)"
+  report+="Warning: /api/prometheusmetrics/vms_state unavailable: ${err_msg}\n"
+fi
+rm -f vms_state.err
+
+clusters_json=""
+if clusters_json="$(vast_api_get "/api/clusters/" 2>clusters.err)"; then
+  cluster_obj="$(vast_find_cluster_json "$clusters_json" "$VAST_CLUSTER_NAME")"
+  if [[ -n "$cluster_obj" ]]; then
+    cluster_state="$(echo "$cluster_obj" | jq -r '.state // "UNKNOWN"')"
+    # jq's // operator treats false like null, so ".enabled // true" could never
+    # yield false; default only when the field is actually missing/null.
+    enabled="$(echo "$cluster_obj" | jq -r 'if .enabled == null then true else .enabled end')"
+    report+="Cluster REST state: ${cluster_state}, enabled=${enabled}\n"
+    # Compare upper-cased so a differently-cased state is not reported as unhealthy.
+    if [[ "${cluster_state^^}" != "ONLINE" && "${cluster_state^^}" != "CLUSTERED" ]]; then
+      issues_json="$(vast_append_issue "$issues_json" \
+        "VAST Cluster \`${VAST_CLUSTER_NAME}\` State is ${cluster_state}" \
+        "Cluster REST API reports state=${cluster_state} (expected ONLINE/CLUSTERED)." \
+        "2" \
+        "Inspect cluster events in VMS and verify all boxes and nodes are online")"
+    fi
+    if [[ "$enabled" != "true" ]]; then
+      issues_json="$(vast_append_issue "$issues_json" \
+        "VAST Cluster \`${VAST_CLUSTER_NAME}\` is Disabled" \
+        "Cluster enabled flag is false in VMS REST API." \
+        "2" \
+        "Re-enable the cluster in VMS if this was not intentional maintenance")"
+    fi
+  else
+    report+="Warning: cluster \`${VAST_CLUSTER_NAME}\` not found in /api/clusters/ response.\n"
+  fi
+else
+  err_msg="$(cat clusters.err 2>/dev/null || echo unknown)"
+  issues_json="$(vast_api_error_issue "$issues_json" "cluster status" "$err_msg")"
+fi
+rm -f clusters.err
+
+health_text=""
+if health_text="$(vast_api_get "/health/" 2>health.err)"; then
+  health_state="$(echo "$health_text" | jq -r '.state // .status // empty' 2>/dev/null || true)"
+  if [[ -n "$health_state" ]]; then
+    report+="VMS /health/ status: ${health_state}\n"
+    if [[ "${health_state^^}" =~ ^(DEGRADED|UNHEALTHY|ERROR|FAILED)$ ]]; then
+      issues_json="$(vast_append_issue "$issues_json" \
+        "VAST Cluster \`${VAST_CLUSTER_NAME}\` VMS Health Endpoint Reports ${health_state}" \
+        "GET /health/ returned state=${health_state}." \
+        "1" \
+        "Review VMS health dashboard and active alarms")"
+    fi
+  fi
+else
+  report+="Note: /health/ endpoint unavailable (requires VAST 5.4.3+).\n"
+fi
+rm -f health.err
+
+echo "$issues_json" > "$OUTPUT_FILE"
+echo -e "$report" > "$REPORT_FILE"
+echo -e "$report"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
diff --git a/codebundles/vast-cluster-health/runbook.robot b/codebundles/vast-cluster-health/runbook.robot
new file mode 100644
index 00000000..bd9c83ac
--- /dev/null
+++ b/codebundles/vast-cluster-health/runbook.robot
@@ -0,0 +1,295 @@
+*** Settings ***
+Documentation       Monitor VAST Data cluster-wide health via VMS REST and Prometheus exporter endpoints for degraded state, capacity pressure, hardware faults, and protocol performance.
+Metadata            Author    rw-codebundle-agent
+Metadata            Display Name    VAST Data Cluster Health
+Metadata            Supports    VAST    vast_data    cluster    storage    metrics
+
+Force Tags          VAST    vast_data    cluster    storage    health
+
+Library             String
+Library             BuiltIn
+Library             RW.Core
+Library             RW.CLI
+Library             RW.platform
+
+Suite Setup         Suite Initialization
+
+
+*** Tasks ***
+Check VMS Cluster Health Status for Cluster `${VAST_CLUSTER_NAME}`
+    [Documentation]    Queries /api/prometheusmetrics/vms_state and VMS cluster status to detect DEGRADED (0) vs CLUSTERED (1) state and any active cluster alerts.
+    [Tags]    VAST    vast_data    cluster    health    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-vms-cluster-health.sh
+    ...    env=${env}
+    ...    secret__vast_vms_credentials=${vast_vms_credentials}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./check-vms-cluster-health.sh
+
+    ${issues}=    RW.CLI.Run Cli    cmd=cat vms_cluster_health_output.json
+
+    # $issues.stdout hands the text to json.loads as a value instead of pasting it into Python source.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=VMS cluster health should report CLUSTERED/ONLINE state for cluster `${VAST_CLUSTER_NAME}`
+            ...    actual=VMS cluster health check found degraded or unreachable cluster state
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    VMS Cluster Health Results:\n${result.stdout}
+
+Check Cluster Capacity Utilization for Cluster `${VAST_CLUSTER_NAME}`
+    [Documentation]    Evaluates physical and logical capacity utilization from cluster REST and Prometheus metrics; raises issues when usage exceeds CAPACITY_THRESHOLD percent.
+    [Tags]    VAST    vast_data    cluster    capacity    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-cluster-capacity.sh
+    ...    env=${env}
+    ...    secret__vast_vms_credentials=${vast_vms_credentials}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./check-cluster-capacity.sh
+
+    ${issues}=    RW.CLI.Run Cli    cmd=cat cluster_capacity_output.json
+
+    # $issues.stdout hands the text to json.loads as a value instead of pasting it into Python source.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Cluster physical and logical capacity should remain below configured thresholds for cluster `${VAST_CLUSTER_NAME}`
+            ...    actual=Cluster capacity utilization exceeds warning or critical thresholds
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Cluster Capacity Results:\n${result.stdout}
+
+Check CNode and DNode Hardware Health for Cluster `${VAST_CLUSTER_NAME}`
+    [Documentation]    Inspects CNode/DNode state, SSD/SCM health, and hardware fault indicators from REST and Prometheus exporter metrics.
+    [Tags]    VAST    vast_data    cnodes    dnodes    hardware    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-node-hardware-health.sh
+    ...    env=${env}
+    ...    secret__vast_vms_credentials=${vast_vms_credentials}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=./check-node-hardware-health.sh
+
+    ${issues}=    RW.CLI.Run Cli    cmd=cat node_hardware_health_output.json
+
+    # $issues.stdout hands the text to json.loads as a value instead of pasting it into Python source.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=All CNodes, DNodes, and storage devices should be healthy for cluster `${VAST_CLUSTER_NAME}`
+            ...    actual=Node or device hardware health issues were detected
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Node Hardware Health Results:\n${result.stdout}
+
+Check Cluster Degraded Components and Active Alerts for Cluster `${VAST_CLUSTER_NAME}`
+    [Documentation]    Lists degraded boxes, failed drives, offline nodes, and active VMS alerts that indicate partial cluster failure.
+    [Tags]    VAST    vast_data    cluster    alerts    access:read-only    data:logs-config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-degraded-components.sh
+    ...    env=${env}
+    ...    secret__vast_vms_credentials=${vast_vms_credentials}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=./check-degraded-components.sh
+
+    ${issues}=    RW.CLI.Run Cli    cmd=cat degraded_components_output.json
+
+    # $issues.stdout hands the text to json.loads as a value instead of pasting it into Python source.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Cluster should have no degraded boxes, offline nodes, or active alarms for cluster `${VAST_CLUSTER_NAME}`
+            ...    actual=Degraded components or active VMS alarms were found
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Degraded Components Results:\n${result.stdout}
+
+Analyze Cluster Protocol Performance for Cluster `${VAST_CLUSTER_NAME}`
+    [Documentation]    Reviews cluster-wide IOPS, bandwidth, and latency by storage protocol (NFS, block, S3) to detect IO stalls or abnormal drops in data flow.
+    [Tags]    VAST    vast_data    cluster    performance    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=analyze-cluster-performance.sh
+    ...    env=${env}
+    ...    secret__vast_vms_credentials=${vast_vms_credentials}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=./analyze-cluster-performance.sh
+
+    ${issues}=    RW.CLI.Run Cli    cmd=cat cluster_performance_output.json
+
+    # $issues.stdout hands the text to json.loads as a value instead of pasting it into Python source.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Cluster protocol performance should remain within expected IO and latency bounds for cluster `${VAST_CLUSTER_NAME}`
+            ...    actual=Cluster protocol performance anomalies were detected
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Cluster Performance Results:\n${result.stdout}
+
+Check Replication and Protection Group Status for Cluster `${VAST_CLUSTER_NAME}`
+    [Documentation]    Verifies replication links, protection groups, and snapshot policies are healthy and not blocking writes or causing capacity pressure.
+    [Tags]    VAST    vast_data    cluster    replication    access:read-only    data:config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-replication-status.sh
+    ...    env=${env}
+    ...    secret__vast_vms_credentials=${vast_vms_credentials}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    cmd_override=./check-replication-status.sh
+
+    ${issues}=    RW.CLI.Run Cli    cmd=cat replication_status_output.json
+
+    # $issues.stdout hands the text to json.loads as a value instead of pasting it into Python source.
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Replication streams and protection groups should be healthy with acceptable snapshot/auxiliary space for cluster `${VAST_CLUSTER_NAME}`
+            ...    actual=Replication or protection group issues were detected
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Replication Status Results:\n${result.stdout}
+
+
+*** Keywords ***
+Suite Initialization
+    TRY
+        ${vast_vms_credentials}=    RW.Core.Import Secret    vast_vms_credentials
+        ...    type=string
+        ...    description=VMS API credentials JSON with USERNAME/PASSWORD or API_TOKEN
+        ...    pattern=\w*
+        Set Suite Variable    ${vast_vms_credentials}    ${vast_vms_credentials}
+    EXCEPT
+        Log    vast_vms_credentials secret not found; VMS API tasks will fail until configured.    WARN
+        Set Suite Variable    ${vast_vms_credentials}    ${EMPTY}
+    END
+    # User-supplied configuration; RESOURCES and both capacity thresholds carry defaults.
+    ${VAST_VMS_ENDPOINT}=    RW.Core.Import User Variable    VAST_VMS_ENDPOINT
+    ...    type=string
+    ...    description=VMS REST API base URL (e.g. https://vms.example.com)
+    ...    pattern=\w*
+    ${VAST_CLUSTER_NAME}=    RW.Core.Import User Variable    VAST_CLUSTER_NAME
+    ...    type=string
+    ...    description=VAST cluster display name for scoping and issue titles
+    ...    pattern=\w*
+    ${RESOURCES}=    RW.Core.Import User Variable    RESOURCES
+    ...    type=string
+    ...    description=Cluster name(s) or All for auto-discovery via VMS API
+    ...    pattern=^[\w,\s-]*$
+    ...    default=All
+    ${CAPACITY_THRESHOLD}=    RW.Core.Import User Variable    CAPACITY_THRESHOLD
+    ...    type=string
+    ...    description=Physical/logical capacity utilization percent that triggers an issue
+    ...    pattern=^\d+$
+    ...    default=85
+    ${CRITICAL_CAPACITY_THRESHOLD}=    RW.Core.Import User Variable    CRITICAL_CAPACITY_THRESHOLD
+    ...    type=string
+    ...    description=Critical capacity threshold percent
+    ...    pattern=^\d+$
+    ...    default=95
+    # Publish the imported values as suite variables.
+    Set Suite Variable    ${VAST_VMS_ENDPOINT}    ${VAST_VMS_ENDPOINT}
+    Set Suite Variable    ${VAST_CLUSTER_NAME}    ${VAST_CLUSTER_NAME}
+    Set Suite Variable    ${RESOURCES}    ${RESOURCES}
+    Set Suite Variable    ${CAPACITY_THRESHOLD}    ${CAPACITY_THRESHOLD}
+    Set Suite Variable    ${CRITICAL_CAPACITY_THRESHOLD}    ${CRITICAL_CAPACITY_THRESHOLD}
+    # Build the environment dictionary stored as ${env}; the credentials path is empty when no secret was imported.
+    ${cred_path}=    Set Variable If    '${vast_vms_credentials}' != ''    ./${vast_vms_credentials.key}    ${EMPTY}
+    ${env_dict}=    Create Dictionary
+    ...    VAST_VMS_ENDPOINT=${VAST_VMS_ENDPOINT}
+    ...    VAST_CLUSTER_NAME=${VAST_CLUSTER_NAME}
+    ...    RESOURCES=${RESOURCES}
+    ...    CAPACITY_THRESHOLD=${CAPACITY_THRESHOLD}
+    ...    CRITICAL_CAPACITY_THRESHOLD=${CRITICAL_CAPACITY_THRESHOLD}
+    ...    VAST_VMS_CREDENTIALS_FILE=${cred_path}
+    Set Suite Variable    ${env}    ${env_dict}
diff --git a/codebundles/vast-cluster-health/sli-vast-cluster-health-score.sh b/codebundles/vast-cluster-health/sli-vast-cluster-health-score.sh
new file mode 100755
index 00000000..090dca93
--- /dev/null
+++ b/codebundles/vast-cluster-health/sli-vast-cluster-health-score.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Lightweight SLI scoring script — outputs JSON with binary sub-scores.
+
+source "$(dirname "$0")/vast-vms-common.sh"
+
+if ! _vast_load_credentials; then
+  jq -n '{
+    vms_clustered: 0,
+    capacity_ok: 0,
+    nodes_healthy: 0,
+    alarms_clear: 0,
+    replication_ok: 1,
+    details: {error: "missing credentials"}
+  }'
+  exit 0
+fi
+
+vms_clustered=0
+capacity_ok=1
+nodes_healthy=1
+alarms_clear=1
+replication_ok=1
+details='{}'
+
+if vms_state_text="$(vast_prometheus_get "vms_state" 2>/dev/null)"; then
+  vms_state="$(vast_prometheus_gauge "$vms_state_text" "vms_state")"
+  [[ -z "$vms_state" ]] && vms_state="$(vast_prometheus_gauge "$vms_state_text" "vast_vms_state")"
+  [[ "$vms_state" == "1" ]] && vms_clustered=1
+  details="$(jq -n --arg s "${vms_state:-unknown}" '{vms_state: $s}')"
+else
+  if clusters_json="$(vast_api_get "/api/clusters/" 2>/dev/null)"; then
+    cluster_obj="$(vast_find_cluster_json "$clusters_json" "$VAST_CLUSTER_NAME")"
+    state="$(echo "$cluster_obj" | jq -r '.state // "UNKNOWN"')"
+    [[ "$state" == "ONLINE" || "$state" == "CLUSTERED" ]] && vms_clustered=1
+    details="$(jq -n --arg s "$state" '{cluster_state: $s}')"
+  fi
+fi
+
+if clusters_json="$(vast_api_get "/api/clusters/" 2>/dev/null)"; then
+  cluster_obj="$(vast_find_cluster_json "$clusters_json" "$VAST_CLUSTER_NAME")"
+  if [[ -n "$cluster_obj" ]]; then
+    for pct_field in physical_space_in_use_percent logical_space_in_use_percent; do
+      pct="$(echo "$cluster_obj" | jq -r --arg f "$pct_field" '.[$f] // empty')"
+      [[ -z "$pct" || "$pct" == "null" ]] && continue
+      ok="$(python3 - <<PY
+print(1 if float("${pct}") < float("${CAPACITY_THRESHOLD}") else 0)
+PY
+)"
+      [[ "$ok" == "0" ]] && capacity_ok=0
+    done
+  fi
+fi
+
+for path in "/api/cnodes/" "/api/dnodes/"; do
+  if nodes_json="$(vast_api_get "$path" 2>/dev/null)"; then
+    bad="$(echo "$nodes_json" | jq '
+      (if type == "array" then . elif .results then .results else [.] end)
+      | map(select((.state // .status // "ACTIVE") | ascii_upcase | test("OFFLINE|FAILED|INACTIVE|DISABLED|ERROR")))
+      | length
+    ' 2>/dev/null || echo 0)"
+    [[ "$bad" -gt 0 ]] && nodes_healthy=0
+  fi
+done
+
+if alarms_text="$(vast_prometheus_get "alarms" 2>/dev/null)"; then
+  alarm_lines="$(echo "$alarms_text" | awk '$0 !~ /^#/ && $2 != "0" && $2 != "0.0" {c++} END {print c+0}')"
+  [[ "$alarm_lines" -gt 0 ]] && alarms_clear=0
+fi
+
+if repl_text="$(vast_prometheus_get "replications" 2>/dev/null)"; then
+  bad="$(echo "$repl_text" | awk '$0 !~ /^#/ && ($0 ~ /failed|error|stalled/i) {c++} END {print c+0}')"
+  [[ "$bad" -gt 0 ]] && replication_ok=0
+fi
+
+jq -n \
+  --argjson vms_clustered "$vms_clustered" \
+  --argjson capacity_ok "$capacity_ok" \
+  --argjson nodes_healthy "$nodes_healthy" \
+  --argjson alarms_clear "$alarms_clear" \
+  --argjson replication_ok "$replication_ok" \
+  --argjson details "$details" \
+  '{
+    vms_clustered: $vms_clustered,
+    capacity_ok: $capacity_ok,
+    nodes_healthy: $nodes_healthy,
+    alarms_clear: $alarms_clear,
+    replication_ok: $replication_ok,
+    details: $details
+  }'
diff --git a/codebundles/vast-cluster-health/sli.robot b/codebundles/vast-cluster-health/sli.robot
new file mode 100644
index 00000000..efb2be23
--- /dev/null
+++ b/codebundles/vast-cluster-health/sli.robot
@@ -0,0 +1,128 @@
+*** Settings ***
+Documentation       Measures VAST cluster health across five binary dimensions (VMS clustered, capacity, nodes, alarms, replication) and averages them into a 0-1 score.
+Metadata            Author    rw-codebundle-agent
+Metadata            Display Name    VAST Data Cluster Health SLI
+Metadata            Supports    VAST    vast_data    cluster    storage    metrics
+
+Library             BuiltIn
+Library             RW.Core
+Library             RW.CLI
+Library             RW.platform
+
+Suite Setup         Suite Initialization
+
+
+*** Tasks ***
+Score VMS Cluster State
+    [Documentation]    Binary score: 1 when vms_state=1 or cluster REST state is ONLINE/CLUSTERED, 0 otherwise.
+    [Tags]    VAST    sli    access:read-only    data:metrics
+    # The sub-score was computed during Suite Initialization; this task only publishes it.
+    RW.Core.Push Metric    ${score_vms}    sub_name=vms_clustered
+
+Score Cluster Capacity Headroom
+    [Documentation]    Binary score: 1 when physical and logical utilization are below CAPACITY_THRESHOLD.
+    [Tags]    VAST    sli    access:read-only    data:metrics
+    # The sub-score was computed during Suite Initialization; this task only publishes it.
+    RW.Core.Push Metric    ${score_capacity}    sub_name=capacity_ok
+
+Score Node Hardware Health
+    [Documentation]    Binary score: 1 when no CNodes or DNodes report offline/failed states.
+    [Tags]    VAST    sli    access:read-only    data:metrics
+    # The sub-score was computed during Suite Initialization; this task only publishes it.
+    RW.Core.Push Metric    ${score_nodes}    sub_name=nodes_healthy
+
+Score Active Alarm Clearance
+    [Documentation]    Binary score: 1 when Prometheus alarms exporter reports no active alarms.
+    [Tags]    VAST    sli    access:read-only    data:metrics
+    # The sub-score was computed during Suite Initialization; this task only publishes it.
+    RW.Core.Push Metric    ${score_alarms}    sub_name=alarms_clear
+
+Score Replication Health
+    [Documentation]    Binary score: 1 when replication Prometheus metrics show no failed/stalled streams (defaults to 1 if endpoint unavailable).
+    [Tags]    VAST    sli    access:read-only    data:metrics
+    # The sub-score was computed during Suite Initialization; this task only publishes it.
+    RW.Core.Push Metric    ${score_replication}    sub_name=replication_ok
+
+Generate Aggregate VAST Cluster Health Score
+    [Documentation]    Averages sub-scores into the primary 0-1 health metric.
+    [Tags]    VAST    sli    access:read-only    data:metrics
+    # Mean of the five binary sub-scores, rounded to two decimals before it is reported and pushed.
+    ${health_score}=    Evaluate    (int(${score_vms}) + int(${score_capacity}) + int(${score_nodes}) + int(${score_alarms}) + int(${score_replication})) / 5.0
+    ${health_score}=    Convert To Number    ${health_score}    2
+    RW.Core.Add To Report    VAST cluster health score: ${health_score} (vms=${score_vms}, capacity=${score_capacity}, nodes=${score_nodes}, alarms=${score_alarms}, replication=${score_replication})
+    RW.Core.Push Metric    ${health_score}
+
+
+*** Keywords ***
+Suite Initialization
+    TRY
+        ${vast_vms_credentials}=    RW.Core.Import Secret    vast_vms_credentials
+        ...    type=string
+        ...    description=VMS API credentials JSON with USERNAME/PASSWORD or API_TOKEN
+        ...    pattern=\w*
+        Set Suite Variable    ${vast_vms_credentials}    ${vast_vms_credentials}
+    EXCEPT
+        Log    vast_vms_credentials secret not found.    WARN
+        Set Suite Variable    ${vast_vms_credentials}    ${EMPTY}
+    END
+    # User-configurable inputs; the two capacity thresholds default to 85 and 95.
+    ${VAST_VMS_ENDPOINT}=    RW.Core.Import User Variable    VAST_VMS_ENDPOINT
+    ...    type=string
+    ...    description=VMS REST API base URL
+    ...    pattern=\w*
+    ${VAST_CLUSTER_NAME}=    RW.Core.Import User Variable    VAST_CLUSTER_NAME
+    ...    type=string
+    ...    description=VAST cluster display name
+    ...    pattern=\w*
+    ${CAPACITY_THRESHOLD}=    RW.Core.Import User Variable    CAPACITY_THRESHOLD
+    ...    type=string
+    ...    description=Capacity warning threshold percent
+    ...    pattern=^\d+$
+    ...    default=85
+    ${CRITICAL_CAPACITY_THRESHOLD}=    RW.Core.Import User Variable    CRITICAL_CAPACITY_THRESHOLD
+    ...    type=string
+    ...    description=Critical capacity threshold percent
+    ...    pattern=^\d+$
+    ...    default=95
+    # Publish the imported values as suite variables.
+    Set Suite Variable    ${VAST_VMS_ENDPOINT}    ${VAST_VMS_ENDPOINT}
+    Set Suite Variable    ${VAST_CLUSTER_NAME}    ${VAST_CLUSTER_NAME}
+    Set Suite Variable    ${CAPACITY_THRESHOLD}    ${CAPACITY_THRESHOLD}
+    Set Suite Variable    ${CRITICAL_CAPACITY_THRESHOLD}    ${CRITICAL_CAPACITY_THRESHOLD}
+    # Environment passed to the scoring script; the credentials file path is empty when the secret is missing.
+    ${cred_path}=    Set Variable If    '${vast_vms_credentials}' != ''    ./${vast_vms_credentials.key}    ${EMPTY}
+    ${env_dict}=    Create Dictionary
+    ...    VAST_VMS_ENDPOINT=${VAST_VMS_ENDPOINT}
+    ...    VAST_CLUSTER_NAME=${VAST_CLUSTER_NAME}
+    ...    CAPACITY_THRESHOLD=${CAPACITY_THRESHOLD}
+    ...    CRITICAL_CAPACITY_THRESHOLD=${CRITICAL_CAPACITY_THRESHOLD}
+    ...    VAST_VMS_CREDENTIALS_FILE=${cred_path}
+    Set Suite Variable    ${env}    ${env_dict}
+    # Fallback sub-scores, kept if the script output cannot be parsed (note replication falls back to 1).
+    Set Suite Variable    ${score_vms}    0
+    Set Suite Variable    ${score_capacity}    0
+    Set Suite Variable    ${score_nodes}    0
+    Set Suite Variable    ${score_alarms}    0
+    Set Suite Variable    ${score_replication}    1
+    # Run the scoring script once and decode the JSON object it prints on stdout.
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=sli-vast-cluster-health-score.sh
+    ...    env=${env}
+    ...    secret__vast_vms_credentials=${vast_vms_credentials}
+    ...    timeout_seconds=30
+    ...    include_in_history=false
+    TRY
+        ${data}=    Evaluate    json.loads(r'''${result.stdout}''')    json
+        ${score_vms}=    Set Variable    ${data['vms_clustered']}
+        ${score_capacity}=    Set Variable    ${data['capacity_ok']}
+        ${score_nodes}=    Set Variable    ${data['nodes_healthy']}
+        ${score_alarms}=    Set Variable    ${data['alarms_clear']}
+        ${score_replication}=    Set Variable    ${data['replication_ok']}
+    EXCEPT
+        Log    Failed to parse SLI score JSON; defaulting sub-scores to failure mode.    WARN
+    END
+    Set Suite Variable    ${score_vms}    ${score_vms}
+    Set Suite Variable    ${score_capacity}    ${score_capacity}
+    Set Suite Variable    ${score_nodes}    ${score_nodes}
+    Set Suite Variable    ${score_alarms}    ${score_alarms}
+    Set Suite Variable    ${score_replication}    ${score_replication}
diff --git a/codebundles/vast-cluster-health/vast-vms-common.sh b/codebundles/vast-cluster-health/vast-vms-common.sh
new file mode 100755
index 00000000..f4386cef
--- /dev/null
+++ b/codebundles/vast-cluster-health/vast-vms-common.sh
@@ -0,0 +1,149 @@
+#!/usr/bin/env bash
+# Shared helpers for VAST VMS REST and Prometheus exporter access.
+# shellcheck disable=SC2034
+
+set -euo pipefail
+
+: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}"
+: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}"
+
+VAST_VMS_ENDPOINT="${VAST_VMS_ENDPOINT%/}"
+CAPACITY_THRESHOLD="${CAPACITY_THRESHOLD:-85}"
+CRITICAL_CAPACITY_THRESHOLD="${CRITICAL_CAPACITY_THRESHOLD:-95}"
+VAST_TLS_INSECURE="${VAST_TLS_INSECURE:-true}"
+VAST_CURL_TIMEOUT="${VAST_CURL_TIMEOUT:-60}"
+
+_vast_load_credentials() {
+  local creds_json="${1:-}"
+  if [[ -z "$creds_json" && -n "${VAST_VMS_CREDENTIALS_FILE:-}" && -f "${VAST_VMS_CREDENTIALS_FILE}" ]]; then
+    creds_json="$(cat "${VAST_VMS_CREDENTIALS_FILE}")"
+  fi
+  if [[ -z "$creds_json" && -n "${VAST_VMS_CREDENTIALS_JSON:-}" ]]; then
+    creds_json="${VAST_VMS_CREDENTIALS_JSON}"
+  fi
+  if [[ -z "$creds_json" ]]; then
+    echo "VAST credentials not configured (set vast_vms_credentials secret with USERNAME/PASSWORD or API_TOKEN)" >&2
+    return 1
+  fi
+  VAST_API_USERNAME="$(echo "$creds_json" | jq -r '.USERNAME // .username // empty')"
+  VAST_API_PASSWORD="$(echo "$creds_json" | jq -r '.PASSWORD // .password // empty')"
+  VAST_API_TOKEN="$(echo "$creds_json" | jq -r '.API_TOKEN // .api_token // .token // empty')"
+  export VAST_API_USERNAME VAST_API_PASSWORD VAST_API_TOKEN
+}
+
+_vast_fixture_path() {
+  local kind="$1"
+  if [[ -n "${VAST_MOCK_FIXTURE_DIR:-}" ]]; then
+    local candidate="${VAST_MOCK_FIXTURE_DIR}/${kind}"
+    if [[ -f "$candidate" ]]; then
+      echo "$candidate"
+      return 0
+    fi
+  fi
+  return 1
+}
+
+# Print the shared curl options one per line (callers read them back with mapfile): quiet mode,
+# --fail so HTTP 4xx/5xx yield a non-zero exit, timeouts, optional -k, and bearer-token or basic auth.
+_vast_curl_common_args() {
+  local args=(-sS --fail --connect-timeout 10 --max-time "${VAST_CURL_TIMEOUT}")
+  if [[ "${VAST_TLS_INSECURE}" == "true" ]]; then args+=(-k); fi
+  if [[ -n "${VAST_API_TOKEN:-}" ]]; then
+    args+=(-H "Authorization: Bearer ${VAST_API_TOKEN}")
+  elif [[ -n "${VAST_API_USERNAME:-}" && -n "${VAST_API_PASSWORD:-}" ]]; then
+    args+=(-u "${VAST_API_USERNAME}:${VAST_API_PASSWORD}")
+  fi
+  printf '%s\n' "${args[@]}"
+}
+
+vast_api_get() {
+  local path="$1"
+  local fixture
+  if fixture="$(_vast_fixture_path "api${path//\//_}")"; then
+    cat "$fixture"
+    return 0
+  fi
+  mapfile -t curl_args < <(_vast_curl_common_args)
+  curl "${curl_args[@]}" -H "Accept: application/json" "${VAST_VMS_ENDPOINT}${path}"
+}
+
+vast_prometheus_get() {
+  local endpoint="$1"
+  local fixture
+  if fixture="$(_vast_fixture_path "prometheus_${endpoint//\//_}")"; then
+    cat "$fixture"
+    return 0
+  fi
+  mapfile -t curl_args < <(_vast_curl_common_args)
+  curl "${curl_args[@]}" "${VAST_VMS_ENDPOINT}/api/prometheusmetrics/${endpoint}"
+}
+
+# Print the value of the first non-comment line of $1 whose first field matches $2.
+# The value is the second field with every character outside 0-9 . e E + - removed.
+# NOTE: $2 is used as an unanchored awk regex, so "vms_state" also matches "vast_vms_state".
+vast_prometheus_gauge() {
+  local metrics_text="$1" metric_name="$2"
+  echo "$metrics_text" | awk -v name="$metric_name" '
+    /^#/ || $1 !~ name { next }
+    { val = $2; gsub(/[^0-9.eE+-]/, "", val) }
+    val != "" { print val; exit }
+    END { if (NR == 0) exit 1 }
+  ' 2>/dev/null || echo ""
+}
+
+# Print the sum of the second-field values of every non-comment line of $1 whose first
+# field matches the awk regex $2. Values are reduced to the characters 0-9 . e E + - first.
+# Output is formatted with "%.0f" and has no trailing newline; it is 0 when nothing matches.
+vast_prometheus_metric_sum() {
+  local metrics_text="$1"
+  local metric_regex="$2"
+  echo "$metrics_text" | awk -v re="$metric_regex" '
+    /^#/ || $1 !~ re { next }
+    { val = $2; gsub(/[^0-9.eE+-]/, "", val); if (val != "") total += val }
+    END { printf "%.0f", total + 0 }
+  '
+}
+
+vast_find_cluster_json() {
+  local clusters_json="$1"
+  local cluster_name="$2"
+  echo "$clusters_json" | jq -c --arg name "$cluster_name" '
+    (if type == "array" then . elif .results then .results elif .clusters then .clusters else [.] end)
+    | map(select((.name // .title // "") | ascii_downcase == ($name | ascii_downcase)))
+    | .[0] // empty
+  '
+}
+
+vast_append_issue() {
+  local issues_json="$1"
+  local title="$2"
+  local details="$3"
+  local severity="$4"
+  local next_steps="$5"
+  echo "$issues_json" | jq \
+    --arg title "$title" \
+    --arg details "$details" \
+    --arg severity "$severity" \
+    --arg next_steps "$next_steps" \
+    '. += [{
+      "title": $title,
+      "details": $details,
+      "severity": ($severity | tonumber),
+      "next_steps": $next_steps
+    }]'
+}
+
+# Print the issues array $1 with a severity-4 "Cannot Access VAST Cluster" issue appended.
+# $2 is a short context label shown in the title; $3 is the error text placed in the details.
+# The cluster name in the title comes from $VAST_CLUSTER_NAME.
+vast_api_error_issue() {
+  local issues_json="$1" context="$2" err_msg="$3"
+  local title="Cannot Access VAST Cluster \`${VAST_CLUSTER_NAME}\` (${context})"
+  local details="VMS API call failed: ${err_msg}"
+  local next_steps="Verify VAST_VMS_ENDPOINT, network connectivity, and vast_vms_credentials permissions"
+  vast_append_issue "$issues_json" "$title" "$details" "4" "$next_steps"
+}
+
+# Print an empty JSON array, suitable as the initial issues array that
+# vast_append_issue extends.
+vast_init_issues() { echo '[]'; }