runwhen-contrib · rw-codebundle-agent · Jun 25, 2026
@@ -0,0 +1,22 @@
+# Generation rules for the vast-cluster-health codebundle.
+apiVersion: runwhen.com/v1
+kind: GenerationRules
+spec:
+  platform: vast_data
+  generationRules:
+    - resourceTypes:
+        - vast_data_cluster
+      matchRules:
+        # ".+" accepts any non-empty value of the listed properties.
+        - type: pattern
+          pattern: '.+'
+          properties:
+            - name
+          mode: substring
+      slxs:
+        - baseName: vast-cluster-health
+          qualifiers:
+            - vast_cluster_name
+            - vast_vms_endpoint
+          baseTemplateName: vast-cluster-health
+          levelOfDetail: basic
+          outputItems:
+            - type: slx
+            - type: sli
+            # The runbook output names its template file explicitly.
+            - type: runbook
+              templateName: vast-cluster-health-taskset.yaml
@@ -0,0 +1,48 @@
+# ServiceLevelIndicator template for the vast-cluster-health codebundle.
+# Placeholders are filled in at render time from the matched resource and
+# the workspace custom variables.
+apiVersion: runwhen.com/v1
+kind: ServiceLevelIndicator
+metadata:
+  name: {{ slx_name }}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  displayUnitsLong: OK
+  displayUnitsShort: ok
+  locations:
+    - {{ default_location }}
+  description: Lightweight VAST cluster health score for {{ match_resource.name }} from VMS state, capacity, nodes, alarms, and replication.
+  codeBundle:
+    # Use the caller-supplied repository and ref when present, otherwise the
+    # public code collection on its main branch.
+    {% if repo_url %}
+    repoUrl: {{ repo_url }}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ ref }}
+    {% else %}
+    ref: main
+    {% endif %}
+    pathToRobot: codebundles/vast-cluster-health/sli.robot
+  intervalStrategy: intermezzo
+  intervalSeconds: 300
+  configProvided:
+    # Endpoint fallback chain: resource value, then the workspace custom
+    # value, then an empty string. The final fallback mirrors the SLX
+    # template's additionalContext so an unset custom value renders as "".
+    - name: VAST_VMS_ENDPOINT
+      value: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint | default('')) }}"
+    - name: VAST_CLUSTER_NAME
+      value: "{{ match_resource.name }}"
+    # Thresholds fall back to 85 and 95 when no custom override is set.
+    - name: CAPACITY_THRESHOLD
+      value: "{{ custom.capacity_threshold | default('85') }}"
+    - name: CRITICAL_CAPACITY_THRESHOLD
+      value: "{{ custom.critical_capacity_threshold | default('95') }}"
+  secretsProvided:
+  # With wb_version set, the entries come from the optional
+  # vast_data-auth.yaml include; otherwise a placeholder entry is emitted.
+  {% if wb_version %}
+    {% include "vast_data-auth.yaml" ignore missing %}
+  {% else %}
+    - name: vast_vms_credentials
+      workspaceKey: AUTH DETAILS NOT FOUND
+  {% endif %}
+  alertConfig:
+    tasks:
+      persona: eager-edgar
+      sessionTTL: 10m
@@ -0,0 +1,32 @@
+# ServiceLevelX template describing one VAST cluster's health objective.
+# Placeholders are filled in at render time from the matched resource and
+# workspace values.
+apiVersion: runwhen.com/v1
+kind: ServiceLevelX
+metadata:
+  name: {{ slx_name }}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/storage/storage.svg
+  alias: {{ match_resource.name }} VAST Cluster Health
+  asMeasuredBy: Composite 0-1 score from VMS state, capacity headroom, node health, alarms, and replication.
+  configProvided:
+    # NOTE(review): placeholder entry only; presumably present so the list is
+    # never empty — confirm against the platform schema.
+    - name: SLX_PLACEHOLDER
+      value: SLX_PLACEHOLDER
+  owners:
+    - {{ workspace.owner_email }}
+  statement: VAST cluster {{ match_resource.name }} should remain CLUSTERED with healthy nodes, capacity headroom, and no active alarms.
+  additionalContext:
+    # "ignore missing" skips the include silently when the named template
+    # does not exist.
+    {% include "vast_data-hierarchy.yaml" ignore missing %}
+    qualified_name: "{{ match_resource.qualified_name }}"
+    # Endpoint fallback chain: resource value, then the workspace custom
+    # value, then an empty string.
+    vast_vms_endpoint: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint | default('')) }}"
+  tags:
+    # Optional include first, followed by the fixed tags below.
+    {% include "vast_data-tags.yaml" ignore missing %}
+    - name: cloud
+      value: on-prem
+    - name: service
+      value: vast_data
+    - name: scope
+      value: cluster
+    - name: access
+      value: read-only
@@ -0,0 +1,41 @@
+# Runbook template for the vast-cluster-health codebundle.
+# Placeholders are filled in at render time from the matched resource and
+# the workspace custom variables.
+apiVersion: runwhen.com/v1
+kind: Runbook
+metadata:
+  name: {{ slx_name }}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  location: {{ default_location }}
+  description: Monitor VAST Data cluster-wide health via VMS REST and Prometheus metrics for {{ match_resource.name }}.
+  codeBundle:
+    # Use the caller-supplied repository and ref when present, otherwise the
+    # public code collection on its main branch.
+    {% if repo_url %}
+    repoUrl: {{ repo_url }}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ ref }}
+    {% else %}
+    ref: main
+    {% endif %}
+    pathToRobot: codebundles/vast-cluster-health/runbook.robot
+  configProvided:
+    # Endpoint fallback chain: resource value, then the workspace custom
+    # value, then an empty string. The final fallback mirrors the SLX
+    # template's additionalContext so an unset custom value renders as "".
+    - name: VAST_VMS_ENDPOINT
+      value: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint | default('')) }}"
+    - name: VAST_CLUSTER_NAME
+      value: "{{ match_resource.name }}"
+    - name: RESOURCES
+      value: "{{ custom.resources | default('All') }}"
+    # Thresholds fall back to 85 and 95 when no custom override is set.
+    - name: CAPACITY_THRESHOLD
+      value: "{{ custom.capacity_threshold | default('85') }}"
+    - name: CRITICAL_CAPACITY_THRESHOLD
+      value: "{{ custom.critical_capacity_threshold | default('95') }}"
+  secretsProvided:
+  # With wb_version set, the entries come from the optional
+  # vast_data-auth.yaml include; otherwise a placeholder entry is emitted.
+  {% if wb_version %}
+    {% include "vast_data-auth.yaml" ignore missing %}
+  {% else %}
+    - name: vast_vms_credentials
+      workspaceKey: AUTH DETAILS NOT FOUND
+  {% endif %}
@@ -0,0 +1,16 @@
+# Mock scenario fixtures for vast-cluster-health.
+
+Static JSON/Prometheus fixtures used when `VAST_MOCK_FIXTURE_DIR` is set (see `run-mock-scenarios.sh`).
+
+| Scenario | Expected issues | Description |
+|----------|-----------------|-------------|
+| `healthy` | 0 | CLUSTERED state, capacity below threshold, all nodes healthy |
+| `degraded` | 2–10 | DEGRADED vms_state with offline DNode and active alarm |
+| `capacity_pressure` | 1–3 | Logical capacity above CAPACITY_THRESHOLD with no hardware faults |
+
+Run:
+
+```bash
+cd .test
+task
+```
@@ -0,0 +1,23 @@
+# Test entry points for the vast-cluster-health mock scenarios.
+version: "3"
+
+tasks:
+  # Full check: structure validation first, then the fixture-backed scenarios.
+  default:
+    desc: "Validate structure and run mock scenario tests"
+    cmds:
+      - task: validate-structure
+      - task: test-mock-scenarios
+
+  validate-structure:
+    desc: "Run static checks for required files"
+    cmds:
+      - ./validate-vast-bundle-structure.sh
+
+  test-mock-scenarios:
+    desc: "Run task scripts against fixture-backed mock VMS responses"
+    cmds:
+      - ./run-mock-scenarios.sh
+
+  clean:
+    desc: "Remove local test outputs"
+    cmds:
+      # -f keeps this a no-op when nothing has been generated yet.
+      # NOTE(review): perf_analysis.json is the only path without a ../
+      # prefix — confirm which directory it is actually written to.
+      - rm -f ../*_output.json ../*_report.txt perf_analysis.json
@@ -0,0 +1,15 @@
+[
+  {
+    "id": 1,
+    "name": "vast-lab-cluster",
+    "title": "vast-lab-cluster",
+    "state": "ONLINE",
+    "enabled": true,
+    "physical_space_in_use_percent": 88.5,
+    "logical_space_in_use_percent": 91.2,
+    "physical_space_in_use_tb": 250.0,
+    "logical_space_in_use_tb": 230.0,
+    "auxiliary_space_in_use_percent": 45.0,
+    "replication_enabled": true
+  }
+]
@@ -0,0 +1,3 @@
+[
+  {"id": 1, "name": "cnode-1", "state": "ACTIVE"}
+]
@@ -0,0 +1,3 @@
+[
+  {"id": 1, "name": "dnode-1", "state": "ACTIVE"}
+]
@@ -0,0 +1,2 @@
+# TYPE vast_alarm_active gauge
+vast_alarm_active 0
@@ -0,0 +1,2 @@
+# TYPE vms_state gauge
+vms_state 1
@@ -0,0 +1,13 @@
+[
+  {
+    "id": 1,
+    "name": "vast-lab-cluster",
+    "title": "vast-lab-cluster",
+    "state": "DEGRADED",
+    "enabled": true,
+    "physical_space_in_use_percent": 55.0,
+    "logical_space_in_use_percent": 52.0,
+    "auxiliary_space_in_use_percent": 20.0,
+    "replication_enabled": true
+  }
+]
@@ -0,0 +1,3 @@
+[
+  {"id": 1, "name": "cnode-1", "state": "ACTIVE"}
+]
@@ -0,0 +1,4 @@
+[
+  {"id": 1, "name": "dnode-1", "state": "OFFLINE"},
+  {"id": 2, "name": "dnode-2", "state": "ACTIVE"}
+]
@@ -0,0 +1,2 @@
+# TYPE vast_alarm_active gauge
+vast_alarm_active 1
@@ -0,0 +1,2 @@
+# TYPE vms_state gauge
+vms_state 0
@@ -0,0 +1,15 @@
+[
+  {
+    "id": 1,
+    "name": "vast-lab-cluster",
+    "title": "vast-lab-cluster",
+    "state": "ONLINE",
+    "enabled": true,
+    "physical_space_in_use_percent": 42.5,
+    "logical_space_in_use_percent": 38.0,
+    "physical_space_in_use_tb": 120.5,
+    "logical_space_in_use_tb": 95.2,
+    "auxiliary_space_in_use_percent": 12.0,
+    "replication_enabled": true
+  }
+]
@@ -0,0 +1,4 @@
+[
+  {"id": 1, "name": "cnode-1", "state": "ACTIVE"},
+  {"id": 2, "name": "cnode-2", "state": "ACTIVE"}
+]
@@ -0,0 +1,4 @@
+[
+  {"id": 1, "name": "dnode-1", "state": "ACTIVE"},
+  {"id": 2, "name": "dnode-2", "state": "ACTIVE"}
+]
@@ -0,0 +1 @@
+{"state": "CLUSTERED"}
@@ -0,0 +1,2 @@
+# TYPE vast_alarm_active gauge
+vast_alarm_active 0
@@ -0,0 +1,2 @@
+# TYPE vms_state gauge
+vms_state 1
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+export VAST_VMS_ENDPOINT="https://vms.mock.local"
+export VAST_CLUSTER_NAME="vast-lab-cluster"
+export CAPACITY_THRESHOLD="85"
+export CRITICAL_CAPACITY_THRESHOLD="95"
+export VAST_VMS_CREDENTIALS_JSON='{"USERNAME":"admin","PASSWORD":"mock"}'
+
+run_scenario() {
+  local name="$1"
+  local fixture_dir="$ROOT/.test/fixtures/${name}"
+  local expected_min="${2:-0}"
+  local expected_max="${3:-999}"
+
+  echo "=== Scenario: ${name} ==="
+  export VAST_MOCK_FIXTURE_DIR="$fixture_dir"
+
+  rm -f *_output.json
+  ./check-vms-cluster-health.sh >/dev/null
+  ./check-cluster-capacity.sh >/dev/null
+  ./check-node-hardware-health.sh >/dev/null
+  ./check-degraded-components.sh >/dev/null
+  ./check-replication-status.sh >/dev/null
+
+  total_issues=0
+  for f in vms_cluster_health_output.json cluster_capacity_output.json node_hardware_health_output.json degraded_components_output.json replication_status_output.json; do
+    count="$(jq 'length' "$f")"
+    total_issues=$((total_issues + count))
+  done
+
+  echo "Total issues: ${total_issues} (expected between ${expected_min} and ${expected_max})"
+  if (( total_issues < expected_min || total_issues > expected_max )); then
+    echo "Scenario ${name} FAILED" >&2
+    exit 1
+  fi
+
+  sli_json="$(./sli-vast-cluster-health-score.sh)"
+  echo "SLI scores: ${sli_json}"
+}
+
+run_scenario healthy 0 0
+run_scenario degraded 2 10
+run_scenario capacity_pressure 1 3
+
+echo "All mock scenarios passed"
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Static structure check for the vast-cluster-health codebundle: verifies
+# that every required file exists relative to the bundle root.
+# Prints one "missing: <path>" line per absent file on stderr and exits 1
+# if any are missing; otherwise prints a confirmation and exits 0.
+set -euo pipefail
+
+# The bundle root is the directory one level above this script.
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+need=(
+  runbook.robot
+  sli.robot
+  README.md
+  vast-vms-common.sh
+  check-vms-cluster-health.sh
+  check-cluster-capacity.sh
+  check-node-hardware-health.sh
+  check-degraded-components.sh
+  analyze-cluster-performance.sh
+  check-replication-status.sh
+  sli-vast-cluster-health-score.sh
+  .runwhen/generation-rules/vast-cluster-health.yaml
+  .runwhen/templates/vast-cluster-health-slx.yaml
+  .runwhen/templates/vast-cluster-health-taskset.yaml
+  .runwhen/templates/vast-cluster-health-sli.yaml
+)
+
+# Check the whole list before failing so a single run reports every gap.
+missing=0
+for f in "${need[@]}"; do
+  if [[ ! -e "$f" ]]; then
+    echo "missing: $f" >&2
+    # Plain arithmetic assignment: ((missing++)) returns non-zero when the
+    # old value is 0, which would abort the script under set -e.
+    missing=$((missing + 1))
+  fi
+done
+
+if (( missing > 0 )); then
+  exit 1
+fi
+
+echo "vast-cluster-health structure OK"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# TYPE vast_alarm_active gauge
		vast_alarm_active 0
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# TYPE vast_alarm_active gauge
		vast_alarm_active 1