Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
apiVersion: runwhen.com/v1
kind: GenerationRules
spec:
platform: vast_data
generationRules:
- resourceTypes:
- vast_data_cluster
matchRules:
- type: pattern
pattern: ".+"
properties: ["name"]
mode: substring
slxs:
- baseName: vast-cluster-health
qualifiers: ["vast_cluster_name", "vast_vms_endpoint"]
baseTemplateName: vast-cluster-health
levelOfDetail: basic
outputItems:
- type: slx
- type: sli
- type: runbook
templateName: vast-cluster-health-taskset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
apiVersion: runwhen.com/v1
kind: ServiceLevelIndicator
metadata:
name: {{ slx_name }}
labels:
{% include "common-labels.yaml" %}
annotations:
{% include "common-annotations.yaml" %}
spec:
displayUnitsLong: OK
displayUnitsShort: ok
locations:
- {{ default_location }}
description: Lightweight VAST cluster health score for {{ match_resource.name }} from VMS state, capacity, nodes, alarms, and replication.
codeBundle:
{% if repo_url %}
repoUrl: {{ repo_url }}
{% else %}
repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
{% endif %}
{% if ref %}
ref: {{ ref }}
{% else %}
ref: main
{% endif %}
pathToRobot: codebundles/vast-cluster-health/sli.robot
intervalStrategy: intermezzo
intervalSeconds: 300
configProvided:
- name: VAST_VMS_ENDPOINT
value: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint) }}"
- name: VAST_CLUSTER_NAME
value: "{{ match_resource.name }}"
- name: CAPACITY_THRESHOLD
value: "{{ custom.capacity_threshold | default('85') }}"
- name: CRITICAL_CAPACITY_THRESHOLD
value: "{{ custom.critical_capacity_threshold | default('95') }}"
secretsProvided:
{% if wb_version %}
{% include "vast_data-auth.yaml" ignore missing %}
{% else %}
- name: vast_vms_credentials
workspaceKey: AUTH DETAILS NOT FOUND
{% endif %}
alertConfig:
tasks:
persona: eager-edgar
sessionTTL: 10m
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
apiVersion: runwhen.com/v1
kind: ServiceLevelX
metadata:
name: {{ slx_name }}
labels:
{% include "common-labels.yaml" %}
annotations:
{% include "common-annotations.yaml" %}
spec:
imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/storage/storage.svg
alias: {{ match_resource.name }} VAST Cluster Health
asMeasuredBy: Composite 0-1 score from VMS state, capacity headroom, node health, alarms, and replication.
configProvided:
- name: SLX_PLACEHOLDER
value: SLX_PLACEHOLDER
owners:
- {{ workspace.owner_email }}
statement: VAST cluster {{ match_resource.name }} should remain CLUSTERED with healthy nodes, capacity headroom, and no active alarms.
additionalContext:
{% include "vast_data-hierarchy.yaml" ignore missing %}
qualified_name: "{{ match_resource.qualified_name }}"
vast_vms_endpoint: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint | default('')) }}"
tags:
{% include "vast_data-tags.yaml" ignore missing %}
- name: cloud
value: on-prem
- name: service
value: vast_data
- name: scope
value: cluster
- name: access
value: read-only
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
apiVersion: runwhen.com/v1
kind: Runbook
metadata:
name: {{ slx_name }}
labels:
{% include "common-labels.yaml" %}
annotations:
{% include "common-annotations.yaml" %}
spec:
location: {{ default_location }}
description: Monitor VAST Data cluster-wide health via VMS REST and Prometheus metrics for {{ match_resource.name }}.
codeBundle:
{% if repo_url %}
repoUrl: {{ repo_url }}
{% else %}
repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
{% endif %}
{% if ref %}
ref: {{ ref }}
{% else %}
ref: main
{% endif %}
pathToRobot: codebundles/vast-cluster-health/runbook.robot
configProvided:
- name: VAST_VMS_ENDPOINT
value: "{{ match_resource.vms_endpoint | default(custom.vast_vms_endpoint) }}"
- name: VAST_CLUSTER_NAME
value: "{{ match_resource.name }}"
- name: RESOURCES
value: "{{ custom.resources | default('All') }}"
- name: CAPACITY_THRESHOLD
value: "{{ custom.capacity_threshold | default('85') }}"
- name: CRITICAL_CAPACITY_THRESHOLD
value: "{{ custom.critical_capacity_threshold | default('95') }}"
secretsProvided:
{% if wb_version %}
{% include "vast_data-auth.yaml" ignore missing %}
{% else %}
- name: vast_vms_credentials
workspaceKey: AUTH DETAILS NOT FOUND
{% endif %}
16 changes: 16 additions & 0 deletions codebundles/vast-cluster-health/.test/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Mock scenario fixtures for vast-cluster-health.

Static JSON/Prometheus fixtures used when `VAST_MOCK_FIXTURE_DIR` is set (see `run-mock-scenarios.sh`).

| Scenario | Expected issues | Description |
|----------|-----------------|-------------|
| `healthy` | 0 | CLUSTERED state, capacity below threshold, all nodes healthy |
| `degraded` | 2-10 | DEGRADED vms_state with offline DNode and active alarm |
| `capacity_pressure` | 1-3 | Logical capacity above CAPACITY_THRESHOLD with no hardware faults |

Run:

```bash
cd .test
task
```
23 changes: 23 additions & 0 deletions codebundles/vast-cluster-health/.test/Taskfile.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
version: "3"

tasks:
default:
desc: "Validate structure and run mock scenario tests"
cmds:
- task: validate-structure
- task: test-mock-scenarios

validate-structure:
desc: "Run static checks for required files"
cmds:
- ./validate-vast-bundle-structure.sh

test-mock-scenarios:
desc: "Run task scripts against fixture-backed mock VMS responses"
cmds:
- ./run-mock-scenarios.sh

clean:
desc: "Remove local test outputs"
cmds:
- rm -f ../*_output.json ../*_report.txt perf_analysis.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[
{
"id": 1,
"name": "vast-lab-cluster",
"title": "vast-lab-cluster",
"state": "ONLINE",
"enabled": true,
"physical_space_in_use_percent": 88.5,
"logical_space_in_use_percent": 91.2,
"physical_space_in_use_tb": 250.0,
"logical_space_in_use_tb": 230.0,
"auxiliary_space_in_use_percent": 45.0,
"replication_enabled": true
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[
{"id": 1, "name": "cnode-1", "state": "ACTIVE"}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[
{"id": 1, "name": "dnode-1", "state": "ACTIVE"}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# TYPE vast_alarm_active gauge
vast_alarm_active 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# TYPE vms_state gauge
vms_state 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[
{
"id": 1,
"name": "vast-lab-cluster",
"title": "vast-lab-cluster",
"state": "DEGRADED",
"enabled": true,
"physical_space_in_use_percent": 55.0,
"logical_space_in_use_percent": 52.0,
"auxiliary_space_in_use_percent": 20.0,
"replication_enabled": true
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[
{"id": 1, "name": "cnode-1", "state": "ACTIVE"}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[
{"id": 1, "name": "dnode-1", "state": "OFFLINE"},
{"id": 2, "name": "dnode-2", "state": "ACTIVE"}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# TYPE vast_alarm_active gauge
vast_alarm_active 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# TYPE vms_state gauge
vms_state 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[
{
"id": 1,
"name": "vast-lab-cluster",
"title": "vast-lab-cluster",
"state": "ONLINE",
"enabled": true,
"physical_space_in_use_percent": 42.5,
"logical_space_in_use_percent": 38.0,
"physical_space_in_use_tb": 120.5,
"logical_space_in_use_tb": 95.2,
"auxiliary_space_in_use_percent": 12.0,
"replication_enabled": true
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[
{"id": 1, "name": "cnode-1", "state": "ACTIVE"},
{"id": 2, "name": "cnode-2", "state": "ACTIVE"}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[
{"id": 1, "name": "dnode-1", "state": "ACTIVE"},
{"id": 2, "name": "dnode-2", "state": "ACTIVE"}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"state": "CLUSTERED"}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# TYPE vast_alarm_active gauge
vast_alarm_active 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# TYPE vms_state gauge
vms_state 1
49 changes: 49 additions & 0 deletions codebundles/vast-cluster-health/.test/run-mock-scenarios.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Mock scenario runner: executes the codebundle's check scripts against
# fixture directories and verifies the number of reported issues.
# Abort on any unhandled error, unset variable, or failed pipeline stage.
set -euo pipefail

# ROOT is the parent of the directory containing this script; all scripts
# below are invoked relative to it.
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$ROOT"

# Fixed mock inputs exported to the environment of the scripts run below.
# The endpoint and credentials are placeholder values, not real secrets.
export VAST_VMS_ENDPOINT="https://vms.mock.local"
export VAST_CLUSTER_NAME="vast-lab-cluster"
export CAPACITY_THRESHOLD="85"
export CRITICAL_CAPACITY_THRESHOLD="95"
export VAST_VMS_CREDENTIALS_JSON='{"USERNAME":"admin","PASSWORD":"mock"}'

#######################################
# Run every check script against one fixture set and verify that the total
# number of reported issues lies within the expected bounds.
# Globals:
#   ROOT                   (read)     codebundle root; scripts run from the cwd
#   VAST_MOCK_FIXTURE_DIR  (exported) set to the scenario's fixture directory
# Arguments:
#   $1 - scenario name (sub-directory of $ROOT/.test/fixtures)
#   $2 - minimum acceptable total issue count (default 0)
#   $3 - maximum acceptable total issue count (default 999)
# Outputs:
#   Progress and SLI output on stdout; failure reasons on stderr.
# Returns:
#   0 when the scenario is within bounds; exits the shell with 1 otherwise.
#######################################
run_scenario() {
  local name="${1:?scenario name required}"
  local fixture_dir="$ROOT/.test/fixtures/${name}"
  local expected_min="${2:-0}"
  local expected_max="${3:-999}"
  # Keep all working variables function-local so scenarios cannot affect
  # each other or the calling script.
  local total_issues=0
  local f count sli_json
  local -a outputs=(
    vms_cluster_health_output.json
    cluster_capacity_output.json
    node_hardware_health_output.json
    degraded_components_output.json
    replication_status_output.json
  )

  echo "=== Scenario: ${name} ==="

  # A mistyped scenario name must fail loudly rather than run the checks
  # against a non-existent fixture directory.
  if [[ ! -d "$fixture_dir" ]]; then
    echo "Scenario ${name} FAILED: fixture directory not found: ${fixture_dir}" >&2
    exit 1
  fi
  export VAST_MOCK_FIXTURE_DIR="$fixture_dir"

  # Drop outputs from a previous scenario so stale results are never counted.
  # The ./ prefix keeps a filename starting with '-' from being read as an option.
  rm -f -- ./*_output.json
  ./check-vms-cluster-health.sh >/dev/null
  ./check-cluster-capacity.sh >/dev/null
  ./check-node-hardware-health.sh >/dev/null
  ./check-degraded-components.sh >/dev/null
  ./check-replication-status.sh >/dev/null

  for f in "${outputs[@]}"; do
    if [[ ! -f "$f" ]]; then
      echo "Scenario ${name} FAILED: expected output file not produced: ${f}" >&2
      exit 1
    fi
    # Assign separately from 'local' so a jq failure is not masked.
    count="$(jq 'length' "$f")" || {
      echo "Scenario ${name} FAILED: could not parse ${f}" >&2
      exit 1
    }
    total_issues=$((total_issues + count))
  done

  echo "Total issues: ${total_issues} (expected between ${expected_min} and ${expected_max})"
  if (( total_issues < expected_min || total_issues > expected_max )); then
    echo "Scenario ${name} FAILED" >&2
    exit 1
  fi

  # The SLI output is printed for inspection only; it is not asserted on.
  sli_json="$(./sli-vast-cluster-health-score.sh)"
  echo "SLI scores: ${sli_json}"
}

# Arguments per call: scenario name, minimum and maximum acceptable total
# issue count summed across all check-script outputs.
run_scenario healthy 0 0
run_scenario degraded 2 10
run_scenario capacity_pressure 1 3

# Reached only when every scenario stayed within its bounds, because
# run_scenario exits with status 1 on the first failure.
echo "All mock scenarios passed"
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Static structure check for the vast-cluster-health codebundle.
# Verifies that every required file exists relative to the codebundle root
# (the parent of this script's directory). All missing paths are reported
# on stderr before exiting with status 1, so one run shows the full list.
set -euo pipefail

ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$ROOT"

# Paths that must exist, relative to ROOT.
need=(
  runbook.robot
  sli.robot
  README.md
  vast-vms-common.sh
  check-vms-cluster-health.sh
  check-cluster-capacity.sh
  check-node-hardware-health.sh
  check-degraded-components.sh
  analyze-cluster-performance.sh
  check-replication-status.sh
  sli-vast-cluster-health-score.sh
  .runwhen/generation-rules/vast-cluster-health.yaml
  .runwhen/templates/vast-cluster-health-slx.yaml
  .runwhen/templates/vast-cluster-health-taskset.yaml
  .runwhen/templates/vast-cluster-health-sli.yaml
)

# Collect every gap instead of stopping at the first one.
missing=0
for f in "${need[@]}"; do
  if [[ ! -e "$f" ]]; then
    echo "missing: $f" >&2
    missing=$((missing + 1))
  fi
done

if (( missing > 0 )); then
  echo "vast-cluster-health structure check FAILED: ${missing} required path(s) missing" >&2
  exit 1
fi

echo "vast-cluster-health structure OK"
Loading
Loading