Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generation rules for the vast_data platform: each matched VAST tenant
# resource yields one tenant storage health SLX with an SLI and a runbook.
apiVersion: runwhen.com/v1
kind: GenerationRules
spec:
  platform: vast_data
  generationRules:
    - resourceTypes:
        - vast_data_tenant
      matchRules:
        # The pattern is applied to the resource's "name" property; ".+"
        # accepts any non-empty name.
        - type: pattern
          pattern: '.+'
          properties:
            - name
          mode: substring
      slxs:
        - baseName: vast-tenant-storage
          qualifiers:
            - cluster
            - tenant
          baseTemplateName: vast-tenant-storage-health
          levelOfDetail: basic
          outputItems:
            - type: slx
            - type: sli
            # The runbook is rendered from the taskset template rather than a
            # name derived from baseTemplateName.
            - type: runbook
              templateName: vast-tenant-storage-health-taskset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# SLI template: runs sli.robot from the vast-tenant-storage-health codebundle
# every 300 seconds for a single VAST tenant.
apiVersion: runwhen.com/v1
kind: ServiceLevelIndicator
metadata:
  name: {{slx_name}}
  labels:
    {% include "common-labels.yaml" %}
  annotations:
    {% include "common-annotations.yaml" %}
spec:
  displayUnitsLong: Health Score
  displayUnitsShort: score
  locations:
    - {{default_location}}
  description: Measures VAST tenant storage health from capacity utilization, QoS wait times, and read/write latency.
  codeBundle:
    {% if repo_url %}
    repoUrl: {{repo_url}}
    {% else %}
    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
    {% endif %}
    # The ref is quoted so an all-digit commit SHA or a version-like tag
    # (for example 1.10) is loaded as a string instead of a number.
    {% if ref %}
    ref: "{{ref}}"
    {% else %}
    ref: main
    {% endif %}
    pathToRobot: codebundles/vast-tenant-storage-health/sli.robot
  intervalStrategy: intermezzo
  intervalSeconds: 300
  # Values come from the workspace "custom" settings when present; the
  # thresholds otherwise default to 85, 90 and 10 respectively.
  configProvided:
    - name: VAST_VMS_ENDPOINT
      value: "{{ custom.vast_vms_endpoint | default('') }}"
    - name: VAST_CLUSTER_NAME
      value: "{{ custom.vast_cluster_name | default(match_resource.cluster_name) }}"
    - name: VAST_TENANT_NAME
      value: "{{ match_resource.resource_name }}"
    - name: CAPACITY_THRESHOLD
      value: "{{ custom.capacity_threshold | default('85') }}"
    - name: QOS_UTILIZATION_THRESHOLD
      value: "{{ custom.qos_utilization_threshold | default('90') }}"
    - name: LATENCY_THRESHOLD_MS
      value: "{{ custom.latency_threshold_ms | default('10') }}"
  # With wb_version set, the secret entries come from the shared auth partial
  # (skipped silently if that partial is absent); otherwise a placeholder
  # entry is emitted.
  # NOTE(review): the indentation of the include lines is assumed to match
  # the shared partials - confirm against the other templates in this repo.
  secretsProvided:
  {% if wb_version %}
    {% include "vast-data-auth.yaml" ignore missing %}
  {% else %}
    - name: vast_vms_credentials
      workspaceKey: AUTH DETAILS NOT FOUND
  {% endif %}
  alertConfig:
    tasks:
      persona: eager-edgar
      sessionTTL: 10m
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# SLX template: the top-level record for one VAST tenant's storage health.
apiVersion: runwhen.com/v1
kind: ServiceLevelX
metadata:
  name: {{ slx_name }}
  labels:
    {% include "common-labels.yaml" %}
  annotations:
    {% include "common-annotations.yaml" %}
spec:
  imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/storage/storage.svg
  # alias and asMeasuredBy embed the tenant and cluster names verbatim, so
  # they are quoted: a name containing ": " or " #", or starting with a YAML
  # indicator character, would otherwise break or truncate the value.
  alias: "{{ match_resource.resource_name }} VAST Tenant Storage Health"
  asMeasuredBy: "Combined capacity, QoS, and latency health for VAST tenant {{ match_resource.resource_name }} on cluster {{ custom.vast_cluster_name | default(match_resource.cluster_name) }}."
  configProvided:
    - name: VAST_VMS_ENDPOINT
      value: SLX_PLACEHOLDER
  owners:
    - {{ workspace.owner_email }}
  statement: VAST tenant storage should remain within quota, QoS limits, and latency thresholds for all client types.
  additionalContext:
    qualified_name: "{{ match_resource.qualified_name }}"
  tags:
    - name: platform
      value: vast_data
    - name: service
      value: storage
    - name: scope
      value: tenant
    - name: access
      value: read-only
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Runbook template: runs runbook.robot from the vast-tenant-storage-health
# codebundle for a single VAST tenant.
apiVersion: runwhen.com/v1
kind: Runbook
metadata:
  name: {{slx_name}}
  labels:
    {% include "common-labels.yaml" %}
  annotations:
    {% include "common-annotations.yaml" %}
spec:
  location: {{default_location}}
  # Quoted because the tenant name is substituted verbatim; a name containing
  # ": " or " #" would otherwise break or truncate the description.
  description: "Monitor VAST tenant capacity, QoS throttling, latency, configuration, and block volume health for {{ match_resource.resource_name }}."
  codeBundle:
    {% if repo_url %}
    repoUrl: {{repo_url}}
    {% else %}
    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
    {% endif %}
    # The ref is quoted so an all-digit commit SHA or a version-like tag
    # (for example 1.10) is loaded as a string instead of a number.
    {% if ref %}
    ref: "{{ref}}"
    {% else %}
    ref: main
    {% endif %}
    pathToRobot: codebundles/vast-tenant-storage-health/runbook.robot
  # Values come from the workspace "custom" settings when present; TENANTS
  # defaults to All and the thresholds to 85, 90 and 10 respectively.
  configProvided:
    - name: VAST_VMS_ENDPOINT
      value: "{{ custom.vast_vms_endpoint | default('') }}"
    - name: VAST_CLUSTER_NAME
      value: "{{ custom.vast_cluster_name | default(match_resource.cluster_name) }}"
    - name: VAST_TENANT_NAME
      value: "{{ match_resource.resource_name }}"
    - name: TENANTS
      value: "{{ custom.tenants | default('All') }}"
    - name: CAPACITY_THRESHOLD
      value: "{{ custom.capacity_threshold | default('85') }}"
    - name: QOS_UTILIZATION_THRESHOLD
      value: "{{ custom.qos_utilization_threshold | default('90') }}"
    - name: LATENCY_THRESHOLD_MS
      value: "{{ custom.latency_threshold_ms | default('10') }}"
  # With wb_version set, the secret entries come from the shared auth partial
  # (skipped silently if that partial is absent); otherwise a placeholder
  # entry is emitted.
  # NOTE(review): the indentation of the include lines is assumed to match
  # the shared partials - confirm against the other templates in this repo.
  secretsProvided:
  {% if wb_version %}
    {% include "vast-data-auth.yaml" ignore missing %}
  {% else %}
    - name: vast_vms_credentials
      workspaceKey: AUTH DETAILS NOT FOUND
  {% endif %}
20 changes: 20 additions & 0 deletions codebundles/vast-tenant-storage-health/.test/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# vast-tenant-storage-health test infrastructure

Static validation and mock VMS scenario tests run without a live VAST cluster.

## Tasks

```bash
cd .test
task
```

## Scenarios

| Scenario | Description | Expected issues |
|----------|-------------|-----------------|
| `healthy_tenant` | Tenant under quota with normal IO and latency | 0 |
| `full_view` | View at 98% logical capacity | 1+ |
| `qos_throttled` | Sustained QoS wait times and IOPS near limits | 1+ |

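Mock responses live under `mock-vms/responses/<directory>/`. The `healthy_tenant` scenario reads from `healthy/`; the other scenarios use a directory named after the scenario. Any file missing from a scenario directory falls back to the copy in `healthy/`.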
24 changes: 24 additions & 0 deletions codebundles/vast-tenant-storage-health/.test/Taskfile.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Taskfile for the vast-tenant-storage-health test harness.
version: '3'

tasks:
  # Running `task` with no arguments performs the static checks first and
  # then the mock scenario tests.
  default:
    desc: 'Validate CodeBundle structure and run mock scenario tests'
    cmds:
      - task: validate-structure
      - task: test-scenarios

  validate-structure:
    desc: 'Run static checks for required files'
    cmds:
      - ./validate-vast-bundle-structure.sh

  test-scenarios:
    desc: 'Run task scripts against the local mock VMS server'
    cmds:
      - ./run-mock-scenario-tests.sh

  clean:
    desc: 'Remove local test outputs and stop mock server'
    cmds:
      - rm -rf output workspaceInfo.yaml
      # "|| true" keeps the task green when no mock server is running.
      - pkill -f "mock-vms-server.py" || true
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""Minimal mock VMS HTTP server for vast-tenant-storage-health scenario tests."""

from __future__ import annotations

import json
import re
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path

# Directory that contains this script; fixture paths are resolved against it.
ROOT = Path(__file__).resolve().parent
# Parent folder of the per-scenario fixture directories.
RESP = ROOT.joinpath("responses")

# Scenario name accepted on the command line -> fixture directory under RESP.
SCENARIOS = dict(
    healthy_tenant="healthy",
    full_view="full_view",
    qos_throttled="qos_throttled",
)


class Handler(BaseHTTPRequestHandler):
    """Serve canned VMS responses for the active mock scenario.

    GET requests are mapped to fixture files under ``RESP/<scenario>/``. A
    fixture that is missing from the active scenario falls back to the copy in
    the ``healthy`` scenario. Unknown endpoints, and fixtures missing from
    both locations, are answered with 404.
    """

    # Fixture directory name under RESP; main() sets this before serving.
    scenario = "healthy"

    # JSON endpoints as (path prefix, fixture file), matched in this order.
    _JSON_ROUTES = (
        ("/api/tenants", "tenants.json"),
        ("/api/views", "views.json"),
        ("/api/quotas", "quotas.json"),
    )

    def _auth_ok(self) -> bool:
        """Return True for every request.

        Credentials are deliberately not validated: the mock accepts any
        request in tests, with or without an Authorization header.
        """
        return True

    @classmethod
    def _resolve(cls, raw_path: str):
        """Map a request target to ``(fixture file name, content type)``.

        The query string is dropped before matching, so a request such as
        ``/api/prometheusmetrics/views?tenant=x`` still selects
        ``prometheus-views.txt``.

        Returns:
            A ``(name, content_type)`` tuple, or None when no endpoint matches.
        """
        route = raw_path.partition("?")[0]
        if route.startswith("/api/prometheusmetrics/"):
            # The last path segment names the metric family being requested.
            metric = route.rstrip("/").split("/")[-1]
            return f"prometheus-{metric}.txt", "text/plain"
        for prefix, fixture in cls._JSON_ROUTES:
            if route.startswith(prefix):
                return fixture, "application/json"
        return None

    def _read(self, name: str) -> bytes:
        """Return the bytes of fixture ``name`` for the active scenario.

        Falls back to the ``healthy`` scenario when the active scenario has no
        such file.

        Raises:
            OSError: if the fixture cannot be read from either location.
        """
        path = RESP / self.scenario / name
        if not path.exists():
            path = RESP / "healthy" / name
        return path.read_bytes()

    def _send(self, status: int, body: bytes = b"", content_type: str = "") -> None:
        """Write a complete response with the given status, body and type."""
        self.send_response(status)
        if content_type:
            self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        if body:
            self.wfile.write(body)

    def do_GET(self) -> None:  # noqa: N802
        """Answer a GET with the matching fixture, or 401/404 as appropriate."""
        if not self._auth_ok():
            self._send(401)
            return

        target = self._resolve(self.path)
        if target is None:
            self._send(404)
            return

        name, content_type = target
        try:
            body = self._read(name)
        except OSError:
            # A missing fixture is reported to the client instead of letting
            # the exception abort the connection without any response.
            self._send(404)
            return
        self._send(200, body, content_type)

    def log_message(self, format: str, *args) -> None:  # noqa: A003
        """Suppress the default per-request logging."""
        return


def main() -> None:
    """Parse the command line and serve the chosen scenario until stopped.

    Options:
        --port      TCP port to bind on 127.0.0.1 (default 18080).
        --scenario  Scenario name; names listed in SCENARIOS are translated to
                    their fixture directory, any other value is used as the
                    directory name unchanged (default "healthy").
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, default=18080)
    parser.add_argument("--scenario", default="healthy")
    args = parser.parse_args()

    Handler.scenario = SCENARIOS.get(args.scenario, args.scenario)
    server = HTTPServer(("127.0.0.1", args.port), Handler)
    # The socket is already bound and listening once HTTPServer is constructed.
    # flush=True makes the banner visible immediately when stdout is piped,
    # e.g. to a test harness waiting for the server to come up.
    print(
        f"mock-vms-server listening on http://127.0.0.1:{args.port} scenario={Handler.scenario}",
        flush=True,
    )
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop a foreground run; exit quietly.
        pass
    finally:
        server.server_close()


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# TYPE vast_quota_used_capacity gauge
vast_quota_used_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 98000000000
vast_quota_hard_limit{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 100000000000
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
vast_tenant_metrics_TenantMetrics_read_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 2.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# TYPE vast_view_logical_capacity gauge
vast_view_logical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 98000000000
vast_view_physical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 45000000000
# TYPE vast_quota_used_capacity gauge
vast_quota_used_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 98000000000
vast_quota_hard_limit{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 100000000000
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[
{
"name": "demo-tenant",
"cluster_name": "prod-cluster",
"enabled": true,
"qos": {
"read_iops": 5000,
"write_iops": 5000,
"read_bw": 1000000000,
"write_bw": 1000000000
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"results": []}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# TYPE vast_quota_used_capacity gauge
vast_quota_used_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 70000000000
# TYPE vast_quota_hard_limit gauge
vast_quota_hard_limit{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 100000000000
# TYPE vast_tenant_metrics_TenantMetrics_read_latency_avg gauge
vast_tenant_metrics_TenantMetrics_read_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 2.5
vast_tenant_metrics_TenantMetrics_write_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 3.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# TYPE vast_tenant_metrics_TenantMetrics_read_latency_avg gauge
vast_tenant_metrics_TenantMetrics_read_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 2.5
vast_tenant_metrics_TenantMetrics_write_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 3.1
vast_tenant_metrics_TenantMetrics_read_iops_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 1200
vast_tenant_metrics_TenantMetrics_write_iops_count{cluster="prod-cluster",tenant_name="demo-tenant"} 800
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TYPE vast_view_logical_capacity gauge
vast_view_logical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 50000000000
vast_view_physical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 20000000000
vast_view_metrics_ViewMetrics_qos_wait_for_budget_time_sum{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"results": []}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[
{
"name": "demo-tenant",
"cluster_name": "prod-cluster",
"enabled": true,
"qos": {
"read_iops": 5000,
"write_iops": 5000,
"read_bw": 1000000000,
"write_bw": 1000000000
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"results": []}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
vast_tenant_metrics_TenantMetrics_read_iops_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 4950
vast_tenant_metrics_TenantMetrics_write_iops_count{cluster="prod-cluster",tenant_name="demo-tenant"} 4900
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
vast_view_metrics_ViewMetrics_qos_wait_for_budget_time_sum{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 1500
vast_user_view_read_md_iops{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 950
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"results": []}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[
{
"name": "demo-tenant",
"cluster_name": "prod-cluster",
"enabled": true,
"qos": {
"read_iops": 5000,
"write_iops": 5000,
"read_bw": 1000000000,
"write_bw": 1000000000
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"results": []}
Loading
Loading