runwhen-contrib · rw-codebundle-agent · Jun 25, 2026
@@ -0,0 +1,22 @@
+apiVersion: runwhen.com/v1
+kind: GenerationRules  # one SLX (slx + sli + runbook outputs) per matched vast_data_tenant resource
+spec:
+  platform: vast_data
+  generationRules:
+    - resourceTypes:
+        - vast_data_tenant
+      matchRules:
+        - type: pattern
+          pattern: '.+'  # one or more characters, tested against the properties listed below
+          properties:
+            - name
+          mode: substring
+      slxs:
+        - baseName: vast-tenant-storage
+          qualifiers:
+            - cluster
+            - tenant
+          baseTemplateName: vast-tenant-storage-health
+          levelOfDetail: basic
+          outputItems:
+            - type: slx
+            - type: sli
+            - type: runbook
+              templateName: vast-tenant-storage-health-taskset.yaml  # only the runbook names its template explicitly
@@ -0,0 +1,52 @@
+apiVersion: runwhen.com/v1
+kind: ServiceLevelIndicator  # Jinja-templated SLI manifest; placeholders are filled in at generation time
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  displayUnitsLong: Health Score
+  displayUnitsShort: score
+  locations:
+    - {{default_location}}
+  description: Measures VAST tenant storage health from capacity utilization, QoS wait times, and read/write latency.
+  codeBundle:  # repoUrl/ref use repo_url/ref when truthy, else the rw-cli-codecollection repo on main
+    {% if repo_url %}
+    repoUrl: {{repo_url}}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ref}}
+    {% else %}
+    ref: main
+    {% endif %}
+    pathToRobot: codebundles/vast-tenant-storage-health/sli.robot
+  intervalStrategy: intermezzo
+  intervalSeconds: 300  # five minutes
+  configProvided:  # every value renders as a quoted string; "custom" entries override the defaults below
+    - name: VAST_VMS_ENDPOINT
+      value: "{{ custom.vast_vms_endpoint | default('') }}"  # empty string when no custom endpoint is defined
+    - name: VAST_CLUSTER_NAME
+      value: "{{ custom.vast_cluster_name | default(match_resource.cluster_name) }}"  # falls back to the matched resource's cluster_name
+    - name: VAST_TENANT_NAME
+      value: "{{ match_resource.resource_name }}"
+    - name: CAPACITY_THRESHOLD
+      value: "{{ custom.capacity_threshold | default('85') }}"  # presumably a percentage; confirm against sli.robot
+    - name: QOS_UTILIZATION_THRESHOLD
+      value: "{{ custom.qos_utilization_threshold | default('90') }}"  # presumably a percentage; confirm against sli.robot
+    - name: LATENCY_THRESHOLD_MS
+      value: "{{ custom.latency_threshold_ms | default('10') }}"
+  secretsProvided:  # wb_version truthy: entries come from vast-data-auth.yaml (skipped silently if that template is absent); otherwise a placeholder secret is emitted
+  {% if wb_version %}
+    {% include "vast-data-auth.yaml" ignore missing %}
+  {% else %}
+    - name: vast_vms_credentials
+      workspaceKey: AUTH DETAILS NOT FOUND  # placeholder text, not a real workspace key
+  {% endif %}
+  alertConfig:
+    tasks:
+      persona: eager-edgar
+      sessionTTL: 10m
@@ -0,0 +1,29 @@
+apiVersion: runwhen.com/v1
+kind: ServiceLevelX  # Jinja-templated SLX manifest; placeholders are filled in at generation time
+metadata:
+  name: {{ slx_name }}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/storage/storage.svg
+  alias: {{ match_resource.resource_name }} VAST Tenant Storage Health
+  asMeasuredBy: Combined capacity, QoS, and latency health for VAST tenant {{ match_resource.resource_name }} on cluster {{ custom.vast_cluster_name | default(match_resource.cluster_name) }}.
+  configProvided:
+    - name: VAST_VMS_ENDPOINT
+      value: SLX_PLACEHOLDER  # NOTE(review): literal placeholder, not the custom.vast_vms_endpoint value; confirm this is intended
+  owners:
+    - {{ workspace.owner_email }}
+  statement: VAST tenant storage should remain within quota, QoS limits, and latency thresholds for all client types.
+  additionalContext:
+    qualified_name: "{{ match_resource.qualified_name }}"
+  tags:  # static tags; none of these values are templated
+    - name: platform
+      value: vast_data
+    - name: service
+      value: storage
+    - name: scope
+      value: tenant
+    - name: access
+      value: read-only
@@ -0,0 +1,45 @@
+apiVersion: runwhen.com/v1
+kind: Runbook  # Jinja-templated runbook manifest; placeholders are filled in at generation time
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  location: {{default_location}}
+  description: Monitor VAST tenant capacity, QoS throttling, latency, configuration, and block volume health for {{ match_resource.resource_name }}.
+  codeBundle:  # repoUrl/ref use repo_url/ref when truthy, else the rw-cli-codecollection repo on main
+    {% if repo_url %}
+    repoUrl: {{repo_url}}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ref}}
+    {% else %}
+    ref: main
+    {% endif %}
+    pathToRobot: codebundles/vast-tenant-storage-health/runbook.robot
+  configProvided:  # every value renders as a quoted string; "custom" entries override the defaults below
+    - name: VAST_VMS_ENDPOINT
+      value: "{{ custom.vast_vms_endpoint | default('') }}"  # empty string when no custom endpoint is defined
+    - name: VAST_CLUSTER_NAME
+      value: "{{ custom.vast_cluster_name | default(match_resource.cluster_name) }}"  # falls back to the matched resource's cluster_name
+    - name: VAST_TENANT_NAME
+      value: "{{ match_resource.resource_name }}"
+    - name: TENANTS
+      value: "{{ custom.tenants | default('All') }}"  # NOTE(review): how TENANTS relates to VAST_TENANT_NAME is not visible here; confirm in runbook.robot
+    - name: CAPACITY_THRESHOLD
+      value: "{{ custom.capacity_threshold | default('85') }}"  # presumably a percentage; confirm against runbook.robot
+    - name: QOS_UTILIZATION_THRESHOLD
+      value: "{{ custom.qos_utilization_threshold | default('90') }}"  # presumably a percentage; confirm against runbook.robot
+    - name: LATENCY_THRESHOLD_MS
+      value: "{{ custom.latency_threshold_ms | default('10') }}"
+  secretsProvided:  # wb_version truthy: entries come from vast-data-auth.yaml (skipped silently if that template is absent); otherwise a placeholder secret is emitted
+  {% if wb_version %}
+    {% include "vast-data-auth.yaml" ignore missing %}
+  {% else %}
+    - name: vast_vms_credentials
+      workspaceKey: AUTH DETAILS NOT FOUND  # placeholder text, not a real workspace key
+  {% endif %}
@@ -0,0 +1,20 @@
+# vast-tenant-storage-health test infrastructure
+
+Static validation and mock VMS scenario tests run without a live VAST cluster.
+
+## Tasks
+
+```bash
+cd .test
+task
+```
+
+## Scenarios
+
+| Scenario | Description | Expected issues |
+|----------|-------------|-----------------|
+| `healthy_tenant` | Tenant under quota with normal IO and latency | 0 |
+| `full_view` | View at 98% logical capacity | 1+ |
+| `qos_throttled` | Sustained QoS wait times and IOPS near limits | 1+ |
+
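+Mock responses live under `mock-vms/responses/<scenario>/` (the `healthy_tenant` scenario reads from the `healthy/` directory). Any file missing from a scenario directory falls back to the copy in `healthy/`.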
@@ -0,0 +1,24 @@
+version: '3'
+
+tasks:
+  default:
+    desc: Validate CodeBundle structure and run mock scenario tests
+    cmds:
+      - task: validate-structure
+      - task: test-scenarios
+
+  validate-structure:
+    desc: Run static checks for required files
+    cmds:
+      - ./validate-vast-bundle-structure.sh
+
+  test-scenarios:
+    desc: Run task scripts against the local mock VMS server
+    cmds:
+      - ./run-mock-scenario-tests.sh
+
+  clean:
+    desc: Remove local test outputs and stop mock server
+    cmds:
+      - rm -rf output workspaceInfo.yaml
+      - 'pkill -f "mock-vms-server.py" || true'  # "|| true" keeps the task green when no server is running
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""Minimal mock VMS HTTP server for vast-tenant-storage-health scenario tests."""
+
+from __future__ import annotations
+
+import json
+import re
+import threading
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parent
+RESP = ROOT / "responses"
+
+SCENARIOS = {
+    "healthy_tenant": "healthy",
+    "full_view": "full_view",
+    "qos_throttled": "qos_throttled",
+}
+
+
+class Handler(BaseHTTPRequestHandler):
+    scenario = "healthy"
+
+    def _auth_ok(self) -> bool:
+        auth = self.headers.get("Authorization", "")
+        if auth.startswith("Bearer "):
+            return True
+        if self.headers.get("Authorization") or self.headers.get("authorization"):
+            return True
+        # Basic auth via urllib is not always exposed; accept any request in tests.
+        return True
+
+    def _read(self, name: str) -> bytes:
+        path = RESP / self.scenario / name
+        if not path.exists():
+            path = RESP / "healthy" / name
+        return path.read_bytes()
+
+    def do_GET(self) -> None:  # noqa: N802
+        if not self._auth_ok():
+            self.send_response(401)
+            self.end_headers()
+            return
+
+        if self.path.startswith("/api/prometheusmetrics/"):
+            metric = self.path.rstrip("/").split("/")[-1]
+            body = self._read(f"prometheus-{metric}.txt")
+            self.send_response(200)
+            self.send_header("Content-Type", "text/plain")
+            self.end_headers()
+            self.wfile.write(body)
+            return
+
+        if self.path.startswith("/api/tenants"):
+            body = self._read("tenants.json")
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(body)
+            return
+
+        if self.path.startswith("/api/views"):
+            body = self._read("views.json")
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(body)
+            return
+
+        if self.path.startswith("/api/quotas"):
+            body = self._read("quotas.json")
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(body)
+            return
+
+        self.send_response(404)
+        self.end_headers()
+
+    def log_message(self, format: str, *args) -> None:  # noqa: A003
+        return
+
+
+def main() -> None:
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--port", type=int, default=18080)
+    parser.add_argument("--scenario", default="healthy")
+    args = parser.parse_args()
+
+    Handler.scenario = SCENARIOS.get(args.scenario, args.scenario)
+    server = HTTPServer(("127.0.0.1", args.port), Handler)
+    thread = threading.Thread(target=server.serve_forever, daemon=True)
+    thread.start()
+    print(f"mock-vms-server listening on http://127.0.0.1:{args.port} scenario={Handler.scenario}")
+    thread.join()
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,3 @@
+# TYPE vast_quota_used_capacity gauge
+vast_quota_used_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 98000000000
+vast_quota_hard_limit{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 100000000000
@@ -0,0 +1 @@
+vast_tenant_metrics_TenantMetrics_read_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 2.0
@@ -0,0 +1,6 @@
+# TYPE vast_view_logical_capacity gauge
+vast_view_logical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 98000000000
+vast_view_physical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 45000000000
+# TYPE vast_quota_used_capacity gauge
+vast_quota_used_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 98000000000
+vast_quota_hard_limit{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 100000000000
@@ -0,0 +1,13 @@
+[
+  {
+    "name": "demo-tenant",
+    "cluster_name": "prod-cluster",
+    "enabled": true,
+    "qos": {
+      "read_iops": 5000,
+      "write_iops": 5000,
+      "read_bw": 1000000000,
+      "write_bw": 1000000000
+    }
+  }
+]
@@ -0,0 +1 @@
+{"results": []}
@@ -0,0 +1,7 @@
+# TYPE vast_quota_used_capacity gauge
+vast_quota_used_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 70000000000
+# TYPE vast_quota_hard_limit gauge
+vast_quota_hard_limit{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 100000000000
+# TYPE vast_tenant_metrics_TenantMetrics_read_latency gauge
+vast_tenant_metrics_TenantMetrics_read_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 2.5
+vast_tenant_metrics_TenantMetrics_write_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 3.1
@@ -0,0 +1,5 @@
+# TYPE vast_tenant_metrics_TenantMetrics_read_latency gauge
+vast_tenant_metrics_TenantMetrics_read_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 2.5
+vast_tenant_metrics_TenantMetrics_write_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 3.1
+vast_tenant_metrics_TenantMetrics_read_iops_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 1200
+vast_tenant_metrics_TenantMetrics_write_iops_count{cluster="prod-cluster",tenant_name="demo-tenant"} 800
@@ -0,0 +1,4 @@
+# TYPE vast_view_logical_capacity gauge
+vast_view_logical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 50000000000
+vast_view_physical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 20000000000
+vast_view_metrics_ViewMetrics_qos_wait_for_budget_time_sum{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 0
@@ -0,0 +1 @@
+{"results": []}
@@ -0,0 +1,13 @@
+[
+  {
+    "name": "demo-tenant",
+    "cluster_name": "prod-cluster",
+    "enabled": true,
+    "qos": {
+      "read_iops": 5000,
+      "write_iops": 5000,
+      "read_bw": 1000000000,
+      "write_bw": 1000000000
+    }
+  }
+]
@@ -0,0 +1 @@
+{"results": []}
@@ -0,0 +1,2 @@
+vast_tenant_metrics_TenantMetrics_read_iops_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 4950
+vast_tenant_metrics_TenantMetrics_write_iops_count{cluster="prod-cluster",tenant_name="demo-tenant"} 4900
@@ -0,0 +1,2 @@
+vast_view_metrics_ViewMetrics_qos_wait_for_budget_time_sum{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 1500
+vast_user_view_read_md_iops{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 950
@@ -0,0 +1 @@
+{"results": []}
@@ -0,0 +1,13 @@
+[
+  {
+    "name": "demo-tenant",
+    "cluster_name": "prod-cluster",
+    "enabled": true,
+    "qos": {
+      "read_iops": 5000,
+      "write_iops": 5000,
+      "read_bw": 1000000000,
+      "write_bw": 1000000000
+    }
+  }
+]
@@ -0,0 +1 @@
+{"results": []}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		vast_tenant_metrics_TenantMetrics_read_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 2.0
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		vast_tenant_metrics_TenantMetrics_read_iops_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 4950
		vast_tenant_metrics_TenantMetrics_write_iops_count{cluster="prod-cluster",tenant_name="demo-tenant"} 4900
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		vast_view_metrics_ViewMetrics_qos_wait_for_budget_time_sum{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 1500
		vast_user_view_read_md_iops{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 950