From f8eb9c3e3e0c14917c728ebc78aa7bf49a8cab87 Mon Sep 17 00:00:00 2001 From: "rw-codebundle-agent[bot]" Date: Thu, 25 Jun 2026 15:41:41 +0000 Subject: [PATCH] [Creator] [design-spec] vast-tenant-storage-health (runwhen-contrib/codecollection-registry#130) --- .../vast-tenant-storage-health.yaml | 22 ++ .../vast-tenant-storage-health-sli.yaml | 52 +++ .../vast-tenant-storage-health-slx.yaml | 29 ++ .../vast-tenant-storage-health-taskset.yaml | 45 +++ .../.test/README.md | 20 + .../.test/Taskfile.yaml | 24 ++ .../.test/mock-vms/mock-vms-server.py | 103 +++++ .../responses/full_view/prometheus-quotas.txt | 3 + .../full_view/prometheus-tenants.txt | 1 + .../responses/full_view/prometheus-views.txt | 6 + .../mock-vms/responses/full_view/tenants.json | 13 + .../mock-vms/responses/full_view/views.json | 1 + .../responses/healthy/prometheus-quotas.txt | 7 + .../responses/healthy/prometheus-tenants.txt | 5 + .../responses/healthy/prometheus-views.txt | 4 + .../mock-vms/responses/healthy/quotas.json | 1 + .../mock-vms/responses/healthy/tenants.json | 13 + .../mock-vms/responses/healthy/views.json | 1 + .../qos_throttled/prometheus-tenants.txt | 2 + .../qos_throttled/prometheus-views.txt | 2 + .../responses/qos_throttled/quotas.json | 1 + .../responses/qos_throttled/tenants.json | 13 + .../responses/qos_throttled/views.json | 1 + .../.test/run-mock-scenario-tests.sh | 68 ++++ .../.test/validate-vast-bundle-structure.sh | 31 ++ .../vast-tenant-storage-health/README.md | 102 +++++ .../analyze-tenant-latency.sh | 118 ++++++ .../analyze-tenant-qos.sh | 132 +++++++ .../check-block-volume-health.sh | 134 +++++++ .../check-qos-wait-times.sh | 130 +++++++ .../check-tenant-capacity.sh | 111 ++++++ .../check-tenant-config.sh | 125 +++++++ .../check-view-capacity.sh | 139 +++++++ .../discover-vast-tenants.sh | 61 +++ .../qos_wait_issues.json | 8 + .../vast-tenant-storage-health/runbook.robot | 352 ++++++++++++++++++ .../sli-vast-capacity-score.sh | 38 ++ .../sli-vast-latency-score.sh | 
52 +++ .../sli-vast-qos-score.sh | 33 ++ .../vast-tenant-storage-health/sli.robot | 132 +++++++ .../tenant_qos_issues.json | 1 + .../vast-vms-helpers.sh | 265 +++++++++++++ 42 files changed, 2401 insertions(+) create mode 100644 codebundles/vast-tenant-storage-health/.runwhen/generation-rules/vast-tenant-storage-health.yaml create mode 100644 codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-sli.yaml create mode 100644 codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-slx.yaml create mode 100644 codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-taskset.yaml create mode 100644 codebundles/vast-tenant-storage-health/.test/README.md create mode 100644 codebundles/vast-tenant-storage-health/.test/Taskfile.yaml create mode 100755 codebundles/vast-tenant-storage-health/.test/mock-vms/mock-vms-server.py create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-quotas.txt create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-tenants.txt create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-views.txt create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/tenants.json create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/views.json create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-quotas.txt create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-tenants.txt create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-views.txt create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/quotas.json create mode 100644 
codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/tenants.json create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/views.json create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/prometheus-tenants.txt create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/prometheus-views.txt create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/quotas.json create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/tenants.json create mode 100644 codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/views.json create mode 100755 codebundles/vast-tenant-storage-health/.test/run-mock-scenario-tests.sh create mode 100755 codebundles/vast-tenant-storage-health/.test/validate-vast-bundle-structure.sh create mode 100644 codebundles/vast-tenant-storage-health/README.md create mode 100755 codebundles/vast-tenant-storage-health/analyze-tenant-latency.sh create mode 100755 codebundles/vast-tenant-storage-health/analyze-tenant-qos.sh create mode 100755 codebundles/vast-tenant-storage-health/check-block-volume-health.sh create mode 100755 codebundles/vast-tenant-storage-health/check-qos-wait-times.sh create mode 100755 codebundles/vast-tenant-storage-health/check-tenant-capacity.sh create mode 100755 codebundles/vast-tenant-storage-health/check-tenant-config.sh create mode 100755 codebundles/vast-tenant-storage-health/check-view-capacity.sh create mode 100755 codebundles/vast-tenant-storage-health/discover-vast-tenants.sh create mode 100644 codebundles/vast-tenant-storage-health/qos_wait_issues.json create mode 100644 codebundles/vast-tenant-storage-health/runbook.robot create mode 100755 codebundles/vast-tenant-storage-health/sli-vast-capacity-score.sh create mode 100755 codebundles/vast-tenant-storage-health/sli-vast-latency-score.sh create 
mode 100755 codebundles/vast-tenant-storage-health/sli-vast-qos-score.sh create mode 100644 codebundles/vast-tenant-storage-health/sli.robot create mode 100644 codebundles/vast-tenant-storage-health/tenant_qos_issues.json create mode 100755 codebundles/vast-tenant-storage-health/vast-vms-helpers.sh diff --git a/codebundles/vast-tenant-storage-health/.runwhen/generation-rules/vast-tenant-storage-health.yaml b/codebundles/vast-tenant-storage-health/.runwhen/generation-rules/vast-tenant-storage-health.yaml new file mode 100644 index 000000000..72efaabc8 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.runwhen/generation-rules/vast-tenant-storage-health.yaml @@ -0,0 +1,22 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + platform: vast_data + generationRules: + - resourceTypes: + - vast_data_tenant + matchRules: + - type: pattern + pattern: ".+" + properties: ["name"] + mode: substring + slxs: + - baseName: vast-tenant-storage + qualifiers: ["cluster", "tenant"] + baseTemplateName: vast-tenant-storage-health + levelOfDetail: basic + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: vast-tenant-storage-health-taskset.yaml diff --git a/codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-sli.yaml b/codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-sli.yaml new file mode 100644 index 000000000..6fcbfce5d --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-sli.yaml @@ -0,0 +1,52 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + displayUnitsLong: Health Score + displayUnitsShort: score + locations: + - {{default_location}} + description: Measures VAST tenant storage health from capacity utilization, QoS wait times, and read/write latency. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/vast-tenant-storage-health/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 300 + configProvided: + - name: VAST_VMS_ENDPOINT + value: "{{ custom.vast_vms_endpoint | default('') }}" + - name: VAST_CLUSTER_NAME + value: "{{ custom.vast_cluster_name | default(match_resource.cluster_name) }}" + - name: VAST_TENANT_NAME + value: "{{ match_resource.resource_name }}" + - name: CAPACITY_THRESHOLD + value: "{{ custom.capacity_threshold | default('85') }}" + - name: QOS_UTILIZATION_THRESHOLD + value: "{{ custom.qos_utilization_threshold | default('90') }}" + - name: LATENCY_THRESHOLD_MS + value: "{{ custom.latency_threshold_ms | default('10') }}" + secretsProvided: + {% if wb_version %} + {% include "vast-data-auth.yaml" ignore missing %} + {% else %} + - name: vast_vms_credentials + workspaceKey: AUTH DETAILS NOT FOUND + {% endif %} + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m diff --git a/codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-slx.yaml b/codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-slx.yaml new file mode 100644 index 000000000..18630c493 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-slx.yaml @@ -0,0 +1,29 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{ slx_name }} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/storage/storage.svg + alias: {{ match_resource.resource_name }} VAST Tenant Storage Health + asMeasuredBy: Combined capacity, QoS, and latency health for VAST tenant {{ 
match_resource.resource_name }} on cluster {{ custom.vast_cluster_name | default(match_resource.cluster_name) }}. + configProvided: + - name: VAST_VMS_ENDPOINT + value: SLX_PLACEHOLDER + owners: + - {{ workspace.owner_email }} + statement: VAST tenant storage should remain within quota, QoS limits, and latency thresholds for all client types. + additionalContext: + qualified_name: "{{ match_resource.qualified_name }}" + tags: + - name: platform + value: vast_data + - name: service + value: storage + - name: scope + value: tenant + - name: access + value: read-only diff --git a/codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-taskset.yaml b/codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-taskset.yaml new file mode 100644 index 000000000..9195de9b4 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.runwhen/templates/vast-tenant-storage-health-taskset.yaml @@ -0,0 +1,45 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Monitor VAST tenant capacity, QoS throttling, latency, configuration, and block volume health for {{ match_resource.resource_name }}. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/vast-tenant-storage-health/runbook.robot + configProvided: + - name: VAST_VMS_ENDPOINT + value: "{{ custom.vast_vms_endpoint | default('') }}" + - name: VAST_CLUSTER_NAME + value: "{{ custom.vast_cluster_name | default(match_resource.cluster_name) }}" + - name: VAST_TENANT_NAME + value: "{{ match_resource.resource_name }}" + - name: TENANTS + value: "{{ custom.tenants | default('All') }}" + - name: CAPACITY_THRESHOLD + value: "{{ custom.capacity_threshold | default('85') }}" + - name: QOS_UTILIZATION_THRESHOLD + value: "{{ custom.qos_utilization_threshold | default('90') }}" + - name: LATENCY_THRESHOLD_MS + value: "{{ custom.latency_threshold_ms | default('10') }}" + secretsProvided: + {% if wb_version %} + {% include "vast-data-auth.yaml" ignore missing %} + {% else %} + - name: vast_vms_credentials + workspaceKey: AUTH DETAILS NOT FOUND + {% endif %} diff --git a/codebundles/vast-tenant-storage-health/.test/README.md b/codebundles/vast-tenant-storage-health/.test/README.md new file mode 100644 index 000000000..92cc49088 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/README.md @@ -0,0 +1,20 @@ +# vast-tenant-storage-health test infrastructure + +Static validation and mock VMS scenario tests run without a live VAST cluster. + +## Tasks + +```bash +cd .test +task +``` + +## Scenarios + +| Scenario | Description | Expected issues | +|----------|-------------|-----------------| +| `healthy_tenant` | Tenant under quota with normal IO and latency | 0 | +| `full_view` | View at 98% logical capacity | 1+ | +| `qos_throttled` | Sustained QoS wait times and IOPS near limits | 1+ | + +Mock responses live under `mock-vms/responses/<scenario>/` (the `healthy_tenant` scenario is served from the `healthy/` directory). 
diff --git a/codebundles/vast-tenant-storage-health/.test/Taskfile.yaml b/codebundles/vast-tenant-storage-health/.test/Taskfile.yaml new file mode 100644 index 000000000..4b0380b4f --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/Taskfile.yaml @@ -0,0 +1,24 @@ +version: "3" + +tasks: + default: + desc: "Validate CodeBundle structure and run mock scenario tests" + cmds: + - task: validate-structure + - task: test-scenarios + + validate-structure: + desc: "Run static checks for required files" + cmds: + - ./validate-vast-bundle-structure.sh + + test-scenarios: + desc: "Run task scripts against the local mock VMS server" + cmds: + - ./run-mock-scenario-tests.sh + + clean: + desc: "Remove local test outputs and stop mock server" + cmds: + - rm -rf output workspaceInfo.yaml + - pkill -f "mock-vms-server.py" || true diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/mock-vms-server.py b/codebundles/vast-tenant-storage-health/.test/mock-vms/mock-vms-server.py new file mode 100755 index 000000000..52aa47302 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/mock-vms-server.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +"""Minimal mock VMS HTTP server for vast-tenant-storage-health scenario tests.""" + +from __future__ import annotations + +import json +import re +import threading +from http.server import BaseHTTPRequestHandler, HTTPServer +from pathlib import Path + +ROOT = Path(__file__).resolve().parent +RESP = ROOT / "responses" + +SCENARIOS = { + "healthy_tenant": "healthy", + "full_view": "full_view", + "qos_throttled": "qos_throttled", +} + + +class Handler(BaseHTTPRequestHandler): + scenario = "healthy" + + def _auth_ok(self) -> bool: + auth = self.headers.get("Authorization", "") + if auth.startswith("Bearer "): + return True + if self.headers.get("Authorization") or self.headers.get("authorization"): + return True + # Basic auth via urllib is not always exposed; accept any request in tests. 
+ return True + + def _read(self, name: str) -> bytes: + path = RESP / self.scenario / name + if not path.exists(): + path = RESP / "healthy" / name + return path.read_bytes() + + def do_GET(self) -> None: # noqa: N802 + if not self._auth_ok(): + self.send_response(401) + self.end_headers() + return + + if self.path.startswith("/api/prometheusmetrics/"): + metric = self.path.rstrip("/").split("/")[-1] + body = self._read(f"prometheus-{metric}.txt") + self.send_response(200) + self.send_header("Content-Type", "text/plain") + self.end_headers() + self.wfile.write(body) + return + + if self.path.startswith("/api/tenants"): + body = self._read("tenants.json") + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(body) + return + + if self.path.startswith("/api/views"): + body = self._read("views.json") + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(body) + return + + if self.path.startswith("/api/quotas"): + body = self._read("quotas.json") + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(body) + return + + self.send_response(404) + self.end_headers() + + def log_message(self, format: str, *args) -> None: # noqa: A003 + return + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--port", type=int, default=18080) + parser.add_argument("--scenario", default="healthy") + args = parser.parse_args() + + Handler.scenario = SCENARIOS.get(args.scenario, args.scenario) + server = HTTPServer(("127.0.0.1", args.port), Handler) + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + print(f"mock-vms-server listening on http://127.0.0.1:{args.port} scenario={Handler.scenario}") + thread.join() + + +if __name__ == "__main__": + main() diff --git 
a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-quotas.txt b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-quotas.txt new file mode 100644 index 000000000..adf2e6dc8 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-quotas.txt @@ -0,0 +1,3 @@ +# TYPE vast_quota_used_capacity gauge +vast_quota_used_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 98000000000 +vast_quota_hard_limit{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 100000000000 diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-tenants.txt b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-tenants.txt new file mode 100644 index 000000000..eaf75e4a0 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-tenants.txt @@ -0,0 +1 @@ +vast_tenant_metrics_TenantMetrics_read_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 2.0 diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-views.txt b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-views.txt new file mode 100644 index 000000000..72f795491 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/prometheus-views.txt @@ -0,0 +1,6 @@ +# TYPE vast_view_logical_capacity gauge +vast_view_logical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 98000000000 +vast_view_physical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 45000000000 +# TYPE vast_quota_used_capacity gauge +vast_quota_used_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 98000000000 
+vast_quota_hard_limit{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/full-view"} 100000000000 diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/tenants.json b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/tenants.json new file mode 100644 index 000000000..a89a1c8da --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/tenants.json @@ -0,0 +1,13 @@ +[ + { + "name": "demo-tenant", + "cluster_name": "prod-cluster", + "enabled": true, + "qos": { + "read_iops": 5000, + "write_iops": 5000, + "read_bw": 1000000000, + "write_bw": 1000000000 + } + } +] diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/views.json b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/views.json new file mode 100644 index 000000000..330e8fe31 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/full_view/views.json @@ -0,0 +1 @@ +{"results": []} diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-quotas.txt b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-quotas.txt new file mode 100644 index 000000000..4362f597a --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-quotas.txt @@ -0,0 +1,7 @@ +# TYPE vast_quota_used_capacity gauge +vast_quota_used_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 70000000000 +# TYPE vast_quota_hard_limit gauge +vast_quota_hard_limit{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 100000000000 +# TYPE vast_tenant_metrics_TenantMetrics_read_latency gauge +vast_tenant_metrics_TenantMetrics_read_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 2.5 +vast_tenant_metrics_TenantMetrics_write_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 3.1 diff --git 
a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-tenants.txt b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-tenants.txt new file mode 100644 index 000000000..0d1c90051 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-tenants.txt @@ -0,0 +1,5 @@ +# TYPE vast_tenant_metrics_TenantMetrics_read_latency gauge +vast_tenant_metrics_TenantMetrics_read_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 2.5 +vast_tenant_metrics_TenantMetrics_write_latency_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 3.1 +vast_tenant_metrics_TenantMetrics_read_iops_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 1200 +vast_tenant_metrics_TenantMetrics_write_iops_count{cluster="prod-cluster",tenant_name="demo-tenant"} 800 diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-views.txt b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-views.txt new file mode 100644 index 000000000..3a92324b5 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/prometheus-views.txt @@ -0,0 +1,4 @@ +# TYPE vast_view_logical_capacity gauge +vast_view_logical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 50000000000 +vast_view_physical_capacity{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 20000000000 +vast_view_metrics_ViewMetrics_qos_wait_for_budget_time_sum{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 0 diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/quotas.json b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/quotas.json new file mode 100644 index 000000000..330e8fe31 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/quotas.json @@ -0,0 +1 @@ +{"results": []} diff 
--git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/tenants.json b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/tenants.json new file mode 100644 index 000000000..a89a1c8da --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/tenants.json @@ -0,0 +1,13 @@ +[ + { + "name": "demo-tenant", + "cluster_name": "prod-cluster", + "enabled": true, + "qos": { + "read_iops": 5000, + "write_iops": 5000, + "read_bw": 1000000000, + "write_bw": 1000000000 + } + } +] diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/views.json b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/views.json new file mode 100644 index 000000000..330e8fe31 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/healthy/views.json @@ -0,0 +1 @@ +{"results": []} diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/prometheus-tenants.txt b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/prometheus-tenants.txt new file mode 100644 index 000000000..e9d839cee --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/prometheus-tenants.txt @@ -0,0 +1,2 @@ +vast_tenant_metrics_TenantMetrics_read_iops_avg{cluster="prod-cluster",tenant_name="demo-tenant"} 4950 +vast_tenant_metrics_TenantMetrics_write_iops_count{cluster="prod-cluster",tenant_name="demo-tenant"} 4900 diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/prometheus-views.txt b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/prometheus-views.txt new file mode 100644 index 000000000..2d6e452f1 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/prometheus-views.txt @@ -0,0 +1,2 @@ 
+vast_view_metrics_ViewMetrics_qos_wait_for_budget_time_sum{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 1500 +vast_user_view_read_md_iops{cluster="prod-cluster",tenant_name="demo-tenant",path="/demo/view1"} 950 diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/quotas.json b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/quotas.json new file mode 100644 index 000000000..330e8fe31 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/quotas.json @@ -0,0 +1 @@ +{"results": []} diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/tenants.json b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/tenants.json new file mode 100644 index 000000000..a89a1c8da --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/tenants.json @@ -0,0 +1,13 @@ +[ + { + "name": "demo-tenant", + "cluster_name": "prod-cluster", + "enabled": true, + "qos": { + "read_iops": 5000, + "write_iops": 5000, + "read_bw": 1000000000, + "write_bw": 1000000000 + } + } +] diff --git a/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/views.json b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/views.json new file mode 100644 index 000000000..330e8fe31 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/mock-vms/responses/qos_throttled/views.json @@ -0,0 +1 @@ +{"results": []} diff --git a/codebundles/vast-tenant-storage-health/.test/run-mock-scenario-tests.sh b/codebundles/vast-tenant-storage-health/.test/run-mock-scenario-tests.sh new file mode 100755 index 000000000..950bf20f2 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/.test/run-mock-scenario-tests.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# Exercise task scripts against the local mock VMS for design-spec scenarios. 
+set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +MOCK_DIR="$ROOT/.test/mock-vms" +PORT=18080 + +run_scenario() { + local scenario="$1" + local expect_min="${2:-0}" + local expect_max="${3:-999}" + + pkill -f "mock-vms-server.py --port ${PORT}" 2>/dev/null || true + sleep 0.2 + python3 "$MOCK_DIR/mock-vms-server.py" --port "$PORT" --scenario "$scenario" & + local pid=$! + sleep 0.5 + + export VAST_VMS_ENDPOINT="http://127.0.0.1:${PORT}" + export VAST_CLUSTER_NAME="prod-cluster" + export VAST_TENANT_NAME="demo-tenant" + export CAPACITY_THRESHOLD="85" + export QOS_UTILIZATION_THRESHOLD="90" + export LATENCY_THRESHOLD_MS="10" + export vast_vms_credentials='{"USERNAME":"test","PASSWORD":"test"}' + + cd "$ROOT" + rm -f *_issues.json + + case "$scenario" in + healthy_tenant) + ./check-tenant-capacity.sh >/dev/null + ./check-view-capacity.sh >/dev/null + ./analyze-tenant-qos.sh >/dev/null + ./check-qos-wait-times.sh >/dev/null + ;; + full_view) + ./check-view-capacity.sh >/dev/null + ;; + qos_throttled) + ./analyze-tenant-qos.sh >/dev/null + ./check-qos-wait-times.sh >/dev/null + ;; + esac + + local total=0 + for f in *_issues.json; do + [[ -f "$f" ]] || continue + local c + c="$(jq 'length' "$f")" + total=$((total + c)) + done + + kill "$pid" 2>/dev/null || true + wait "$pid" 2>/dev/null || true + + if (( total < expect_min || total > expect_max )); then + echo "Scenario ${scenario} expected ${expect_min}-${expect_max} issues, got ${total}" >&2 + exit 1 + fi + echo "Scenario ${scenario} OK (${total} issues)" +} + +run_scenario healthy_tenant 0 0 +run_scenario full_view 1 3 +run_scenario qos_throttled 1 4 +echo "All mock scenario tests passed" diff --git a/codebundles/vast-tenant-storage-health/.test/validate-vast-bundle-structure.sh b/codebundles/vast-tenant-storage-health/.test/validate-vast-bundle-structure.sh new file mode 100755 index 000000000..fd8707118 --- /dev/null +++ 
b/codebundles/vast-tenant-storage-health/.test/validate-vast-bundle-structure.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Static validation for vast-tenant-storage-health. +set -euo pipefail +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +test -f "$ROOT/runbook.robot" +test -f "$ROOT/sli.robot" +test -f "$ROOT/README.md" +test -f "$ROOT/vast-vms-helpers.sh" +test -f "$ROOT/.runwhen/generation-rules/vast-tenant-storage-health.yaml" +test -f "$ROOT/.runwhen/templates/vast-tenant-storage-health-slx.yaml" +test -f "$ROOT/.runwhen/templates/vast-tenant-storage-health-taskset.yaml" +test -f "$ROOT/.runwhen/templates/vast-tenant-storage-health-sli.yaml" + +for f in \ + discover-vast-tenants.sh \ + check-tenant-capacity.sh \ + check-view-capacity.sh \ + analyze-tenant-qos.sh \ + check-qos-wait-times.sh \ + check-tenant-config.sh \ + analyze-tenant-latency.sh \ + check-block-volume-health.sh \ + sli-vast-capacity-score.sh \ + sli-vast-qos-score.sh \ + sli-vast-latency-score.sh +do + test -f "$ROOT/$f" +done + +echo "vast-tenant-storage-health bundle structure OK" diff --git a/codebundles/vast-tenant-storage-health/README.md b/codebundles/vast-tenant-storage-health/README.md new file mode 100644 index 000000000..2118679e5 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/README.md @@ -0,0 +1,102 @@ +# VAST Data Tenant Storage Health + +Monitor per-tenant and per-view storage health on VAST Data: volume capacity, quota utilization, QoS throttling, IO performance, and configuration policies that may limit users. Applies to all client types (Kubernetes PVCs backed by views, NFS clients, block volumes, S3 buckets). + +## Overview + +- **Tenant capacity**: Compares logical capacity, DRR, and quota utilization from `/api/prometheusmetrics/tenants`, `/api/prometheusmetrics/quotas`, and `/api/tenants/`. +- **View capacity**: Detects NFS exports and block views approaching capacity limits via `/api/prometheusmetrics/views`. 
+- **QoS saturation**: Evaluates tenant read/write IOPS and bandwidth against configured QoS ceilings. +- **QoS wait times**: Inspects `qos_wait_for_budget_time` and metadata IOPS limits for sustained throttling. +- **Tenant configuration**: Reviews export permissions, disabled tenants, and quota policies from VMS REST. +- **Latency anomalies**: Detects elevated read/write latency from tenant and view metrics. +- **Block volume health**: Monitors block volumes via `/api/prometheusmetrics/volumes` (VAST 5.4.3+). + +Tenant and view names often appear in Kubernetes StorageClass parameters or PVC annotations when tracing from Kubernetes workloads. This bundle remains platform-native and queries VMS directly. + +## Configuration + +### Required Variables + +- `VAST_VMS_ENDPOINT`: VMS REST API base URL (for example `https://vms.example.com`). +- `VAST_CLUSTER_NAME`: VAST cluster name used as an SLX qualifier and metric filter. +- `VAST_TENANT_NAME`: VAST tenant name used as the SLX qualifier and `X-Tenant-Name` scope for metrics. + +### Optional Variables + +- `TENANTS`: Tenant name or `All` for auto-discovery during generation (default: `All`). +- `CAPACITY_THRESHOLD`: Tenant/view capacity utilization percent threshold (default: `85`). +- `QOS_UTILIZATION_THRESHOLD`: Percent of QoS limit sustained that triggers a throttling issue (default: `90`). +- `LATENCY_THRESHOLD_MS`: Read/write latency in milliseconds above which to raise an issue (default: `10`). + +### Secrets + +- `vast_vms_credentials`: VMS API authentication credentials in JSON format: + +```json +{ + "USERNAME": "vms-readonly-user", + "PASSWORD": "secret" +} +``` + +Alternative token-based auth: + +```json +{ + "API_TOKEN": "your-jwt-or-api-token" +} +``` + +A VMS manager user with the built-in read-only role is sufficient for Prometheus exporter endpoints. + +## Tasks Overview + +### Check Tenant Capacity Utilization + +Compares tenant logical capacity and quota utilization against `CAPACITY_THRESHOLD`. 
Raises severity 2–3 issues when utilization exceeds the threshold. + +### Check View Volume Capacity + +Identifies views with high logical or quota utilization. Detects views at or near capacity that may block writes. + +### Analyze Tenant IOPS and Bandwidth Against QoS Limits + +Evaluates tenant read/write IOPS and bandwidth metrics versus QoS policy limits configured in VMS. + +### Check QoS Wait Times and Throttling + +Inspects view-level QoS wait time metrics and metadata IOPS limits to detect sustained throttling. + +### Check User and Permission Configuration + +Reviews tenant state, export policies, and quota flags that may restrict client access or capacity. + +### Analyze Read Write Latency Anomalies + +Detects elevated tenant and view read/write latency above `LATENCY_THRESHOLD_MS`. + +### Check Block Volume Health + +Monitors block volume IOPS and latency from `/api/prometheusmetrics/volumes`. Requires VAST Cluster 5.4.3+ with live monitoring enabled on volumes. + +## SLI + +The bundled SLI averages three binary dimensions into a 0–1 health score: + +1. Capacity utilization below `CAPACITY_THRESHOLD` +2. No elevated QoS wait times +3. Read/write latency below `LATENCY_THRESHOLD_MS` + +## Related CodeBundles + +- `vast-cluster-health`: Cluster-level hardware and VMS state (complements this tenant/view bundle). +- `vast-k8s-csi-health`: Kubernetes CSI and PVC tracing. +- `k8s-pvc-healthcheck`: In-cluster PVC mount utilization. +- `gcp-bucket-health`: Similar capacity/access pattern for object storage on GCP. 
+ +## API References + +- VAST Prometheus exporter: `/api/prometheusmetrics/tenants`, `/views`, `/quotas`, `/volumes` +- VMS REST: `/api/tenants/`, `/api/views/`, `/api/quotas/` +- Block volume metrics reference: https://kb.vastdata.com/documentation/docs/block-volume-metrics-reference diff --git a/codebundles/vast-tenant-storage-health/analyze-tenant-latency.sh b/codebundles/vast-tenant-storage-health/analyze-tenant-latency.sh new file mode 100755 index 000000000..11817b721 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/analyze-tenant-latency.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# Detect elevated read/write/metadata latency from tenant and view metrics. +set -euo pipefail +set -x + +: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}" +: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}" +: "${VAST_TENANT_NAME:?Must set VAST_TENANT_NAME}" + +LATENCY_THRESHOLD_MS="${LATENCY_THRESHOLD_MS:-10}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=vast-vms-helpers.sh +source "${SCRIPT_DIR}/vast-vms-helpers.sh" + +vast_require_cmd curl +vast_require_cmd jq +vast_require_cmd python3 + +OUTPUT_FILE="tenant_latency_issues.json" +issues_json='[]' + +if ! vast_auth_configured; then + issues_json="$(vast_add_api_error_issue "$issues_json" \ + "Cannot authenticate to VMS for tenant \`${VAST_TENANT_NAME}\`" \ + "vast_vms_credentials secret missing USERNAME/PASSWORD or API_TOKEN." 
\ + 4)" + echo "$issues_json" >"$OUTPUT_FILE" + exit 0 +fi + +tenant_resp="$(vast_fetch_prometheus_metrics "tenants")" +tenant_code="$(vast_fetch_http_code "$tenant_resp")" +tenant_body="$(vast_fetch_body "$tenant_resp")" + +views_resp="$(vast_fetch_prometheus_metrics "views")" +views_code="$(vast_fetch_http_code "$views_resp")" +views_body="$(vast_fetch_body "$views_resp")" + +if [[ "$tenant_code" != "200" && "$views_code" != "200" ]]; then + issues_json="$(vast_add_api_error_issue "$issues_json" \ + "Latency metrics unavailable for tenant \`${VAST_TENANT_NAME}\`" \ + "Tenant metrics HTTP ${tenant_code}, view metrics HTTP ${views_code}." \ + 4)" + echo "$issues_json" >"$OUTPUT_FILE" + exit 0 +fi + +printf '%s' "$tenant_body" >/tmp/vast_tenant_latency.prom +printf '%s' "$views_body" >/tmp/vast_views_latency.prom + +while IFS=$'\t' read -r dimension latency_ms severity details; do + [[ -z "$dimension" ]] && continue + echo "Latency ${dimension}: ${latency_ms} ms (threshold ${LATENCY_THRESHOLD_MS} ms)" + if [[ -n "$severity" ]]; then + issues_json="$(echo "$issues_json" | jq \ + --arg title "Elevated ${dimension} latency for tenant \`${VAST_TENANT_NAME}\`" \ + --arg details "$details" \ + --argjson severity "$severity" \ + --arg next_steps "Investigate cluster load, QoS throttling, network path, and client IO patterns causing elevated ${dimension} latency." \ + '. 
+= [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]')" + fi +done < <(LATENCY_THRESHOLD_MS="$LATENCY_THRESHOLD_MS" VAST_TENANT_NAME="$VAST_TENANT_NAME" VAST_CLUSTER_NAME="$VAST_CLUSTER_NAME" python3 <<'PY' +import os + +threshold = float(os.environ["LATENCY_THRESHOLD_MS"]) +tenant = os.environ["VAST_TENANT_NAME"] +cluster = os.environ["VAST_CLUSTER_NAME"] +tenant_metrics = open("/tmp/vast_tenant_latency.prom").read() +view_metrics = open("/tmp/vast_views_latency.prom").read() + +latency_prefixes = [ + ("tenant_read_latency", "vast_tenant_metrics_TenantMetrics_read_latency", tenant_metrics), + ("tenant_write_latency", "vast_tenant_metrics_TenantMetrics_write_latency", tenant_metrics), + ("view_read_latency", "vast_user_view_read_latency", view_metrics), + ("view_write_latency", "vast_user_view_write_latency", view_metrics), +] + +def max_metric(text, prefix): + best = 0.0 + found = False + for line in text.splitlines(): + if not line or line.startswith("#"): + continue + if not line.startswith(prefix): + continue + if tenant and tenant not in line: + continue + if cluster and f'cluster="{cluster}"' not in line: + continue + try: + best = max(best, float(line.rsplit(" ", 1)[-1])) + found = True + except ValueError: + pass + return best if found else None + +for name, prefix, text in latency_prefixes: + if not text.strip(): + continue + val = max_metric(text, prefix) + if val is None: + continue + # VAST latency metrics are typically in microseconds; convert to ms when values are large. 
+ latency_ms = val / 1000.0 if val > 1000 else val + sev = "" + if latency_ms >= threshold: + sev = "4" if latency_ms >= threshold * 2 else "3" + details = f"{name} measured {latency_ms:.2f} ms (threshold {threshold} ms) for tenant `{tenant}` on cluster `{cluster}`" + print(f"{name}\t{latency_ms:.2f}\t{sev}\t{details}") +PY +) + +rm -f /tmp/vast_tenant_latency.prom /tmp/vast_views_latency.prom + +echo "$issues_json" >"$OUTPUT_FILE" +echo "Analysis completed. Results saved to $OUTPUT_FILE" +cat "$OUTPUT_FILE" diff --git a/codebundles/vast-tenant-storage-health/analyze-tenant-qos.sh b/codebundles/vast-tenant-storage-health/analyze-tenant-qos.sh new file mode 100755 index 000000000..de44fa0de --- /dev/null +++ b/codebundles/vast-tenant-storage-health/analyze-tenant-qos.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env bash +# Evaluate tenant read/write IOPS and bandwidth against configured QoS ceilings. +set -euo pipefail +set -x + +: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}" +: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}" +: "${VAST_TENANT_NAME:?Must set VAST_TENANT_NAME}" + +QOS_UTILIZATION_THRESHOLD="${QOS_UTILIZATION_THRESHOLD:-90}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=vast-vms-helpers.sh +source "${SCRIPT_DIR}/vast-vms-helpers.sh" + +vast_require_cmd curl +vast_require_cmd jq +vast_require_cmd python3 + +OUTPUT_FILE="tenant_qos_issues.json" +issues_json='[]' + +if ! vast_auth_configured; then + issues_json="$(vast_add_api_error_issue "$issues_json" \ + "Cannot authenticate to VMS for tenant \`${VAST_TENANT_NAME}\`" \ + "vast_vms_credentials secret missing USERNAME/PASSWORD or API_TOKEN." 
\
+    4)"
+  echo "$issues_json" >"$OUTPUT_FILE"
+  exit 0
+fi
+
+metrics_resp="$(vast_fetch_prometheus_metrics "tenants")"
+metrics_code="$(vast_fetch_http_code "$metrics_resp")"
+metrics_body="$(vast_fetch_body "$metrics_resp")"
+
+if [[ "$metrics_code" != "200" ]]; then
+  issues_json="$(vast_add_api_error_issue "$issues_json" \
+    "Tenant QoS metrics API error for \`${VAST_TENANT_NAME}\`" \
+    "HTTP ${metrics_code} from /api/prometheusmetrics/tenants." \
+    4)"
+  echo "$issues_json" >"$OUTPUT_FILE"
+  exit 0
+fi
+
+tenants_resp="$(vast_fetch_json_api "/tenants/")"
+tenants_code="$(vast_fetch_http_code "$tenants_resp")"
+tenants_body="$(vast_fetch_body "$tenants_resp")"
+tenant_json="{}"
+if [[ "$tenants_code" == "200" ]]; then
+  tenant_json="$(printf '%s' "$tenants_body" | vast_find_tenant_json "$tenants_body")"
+fi
+
+# Private scratch directory: fixed /tmp names would collide when several
+# tenants are analyzed at once, and the trap cleans up on early exit too.
+WORK_DIR="$(mktemp -d)"
+trap 'rm -rf "$WORK_DIR"' EXIT
+
+printf '%s' "$metrics_body" >"$WORK_DIR/tenants.prom"
+printf '%s' "$tenant_json" >"$WORK_DIR/tenant.json"
+
+# Rows are TAB-separated: dimension, current, limit, util_pct, severity, details.
+# Bash treats TAB as IFS whitespace and collapses empty fields, so the analyzer
+# fills every column ("n/a" / "0") and only a severity of 1-4 reaches jq.
+while IFS=$'\t' read -r dimension current limit util_pct severity details; do
+  [[ -z "$dimension" ]] && continue
+  echo "QoS ${dimension}: current=${current} limit=${limit} util=${util_pct}%"
+  if [[ "$severity" =~ ^[1-4]$ ]]; then
+    issues_json="$(echo "$issues_json" | jq \
+      --arg title "Tenant QoS saturation on ${dimension} for \`${VAST_TENANT_NAME}\`" \
+      --arg details "${details}" \
+      --argjson severity "$severity" \
+      --arg next_steps "Review tenant QoS policy limits in VMS, burst workloads, and redistribute IO across views or clients." \
+      '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]')"
+  fi
+done < <(WORK_DIR="$WORK_DIR" QOS_UTILIZATION_THRESHOLD="$QOS_UTILIZATION_THRESHOLD" VAST_TENANT_NAME="$VAST_TENANT_NAME" VAST_CLUSTER_NAME="$VAST_CLUSTER_NAME" python3 <<'PY'
+import json
+import os
+from pathlib import Path
+
+threshold = float(os.environ["QOS_UTILIZATION_THRESHOLD"])
+tenant = os.environ["VAST_TENANT_NAME"]
+cluster = os.environ["VAST_CLUSTER_NAME"]
+work_dir = Path(os.environ["WORK_DIR"])
+metrics = (work_dir / "tenants.prom").read_text()
+
+# An empty, malformed, or non-object tenant record simply means "no limits known".
+try:
+    tenant_cfg = json.loads((work_dir / "tenant.json").read_text() or "{}")
+except json.JSONDecodeError:
+    tenant_cfg = {}
+if not isinstance(tenant_cfg, dict):
+    tenant_cfg = {}
+
+# (dimension, metric name prefix, candidate dotted config paths tried in order)
+checks = [
+    ("read_iops", "vast_tenant_metrics_TenantMetrics_read_iops", ["qos.read_iops", "qos.max_read_iops", "read_iops_limit"]),
+    ("write_iops", "vast_tenant_metrics_TenantMetrics_write_iops", ["qos.write_iops", "qos.max_write_iops", "write_iops_limit"]),
+    ("read_bw", "vast_tenant_metrics_TenantMetrics_read_bw", ["qos.read_bw", "qos.max_read_bw", "read_bw_limit"]),
+    ("write_bw", "vast_tenant_metrics_TenantMetrics_write_bw", ["qos.write_bw", "qos.max_write_bw", "write_bw_limit"]),
+]
+
+
+def metric_max(prefix):
+    """Return the largest matching sample for prefix, or 0.0 when nothing matches."""
+    best = 0.0
+    for line in metrics.splitlines():
+        if not line or line.startswith("#"):
+            continue
+        # NOTE(review): a prefix match also accepts the _avg/_sum/_count variants,
+        # exactly as the explicit suffix list did before -- confirm that _sum and
+        # _count samples are meant to take part in the maximum.
+        if not line.startswith(prefix):
+            continue
+        if tenant and tenant not in line:
+            continue
+        if cluster and f'cluster="{cluster}"' not in line:
+            continue
+        try:
+            best = max(best, float(line.rsplit(" ", 1)[-1]))
+        except ValueError:
+            pass
+    return best
+
+
+def cfg_limit(paths):
+    """Return the first positive numeric limit among the candidate dotted paths, else None.
+
+    Each entry of paths is an alternative location ("qos.read_iops" means
+    tenant_cfg["qos"]["read_iops"]); the entries are not one nested chain.
+    """
+    for dotted in paths:
+        node = tenant_cfg
+        for key in dotted.split("."):
+            if not isinstance(node, dict) or key not in node:
+                node = None
+                break
+            node = node[key]
+        if node is None or isinstance(node, bool):
+            continue
+        try:
+            value = float(node)
+        except (TypeError, ValueError):
+            continue
+        if value > 0:
+            return value
+    return None
+
+
+for name, metric_prefix, cfg_paths in checks:
+    current = metric_max(metric_prefix)
+    limit = cfg_limit(cfg_paths)
+    if limit is None:
+        print(f"{name}\t{current}\tn/a\tn/a\t0\tno QoS limit configured")
+        continue
+    util = (current / limit) * 100.0
+    # "0" means healthy; 3 at the threshold, 2 at >= 98% of the limit
+    # (lower number = more severe, matching the capacity check's 3 -> 2 escalation).
+    sev = "0"
+    if util >= threshold:
+        sev = "2" if util >= 98 else "3"
+    details = f"{name} at {util:.1f}% of QoS limit ({current:.2f}/{limit:.2f}) for tenant `{tenant}` on cluster `{cluster}`"
+    print(f"{name}\t{current}\t{limit}\t{util:.2f}\t{sev}\t{details}")
+PY
+)
+
+echo "$issues_json" >"$OUTPUT_FILE"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
+cat "$OUTPUT_FILE"
diff --git a/codebundles/vast-tenant-storage-health/check-block-volume-health.sh b/codebundles/vast-tenant-storage-health/check-block-volume-health.sh
new file mode 100755
index 000000000..08a6d7bf0
--- /dev/null
+++ b/codebundles/vast-tenant-storage-health/check-block-volume-health.sh
@@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+# Monitor block volume IOPS, bandwidth, and latency via /api/prometheusmetrics/volumes.
+set -euo pipefail
+set -x
+
+: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}"
+: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}"
+: "${VAST_TENANT_NAME:?Must set VAST_TENANT_NAME}"
+
+LATENCY_THRESHOLD_MS="${LATENCY_THRESHOLD_MS:-10}"
+QOS_UTILIZATION_THRESHOLD="${QOS_UTILIZATION_THRESHOLD:-90}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=vast-vms-helpers.sh
+source "${SCRIPT_DIR}/vast-vms-helpers.sh"
+
+vast_require_cmd curl
+vast_require_cmd jq
+vast_require_cmd python3
+
+OUTPUT_FILE="block_volume_issues.json"
+issues_json='[]'
+
+if ! vast_auth_configured; then
+  issues_json="$(vast_add_api_error_issue "$issues_json" \
+    "Cannot authenticate to VMS for tenant \`${VAST_TENANT_NAME}\`" \
+    "vast_vms_credentials secret missing USERNAME/PASSWORD or API_TOKEN." 
\
+    4)"
+  echo "$issues_json" >"$OUTPUT_FILE"
+  exit 0
+fi
+
+volumes_resp="$(vast_fetch_prometheus_metrics "volumes")"
+volumes_code="$(vast_fetch_http_code "$volumes_resp")"
+volumes_body="$(vast_fetch_body "$volumes_resp")"
+
+if [[ "$volumes_code" == "404" ]]; then
+  issues_json="$(echo "$issues_json" | jq \
+    --arg title "Block volume metrics endpoint unavailable for tenant \`${VAST_TENANT_NAME}\`" \
+    --arg details "HTTP 404 from /api/prometheusmetrics/volumes. Block volume metrics require VAST Cluster 5.4.3+ and at least one IO on monitored volumes." \
+    --argjson severity 4 \
+    --arg next_steps "Upgrade cluster to 5.4.3+, enable live monitoring on block volumes, and ensure volumes have recent IO." \
+    '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]')"
+  echo "$issues_json" >"$OUTPUT_FILE"
+  exit 0
+fi
+
+if [[ "$volumes_code" != "200" ]]; then
+  issues_json="$(vast_add_api_error_issue "$issues_json" \
+    "Block volume metrics API error for tenant \`${VAST_TENANT_NAME}\`" \
+    "HTTP ${volumes_code} from /api/prometheusmetrics/volumes." \
+    4)"
+  echo "$issues_json" >"$OUTPUT_FILE"
+  exit 0
+fi
+
+# Private scratch directory: a fixed /tmp name would collide when several
+# tenants are analyzed at once, and the trap cleans up on early exit too.
+WORK_DIR="$(mktemp -d)"
+trap 'rm -rf "$WORK_DIR"' EXIT
+
+printf '%s' "$volumes_body" >"$WORK_DIR/volumes.prom"
+
+# Rows are TAB-separated: volume, title, details, severity (all always non-empty).
+while IFS=$'\t' read -r volume title details severity; do
+  [[ -z "$volume" ]] && continue
+  echo "Block volume ${volume}: ${details}"
+  # Never hand a non-numeric severity to --argjson; that would abort the run.
+  [[ "$severity" =~ ^[1-4]$ ]] || continue
+  issues_json="$(echo "$issues_json" | jq \
+    --arg title "$title" \
+    --arg details "$details" \
+    --argjson severity "$severity" \
+    --arg next_steps "Inspect block volume mapping, host multipath, QoS policy, and recent IO errors for volume ${volume}. Toggle live monitoring if metrics are stale." \
+    '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]')"
+done < <(WORK_DIR="$WORK_DIR" LATENCY_THRESHOLD_MS="$LATENCY_THRESHOLD_MS" QOS_UTILIZATION_THRESHOLD="$QOS_UTILIZATION_THRESHOLD" VAST_TENANT_NAME="$VAST_TENANT_NAME" VAST_CLUSTER_NAME="$VAST_CLUSTER_NAME" python3 <<'PY'
+import os
+import re
+from pathlib import Path
+
+latency_threshold = float(os.environ["LATENCY_THRESHOLD_MS"])
+tenant = os.environ["VAST_TENANT_NAME"]
+cluster = os.environ["VAST_CLUSTER_NAME"]
+metrics = Path(os.environ["WORK_DIR"], "volumes.prom").read_text()
+
+# key="value" pairs; tolerates commas and escaped quotes inside a value.
+LABEL_RE = re.compile(r'(\w+)="((?:[^"\\]|\\.)*)"')
+# Labels that may carry the volume name, most specific first.
+VOLUME_LABELS = ("volume", "volume_name", "name")
+# Metric-name fragments, tested in this order; the first hit selects the field.
+FIELDS = ("read_iops", "write_iops", "read_latency", "write_latency", "read_bw", "write_bw")
+
+volumes = {}
+
+for line in metrics.splitlines():
+    if not line or line.startswith("#") or "{" not in line:
+        continue
+    # NOTE(review): the tenant filter is a plain substring match on the whole
+    # line, so tenant "prod" also matches "prod-eu" -- confirm label names.
+    if tenant and tenant not in line:
+        continue
+    if cluster and f'cluster="{cluster}"' not in line:
+        continue
+    name_part, rest = line.split("{", 1)
+    if "}" not in rest:
+        continue  # malformed sample line; skip instead of crashing on unpack
+    labels_part, value_part = rest.rsplit("}", 1)
+    labels = dict(LABEL_RE.findall(labels_part))
+    vol = next((labels[key] for key in VOLUME_LABELS if labels.get(key)), None)
+    if not vol:
+        continue
+    # The exposition format allows an optional timestamp after the value.
+    value_tokens = value_part.split()
+    if not value_tokens:
+        continue
+    try:
+        val = float(value_tokens[0])
+    except ValueError:
+        continue
+    entry = volumes.setdefault(vol, dict.fromkeys(FIELDS, 0.0))
+    metric = name_part.strip()
+    field = next((name for name in FIELDS if name in metric), None)
+    if field is not None:
+        entry[field] = max(entry[field], val)
+
+if not volumes:
+    print("none\tNo block volume metrics for tenant\tNo volume series matched tenant/cluster filters. Enable live monitoring and send IO to volumes.\t4")
+    raise SystemExit
+
+for vol, data in sorted(volumes.items()):
+    total_iops = data["read_iops"] + data["write_iops"]
+    max_latency = max(data["read_latency"], data["write_latency"])
+    # Values above 1000 are treated as microseconds and converted to milliseconds.
+    latency_ms = max_latency / 1000.0 if max_latency > 1000 else max_latency
+    if total_iops == 0:
+        print(f"{vol}\tBlock volume `{vol}` shows zero IO\tVolume has no read/write IOPS in exported metrics; verify host connectivity and monitoring.\t4")
+    if latency_ms >= latency_threshold:
+        # 3 at the threshold, 2 at twice the threshold (lower number = more
+        # severe, matching the capacity check's 3 -> 2 escalation).
+        sev = 2 if latency_ms >= latency_threshold * 2 else 3
+        print(f"{vol}\tElevated block volume latency on `{vol}`\tread_latency={data['read_latency']:.2f} write_latency={data['write_latency']:.2f} (~{latency_ms:.2f} ms)\t{sev}")
+PY
+)
+
+echo "$issues_json" >"$OUTPUT_FILE"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
+cat "$OUTPUT_FILE"
diff --git a/codebundles/vast-tenant-storage-health/check-qos-wait-times.sh b/codebundles/vast-tenant-storage-health/check-qos-wait-times.sh
new file mode 100755
index 000000000..d0b36634f
--- /dev/null
+++ b/codebundles/vast-tenant-storage-health/check-qos-wait-times.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+# Inspect QoS wait time metrics and metadata IOPS limits for tenant throttling.
+set -euo pipefail
+set -x
+
+: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}"
+: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}"
+: "${VAST_TENANT_NAME:?Must set VAST_TENANT_NAME}"
+
+QOS_UTILIZATION_THRESHOLD="${QOS_UTILIZATION_THRESHOLD:-90}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=vast-vms-helpers.sh
+source "${SCRIPT_DIR}/vast-vms-helpers.sh"
+
+vast_require_cmd curl
+vast_require_cmd jq
+vast_require_cmd python3
+
+OUTPUT_FILE="qos_wait_issues.json"
+issues_json='[]'
+
+if ! 
vast_auth_configured; then
+  issues_json="$(vast_add_api_error_issue "$issues_json" \
+    "Cannot authenticate to VMS for tenant \`${VAST_TENANT_NAME}\`" \
+    "vast_vms_credentials secret missing USERNAME/PASSWORD or API_TOKEN." \
+    4)"
+  echo "$issues_json" >"$OUTPUT_FILE"
+  exit 0
+fi
+
+views_resp="$(vast_fetch_prometheus_metrics "views")"
+views_code="$(vast_fetch_http_code "$views_resp")"
+views_body="$(vast_fetch_body "$views_resp")"
+
+if [[ "$views_code" != "200" ]]; then
+  issues_json="$(vast_add_api_error_issue "$issues_json" \
+    "QoS wait metrics API error for tenant \`${VAST_TENANT_NAME}\`" \
+    "HTTP ${views_code} from /api/prometheusmetrics/views." \
+    4)"
+  echo "$issues_json" >"$OUTPUT_FILE"
+  exit 0
+fi
+
+tenants_resp="$(vast_fetch_json_api "/tenants/")"
+tenants_code="$(vast_fetch_http_code "$tenants_resp")"
+tenants_body="$(vast_fetch_body "$tenants_resp")"
+tenant_json="{}"
+if [[ "$tenants_code" == "200" ]]; then
+  tenant_json="$(printf '%s' "$tenants_body" | vast_find_tenant_json "$tenants_body")"
+fi
+
+# Private scratch directory: fixed /tmp names would collide when several
+# tenants are analyzed at once, and the trap cleans up on early exit too.
+WORK_DIR="$(mktemp -d)"
+trap 'rm -rf "$WORK_DIR"' EXIT
+
+printf '%s' "$views_body" >"$WORK_DIR/views.prom"
+printf '%s' "$tenant_json" >"$WORK_DIR/tenant.json"
+
+# The analyzer prints one "summary<TAB>text" row followed by zero or more
+# "ISSUE<TAB>title<TAB>details<TAB>severity" rows.
+analysis="$(WORK_DIR="$WORK_DIR" QOS_UTILIZATION_THRESHOLD="$QOS_UTILIZATION_THRESHOLD" VAST_TENANT_NAME="$VAST_TENANT_NAME" VAST_CLUSTER_NAME="$VAST_CLUSTER_NAME" python3 <<'PY'
+import json
+import os
+from pathlib import Path
+
+threshold = float(os.environ["QOS_UTILIZATION_THRESHOLD"])
+tenant = os.environ["VAST_TENANT_NAME"]
+cluster = os.environ["VAST_CLUSTER_NAME"]
+work_dir = Path(os.environ["WORK_DIR"])
+metrics = (work_dir / "views.prom").read_text()
+
+# An empty, malformed, or non-object tenant record simply means "no limit known".
+try:
+    tenant_cfg = json.loads((work_dir / "tenant.json").read_text() or "{}")
+except json.JSONDecodeError:
+    tenant_cfg = {}
+if not isinstance(tenant_cfg, dict):
+    tenant_cfg = {}
+
+wait_total = 0.0
+wait_samples = 0
+md_iops = 0.0
+for line in metrics.splitlines():
+    if not line or line.startswith("#"):
+        continue
+    # NOTE(review): the tenant filter is a plain substring match on the whole
+    # line, so tenant "prod" also matches "prod-eu" -- confirm label names.
+    if tenant and tenant not in line:
+        continue
+    if cluster and f'cluster="{cluster}"' not in line:
+        continue
+    try:
+        value = float(line.rsplit(" ", 1)[-1])
+    except ValueError:
+        continue
+    if "qos_wait_for_budget_time" in line:
+        wait_total += value
+        wait_samples += 1
+    if "read_md_iops" in line or "write_md_iops" in line:
+        md_iops = max(md_iops, value)
+
+avg_wait = wait_total / wait_samples if wait_samples else 0.0
+
+# Look for the metadata IOPS limit inside the qos object first, then at the
+# top level of the tenant record.
+qos_cfg = tenant_cfg.get("qos")
+limit_sources = ([qos_cfg] if isinstance(qos_cfg, dict) else []) + [tenant_cfg]
+md_limit = None
+for source in limit_sources:
+    for key in ("metadata_iops", "md_iops", "max_metadata_iops"):
+        raw = source.get(key)
+        if raw is None or isinstance(raw, bool):
+            continue
+        try:
+            md_limit = float(raw)
+        except (TypeError, ValueError):
+            continue
+        break
+    if md_limit is not None:
+        break
+
+# Single text field: the shell echoes only the second column of a summary row.
+print(f"summary\tavg_qos_wait={avg_wait:.4f} md_iops={md_iops:.2f} md_limit={md_limit or 'unknown'}")
+
+if wait_samples and avg_wait > 0:
+    print(f"ISSUE\tQoS wait time elevated\tTenant `{tenant}` average qos_wait_for_budget_time={avg_wait:.4f} across {wait_samples} view metric series.\t4")
+
+if md_limit and md_limit > 0:
+    util = (md_iops / md_limit) * 100.0
+    if util >= threshold:
+        # 3 at the threshold, 2 at >= 98% of the limit (lower number = more
+        # severe, matching the capacity check's 3 -> 2 escalation).
+        sev = 2 if util >= 98 else 3
+        print(f"ISSUE\tMetadata IOPS near QoS limit\tMetadata IOPS {md_iops:.2f}/{md_limit:.2f} ({util:.1f}%) for tenant `{tenant}`.\t{sev}")
+PY
+)"
+
+while IFS=$'\t' read -r kind title details severity; do
+  if [[ "$kind" == "summary" ]]; then
+    echo "${title}"
+    continue
+  fi
+  # Never hand a non-numeric severity to --argjson; that would abort the run.
+  if [[ "$kind" == "ISSUE" && "$severity" =~ ^[1-4]$ ]]; then
+    issues_json="$(echo "$issues_json" | jq \
+      --arg title "${title} for \`${VAST_TENANT_NAME}\`" \
+      --arg details "$details" \
+      --argjson severity "$severity" \
+      --arg next_steps "Inspect tenant QoS policy, metadata IOPS limits, and workloads causing sustained budget waits. Adjust QoS or spread metadata-heavy operations." \
+      '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]')"
+  fi
+done <<<"$analysis"
+
+echo "$issues_json" >"$OUTPUT_FILE"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
+cat "$OUTPUT_FILE"
diff --git a/codebundles/vast-tenant-storage-health/check-tenant-capacity.sh b/codebundles/vast-tenant-storage-health/check-tenant-capacity.sh
new file mode 100755
index 000000000..13865cc4d
--- /dev/null
+++ b/codebundles/vast-tenant-storage-health/check-tenant-capacity.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+# Compare tenant logical capacity and quota utilization from VMS metrics and REST.
+set -euo pipefail
+set -x
+
+: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}"
+: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}"
+: "${VAST_TENANT_NAME:?Must set VAST_TENANT_NAME}"
+
+CAPACITY_THRESHOLD="${CAPACITY_THRESHOLD:-85}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=vast-vms-helpers.sh
+source "${SCRIPT_DIR}/vast-vms-helpers.sh"
+
+vast_require_cmd curl
+vast_require_cmd jq
+vast_require_cmd python3
+
+OUTPUT_FILE="tenant_capacity_issues.json"
+issues_json='[]'
+
+if ! vast_auth_configured; then
+  issues_json="$(vast_add_api_error_issue "$issues_json" \
+    "Cannot authenticate to VMS for tenant \`${VAST_TENANT_NAME}\`" \
+    "vast_vms_credentials secret missing USERNAME/PASSWORD or API_TOKEN." \
+    4 \
+    "Configure vast_vms_credentials with USERNAME and PASSWORD or API_TOKEN.")"
+  echo "$issues_json" >"$OUTPUT_FILE"
+  echo "Wrote $OUTPUT_FILE"
+  exit 0
+fi
+
+tenant_metrics_resp="$(vast_fetch_prometheus_metrics "tenants")"
+tenant_metrics_code="$(vast_fetch_http_code "$tenant_metrics_resp")"
+tenant_metrics_body="$(vast_fetch_body "$tenant_metrics_resp")"
+
+if [[ "$tenant_metrics_code" != "200" ]]; then
+  issues_json="$(vast_add_api_error_issue "$issues_json" \
+    "Tenant metrics API error for \`${VAST_TENANT_NAME}\` on cluster \`${VAST_CLUSTER_NAME}\`" \
+    "HTTP ${tenant_metrics_code} from /api/prometheusmetrics/tenants. 
Body (truncated): ${tenant_metrics_body:0:400}" \
+    4)"
+  echo "$issues_json" >"$OUTPUT_FILE"
+  echo "Wrote $OUTPUT_FILE"
+  exit 0
+fi
+
+# Quota metrics and the tenant REST record are optional inputs: their HTTP
+# codes are checked below and a non-200 response just leaves the values empty.
+quota_metrics_resp="$(vast_fetch_prometheus_metrics "quotas")"
+quota_metrics_code="$(vast_fetch_http_code "$quota_metrics_resp")"
+quota_metrics_body="$(vast_fetch_body "$quota_metrics_resp")"
+
+tenants_resp="$(vast_fetch_json_api "/tenants/")"
+tenants_code="$(vast_fetch_http_code "$tenants_resp")"
+tenants_body="$(vast_fetch_body "$tenants_resp")"
+
+# `|| true` keeps `set -e` from aborting when a helper finds no matching series.
+logical_used="$(vast_prom_metric_sum "$tenant_metrics_body" "vast_tenant_metrics_TenantMetrics_logical_capacity" || true)"
+physical_used="$(vast_prom_metric_sum "$tenant_metrics_body" "vast_tenant_metrics_TenantMetrics_physical_capacity" || true)"
+drr="$(vast_prom_metric_values "$tenant_metrics_body" "vast_cluster_drr" || true)"
+
+quota_used=""
+quota_hard=""
+if [[ "$quota_metrics_code" == "200" ]]; then
+  quota_used="$(vast_prom_metric_sum "$quota_metrics_body" "vast_quota_used_capacity" || true)"
+  # NOTE(review): used capacity is summed while the hard limit comes from
+  # vast_prom_metric_values -- confirm that helper yields a single number when
+  # several quota series match.
+  quota_hard="$(vast_prom_metric_values "$quota_metrics_body" "vast_quota_hard_limit" || true)"
+fi
+
+tenant_quota_hard=""
+tenant_quota_soft=""
+if [[ "$tenants_code" == "200" ]]; then
+  tenant_json="$(printf '%s' "$tenants_body" | vast_find_tenant_json "$tenants_body")"
+  # Try the known field layouts in order; `empty` yields "" when none is present.
+  tenant_quota_hard="$(printf '%s' "$tenant_json" | jq -r '.capacity_limits.hard_limit // .quota.hard_limit // .hard_quota // empty' 2>/dev/null || true)"
+  tenant_quota_soft="$(printf '%s' "$tenant_json" | jq -r '.capacity_limits.soft_limit // .quota.soft_limit // .soft_quota // empty' 2>/dev/null || true)"
+fi
+
+# Prefer quota metrics; fall back to logical capacity and the REST hard limit.
+used_bytes="${quota_used:-$logical_used}"
+limit_bytes="${quota_hard:-$tenant_quota_hard}"
+
+echo "Tenant ${VAST_TENANT_NAME} on cluster ${VAST_CLUSTER_NAME}: logical=${logical_used:-n/a} physical=${physical_used:-n/a} drr=${drr:-n/a} quota_used=${quota_used:-n/a} quota_hard=${quota_hard:-n/a}"
+
+# Utilization is only computed when both a usage figure and a limit are known.
+util_pct=""
+if [[ -n "$used_bytes" && -n "$limit_bytes" ]]; then
+  util_pct="$(vast_percent_util "$used_bytes" "$limit_bytes")"
+fi
+
+if [[ -n "$util_pct" ]]; then
+  # Float comparison is delegated to Python because bash arithmetic is integer-only.
+  over_threshold="$(python3 - "$util_pct" "$CAPACITY_THRESHOLD" <<'PY'
+import sys
+util = float(sys.argv[1])
+threshold = float(sys.argv[2])
+print("yes" if util >= threshold else "no")
+PY
+)"
+  if [[ "$over_threshold" == "yes" ]]; then
+    # Severity 3 at the threshold, raised to 2 once utilization reaches 95%.
+    severity=3
+    if python3 -c "import sys; print('yes' if float(sys.argv[1]) >= 95 else 'no')" "$util_pct" | grep -q yes; then
+      severity=2
+    fi
+    issues_json="$(echo "$issues_json" | jq \
+      --arg title "Tenant capacity utilization high for \`${VAST_TENANT_NAME}\` on cluster \`${VAST_CLUSTER_NAME}\`" \
+      --arg details "Utilization ${util_pct}% exceeds threshold ${CAPACITY_THRESHOLD}%. used_bytes=${used_bytes} limit_bytes=${limit_bytes} logical=${logical_used:-unknown} physical=${physical_used:-unknown} drr=${drr:-unknown} soft_quota=${tenant_quota_soft:-unknown}" \
+      --argjson severity "$severity" \
+      --arg next_steps "Review tenant quotas and data growth on VAST. Increase quota, archive cold data, or expand tenant capacity limits in VMS." \
+      '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]')"
+  fi
+elif [[ -z "$limit_bytes" && -n "$logical_used" ]]; then
+  echo "No quota limit found in metrics or tenant REST; skipping utilization percentage check."
+fi
+
+echo "$issues_json" >"$OUTPUT_FILE"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
+cat "$OUTPUT_FILE"
diff --git a/codebundles/vast-tenant-storage-health/check-tenant-config.sh b/codebundles/vast-tenant-storage-health/check-tenant-config.sh
new file mode 100755
index 000000000..a7cc78411
--- /dev/null
+++ b/codebundles/vast-tenant-storage-health/check-tenant-config.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+# Review tenant user/group policies, export permissions, and quota policies.
+set -euo pipefail +set -x + +: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}" +: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}" +: "${VAST_TENANT_NAME:?Must set VAST_TENANT_NAME}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=vast-vms-helpers.sh +source "${SCRIPT_DIR}/vast-vms-helpers.sh" + +vast_require_cmd curl +vast_require_cmd jq +vast_require_cmd python3 + +OUTPUT_FILE="tenant_config_issues.json" +issues_json='[]' + +if ! vast_auth_configured; then + issues_json="$(vast_add_api_error_issue "$issues_json" \ + "Cannot authenticate to VMS for tenant \`${VAST_TENANT_NAME}\`" \ + "vast_vms_credentials secret missing USERNAME/PASSWORD or API_TOKEN." \ + 4)" + echo "$issues_json" >"$OUTPUT_FILE" + exit 0 +fi + +tenants_resp="$(vast_fetch_json_api "/tenants/")" +tenants_code="$(vast_fetch_http_code "$tenants_resp")" +tenants_body="$(vast_fetch_body "$tenants_resp")" + +if [[ "$tenants_code" != "200" ]]; then + issues_json="$(vast_add_api_error_issue "$issues_json" \ + "Tenant configuration API error for \`${VAST_TENANT_NAME}\`" \ + "HTTP ${tenants_code} from /api/tenants/. Body (truncated): ${tenants_body:0:400}" \ + 4)" + echo "$issues_json" >"$OUTPUT_FILE" + exit 0 +fi + +tenant_json="$(printf '%s' "$tenants_body" | vast_find_tenant_json "$tenants_body")" +tenant_name="$(printf '%s' "$tenant_json" | jq -r '.name // .tenant_name // empty')" +if [[ -z "$tenant_name" ]]; then + issues_json="$(echo "$issues_json" | jq \ + --arg title "Tenant not found in VMS: \`${VAST_TENANT_NAME}\`" \ + --arg details "No tenant named ${VAST_TENANT_NAME} matched cluster ${VAST_CLUSTER_NAME} in /api/tenants/." \ + --argjson severity 3 \ + --arg next_steps "Verify VAST_TENANT_NAME and VAST_CLUSTER_NAME qualifiers match VMS tenant records." \ + '. 
+= [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]')" + echo "$issues_json" >"$OUTPUT_FILE" + exit 0 +fi + +views_resp="$(vast_fetch_json_api "/views/?tenant_name=${VAST_TENANT_NAME}")" +views_code="$(vast_fetch_http_code "$views_resp")" +views_body="$(vast_fetch_body "$views_resp")" + +quotas_resp="$(vast_fetch_json_api "/quotas/?tenant_name=${VAST_TENANT_NAME}")" +quotas_code="$(vast_fetch_http_code "$quotas_resp")" +quotas_body="$(vast_fetch_body "$quotas_resp")" + +printf '%s' "$tenant_json" >/tmp/vast_tenant_config_check.json +printf '%s' "$views_body" >/tmp/vast_views_config.json +printf '%s' "$quotas_body" >/tmp/vast_quotas_config.json + +echo "Tenant configuration loaded for ${VAST_TENANT_NAME} (views HTTP ${views_code}, quotas HTTP ${quotas_code})" + +while IFS=$'\t' read -r title details severity; do + [[ -z "$title" ]] && continue + issues_json="$(echo "$issues_json" | jq \ + --arg title "$title" \ + --arg details "$details" \ + --argjson severity "$severity" \ + --arg next_steps "Review tenant policies in VMS: user/group mappings, export permissions, and quota definitions that may restrict client access or capacity." \ + '. 
+= [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]')" +done < <(VAST_TENANT_NAME="$VAST_TENANT_NAME" python3 <<'PY' +import json + +tenant = json.load(open("/tmp/vast_tenant_config_check.json")) +views_raw = open("/tmp/vast_views_config.json").read().strip() +quotas_raw = open("/tmp/vast_quotas_config.json").read().strip() + +def as_list(raw): + if not raw: + return [] + data = json.loads(raw) + if isinstance(data, list): + return data + return data.get("results", data.get("views", data.get("quotas", []))) + +views = as_list(views_raw) +quotas = as_list(quotas_raw) + +if tenant.get("enabled") is False or tenant.get("state") in ("disabled", "suspended"): + print(f"Tenant disabled in VMS\tTenant `{tenant.get('name', tenant.get('tenant_name'))}` state={tenant.get('state', tenant.get('enabled'))}\t3") + +qos = tenant.get("qos") or tenant.get("qos_policy") or {} +if isinstance(qos, dict) and qos.get("enabled") is False: + print(f"Tenant QoS policy disabled\tQoS policy is disabled; workloads may hit cluster defaults unexpectedly.\t4") + +for view in views: + path = view.get("path") or view.get("name") + policy = view.get("policy") or view.get("export_policy") or {} + if policy.get("permission") in ("RO", "read_only", "READ_ONLY"): + print(f"Read-only export policy on view `{path}`\tExport policy permission={policy.get('permission')} may block client writes.\t3") + if view.get("blocked") or view.get("write_blocked"): + print(f"View write blocked: `{path}`\tView reports blocked/write_blocked flag.\t3") + +for quota in quotas: + name = quota.get("name") or quota.get("path") or "quota" + if quota.get("exceeded") or quota.get("is_exceeded"): + print(f"Quota exceeded: `{name}`\tQuota exceeded flag set in VMS configuration.\t3") + if quota.get("blocked_users_count", 0) or quota.get("blocked_user_count", 0): + count = quota.get("blocked_users_count") or quota.get("blocked_user_count") + print(f"Quota blocking users on 
`{name}`\tblocked_users_count={count}\t4")
+PY
+)
+
+rm -f /tmp/vast_tenant_config_check.json /tmp/vast_views_config.json /tmp/vast_quotas_config.json
+
+echo "$issues_json" >"$OUTPUT_FILE"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
+cat "$OUTPUT_FILE"
diff --git a/codebundles/vast-tenant-storage-health/check-view-capacity.sh b/codebundles/vast-tenant-storage-health/check-view-capacity.sh
new file mode 100755
index 000000000..637dcd299
--- /dev/null
+++ b/codebundles/vast-tenant-storage-health/check-view-capacity.sh
@@ -0,0 +1,139 @@
+#!/usr/bin/env bash
+# Identify views approaching or exceeding capacity limits from VMS view metrics.
+set -euo pipefail
+# xtrace (set -x) is deliberately off: it would echo the VMS credentials handled by the sourced helpers to stderr.
+
+: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}"
+: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}"
+: "${VAST_TENANT_NAME:?Must set VAST_TENANT_NAME}"
+
+CAPACITY_THRESHOLD="${CAPACITY_THRESHOLD:-85}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=vast-vms-helpers.sh
+source "${SCRIPT_DIR}/vast-vms-helpers.sh"
+
+vast_require_cmd curl
+vast_require_cmd jq
+vast_require_cmd python3
+
+OUTPUT_FILE="view_capacity_issues.json"
+issues_json='[]'
+
+if ! vast_auth_configured; then
+    issues_json="$(vast_add_api_error_issue "$issues_json" \
+        "Cannot authenticate to VMS for tenant \`${VAST_TENANT_NAME}\`" \
+        "vast_vms_credentials secret missing USERNAME/PASSWORD or API_TOKEN." \
+        4)"
+    echo "$issues_json" >"$OUTPUT_FILE"
+    exit 0
+fi
+
+views_resp="$(vast_fetch_prometheus_metrics "views")"
+views_code="$(vast_fetch_http_code "$views_resp")"
+views_body="$(vast_fetch_body "$views_resp")"
+
+if [[ "$views_code" != "200" ]]; then
+    issues_json="$(vast_add_api_error_issue "$issues_json" \
+        "View metrics API error for tenant \`${VAST_TENANT_NAME}\`" \
+        "HTTP ${views_code} from /api/prometheusmetrics/views. Body (truncated): ${views_body:0:400}" \
+        4)"
+    echo "$issues_json" >"$OUTPUT_FILE"
+    exit 0
+fi
+
+quota_resp="$(vast_fetch_prometheus_metrics "quotas")"
+quota_code="$(vast_fetch_http_code "$quota_resp")"
+quota_body=""
+if [[ "$quota_code" == "200" ]]; then
+    quota_body="$(vast_fetch_body "$quota_resp")"
+fi
+
+# Private per-run scratch directory: fixed /tmp file names would be clobbered by concurrent runs for other tenants.
+work_dir="$(mktemp -d)"
+trap 'rm -rf "$work_dir"' EXIT
+printf '%s' "$views_body" >"${work_dir}/views.prom"
+# quota_body is empty unless the quota endpoint returned 200; an empty file simply yields no quota metrics.
+printf '%s' "$quota_body" >"${work_dir}/quotas.prom"
+
+analysis="$(VAST_TENANT_NAME="$VAST_TENANT_NAME" VAST_CLUSTER_NAME="$VAST_CLUSTER_NAME" \
+    CAPACITY_THRESHOLD="$CAPACITY_THRESHOLD" VAST_WORK_DIR="$work_dir" python3 <<'PY'
+import os
+
+tenant = os.environ["VAST_TENANT_NAME"]
+cluster = os.environ["VAST_CLUSTER_NAME"]
+threshold = float(os.environ["CAPACITY_THRESHOLD"])
+views_text = open(os.path.join(os.environ["VAST_WORK_DIR"], "views.prom")).read()
+quota_text = open(os.path.join(os.environ["VAST_WORK_DIR"], "quotas.prom")).read()
+
+def parse_metrics(text, prefix):
+    out = {}
+    for line in text.splitlines():
+        if not line or line.startswith("#") or not line.startswith(prefix):
+            continue
+        if "{" not in line:
+            continue
+        _, rest = line.split("{", 1)
+        labels_part, value_part = rest.rsplit("}", 1)
+        labels = "{" + labels_part + "}"
+        if tenant and f'tenant_name="{tenant}"' not in labels and f'tenant="{tenant}"' not in labels:
+            continue
+        if cluster and f'cluster="{cluster}"' not in labels:
+            continue
+        path = None
+        for token in labels_part.split(","):
+            token = token.strip()
+            if token.startswith('path="'):
+                path = token.split("=", 1)[1].strip('"')
+        if not path:
+            continue
+        try:
+            out[path] = float(value_part.split()[0])  # first field only: a sample may carry a trailing timestamp
+        except (ValueError, IndexError):
+            pass
+    return out
+
+logical = parse_metrics(views_text, "vast_view_logical_capacity")
+physical = parse_metrics(views_text, "vast_view_physical_capacity")
+quota_used = parse_metrics(quota_text, "vast_quota_used_capacity")
+quota_hard = parse_metrics(quota_text, "vast_quota_hard_limit")
+
+rows = []
+for path in sorted(set(logical) | set(physical) | set(quota_used) | set(quota_hard)):
+    log_val = logical.get(path, 0.0)
+    phys_val = physical.get(path, log_val)
+    used = quota_used.get(path, log_val)
+    hard = quota_hard.get(path)
+    util = None
+    if hard and hard > 0:
+        util = (used / hard) * 100.0
+    rows.append((path, log_val, phys_val, used, hard, util))
+
+for path, log_val, phys_val, used, hard, util in rows:
+    util_s = f"{util:.2f}" if util is not None else ""
+    print(f"{path}\t{log_val}\t{phys_val}\t{used}\t{hard or ''}\t{util_s}")
+    if util is not None and util >= threshold:
+        sev = 2 if util >= 95 else 3
+        print(f"ISSUE\t{path}\t{util:.2f}\t{sev}\tlogical={log_val} physical={phys_val} quota_used={used} quota_hard={hard}")
+PY
+)"
+
+while IFS= read -r line; do
+    if [[ "$line" == ISSUE* ]]; then
+        IFS=$'\t' read -r _ view_path util_pct severity details <<<"$line"
+        issues_json="$(echo "$issues_json" | jq \
+            --arg title "View capacity high for \`${view_path}\` (tenant \`${VAST_TENANT_NAME}\`)" \
+            --arg details "View ${view_path} utilization ${util_pct}% exceeds threshold ${CAPACITY_THRESHOLD}%. ${details}" \
+            --argjson severity "$severity" \
+            --arg next_steps "Review view quota policies, client write patterns, and snapshot retention for path ${view_path}. Free space or raise view/tenant quota in VMS." \
+            '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]')"
+    elif [[ -n "$line" ]]; then
+        echo "View metrics: $line"
+    fi
+done <<<"$analysis"
+
+rm -rf "$work_dir"
+
+echo "$issues_json" >"$OUTPUT_FILE"
+echo "Analysis completed. 
Results saved to $OUTPUT_FILE"
+cat "$OUTPUT_FILE"
diff --git a/codebundles/vast-tenant-storage-health/discover-vast-tenants.sh b/codebundles/vast-tenant-storage-health/discover-vast-tenants.sh
new file mode 100755
index 000000000..3e132c8fa
--- /dev/null
+++ b/codebundles/vast-tenant-storage-health/discover-vast-tenants.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# Discover VAST tenants via /api/tenants/ for runbook tenant scoping.
+set -euo pipefail
+# xtrace (set -x) is deliberately off: it would echo the VMS credentials handled by the sourced helpers to stderr.
+
+: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}"
+: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=vast-vms-helpers.sh
+source "${SCRIPT_DIR}/vast-vms-helpers.sh"
+
+vast_require_cmd curl
+vast_require_cmd jq
+vast_require_cmd python3
+
+if ! vast_auth_configured; then
+    echo "[]"
+    exit 0
+fi
+
+response="$(vast_fetch_json_api "/tenants/")" || {
+    echo "[]"
+    exit 0
+}
+
+http_code="$(vast_fetch_http_code "$response")"
+body="$(vast_fetch_body "$response")"
+
+if [[ "$http_code" != "200" ]]; then
+    echo "[]"
+    exit 0
+fi
+# The heredoc is python's stdin (it carries the program), so a pipe into stdin is lost; pass the body on fd 3 instead.
+python3 - "$VAST_CLUSTER_NAME" 3<<<"$body" <<'PY'
+import json, sys
+
+cluster_name = sys.argv[1]
+raw = open(3, encoding="utf-8").read().strip()
+if not raw:
+    print("[]")
+    raise SystemExit
+
+data = json.loads(raw)
+items = data if isinstance(data, list) else data.get("results", data.get("tenants", []))
+if not isinstance(items, list):
+    items = [data]
+
+names = []
+for item in items:
+    name = item.get("name") or item.get("tenant_name")
+    if not name:
+        continue
+    cluster = item.get("cluster_name") or item.get("cluster") or item.get("cluster_id")
+    if cluster_name:
+        if cluster and str(cluster) != cluster_name and cluster_name not in str(cluster):
+            continue
+    names.append(name)
+
+print(json.dumps(sorted(set(names))))
+PY
diff --git a/codebundles/vast-tenant-storage-health/qos_wait_issues.json b/codebundles/vast-tenant-storage-health/qos_wait_issues.json
new file mode 100644
index 
000000000..3b7d87f24 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/qos_wait_issues.json @@ -0,0 +1,8 @@ +[ + { + "title": "QoS wait time elevated for `demo-tenant`", + "details": "Tenant `demo-tenant` average qos_wait_for_budget_time=1500.0000 across 1 view metric series.", + "severity": 4, + "next_steps": "Inspect tenant QoS policy, metadata IOPS limits, and workloads causing sustained budget waits. Adjust QoS or spread metadata-heavy operations." + } +] diff --git a/codebundles/vast-tenant-storage-health/runbook.robot b/codebundles/vast-tenant-storage-health/runbook.robot new file mode 100644 index 000000000..26b6cf6cc --- /dev/null +++ b/codebundles/vast-tenant-storage-health/runbook.robot @@ -0,0 +1,352 @@ +*** Settings *** +Documentation Monitors per-tenant and per-view storage health on VAST Data including capacity, QoS throttling, latency, and configuration policies. +Metadata Author rw-codebundle-agent +Metadata Display Name VAST Data Tenant Storage Health +Metadata Supports VAST vast_data tenant storage QoS capacity + +Force Tags VAST vast_data tenant storage health + +Library String +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform +Library Collections + +Suite Setup Suite Initialization + +*** Tasks *** +Check Tenant Capacity Utilization for Tenant `${VAST_TENANT_NAME}` on Cluster `${VAST_CLUSTER_NAME}` + [Documentation] Compares tenant logical capacity and DRR against assigned quotas from /api/prometheusmetrics/tenants and /tenants/ REST endpoints. + [Tags] VAST vast_data tenant capacity access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-tenant-capacity.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... include_in_history=false + ... timeout_seconds=180 + ... show_in_rwl_cheatsheet=true + ... cmd_override=VAST_TENANT_NAME="${VAST_TENANT_NAME}" ./check-tenant-capacity.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat tenant_capacity_issues.json + ... 
timeout_seconds=30 + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for tenant capacity task, defaulting to empty list. WARN + ${issue_list}= Create List + END + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Tenant capacity utilization should remain below configured threshold + ... actual=${issue['details']} + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + RW.Core.Add Pre To Report Tenant capacity (${VAST_TENANT_NAME}):\n${result.stdout} + +Check View Volume Capacity for Tenant `${VAST_TENANT_NAME}` on Cluster `${VAST_CLUSTER_NAME}` + [Documentation] Identifies views approaching or exceeding capacity limits and detects full volumes blocking writes. + [Tags] VAST vast_data view capacity access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-view-capacity.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... include_in_history=false + ... timeout_seconds=180 + ... show_in_rwl_cheatsheet=true + ... cmd_override=VAST_TENANT_NAME="${VAST_TENANT_NAME}" ./check-view-capacity.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat view_capacity_issues.json + ... timeout_seconds=30 + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for view capacity task, defaulting to empty list. WARN + ${issue_list}= Create List + END + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=View logical capacity should remain below configured threshold + ... actual=${issue['details']} + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... 
next_steps=${issue['next_steps']} + END + END + RW.Core.Add Pre To Report View capacity (${VAST_TENANT_NAME}):\n${result.stdout} + +Analyze Tenant IOPS and Bandwidth Against QoS Limits for Tenant `${VAST_TENANT_NAME}` + [Documentation] Evaluates read/write IOPS and bandwidth metrics versus configured QoS ceilings and detects sustained throttling. + [Tags] VAST vast_data tenant QoS access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=analyze-tenant-qos.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... include_in_history=false + ... timeout_seconds=180 + ... show_in_rwl_cheatsheet=true + ... cmd_override=VAST_TENANT_NAME="${VAST_TENANT_NAME}" ./analyze-tenant-qos.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat tenant_qos_issues.json + ... timeout_seconds=30 + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for tenant QoS task, defaulting to empty list. WARN + ${issue_list}= Create List + END + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Tenant IO should remain below configured QoS limits + ... actual=${issue['details']} + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + RW.Core.Add Pre To Report Tenant QoS (${VAST_TENANT_NAME}):\n${result.stdout} + +Check QoS Wait Times and Throttling for Tenant `${VAST_TENANT_NAME}` + [Documentation] Inspects QoS wait time metrics and metadata IOPS limits to detect configurations limiting user throughput. + [Tags] VAST vast_data tenant QoS access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-qos-wait-times.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... include_in_history=false + ... timeout_seconds=180 + ... show_in_rwl_cheatsheet=true + ... 
cmd_override=VAST_TENANT_NAME="${VAST_TENANT_NAME}" ./check-qos-wait-times.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat qos_wait_issues.json + ... timeout_seconds=30 + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for QoS wait task, defaulting to empty list. WARN + ${issue_list}= Create List + END + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=QoS wait times should be minimal under normal load + ... actual=${issue['details']} + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + RW.Core.Add Pre To Report QoS wait times (${VAST_TENANT_NAME}):\n${result.stdout} + +Check User and Permission Configuration for Tenant `${VAST_TENANT_NAME}` + [Documentation] Reviews tenant user/group policies, export permissions, and quota policies that may restrict client access or capacity. + [Tags] VAST vast_data tenant config access:read-only data:config + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-tenant-config.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... include_in_history=false + ... timeout_seconds=180 + ... show_in_rwl_cheatsheet=false + ... cmd_override=VAST_TENANT_NAME="${VAST_TENANT_NAME}" ./check-tenant-config.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat tenant_config_issues.json + ... timeout_seconds=30 + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for tenant config task, defaulting to empty list. WARN + ${issue_list}= Create List + END + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Tenant configuration should not block legitimate client access + ... actual=${issue['details']} + ... title=${issue['title']} + ... 
reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + RW.Core.Add Pre To Report Tenant configuration (${VAST_TENANT_NAME}):\n${result.stdout} + +Analyze Read Write Latency Anomalies for Tenant `${VAST_TENANT_NAME}` + [Documentation] Detects elevated read/write/metadata latency from tenant metrics indicating storage performance degradation or IO stalls. + [Tags] VAST vast_data tenant latency access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=analyze-tenant-latency.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... include_in_history=false + ... timeout_seconds=180 + ... show_in_rwl_cheatsheet=true + ... cmd_override=VAST_TENANT_NAME="${VAST_TENANT_NAME}" ./analyze-tenant-latency.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat tenant_latency_issues.json + ... timeout_seconds=30 + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for tenant latency task, defaulting to empty list. WARN + ${issue_list}= Create List + END + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Tenant read/write latency should remain below configured threshold + ... actual=${issue['details']} + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + RW.Core.Add Pre To Report Tenant latency (${VAST_TENANT_NAME}):\n${result.stdout} + +Check Block Volume Health for Tenant `${VAST_TENANT_NAME}` + [Documentation] Monitors block volume IOPS, bandwidth, and latency via /api/prometheusmetrics/volumes for volumes not flowing data normally. + [Tags] VAST vast_data block volume access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-block-volume-health.sh + ... env=${env} + ... 
secret__vast_vms_credentials=${vast_vms_credentials} + ... include_in_history=false + ... timeout_seconds=180 + ... show_in_rwl_cheatsheet=true + ... cmd_override=VAST_TENANT_NAME="${VAST_TENANT_NAME}" ./check-block-volume-health.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat block_volume_issues.json + ... timeout_seconds=30 + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for block volume task, defaulting to empty list. WARN + ${issue_list}= Create List + END + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Block volumes should report healthy IO and latency + ... actual=${issue['details']} + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + RW.Core.Add Pre To Report Block volume health (${VAST_TENANT_NAME}):\n${result.stdout} + +*** Keywords *** +Suite Initialization + TRY + ${vast_vms_credentials}= RW.Core.Import Secret vast_vms_credentials + ... type=string + ... description=VMS API authentication credentials (USERNAME/PASSWORD or API_TOKEN JSON) + ... pattern=\w* + Set Suite Variable ${vast_vms_credentials} ${vast_vms_credentials} + EXCEPT + Log vast_vms_credentials secret not found; VMS API tasks will fail until configured. WARN + Set Suite Variable ${vast_vms_credentials} ${EMPTY} + END + + ${VAST_VMS_ENDPOINT}= RW.Core.Import User Variable VAST_VMS_ENDPOINT + ... type=string + ... description=VMS REST API base URL + ... pattern=\S+ + ${VAST_CLUSTER_NAME}= RW.Core.Import User Variable VAST_CLUSTER_NAME + ... type=string + ... description=VAST cluster name for qualifier scoping + ... pattern=\S+ + ${VAST_TENANT_NAME}= RW.Core.Import User Variable VAST_TENANT_NAME + ... type=string + ... description=VAST tenant name used as SLX qualifier + ... pattern=\S+ + ${TENANTS}= RW.Core.Import User Variable TENANTS + ... 
type=string + ... description=Tenant name or All for auto-discovery during generation + ... pattern=\S+ + ... default=All + ${CAPACITY_THRESHOLD}= RW.Core.Import User Variable CAPACITY_THRESHOLD + ... type=string + ... description=Tenant/view capacity utilization percent threshold + ... pattern=^[0-9.]+$ + ... default=85 + ${QOS_UTILIZATION_THRESHOLD}= RW.Core.Import User Variable QOS_UTILIZATION_THRESHOLD + ... type=string + ... description=Percent of QoS limit sustained that triggers throttling issue + ... pattern=^[0-9.]+$ + ... default=90 + ${LATENCY_THRESHOLD_MS}= RW.Core.Import User Variable LATENCY_THRESHOLD_MS + ... type=string + ... description=Read/write latency milliseconds above which to raise issue + ... pattern=^[0-9.]+$ + ... default=10 + + ${env}= Create Dictionary + ... VAST_VMS_ENDPOINT=${VAST_VMS_ENDPOINT} + ... VAST_CLUSTER_NAME=${VAST_CLUSTER_NAME} + ... VAST_TENANT_NAME=${VAST_TENANT_NAME} + ... TENANTS=${TENANTS} + ... CAPACITY_THRESHOLD=${CAPACITY_THRESHOLD} + ... QOS_UTILIZATION_THRESHOLD=${QOS_UTILIZATION_THRESHOLD} + ... LATENCY_THRESHOLD_MS=${LATENCY_THRESHOLD_MS} + + Set Suite Variable ${VAST_VMS_ENDPOINT} ${VAST_VMS_ENDPOINT} + Set Suite Variable ${VAST_CLUSTER_NAME} ${VAST_CLUSTER_NAME} + Set Suite Variable ${VAST_TENANT_NAME} ${VAST_TENANT_NAME} + Set Suite Variable ${TENANTS} ${TENANTS} + Set Suite Variable ${CAPACITY_THRESHOLD} ${CAPACITY_THRESHOLD} + Set Suite Variable ${QOS_UTILIZATION_THRESHOLD} ${QOS_UTILIZATION_THRESHOLD} + Set Suite Variable ${LATENCY_THRESHOLD_MS} ${LATENCY_THRESHOLD_MS} + Set Suite Variable ${env} ${env} + + IF '${TENANTS}' == 'All' and '${VAST_TENANT_NAME}' == '' + ${disco}= RW.CLI.Run Bash File + ... bash_file=discover-vast-tenants.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... include_in_history=false + ... timeout_seconds=120 + ... show_in_rwl_cheatsheet=true + ... 
cmd_override=./discover-vast-tenants.sh + TRY + ${tenant_list}= Evaluate json.loads(r'''${disco.stdout}''') json + EXCEPT + Log Failed to parse tenant discovery JSON. WARN + ${tenant_list}= Create List + END + ${n}= Get Length ${tenant_list} + RW.Core.Add Pre To Report Discovered ${n} tenant(s): ${tenant_list} + END diff --git a/codebundles/vast-tenant-storage-health/sli-vast-capacity-score.sh b/codebundles/vast-tenant-storage-health/sli-vast-capacity-score.sh new file mode 100755 index 000000000..20864032d --- /dev/null +++ b/codebundles/vast-tenant-storage-health/sli-vast-capacity-score.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Lightweight SLI: tenant quota/capacity utilization below CAPACITY_THRESHOLD. +set -euo pipefail + +: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}" +: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}" +: "${VAST_TENANT_NAME:?Must set VAST_TENANT_NAME}" + +CAPACITY_THRESHOLD="${CAPACITY_THRESHOLD:-85}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=vast-vms-helpers.sh +source "${SCRIPT_DIR}/vast-vms-helpers.sh" + +vast_require_cmd curl +vast_require_cmd jq +vast_require_cmd python3 + +if ! 
vast_auth_configured; then
+    echo '{"score":0,"reason":"missing credentials"}'
+    exit 0
+fi
+
+quota_resp="$(vast_fetch_prometheus_metrics "quotas")"
+quota_code="$(vast_fetch_http_code "$quota_resp")"
+quota_body="$(vast_fetch_body "$quota_resp")"
+
+score=0  # 1 = healthy; stays 0 unless utilization was measured and is below CAPACITY_THRESHOLD
+if [[ "$quota_code" == "200" ]]; then
+    util="$(quota_used="$(vast_prom_metric_sum "$quota_body" "vast_quota_used_capacity" || true)"; \
+        quota_hard="$(vast_prom_metric_values "$quota_body" "vast_quota_hard_limit" || true)"; \
+        if [[ -n "$quota_used" && -n "$quota_hard" ]]; then vast_percent_util "$quota_used" "$quota_hard"; else echo ""; fi)"
+    if [[ -n "$util" ]] && python3 -c "import sys; sys.exit(0 if float(sys.argv[1]) < float(sys.argv[2]) else 1)" "$util" "$CAPACITY_THRESHOLD"; then
+        score=1  # python exits 0 only when utilization < threshold
+    fi
+fi
+
+echo "{\"score\":${score},\"utilization_pct\":\"${util:-unknown}\"}"
diff --git a/codebundles/vast-tenant-storage-health/sli-vast-latency-score.sh b/codebundles/vast-tenant-storage-health/sli-vast-latency-score.sh
new file mode 100755
index 000000000..a5542abc0
--- /dev/null
+++ b/codebundles/vast-tenant-storage-health/sli-vast-latency-score.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Lightweight SLI: tenant read/write latency below LATENCY_THRESHOLD_MS.
+set -euo pipefail
+
+: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}"
+: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}"
+: "${VAST_TENANT_NAME:?Must set VAST_TENANT_NAME}"
+
+LATENCY_THRESHOLD_MS="${LATENCY_THRESHOLD_MS:-10}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=vast-vms-helpers.sh
+source "${SCRIPT_DIR}/vast-vms-helpers.sh"
+
+vast_require_cmd curl
+vast_require_cmd python3
+
+if ! 
vast_auth_configured; then
+    echo '{"score":0,"reason":"missing credentials"}'
+    exit 0
+fi
+
+tenant_resp="$(vast_fetch_prometheus_metrics "tenants")"
+tenant_code="$(vast_fetch_http_code "$tenant_resp")"
+tenant_body="$(vast_fetch_body "$tenant_resp")"
+
+score=0  # 1 = healthy; stays 0 unless latency was measured and is within LATENCY_THRESHOLD_MS
+max_latency=""
+if [[ "$tenant_code" == "200" ]]; then
+    read_lat="$(printf '%s' "$tenant_body" | vast_prom_metric_values "vast_tenant_metrics_TenantMetrics_read_latency" || true)"
+    write_lat="$(printf '%s' "$tenant_body" | vast_prom_metric_values "vast_tenant_metrics_TenantMetrics_write_latency" || true)"
+    max_latency="$(python3 - "$read_lat" "$write_lat" <<'PY'
+import sys
+vals = []
+for v in " ".join(sys.argv[1:]).split():
+    # One float per whitespace-separated token, so multi-value helper output is not discarded as a whole.
+    try:
+        vals.append(float(v))
+    except ValueError:
+        pass
+print(max(vals) if vals else "")
+PY
+)"
+    if [[ -n "$max_latency" ]]; then
+        latency_ms="$(python3 -c 'import sys; v = float(sys.argv[1]); print(v / 1000 if v > 1000 else v)' "$max_latency")"  # value passed as argv, not spliced into code
+        if python3 -c "import sys; sys.exit(0 if float(sys.argv[1]) <= float(sys.argv[2]) else 1)" "$latency_ms" "$LATENCY_THRESHOLD_MS"; then
+            score=1  # python exits 0 only when latency <= threshold
+        fi
+    fi
+fi
+
+echo "{\"score\":${score},\"max_latency_ms\":\"${latency_ms:-unknown}\"}"
diff --git a/codebundles/vast-tenant-storage-health/sli-vast-qos-score.sh b/codebundles/vast-tenant-storage-health/sli-vast-qos-score.sh
new file mode 100755
index 000000000..29ec09395
--- /dev/null
+++ b/codebundles/vast-tenant-storage-health/sli-vast-qos-score.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Lightweight SLI: no sustained QoS wait time samples for tenant views.
+set -euo pipefail
+
+: "${VAST_VMS_ENDPOINT:?Must set VAST_VMS_ENDPOINT}"
+: "${VAST_CLUSTER_NAME:?Must set VAST_CLUSTER_NAME}"
+: "${VAST_TENANT_NAME:?Must set VAST_TENANT_NAME}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=vast-vms-helpers.sh
+source "${SCRIPT_DIR}/vast-vms-helpers.sh"
+
+vast_require_cmd curl
+vast_require_cmd python3
+
+if ! 
vast_auth_configured; then
+    echo '{"score":0,"reason":"missing credentials"}'
+    exit 0
+fi
+
+views_resp="$(vast_fetch_prometheus_metrics "views")"
+views_code="$(vast_fetch_http_code "$views_resp")"
+views_body="$(vast_fetch_body "$views_resp")"
+
+score=0  # 1 = healthy; stays 0 unless a QoS wait sum was obtained and shows no waiting
+if [[ "$views_code" == "200" ]]; then
+    wait_sum="$(printf '%s' "$views_body" | vast_prom_metric_sum "vast_view_metrics_ViewMetrics_qos_wait_for_budget_time" || true)"
+    if [[ -n "$wait_sum" ]] && python3 -c "import sys; sys.exit(0 if float(sys.argv[1]) <= 0 else 1)" "$wait_sum"; then
+        score=1  # python exits 0 only when the summed wait time is <= 0
+    fi
+fi
+
+echo "{\"score\":${score},\"qos_wait_sum\":\"${wait_sum:-0}\"}"
diff --git a/codebundles/vast-tenant-storage-health/sli.robot b/codebundles/vast-tenant-storage-health/sli.robot
new file mode 100644
index 000000000..4faedba62
--- /dev/null
+++ b/codebundles/vast-tenant-storage-health/sli.robot
@@ -0,0 +1,132 @@
+*** Settings ***
+Documentation       Measures VAST tenant storage health from capacity, QoS wait times, and latency. Produces a score between 0 and 1.
+Metadata            Author    rw-codebundle-agent
+Metadata            Display Name    VAST Data Tenant Storage Health SLI
+Metadata            Supports    VAST vast_data tenant storage
+
+Library             BuiltIn
+Library             RW.Core
+Library             RW.CLI
+Library             RW.platform
+
+Suite Setup         Suite Initialization
+
+*** Keywords ***
+Suite Initialization
+    TRY
+        ${vast_vms_credentials}=    RW.Core.Import Secret    vast_vms_credentials
+        ...    type=string
+        ...    description=VMS API authentication credentials (USERNAME/PASSWORD or API_TOKEN JSON)
+        ...    pattern=\w*
+        Set Suite Variable    ${vast_vms_credentials}    ${vast_vms_credentials}
+    EXCEPT
+        Log    vast_vms_credentials secret not found.    WARN
+        Set Suite Variable    ${vast_vms_credentials}    ${EMPTY}
+    END
+
+    ${VAST_VMS_ENDPOINT}=    RW.Core.Import User Variable    VAST_VMS_ENDPOINT
+    ...    type=string
+    ...    description=VMS REST API base URL
+    ...    pattern=\S+
+    ${VAST_CLUSTER_NAME}=    RW.Core.Import User Variable    VAST_CLUSTER_NAME
+    ...    type=string
+    ...    
description=VAST cluster name for qualifier scoping + ... pattern=\S+ + ${VAST_TENANT_NAME}= RW.Core.Import User Variable VAST_TENANT_NAME + ... type=string + ... description=VAST tenant name used as SLX qualifier + ... pattern=\S+ + ${CAPACITY_THRESHOLD}= RW.Core.Import User Variable CAPACITY_THRESHOLD + ... type=string + ... description=Tenant/view capacity utilization percent threshold + ... pattern=^[0-9.]+$ + ... default=85 + ${QOS_UTILIZATION_THRESHOLD}= RW.Core.Import User Variable QOS_UTILIZATION_THRESHOLD + ... type=string + ... description=Percent of QoS limit sustained that triggers throttling issue + ... pattern=^[0-9.]+$ + ... default=90 + ${LATENCY_THRESHOLD_MS}= RW.Core.Import User Variable LATENCY_THRESHOLD_MS + ... type=string + ... description=Read/write latency milliseconds above which to raise issue + ... pattern=^[0-9.]+$ + ... default=10 + + ${env}= Create Dictionary + ... VAST_VMS_ENDPOINT=${VAST_VMS_ENDPOINT} + ... VAST_CLUSTER_NAME=${VAST_CLUSTER_NAME} + ... VAST_TENANT_NAME=${VAST_TENANT_NAME} + ... CAPACITY_THRESHOLD=${CAPACITY_THRESHOLD} + ... QOS_UTILIZATION_THRESHOLD=${QOS_UTILIZATION_THRESHOLD} + ... LATENCY_THRESHOLD_MS=${LATENCY_THRESHOLD_MS} + Set Suite Variable ${env} ${env} + Set Suite Variable ${score_capacity} 0 + Set Suite Variable ${score_qos} 0 + Set Suite Variable ${score_latency} 0 + +*** Tasks *** +Score Tenant Capacity Utilization + [Documentation] Binary 1/0 score when tenant quota utilization is below CAPACITY_THRESHOLD. + [Tags] VAST vast_data sli access:read-only data:metrics + ${out}= RW.CLI.Run Bash File + ... bash_file=sli-vast-capacity-score.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... include_in_history=false + ... timeout_seconds=60 + TRY + ${data}= Evaluate json.loads(r'''${out.stdout}''') json + EXCEPT + Log SLI capacity JSON parse failed; scoring 0. 
WARN + ${data}= Create Dictionary score=0 + END + ${s}= Set Variable ${data.get('score', 0)} + Set Suite Variable ${score_capacity} ${s} + RW.Core.Push Metric ${s} sub_name=capacity + +Score Tenant QoS Wait Times + [Documentation] Binary 1/0 score when QoS wait time metrics indicate no throttling. + [Tags] VAST vast_data sli access:read-only data:metrics + ${out}= RW.CLI.Run Bash File + ... bash_file=sli-vast-qos-score.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... include_in_history=false + ... timeout_seconds=60 + TRY + ${data}= Evaluate json.loads(r'''${out.stdout}''') json + EXCEPT + Log SLI QoS JSON parse failed; scoring 0. WARN + ${data}= Create Dictionary score=0 + END + ${s}= Set Variable ${data.get('score', 0)} + Set Suite Variable ${score_qos} ${s} + RW.Core.Push Metric ${s} sub_name=qos + +Score Tenant Read Write Latency + [Documentation] Binary 1/0 score when tenant latency remains below LATENCY_THRESHOLD_MS. + [Tags] VAST vast_data sli access:read-only data:metrics + ${out}= RW.CLI.Run Bash File + ... bash_file=sli-vast-latency-score.sh + ... env=${env} + ... secret__vast_vms_credentials=${vast_vms_credentials} + ... include_in_history=false + ... timeout_seconds=60 + TRY + ${data}= Evaluate json.loads(r'''${out.stdout}''') json + EXCEPT + Log SLI latency JSON parse failed; scoring 0. WARN + ${data}= Create Dictionary score=0 + END + ${s}= Set Variable ${data.get('score', 0)} + Set Suite Variable ${score_latency} ${s} + RW.Core.Push Metric ${s} sub_name=latency + +Generate Aggregate VAST Tenant Storage Health Score + [Documentation] Averages binary sub-scores into the primary 0-1 SLI metric. 
+ [Tags] VAST vast_data sli access:read-only data:metrics + ${health_score}= Evaluate (int(${score_capacity}) + int(${score_qos}) + int(${score_latency})) / 3.0 + ${health_score}= Convert To Number ${health_score} 2 + ${report_line}= Set Variable VAST tenant storage health score: ${health_score} [capacity=${score_capacity}, qos=${score_qos}, latency=${score_latency}] + RW.Core.Add to Report ${report_line} + RW.Core.Push Metric ${health_score} diff --git a/codebundles/vast-tenant-storage-health/tenant_qos_issues.json b/codebundles/vast-tenant-storage-health/tenant_qos_issues.json new file mode 100644 index 000000000..fe51488c7 --- /dev/null +++ b/codebundles/vast-tenant-storage-health/tenant_qos_issues.json @@ -0,0 +1 @@ +[] diff --git a/codebundles/vast-tenant-storage-health/vast-vms-helpers.sh b/codebundles/vast-tenant-storage-health/vast-vms-helpers.sh new file mode 100755 index 000000000..3d96b2f3c --- /dev/null +++ b/codebundles/vast-tenant-storage-health/vast-vms-helpers.sh @@ -0,0 +1,265 @@ +#!/usr/bin/env bash +# Shared VAST VMS API helpers. Source from task scripts; do not execute directly. 

# Abort the calling script when a required executable is not on PATH.
#   $1 - command name, looked up with `command -v`.
# Exits with status 1 (this file is sourced, so the caller terminates).
vast_require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "Required command not found: $1" >&2
    exit 1
  fi
}

# Print VAST_VMS_ENDPOINT without trailing slashes and without a trailing
# "/api" segment, so vast_api_url can append "/api/..." exactly once.
vast_normalize_endpoint() {
  local ep="${VAST_VMS_ENDPOINT}"
  # Strip every trailing slash, not just one, so "https://vms/api//"
  # normalizes the same way as "https://vms/api/".
  while [[ "$ep" == */ ]]; do
    ep="${ep%/}"
  done
  ep="${ep%/api}"
  echo "$ep"
}

# Populate VAST_USERNAME, VAST_PASSWORD and VAST_API_TOKEN.
# Source order:
#   1. JSON in $vast_vms_credentials (or $VAST_VMS_CREDENTIALS), keys
#      USERNAME/PASSWORD/API_TOKEN in upper or lower case;
#   2. for any value still empty, the plain USERNAME / PASSWORD / API_TOKEN
#      environment variables.
vast_load_auth() {
  local creds_json="${vast_vms_credentials:-${VAST_VMS_CREDENTIALS:-}}"

  if [[ -n "$creds_json" ]]; then
    VAST_USERNAME="$(printf '%s' "$creds_json" | jq -r '.USERNAME // .username // empty')"
    VAST_PASSWORD="$(printf '%s' "$creds_json" | jq -r '.PASSWORD // .password // empty')"
    VAST_API_TOKEN="$(printf '%s' "$creds_json" | jq -r '.API_TOKEN // .api_token // empty')"
  fi

  VAST_USERNAME="${VAST_USERNAME:-${USERNAME:-}}"
  VAST_PASSWORD="${VAST_PASSWORD:-${PASSWORD:-}}"
  VAST_API_TOKEN="${VAST_API_TOKEN:-${API_TOKEN:-}}"
}

# Return 0 when either an API token or a username/password pair is
# available after vast_load_auth, 1 otherwise.
vast_auth_configured() {
  vast_load_auth
  if [[ -n "${VAST_API_TOKEN:-}" ]]; then
    return 0
  fi
  if [[ -n "${VAST_USERNAME:-}" && -n "${VAST_PASSWORD:-}" ]]; then
    return 0
  fi
  return 1
}

# Fill the caller's array with the curl options shared by every request.
#   $1 - NAME of an array variable in the caller (written via nameref; the
#        caller's variable must not itself be called "_out").
# Reads:
#   VAST_API_TIMEOUT     - --max-time in seconds (default 90)
#   VAST_VMS_VERIFY_TLS  - set to 1/true/yes/on to verify the server
#                          certificate; any other value (default) keeps the
#                          historical "-k" behaviour
#   VAST_API_TOKEN / VAST_USERNAME + VAST_PASSWORD - token wins over basic auth
#   VAST_TENANT_NAME     - sent as X-Tenant-Name when non-empty
vast_curl_common_args() {
  local -n _out=$1
  _out=(-sS --max-time "${VAST_API_TIMEOUT:-90}")

  # Certificate verification used to be unconditionally disabled. Keep that
  # default for compatibility, but allow operators to turn verification on
  # so credentials are not sent to an unauthenticated endpoint.
  case "${VAST_VMS_VERIFY_TLS:-false}" in
    1 | true | TRUE | True | yes | YES | Yes | on | ON | On) ;;
    *) _out+=(-k) ;;
  esac

  if [[ -n "${VAST_API_TOKEN:-}" ]]; then
    _out+=(-H "Authorization: Bearer ${VAST_API_TOKEN}")
  elif [[ -n "${VAST_USERNAME:-}" && -n "${VAST_PASSWORD:-}" ]]; then
    _out+=(-u "${VAST_USERNAME}:${VAST_PASSWORD}")
  fi
  if [[ -n "${VAST_TENANT_NAME:-}" ]]; then
    _out+=(-H "X-Tenant-Name: ${VAST_TENANT_NAME}")
  fi
}

# Print the absolute URL for an API path.
#   $1 - path with or without a leading "/", with or without the "/api" prefix.
vast_api_url() {
  local path="$1"
  local endpoint
  endpoint="$(vast_normalize_endpoint)"
  if [[ "$path" != /* ]]; then
    path="/${path}"
  fi
  if [[ "$path" == /api/* ]]; then
    echo "${endpoint}${path}"
  else
    echo "${endpoint}/api${path}"
  fi
}

# Issue an HTTP request against the VMS API.
#   $1 - HTTP method (default GET)
#   $2 - API path, resolved with vast_api_url
# Prints the response body followed by a newline and the HTTP status code on
# the last line; split it with vast_fetch_body / vast_fetch_http_code.
# Credentials are taken from the VAST_* variables as they are at call time;
# this function does not call vast_load_auth itself.
vast_http_request() {
  local method="${1:-GET}"
  local path="$2"
  local url
  url="$(vast_api_url "$path")"
  local -a curl_args=()
  vast_curl_common_args curl_args
  curl "${curl_args[@]}" -X "$method" -w $'\n%{http_code}' "$url"
}

# Print a vast_http_request response without its final status-code line.
#   $1 - full response text as printed by vast_http_request.
vast_fetch_body() {
  local response="$1"
  printf '%s' "$response" | sed '$d'
}

# Print the HTTP status code, i.e. the last line of a vast_http_request response.
#   $1 - full response text as printed by vast_http_request.
vast_fetch_http_code() {
  local response="$1"
  printf '%s' "$response" | tail -n1
}

# GET a Prometheus metrics document from /api/prometheusmetrics/<path>.
#   $1 - path below /api/prometheusmetrics/.
# Output has the same body + status-code layout as vast_http_request.
vast_fetch_prometheus_metrics() {
  local metrics_path="$1"
  vast_http_request GET "/api/prometheusmetrics/${metrics_path}"
}

# GET a JSON API path; thin wrapper around vast_http_request.
#   $1 - API path.
vast_fetch_json_api() {
  local path="$1"
  vast_http_request GET "$path"
}

# Print the given JSON issue array with one more issue object appended.
#   $1 - JSON array of existing issues (an empty string is treated as [])
#   $2 - issue title
#   $3 - issue details
#   $4 - numeric severity (default 4); must be valid JSON for --argjson
#   $5 - next steps text (defaults to a generic connectivity hint)
vast_add_api_error_issue() {
  # An empty string would make jq emit nothing and silently drop the issue.
  local issues_json="${1:-[]}"
  local title="$2"
  local details="$3"
  local severity="${4:-4}"
  local next_steps="${5:-Verify VAST_VMS_ENDPOINT, credentials, and network access to the VMS API.}"
  printf '%s\n' "$issues_json" | jq \
    --arg title "$title" \
    --arg details "$details" \
    --argjson severity "$severity" \
    --arg next_steps "$next_steps" \
    '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]'
}

# Private: reduce the samples of a Prometheus text document to one number.
#   $1 - "max" or "sum"
#   $2 - Prometheus exposition text
#   $3 - metric name prefix; a sample is considered when its line starts with it
# Filters (exact label match, applied to labelled samples only):
#   VAST_TENANT_NAME  - must equal label tenant_name, tenant or name
#   VAST_CLUSTER_NAME - must equal label cluster
# The script is fed to python3 on stdin, so the metrics text cannot also
# arrive on stdin; it is handed over on file descriptor 3 instead.
_vast_prom_reduce() {
  local mode="$1"
  local prom_text="$2"
  local metric_prefix="$3"
  local tenant="${VAST_TENANT_NAME:-}"
  local cluster="${VAST_CLUSTER_NAME:-}"
  python3 - "$mode" "$metric_prefix" "$tenant" "$cluster" 3<<<"$prom_text" <<'PY'
import re
import sys

mode, metric_prefix, tenant, cluster = sys.argv[1:5]

# stdin carried this script; the metrics document is on fd 3.
with open(3, encoding="utf-8", errors="replace") as handle:
    text = handle.read()

LABEL_RE = re.compile(r'([A-Za-z_][A-Za-z0-9_]*)="((?:[^"\\]|\\.)*)"')
TENANT_KEYS = ("tenant_name", "tenant", "name")


def labels_match(label_text):
    """Return True when the label set satisfies the tenant/cluster filters."""
    labels = dict(LABEL_RE.findall(label_text))
    # Compare whole label names: a substring test would let name="x" match
    # view_name="x" or policy_name="x".
    if tenant and not any(labels.get(key) == tenant for key in TENANT_KEYS):
        return False
    if cluster and labels.get("cluster") != cluster:
        return False
    return True


def first_float(field_text):
    """Parse the sample value; an optional trailing timestamp is ignored."""
    tokens = field_text.split()
    if not tokens:
        return None
    try:
        return float(tokens[0])
    except ValueError:
        return None


values = []
for line in text.splitlines():
    if not line or line.startswith("#") or not line.startswith(metric_prefix):
        continue
    if "{" in line:
        name_part, _, rest = line.partition("{")
        label_text, closed, value_text = rest.rpartition("}")
        if not closed or not name_part.startswith(metric_prefix):
            continue
        if not labels_match(label_text):
            continue
        value = first_float(value_text)
    elif mode == "max":
        # Unlabelled samples take part in the maximum only (never in the
        # sum) and are not subject to the tenant/cluster filters.
        fields = line.split()
        if len(fields) < 2 or not fields[0].startswith(metric_prefix):
            continue
        value = first_float(fields[1])
    else:
        continue
    if value is not None:
        values.append(value)

if mode == "max":
    if values:
        print(max(values))
else:
    print(sum(values) if values else "")
PY
}

# Print the largest sample value among metrics whose name starts with the
# given prefix, or nothing when no sample matches.
#   $1 - Prometheus exposition text
#   $2 - metric name prefix
# Honors VAST_TENANT_NAME / VAST_CLUSTER_NAME for labelled samples.
vast_prom_metric_values() {
  _vast_prom_reduce max "$1" "$2"
}

# Print the sum of labelled sample values among metrics whose name starts
# with the given prefix, or an empty line when no sample matches.
#   $1 - Prometheus exposition text
#   $2 - metric name prefix
# Honors VAST_TENANT_NAME / VAST_CLUSTER_NAME.
vast_prom_metric_sum() {
  _vast_prom_reduce sum "$1" "$2"
}

# Print used/limit as a percentage with two decimals.
#   $1 - used amount
#   $2 - limit
# Prints an empty line when either argument is not a number or the limit
# is zero or negative.
vast_percent_util() {
  local used="$1"
  local limit="$2"
  python3 - "$used" "$limit" <<'PY'
import sys

used_arg, limit_arg = sys.argv[1:3]
try:
    used_value = float(used_arg)
    limit_value = float(limit_arg)
except ValueError:
    print("")
    raise SystemExit
if limit_value <= 0:
    print("")
else:
    print(f"{(used_value / limit_value) * 100:.2f}")
PY
}

# Print the JSON object of the tenant named VAST_TENANT_NAME, or "{}".
#   $1 - tenants JSON: a list, or an object holding the list under
#        "results" or "tenants" (any other object is treated as one tenant)
# When VAST_CLUSTER_NAME is set, a tenant that reports a cluster
# (cluster_name, cluster or cluster_id) is skipped unless that value equals
# or contains VAST_CLUSTER_NAME. Invalid JSON yields "{}" plus a message on
# stderr.
# The script is fed to python3 on stdin, so the JSON is passed on fd 3.
vast_find_tenant_json() {
  local tenants_json="$1"
  local tenant_name="${VAST_TENANT_NAME:-}"
  local cluster_name="${VAST_CLUSTER_NAME:-}"
  python3 - "$tenant_name" "$cluster_name" 3<<<"$tenants_json" <<'PY'
import json
import sys

tenant_name, cluster_name = sys.argv[1:3]

with open(3, encoding="utf-8", errors="replace") as handle:
    raw = handle.read().strip()
if not raw:
    print("{}")
    raise SystemExit

try:
    data = json.loads(raw)
except json.JSONDecodeError as exc:
    print(f"vast_find_tenant_json: invalid tenants JSON: {exc}", file=sys.stderr)
    print("{}")
    raise SystemExit

if isinstance(data, list):
    items = data
elif isinstance(data, dict):
    items = data.get("results", data.get("tenants", []))
    if not isinstance(items, list):
        items = [data]
else:
    items = []

for item in items:
    if not isinstance(item, dict):
        continue
    name = item.get("name") or item.get("tenant_name")
    if name != tenant_name:
        continue
    if cluster_name:
        cluster = item.get("cluster_name") or item.get("cluster") or item.get("cluster_id")
        if cluster and str(cluster) != cluster_name and cluster_name not in str(cluster):
            continue
    print(json.dumps(item))
    raise SystemExit

print("{}")
PY
}