diff --git a/.gitignore b/.gitignore index fd41c7a0..c941dfc9 100644 --- a/.gitignore +++ b/.gitignore @@ -63,4 +63,5 @@ build **vm_disk_stdout** **/.test/.gitconfig Runbook Log.html -**/debug/** \ No newline at end of file +**/debug/** +**/.cb-temp/** \ No newline at end of file diff --git a/codebundles/k8s-seaweedfs-healthcheck/.runwhen/generation-rules/k8s-seaweedfs-healthcheck.yaml b/codebundles/k8s-seaweedfs-healthcheck/.runwhen/generation-rules/k8s-seaweedfs-healthcheck.yaml new file mode 100644 index 00000000..3f4d8d80 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.runwhen/generation-rules/k8s-seaweedfs-healthcheck.yaml @@ -0,0 +1,32 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + generationRules: + - resourceTypes: + - statefulset + matchRules: + - type: and + matches: + - type: pattern + pattern: "seaweedfs" + properties: [label-values] + mode: substring + - type: pattern + pattern: "seaweedfs-" + properties: [label-values] + mode: substring + - type: pattern + pattern: "master" + properties: [label-values] + mode: substring + slxs: + - baseName: swfs-hlth + shortenedBaseName: swfs-hc + qualifiers: ["namespace", "cluster"] + baseTemplateName: k8s-seaweedfs-healthcheck + levelOfDetail: basic + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: k8s-seaweedfs-healthcheck-taskset.yaml diff --git a/codebundles/k8s-seaweedfs-healthcheck/.runwhen/templates/k8s-seaweedfs-healthcheck-sli.yaml b/codebundles/k8s-seaweedfs-healthcheck/.runwhen/templates/k8s-seaweedfs-healthcheck-sli.yaml new file mode 100644 index 00000000..de202283 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.runwhen/templates/k8s-seaweedfs-healthcheck-sli.yaml @@ -0,0 +1,52 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + displayUnitsLong: OK + displayUnitsShort: ok + locations: +
- {{default_location}} + description: Aggregates SeaweedFS workload readiness, master health, volume slot availability, and filer connectivity into a 0-1 score. + codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-seaweedfs-healthcheck/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 300 + configProvided: + - name: KUBERNETES_DISTRIBUTION_BINARY + value: "{{ custom.kubernetes_distribution_binary | default('kubectl') }}" + - name: CONTEXT + value: "{{ context }}" + - name: NAMESPACE + value: "{{ match_resource.resource.metadata.namespace }}" + - name: SEAWEEDFS_RELEASE_NAME + value: "{{ match_resource.resource.metadata.labels['app.kubernetes.io/instance'] | default(custom.seaweedfs_release_name | default('')) }}" + - name: SEAWEEDFS_CHART + value: "{{ match_resource.resource.metadata.labels['helm.sh/chart'] | default(custom.seaweedfs_chart | default('')) }}" + - name: MIN_FREE_VOLUME_SLOTS + value: "{{ custom.min_free_volume_slots | default('1') }}" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{ custom.kubeconfig_secret_name | default("kubeconfig") }} + {% endif %} + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m diff --git a/codebundles/k8s-seaweedfs-healthcheck/.runwhen/templates/k8s-seaweedfs-healthcheck-slx.yaml b/codebundles/k8s-seaweedfs-healthcheck/.runwhen/templates/k8s-seaweedfs-healthcheck-slx.yaml new file mode 100644 index 00000000..a3e39031 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.runwhen/templates/k8s-seaweedfs-healthcheck-slx.yaml @@ -0,0 +1,27 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include 
"common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/seaweed-logo.png + alias: SeaweedFS Health in {{ match_resource.resource.metadata.namespace }} + asMeasuredBy: SeaweedFS master leadership, volume slot availability, filer connectivity, and optional S3 gateway probes. + configProvided: + - name: SEAWEEDFS_RELEASE_NAME + value: "{{ match_resource.resource.metadata.labels['app.kubernetes.io/instance'] | default('') }}" + - name: SEAWEEDFS_CHART + value: "{{ match_resource.resource.metadata.labels['helm.sh/chart'] | default('') }}" + owners: + - {{ workspace.owner_email }} + statement: SeaweedFS storage in namespace {{ match_resource.resource.metadata.namespace }} should have healthy masters, volume capacity, and working filer connectivity. + additionalContext: + {% include "kubernetes-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name }}" + tags: + {% include "kubernetes-tags.yaml" ignore missing %} + - name: access + value: read-only diff --git a/codebundles/k8s-seaweedfs-healthcheck/.runwhen/templates/k8s-seaweedfs-healthcheck-taskset.yaml b/codebundles/k8s-seaweedfs-healthcheck/.runwhen/templates/k8s-seaweedfs-healthcheck-taskset.yaml new file mode 100644 index 00000000..ea674139 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.runwhen/templates/k8s-seaweedfs-healthcheck-taskset.yaml @@ -0,0 +1,63 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Validates SeaweedFS master, volume, filer, and S3 gateway health in the target namespace. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-seaweedfs-healthcheck/runbook.robot + configProvided: + - name: KUBERNETES_DISTRIBUTION_BINARY + value: "{{ custom.kubernetes_distribution_binary | default('kubectl') }}" + - name: CONTEXT + value: "{{ context }}" + - name: NAMESPACE + value: "{{ match_resource.resource.metadata.namespace }}" + - name: SEAWEEDFS_RELEASE_NAME + value: "{{ match_resource.resource.metadata.labels['app.kubernetes.io/instance'] | default(custom.seaweedfs_release_name | default('')) }}" + - name: SEAWEEDFS_CHART + value: "{{ match_resource.resource.metadata.labels['helm.sh/chart'] | default(custom.seaweedfs_chart | default('')) }}" + - name: SEAWEEDFS_MASTER_SERVICE + value: "{{ custom.seaweedfs_master_service | default('') }}" + - name: SEAWEEDFS_FILER_SERVICE + value: "{{ custom.seaweedfs_filer_service | default('') }}" + - name: SEAWEEDFS_S3_ENDPOINT + value: "{{ custom.seaweedfs_s3_endpoint | default('') }}" + - name: MIN_FREE_VOLUME_SLOTS + value: "{{ custom.min_free_volume_slots | default('1') }}" + - name: MIN_FREE_DISK_PERCENT + value: "{{ custom.min_free_disk_percent | default('10') }}" + - name: S3_PROBE_BUCKET + value: "{{ custom.s3_probe_bucket | default('') }}" + - name: CAPACITY_WARN_PERCENT + value: "{{ custom.capacity_warn_percent | default('80') }}" + - name: MIN_PROJECTION_HOURS + value: "{{ custom.min_projection_hours | default('24') }}" + - name: MAX_PICK_FOR_WRITE_ERRORS + value: "{{ custom.max_pick_for_write_errors | default('100') }}" + - name: MAX_VOLUME_DISK_ERRORS + value: "{{ custom.max_volume_disk_errors | default('50') }}" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{ custom.kubeconfig_secret_name | 
default("kubeconfig") }} + {% endif %} + - name: seaweedfs_s3_credentials + workspaceKey: {{ custom.seaweedfs_s3_credentials_secret_name | default("seaweedfs_s3_credentials") }} diff --git a/codebundles/k8s-seaweedfs-healthcheck/.test/.gitignore b/codebundles/k8s-seaweedfs-healthcheck/.test/.gitignore new file mode 100644 index 00000000..51224c00 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.test/.gitignore @@ -0,0 +1,8 @@ +output/ +workspaceInfo.yaml +kubeconfig.secret +.azure-devops/ +.kube/ +terraform/.terraform/ +terraform/terraform.tfstate* +terraform/*.tfvars.secret diff --git a/codebundles/k8s-seaweedfs-healthcheck/.test/README.md b/codebundles/k8s-seaweedfs-healthcheck/.test/README.md new file mode 100644 index 00000000..f13f236d --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.test/README.md @@ -0,0 +1,65 @@ +# SeaweedFS Healthcheck — `.test` discovery + +Two modes: + +| Task | Use when | +|---|---| +| `task` / `task discover-live` | SeaweedFS already exists on a shared cluster (e.g. `runwhen-env-test`) | +| `task discover-ci` | Spin up isolated SeaweedFS via Terraform (Kind / CI) | +| `task validate-generation-rules` | Schema-check generation rules only | + +## Live cluster discovery (recommended here) + +RunWhen Local reads your kubeconfig, scans **one namespace** for SeaweedFS master StatefulSets, and writes SLX/SLI/runbook YAML under `output/`.
+ +```bash +cd codecollection/codebundles/k8s-seaweedfs-healthcheck/.test + +export RW_FROM_FILE='{"kubeconfig":"/home/runwhen/auth/shared-kubeconfig"}' +export TEST_NAMESPACE='runwhen-env-test' # namespace with SeaweedFS master STS +export RW_WORKSPACE='seaweedfs-dev' # output folder name + +task discover-live +# or simply: task +``` + +Review generated SLXs: + +```bash +ls output/workspaces/seaweedfs-dev/slxs/ +``` + +### Environment variables + +| Variable | Default | Purpose | +|---|---|---| +| `RW_FROM_FILE` | — | JSON with `kubeconfig` path (same as `ro` dev mode) | +| `KUBECONFIG` | — | Alternative kubeconfig path | +| `KUBECONFIG_PATH` | `/home/runwhen/auth/shared-kubeconfig` | Fallback path | +| `TEST_NAMESPACE` | `runwhen-env-test` | Namespace scoped to `detailed` LOD | +| `RW_WORKSPACE` | `seaweedfs-dev` | Workspace name in output tree | + +You can also drop a file named `kubeconfig.secret` in this directory and skip `prepare-kubeconfig`. + +### Important: discovery uses the remote git branch + +RunWhen Local **clones your codecollection repo from GitHub**, not the local working tree. Generation-rule or template changes only appear in discovery after you **commit and push** the branch referenced in `workspaceInfo.yaml`. + +For quick script/robot iteration without discovery, use `ro` in the bundle root instead. 
+ +## Terraform test infra (CI / isolated cluster) + +```bash +cd .test +# edit terraform/terraform.tfvars for your kube context +task build-infra +task discover-ci +task clean # destroys Terraform release + discovery output +``` + +## Cleanup + +```bash +task clean-rwl-discovery # output/, workspaceInfo.yaml, kubeconfig.secret only +task clean # also runs terraform destroy when state exists +``` diff --git a/codebundles/k8s-seaweedfs-healthcheck/.test/Taskfile.yaml b/codebundles/k8s-seaweedfs-healthcheck/.test/Taskfile.yaml new file mode 100644 index 00000000..2fcefa56 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.test/Taskfile.yaml @@ -0,0 +1,250 @@ +version: "3" + +tasks: + default: + desc: "Discover SLXs against a live cluster (no Terraform)" + cmds: + - task: discover-live + + discover-live: + desc: "Generate workspaceInfo and run RunWhen Local discovery on an existing cluster" + cmds: + - task: prepare-kubeconfig + - task: generate-rwl-config + - task: run-rwl-discovery + + discover-ci: + desc: "Terraform test infra + discovery (CI / Kind)" + cmds: + - task: check-unpushed-commits + - task: build-infra + - task: prepare-kubeconfig + - task: generate-rwl-config-from-terraform + - task: run-rwl-discovery + + clean: + desc: "Destroy Terraform resources (if any) and local discovery output" + cmds: + - task: check-and-cleanup-terraform + - task: clean-rwl-discovery + + build-infra: + desc: "Deploy SeaweedFS Helm release via Terraform" + cmds: + - task: build-terraform-infra + + build-terraform-infra: + desc: "Run terraform apply" + dir: terraform + cmds: + - terraform init + - terraform apply -auto-approve + silent: true + + check-and-cleanup-terraform: + desc: "Destroy Terraform resources when state exists" + dir: terraform + cmds: + - | + if [ -f "terraform.tfstate" ]; then + terraform destroy -auto-approve + fi + silent: true + + prepare-kubeconfig: + desc: "Copy kubeconfig into .test/kubeconfig.secret for RunWhen Local" + cmds: + - | + set -euo 
pipefail + src="" + if [ -n "${RW_FROM_FILE:-}" ]; then + src=$(echo "$RW_FROM_FILE" | jq -r '.kubeconfig // empty') + elif [ -n "${KUBECONFIG:-}" ]; then + src="$KUBECONFIG" + elif [ -f "${KUBECONFIG_PATH:-/home/runwhen/auth/shared-kubeconfig}" ]; then + src="${KUBECONFIG_PATH:-/home/runwhen/auth/shared-kubeconfig}" + fi + if [ -n "$src" ] && [ -f "$src" ]; then + cp "$src" kubeconfig.secret + echo "Prepared kubeconfig.secret from $src" + elif [ -f "kubeconfig.secret" ]; then + echo "Using existing kubeconfig.secret" + else + echo "No kubeconfig found." + echo "Set RW_FROM_FILE='{\"kubeconfig\":\"/path/to/kubeconfig\"}', KUBECONFIG, or KUBECONFIG_PATH," + echo "or place a file named kubeconfig.secret in $(pwd)." + exit 1 + fi + chmod 644 kubeconfig.secret + silent: true + + generate-rwl-config: + desc: "Write workspaceInfo.yaml scoped to TEST_NAMESPACE on a live cluster" + env: + RW_WORKSPACE: '{{.RW_WORKSPACE | default "seaweedfs-dev"}}' + TEST_NAMESPACE: '{{.TEST_NAMESPACE | default "runwhen-env-test"}}' + cmds: + - | + set -euo pipefail + repo_url=$(git -C .. config --get remote.origin.url) + branch_name=$(git -C .. 
rev-parse --abbrev-ref HEAD) + codebundle=$(basename "$(dirname "$PWD")") + + cat <<EOF > workspaceInfo.yaml + workspaceName: "$RW_WORKSPACE" + workspaceOwnerEmail: authors@runwhen.com + defaultLocation: location-01 + defaultLOD: none + cloudConfig: + kubernetes: + kubeconfigFile: /shared/kubeconfig.secret + namespaceLODs: + $TEST_NAMESPACE: detailed + namespaces: + - $TEST_NAMESPACE + codeCollections: + - repoURL: "$repo_url" + branch: "$branch_name" + codeBundles: ["$codebundle"] + custom: + kubeconfig_secret_name: "kubeconfig" + kubernetes_distribution_binary: kubectl + EOF + echo "Generated workspaceInfo.yaml (namespace=$TEST_NAMESPACE, codebundle=$codebundle, branch=$branch_name)" + silent: true + + generate-rwl-config-from-terraform: + desc: "Write workspaceInfo.yaml using namespace from Terraform outputs" + env: + RW_WORKSPACE: '{{.RW_WORKSPACE | default "seaweedfs-dev"}}' + cmds: + - | + set -euo pipefail + repo_url=$(git -C .. config --get remote.origin.url) + branch_name=$(git -C .. rev-parse --abbrev-ref HEAD) + codebundle=$(basename "$(dirname "$PWD")") + namespace=$(terraform -chdir=terraform output -raw namespace) + + cat <<EOF > workspaceInfo.yaml + workspaceName: "$RW_WORKSPACE" + workspaceOwnerEmail: authors@runwhen.com + defaultLocation: location-01 + defaultLOD: none + cloudConfig: + kubernetes: + kubeconfigFile: /shared/kubeconfig.secret + namespaceLODs: + $namespace: detailed + namespaces: + - $namespace + codeCollections: + - repoURL: "$repo_url" + branch: "$branch_name" + codeBundles: ["$codebundle"] + custom: + kubeconfig_secret_name: "kubeconfig" + kubernetes_distribution_binary: kubectl + EOF + echo "Generated workspaceInfo.yaml (namespace=$namespace from Terraform)" + silent: true + + run-rwl-discovery: + desc: "Start RunWhen Local and run workspace builder discovery" + cmds: + - | + set -euo pipefail + CONTAINER_NAME="RunWhenLocal" + + if [ ! -f "workspaceInfo.yaml" ]; then + echo "Missing workspaceInfo.yaml — run generate-rwl-config first."
+ exit 1 + fi + if [ ! -f "kubeconfig.secret" ]; then + echo "Missing kubeconfig.secret — run prepare-kubeconfig first." + exit 1 + fi + + if docker ps -q --filter "name=$CONTAINER_NAME" | grep -q .; then + echo "Stopping and removing existing container $CONTAINER_NAME..." + docker stop $CONTAINER_NAME && docker rm $CONTAINER_NAME + elif docker ps -a -q --filter "name=$CONTAINER_NAME" | grep -q .; then + echo "Removing existing stopped container $CONTAINER_NAME..." + docker rm $CONTAINER_NAME + fi + + sudo rm -rf output || rm -rf output + mkdir -p output .azure-devops .kube && chmod 777 output .azure-devops .kube + + docker run --name $CONTAINER_NAME -p 8081:8081 \ + -v "$(pwd)":/shared \ + -d ghcr.io/runwhen-contrib/runwhen-local:latest + + echo "Waiting for workspace builder to start..." + for i in $(seq 1 30); do + if docker exec $CONTAINER_NAME curl -sf http://127.0.0.1:8081/health >/dev/null 2>&1; then + break + fi + sleep 2 + done + + docker exec -w /workspace-builder $CONTAINER_NAME ./run.sh {{.CLI_ARGS}} --verbose + + echo "" + echo "Discovery output: output/workspaces/${RW_WORKSPACE:-seaweedfs-dev}/slxs/" + ls -1 "output/workspaces/${RW_WORKSPACE:-seaweedfs-dev}/slxs/" 2>/dev/null || true + silent: true + + check-unpushed-commits: + desc: "Fail if bundle changes are not committed and pushed (required for CI discovery)" + vars: + BASE_DIR: "../" + cmds: + - | + UNCOMMITTED_FILES=$(git -C .. diff --name-only --relative HEAD | grep -E "^(\.runwhen|[^/]+)" | grep -v "^\.test/" || true) + if [ -n "$UNCOMMITTED_FILES" ]; then + echo "Uncommitted changes in the codebundle:" + echo "$UNCOMMITTED_FILES" + exit 1 + fi + - | + git -C .. fetch origin + branch=$(git -C .. rev-parse --abbrev-ref HEAD) + UNPUSHED_FILES=$(git -C ..
diff --name-only --relative "origin/${branch}" HEAD | grep -E "^(\.runwhen|[^/]+)" | grep -v "^\.test/" || true) + if [ -n "$UNPUSHED_FILES" ]; then + echo "Unpushed commits in the codebundle:" + echo "$UNPUSHED_FILES" + echo "Push your branch before discovery — RunWhen Local clones from the remote." + exit 1 + fi + silent: true + + validate-generation-rules: + desc: "Validate YAML files in .runwhen/generation-rules" + cmds: + - | + for cmd in curl yq; do + command -v $cmd >/dev/null || { echo "Error: $cmd is required."; exit 1; } + done + temp_dir=$(mktemp -d) + curl -fsS -o "$temp_dir/generation-rule-schema.json" \ + https://raw.githubusercontent.com/runwhen-contrib/runwhen-local/refs/heads/main/src/generation-rule-schema.json || { echo "Error: failed to download generation rule schema."; rm -rf "$temp_dir"; exit 1; } + for yaml_file in ../.runwhen/generation-rules/*.yaml; do + echo "Validating $yaml_file" + json_file="$temp_dir/$(basename "${yaml_file%.*}.json")" + yq -o=json "$yaml_file" > "$json_file" + if command -v ajv >/dev/null; then + ajv validate -s "$temp_dir/generation-rule-schema.json" -d "$json_file" \ + --spec=draft2020 --strict=false \ + && echo "$yaml_file is valid." || echo "$yaml_file is invalid."
+ else + echo "ajv not installed; skipped schema validation for $yaml_file" + fi + done + rm -rf "$temp_dir" + silent: true + + clean-rwl-discovery: + desc: "Remove local discovery artifacts" + cmds: + - rm -rf output workspaceInfo.yaml kubeconfig.secret || true + silent: true diff --git a/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/backend.tf b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/backend.tf new file mode 100644 index 00000000..3c533e6b --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/backend.tf @@ -0,0 +1,5 @@ +terraform { + backend "local" { + path = "terraform.tfstate" + } +} diff --git a/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/main.tf b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/main.tf new file mode 100644 index 00000000..32db6623 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/main.tf @@ -0,0 +1,52 @@ +resource "kubernetes_namespace" "seaweedfs" { + metadata { + name = var.namespace + labels = { + "app.kubernetes.io/name" = "seaweedfs-test" + } + } +} + +resource "helm_release" "seaweedfs" { + name = var.release_name + namespace = kubernetes_namespace.seaweedfs.metadata[0].name + repository = "https://seaweedfs.github.io/seaweedfs/helm" + chart = "seaweedfs" + version = var.chart_version + wait = true + timeout = 600 + + values = [ + yamlencode({ + master = { + replicas = 1 + data = { + type = "emptyDir" + } + logs = { + type = "emptyDir" + } + } + volume = { + replicas = 1 + data = { + type = "emptyDir" + } + } + filer = { + replicas = 1 + s3 = { + enabled = true + } + data = { + type = "emptyDir" + } + } + global = { + seaweedfs = { + enableSecurity = false + } + } + }) + ] +} diff --git a/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/outputs.tf b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/outputs.tf new file mode 100644 index 00000000..a3f68cf7 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/outputs.tf @@ -0,0 
+1,11 @@ +output "namespace" { + value = kubernetes_namespace.seaweedfs.metadata[0].name +} + +output "release_name" { + value = helm_release.seaweedfs.name +} + +output "context" { + value = var.kube_context +} diff --git a/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/providers.tf b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/providers.tf new file mode 100644 index 00000000..018fa05d --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/providers.tf @@ -0,0 +1,25 @@ +terraform { + required_version = ">= 1.5.0" + required_providers { + helm = { + source = "hashicorp/helm" + version = "~> 2.13" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.27" + } + } +} + +provider "kubernetes" { + config_path = var.kubeconfig_path + config_context = var.kube_context +} + +provider "helm" { + kubernetes { + config_path = var.kubeconfig_path + config_context = var.kube_context + } +} diff --git a/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/terraform.tfvars b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/terraform.tfvars new file mode 100644 index 00000000..f5448007 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/terraform.tfvars @@ -0,0 +1,3 @@ +kube_context = "kind-seaweedfs-health-test" +namespace = "test-seaweedfs-health" +release_name = "seaweedfs" diff --git a/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/variables.tf b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/variables.tf new file mode 100644 index 00000000..d5a01a56 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/.test/terraform/variables.tf @@ -0,0 +1,29 @@ +variable "kubeconfig_path" { + description = "Path to kubeconfig used for test cluster access" + type = string + default = "~/.kube/config" +} + +variable "kube_context" { + description = "Kubernetes context for the test cluster" + type = string + default = "kind-seaweedfs-health-test" +} + +variable "namespace" { + description = 
"Namespace for SeaweedFS test deployment" + type = string + default = "test-seaweedfs-health" +} + +variable "release_name" { + description = "Helm release name for SeaweedFS" + type = string + default = "seaweedfs" +} + +variable "chart_version" { + description = "SeaweedFS Helm chart version" + type = string + default = "4.0.386" +} diff --git a/codebundles/k8s-seaweedfs-healthcheck/README.md b/codebundles/k8s-seaweedfs-healthcheck/README.md new file mode 100644 index 00000000..7202dac2 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/README.md @@ -0,0 +1,114 @@ +# Kubernetes SeaweedFS Storage Health Check + +This CodeBundle validates SeaweedFS storage health in a Kubernetes namespace deployed via the official Helm chart or compatible operator installs. It inspects master Raft leadership, volume slot availability, disk capacity, filer connectivity, and optional S3 gateway operations so operators can detect misconfiguration before workloads fail. + +## Overview + +- **Resource discovery**: Maps SeaweedFS master, volume, filer, services, and PVCs; flags missing components or zero-ready workloads. +- **Workload health**: Checks StatefulSet/Deployment replica alignment, CrashLoopBackOff pods, pending scheduling, and recent Warning events. +- **Master cluster**: Queries `/cluster/status` and `/cluster/healthz` via in-cluster master HTTP APIs. +- **Volume slots**: Parses `/dir/status` topology for free slot exhaustion at cluster and nested topology nodes. +- **Disk capacity**: Reads volume server `/status` for free disk percentage and read-only volumes. +- **Writable layouts**: Detects zero-writable replication layouts and read-only volume IDs. +- **Connectivity**: Validates filer health endpoints and volume server registration in master topology. +- **S3 gateway**: When enabled, runs ListBuckets plus put/get/delete of a temporary probe object (read-write task). 
+- **Volume configuration audit**: Validates Helm-rendered commands, env, mounts, replication vs volume replica count, and peer wiring. +- **GC / compaction signals**: Reads Prometheus metrics for pick-for-write errors, crowded layouts, disk write failures, and delete-blocking read-only volumes. +- **Capacity projection**: Flags high slot/disk utilization and estimates time-to-full when a prior snapshot exists in `CODEBUNDLE_TEMP_DIR`. +- **Known version issues**: Matches `helm.sh/chart` version against a curated issue catalog. + +## Configuration + +### Required Variables + +- `CONTEXT`: Kubernetes context for the target cluster. +- `NAMESPACE`: Namespace where SeaweedFS is deployed. + +### Optional Variables + +- `KUBERNETES_DISTRIBUTION_BINARY`: Kubernetes CLI binary (`kubectl` or `oc`; default: `kubectl`). +- `SEAWEEDFS_RELEASE_NAME`: Helm release name override for label-based discovery; leave empty for auto-discovery (default: empty). +- `SEAWEEDFS_MASTER_SERVICE`: Override master service DNS `host:port` (default: empty, auto-discovered via master pod exec). +- `SEAWEEDFS_FILER_SERVICE`: Override filer service DNS `host:port` (default: empty). +- `SEAWEEDFS_S3_ENDPOINT`: Override S3 endpoint URL (default: empty). +- `MIN_FREE_VOLUME_SLOTS`: Minimum free volume slots before raising an issue (default: `1`). +- `MIN_FREE_DISK_PERCENT`: Minimum free disk percentage on volume servers (default: `10`). +- `S3_PROBE_BUCKET`: Existing bucket for the S3 probe; a temporary object key is used (default: empty, auto-create when permitted). +- `CAPACITY_WARN_PERCENT`: Slot or disk utilization percent that triggers capacity projection warnings (default: `80`). +- `MIN_PROJECTION_HOURS`: Hours-until-full estimate that triggers slot exhaustion issues (default: `24`). +- `MAX_PICK_FOR_WRITE_ERRORS`: Master pick-for-write error counter threshold (default: `100`). +- `MAX_VOLUME_DISK_ERRORS`: Volume server disk write error counter threshold (default: `50`). 
+- `SEAWEEDFS_CHART`: Exact `helm.sh/chart` label (e.g. `seaweedfs-4.25.0`); auto-discovered when empty. + +### Secrets + +- `kubeconfig`: Kubernetes kubeconfig YAML with read access to the namespace. +- `seaweedfs_s3_credentials` (optional): JSON with `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` (or SeaweedFS IAM equivalents). Omit when S3 allows anonymous access or S3 is disabled. + +## Tasks Overview + +### List SeaweedFS Resources in Namespace + +Builds a component map of SeaweedFS workloads, services, and PVCs. Raises issues when expected master/volume/filer components are missing or have zero ready replicas. + +### Check SeaweedFS Workload Replica Health in Namespace + +Verifies replica counts, CrashLoopBackOff pods, pending scheduling, and Warning events tied to SeaweedFS workloads. + +### Check SeaweedFS Master Cluster Status in Namespace + +Queries master HTTP APIs for health and Raft leader election status. + +### Check SeaweedFS Volume Slot Availability in Namespace + +Evaluates `/dir/status` free volume slots against `MIN_FREE_VOLUME_SLOTS`. + +### Check SeaweedFS Volume Server Disk Capacity in Namespace + +Inspects volume `/status` disk usage and read-only volume counts against `MIN_FREE_DISK_PERCENT`. + +### Check SeaweedFS Writable Volume Layout in Namespace + +Detects layouts with zero writables or read-only volume IDs in topology. + +### Check SeaweedFS Filer and Component Connectivity in Namespace + +Confirms filer `/healthz` or `/status` and that volume servers appear in master topology. + +### Verify SeaweedFS S3 Gateway Operations in Namespace + +Performs a minimal S3 probe when the filer S3 port is enabled; skips gracefully when S3 is disabled. + +### Check SeaweedFS Volume Configuration in Namespace + +Audits master/volume/filer workload commands, env vars, volume mounts, `defaultReplication`, and peer/replica alignment. 
+ +### Check SeaweedFS Garbage Collection and Compaction Signals in Namespace + +Inspects master and volume `:9327/metrics` for pick-for-write errors, crowded layouts, heartbeat errors, and read-only volumes that block deletes. + +### Check SeaweedFS Capacity Projection in Namespace + +Reports slot and disk utilization headroom; compares against a prior snapshot in `CODEBUNDLE_TEMP_DIR` to estimate hours until slot exhaustion. + +### Check SeaweedFS Known Version Issues in Namespace + +Matches the installed chart version against `seaweedfs-known-issues.json` for upgrade cautions and version-specific behavior notes. + +## Local testing + +The `.test/` directory includes Terraform to deploy the [official SeaweedFS Helm chart](https://github.com/seaweedfs/seaweedfs/tree/master/k8s/charts/seaweedfs) into a dedicated namespace on an existing cluster (Kind/minikube). Prerequisites: `terraform`, `helm`, `kubectl`, and cluster admin access. + +```bash +cd .test +task build-infra # terraform apply (Helm release) +task clean # terraform destroy +``` + +Use `task validate-generation-rules` to validate generation rule YAML against the RunWhen Local schema. + +## Related bundles + +- `k8s-pvc-healthcheck`: generic PVC binding and utilization (complements this bundle). +- `k8s-statefulset-healthcheck`: generic StatefulSet replica/probe checks. +- `k8s-loki-healthcheck`: similar in-cluster HTTP status API pattern. diff --git a/codebundles/k8s-seaweedfs-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-seaweedfs-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 00000000..c1728eb8 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,353 @@ +--- +name: k8s-seaweedfs-healthcheck +kind: skill-template +description: Validates SeaweedFS storage health in a Kubernetes namespace—master leadership, volume slots, disk capacity, filer connectivity, and optional S3 probes. Use when triaging or monitoring SeaweedFS Helm installs on Kubernetes. 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes] +resource_types: [statefulset] +access: read-write +--- + +# Kubernetes SeaweedFS Storage Health Check + +## Summary + +This CodeBundle validates SeaweedFS storage health in a Kubernetes namespace deployed via the official Helm chart or compatible operator installs. It inspects master Raft leadership, volume slot availability, disk capacity, filer connectivity, optional S3 gateway operations, configuration audits, GC/compaction signals, capacity projection, and known chart-version issues. + +See [README.md](README.md) for additional context. + +## Tools + +### List SeaweedFS Resources in Namespace `${NAMESPACE}` + +Discovers SeaweedFS master, volume, filer, and S3 gateway workloads, services, and PVCs and surfaces missing components. + +- **Robot task name**: List SeaweedFS Resources in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `list-seaweedfs-resources.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `discovery`, `access:read-only`, `data:logs-config` +- **Reads**: `CONTEXT`, `NAMESPACE`, `SEAWEEDFS_RELEASE_NAME` +- **Writes**: `list_seaweedfs_resources_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check SeaweedFS Workload Replica Health in Namespace `${NAMESPACE}` + +Verifies StatefulSets and Deployments for SeaweedFS components have desired replicas ready and flags CrashLoopBackOff or pending pods. 
+ +- **Robot task name**: Check SeaweedFS Workload Replica Health in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-workload-health.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `workload`, `access:read-only`, `data:logs-config` +- **Reads**: `CONTEXT`, `NAMESPACE`, `SEAWEEDFS_RELEASE_NAME` +- **Writes**: `workload_health_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check SeaweedFS Master Cluster Status in Namespace `${NAMESPACE}` + +Queries master /cluster/status and /cluster/healthz to validate Raft leadership and master health endpoints. + +- **Robot task name**: Check SeaweedFS Master Cluster Status in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-master-cluster-status.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `master`, `access:read-only`, `data:metrics` +- **Reads**: `CONTEXT`, `NAMESPACE`, `SEAWEEDFS_MASTER_SERVICE` +- **Writes**: `master_cluster_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check SeaweedFS Volume Slot Availability in Namespace `${NAMESPACE}` + +Parses /dir/status topology to ensure free volume slots exist before workloads fail on allocation. + +- **Robot task name**: Check SeaweedFS Volume Slot Availability in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-volume-slots.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `volumes`, `access:read-only`, `data:metrics` +- **Reads**: `CONTEXT`, `MIN_FREE_VOLUME_SLOTS`, `NAMESPACE` +- **Writes**: `volume_slots_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check SeaweedFS Volume Server Disk Capacity in Namespace `${NAMESPACE}` + +Inspects volume server /status and topology for disk usage, read-only volumes, and min-free-space threshold breaches. 
+ +- **Robot task name**: Check SeaweedFS Volume Server Disk Capacity in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-volume-capacity.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `capacity`, `access:read-only`, `data:metrics` +- **Reads**: `CONTEXT`, `MIN_FREE_DISK_PERCENT`, `NAMESPACE` +- **Writes**: `volume_capacity_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check SeaweedFS Writable Volume Layout in Namespace `${NAMESPACE}` + +Evaluates /dir/status layouts for writable volume IDs and flags zero-writable or read-only placement problems. + +- **Robot task name**: Check SeaweedFS Writable Volume Layout in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-writable-layouts.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `layout`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT`, `NAMESPACE` +- **Writes**: `writable_layouts_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check SeaweedFS Filer and Component Connectivity in Namespace `${NAMESPACE}` + +Confirms filer health endpoints respond and volume servers appear registered in master topology. + +- **Robot task name**: Check SeaweedFS Filer and Component Connectivity in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-component-connectivity.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `connectivity`, `access:read-only`, `data:logs-config` +- **Reads**: `CONTEXT`, `NAMESPACE`, `SEAWEEDFS_FILER_SERVICE` +- **Writes**: `component_connectivity_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify SeaweedFS S3 Gateway Operations in Namespace `${NAMESPACE}` + +Performs ListBuckets and put/get/delete of a temporary test object against the filer S3 endpoint when enabled. 
+ +- **Robot task name**: Verify SeaweedFS S3 Gateway Operations in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `verify-s3-gateway.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `S3`, `access:read-write`, `data:metrics` +- **Reads**: `CONTEXT`, `NAMESPACE`, `S3_PROBE_BUCKET`, `SEAWEEDFS_S3_ENDPOINT` +- **Writes**: `s3_gateway_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check SeaweedFS Volume Configuration in Namespace `${NAMESPACE}` + +Audits Helm-rendered workload commands, env, mounts, replication, and volume limits for misconfiguration. + +- **Robot task name**: Check SeaweedFS Volume Configuration in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-volume-config.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `config`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT`, `NAMESPACE`, `SEAWEEDFS_CHART`, `SEAWEEDFS_RELEASE_NAME` +- **Writes**: `volume_config_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check SeaweedFS Garbage Collection and Compaction Signals in Namespace `${NAMESPACE}` + +Reads master and volume Prometheus metrics for pick-for-write errors, crowded layouts, disk write failures, and delete-blocking read-only volumes. 
+ +- **Robot task name**: Check SeaweedFS Garbage Collection and Compaction Signals in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-gc-compaction.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `gc`, `access:read-only`, `data:metrics` +- **Reads**: `CONTEXT`, `MAX_PICK_FOR_WRITE_ERRORS`, `MAX_VOLUME_DISK_ERRORS`, `NAMESPACE`, `SEAWEEDFS_CHART`, `SEAWEEDFS_RELEASE_NAME` +- **Writes**: `gc_compaction_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check SeaweedFS Capacity Projection in Namespace `${NAMESPACE}` + +Evaluates slot and disk utilization headroom and estimates time-to-full when a prior capacity snapshot exists. + +- **Robot task name**: Check SeaweedFS Capacity Projection in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-capacity-projection.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `capacity`, `access:read-only`, `data:metrics` +- **Reads**: `CAPACITY_WARN_PERCENT`, `CONTEXT`, `MIN_PROJECTION_HOURS`, `NAMESPACE`, `SEAWEEDFS_CHART`, `SEAWEEDFS_RELEASE_NAME` +- **Writes**: `capacity_projection_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check SeaweedFS Known Version Issues in Namespace `${NAMESPACE}` + +Matches the installed helm.sh/chart version against a curated catalog of SeaweedFS known issues and upgrade cautions. 
+ +- **Robot task name**: Check SeaweedFS Known Version Issues in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-known-issues.sh` +- **Tags**: `Kubernetes`, `SeaweedFS`, `version`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT`, `NAMESPACE`, `SEAWEEDFS_CHART`, `SEAWEEDFS_RELEASE_NAME` +- **Writes**: `known_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures SeaweedFS storage health using workload readiness, master leadership, volume slot availability, and filer connectivity. Produces a value between 0 (failing) and 1 (healthy). + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `300s` + +### Sub-checks + +The monitor task runs `sli-seaweedfs-dimensions.sh` once and emits four binary sub-metrics (`0` or `1`). The aggregate score is their arithmetic mean. + +#### Workload readiness + +SeaweedFS StatefulSets and Deployments with `replicas > 0` must have `ready == replicas`. + +- **Robot task name**: Score SeaweedFS Health Dimensions in Namespace `${NAMESPACE}` +- **Sub-metric name**: `workload` +- **Underlying script**: `sli-seaweedfs-dimensions.sh` +- **Tags**: `access:read-only`, `data:metrics` +- **Reads**: `CONTEXT`, `NAMESPACE`, `SEAWEEDFS_RELEASE_NAME`, `SEAWEEDFS_CHART` +- **Pass condition**: no SeaweedFS workload has ready replicas below desired count + + +#### Master health + +Master `/cluster/healthz` must respond with an ok/healthy/success body. 
+ +- **Robot task name**: Score SeaweedFS Health Dimensions in Namespace `${NAMESPACE}` +- **Sub-metric name**: `master` +- **Underlying script**: `sli-seaweedfs-dimensions.sh` +- **Tags**: `access:read-only`, `data:metrics` +- **Reads**: `CONTEXT`, `NAMESPACE`, `SEAWEEDFS_RELEASE_NAME`, `SEAWEEDFS_CHART` +- **Pass condition**: master health endpoint reachable and healthy + + +#### Volume slot availability + +Master `/dir/status` topology free slots must meet `MIN_FREE_VOLUME_SLOTS`. + +- **Robot task name**: Score SeaweedFS Health Dimensions in Namespace `${NAMESPACE}` +- **Sub-metric name**: `volume_slots` +- **Underlying script**: `sli-seaweedfs-dimensions.sh` +- **Tags**: `access:read-only`, `data:metrics` +- **Reads**: `CONTEXT`, `MIN_FREE_VOLUME_SLOTS`, `NAMESPACE`, `SEAWEEDFS_RELEASE_NAME`, `SEAWEEDFS_CHART` +- **Pass condition**: topology free slots ≥ `MIN_FREE_VOLUME_SLOTS` (default `1`) + + +#### Filer connectivity + +A filer pod must exist and respond on `/healthz` or `/status`. + +- **Robot task name**: Score SeaweedFS Health Dimensions in Namespace `${NAMESPACE}` +- **Sub-metric name**: `connectivity` +- **Underlying script**: `sli-seaweedfs-dimensions.sh` +- **Tags**: `access:read-only`, `data:metrics` +- **Reads**: `CONTEXT`, `NAMESPACE`, `SEAWEEDFS_RELEASE_NAME`, `SEAWEEDFS_CHART` +- **Pass condition**: filer pod present and health endpoint responds + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Kubernetes CLI binary (kubectl or oc). | `kubectl` | no | +| `CONTEXT` | string | Kubernetes context for the target cluster. | — | yes | +| `NAMESPACE` | string | Namespace where SeaweedFS is deployed. | — | yes | +| `SEAWEEDFS_RELEASE_NAME` | string | Helm release instance label (parent release for subchart installs). | `` | no | +| `SEAWEEDFS_CHART` | string | Exact helm.sh/chart label for the SeaweedFS subchart (e.g. seaweedfs-4.25.0). 
| `` | no | +| `SEAWEEDFS_MASTER_SERVICE` | string | Override master service host:port when auto-discovery is insufficient. | `` | no | +| `SEAWEEDFS_FILER_SERVICE` | string | Override filer service host:port when auto-discovery is insufficient. | `` | no | +| `SEAWEEDFS_S3_ENDPOINT` | string | Override S3 endpoint URL for gateway probe. | `` | no | +| `MIN_FREE_VOLUME_SLOTS` | string | Minimum free volume slots required before raising an issue. | `1` | no | +| `MIN_FREE_DISK_PERCENT` | string | Minimum free disk percentage required on volume servers. | `10` | no | +| `S3_PROBE_BUCKET` | string | Existing bucket for S3 probe; temporary object prefix is used. | `` | no | +| `CAPACITY_WARN_PERCENT` | string | Slot or disk utilization percent that triggers capacity projection warnings. | `80` | no | +| `MIN_PROJECTION_HOURS` | string | Hours-until-full estimate that triggers slot exhaustion projection issues. | `24` | no | +| `MAX_PICK_FOR_WRITE_ERRORS` | string | Master pick-for-write error counter threshold for GC/compaction checks. | `100` | no | +| `MAX_VOLUME_DISK_ERRORS` | string | Volume server disk write error counter threshold for GC/compaction checks. | `50` | no | + +## Secrets + +| Name | Type | Description | Required | +|---|---|---|---| +| `kubeconfig` | string | Kubernetes kubeconfig YAML with read access to the namespace. | yes | +| `seaweedfs_s3_credentials` | string | Optional JSON with `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` for the S3 gateway probe. 
| no | + +## Outputs + +- Monitor sub-metrics: `workload`, `master`, `volume_slots`, `connectivity` (each `0` or `1`) +- `list_seaweedfs_resources_issues.json` +- `workload_health_issues.json` +- `master_cluster_issues.json` +- `volume_slots_issues.json` +- `volume_capacity_issues.json` +- `writable_layouts_issues.json` +- `component_connectivity_issues.json` +- `s3_gateway_issues.json` +- `volume_config_issues.json` +- `gc_compaction_issues.json` +- `capacity_projection_issues.json` +- `known_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-seaweedfs-healthcheck/runbook.robot` +- **Monitor**: `codebundles/k8s-seaweedfs-healthcheck/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-seaweedfs-healthcheck +export RW_MODE=dev +export RW_FROM_FILE='{"kubeconfig":"/path/to/kubeconfig"}' +export CONTEXT=... +export NAMESPACE=... +export SEAWEEDFS_RELEASE_NAME=... # optional; chart-aware auto-discovery when empty +export SEAWEEDFS_CHART=... # optional; e.g. seaweedfs-4.25.0 +ro runbook.robot +ro sli.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-seaweedfs-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export SEAWEEDFS_RELEASE_NAME=... 
+bash check-capacity-projection.sh +bash check-component-connectivity.sh +bash check-gc-compaction.sh +bash check-known-issues.sh +bash check-master-cluster-status.sh +bash check-volume-capacity.sh +bash check-volume-config.sh +bash check-volume-slots.sh +bash check-workload-health.sh +bash check-writable-layouts.sh +bash list-seaweedfs-resources.sh +# seaweedfs-lib.sh is a library sourced by the scripts above; do not run it directly +# also available: sli-seaweedfs-dimensions.sh, verify-s3-gateway.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring; runs `sli-seaweedfs-dimensions.sh` and aggregates its sub-metrics +- `check-capacity-projection.sh` — evaluates slot and disk utilization headroom and estimates time-to-full from a prior snapshot. +- `check-component-connectivity.sh` — confirms filer health endpoints respond and volume servers are registered in master topology. +- `check-gc-compaction.sh` — reads master and volume Prometheus metrics for pick-for-write, crowded-layout, and disk write error signals. +- `check-known-issues.sh` — matches the installed chart version against the curated known-issues catalog. +- `check-master-cluster-status.sh` — queries master `/cluster/status` and `/cluster/healthz` for Raft leadership and health. +- `check-volume-capacity.sh` — inspects volume server `/status` for disk usage and read-only volumes. +- `check-volume-config.sh` — audits workload commands, env, mounts, replication, and volume limits. +- `check-volume-slots.sh` — parses `/dir/status` topology for free volume slots. +- `check-workload-health.sh` — verifies SeaweedFS StatefulSets and Deployments have desired replicas ready. +- `check-writable-layouts.sh` — evaluates `/dir/status` layouts for writable volume IDs. +- `list-seaweedfs-resources.sh` — discovers SeaweedFS workloads, services, and PVCs and surfaces missing components. +- `seaweedfs-lib.sh` — shared Bash library sourced by every script; not run on its own. 
+- `sli-seaweedfs-dimensions.sh` — emits the four binary monitor sub-metrics (`workload`, `master`, `volume_slots`, `connectivity`). +- `verify-s3-gateway.sh` — performs ListBuckets and put/get/delete of a temporary object against the S3 endpoint. 
diff --git a/codebundles/k8s-seaweedfs-healthcheck/check-capacity-projection.sh b/codebundles/k8s-seaweedfs-healthcheck/check-capacity-projection.sh new file mode 100755 index 00000000..17a1eeaf --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/check-capacity-projection.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +set -euo pipefail +# Projects capacity headroom from topology/metrics and optional snapshot delta. +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="capacity_projection_issues.json" +PROJECTION_SNAPSHOT_FILE="${PROJECTION_SNAPSHOT_FILE:-seaweedfs_capacity_projection_snapshot.json}" +CAPACITY_WARN_PERCENT="${CAPACITY_WARN_PERCENT:-80}" +MIN_PROJECTION_HOURS="${MIN_PROJECTION_HOURS:-24}" +# shellcheck disable=SC1091 +source seaweedfs-lib.sh + +print_report() { + echo "=== SeaweedFS capacity projection ===" + [[ -f "$PROJECTION_SNAPSHOT_FILE" ]] && jq '.' "$PROJECTION_SNAPSHOT_FILE" 2>/dev/null || true + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true +} +trap print_report EXIT + +now_epoch=$(date +%s) +snapshot=$(jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --argjson epoch "$now_epoch" '{timestamp: $ts, epoch: $epoch, slots: {}, disk: []}') + +if dir_status=$(swf_master_http "/dir/status" 2>/dev/null); then + free=$(echo "$dir_status" | jq -r '.Topology.Free // .topology.free // empty') + max=$(echo "$dir_status" | jq -r '.Topology.Max // .topology.max // empty') + if [[ "$free" =~ ^[0-9]+$ ]] && [[ "$max" =~ ^[0-9]+$ ]] && [[ "$max" -gt 0 ]]; then + used=$((max - free)) + util_pct=$(awk "BEGIN {printf \"%.1f\", ($used / $max) * 100}") + snapshot=$(echo "$snapshot" | jq --argjson free "$free" --argjson max "$max" --arg util "$util_pct" \ + '.slots = {free: $free, max: $max, used: ($max - $free), utilization_percent: ($util | tonumber)}') + + if awk "BEGIN {exit !($util_pct >= $CAPACITY_WARN_PERCENT)}"; then + swf_add_issue \ + "SeaweedFS volume slot utilization at ${util_pct}% 
in \`${NAMESPACE}\`" \ + "Topology Free=${free}, Max=${max}, warn threshold=${CAPACITY_WARN_PERCENT}%" \ + 2 \ + "Plan volume server scale-out before Free slots reach zero." + fi + fi +fi + +master_pod=$(swf_find_pod "master") +if [[ -n "$master_pod" ]] && master_metrics=$(swf_fetch_pod_metrics "$master_pod" "$METRICS_PORT" 2>/dev/null); then + writable_sum=$(swf_metric_sum_matching "$master_metrics" "^SeaweedFS_master_volume_layout_writable") + crowded_sum=$(swf_metric_sum_matching "$master_metrics" "^SeaweedFS_master_volume_layout_crowded") + snapshot=$(echo "$snapshot" | jq \ + --argjson writable "${writable_sum:-0}" \ + --argjson crowded "${crowded_sum:-0}" \ + '.master_metrics = {writable_volumes: $writable, crowded_layouts: $crowded}') + if [[ "${crowded_sum:-0}" =~ ^[0-9]+$ ]] && [[ "${crowded_sum:-0}" -gt 0 ]]; then + swf_add_issue \ + "SeaweedFS crowded layouts may exhaust write headroom soon in \`${NAMESPACE}\`" \ + "Sum of crowded layout gauges=${crowded_sum}" \ + 3 \ + "Add writable volumes or rebalance collections before writes fail." + fi +fi + +while IFS= read -r pod; do + [[ -z "$pod" ]] && continue + if status_json=$(swf_volume_http "$pod" "/status" 2>/dev/null); then + vol_count=$(echo "$status_json" | jq '.Volumes // [] | length') + ro_count=$(echo "$status_json" | jq '[.Volumes[]? | select(.readOnly == true or .ReadOnly == true)] | length') + disk_line=$(echo "$status_json" | jq -r '.DiskUsages[]? 
| "\(.dir // .Dir // "data") \(.percent_free // .PercentFree // 100)"' 2>/dev/null | head -1) + pct_free=$(echo "$disk_line" | awk '{print $2}') + disk_util="0" + if [[ -n "$pct_free" ]] && [[ "$pct_free" =~ ^[0-9.]+$ ]]; then + disk_util=$(awk "BEGIN {printf \"%.1f\", 100 - $pct_free}") + fi + entry=$(jq -n \ + --arg pod "$pod" \ + --argjson volumes "${vol_count:-0}" \ + --argjson read_only "${ro_count:-0}" \ + --arg util "$disk_util" \ + '{pod: $pod, volume_count: $volumes, read_only_volumes: $read_only, disk_utilization_percent: ($util | tonumber)}') + snapshot=$(echo "$snapshot" | jq --argjson e "$entry" '.disk += [$e]') + + if [[ "$disk_util" != "0" ]] && awk "BEGIN {exit !($disk_util >= $CAPACITY_WARN_PERCENT)}"; then + swf_add_issue \ + "Volume server \`${pod}\` disk utilization projected high (${disk_util}%)" \ + "volume_count=${vol_count}, read_only=${ro_count}" \ + 2 \ + "Expand PVC/storage class or add volume nodes before disk fills." + fi + fi +done < <("${KUBECTL}" get pods -n "${NAMESPACE}" --context "${CONTEXT}" \ + -l "$(swf_label_selector volume)" --field-selector=status.phase=Running \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null) + +prior_path=$(swf_capacity_snapshot_path) +if [[ -f "$prior_path" ]]; then + prior_epoch=$(jq -r '.epoch // 0' "$prior_path" 2>/dev/null || echo 0) + prior_used=$(jq -r '.slots.used // empty' "$prior_path" 2>/dev/null || true) + cur_used=$(echo "$snapshot" | jq -r '.slots.used // empty') + elapsed_hours=$(awk "BEGIN {printf \"%.2f\", ($now_epoch - $prior_epoch) / 3600}") + if [[ "$prior_used" =~ ^[0-9]+$ ]] && [[ "$cur_used" =~ ^[0-9]+$ ]] && [[ "$prior_epoch" =~ ^[0-9]+$ ]] \ + && awk "BEGIN {exit !($elapsed_hours >= 1)}"; then + delta=$((cur_used - prior_used)) + if [[ "$delta" -gt 0 ]]; then + max_slots=$(echo "$snapshot" | jq -r '.slots.max // empty') + rate_per_hour=$(awk "BEGIN {printf \"%.2f\", $delta / $elapsed_hours}") + if [[ "$max_slots" =~ ^[0-9]+$ ]] && [[ "$max_slots" -gt 
"$cur_used" ]]; then + remaining=$((max_slots - cur_used)) + hours_left=$(awk "BEGIN {printf \"%.1f\", ($remaining * $elapsed_hours) / $delta}") # divide by the integer delta (> 0 here): the 2-decimal rate_per_hour can round to 0.00 and abort awk (and the script, under set -e) with a division by zero + snapshot=$(echo "$snapshot" | jq \ + --argjson delta "$delta" \ + --arg rate "$rate_per_hour" \ + --arg hours "$hours_left" \ + --arg elapsed "$elapsed_hours" \ + '.projection = {slots_consumed_since_prior: $delta, hours_since_prior: ($elapsed | tonumber), slots_per_hour: ($rate | tonumber), estimated_hours_until_full: ($hours | tonumber)}') + if awk "BEGIN {exit !($hours_left < $MIN_PROJECTION_HOURS)}"; then + swf_add_issue \ + "SeaweedFS volume slots may exhaust within ${hours_left}h at current growth rate" \ + "Consumed ${delta} slots in ${elapsed_hours}h (~${rate_per_hour}/h); ${remaining} slots remain." \ + 2 \ + "Scale volume servers or increase max volumes before slot exhaustion." + fi + fi + fi + fi +fi + +echo "$snapshot" >"$PROJECTION_SNAPSHOT_FILE" +mkdir -p "$(dirname "$prior_path")" 2>/dev/null || true +echo "$snapshot" >"$prior_path" +swf_write_issues "$OUTPUT_FILE" diff --git a/codebundles/k8s-seaweedfs-healthcheck/check-component-connectivity.sh b/codebundles/k8s-seaweedfs-healthcheck/check-component-connectivity.sh new file mode 100755 index 00000000..06f822cf --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/check-component-connectivity.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# Confirms filer health and data node registration in master topology. 
+: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="component_connectivity_issues.json" +# shellcheck disable=SC1091 +source seaweedfs-lib.sh + +print_report() { + { set +x; } 2>/dev/null || true + echo "=== SeaweedFS component connectivity ===" + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true +} +trap print_report EXIT + +filer_pod=$(swf_find_pod "filer") +if [[ -z "$filer_pod" ]]; then + swf_add_issue \ + "No running SeaweedFS filer pod in namespace \`${NAMESPACE}\`" \ + "Filer connectivity checks were skipped." \ + 3 \ + "Verify filer StatefulSet and app.kubernetes.io/component=filer label." +else + filer_ok=false + for path in "/healthz" "/status"; do + if resp=$(swf_filer_http "$path" 2>/dev/null); then + if ! grep -qiE 'unhealthy|not[ _-]?ok' <<<"$resp" && grep -qiE 'ok|healthy|running|success|version' <<<"$resp"; then # reject negated keywords first: the bare alternation also matches "unhealthy" and "not ok"; here-strings avoid a pipefail failure when grep -q exits early + filer_ok=true + break + fi + fi + done + if [[ "$filer_ok" != true ]]; then + swf_add_issue \ + "SeaweedFS filer health endpoint unhealthy in namespace \`${NAMESPACE}\`" \ + "Neither /healthz nor /status returned a healthy response from pod ${filer_pod}." \ + 2 \ + "Check filer logs and master address configuration (WEED_CLUSTER_SW_MASTER)." + fi +fi + +if ! dir_status=$(swf_master_http "/dir/status" 2>/dev/null); then + swf_add_issue \ + "Cannot validate volume server registration: /dir/status failed in \`${NAMESPACE}\`" \ + "Master topology unavailable." \ + 2 \ + "Restore master API connectivity." + swf_write_issues "$OUTPUT_FILE" + exit 0 +fi + +# Count data nodes in topology +data_nodes=$(echo "$dir_status" | jq '[.. 
| objects | select(has("Url") or has("url") or has("PublicUrl") or has("publicUrl")) | (.Url // .url // .PublicUrl // .publicUrl)] | unique | length' 2>/dev/null || echo 0) +volume_pods=$(swf_count_running_pods "volume") + +if [[ "$volume_pods" =~ ^[0-9]+$ ]] && [[ "$volume_pods" -gt 0 ]]; then + if [[ "$data_nodes" =~ ^[0-9]+$ ]] && [[ "$data_nodes" -eq 0 ]]; then + swf_add_issue \ + "SeaweedFS volume pods run but master topology lists zero data nodes in \`${NAMESPACE}\`" \ + "Running volume pods=${volume_pods}, topology nodes=${data_nodes}" \ + 2 \ + "Verify volume servers can reach master on port ${MASTER_PORT}; check weed shell logs." + elif [[ "$data_nodes" =~ ^[0-9]+$ ]] && [[ "$data_nodes" -lt "$volume_pods" ]]; then + swf_add_issue \ + "SeaweedFS topology missing registered volume servers in \`${NAMESPACE}\`" \ + "Running volume pods=${volume_pods}, registered topology nodes=${data_nodes}" \ + 3 \ + "Restart unregistered volume pods and inspect heartbeat errors." + fi +fi + +# Stale / unreachable hints from ec shards or failed nodes (best-effort) +stale=$(echo "$dir_status" | jq '[.. | strings | select(test("stale|unreachable|offline"; "i"))] | length' 2>/dev/null || echo 0) +if [[ "$stale" =~ ^[0-9]+$ ]] && [[ "$stale" -gt 0 ]]; then + swf_add_issue \ + "SeaweedFS topology contains stale or unreachable node hints in \`${NAMESPACE}\`" \ + "Found ${stale} stale/unreachable markers in /dir/status payload." \ + 3 \ + "Compare topology data nodes with Running volume pods and decommission dead nodes." +fi + +swf_write_issues "$OUTPUT_FILE" diff --git a/codebundles/k8s-seaweedfs-healthcheck/check-gc-compaction.sh b/codebundles/k8s-seaweedfs-healthcheck/check-gc-compaction.sh new file mode 100755 index 00000000..338b0062 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/check-gc-compaction.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash +set -euo pipefail +# Surfaces garbage-collection, compaction, and delete-path error signals from SeaweedFS metrics. 
+: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="gc_compaction_issues.json" +GC_SNAPSHOT_FILE="${GC_SNAPSHOT_FILE:-seaweedfs_gc_snapshot.json}" +MAX_PICK_FOR_WRITE_ERRORS="${MAX_PICK_FOR_WRITE_ERRORS:-100}" +MAX_VOLUME_DISK_ERRORS="${MAX_VOLUME_DISK_ERRORS:-50}" +MAX_READONLY_NO_DELETE="${MAX_READONLY_NO_DELETE:-5}" +# shellcheck disable=SC1091 +source seaweedfs-lib.sh + +print_report() { + echo "=== SeaweedFS GC / compaction signals ===" + [[ -f "$GC_SNAPSHOT_FILE" ]] && jq '.' "$GC_SNAPSHOT_FILE" 2>/dev/null || true + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true +} +trap print_report EXIT + +snapshot='{"master":{},"volumes":[],"filer":{}}' + +master_pod=$(swf_find_pod "master") +if [[ -n "$master_pod" ]]; then + if master_metrics=$(swf_fetch_pod_metrics "$master_pod" "$METRICS_PORT" 2>/dev/null); then + pick_err=$(swf_metric_gauge_value "$master_metrics" "SeaweedFS_master_pick_for_write_error") + crowded=$(swf_metric_sum_matching "$master_metrics" "^SeaweedFS_master_volume_layout_crowded") + hb_err=$(echo "$master_metrics" | awk '/SeaweedFS_master_received_heartbeats\{type="error"\}/ {print $2; exit}' || echo 0) + snapshot=$(echo "$snapshot" | jq \ + --argjson pick "${pick_err:-0}" \ + --argjson crowded "${crowded:-0}" \ + --argjson hb_err "${hb_err:-0}" \ + '.master = {pick_for_write_error: $pick, crowded_layouts: $crowded, heartbeat_errors: $hb_err}') + + if [[ "${pick_err:-0}" =~ ^[0-9]+$ ]] && [[ "${pick_err:-0}" -gt "$MAX_PICK_FOR_WRITE_ERRORS" ]]; then + swf_add_issue \ + "SeaweedFS master pick-for-write errors elevated in \`${NAMESPACE}\`" \ + "SeaweedFS_master_pick_for_write_error=${pick_err} (threshold=${MAX_PICK_FOR_WRITE_ERRORS})" \ + 2 \ + "Inspect writable layouts, read-only volumes, and slot availability; scale volume servers." 
+ fi + if [[ "${crowded:-0}" =~ ^[0-9]+$ ]] && [[ "${crowded:-0}" -gt 0 ]]; then + swf_add_issue \ + "SeaweedFS reports crowded volume layouts in \`${NAMESPACE}\`" \ + "SeaweedFS_master_volume_layout_crowded sum=${crowded}" \ + 2 \ + "Run volume vacuum/balance if needed; add capacity or tune volumeSizeLimitMB." + fi + if [[ "${hb_err:-0}" =~ ^[0-9]+$ ]] && [[ "${hb_err:-0}" -gt 0 ]]; then + swf_add_issue \ + "SeaweedFS master received volume heartbeat errors" \ + "SeaweedFS_master_received_heartbeats{type=\"error\"}=${hb_err}" \ + 2 \ + "Check volume server logs and network paths to master port ${MASTER_PORT}." + fi + fi +fi + +filer_pod=$(swf_find_pod "filer") +if [[ -n "$filer_pod" ]]; then + if filer_metrics=$(swf_fetch_pod_metrics "$filer_pod" "$METRICS_PORT" 2>/dev/null); then + delete_ops=$(swf_metric_sum_matching "$filer_metrics" 'SeaweedFS_filerStore_request_seconds_count.*type="delete"') + snapshot=$(echo "$snapshot" | jq --argjson del "${delete_ops:-0}" '.filer = {delete_store_ops: $del}') + fi +fi + +while IFS= read -r pod; do + [[ -z "$pod" ]] && continue + vol_entry='{}' + if vol_metrics=$(swf_fetch_pod_metrics "$pod" "$METRICS_PORT" 2>/dev/null); then + disk_err=$(swf_metric_sum_matching "$vol_metrics" 'errorWriteToLocalDisk') + size_err=$(swf_metric_sum_matching "$vol_metrics" 'errorSizeMismatchOffsetSize') + ro_no_delete=$(swf_metric_sum_matching "$vol_metrics" 'noWriteOrDelete') + ro_can_delete=$(swf_metric_sum_matching "$vol_metrics" 'noWriteCanDelete') + vol_entry=$(jq -n \ + --arg pod "$pod" \ + --argjson disk_err "${disk_err:-0}" \ + --argjson size_err "${size_err:-0}" \ + --argjson ro_no_delete "${ro_no_delete:-0}" \ + --argjson ro_can_delete "${ro_can_delete:-0}" \ + '{pod: $pod, disk_write_errors: $disk_err, size_mismatch_errors: $size_err, read_only_no_delete: $ro_no_delete, read_only_can_delete: $ro_can_delete}') + snapshot=$(echo "$snapshot" | jq --argjson v "$vol_entry" '.volumes += [$v]') + + if [[ "${disk_err:-0}" =~ ^[0-9]+$ ]] && [[ 
"${disk_err:-0}" -gt "$MAX_VOLUME_DISK_ERRORS" ]]; then + swf_add_issue \ + "Volume server \`${pod}\` reports disk write errors (possible GC/compaction pressure)" \ + "SeaweedFS_volumeServer_handler_total{type=\"errorWriteToLocalDisk\"}=${disk_err}" \ + 2 \ + "Check disk space, PVC capacity, and read-only volumes on ${pod}." + fi + if [[ "${ro_no_delete:-0}" =~ ^[0-9]+$ ]] && [[ "${ro_no_delete:-0}" -gt "$MAX_READONLY_NO_DELETE" ]]; then + swf_add_issue \ + "Volume server \`${pod}\` has volumes in noWriteOrDelete state" \ + "read_only_noWriteOrDelete=${ro_no_delete} volumes may block deletes and GC." \ + 2 \ + "Investigate collection TTL/vacuum settings and disk pressure on ${pod}." + fi + fi +done < <("${KUBECTL}" get pods -n "${NAMESPACE}" --context "${CONTEXT}" \ + -l "$(swf_label_selector volume)" --field-selector=status.phase=Running \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null) + +echo "$snapshot" >"$GC_SNAPSHOT_FILE" +swf_write_issues "$OUTPUT_FILE" diff --git a/codebundles/k8s-seaweedfs-healthcheck/check-known-issues.sh b/codebundles/k8s-seaweedfs-healthcheck/check-known-issues.sh new file mode 100755 index 00000000..dcc00765 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/check-known-issues.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +set -euo pipefail +# Matches installed chart version against curated SeaweedFS known issues. 
+: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="known_issues.json" +KNOWN_ISSUES_FILE="${KNOWN_ISSUES_FILE:-seaweedfs-known-issues.json}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source seaweedfs-lib.sh + +print_report() { + echo "=== SeaweedFS known version issues ===" + echo " chart_version=$(swf_chart_version) chart=${SEAWEEDFS_CHART:-$(swf_resolve_chart_label)}" + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true +} +trap print_report EXIT + +catalog="${SCRIPT_DIR}/${KNOWN_ISSUES_FILE}" +if [[ ! -f "$catalog" ]]; then + swf_add_issue \ + "SeaweedFS known-issues catalog missing" \ + "Expected ${catalog}" \ + 4 \ + "Restore seaweedfs-known-issues.json in the codebundle directory." + swf_write_issues "$OUTPUT_FILE" + exit 0 +fi + +version=$(swf_chart_version) +if [[ -z "$version" ]]; then + swf_add_issue \ + "Unable to determine SeaweedFS chart version in \`${NAMESPACE}\`" \ + "Set SEAWEEDFS_CHART or ensure helm.sh/chart label is present on master StatefulSet." \ + 3 \ + "Export SEAWEEDFS_CHART=seaweedfs-X.Y.Z for local runs." 
+ swf_write_issues "$OUTPUT_FILE" + exit 0 +fi + +while IFS= read -r row; do + [[ -z "$row" ]] && continue + title=$(echo "$row" | jq -r '.title') + details=$(echo "$row" | jq -r '.details') + severity=$(echo "$row" | jq -r '.severity') + next_steps=$(echo "$row" | jq -r '.next_steps') + swf_add_issue "$title" "$details" "$severity" "$next_steps" +done < <(jq -c --arg v "$version" ' + def pad(v): (v | split(".") | map(tonumber)) + [0,0,0] | .[0:3]; + .[] | select((pad($v) >= pad(.min_version)) and (pad($v) <= pad(.max_version))) +' "$catalog" 2>/dev/null || true) + +swf_write_issues "$OUTPUT_FILE" diff --git a/codebundles/k8s-seaweedfs-healthcheck/check-master-cluster-status.sh b/codebundles/k8s-seaweedfs-healthcheck/check-master-cluster-status.sh new file mode 100755 index 00000000..3120b113 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/check-master-cluster-status.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# Queries SeaweedFS master /cluster/status and /cluster/healthz. +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="master_cluster_issues.json" +# shellcheck disable=SC1091 +source seaweedfs-lib.sh + +print_report() { + { set +x; } 2>/dev/null || true + echo "=== SeaweedFS master cluster status ===" + [[ -f master_status_snapshot.json ]] && jq '.' master_status_snapshot.json || echo "(no snapshot)" + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true +} +trap print_report EXIT + +master_pod=$(swf_find_pod "master") +if [[ -z "$master_pod" ]]; then + swf_add_issue \ + "No running SeaweedFS master pod found in namespace \`${NAMESPACE}\`" \ + "Cannot query /cluster/status without a master pod." \ + 2 \ + "Verify master StatefulSet is running and labeled app.kubernetes.io/component=master." + swf_write_issues "$OUTPUT_FILE" + exit 0 +fi + +healthz="" +status_json="" +if ! 
healthz=$(swf_master_http "/cluster/healthz" 2>/dev/null); then + swf_add_issue \ + "SeaweedFS master /cluster/healthz unreachable in namespace \`${NAMESPACE}\`" \ + "HTTP probe to master API failed from pod ${master_pod}." \ + 2 \ + "Port-forward or exec into ${master_pod} and curl http://127.0.0.1:${MASTER_PORT}/cluster/healthz" +fi + +if ! status_json=$(swf_master_http "/cluster/status" 2>/dev/null); then + swf_add_issue \ + "SeaweedFS master /cluster/status unreachable in namespace \`${NAMESPACE}\`" \ + "Could not retrieve Raft cluster status from master." \ + 2 \ + "Check master logs and network policies blocking in-cluster HTTP on port ${MASTER_PORT}." +else + echo "$status_json" >master_status_snapshot.json +fi + +if [[ -n "$healthz" ]]; then + if grep -qiE 'unhealthy|not[ _-]?ok' <<<"$healthz" || ! grep -qiE 'ok|healthy|success' <<<"$healthz"; then # negated keywords ("unhealthy", "not ok") contain the positive ones, so test for them first; here-strings avoid a pipefail failure when grep -q exits early + swf_add_issue \ + "SeaweedFS master health check returned unhealthy response in \`${NAMESPACE}\`" \ + "Response: ${healthz}" \ + 2 \ + "Investigate master Raft peers and restart stuck master pods if leadership is lost." + fi +fi + +if [[ -n "$status_json" ]]; then + leader=$(echo "$status_json" | jq -r '.Leader // .leader // empty' 2>/dev/null || true) + is_leader=$(echo "$status_json" | jq -r '.IsLeader // .isLeader // empty' 2>/dev/null || true) + peers=$(echo "$status_json" | jq -r '(.Peers // .peers // []) | length' 2>/dev/null || echo 0) + + if [[ -z "$leader" && "$is_leader" != "true" && "$is_leader" != "True" ]]; then + swf_add_issue \ + "SeaweedFS master cluster has no elected leader in namespace \`${NAMESPACE}\`" \ + "cluster/status did not report Leader or IsLeader=true. peers=${peers}" \ + 2 \ + "Review master StatefulSet ordinals, persistent volumes, and Raft logs." 
+ fi + + if [[ "$peers" =~ ^[0-9]+$ ]] && [[ "$peers" -eq 0 ]]; then + master_replicas=1 + map_json=$(swf_discover_components) + master_replicas=$(echo "$map_json" | jq '[.statefulsets[] | select(.component == "master" or (.name | test("master"; "i")))] | .[0].replicas // 1') + if [[ "$master_replicas" =~ ^[0-9]+$ ]] && [[ "$master_replicas" -gt 1 ]]; then + swf_add_issue \ + "SeaweedFS master reports zero Raft peers in namespace \`${NAMESPACE}\`" \ + "Peer membership may be incomplete for HA master setups." \ + 3 \ + "Confirm master.replicas and cluster bootstrap settings in Helm values." + fi + fi +fi + +swf_write_issues "$OUTPUT_FILE" diff --git a/codebundles/k8s-seaweedfs-healthcheck/check-volume-capacity.sh b/codebundles/k8s-seaweedfs-healthcheck/check-volume-capacity.sh new file mode 100755 index 00000000..1bfdf795 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/check-volume-capacity.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# Inspects volume server disk usage and read-only volume signals. +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="volume_capacity_issues.json" +MIN_FREE_PCT="${MIN_FREE_DISK_PERCENT:-10}" +# shellcheck disable=SC1091 +source seaweedfs-lib.sh + +print_report() { + { set +x; } 2>/dev/null || true + echo "=== SeaweedFS volume server capacity ===" + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true +} +trap print_report EXIT + +volume_pods=$("${KUBECTL}" get pods -n "${NAMESPACE}" --context "${CONTEXT}" -o json 2>/dev/null \ + | jq -r '.items[] | select(.status.phase=="Running") | select( + (.metadata.labels["app.kubernetes.io/component"]? == "volume") or + (.metadata.name | test("volume"; "i")) + ) | .metadata.name' || true) + +if [[ -z "$volume_pods" ]]; then + swf_add_issue \ + "No running SeaweedFS volume server pods in namespace \`${NAMESPACE}\`" \ + "Volume capacity cannot be assessed without volume servers." 
\ + 3 \ + "Enable volume servers in Helm chart and verify pods are Running." + swf_write_issues "$OUTPUT_FILE" + exit 0 +fi + +while IFS= read -r pod; do + [[ -z "$pod" ]] && continue + status_json="" + if ! status_json=$(swf_volume_http "$pod" "/status" 2>/dev/null); then + swf_add_issue \ + "Volume server \`${pod}\` /status unreachable in \`${NAMESPACE}\`" \ + "HTTP probe on port ${VOLUME_PORT} failed." \ + 3 \ + "kubectl logs ${pod} -n ${NAMESPACE} --context ${CONTEXT}" + continue + fi + + disk_usage=$(echo "$status_json" | jq -r '.DiskUsages[]? | "\(.dir // .Dir // "unknown") free=\(.free // .Free // "?") percent_free=\(.percent_free // .PercentFree // "?")"' 2>/dev/null || true) + if [[ -n "$disk_usage" ]]; then + while IFS= read -r line; do + [[ -z "$line" ]] && continue + pct=$(echo "$line" | sed -n 's/.*percent_free=\([0-9.]*\).*/\1/p') + dir=$(echo "$line" | sed -n 's/^\([^ ]*\).*/\1/p') + if [[ "$pct" =~ ^[0-9]+([.][0-9]+)?$ ]] && awk -v p="$pct" -v m="$MIN_FREE_PCT" 'BEGIN {exit !(p + 0 < m + 0)}'; then # values go in as awk variables, never as awk program text + swf_add_issue \ + "Volume server \`${pod}\` disk free below ${MIN_FREE_PCT}% on \`${dir}\`" \ + "$line" \ + 2 \ + "Free disk space on volume node or adjust minFreeSpacePercent in Helm values." + fi + done <<< "$disk_usage" + fi + + read_only_count=$(echo "$status_json" | jq '[.Volumes[]? | select(.readOnly == true or .ReadOnly == true)] | length' 2>/dev/null || echo 0) + if [[ "$read_only_count" =~ ^[0-9]+$ ]] && [[ "$read_only_count" -gt 0 ]]; then + swf_add_issue \ + "Volume server \`${pod}\` reports ${read_only_count} read-only volume(s)" \ + "Read-only volumes often indicate disk pressure or manual marks." \ + 2 \ + "Inspect /status on ${pod} and master topology for readOnly volumes." + fi +done <<< "$volume_pods" + +# Master topology may also expose aggregate disk signals +if dir_status=$(swf_master_http "/dir/status" 2>/dev/null); then + low_nodes=$(echo "$dir_status" | jq --arg min "$MIN_FREE_PCT" '[.. 
| objects | select(has("Max") and has("Free")) | select(.Max > 0 and (.Free / .Max * 100) < ($min | tonumber))] | length' 2>/dev/null || echo 0) # threshold is passed with --arg instead of being spliced into the jq program + if [[ "$low_nodes" =~ ^[0-9]+$ ]] && [[ "$low_nodes" -gt 0 ]]; then + swf_add_issue \ + "SeaweedFS topology reports ${low_nodes} node(s) with low free slot ratio in \`${NAMESPACE}\`" \ + "Derived from /dir/status Free/Max ratios below ${MIN_FREE_PCT}%." \ + 3 \ + "Add capacity or retire full volume nodes." + fi +fi + +swf_write_issues "$OUTPUT_FILE" diff --git a/codebundles/k8s-seaweedfs-healthcheck/check-volume-config.sh b/codebundles/k8s-seaweedfs-healthcheck/check-volume-config.sh new file mode 100755 index 00000000..d85d17d1 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/check-volume-config.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash +set -euo pipefail +# Audits SeaweedFS Helm/Kubernetes workload configuration (args, env, mounts, replication). +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="volume_config_issues.json" +CONFIG_SNAPSHOT_FILE="${CONFIG_SNAPSHOT_FILE:-seaweedfs_config_snapshot.json}" +# shellcheck disable=SC1091 +source seaweedfs-lib.sh + +print_report() { + echo "=== SeaweedFS volume configuration audit ===" + [[ -f "$CONFIG_SNAPSHOT_FILE" ]] && jq '.' "$CONFIG_SNAPSHOT_FILE" 2>/dev/null || true + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true +} +trap print_report EXIT + +map_json=$(swf_discover_components) +workloads=$(swf_get_filtered_workloads_raw) +echo "$workloads" | jq --argjson map "$map_json" ' + { + release: $map.release, + chart: $map.chart, + workloads: [ + .items[] | { + kind: .kind, + name: .metadata.name, + component: (.metadata.labels["app.kubernetes.io/component"] // "unknown"), + replicas: (.spec.replicas // 1), + command: ((.spec.template.spec.containers[0].command // []) | join(" ") | gsub("\\\\ "; " ")), + env: [.spec.template.spec.containers[0].env[]? 
| {name, value: (.value // "")}], + volumeMounts: [.spec.template.spec.containers[0].volumeMounts[]? | .mountPath], + volumes: [.spec.template.spec.volumes[]? | {name, claim: (.persistentVolumeClaim.claimName // "")}] + } + ] + } +' >"$CONFIG_SNAPSHOT_FILE" + +default_replication="" +master_replicas=0 +volume_replicas=0 +volume_max="" +volume_dirs=() +master_peers="" + +while IFS= read -r wl; do + [[ -z "$wl" ]] && continue + component=$(echo "$wl" | jq -r '.component') + name=$(echo "$wl" | jq -r '.name') + cmd=$(echo "$wl" | jq -r '.command') + replicas=$(echo "$wl" | jq -r '.replicas') + + case "$component" in + master) + master_replicas=$replicas + if echo "$cmd" | grep -qE '\-defaultReplication='; then + default_replication=$(echo "$cmd" | sed -n 's/.*-defaultReplication=\([^ ]*\).*/\1/p') + fi + if echo "$cmd" | grep -qE '\-peers='; then + master_peers=$(echo "$cmd" | sed -n 's/.*-peers=\([^ ]*\).*/\1/p') + peer_count=$(echo "$master_peers" | tr ',' '\n' | grep -c . || true) # grep -c already prints 0 on no match; echoing a second 0 would break the numeric check below + if [[ "$peer_count" =~ ^[0-9]+$ ]] && [[ "$replicas" =~ ^[0-9]+$ ]] && [[ "$peer_count" -ne "$replicas" ]]; then + swf_add_issue \ + "SeaweedFS master peer list count (${peer_count}) differs from StatefulSet replicas (${replicas})" \ + "Workload \`${name}\`, peers=${master_peers}" \ + 2 \ + "Align master.peers in Helm values with master.replicas for HA." + fi + fi + mdir=$(echo "$cmd" | sed -n 's/.*-mdir=\([^ ]*\).*/\1/p') + if [[ -n "$mdir" ]]; then + mounted=$(echo "$wl" | jq -r --arg p "$mdir" '.volumeMounts[]? | select(. == $p) // empty') + if [[ -z "$mounted" ]]; then + swf_add_issue \ + "SeaweedFS master metadata dir \`${mdir}\` is not mounted in \`${name}\`" \ + "Command declares -mdir but no matching volumeMount." \ + 2 \ + "Add a PVC/volumeMount for ${mdir} or fix Helm master.data persistence settings." 
+ fi + fi + ;; + volume) + volume_replicas=$replicas + if echo "$cmd" | grep -qE '\-max='; then + volume_max=$(echo "$cmd" | sed -n 's/.*-max=\([^ ]*\).*/\1/p') + fi + while IFS= read -r dir; do + [[ -z "$dir" ]] && continue + volume_dirs+=("$dir") + mounted=$(echo "$wl" | jq -r --arg p "$dir" '.volumeMounts[]? | select(. == $p) // empty') + if [[ -z "$mounted" ]]; then + swf_add_issue \ + "SeaweedFS volume data dir \`${dir}\` is not mounted in \`${name}\`" \ + "Command declares -dir=${dir} without a matching volumeMount." \ + 2 \ + "Verify volume.dataDirs and persistence in Helm values." + fi + done < <(echo "$cmd" | grep -oE '\-dir=[^ ]+' | sed 's/-dir=//' || true) + ;; + filer) + master_env=$(echo "$wl" | jq -r '.env[] | select(.name=="WEED_CLUSTER_SW_MASTER") | .value // empty') + if [[ -z "$master_env" ]]; then + swf_add_issue \ + "SeaweedFS filer \`${name}\` missing WEED_CLUSTER_SW_MASTER env" \ + "Filer may not discover the master service in-cluster." \ + 3 \ + "Set filer cluster master address in Helm values." + fi + ;; + esac +done < <(jq -c '.workloads[]' "$CONFIG_SNAPSHOT_FILE") + +if [[ -n "$default_replication" ]]; then + min_vols=$(swf_replication_min_volumes "$default_replication") + if [[ "$volume_replicas" =~ ^[0-9]+$ ]] && [[ "$min_vols" =~ ^[0-9]+$ ]] && [[ "$volume_replicas" -lt "$min_vols" ]]; then + swf_add_issue \ + "SeaweedFS defaultReplication \`${default_replication}\` requires at least ${min_vols} volume server(s)" \ + "Running volume replicas=${volume_replicas}, chart defaultReplication=${default_replication}" \ + 2 \ + "Increase volume.replicas or lower defaultReplication in Helm values." + fi +else + swf_add_issue \ + "SeaweedFS master defaultReplication not found in workload command" \ + "Could not parse -defaultReplication from master container command." \ + 3 \ + "Confirm master.extraArgs or chart defaults set defaultReplication explicitly." 
+fi + +if [[ -n "$volume_max" ]] && [[ "$volume_max" =~ ^[0-9]+$ ]] && [[ "$volume_max" -lt 10 ]]; then + swf_add_issue \ + "SeaweedFS volume server -max=${volume_max} is low for production" \ + "Each volume pod is capped at ${volume_max} volumes." \ + 3 \ + "Raise volume.maxVolumes in Helm values if slot exhaustion is frequent." +fi + +if [[ "$master_replicas" -eq 1 && -n "$master_peers" ]]; then + swf_add_issue \ + "SeaweedFS master runs single replica with explicit peer bootstrap" \ + "Single master with -peers configured; verify this matches intended topology." \ + 3 \ + "Use master.replicas=3 for HA or remove redundant peer wiring on single-node installs." +fi + +swf_write_issues "$OUTPUT_FILE" diff --git a/codebundles/k8s-seaweedfs-healthcheck/check-volume-slots.sh b/codebundles/k8s-seaweedfs-healthcheck/check-volume-slots.sh new file mode 100755 index 00000000..cc7ed27c --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/check-volume-slots.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# Parses /dir/status topology for free volume slots. +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="volume_slots_issues.json" +MIN_FREE="${MIN_FREE_VOLUME_SLOTS:-1}" +# shellcheck disable=SC1091 +source seaweedfs-lib.sh + +print_report() { + { set +x; } 2>/dev/null || true + echo "=== SeaweedFS volume slot topology ===" + [[ -f dir_status_snapshot.json ]] && jq '{Free: .Topology.Free, Max: .Topology.Max, DataCenters: (.Topology.DataCenters // {} | keys)}' dir_status_snapshot.json 2>/dev/null || true + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true +} +trap print_report EXIT + +if ! dir_status=$(swf_master_http "/dir/status" 2>/dev/null); then + swf_add_issue \ + "Unable to query SeaweedFS /dir/status in namespace \`${NAMESPACE}\`" \ + "Master topology API was unreachable." \ + 2 \ + "Ensure master pod is Ready and HTTP is enabled (master.disableHttp=false)." 
+ swf_write_issues "$OUTPUT_FILE" + exit 0 +fi + +echo "$dir_status" >dir_status_snapshot.json + +root_free=$(echo "$dir_status" | jq -r '.Topology.Free // .topology.free // empty' 2>/dev/null || true) +root_max=$(echo "$dir_status" | jq -r '.Topology.Max // .topology.max // empty' 2>/dev/null || true) + +if [[ -n "$root_free" && "$root_free" =~ ^[0-9]+$ ]]; then + if [[ "$root_free" -lt "$MIN_FREE" ]]; then + swf_add_issue \ + "SeaweedFS cluster free volume slots below threshold in \`${NAMESPACE}\`" \ + "Topology Free=${root_free}, Max=${root_max:-unknown}, required minimum=${MIN_FREE}" \ + 2 \ + "Add volume servers or increase max volumes per node in Helm values." + fi +else + swf_add_issue \ + "SeaweedFS /dir/status missing Topology.Free in namespace \`${NAMESPACE}\`" \ + "Could not parse free slot count from master response." \ + 3 \ + "Verify SeaweedFS version compatibility; inspect raw /dir/status output." +fi + +# Walk nested topology nodes for local exhaustion +while IFS= read -r node; do + [[ -z "$node" ]] && continue + path=$(echo "$node" | jq -r '.path') + free=$(echo "$node" | jq -r '.free') + max=$(echo "$node" | jq -r '.max') + if [[ "$free" =~ ^[0-9]+$ ]] && [[ "$free" -lt "$MIN_FREE" ]]; then + swf_add_issue \ + "Low free volume slots at topology node \`${path}\` in \`${NAMESPACE}\`" \ + "Free=${free}, Max=${max}" \ + 3 \ + "Scale volume servers in rack/datacenter ${path} or rebalance volumes." + fi +done < <(echo "$dir_status" | jq -c ' + def walk_nodes(obj; path): + (obj // {}) | to_entries[] | + . as $e | + (path + "/" + (($e.value | objects | (.Id // .id // .Url // .url) | strings) // ($e.key | tostring))) as $p | # label by node Id/Url; fall back to the map key or array index + ($e.value | if (.Free? != null) or (.free? 
!= null) then + {path: $p, free: ($e.value.Free // $e.value.free // 0), max: ($e.value.Max // $e.value.max // 0)} + else empty end), + (if ($e.value | type) == "object" or ($e.value | type) == "array" then walk_nodes($e.value; $p) else empty end); # DataCenters, Racks and DataNodes may be arrays, so descend into those too + .Topology // .topology // {} | + walk_nodes(.DataCenters // .dataCenters // {}; "root") +') + +swf_write_issues "$OUTPUT_FILE" diff --git a/codebundles/k8s-seaweedfs-healthcheck/check-workload-health.sh b/codebundles/k8s-seaweedfs-healthcheck/check-workload-health.sh new file mode 100755 index 00000000..42af0a45 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/check-workload-health.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# Verifies SeaweedFS StatefulSets/Deployments replica health and warning events. +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="workload_health_issues.json" +# shellcheck disable=SC1091 +source seaweedfs-lib.sh + +print_report() { + { set +x; } 2>/dev/null || true + echo "=== SeaweedFS workload health (${NAMESPACE}) ===" + if [[ -f "$COMPONENT_MAP_FILE" ]]; then + jq -r ' + (.statefulsets + .deployments) + | .[] + | " \(.name) ready=\(.ready)/\(.replicas) component=\(.component)" + ' "$COMPONENT_MAP_FILE" 2>/dev/null || true + fi + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true +} +trap print_report EXIT + +map_json=$(swf_discover_components) +echo "$map_json" >"$COMPONENT_MAP_FILE" + +while IFS= read -r wl; do + [[ -z "$wl" ]] && continue + name=$(echo "$wl" | jq -r '.name') + kind="statefulset" + if echo "$map_json" | jq -e --arg n "$name" '.deployments[] | select(.name==$n)' >/dev/null; then + kind="deployment" + fi + want=$(echo "$wl" | jq -r '.replicas') + ready=$(echo "$wl" | jq -r '.ready') + if [[ "$want" =~ ^[0-9]+$ ]] && [[ "$ready" =~ ^[0-9]+$ ]] && [[ "$want" -gt 0 ]] && [[ "$ready" -lt "$want" ]]; then + swf_add_issue \ + "SeaweedFS ${kind} \`${name}\` is not fully Ready" \ + "readyReplicas=${ready}, 
desired=${want}" \ + 2 \ + "kubectl describe ${kind} ${name} -n ${NAMESPACE} --context ${CONTEXT}" + fi + + pods_json=$("${KUBECTL}" get pods -n "${NAMESPACE}" --context "${CONTEXT}" -o json 2>/dev/null \ + | jq --arg n "$name" '{items: [.items[] | select(.metadata.name | startswith($n))]}' || echo '{"items":[]}') + + while IFS= read -r pline; do + [[ -z "$pline" ]] && continue + pname=$(echo "$pline" | jq -r '.name') + phase=$(echo "$pline" | jq -r '.phase') + crash=$(echo "$pline" | jq -r '.crash') + pending=$(echo "$pline" | jq -r '.pending') + if [[ "$crash" == "true" ]]; then + swf_add_issue \ + "SeaweedFS pod \`${pname}\` is in CrashLoopBackOff" \ + "Workload ${name}, phase=${phase}" \ + 2 \ + "kubectl logs ${pname} -n ${NAMESPACE} --context ${CONTEXT} --previous" + elif [[ "$pending" == "true" ]]; then + swf_add_issue \ + "SeaweedFS pod \`${pname}\` is pending scheduling" \ + "Workload ${name}, phase=${phase}" \ + 3 \ + "kubectl describe pod ${pname} -n ${NAMESPACE} --context ${CONTEXT}" + fi + done < <(echo "$pods_json" | jq -c '.items[] | { + name: .metadata.name, + phase: (.status.phase // "Unknown"), + crash: ([.status.containerStatuses[]? | .state.waiting.reason? // empty] | any(. 
== "CrashLoopBackOff")), + pending: (.status.phase == "Pending") + }') + + events=$("${KUBECTL}" get events -n "${NAMESPACE}" --context "${CONTEXT}" -o json 2>/dev/null \ + | jq -r --arg n "$name" '[.items[] | select(.type=="Warning") | select(.involvedObject.name | contains($n)) | .message] | unique | .[0:3] | join("; ")' || echo "") # -r: with no Warning events the result is an empty string, not the quoted JSON literal, so no issue is raised + if [[ -n "$events" && "$events" != "null" ]]; then + swf_add_issue \ + "Recent Warning events for SeaweedFS workload \`${name}\`" \ + "$events" \ + 3 \ + "kubectl get events -n ${NAMESPACE} --context ${CONTEXT} --field-selector involvedObject.name=${name}" + fi +done < <(echo "$map_json" | jq -c '.statefulsets[], .deployments[]') + +swf_write_issues "$OUTPUT_FILE" diff --git a/codebundles/k8s-seaweedfs-healthcheck/check-writable-layouts.sh b/codebundles/k8s-seaweedfs-healthcheck/check-writable-layouts.sh new file mode 100755 index 00000000..0547e285 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/check-writable-layouts.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# Evaluates writable volume layouts from /dir/status. +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="writable_layouts_issues.json" +# shellcheck disable=SC1091 +source seaweedfs-lib.sh + +print_report() { + { set +x; } 2>/dev/null || true + echo "=== SeaweedFS writable layouts ===" + [[ -f writable_layouts_snapshot.json ]] && jq '.' writable_layouts_snapshot.json 2>/dev/null || true + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true +} +trap print_report EXIT + +if ! dir_status=$(swf_master_http "/dir/status" 2>/dev/null); then + swf_add_issue \ + "Unable to evaluate writable layouts: /dir/status unreachable in \`${NAMESPACE}\`" \ + "Master API call failed." \ + 2 \ + "Restore master HTTP access before checking writable layouts." 
+ swf_write_issues "$OUTPUT_FILE" + exit 0 +fi + +layouts=$(echo "$dir_status" | jq -c ' + [ + (.Layouts // .layouts // {} | to_entries[] | {name: .key, writable: (.value.writables // .value.Writables // [] | length), replication: (.value.replication // .value.Replication // "unknown")}), + (.Topology.Layouts // .topology.layouts // {} | to_entries[]? | {name: .key, writable: (.value.writables // .value.Writables // [] | length), replication: (.value.replication // .value.Replication // "unknown")}) + ] | map(select(.name != null)) +' 2>/dev/null || echo '[]') + +echo "$layouts" >writable_layouts_snapshot.json + +layout_count=$(echo "$layouts" | jq 'length') +if [[ "$layout_count" -eq 0 ]]; then + # Fallback: inspect topology writables at root + root_writables=$(echo "$dir_status" | jq -r '.Topology.Writables // .topology.writables // [] | length' 2>/dev/null || echo "") + if [[ -n "$root_writables" && "$root_writables" =~ ^[0-9]+$ && "$root_writables" -eq 0 ]]; then + swf_add_issue \ + "SeaweedFS topology root has zero writable volumes in \`${NAMESPACE}\`" \ + "/dir/status reported no writables at cluster root." \ + 2 \ + "Verify volume servers are registered and not read-only; check defaultReplication settings." + fi +else + while IFS= read -r layout; do + [[ -z "$layout" ]] && continue + lname=$(echo "$layout" | jq -r '.name') + writable=$(echo "$layout" | jq -r '.writable') + repl=$(echo "$layout" | jq -r '.replication') + if [[ "$writable" =~ ^[0-9]+$ ]] && [[ "$writable" -eq 0 ]]; then + swf_add_issue \ + "SeaweedFS layout \`${lname}\` has zero writable volumes in \`${NAMESPACE}\`" \ + "replication=${repl}, writable count=0" \ + 2 \ + "Ensure enough volume servers exist for replication ${repl}; check collection placement." + fi + done < <(echo "$layouts" | jq -c '.[]') +fi + +# Read-only volumes in layouts +readonly_vols=$(echo "$dir_status" | jq '[.. | objects | .readOnlyVolumeIds? // .ReadOnlyVolumeIds? // empty | .[]? 
] | length' 2>/dev/null || echo 0) +if [[ "$readonly_vols" =~ ^[0-9]+$ ]] && [[ "$readonly_vols" -gt 0 ]]; then + swf_add_issue \ + "SeaweedFS reports ${readonly_vols} read-only volume id(s) in layouts for \`${NAMESPACE}\`" \ + "Read-only volumes block writes for affected collections." \ + 2 \ + "Investigate disk space on hosting volume servers and clear readOnly flags when safe." +fi + +swf_write_issues "$OUTPUT_FILE" diff --git a/codebundles/k8s-seaweedfs-healthcheck/gc_compaction_issues.json b/codebundles/k8s-seaweedfs-healthcheck/gc_compaction_issues.json new file mode 100644 index 00000000..698d23c2 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/gc_compaction_issues.json @@ -0,0 +1,26 @@ +[ + { + "title": "SeaweedFS master pick-for-write errors elevated in `runwhen-env-test`", + "details": "SeaweedFS_master_pick_for_write_error=4260 (threshold=100)", + "severity": 2, + "next_steps": "Inspect writable layouts, read-only volumes, and slot availability; scale volume servers." + }, + { + "title": "SeaweedFS master received volume heartbeat errors", + "details": "SeaweedFS_master_received_heartbeats{type=\"error\"}=1", + "severity": 2, + "next_steps": "Check volume server logs and network paths to master port 9333." + }, + { + "title": "Volume server `rw-seaweedfs-volume-0` reports disk write errors (possible GC/compaction pressure)", + "details": "SeaweedFS_volumeServer_handler_total{type=\"errorWriteToLocalDisk\"}=921", + "severity": 2, + "next_steps": "Check disk space, PVC capacity, and read-only volumes on rw-seaweedfs-volume-0." + }, + { + "title": "Volume server `rw-seaweedfs-volume-0` has volumes in noWriteOrDelete state", + "details": "read_only_noWriteOrDelete=20 volumes may block deletes and GC.", + "severity": 2, + "next_steps": "Investigate collection TTL/vacuum settings and disk pressure on rw-seaweedfs-volume-0." 
+ } +] diff --git a/codebundles/k8s-seaweedfs-healthcheck/known_issues.json b/codebundles/k8s-seaweedfs-healthcheck/known_issues.json new file mode 100644 index 00000000..cc698e8d --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/known_issues.json @@ -0,0 +1,14 @@ +[ + { + "title": "SeaweedFS 4.x single-master installs report zero Raft peers in /cluster/status", + "details": "Single-replica master clusters often omit Peers in status JSON; rely on IsLeader and /cluster/healthz instead of peer count alone.", + "severity": 3, + "next_steps": "Use HA master replicas (>1) for production; ignore zero-peer status on single-node dev installs." + }, + { + "title": "Dedicated S3 deployment may not expose S3 on filer port 8333", + "details": "Official Helm chart can run S3 as a separate Deployment; probes must target the s3 component pod or rw-seaweedfs-s3 service, not the filer.", + "severity": 3, + "next_steps": "Confirm app.kubernetes.io/component=s3 exists or enable filer.s3 on the filer StatefulSet." + } +] diff --git a/codebundles/k8s-seaweedfs-healthcheck/list-seaweedfs-resources.sh b/codebundles/k8s-seaweedfs-healthcheck/list-seaweedfs-resources.sh new file mode 100755 index 00000000..7c884d52 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/list-seaweedfs-resources.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# Discovers SeaweedFS workloads, services, and PVCs in the namespace. +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="list_seaweedfs_resources_issues.json" +# shellcheck disable=SC1091 +source seaweedfs-lib.sh + +print_report() { + { set +x; } 2>/dev/null || true + echo + echo "=== SeaweedFS component map (namespace ${NAMESPACE}, context ${CONTEXT}) ===" + if [[ -f "$COMPONENT_MAP_FILE" ]]; then + jq '.' 
"$COMPONENT_MAP_FILE" + fi + local ic + ic=$(jq 'length' "$OUTPUT_FILE" 2>/dev/null || echo 0) + echo "=== Findings (${ic}) ===" + if [[ "$ic" -eq 0 ]]; then + echo " Discovery completed without blocking issues." + else + jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" + fi +} +trap print_report EXIT + +if ! "${KUBECTL}" get ns "${NAMESPACE}" --context "${CONTEXT}" -o name &>/dev/null; then + swf_add_issue \ + "Namespace \`${NAMESPACE}\` not accessible in context \`${CONTEXT}\`" \ + "kubectl cannot read the target namespace." \ + 4 \ + "Verify NAMESPACE and kubeconfig RBAC for namespace read access." + swf_write_issues "$OUTPUT_FILE" + exit 0 +fi + +map_json=$(swf_discover_components) +echo "$map_json" >"$COMPONENT_MAP_FILE" + +sts_count=$(echo "$map_json" | jq '.statefulsets | length') +dep_count=$(echo "$map_json" | jq '.deployments | length') +if [[ "$sts_count" -eq 0 && "$dep_count" -eq 0 ]]; then + swf_add_issue \ + "No SeaweedFS workloads found in namespace \`${NAMESPACE}\`" \ + "No StatefulSets or Deployments matched SeaweedFS labels or name patterns." \ + 3 \ + "Confirm SeaweedFS is installed. Set SEAWEEDFS_RELEASE_NAME if using non-standard labels." +fi + +for required in master volume filer; do + found=$(echo "$map_json" | jq --arg c "$required" '[.statefulsets[], .deployments[]] | map(select(.component == $c or (.name | test($c; "i")))) | length') + if [[ "$found" -eq 0 ]]; then + swf_add_issue \ + "Missing SeaweedFS \`${required}\` component in namespace \`${NAMESPACE}\`" \ + "Expected a workload for component ${required} but none was discovered." \ + 4 \ + "Check Helm values for ${required}.enabled and label selectors app.kubernetes.io/component=${required}." 
+ fi +done + +while IFS= read -r wl; do + [[ -z "$wl" ]] && continue + name=$(echo "$wl" | jq -r '.name') + want=$(echo "$wl" | jq -r '.replicas') + ready=$(echo "$wl" | jq -r '.ready') + if [[ "$want" =~ ^[0-9]+$ ]] && [[ "$want" -gt 0 ]] && [[ "$ready" == "0" ]]; then + swf_add_issue \ + "SeaweedFS workload \`${name}\` has zero ready replicas" \ + "desired=${want}, ready=${ready}" \ + 3 \ + "Inspect pods for ${name}: kubectl get pods -n ${NAMESPACE} -l app.kubernetes.io/name=seaweedfs --context ${CONTEXT}" + fi +done < <(echo "$map_json" | jq -c '.statefulsets[], .deployments[]') + +swf_write_issues "$OUTPUT_FILE" diff --git a/codebundles/k8s-seaweedfs-healthcheck/runbook.robot b/codebundles/k8s-seaweedfs-healthcheck/runbook.robot new file mode 100644 index 00000000..33a0854e --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/runbook.robot @@ -0,0 +1,620 @@ +*** Settings *** +Documentation Validates SeaweedFS storage configuration health in a Kubernetes namespace by checking master leadership, volume slots, disk capacity, component connectivity, and S3 gateway operations. +Metadata Author rw-codebundle-agent +Metadata Display Name Kubernetes SeaweedFS Storage Health Check +Metadata Supports Kubernetes SeaweedFS storage health S3 + +Library String +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform +Library RW.K8sHelper + +Force Tags Kubernetes SeaweedFS storage health + +Suite Setup Suite Initialization + + +*** Tasks *** +List SeaweedFS Resources in Namespace `${NAMESPACE}` + [Documentation] Discovers SeaweedFS master, volume, filer, and S3 gateway workloads, services, and PVCs and surfaces missing components. + [Tags] Kubernetes SeaweedFS discovery access:read-only data:logs-config + + ${result}= RW.CLI.Run Bash File + ... bash_file=list-seaweedfs-resources.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... 
cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" SEAWEEDFS_RELEASE_NAME="${SEAWEEDFS_RELEASE_NAME}" ./list-seaweedfs-resources.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat list_seaweedfs_resources_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for list resources task. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=SeaweedFS components should be discoverable in namespace `${NAMESPACE}` + ... actual=Resource discovery found gaps for release `${SEAWEEDFS_RELEASE_NAME}` + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS resource discovery (stdout): + RW.Core.Add Pre To Report ${result.stdout} + +Check SeaweedFS Workload Replica Health in Namespace `${NAMESPACE}` + [Documentation] Verifies StatefulSets and Deployments for SeaweedFS components have desired replicas ready and flags CrashLoopBackOff or pending pods. + [Tags] Kubernetes SeaweedFS workload access:read-only data:logs-config + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-workload-health.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" SEAWEEDFS_RELEASE_NAME="${SEAWEEDFS_RELEASE_NAME}" ./check-workload-health.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat workload_health_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for workload health task. 
WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=SeaweedFS workloads in `${NAMESPACE}` should have ready replicas matching desired counts + ... actual=Workload health checks reported problems for release `${SEAWEEDFS_RELEASE_NAME}` + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS workload health (stdout): + RW.Core.Add Pre To Report ${result.stdout} + +Check SeaweedFS Master Cluster Status in Namespace `${NAMESPACE}` + [Documentation] Queries master /cluster/status and /cluster/healthz to validate Raft leadership and master health endpoints. + [Tags] Kubernetes SeaweedFS master access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-master-cluster-status.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" SEAWEEDFS_MASTER_SERVICE="${SEAWEEDFS_MASTER_SERVICE}" ./check-master-cluster-status.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat master_cluster_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for master cluster task. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=SeaweedFS master should be healthy with an elected leader in `${NAMESPACE}` + ... actual=Master cluster API checks failed for context `${CONTEXT}` + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... 
next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS master cluster status (stdout): + RW.Core.Add Pre To Report ${result.stdout} + +Check SeaweedFS Volume Slot Availability in Namespace `${NAMESPACE}` + [Documentation] Parses /dir/status topology to ensure free volume slots exist before workloads fail on allocation. + [Tags] Kubernetes SeaweedFS volumes access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-volume-slots.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" MIN_FREE_VOLUME_SLOTS="${MIN_FREE_VOLUME_SLOTS}" ./check-volume-slots.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat volume_slots_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for volume slots task. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=SeaweedFS should maintain at least `${MIN_FREE_VOLUME_SLOTS}` free volume slots in `${NAMESPACE}` + ... actual=Topology free slots are below threshold + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS volume slot analysis (stdout): + RW.Core.Add Pre To Report ${result.stdout} + +Check SeaweedFS Volume Server Disk Capacity in Namespace `${NAMESPACE}` + [Documentation] Inspects volume server /status and topology for disk usage, read-only volumes, and min-free-space threshold breaches. + [Tags] Kubernetes SeaweedFS capacity access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-volume-capacity.sh + ... env=${env} + ... 
secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" MIN_FREE_DISK_PERCENT="${MIN_FREE_DISK_PERCENT}" ./check-volume-capacity.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat volume_capacity_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for volume capacity task. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Volume servers in `${NAMESPACE}` should maintain at least `${MIN_FREE_DISK_PERCENT}` percent free disk + ... actual=Disk capacity or read-only volume signals were detected + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS volume capacity analysis (stdout): + RW.Core.Add Pre To Report ${result.stdout} + +Check SeaweedFS Writable Volume Layout in Namespace `${NAMESPACE}` + [Documentation] Evaluates /dir/status layouts for writable volume IDs and flags zero-writable or read-only placement problems. + [Tags] Kubernetes SeaweedFS layout access:read-only data:config + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-writable-layouts.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" ./check-writable-layouts.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat writable_layouts_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for writable layouts task. 
WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=SeaweedFS writable layouts in `${NAMESPACE}` should have allocatable volumes for configured replication + ... actual=Layout evaluation found zero-writable or read-only volumes + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS writable layout analysis (stdout): + RW.Core.Add Pre To Report ${result.stdout} + +Check SeaweedFS Filer and Component Connectivity in Namespace `${NAMESPACE}` + [Documentation] Confirms filer health endpoints respond and volume servers appear registered in master topology. + [Tags] Kubernetes SeaweedFS connectivity access:read-only data:logs-config + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-component-connectivity.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" SEAWEEDFS_FILER_SERVICE="${SEAWEEDFS_FILER_SERVICE}" ./check-component-connectivity.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat component_connectivity_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for connectivity task. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Filer and volume servers in `${NAMESPACE}` should be reachable and registered with master + ... actual=Component connectivity checks reported problems + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... 
next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS connectivity analysis (stdout): + RW.Core.Add Pre To Report ${result.stdout} + +Verify SeaweedFS S3 Gateway Operations in Namespace `${NAMESPACE}` + [Documentation] Performs ListBuckets and put/get/delete of a temporary test object against the filer S3 endpoint when enabled. + [Tags] Kubernetes SeaweedFS S3 access:read-write data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=verify-s3-gateway.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... secret__seaweedfs_s3_credentials=${SEAWEEDFS_S3_CREDENTIALS} + ... timeout_seconds=240 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" S3_PROBE_BUCKET="${S3_PROBE_BUCKET}" SEAWEEDFS_S3_ENDPOINT="${SEAWEEDFS_S3_ENDPOINT}" ./verify-s3-gateway.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat s3_gateway_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for S3 gateway task. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=S3 gateway operations should succeed for SeaweedFS in `${NAMESPACE}` when S3 is enabled + ... actual=S3 probe reported failures or skipped checks + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS S3 gateway probe (stdout): + RW.Core.Add Pre To Report ${result.stdout} + +Check SeaweedFS Volume Configuration in Namespace `${NAMESPACE}` + [Documentation] Audits Helm-rendered workload commands, env, mounts, replication, and volume limits for misconfiguration. 
+ [Tags] Kubernetes SeaweedFS config access:read-only data:config + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-volume-config.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" SEAWEEDFS_RELEASE_NAME="${SEAWEEDFS_RELEASE_NAME}" SEAWEEDFS_CHART="${SEAWEEDFS_CHART}" ./check-volume-config.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat volume_config_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for volume config task. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=SeaweedFS Helm workload configuration in `${NAMESPACE}` should match replication and persistence requirements + ... actual=Volume configuration audit reported problems + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS volume configuration audit (stdout): + RW.Core.Add Pre To Report ${result.stdout} + +Check SeaweedFS Garbage Collection and Compaction Signals in Namespace `${NAMESPACE}` + [Documentation] Reads master and volume Prometheus metrics for pick-for-write errors, crowded layouts, disk write failures, and delete-blocking read-only volumes. + [Tags] Kubernetes SeaweedFS gc access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-gc-compaction.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... 
cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" SEAWEEDFS_RELEASE_NAME="${SEAWEEDFS_RELEASE_NAME}" SEAWEEDFS_CHART="${SEAWEEDFS_CHART}" MAX_PICK_FOR_WRITE_ERRORS="${MAX_PICK_FOR_WRITE_ERRORS}" MAX_VOLUME_DISK_ERRORS="${MAX_VOLUME_DISK_ERRORS}" ./check-gc-compaction.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat gc_compaction_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for GC/compaction task. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=SeaweedFS GC and compaction paths in `${NAMESPACE}` should not show sustained error counters + ... actual=GC/compaction metric checks reported problems + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS GC/compaction analysis (stdout): + RW.Core.Add Pre To Report ${result.stdout} + +Check SeaweedFS Capacity Projection in Namespace `${NAMESPACE}` + [Documentation] Evaluates slot and disk utilization headroom and estimates time-to-full when a prior capacity snapshot exists. + [Tags] Kubernetes SeaweedFS capacity access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-capacity-projection.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=180 + ... include_in_history=false + ... cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" SEAWEEDFS_RELEASE_NAME="${SEAWEEDFS_RELEASE_NAME}" SEAWEEDFS_CHART="${SEAWEEDFS_CHART}" CAPACITY_WARN_PERCENT="${CAPACITY_WARN_PERCENT}" MIN_PROJECTION_HOURS="${MIN_PROJECTION_HOURS}" ./check-capacity-projection.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat capacity_projection_issues.json + ... env=${env} + ... 
include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for capacity projection task. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=SeaweedFS capacity in `${NAMESPACE}` should maintain headroom below `${CAPACITY_WARN_PERCENT}` percent utilization + ... actual=Capacity projection checks reported risk of exhaustion + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS capacity projection (stdout): + RW.Core.Add Pre To Report ${result.stdout} + +Check SeaweedFS Known Version Issues in Namespace `${NAMESPACE}` + [Documentation] Matches the installed helm.sh/chart version against a curated catalog of SeaweedFS known issues and upgrade cautions. + [Tags] Kubernetes SeaweedFS version access:read-only data:config + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-known-issues.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=120 + ... include_in_history=false + ... cmd_override=CONTEXT="${CONTEXT}" NAMESPACE="${NAMESPACE}" SEAWEEDFS_RELEASE_NAME="${SEAWEEDFS_RELEASE_NAME}" SEAWEEDFS_CHART="${SEAWEEDFS_CHART}" ./check-known-issues.sh + + ${raw}= RW.CLI.Run Cli + ... cmd=cat known_issues.json + ... env=${env} + ... include_in_history=false + + TRY + ${issue_list}= Evaluate json.loads(r'''${raw.stdout}''') json + EXCEPT + Log Failed to parse JSON for known issues task. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Installed SeaweedFS chart version in `${NAMESPACE}` should not match known issue patterns + ... actual=Known-issue catalog matched this chart version + ... 
title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report SeaweedFS known version issues (stdout): + RW.Core.Add Pre To Report ${result.stdout} + + +*** Keywords *** +Suite Initialization + ${kubeconfig}= RW.Core.Import Secret + ... kubeconfig + ... type=string + ... description=Kubernetes kubeconfig for cluster access. + ... pattern=\w* + TRY + ${SEAWEEDFS_S3_CREDENTIALS}= RW.Core.Import Secret + ... seaweedfs_s3_credentials + ... type=string + ... description=Optional JSON S3 credentials for filer gateway probe. + ... pattern=.* + EXCEPT + Log Optional seaweedfs_s3_credentials secret not provided. WARN + ${SEAWEEDFS_S3_CREDENTIALS}= Set Variable ${EMPTY} + END + ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY + ... type=string + ... description=Kubernetes CLI binary (kubectl or oc). + ... enum=[kubectl,oc] + ... default=kubectl + ${CONTEXT}= RW.Core.Import User Variable CONTEXT + ... type=string + ... description=Kubernetes context for the target cluster. + ... pattern=\w* + ${NAMESPACE}= RW.Core.Import User Variable NAMESPACE + ... type=string + ... description=Namespace where SeaweedFS is deployed. + ... pattern=\w* + ${SEAWEEDFS_RELEASE_NAME}= RW.Core.Import User Variable SEAWEEDFS_RELEASE_NAME + ... type=string + ... description=Helm release instance label (parent release for subchart installs). + ... default= + ... pattern=.* + ${SEAWEEDFS_CHART}= RW.Core.Import User Variable SEAWEEDFS_CHART + ... type=string + ... description=Exact helm.sh/chart label for the SeaweedFS subchart (e.g. seaweedfs-4.25.0). + ... default= + ... pattern=.* + ${SEAWEEDFS_MASTER_SERVICE}= RW.Core.Import User Variable SEAWEEDFS_MASTER_SERVICE + ... type=string + ... description=Override master service host:port when auto-discovery is insufficient. + ... default= + ... 
pattern=.* + ${SEAWEEDFS_FILER_SERVICE}= RW.Core.Import User Variable SEAWEEDFS_FILER_SERVICE + ... type=string + ... description=Override filer service host:port when auto-discovery is insufficient. + ... default= + ... pattern=.* + ${SEAWEEDFS_S3_ENDPOINT}= RW.Core.Import User Variable SEAWEEDFS_S3_ENDPOINT + ... type=string + ... description=Override S3 endpoint URL for gateway probe. + ... default= + ... pattern=.* + ${MIN_FREE_VOLUME_SLOTS}= RW.Core.Import User Variable MIN_FREE_VOLUME_SLOTS + ... type=string + ... description=Minimum free volume slots required before raising an issue. + ... default=1 + ... pattern=^\d+$ + ${MIN_FREE_DISK_PERCENT}= RW.Core.Import User Variable MIN_FREE_DISK_PERCENT + ... type=string + ... description=Minimum free disk percentage required on volume servers. + ... default=10 + ... pattern=^\d+$ + ${S3_PROBE_BUCKET}= RW.Core.Import User Variable S3_PROBE_BUCKET + ... type=string + ... description=Existing bucket for S3 probe; temporary object prefix is used. + ... default= + ... pattern=.* + ${CAPACITY_WARN_PERCENT}= RW.Core.Import User Variable CAPACITY_WARN_PERCENT + ... type=string + ... description=Slot or disk utilization percent that triggers capacity projection warnings. + ... default=80 + ... pattern=^\d+$ + ${MIN_PROJECTION_HOURS}= RW.Core.Import User Variable MIN_PROJECTION_HOURS + ... type=string + ... description=Hours-until-full estimate that triggers slot exhaustion projection issues. + ... default=24 + ... pattern=^\d+$ + ${MAX_PICK_FOR_WRITE_ERRORS}= RW.Core.Import User Variable MAX_PICK_FOR_WRITE_ERRORS + ... type=string + ... description=Master pick-for-write error counter threshold for GC/compaction checks. + ... default=100 + ... pattern=^\d+$ + ${MAX_VOLUME_DISK_ERRORS}= RW.Core.Import User Variable MAX_VOLUME_DISK_ERRORS + ... type=string + ... description=Volume server disk write error counter threshold for GC/compaction checks. + ... default=50 + ... 
pattern=^\d+$ + Set Suite Variable ${kubeconfig} ${kubeconfig} + Set Suite Variable ${SEAWEEDFS_S3_CREDENTIALS} ${SEAWEEDFS_S3_CREDENTIALS} + Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} + Set Suite Variable ${CONTEXT} ${CONTEXT} + Set Suite Variable ${NAMESPACE} ${NAMESPACE} + Set Suite Variable ${SEAWEEDFS_RELEASE_NAME} ${SEAWEEDFS_RELEASE_NAME} + Set Suite Variable ${SEAWEEDFS_CHART} ${SEAWEEDFS_CHART} + Set Suite Variable ${SEAWEEDFS_MASTER_SERVICE} ${SEAWEEDFS_MASTER_SERVICE} + Set Suite Variable ${SEAWEEDFS_FILER_SERVICE} ${SEAWEEDFS_FILER_SERVICE} + Set Suite Variable ${SEAWEEDFS_S3_ENDPOINT} ${SEAWEEDFS_S3_ENDPOINT} + Set Suite Variable ${MIN_FREE_VOLUME_SLOTS} ${MIN_FREE_VOLUME_SLOTS} + Set Suite Variable ${MIN_FREE_DISK_PERCENT} ${MIN_FREE_DISK_PERCENT} + Set Suite Variable ${S3_PROBE_BUCKET} ${S3_PROBE_BUCKET} + Set Suite Variable ${CAPACITY_WARN_PERCENT} ${CAPACITY_WARN_PERCENT} + Set Suite Variable ${MIN_PROJECTION_HOURS} ${MIN_PROJECTION_HOURS} + Set Suite Variable ${MAX_PICK_FOR_WRITE_ERRORS} ${MAX_PICK_FOR_WRITE_ERRORS} + Set Suite Variable ${MAX_VOLUME_DISK_ERRORS} ${MAX_VOLUME_DISK_ERRORS} + Set Suite Variable + ... ${env} + ... 
{"KUBECONFIG":"./${kubeconfig.key}","CONTEXT":"${CONTEXT}","NAMESPACE":"${NAMESPACE}","KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}","SEAWEEDFS_RELEASE_NAME":"${SEAWEEDFS_RELEASE_NAME}","SEAWEEDFS_CHART":"${SEAWEEDFS_CHART}","SEAWEEDFS_MASTER_SERVICE":"${SEAWEEDFS_MASTER_SERVICE}","SEAWEEDFS_FILER_SERVICE":"${SEAWEEDFS_FILER_SERVICE}","SEAWEEDFS_S3_ENDPOINT":"${SEAWEEDFS_S3_ENDPOINT}","MIN_FREE_VOLUME_SLOTS":"${MIN_FREE_VOLUME_SLOTS}","MIN_FREE_DISK_PERCENT":"${MIN_FREE_DISK_PERCENT}","S3_PROBE_BUCKET":"${S3_PROBE_BUCKET}","CAPACITY_WARN_PERCENT":"${CAPACITY_WARN_PERCENT}","MIN_PROJECTION_HOURS":"${MIN_PROJECTION_HOURS}","MAX_PICK_FOR_WRITE_ERRORS":"${MAX_PICK_FOR_WRITE_ERRORS}","MAX_VOLUME_DISK_ERRORS":"${MAX_VOLUME_DISK_ERRORS}"} + + RW.K8sHelper.Verify Cluster Connectivity + ... binary=${KUBERNETES_DISTRIBUTION_BINARY} + ... context=${CONTEXT} + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} diff --git a/codebundles/k8s-seaweedfs-healthcheck/seaweedfs-known-issues.json b/codebundles/k8s-seaweedfs-healthcheck/seaweedfs-known-issues.json new file mode 100644 index 00000000..546c83c8 --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/seaweedfs-known-issues.json @@ -0,0 +1,34 @@ +[ + { + "min_version": "4.20.0", + "max_version": "4.99.99", + "severity": 3, + "title": "SeaweedFS 4.x single-master installs report zero Raft peers in /cluster/status", + "details": "Single-replica master clusters often omit Peers in status JSON; rely on IsLeader and /cluster/healthz instead of peer count alone.", + "next_steps": "Use HA master replicas (>1) for production; ignore zero-peer status on single-node dev installs." 
+ }, + { + "min_version": "4.0.0", + "max_version": "4.99.99", + "severity": 3, + "title": "Dedicated S3 deployment may not expose S3 on filer port 8333", + "details": "Official Helm chart can run S3 as a separate Deployment; probes must target the s3 component pod or rw-seaweedfs-s3 service, not the filer.", + "next_steps": "Confirm app.kubernetes.io/component=s3 exists or enable filer.s3 on the filer StatefulSet." + }, + { + "min_version": "4.0.0", + "max_version": "4.24.99", + "severity": 2, + "title": "Review SeaweedFS 4.25+ before upgrade from 4.24.x", + "details": "Minor chart/app upgrades may change default volume growth, metrics names, and filer store defaults.", + "next_steps": "Read SeaweedFS release notes and Helm chart changelog before upgrading past 4.24.x." + }, + { + "min_version": "3.0.0", + "max_version": "3.99.99", + "severity": 2, + "title": "SeaweedFS 3.x is outside the chart version validated by this bundle", + "details": "Health probes and /dir/status layout fields differ on older 3.x releases.", + "next_steps": "Upgrade to SeaweedFS 4.x chart or validate APIs manually before trusting automated checks." + } +] diff --git a/codebundles/k8s-seaweedfs-healthcheck/seaweedfs-lib.sh b/codebundles/k8s-seaweedfs-healthcheck/seaweedfs-lib.sh new file mode 100755 index 00000000..439fadcc --- /dev/null +++ b/codebundles/k8s-seaweedfs-healthcheck/seaweedfs-lib.sh @@ -0,0 +1,480 @@ +#!/usr/bin/env bash +# Shared helpers for SeaweedFS healthcheck scripts. 
+# shellcheck disable=SC2034 +set -euo pipefail + +KUBECTL="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}" +MASTER_PORT="${SEAWEEDFS_MASTER_PORT:-9333}" +VOLUME_PORT="${SEAWEEDFS_VOLUME_PORT:-8080}" +FILER_PORT="${SEAWEEDFS_FILER_PORT:-8888}" +S3_PORT="${SEAWEEDFS_S3_PORT:-8333}" +SEAWEEDFS_CHART_PREFIX="${SEAWEEDFS_CHART_PREFIX:-seaweedfs}" + +issues_json='[]' +COMPONENT_MAP_FILE="${COMPONENT_MAP_FILE:-seaweedfs_component_map.json}" + +swf_add_issue() { + local title="$1" + local details="$2" + local severity="$3" + local next_steps="$4" + issues_json=$(echo "$issues_json" | jq \ + --arg title "$title" \ + --arg details "$details" \ + --argjson severity "$severity" \ + --arg next_steps "$next_steps" \ + '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]') +} + +swf_write_issues() { + local output_file="$1" + echo "$issues_json" >"$output_file" +} + +# jq filter args: chart_prefix (e.g. seaweedfs-), chart_exact, release_name +swf_jq_filter_args() { + jq -n \ + --arg chart_prefix "${SEAWEEDFS_CHART_PREFIX}-" \ + --arg chart_exact "${SEAWEEDFS_CHART:-}" \ + --arg release "${SEAWEEDFS_RELEASE_NAME:-}" \ + '{chart_prefix: $chart_prefix, chart_exact: $chart_exact, release: $release}' +} + +# Returns true when labels belong to the SeaweedFS Helm chart (subchart or standalone). +swf_labels_match() { + local labels_json="$1" + local filter_args + filter_args=$(swf_jq_filter_args) + echo "$labels_json" | jq -e --argjson f "$filter_args" ' + (.["app.kubernetes.io/name"]? == "seaweedfs") and + ( + if ($f.chart_exact | length) > 0 then + (.["helm.sh/chart"]? == $f.chart_exact) + elif (.["helm.sh/chart"]? // "" | length) > 0 then + (.["helm.sh/chart"] | startswith($f.chart_prefix)) + else + true + end + ) and + ( + if ($f.release | length) > 0 then + (.["app.kubernetes.io/instance"]? 
== $f.release) + else + true + end + ) + ' >/dev/null 2>&1 +} + +swf_filter_resource_list() { + local json="$1" + local filter_args + filter_args=$(swf_jq_filter_args) + echo "$json" | jq --argjson f "$filter_args" ' + .items |= map( + select( + (.metadata.labels["app.kubernetes.io/name"]? == "seaweedfs") and + ( + if ($f.chart_exact | length) > 0 then + (.metadata.labels["helm.sh/chart"]? == $f.chart_exact) + elif (.metadata.labels["helm.sh/chart"]? // "" | length) > 0 then + (.metadata.labels["helm.sh/chart"] | startswith($f.chart_prefix)) + else + (.metadata.name | test("seaweedfs"; "i")) + end + ) and + ( + if ($f.release | length) > 0 then + (.metadata.labels["app.kubernetes.io/instance"]? == $f.release) + else + true + end + ) + ) + ) + ' +} + +swf_filter_service_list() { + local json="$1" + local filter_args + filter_args=$(swf_jq_filter_args) + echo "$json" | jq --argjson f "$filter_args" ' + .items |= map( + select( + ( + (.metadata.labels["app.kubernetes.io/name"]? == "seaweedfs") and + ( + if ($f.chart_exact | length) > 0 then + (.metadata.labels["helm.sh/chart"]? == $f.chart_exact) + elif (.metadata.labels["helm.sh/chart"]? // "" | length) > 0 then + (.metadata.labels["helm.sh/chart"] | startswith($f.chart_prefix)) + else + false + end + ) + ) or ( + (.metadata.name | test("seaweedfs"; "i")) and + ( + if ($f.release | length) > 0 then + (.metadata.labels["app.kubernetes.io/instance"]? == $f.release) + else + true + end + ) + ) + ) + ) + ' +} + +swf_filter_pvc_list() { + local json="$1" + local filter_args + filter_args=$(swf_jq_filter_args) + echo "$json" | jq --argjson f "$filter_args" ' + .items |= map( + select( + (.metadata.labels["app.kubernetes.io/name"]? == "seaweedfs") and + ( + if ($f.release | length) > 0 then + (.metadata.labels["app.kubernetes.io/instance"]? == $f.release) + else + true + end + ) and + ( + (.metadata.name | test("seaweedfs"; "i")) or + (.metadata.labels["app.kubernetes.io/component"]? 
!= null) + ) + ) + ) + ' +} + +swf_resolve_release_name() { + if [[ -n "${SEAWEEDFS_RELEASE_NAME:-}" ]]; then + echo "${SEAWEEDFS_RELEASE_NAME}" + return 0 + fi + local raw filtered + raw=$("${KUBECTL}" get statefulset,deployment -n "${NAMESPACE}" --context "${CONTEXT}" \ + -l 'app.kubernetes.io/name=seaweedfs' -o json 2>/dev/null || echo '{"items":[]}') + filtered=$(swf_filter_resource_list "$raw") + local from_label + from_label=$(echo "$filtered" | jq -r '.items[0].metadata.labels["app.kubernetes.io/instance"] // empty' 2>/dev/null || true) + if [[ -n "$from_label" ]]; then + echo "$from_label" + return 0 + fi + echo "" +} + +swf_resolve_chart_label() { + if [[ -n "${SEAWEEDFS_CHART:-}" ]]; then + echo "${SEAWEEDFS_CHART}" + return 0 + fi + local raw filtered + raw=$("${KUBECTL}" get statefulset -n "${NAMESPACE}" --context "${CONTEXT}" \ + -l 'app.kubernetes.io/name=seaweedfs,app.kubernetes.io/component=master' -o json 2>/dev/null || echo '{"items":[]}') + filtered=$(swf_filter_resource_list "$raw") + echo "$filtered" | jq -r '.items[0].metadata.labels["helm.sh/chart"] // empty' 2>/dev/null || true +} + +swf_label_selector() { + local component="${1:-}" + local release + release=$(swf_resolve_release_name) + local parts=() + parts+=("app.kubernetes.io/name=seaweedfs") + if [[ -n "$release" ]]; then + parts+=("app.kubernetes.io/instance=${release}") + fi + if [[ -n "$component" ]]; then + parts+=("app.kubernetes.io/component=${component}") + fi + local IFS=',' + echo "${parts[*]}" +} + +swf_filter_pods_json() { + local json="$1" + swf_filter_resource_list "$json" | jq 'if .items|type == "array" then . 
else {items: .items} end' +} + +swf_find_pod() { + local component="$1" + local selector pod raw filtered + selector=$(swf_label_selector "$component") + pod=$("${KUBECTL}" get pods -n "${NAMESPACE}" --context "${CONTEXT}" \ + -l "$selector" --field-selector=status.phase=Running \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) + if [[ -n "$pod" ]]; then + echo "$pod" + return 0 + fi + raw=$("${KUBECTL}" get pods -n "${NAMESPACE}" --context "${CONTEXT}" -o json 2>/dev/null || echo '{"items":[]}') + filtered=$(swf_filter_resource_list "$raw") + pod=$(echo "$filtered" | jq -r --arg c "$component" \ + '.items[] | select(.status.phase=="Running") | select(.metadata.labels["app.kubernetes.io/component"]? == $c) | .metadata.name' \ + | head -n1 || true) + if [[ -n "$pod" ]]; then + echo "$pod" + return 0 + fi + echo "$filtered" | jq -r --arg c "$component" \ + '.items[] | select(.status.phase=="Running") | select(.metadata.name | test($c; "i")) | .metadata.name' \ + | head -n1 || true +} + +swf_count_running_pods() { + local component="${1:-}" + local selector raw filtered count + if [[ -n "$component" ]]; then + selector=$(swf_label_selector "$component") + count=$("${KUBECTL}" get pods -n "${NAMESPACE}" --context "${CONTEXT}" \ + -l "$selector" --field-selector=status.phase=Running -o json 2>/dev/null \ + | jq '.items | length' || echo 0) + if [[ "$count" =~ ^[0-9]+$ ]] && [[ "$count" -gt 0 ]]; then + echo "$count" + return 0 + fi + fi + raw=$("${KUBECTL}" get pods -n "${NAMESPACE}" --context "${CONTEXT}" -o json 2>/dev/null || echo '{"items":[]}') + filtered=$(swf_filter_resource_list "$raw") + if [[ -n "$component" ]]; then + echo "$filtered" | jq --arg c "$component" \ + '[.items[] | select(.status.phase=="Running") | select(.metadata.labels["app.kubernetes.io/component"]? 
== $c)] | length' + else + echo "$filtered" | jq '[.items[] | select(.status.phase=="Running")] | length' + fi +} + +swf_pod_http() { + local pod="$1" + local port="$2" + local path="$3" + [[ -z "$pod" ]] && return 1 + "${KUBECTL}" exec -n "${NAMESPACE}" --context "${CONTEXT}" "$pod" -- \ + sh -c "wget -qO- 'http://127.0.0.1:${port}${path}' 2>/dev/null || curl -sf 'http://127.0.0.1:${port}${path}' 2>/dev/null" 2>/dev/null +} + +# True when the pod accepts HTTP on port (any status, including 403). +swf_pod_http_listening() { + local pod="$1" + local port="$2" + local path="${3:-/}" + local out + [[ -z "$pod" ]] && return 1 + out=$("${KUBECTL}" exec -n "${NAMESPACE}" --context "${CONTEXT}" "$pod" -- \ + wget -S -O /dev/null "http://127.0.0.1:${port}${path}" 2>&1 || true) + if echo "$out" | grep -qiE 'HTTP/1\.[0-9]+ [0-9]'; then + return 0 + fi + out=$("${KUBECTL}" exec -n "${NAMESPACE}" --context "${CONTEXT}" "$pod" -- \ + curl -sI "http://127.0.0.1:${port}${path}" 2>/dev/null || true) + echo "$out" | grep -qiE 'HTTP/' +} + +swf_master_http() { + local path="$1" + if [[ -n "${SEAWEEDFS_MASTER_SERVICE:-}" ]]; then + local host="${SEAWEEDFS_MASTER_SERVICE%%:*}" + local port="${SEAWEEDFS_MASTER_SERVICE#*:}" + [[ "$port" == "$host" ]] && port="$MASTER_PORT" + local svc_pod + svc_pod=$(swf_find_pod "master") + [[ -z "$svc_pod" ]] && return 1 + "${KUBECTL}" exec -n "${NAMESPACE}" --context "${CONTEXT}" "$svc_pod" -- \ + sh -c "wget -qO- 'http://${host}:${port}${path}' 2>/dev/null || curl -sf 'http://${host}:${port}${path}' 2>/dev/null" 2>/dev/null + return $? 
+ fi + local pod + pod=$(swf_find_pod "master") + swf_pod_http "$pod" "$MASTER_PORT" "$path" +} + +swf_filer_http() { + local path="$1" + if [[ -n "${SEAWEEDFS_FILER_SERVICE:-}" ]]; then + local host="${SEAWEEDFS_FILER_SERVICE%%:*}" + local port="${SEAWEEDFS_FILER_SERVICE#*:}" + [[ "$port" == "$host" ]] && port="$FILER_PORT" + local pod + pod=$(swf_find_pod "filer") + [[ -z "$pod" ]] && return 1 + "${KUBECTL}" exec -n "${NAMESPACE}" --context "${CONTEXT}" "$pod" -- \ + sh -c "wget -qO- 'http://${host}:${port}${path}' 2>/dev/null || curl -sf 'http://${host}:${port}${path}' 2>/dev/null" 2>/dev/null + return $? + fi + local pod + pod=$(swf_find_pod "filer") + swf_pod_http "$pod" "$FILER_PORT" "$path" +} + +swf_volume_http() { + local pod="$1" + local path="$2" + swf_pod_http "$pod" "$VOLUME_PORT" "$path" +} + +swf_discover_components() { + local release chart + release=$(swf_resolve_release_name) + chart=$(swf_resolve_chart_label) + + local sts_json dep_json svc_json pvc_json sts_raw dep_raw svc_raw pvc_raw + sts_raw=$("${KUBECTL}" get statefulset -n "${NAMESPACE}" --context "${CONTEXT}" \ + -l 'app.kubernetes.io/name=seaweedfs' -o json 2>/dev/null || echo '{"items":[]}') + dep_raw=$("${KUBECTL}" get deployment -n "${NAMESPACE}" --context "${CONTEXT}" \ + -l 'app.kubernetes.io/name=seaweedfs' -o json 2>/dev/null || echo '{"items":[]}') + svc_raw=$("${KUBECTL}" get svc -n "${NAMESPACE}" --context "${CONTEXT}" \ + -l 'app.kubernetes.io/name=seaweedfs' -o json 2>/dev/null || echo '{"items":[]}') + pvc_raw=$("${KUBECTL}" get pvc -n "${NAMESPACE}" --context "${CONTEXT}" \ + -l 'app.kubernetes.io/name=seaweedfs' -o json 2>/dev/null || echo '{"items":[]}') + + sts_json=$(swf_filter_resource_list "$sts_raw") + dep_json=$(swf_filter_resource_list "$dep_raw") + svc_json=$(swf_filter_service_list "$svc_raw") + pvc_json=$(swf_filter_pvc_list "$pvc_raw") + + jq -n \ + --arg release "$release" \ + --arg chart "$chart" \ + --arg namespace "${NAMESPACE}" \ + --argjson statefulsets 
"$sts_json" \ + --argjson deployments "$dep_json" \ + --argjson services "$svc_json" \ + --argjson pvcs "$pvc_json" \ + '{ + release: $release, + chart: $chart, + namespace: $namespace, + statefulsets: [$statefulsets.items[] | {name: .metadata.name, component: (.metadata.labels["app.kubernetes.io/component"] // "unknown"), replicas: (.spec.replicas // 0), ready: (.status.readyReplicas // 0)}], + deployments: [$deployments.items[] | {name: .metadata.name, component: (.metadata.labels["app.kubernetes.io/component"] // "unknown"), replicas: (.spec.replicas // 0), ready: (.status.readyReplicas // 0)}], + services: [$services.items[] | {name: .metadata.name, type: .spec.type, ports: [.spec.ports[]? | {port: .port, name: .name}]}], + pvcs: [$pvcs.items[] | {name: .metadata.name, phase: (.status.phase // "Unknown"), capacity: (.status.capacity.storage // "unknown")}] + }' +} + +swf_seaweed_workloads() { + swf_discover_components | jq -c '.statefulsets[], .deployments[]' +} + +swf_parse_s3_credentials() { + if [[ -z "${seaweedfs_s3_credentials:-}" && -z "${SEAWEEDFS_S3_CREDENTIALS:-}" ]]; then + return 0 + fi + local raw="${seaweedfs_s3_credentials:-${SEAWEEDFS_S3_CREDENTIALS:-}}" + export AWS_ACCESS_KEY_ID + export AWS_SECRET_ACCESS_KEY + AWS_ACCESS_KEY_ID=$(echo "$raw" | jq -r '.AWS_ACCESS_KEY_ID // .access_key // .accessKey // empty') + AWS_SECRET_ACCESS_KEY=$(echo "$raw" | jq -r '.AWS_SECRET_ACCESS_KEY // .secret_key // .secretKey // empty') +} + +swf_find_s3_probe_pod() { + local pod + pod=$(swf_find_pod "s3") + if [[ -n "$pod" ]]; then + echo "$pod" + return 0 + fi + swf_find_pod "filer" +} + +swf_s3_endpoint_url() { + if [[ -n "${SEAWEEDFS_S3_ENDPOINT:-}" ]]; then + echo "${SEAWEEDFS_S3_ENDPOINT}" + return 0 + fi + local map_json svc_name + map_json=$(swf_discover_components) + svc_name=$(echo "$map_json" | jq -r ' + [.services[] | select(.name | test("s3"; "i"))][0].name // empty + ') + if [[ -n "$svc_name" ]]; then + echo 
# Port on which SeaweedFS components expose Prometheus metrics.
METRICS_PORT="${SEAWEEDFS_METRICS_PORT:-9327}"

# Print a single JSON list ({"items": [...]}) containing the filtered
# SeaweedFS StatefulSets followed by the filtered Deployments, as full
# Kubernetes objects. A failed kubectl query contributes an empty list.
swf_get_filtered_workloads_raw() {
  local sts_raw dep_raw
  sts_raw=$("${KUBECTL}" get statefulset -n "${NAMESPACE}" --context "${CONTEXT}" \
    -l 'app.kubernetes.io/name=seaweedfs' -o json 2>/dev/null || echo '{"items":[]}')
  dep_raw=$("${KUBECTL}" get deployment -n "${NAMESPACE}" --context "${CONTEXT}" \
    -l 'app.kubernetes.io/name=seaweedfs' -o json 2>/dev/null || echo '{"items":[]}')
  sts_raw=$(swf_filter_resource_list "$sts_raw")
  dep_raw=$(swf_filter_resource_list "$dep_raw")
  jq -n --argjson sts "$sts_raw" --argjson dep "$dep_raw" \
    '{items: (($sts.items // []) + ($dep.items // []))}'
}

# Print the first container's `command` array of a workload as one
# space-joined string, with backslashes (shell line continuations in the
# rendered command) replaced by spaces.
#
# Arguments:
#   $1 - workload object as JSON (StatefulSet or Deployment).
swf_weed_command_text() {
  local workload_json="$1"
  echo "$workload_json" | jq -r '
    (.spec.template.spec.containers[0].command // []) | join(" ")
  ' | tr '\\' ' '
}

# Fetch the /metrics page from a pod.
#
# Arguments:
#   $1 - pod name; returns 1 immediately when empty.
#   $2 - optional port. Defaults to METRICS_PORT so a SEAWEEDFS_METRICS_PORT
#        override is honoured; 9327 is used if METRICS_PORT is unset.
swf_fetch_pod_metrics() {
  local pod="$1"
  local port="${2:-${METRICS_PORT:-9327}}"
  [[ -z "$pod" ]] && return 1
  swf_pod_http "$pod" "$port" "/metrics"
}

# Print the chart version derived from the chart label, e.g.
# "seaweedfs-4.25.0" -> "4.25.0". Anything from the first underscore onwards
# is stripped.
#
# Environment:
#   SEAWEEDFS_CHART - optional explicit chart label; when empty the label is
#     resolved via swf_resolve_chart_label.
swf_chart_version() {
  local chart="${SEAWEEDFS_CHART:-}"
  if [[ -z "$chart" ]]; then
    chart=$(swf_resolve_chart_label)
  fi
  echo "$chart" | sed -E 's/^seaweedfs-//; s/_.*$//'
}

# Print the number of volume copies a SeaweedFS replication setting requires.
#
# A replication string "xyz" asks for x extra copies in other data centers,
# y in other racks and z on other servers in the same rack, so the total is
# 1 + x + y + z ("000" -> 1, "001" -> 2, "200" -> 3). Digits are summed
# rather than counted, because a digit of 2 means two extra copies.
# Non-digit characters are ignored.
#
# Arguments:
#   $1 - replication string (may be empty, which yields 1).
swf_replication_min_volumes() {
  local repl="$1"
  local extras=0
  local i c
  for ((i = 0; i < ${#repl}; i++)); do
    c="${repl:$i:1}"
    if [[ "$c" =~ ^[0-9]$ ]]; then
      extras=$((extras + c))
    fi
  done
  echo $((1 + extras))
}

# Print the value of the first sample of metric NAME in a Prometheus text
# exposition, matching either the bare name or the name followed by a label
# set. Prints nothing when the metric is absent.
#
# Arguments:
#   $1 - metrics text.
#   $2 - metric name (used inside an awk regular expression).
swf_metric_gauge_value() {
  local metrics="$1"
  local name="$2"
  echo "$metrics" | awk -v n="$name" '$1 ~ "^" n "\\{" || $1 == n {print $2; exit}'
}

# Print the sum of the values of all samples whose first field matches the
# awk regular expression PATTERN; prints 0 when nothing matches.
#
# Arguments:
#   $1 - metrics text.
#   $2 - awk regular expression applied to the first field of each line.
swf_metric_sum_matching() {
  local metrics="$1"
  local pattern="$2"
  echo "$metrics" | awk -v pat="$pattern" '$1 ~ pat {sum += $2} END {print sum+0}'
}

# Print the path of the capacity snapshot file for this namespace/release.
# The file lives under CODEBUNDLE_TEMP_DIR (default /tmp); "all" stands in for
# an empty release name.
swf_capacity_snapshot_path() {
  local release
  release=$(swf_resolve_release_name)
  local base="${CODEBUNDLE_TEMP_DIR:-/tmp}"
  echo "${base}/seaweedfs_capacity_${NAMESPACE}_${release:-all}.json"
}
"NAMESPACE", + "value": "" + }, + { + "name": "WEED_MYSQL_USERNAME", + "value": "" + }, + { + "name": "WEED_MYSQL_PASSWORD", + "value": "" + }, + { + "name": "SEAWEEDFS_FULLNAME", + "value": "rw-seaweedfs" + }, + { + "name": "WEED_CLUSTER_DEFAULT", + "value": "sw" + }, + { + "name": "WEED_CLUSTER_SW_FILER", + "value": "rw-seaweedfs-filer-client.runwhen-env-test:8888" + }, + { + "name": "WEED_CLUSTER_SW_MASTER", + "value": "rw-seaweedfs-master.runwhen-env-test:9333" + }, + { + "name": "WEED_FILER_BUCKETS_FOLDER", + "value": "/buckets" + }, + { + "name": "WEED_FILER_OPTIONS_RECURSIVE_DELETE", + "value": "false" + }, + { + "name": "WEED_LEVELDB2_ENABLED", + "value": "true" + }, + { + "name": "WEED_MYSQL_CONNECTION_MAX_IDLE", + "value": "5" + }, + { + "name": "WEED_MYSQL_CONNECTION_MAX_LIFETIME_SECONDS", + "value": "600" + }, + { + "name": "WEED_MYSQL_CONNECTION_MAX_OPEN", + "value": "75" + }, + { + "name": "WEED_MYSQL_DATABASE", + "value": "sw_database" + }, + { + "name": "WEED_MYSQL_ENABLED", + "value": "false" + }, + { + "name": "WEED_MYSQL_HOSTNAME", + "value": "mysql-db-host" + }, + { + "name": "WEED_MYSQL_INTERPOLATEPARAMS", + "value": "true" + }, + { + "name": "WEED_MYSQL_PORT", + "value": "3306" + } + ], + "volumeMounts": [ + "/logs/", + "/data", + "/tmp" + ], + "volumes": [ + { + "name": "db-schema-config-volume", + "claim": "" + }, + { + "name": "tmp", + "claim": "" + } + ] + }, + { + "kind": "StatefulSet", + "name": "rw-seaweedfs-master", + "component": "master", + "replicas": 1, + "command": "/bin/sh -ec exec /usr/bin/weed \\\n-logdir=/logs \\\n-v=1 \\\nmaster \\\n-port=9333 \\\n-mdir=/data \\\n-ip.bind=0.0.0.0 \\\n-defaultReplication=000 \\\n-metricsPort=9327 \\\n-volumeSizeLimitMB=1024 \\\n-electionTimeout=10s \\\n-heartbeatInterval=300ms \\\n-ip=${POD_NAME}.rw-seaweedfs-master.runwhen-env-test \\\n-peers=rw-seaweedfs-master-0.rw-seaweedfs-master.runwhen-env-test:9333 \\\n", + "env": [ + { + "name": "POD_IP", + "value": "" + }, + { + "name": "POD_NAME", + 
"value": "" + }, + { + "name": "NAMESPACE", + "value": "" + }, + { + "name": "SEAWEEDFS_FULLNAME", + "value": "rw-seaweedfs" + }, + { + "name": "WEED_CLUSTER_DEFAULT", + "value": "sw" + }, + { + "name": "WEED_CLUSTER_SW_FILER", + "value": "rw-seaweedfs-filer-client.runwhen-env-test:8888" + }, + { + "name": "WEED_CLUSTER_SW_MASTER", + "value": "rw-seaweedfs-master.runwhen-env-test:9333" + }, + { + "name": "WEED_MASTER_VOLUME_GROWTH_COPY_1", + "value": "7" + }, + { + "name": "WEED_MASTER_VOLUME_GROWTH_COPY_2", + "value": "6" + }, + { + "name": "WEED_MASTER_VOLUME_GROWTH_COPY_3", + "value": "3" + }, + { + "name": "WEED_MASTER_VOLUME_GROWTH_COPY_OTHER", + "value": "1" + } + ], + "volumeMounts": [ + "/data", + "/logs/", + "/etc/seaweedfs/master.toml", + "/tmp" + ], + "volumes": [ + { + "name": "master-config", + "claim": "" + }, + { + "name": "tmp", + "claim": "" + } + ] + }, + { + "kind": "StatefulSet", + "name": "rw-seaweedfs-volume", + "component": "volume", + "replicas": 1, + "command": "/bin/sh -ec exec /usr/bin/weed \\\n -logdir=/logs \\\n -v=1 \\\n volume \\\n -port=8080 \\\n -metricsPort=9327 \\\n -dir /data1 \\\n -max 120 \\\n -ip.bind=0.0.0.0 \\\n -readMode=proxy \\\n -minFreeSpacePercent=1 \\\n -ip=${POD_NAME}.rw-seaweedfs-volume.runwhen-env-test \\\n -compactionMBps=50 \\\n -master=rw-seaweedfs-master-0.rw-seaweedfs-master.runwhen-env-test:9333 \\\n", + "env": [ + { + "name": "POD_NAME", + "value": "" + }, + { + "name": "NAMESPACE", + "value": "" + }, + { + "name": "HOST_IP", + "value": "" + }, + { + "name": "SEAWEEDFS_FULLNAME", + "value": "rw-seaweedfs" + }, + { + "name": "WEED_CLUSTER_DEFAULT", + "value": "sw" + }, + { + "name": "WEED_CLUSTER_SW_FILER", + "value": "rw-seaweedfs-filer-client.runwhen-env-test:8888" + }, + { + "name": "WEED_CLUSTER_SW_MASTER", + "value": "rw-seaweedfs-master.runwhen-env-test:9333" + } + ], + "volumeMounts": [ + "/data1/", + "/logs/", + "/tmp" + ], + "volumes": [ + { + "name": "tmp", + "claim": "" + } + ] + }, + { + "kind": 
#!/usr/bin/env bash
set -euo pipefail
# Lightweight SLI dimension probe; prints JSON to stdout.
#
# Output: one JSON object {workload, master, slots, connectivity}, where each
# field is 1 (passing) or 0 (failing).
#
# Required environment: CONTEXT, NAMESPACE.
# Optional environment: MIN_FREE_VOLUME_SLOTS (default 1).
: "${CONTEXT:?Must set CONTEXT}"
: "${NAMESPACE:?Must set NAMESPACE}"

# shellcheck disable=SC1091
source seaweedfs-lib.sh

# Workload dimension: fails when any workload that wants replicas has fewer
# ready replicas than requested.
dim_workload=1
inventory_json=$(swf_discover_components)
unready_count=$(jq '[.statefulsets[], .deployments[] | select(.replicas > 0 and .ready < .replicas)] | length' <<<"$inventory_json")
if [[ "$unready_count" -gt 0 ]]; then
  dim_workload=0
fi

# Master dimension: passes only when the health endpoint answers and the body
# contains one of the expected keywords.
dim_master=0
if healthz_body=$(swf_master_http "/cluster/healthz" 2>/dev/null) \
  && grep -qiE 'ok|healthy|success' <<<"$healthz_body"; then
  dim_master=1
fi

# Slots dimension: fails when /dir/status is unreachable or reports fewer free
# slots than the threshold. An unparseable answer counts as 999 free slots.
slot_threshold="${MIN_FREE_VOLUME_SLOTS:-1}"
dim_slots=0
if topology_json=$(swf_master_http "/dir/status" 2>/dev/null); then
  dim_slots=1
  free_slots=$(jq -r '.Topology.Free // .topology.free // 999' <<<"$topology_json" 2>/dev/null || echo 999)
  if [[ "$free_slots" =~ ^[0-9]+$ && "$free_slots" -lt "$slot_threshold" ]]; then
    dim_slots=0
  fi
fi

# Connectivity dimension: needs a filer pod that answers on /healthz or,
# failing that, on /status.
dim_connectivity=0
filer_name=$(swf_find_pod "filer")
if [[ -n "$filer_name" ]]; then
  if swf_filer_http "/healthz" >/dev/null 2>&1 || swf_filer_http "/status" >/dev/null 2>&1; then
    dim_connectivity=1
  fi
fi

jq -n \
  --argjson workload "$dim_workload" \
  --argjson master "$dim_master" \
  --argjson slots "$dim_slots" \
  --argjson connectivity "$dim_connectivity" \
  '{workload: $workload, master: $master, slots: $slots, connectivity: $connectivity}'
+Metadata Author rw-codebundle-agent +Metadata Display Name Kubernetes SeaweedFS Storage Health Check +Metadata Supports Kubernetes SeaweedFS storage health S3 + +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform +Library Collections + +Suite Setup Suite Initialization + + +*** Keywords *** +Suite Initialization + ${kubeconfig}= RW.Core.Import Secret + ... kubeconfig + ... type=string + ... description=Kubernetes kubeconfig for cluster access. + ... pattern=\w* + ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY + ... type=string + ... description=Kubernetes CLI binary (kubectl or oc). + ... enum=[kubectl,oc] + ... default=kubectl + ${CONTEXT}= RW.Core.Import User Variable CONTEXT + ... type=string + ... description=Kubernetes context for the target cluster. + ... pattern=\w* + ${NAMESPACE}= RW.Core.Import User Variable NAMESPACE + ... type=string + ... description=Namespace where SeaweedFS is deployed. + ... pattern=\w* + ${SEAWEEDFS_RELEASE_NAME}= RW.Core.Import User Variable SEAWEEDFS_RELEASE_NAME + ... type=string + ... description=Helm release instance label (parent release for subchart installs). + ... default= + ... pattern=.* + ${SEAWEEDFS_CHART}= RW.Core.Import User Variable SEAWEEDFS_CHART + ... type=string + ... description=Exact helm.sh/chart label for the SeaweedFS subchart (e.g. seaweedfs-4.25.0). + ... default= + ... pattern=.* + ${MIN_FREE_VOLUME_SLOTS}= RW.Core.Import User Variable MIN_FREE_VOLUME_SLOTS + ... type=string + ... description=Minimum free volume slots required for a passing slots score. + ... default=1 + ... 
pattern=^\d+$ + Set Suite Variable ${kubeconfig} ${kubeconfig} + Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} + Set Suite Variable ${CONTEXT} ${CONTEXT} + Set Suite Variable ${NAMESPACE} ${NAMESPACE} + Set Suite Variable ${SEAWEEDFS_RELEASE_NAME} ${SEAWEEDFS_RELEASE_NAME} + Set Suite Variable ${SEAWEEDFS_CHART} ${SEAWEEDFS_CHART} + Set Suite Variable ${MIN_FREE_VOLUME_SLOTS} ${MIN_FREE_VOLUME_SLOTS} + Set Suite Variable + ... ${env} + ... {"KUBECONFIG":"./${kubeconfig.key}","CONTEXT":"${CONTEXT}","NAMESPACE":"${NAMESPACE}","KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}","SEAWEEDFS_RELEASE_NAME":"${SEAWEEDFS_RELEASE_NAME}","SEAWEEDFS_CHART":"${SEAWEEDFS_CHART}","MIN_FREE_VOLUME_SLOTS":"${MIN_FREE_VOLUME_SLOTS}"} + + +*** Tasks *** +Score SeaweedFS Health Dimensions in Namespace `${NAMESPACE}` + [Documentation] Runs a compact probe returning binary scores for workload, master, slots, and connectivity dimensions. + [Tags] access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=sli-seaweedfs-dimensions.sh + ... env=${env} + ... secret_file__kubeconfig=${kubeconfig} + ... timeout_seconds=120 + ... include_in_history=false + ... cmd_override=./sli-seaweedfs-dimensions.sh + + TRY + ${dims}= Evaluate json.loads(r'''${result.stdout}''') json + ${workload}= Get From Dictionary ${dims} workload + ${master}= Get From Dictionary ${dims} master + ${slots}= Get From Dictionary ${dims} slots + ${connectivity}= Get From Dictionary ${dims} connectivity + ${workload}= Convert To Integer ${workload} + ${master}= Convert To Integer ${master} + ${slots}= Convert To Integer ${slots} + ${connectivity}= Convert To Integer ${connectivity} + EXCEPT + Log SLI dimension JSON parse failed; reporting zero health. 
#!/usr/bin/env bash
set -euo pipefail
set -x
# Performs minimal S3 ListBuckets and put/get/delete of a temporary object.
#
# The probe runs the aws CLI inside the s3 pod (or the filer pod when there is
# no dedicated s3 pod) against the pod-local S3 port, and records findings in
# s3_gateway_issues.json through swf_add_issue / swf_write_issues.
#
# Required environment: CONTEXT, NAMESPACE.
# Optional environment: S3_PROBE_BUCKET, seaweedfs_s3_credentials /
# SEAWEEDFS_S3_CREDENTIALS.
: "${CONTEXT:?Must set CONTEXT}"
: "${NAMESPACE:?Must set NAMESPACE}"

OUTPUT_FILE="s3_gateway_issues.json"
PROBE_PREFIX="runwhen-seaweedfs-probe"
# shellcheck disable=SC1091
source seaweedfs-lib.sh

# Print a one-line summary per recorded issue; runs on every exit path.
print_report() {
  { set +x; } 2>/dev/null || true
  echo "=== SeaweedFS S3 gateway probe ==="
  jq -r '.[] | " - [sev=\(.severity)] \(.title)"' "$OUTPUT_FILE" 2>/dev/null || true
}
trap print_report EXIT

s3_pod=$(swf_find_pod "s3")
filer_pod=$(swf_find_pod "filer")
probe_pod="${s3_pod:-$filer_pod}"

if [[ -z "$probe_pod" ]]; then
  swf_add_issue \
    "S3 gateway probe skipped: no filer or s3 pod in namespace \`${NAMESPACE}\`" \
    "S3 is served from the filer or dedicated s3 deployment in Helm installs." \
    3 \
    "Enable filer/s3 components in Helm values."
  swf_write_issues "$OUTPUT_FILE"
  exit 0
fi

# Detect if S3 port responds on the probe pod (dedicated s3 deployment or embedded filer S3)
if ! swf_pod_http_listening "$probe_pod" "$S3_PORT" "/"; then
  swf_add_issue \
    "SeaweedFS S3 endpoint not listening on port ${S3_PORT} in \`${NAMESPACE}\`" \
    "S3 may be disabled in Helm values; probe skipped without raising critical failure." \
    3 \
    "Enable s3 component or filer.s3 and expose port ${S3_PORT} if S3 is required."
  swf_write_issues "$OUTPUT_FILE"
  exit 0
fi

# The probe needs an aws binary inside the pod. Report its absence explicitly
# instead of letting later steps surface it as a misleading put failure.
if ! "${KUBECTL}" exec -n "${NAMESPACE}" --context "${CONTEXT}" "$probe_pod" -- \
  sh -c 'command -v aws >/dev/null 2>&1' </dev/null >/dev/null 2>&1; then
  swf_add_issue \
    "S3 gateway probe skipped: aws CLI not available in pod \`${probe_pod}\` in namespace \`${NAMESPACE}\`" \
    "The put/get/delete probe runs the aws CLI inside the pod; it was not found there (or the exec into the pod failed)." \
    4 \
    "Use an image that ships the aws CLI for the probe pod, or verify the S3 gateway from a client that has it."
  swf_write_issues "$OUTPUT_FILE"
  exit 0
fi

# Stop tracing before any credential is read: xtrace would otherwise print
# the secret key as part of the traced assignments and kubectl commands.
{ set +x; } 2>/dev/null

swf_parse_s3_credentials

bucket="${S3_PROBE_BUCKET:-runwhen-healthcheck}"
object_key="${PROBE_PREFIX}/$(date +%s)-$$.txt"
payload="runwhen seaweedfs probe $(date -u +%Y-%m-%dT%H:%M:%SZ)"
endpoint="http://127.0.0.1:${S3_PORT}"
object_url="s3://${bucket}/${object_key}"

# Run `aws --endpoint-url <endpoint> "$@"` inside the probe pod.
#
# Credentials are handed to the remote process as `env` arguments rather than
# being spliced into an `sh -c` string, so special characters in a key cannot
# break or alter the remote command. `-i` forwards this function's stdin,
# which the put step uses to stream the payload; other callers redirect stdin
# from /dev/null.
run_aws_in_probe_pod() {
  local -a pod_env=("AWS_EC2_METADATA_DISABLED=true")
  if [[ -n "${AWS_ACCESS_KEY_ID:-}" ]]; then
    pod_env+=(
      "AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}"
      "AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-}"
    )
  fi
  "${KUBECTL}" exec -i -n "${NAMESPACE}" --context "${CONTEXT}" "$probe_pod" -- \
    env "${pod_env[@]}" aws --endpoint-url "$endpoint" "$@"
}

# List buckets
if ! list_out=$(run_aws_in_probe_pod s3 ls </dev/null 2>&1); then
  if grep -qiE 'Unable to locate credentials|AccessDenied|403|401' <<<"$list_out"; then
    swf_add_issue \
      "SeaweedFS S3 ListBuckets failed (auth) in namespace \`${NAMESPACE}\`" \
      "$list_out" \
      2 \
      "Provide seaweedfs_s3_credentials secret or configure anonymous access for probe bucket."
  else
    swf_add_issue \
      "SeaweedFS S3 ListBuckets failed in namespace \`${NAMESPACE}\`" \
      "$list_out" \
      2 \
      "Verify filer S3 configuration and IAM user mappings."
  fi
  swf_write_issues "$OUTPUT_FILE"
  exit 0
fi

# Create bucket if missing (best effort)
if [[ -z "${S3_PROBE_BUCKET:-}" ]]; then
  run_aws_in_probe_pod s3 mb "s3://${bucket}" </dev/null 2>/dev/null || true
fi

# Upload: stream the payload over stdin ("-" as the source), so no temporary
# file is needed locally or inside the pod.
if ! printf '%s\n' "$payload" | run_aws_in_probe_pod s3 cp - "$object_url" >/dev/null 2>&1; then
  swf_add_issue \
    "SeaweedFS S3 put object failed in namespace \`${NAMESPACE}\`" \
    "Could not upload s3://${bucket}/${object_key}" \
    2 \
    "Check filer S3 auth, bucket policy, and filer-to-volume connectivity."
  swf_write_issues "$OUTPUT_FILE"
  exit 0
fi

if ! got=$(run_aws_in_probe_pod s3 cp "$object_url" - </dev/null 2>/dev/null); then
  swf_add_issue \
    "SeaweedFS S3 get object failed in namespace \`${NAMESPACE}\`" \
    "Uploaded object s3://${bucket}/${object_key} could not be read back." \
    2 \
    "Inspect filer and volume logs for write/read errors."
  run_aws_in_probe_pod s3 rm "$object_url" </dev/null >/dev/null 2>&1 || true
  swf_write_issues "$OUTPUT_FILE"
  exit 0
fi

# Compare against the exact payload that was uploaded, timestamp included.
if [[ "$got" != *"$payload"* ]]; then
  swf_add_issue \
    "SeaweedFS S3 object content mismatch in namespace \`${NAMESPACE}\`" \
    "Read payload did not match uploaded probe object." \
    2 \
    "Investigate filer metadata store and erasure coding health."
fi

# Best-effort cleanup of the probe object.
run_aws_in_probe_pod s3 rm "$object_url" </dev/null >/dev/null 2>&1 || true

swf_write_issues "$OUTPUT_FILE"