diff --git a/codebundles/aws-account-cost-health/SKILL-TEMPLATE.md b/codebundles/aws-account-cost-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..770adb39a --- /dev/null +++ b/codebundles/aws-account-cost-health/SKILL-TEMPLATE.md @@ -0,0 +1,117 @@ +--- +name: aws-account-cost-health +kind: skill-template +description: AWS Account Cost Report: Generates historical cost breakdown reports by service using the AWS Cost Explorer API.... Use when triaging or monitoring AWS, Cost, Management workloads with skill templa... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [AWS, Cost, Management, Cost, Reporting, Trend, Analysis, Reserved, Instances, Savings, Plans] +resource_types: [aws_resource] +access: read-only +--- + +# AWS Account Cost Report + +## Summary + +This codebundle monitors AWS account cost trends using the Cost Explorer API and provides Reserved Instance and Savings Plans purchase recommendations. + +See [README.md](README.md) for additional context. + +## Tools + +### Generate AWS Cost Report By Service for Account `${AWS_ACCOUNT_NAME}` + +Generates a detailed cost breakdown report for the configured lookback period showing actual spending by AWS service. Includes period-over-period comparison and raises an issue if cost increase exceeds configured threshold. 
+ +- **Robot task name**: Generate AWS Cost Report By Service for Account `${AWS_ACCOUNT_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `aws_cost_report.sh` +- **Tags**: `AWS`, `Cost`, `Analysis`, `Cost`, `Management`, `Reporting`, `Trend`, `Analysis`, `access:read-only`, `data:config` +- **Reads**: `AWS_ACCOUNT_NAME`, `TIMEOUT_SECONDS` +- **Writes**: `aws_cost_trend_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze AWS Reserved Instance and Savings Plans Recommendations for Account `${AWS_ACCOUNT_NAME}` + +Queries AWS Cost Explorer for Reserved Instance and Savings Plans purchase recommendations. Calculates potential savings from commitments for EC2, RDS, ElastiCache, and Compute Savings Plans. + +- **Robot task name**: Analyze AWS Reserved Instance and Savings Plans Recommendations for Account `${AWS_ACCOUNT_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `aws_ri_recommendations.sh` +- **Tags**: `AWS`, `Cost`, `Analysis`, `Reserved`, `Instances`, `Savings`, `Plans`, `access:read-only`, `data:config` +- **Reads**: `AWS_ACCOUNT_NAME`, `TIMEOUT_SECONDS` +- **Writes**: `aws_ri_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AWS_REGION` | string | AWS Region for Cost Explorer API calls | `us-east-1` | no | +| `AWS_ACCOUNT_NAME` | string | AWS account name or alias for display purposes | `""` | no | +| `COST_ANALYSIS_LOOKBACK_DAYS` | string | Number of days to look back for cost analysis (default: 30) | `30` | no | +| `COST_INCREASE_THRESHOLD` | string | Percentage threshold for cost increase alerts. An issue will be raised if period-over-period cost increase exceeds this value (e.g., 10 for 10% increase). 
| `10` | no | +| `OUTPUT_FORMAT` | string | Output format for cost report: table, csv, json, or all (default: table) | `table` | no | +| `COST_BUDGET` | string | Budget threshold in USD for the analysis period. An issue will be raised if total costs exceed this value. Set to 0 to disable (default: 0). | `0` | no | +| `COST_CONCENTRATION_THRESHOLD` | string | Maximum percentage of total cost that any single service should represent. An issue will be raised if exceeded (default: 25). | `25` | no | +| `TIMEOUT_SECONDS` | string | Timeout in seconds for tasks (default: 600 = 10 minutes). | `600` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- `aws_cost_trend_issues.json` +- `aws_ri_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/aws-account-cost-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/aws-account-cost-health +export AWS_REGION=... +export AWS_ACCOUNT_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +export COST_INCREASE_THRESHOLD=... +export OUTPUT_FORMAT=... +export COST_BUDGET=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/aws-account-cost-health +export AWS_REGION=... +export AWS_ACCOUNT_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +export COST_INCREASE_THRESHOLD=... +bash aws_cost_report.sh +bash aws_ri_recommendations.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `aws_cost_report.sh` — Bash helper script `aws_cost_report.sh`. 
+- `aws_ri_recommendations.sh` — Bash helper script `aws_ri_recommendations.sh`. diff --git a/codebundles/aws-cloudwatch-overused-ec2/SKILL-TEMPLATE.md b/codebundles/aws-cloudwatch-overused-ec2/SKILL-TEMPLATE.md new file mode 100644 index 000000000..a10d548e3 --- /dev/null +++ b/codebundles/aws-cloudwatch-overused-ec2/SKILL-TEMPLATE.md @@ -0,0 +1,82 @@ +--- +name: aws-cloudwatch-overused-ec2 +kind: skill-template +description: Queries AWS CloudWatch for a list of EC2 instances with a high amount of resource utilization, raising issues when... Use when triaging or monitoring AWS, CloudWatch workloads with skill template `... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [AWS, CloudWatch] +resource_types: [ec2_instance] +access: read-only +--- + +# AWS CloudWatch Overutilized EC2 Inspection + +## Summary + +This taskset can be used to check a fleet of EC2 instances and return the list of instances which are classified as overutilized. + +See [README.md](README.md) for additional context. + +## Tools + +### Check For Overutilized Ec2 Instances + +Fetches CloudWatch metrics for a list of EC2 instances and raises issues if they're over-utilized based on a configurable threshold. + +- **Robot task name**: Check For Overutilized Ec2 Instances +- **Robot file**: `runbook.robot` +- **Tags**: `cloudwatch`, `metrics`, `ec2`, `utilization`, `data:config` +- **Reads**: `AWS_DEFAULT_REGION`, `UTILIZATION_THRESHOLD` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AWS_DEFAULT_REGION` | string | The AWS region to scope API requests to. | `us-west-1` | no | +| `UTILIZATION_THRESHOLD` | string | The threshold at which an instance is determined as overutilized. 
| `0.8` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `aws_credentials` | AWS credentials from the workspace (from aws-auth block; e.g. aws:access_key@cli, aws:irsa@cli). | yes | + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/aws-cloudwatch-overused-ec2/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/aws-cloudwatch-overused-ec2 +export AWS_DEFAULT_REGION=... +export UTILIZATION_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/aws-eks-health/SKILL-TEMPLATE.md b/codebundles/aws-eks-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..b66563c41 --- /dev/null +++ b/codebundles/aws-eks-health/SKILL-TEMPLATE.md @@ -0,0 +1,142 @@ +--- +name: aws-eks-health +kind: skill-template +description: Checks the health status of an EKS cluster including node groups, add-ons, and Fargate profiles. Use when triaging or monitoring AWS, EKS, Fargate workloads with skill template `aws-eks-health`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [AWS, EKS, Fargate] +resource_types: [eks_cluster] +access: read-only +--- + +# AWS EKS Cluster Health + +## Summary + +Comprehensive health checks for Amazon EKS clusters in a given AWS region. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Check EKS Cluster `${EKS_CLUSTER_NAME}` Health in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + +Checks overall EKS cluster health including status, configuration, add-ons, and node group summary. + +- **Robot task name**: Check EKS Cluster `${EKS_CLUSTER_NAME}` Health in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_eks_cluster_health.sh` +- **Tags**: `EKS`, `Cluster`, `Health`, `AWS`, `Kubernetes`, `access:read-only`, `data:config` +- **Reads**: `AWS_REGION`, `EKS_CLUSTER_NAME` +- **Writes**: `eks_cluster_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Fargate Profile Health for EKS Cluster `${EKS_CLUSTER_NAME}` in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + +Checks the health status of all Fargate profiles for the EKS cluster. + +- **Robot task name**: Check Fargate Profile Health for EKS Cluster `${EKS_CLUSTER_NAME}` in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_eks_fargate_cluster_health_status.sh` +- **Tags**: `EKS`, `Fargate`, `Cluster`, `Health`, `AWS`, `Kubernetes`, `Pods`, `access:read-only`, `data:config` +- **Reads**: `AWS_REGION`, `EKS_CLUSTER_NAME` +- **Writes**: `eks_fargate_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Kubernetes Version Support for EKS Cluster `${EKS_CLUSTER_NAME}` in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + +Checks whether the EKS cluster is running a deprecated or extended-support Kubernetes version and estimates cost impact. AWS charges $0.60/hr/cluster for versions in extended support (6x the $0.10/hr standard-support rate). 
+ +- **Robot task name**: Check Kubernetes Version Support for EKS Cluster `${EKS_CLUSTER_NAME}` in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_eks_version_support.sh` +- **Tags**: `EKS`, `Version`, `Deprecation`, `Cost`, `AWS`, `Kubernetes`, `access:read-only`, `data:config` +- **Reads**: `AWS_REGION`, `EKS_CLUSTER_NAME` +- **Writes**: `eks_version_support.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Node Group Health for EKS Cluster `${EKS_CLUSTER_NAME}` in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + +Checks the health and scaling status of all managed node groups for the EKS cluster. + +- **Robot task name**: Check Node Group Health for EKS Cluster `${EKS_CLUSTER_NAME}` in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_eks_nodegroup_health.sh` +- **Tags**: `AWS`, `EKS`, `Node`, `Health`, `Kubernetes`, `Nodes`, `access:read-only`, `data:config` +- **Reads**: `AWS_REGION`, `EKS_CLUSTER_NAME` +- **Writes**: `eks_nodegroup_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AWS_REGION` | string | AWS Region | — | yes | +| `EKS_CLUSTER_NAME` | string | The name of the EKS cluster to check. | — | yes | +| `AWS_ACCOUNT_NAME` | string | AWS account name or alias for display purposes. | `Unknown` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `aws_credentials` | AWS credentials from the workspace (from aws-auth block; e.g. aws:access_key@cli, aws:irsa@cli). 
| yes | + +## Outputs + +- `eks_cluster_health.json` +- `eks_fargate_health.json` +- `eks_version_support.json` +- `eks_nodegroup_health.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/aws-eks-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/aws-eks-health +export AWS_REGION=... +export EKS_CLUSTER_NAME=... +export AWS_ACCOUNT_NAME=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/aws-eks-health +export AWS_REGION=... +export EKS_CLUSTER_NAME=... +export AWS_ACCOUNT_NAME=... +bash check_eks_cluster_health.sh +bash check_eks_fargate_cluster_health_status.sh +bash check_eks_nodegroup_health.sh +bash check_eks_version_support.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `check_eks_cluster_health.sh` — Bash helper script `check_eks_cluster_health.sh`. +- `check_eks_fargate_cluster_health_status.sh` — Bash helper script `check_eks_fargate_cluster_health_status.sh`. +- `check_eks_nodegroup_health.sh` — Bash helper script `check_eks_nodegroup_health.sh`. +- `check_eks_version_support.sh` — Bash helper script `check_eks_version_support.sh`. 
diff --git a/codebundles/aws-elasticache-redis-health/SKILL-TEMPLATE.md b/codebundles/aws-elasticache-redis-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..89f4972b5 --- /dev/null +++ b/codebundles/aws-elasticache-redis-health/SKILL-TEMPLATE.md @@ -0,0 +1,118 @@ +--- +name: aws-elasticache-redis-health +kind: skill-template +description: Checks the health status of ElastiCache Redis in the given region. Use when triaging or monitoring AWS, Elasticache, Redis workloads with skill template `aws-elasticache-redis-health`. +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [AWS, Elasticache, Redis] +resource_types: [elasticache_cluster] +access: read-only +--- + +# AWS ElastiCache Health Check + +## Summary + +This runbook provides a comprehensive guide to managing and troubleshooting AWS ElastiCache Redis configurations. + +See [README.md](README.md) for additional context. + +## Tools + +### Scan AWS Elasticache Redis Status in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + +Checks the high-level metrics and status of the ElastiCache Redis instances in the region. + +- **Robot task name**: Scan AWS Elasticache Redis Status in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Tags**: `AWS`, `Elasticache`, `configuration`, `endpoint`, `configuration`, `access:read-only`, `data:config` +- **Reads**: `AWS_REGION` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Monitors the health status of ElastiCache Redis in the AWS region. 
+ +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Scan ElastiCaches in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + +Performs a broad health scan of all Elasticache instances in the region. + +- **Robot task name**: Scan ElastiCaches in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` +- **Sub-metric name**: `redis_health` +- **Tags**: `bash`, `script`, `AWS`, `Elasticache`, `Health`, `data:config` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AWS_REGION` | string | AWS Region | — | yes | +| `AWS_ACCOUNT_NAME` | string | AWS account name or alias for display purposes. | `Unknown` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `aws_credentials` | AWS credentials from the workspace (from aws-auth block; e.g. aws:access_key@cli, aws:irsa@cli). | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/aws-elasticache-redis-health/runbook.robot` +- **Monitor**: `codebundles/aws-elasticache-redis-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/aws-elasticache-redis-health +export AWS_REGION=... +export AWS_ACCOUNT_NAME=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/aws-elasticache-redis-health +export AWS_REGION=... 
+export AWS_ACCOUNT_NAME=... +bash analyze_aws_elasticache_redis_metrics.sh +bash monitor_redis_performance.sh +bash redis_status_scan.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `analyze_aws_elasticache_redis_metrics.sh` — Bash helper script `analyze_aws_elasticache_redis_metrics.sh`. +- `monitor_redis_performance.sh` — Bash helper script `monitor_redis_performance.sh`. +- `redis_status_scan.sh` — Bash helper script `redis_status_scan.sh`. diff --git a/codebundles/aws-lambda-health/SKILL-TEMPLATE.md b/codebundles/aws-lambda-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..3d0276e5c --- /dev/null +++ b/codebundles/aws-lambda-health/SKILL-TEMPLATE.md @@ -0,0 +1,142 @@ +--- +name: aws-lambda-health +kind: skill-template +description: Scans for AWS Lambda invocation errors. Use when triaging or monitoring AWS, Lambda workloads with skill template `aws-lambda-health`. +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [AWS, Lambda] +resource_types: [lambda_function] +access: read-only +--- + +# AWS Lambda Health Check + +## Summary + +This runbook provides a comprehensive guide to managing and troubleshooting AWS Lambda functions. + +See [README.md](README.md) for additional context. + +## Tools + +### List Lambda Versions and Runtimes in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + +This script is designed to list all the versions and runtimes of a specified AWS Lambda function. 
+ +- **Robot task name**: List Lambda Versions and Runtimes in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Tags**: `AWS`, `Lambda`, `Versions`, `Runtimes`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze AWS Lambda Invocation Errors in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + +This bash script is designed to analyze AWS Lambda Invocation Errors for a specified function within a specified region. + +- **Robot task name**: Analyze AWS Lambda Invocation Errors in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Tags**: `AWS`, `Lambda`, `Error`, `Analysis`, `Invocation`, `Errors`, `CloudWatch`, `Logs`, `access:read-only`, `data:logs-regexp` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Monitor AWS Lambda Performance Metrics in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + +This script is a bash utility for AWS Lambda functions that lists their notable metrics. 
+ +- **Robot task name**: Monitor AWS Lambda Performance Metrics in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Tags**: `AWS`, `Lambda`, `CloudWatch`, `Logs`, `Metrics`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Monitor AWS Lambda Invocation Errors + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Analyze AWS Lambda Invocation Errors in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` + +This bash script is designed to analyze AWS Lambda Invocation Errors for a specified function within a specified region. + +- **Robot task name**: Analyze AWS Lambda Invocation Errors in Account `${AWS_ACCOUNT_NAME}` Region `${AWS_REGION}` +- **Sub-metric name**: `invocation_errors` +- **Tags**: `AWS`, `Lambda`, `Error`, `Analysis`, `Invocation`, `Errors`, `CloudWatch`, `Logs`, `data:logs-regexp` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AWS_REGION` | string | AWS Region | — | yes | +| `AWS_ACCOUNT_NAME` | string | AWS account name or alias for display purposes. | `Unknown` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `aws_credentials` | AWS credentials from the workspace (from aws-auth block; e.g. aws:access_key@cli, aws:irsa@cli). | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. 
+ +- **Runbook**: `codebundles/aws-lambda-health/runbook.robot` +- **Monitor**: `codebundles/aws-lambda-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/aws-lambda-health +export AWS_REGION=... +export AWS_ACCOUNT_NAME=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/aws-lambda-health +export AWS_REGION=... +export AWS_ACCOUNT_NAME=... +bash analyze_lambda_invocation_errors.sh +bash list_lambda_runtimes.sh +bash monitor_aws_lambda_performance_metrics.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `analyze_lambda_invocation_errors.sh` — Bash helper script `analyze_lambda_invocation_errors.sh`. +- `list_lambda_runtimes.sh` — Bash helper script `list_lambda_runtimes.sh`. +- `monitor_aws_lambda_performance_metrics.sh` — Bash helper script `monitor_aws_lambda_performance_metrics.sh`. diff --git a/codebundles/aws-sqs-dlq-investigation/SKILL-TEMPLATE.md b/codebundles/aws-sqs-dlq-investigation/SKILL-TEMPLATE.md new file mode 100644 index 000000000..142bb3206 --- /dev/null +++ b/codebundles/aws-sqs-dlq-investigation/SKILL-TEMPLATE.md @@ -0,0 +1,196 @@ +--- +name: aws-sqs-dlq-investigation +kind: skill-template +description: Investigates Amazon SQS dead-letter queues by correlating queue configuration, DLQ backlog, sampled messages, Lambda... Use when triaging or monitoring AWS, SQS, Lambda workloads with skill templat... 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [AWS, SQS, Lambda, CloudWatch] +resource_types: [sqs_queue] +access: read-only +--- + +# AWS SQS Dead Letter Queue Investigation + +## Summary + +This CodeBundle detects messages that have landed on an SQS dead-letter queue (DLQ), quantifies backlog, samples DLQ payloads, finds Lambda consumers via event source mappings, pulls recent processor errors from CloudWatch Logs, and snapshots queue metrics—mirroring the queue-plus-logs workflow used for Azure Service Bus health on RunWhen. + +See [README.md](README.md) for additional context. + +## Tools + +### Check SQS Redrive Policy and DLQ Depth for Queues in `${AWS_REGION}` `${AWS_ACCOUNT_NAME}` + +Reads RedrivePolicy and DLQ attributes, flags backlog versus DLQ_DEPTH_THRESHOLD and stale message age. + +- **Robot task name**: Check SQS Redrive Policy and DLQ Depth for Queues in `${AWS_REGION}` `${AWS_ACCOUNT_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `sqs-redrive-and-dlq-depth.sh` +- **Tags**: `AWS`, `SQS`, `DLQ`, `access:read-only`, `data:metrics` +- **Reads**: `AWS_REGION` +- **Writes**: `redrive_dlq_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Peek Sample Messages on Dead Letter Queues in `${AWS_REGION}` + +Non-destructively receives a limited batch from each DLQ with a short visibility timeout for operator review. 
+ +- **Robot task name**: Peek Sample Messages on Dead Letter Queues in `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `sqs-peek-dlq-messages.sh` +- **Tags**: `AWS`, `SQS`, `DLQ`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AWS_REGION` +- **Writes**: `peek_dlq_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Discover Lambda Consumers for SQS Queues in `${AWS_REGION}` + +Lists Lambda event source mappings for each primary queue ARN to support log correlation. + +- **Robot task name**: Discover Lambda Consumers for SQS Queues in `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `sqs-discover-lambda-consumers.sh` +- **Tags**: `AWS`, `Lambda`, `SQS`, `access:read-only`, `data:config` +- **Reads**: `AWS_REGION` +- **Writes**: `discover_lambda_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Recent Lambda Processor Errors from CloudWatch Logs in `${AWS_REGION}` + +Searches Lambda (and optional extra) log groups for errors within the lookback window. + +- **Robot task name**: Fetch Recent Lambda Processor Errors from CloudWatch Logs in `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `sqs-fetch-lambda-error-logs.sh` +- **Tags**: `AWS`, `CloudWatch`, `Logs`, `Lambda`, `access:read-only`, `data:logs-regexp` +- **Reads**: `AWS_REGION` +- **Writes**: `fetch_lambda_logs_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Summarize CloudWatch Metrics for SQS Queues and DLQs in `${AWS_REGION}` + +Optional traffic and backlog snapshot via CloudWatch metrics for the primary queue and DLQ. 
+ +- **Robot task name**: Summarize CloudWatch Metrics for SQS Queues and DLQs in `${AWS_REGION}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `sqs-cloudwatch-queue-metrics.sh` +- **Tags**: `AWS`, `SQS`, `CloudWatch`, `access:read-only`, `data:metrics` +- **Reads**: `AWS_REGION` +- **Writes**: `cloudwatch_queue_metrics_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures SQS DLQ health as a 0–1 score: 1 when redrive/DLQ analysis reports no issues, otherwise 0. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Score DLQ Clearance for SQS Queues in `${AWS_REGION}` + +Runs the redrive/DLQ depth check and maps an empty issue list to score 1, else 0. + +- **Robot task name**: Score DLQ Clearance for SQS Queues in `${AWS_REGION}` +- **Sub-metric name**: `dlq_issue_count_clear` +- **Underlying script**: `sqs-redrive-and-dlq-depth.sh` +- **Tags**: `access:read-only`, `data:metrics` +- **Reads**: `AWS_REGION` +- **Pass condition**: `${n} == 0` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AWS_REGION` | string | AWS region containing the queues. | — | yes | +| `AWS_ACCOUNT_NAME` | string | Human-readable account alias for titles and reports. | `` | yes | +| `SQS_QUEUE_URLS` | string | Comma-separated primary SQS queue URLs (optional if listing by RESOURCES). | `` | yes | +| `RESOURCES` | string | Queue name substring filter or All for discovery-driven runs. | `All` | no | +| `DLQ_DEPTH_THRESHOLD` | string | Flag DLQ when ApproximateNumberOfMessagesVisible exceeds this value (0 means any message is an issue). | `0` | no | +| `CLOUDWATCH_LOG_LOOKBACK_MINUTES` | string | How far back to search processor logs for errors. 
| `60` | no | +| `EXTRA_LOG_GROUP_NAMES` | string | Optional extra CloudWatch log groups for non-Lambda processors. | `` | yes | +| `MAX_DLQ_SAMPLE_MESSAGES` | string | Maximum DLQ messages to sample per queue in one run. | `5` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `aws_credentials` | AWS credentials from the workspace aws-auth block. | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `redrive_dlq_issues.json` +- `peek_dlq_issues.json` +- `discover_lambda_issues.json` +- `fetch_lambda_logs_issues.json` +- `cloudwatch_queue_metrics_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/aws-sqs-dlq-investigation/runbook.robot` +- **Monitor**: `codebundles/aws-sqs-dlq-investigation/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/aws-sqs-dlq-investigation +export AWS_REGION=... +export AWS_ACCOUNT_NAME=... +export SQS_QUEUE_URLS=... +export RESOURCES=... +export DLQ_DEPTH_THRESHOLD=... +export CLOUDWATCH_LOG_LOOKBACK_MINUTES=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/aws-sqs-dlq-investigation +export AWS_REGION=... +export AWS_ACCOUNT_NAME=... +export SQS_QUEUE_URLS=... +export RESOURCES=... 
+bash sqs-cloudwatch-queue-metrics.sh +bash sqs-common.sh +bash sqs-discover-lambda-consumers.sh +bash sqs-fetch-lambda-error-logs.sh +bash sqs-peek-dlq-messages.sh +bash sqs-redrive-and-dlq-depth.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `sqs-cloudwatch-queue-metrics.sh` — Bash helper script `sqs-cloudwatch-queue-metrics.sh`. +- `sqs-common.sh` — Bash helper script `sqs-common.sh`. +- `sqs-discover-lambda-consumers.sh` — Bash helper script `sqs-discover-lambda-consumers.sh`. +- `sqs-fetch-lambda-error-logs.sh` — Bash helper script `sqs-fetch-lambda-error-logs.sh`. +- `sqs-peek-dlq-messages.sh` — Bash helper script `sqs-peek-dlq-messages.sh`. +- `sqs-redrive-and-dlq-depth.sh` — Bash helper script `sqs-redrive-and-dlq-depth.sh`. diff --git a/codebundles/azure-acr-health/SKILL-TEMPLATE.md b/codebundles/azure-acr-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..ca3bf71cb --- /dev/null +++ b/codebundles/azure-acr-health/SKILL-TEMPLATE.md @@ -0,0 +1,291 @@ +--- +name: azure-acr-health +kind: skill-template +description: Comprehensive health checks for Azure Container Registry (ACR), including network configuration, resource health,... Use when triaging or monitoring Azure, Container, Registry workloads with skill ... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Container, Registry, ACR, Health, Network, Security, Storage] +resource_types: [container_registry] +access: read-only +--- + +# Azure ACR Health Check + +## Summary + +This bundle provides comprehensive health checks for Azure Container Registries (ACR), including network configuration analysis, resource health monitoring, authentication testing, storage utilization analysis, pull/push metrics, and security assessments. 
+ +See [README.md](README.md) for additional context. + +## Tools + +### Check Network Configuration for ACR `${ACR_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Analyze network access rules, private endpoints, firewall settings, and connectivity. + +- **Robot task name**: Check Network Configuration for ACR `${ACR_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `acr_network_config.sh` +- **Tags**: `access:read-only`, `ACR`, `Azure`, `Network`, `Security`, `Connectivity`, `data:config` +- **Reads**: `ACR_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check DNS & TLS Reachability for Registry `${ACR_NAME}` + +Verifies DNS resolution and HTTPS/TLS for ACR endpoint. + +- **Robot task name**: Check DNS & TLS Reachability for Registry `${ACR_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `acr_reachability.sh` +- **Tags**: `access:read-only`, `ACR`, `Azure`, `DNS`, `TLS`, `Connectivity`, `Health`, `data:config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check ACR Login & Authentication for Registry `${ACR_NAME}` + +Attempts az acr login and docker login using intended workload identity. + +- **Robot task name**: Check ACR Login & Authentication for Registry `${ACR_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `acr_authentication.sh` +- **Tags**: `access:read-only`, `ACR`, `Azure`, `Login`, `Auth`, `Connectivity`, `Health`, `data:config` +- **Reads**: `ACR_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check ACR SKU and Usage Metrics for Registry `${ACR_NAME}` + +Analyzes ACR SKU configuration, usage limits, and provides recommendations. 
+ +- **Robot task name**: Check ACR SKU and Usage Metrics for Registry `${ACR_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `acr_usage_sku.sh` +- **Tags**: `access:read-only`, `ACR`, `Azure`, `SKU`, `Usage`, `Health`, `data:config` +- **Reads**: `ACR_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check ACR Storage Utilization for Registry `${ACR_NAME}` + +Comprehensive analysis of ACR storage usage, repository sizes, and cleanup recommendations. + +- **Robot task name**: Check ACR Storage Utilization for Registry `${ACR_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `acr_storage_utilization.sh` +- **Tags**: `access:read-only`, `ACR`, `Azure`, `Storage`, `Utilization`, `Health`, `data:config` +- **Reads**: `ACR_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze ACR Pull/Push Success Ratio for Registry `${ACR_NAME}` + +Analyzes pull and push operation success rates using Azure Monitor metrics and Log Analytics. + +- **Robot task name**: Analyze ACR Pull/Push Success Ratio for Registry `${ACR_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `acr_pull_push_ratio.sh` +- **Tags**: `access:read-only`, `ACR`, `Azure`, `Pull`, `Push`, `Metrics`, `Health`, `data:config` +- **Reads**: `ACR_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check ACR Repository Event Failures for Registry `${ACR_NAME}` + +Queries Log Analytics for recent failed pushes/pulls and repo errors. 
+ +- **Robot task name**: Check ACR Repository Event Failures for Registry `${ACR_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `acr_events.sh` +- **Tags**: `access:read-only`, `ACR`, `Azure`, `Events`, `Health`, `data:logs-regexp` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check ACR Security Configuration and RBAC for Registry `${ACR_NAME}` + +Comprehensive security analysis of ACR including RBAC assignments, admin user status, network access, and authentication methods. + +- **Robot task name**: Check ACR Security Configuration and RBAC for Registry `${ACR_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `acr_rbac_security.sh` +- **Tags**: `acr`, `security`, `rbac`, `authentication`, `network`, `data:config` +- **Reads**: `ACR_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Calculates Azure ACR health by checking reachability, SKU, pull/push ratio, and storage utilization. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Check ACR Reachability for Registry `${ACR_NAME}` + +Checks if the ACR endpoint is reachable. + +- **Robot task name**: Check ACR Reachability for Registry `${ACR_NAME}` +- **Sub-metric name**: `reachability` +- **Underlying script**: `acr_reachability.sh` +- **Tags**: `ACR`, `Azure`, `Reachability`, `Health`, `data:config` +- **Reads**: — + + +#### Check ACR Usage SKU Metric for Registry `${ACR_NAME}` + +Checks the SKU and usage limits for the ACR. 
+ +- **Robot task name**: Check ACR Usage SKU Metric for Registry `${ACR_NAME}` +- **Sub-metric name**: `sku_usage` +- **Underlying script**: `acr_usage_sku.sh` +- **Tags**: `ACR`, `Azure`, `SKU`, `Health`, `data:config` +- **Reads**: — + + +#### Check ACR Pull/Push Success Ratio for Registry `${ACR_NAME}` + +Checks the success rate of image pull and push operations. + +- **Robot task name**: Check ACR Pull/Push Success Ratio for Registry `${ACR_NAME}` +- **Sub-metric name**: `pull_push_ratio` +- **Underlying script**: `acr_pull_push_ratio.sh` +- **Tags**: `ACR`, `Azure`, `PullPush`, `Health`, `data:config` +- **Reads**: — + + +#### Check ACR Storage Utilization for Registry `${ACR_NAME}` + +Checks the storage usage of the ACR. + +- **Robot task name**: Check ACR Storage Utilization for Registry `${ACR_NAME}` +- **Sub-metric name**: `storage_utilization` +- **Underlying script**: `acr_storage_utilization.sh` +- **Tags**: `ACR`, `Azure`, `Storage`, `Health`, `data:config` +- **Reads**: — + + +#### Check ACR Network Configuration for Registry `${ACR_NAME}` + +Checks network access rules, private endpoints, and connectivity. + +- **Robot task name**: Check ACR Network Configuration for Registry `${ACR_NAME}` +- **Sub-metric name**: `network_config` +- **Underlying script**: `acr_network_config.sh` +- **Tags**: `ACR`, `Azure`, `Network`, `Health`, `data:config` +- **Reads**: — + + +#### Check ACR Security Configuration + +Analyzes ACR security configuration including RBAC, admin user settings, network access, and authentication methods. + +- **Robot task name**: Check ACR Security Configuration +- **Sub-metric name**: `security` +- **Underlying script**: `acr_rbac_security.sh` +- **Tags**: `ACR`, `Azure`, `Security`, `RBAC`, `SLI`, `data:config` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZ_RESOURCE_GROUP` | string | The resource group containing the ACR. 
| — | yes | +| `ACR_NAME` | string | Azure Container Registry Name. | — | yes | +| `AZURE_SUBSCRIPTION_ID` | string | The Azure Subscription ID. | — | yes | +| `AZURE_SUBSCRIPTION_NAME` | string | The Azure Subscription Name. | — | yes | +| `USAGE_THRESHOLD` | string | Threshold for acr usage | `80` | no | +| `CRITICAL_THRESHOLD` | string | Storage usage critical threshold percentage. | `95` | no | +| `TIME_PERIOD_HOURS` | string | Time period in hours for pull/push metrics analysis. | `24` | no | +| `PULL_SUCCESS_THRESHOLD` | string | Minimum pull success ratio percentage threshold. | `95` | no | +| `PUSH_SUCCESS_THRESHOLD` | string | Minimum push success ratio percentage threshold. | `98` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `storage_utilization_issues.json` +- `network_config_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-acr-health/runbook.robot` +- **Monitor**: `codebundles/azure-acr-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-acr-health +export AZ_RESOURCE_GROUP=... +export ACR_NAME=... +export AZURE_SUBSCRIPTION_ID=... +export AZURE_SUBSCRIPTION_NAME=... +export USAGE_THRESHOLD=... +export CRITICAL_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-acr-health +export AZ_RESOURCE_GROUP=... +export ACR_NAME=... +export AZURE_SUBSCRIPTION_ID=... +export AZURE_SUBSCRIPTION_NAME=... 
+bash acr_authentication.sh +bash acr_events.sh +bash acr_network_config.sh +bash acr_pull_push_ratio.sh +bash acr_rbac_security.sh +bash acr_reachability.sh +bash acr_storage_usage.sh +bash acr_storage_utilization.sh +bash acr_usage_sku.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `acr_authentication.sh` — Bash helper script `acr_authentication.sh`. +- `acr_events.sh` — Bash helper script `acr_events.sh`. +- `acr_network_config.sh` — Bash helper script `acr_network_config.sh`. +- `acr_pull_push_ratio.sh` — Bash helper script `acr_pull_push_ratio.sh`. +- `acr_rbac_security.sh` — Bash helper script `acr_rbac_security.sh`. +- `acr_reachability.sh` — Bash helper script `acr_reachability.sh`. +- `acr_storage_usage.sh` — Bash helper script `acr_storage_usage.sh`. +- `acr_storage_utilization.sh` — Bash helper script `acr_storage_utilization.sh`. +- `acr_usage_sku.sh` — Bash helper script `acr_usage_sku.sh`. diff --git a/codebundles/azure-acr-image-sync/SKILL-TEMPLATE.md b/codebundles/azure-acr-image-sync/SKILL-TEMPLATE.md new file mode 100644 index 000000000..ba4fed35e --- /dev/null +++ b/codebundles/azure-acr-image-sync/SKILL-TEMPLATE.md @@ -0,0 +1,119 @@ +--- +name: azure-acr-image-sync +kind: skill-template +description: This CodeBundle syncs images from public repositories into an Azure Container Registry. Use when triaging or monitoring Azure, ACR workloads with skill template `azure-acr-image-sync`. +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, ACR] +resource_types: [container_registry] +access: read-only +--- + +# Azure ACR Image Sync + +## Summary + +**Purpose**: This CodeBundle synchronizes container images from public repositories into an Azure Container Registry (ACR). + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Sync Container Images into Azure Container Registry `${ACR_REGISTRY}` + +Synchronizes the latest container images into an ACR repository + +- **Robot task name**: Sync Container Images into Azure Container Registry `${ACR_REGISTRY}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `acr_sync_images.sh` +- **Tags**: `azure`, `acr`, `registry`, `runwhen`, `data:config` +- **Reads**: `DOCKER_TOKEN`, `DOCKER_USERNAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +This CodeBundle counts the number of container images (from a configured list) outdated. It compares upstream images with those in the registry and counts the number that are outdated. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Count Outdated Images in Azure Container Registry `${ACR_REGISTRY}` + +Counts the number of images that need updating in ACR from the upstream source. + +- **Robot task name**: Count Outdated Images in Azure Container Registry `${ACR_REGISTRY}` +- **Sub-metric name**: `outdated_images` +- **Underlying script**: `check_for_image_updates.sh` +- **Tags**: `azure`, `acr`, `registry`, `runwhen`, `data:config` +- **Reads**: `DOCKER_TOKEN`, `DOCKER_USERNAME` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `ACR_REGISTRY` | string | The name of the Azure Container Registry to import images into. | `myacr.azurecr.io` | no | +| `USE_DATE_TAG_PATTERN` | string | Change the image tag to use the current date and time. Useful when importing 'latest' tags | `false` | no | +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource. 
| `""` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-acr-image-sync/runbook.robot` +- **Monitor**: `codebundles/azure-acr-image-sync/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-acr-image-sync +export ACR_REGISTRY=... +export USE_DATE_TAG_PATTERN=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-acr-image-sync +export ACR_REGISTRY=... +export USE_DATE_TAG_PATTERN=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +bash acr_sync_images.sh +bash check_for_image_updates.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `acr_sync_images.sh` — Bash helper script `acr_sync_images.sh`. +- `check_for_image_updates.sh` — Bash helper script `check_for_image_updates.sh`. diff --git a/codebundles/azure-adf-health/SKILL-TEMPLATE.md b/codebundles/azure-adf-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..67623ab57 --- /dev/null +++ b/codebundles/azure-adf-health/SKILL-TEMPLATE.md @@ -0,0 +1,247 @@ +--- +name: azure-adf-health +kind: skill-template +description: Azure Data Factories health checks including resource health status, frequent pipeline errors, failed pipeline runs,... Use when triaging or monitoring Azure, Data, factories workloads with skill t... 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Data, factories] +resource_types: [data_factory] +access: read-only +--- + +# Azure Data factories Health + +## Summary + +This codebundle runs a suite of metrics checks for Data Factory in Azure. + +See [README.md](README.md) for additional context. + +## Tools + +### Check for Resource Health Issues Affecting Data Factories in resource group `${AZURE_RESOURCE_GROUP}` + +Fetch health status for all Data Factories in the resource group + +- **Robot task name**: Check for Resource Health Issues Affecting Data Factories in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `resource_health.sh` +- **Tags**: `datafactory`, `resourcehealth`, `access:read-only`, `data:config` +- **Reads**: `AZURE_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Frequent Pipeline Errors in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` + +List frequently occurring errors in Data Factory pipelines + +- **Robot task name**: List Frequent Pipeline Errors in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `error_trend.sh` +- **Tags**: `datafactory`, `pipeline-errors`, `access:read-only`, `data:logs-regexp` +- **Reads**: `AZURE_RESOURCE_GROUP`, `FAILURE_THRESHOLD` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Failed Pipelines in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` + +List failed pipeline runs in Data Factory pipelines + +- **Robot task name**: List Failed Pipelines in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `failed_pipeline.sh` +- **Tags**: 
`datafactory`, `pipeline-failures`, `access:read-only`, `data:logs-regexp` +- **Reads**: `AZURE_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Find Large Data Operations in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` + +List large data operations in Data Factory pipelines + +- **Robot task name**: Find Large Data Operations in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `data_volume_audit.sh` +- **Tags**: `datafactory`, `data-volume`, `access:read-only`, `data:config` +- **Reads**: `AZURE_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Azure Data Factory Details in resource group `${AZURE_RESOURCE_GROUP}` + +List comprehensive details about Azure Data Factories + +- **Robot task name**: Fetch Azure Data Factory Details in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `adf_details.sh` +- **Tags**: — +- **Reads**: `AZURE_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Long Running Pipeline Runs in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` + +List long running pipeline runs in Data Factory pipelines + +- **Robot task name**: List Long Running Pipeline Runs in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `long_pipeline_runs.sh` +- **Tags**: `datafactory`, `long-running-pipelines`, `access:read-only`, `data:config` +- **Reads**: `AZURE_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Azure Data Factories health checks including resource health status, frequent pipeline errors, failed pipeline runs, and large data operations 
monitoring. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Identify Health Issues Affecting Data Factories in resource group `${AZURE_RESOURCE_GROUP}` + +Fetch health status for all Data Factories in the resource group + +- **Robot task name**: Identify Health Issues Affecting Data Factories in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `resource_health` +- **Underlying script**: `resource_health.sh` +- **Tags**: `datafactory`, `resourcehealth`, `access:read-only`, `data:config` +- **Reads**: — + + +#### Count Frequent Pipeline Errors in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` + +Count frequently occurring errors in Data Factory pipelines + +- **Robot task name**: Count Frequent Pipeline Errors in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `pipeline_errors` +- **Underlying script**: `error_trend.sh` +- **Tags**: `datafactory`, `pipeline-errors`, `access:read-only`, `data:logs-regexp` +- **Reads**: — +- **Pass condition**: `${error_count} == 0` + + +#### Count Failed Pipelines in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` + +Count failed pipeline runs in Data Factory pipelines + +- **Robot task name**: Count Failed Pipelines in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `failed_pipelines` +- **Underlying script**: `failed_pipeline.sh` +- **Tags**: `datafactory`, `pipeline-failures`, `access:read-only`, `data:logs-regexp` +- **Reads**: — +- **Pass condition**: `${failed_count} == 0` + + +#### Count Large Data Operations in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` + +Count large data operations in Data Factory pipelines + +- **Robot task name**: Count Large Data Operations in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric 
name**: `data_operations` +- **Underlying script**: `data_volume_audit.sh` +- **Tags**: `datafactory`, `data-volume`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `${data_volume_alerts_count} == 0` + + +#### Count Long Running Pipeline Runs in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` + +Count long running pipeline runs in Data Factory pipelines + +- **Robot task name**: Count Long Running Pipeline Runs in Data Factories in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `long_running_pipelines` +- **Underlying script**: `long_pipeline_runs.sh` +- **Tags**: `datafactory`, `pipeline-long-running`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `${long_running_count} == 0` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource. | `""` | no | +| `AZURE_RESOURCE_GROUP` | string | Azure resource group. | — | yes | +| `LOOKBACK_PERIOD` | string | The lookback period for querying failed pipelines (e.g., 1d, 7d, 30d). | `7d` | no | +| `THRESHOLD_MB` | string | The threshold for data volume in MB. | `1000` | no | +| `FAILURE_THRESHOLD` | string | The threshold for failure count. | `1` | no | +| `RUN_TIME_THRESHOLD` | string | The threshold for run time of a pipeline in seconds. | `600` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. 
+ +- **Runbook**: `codebundles/azure-adf-health/runbook.robot` +- **Monitor**: `codebundles/azure-adf-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-adf-health +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export AZURE_RESOURCE_GROUP=... +export LOOKBACK_PERIOD=... +export THRESHOLD_MB=... +export FAILURE_THRESHOLD=... +export RUN_TIME_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-adf-health +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export AZURE_RESOURCE_GROUP=... +export LOOKBACK_PERIOD=... +export THRESHOLD_MB=... +bash adf_details.sh +bash data_volume_audit.sh +bash error_trend.sh +bash failed_pipeline.sh +bash long_pipeline_runs.sh +bash resource_health.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `adf_details.sh` — Bash helper script `adf_details.sh`. +- `data_volume_audit.sh` — Bash helper script `data_volume_audit.sh`. +- `error_trend.sh` — Bash helper script `error_trend.sh`. +- `failed_pipeline.sh` — Bash helper script `failed_pipeline.sh`. +- `long_pipeline_runs.sh` — Bash helper script `long_pipeline_runs.sh`. +- `resource_health.sh` — Bash helper script `resource_health.sh`. diff --git a/codebundles/azure-aks-cost-optimization/SKILL-TEMPLATE.md b/codebundles/azure-aks-cost-optimization/SKILL-TEMPLATE.md new file mode 100644 index 000000000..26a5bde6d --- /dev/null +++ b/codebundles/azure-aks-cost-optimization/SKILL-TEMPLATE.md @@ -0,0 +1,102 @@ +--- +name: azure-aks-cost-optimization +kind: skill-template +description: Azure AKS Cost Optimization: Analyzes AKS cluster node pools to identify cost optimization opportunities by... 
Use when triaging or monitoring Azure, Cost, Optimization workloads with skill templat... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Cost, Optimization, AKS, Kubernetes, Node, Pools, Autoscaling, Capacity, Planning] +resource_types: [aks_cluster] +access: read-only +--- + +# Azure AKS Cost Optimization + +## Summary + +This codebundle analyzes Azure Kubernetes Service (AKS) cluster node pools to identify cost optimization opportunities. + +See [README.md](README.md) for additional context. + +## Tools + +### Analyze AKS Node Pool Resizing Opportunities Based on Utilization Metrics in Resource Group `${AZURE_RESOURCE_GROUPS}` for Subscription `${AZURE_SUBSCRIPTION_NAME}` + +Analyzes AKS cluster node pools across specified subscriptions, examines both average and peak CPU/memory utilization over the past 30 days, and provides capacity-planned recommendations for reducing minimum node counts or changing VM types to optimize costs. Uses a two-tier approach: minimum nodes based on average utilization (150% safety margin), maximum nodes based on peak utilization (150% safety margin). This ensures cost-effective baseline capacity while maintaining ceiling for traffic spikes. Safety margins are configurable via MIN_NODE_SAFETY_MARGIN_PERCENT and MAX_NODE_SAFETY_MARGIN_PERCENT. 
+ +- **Robot task name**: Analyze AKS Node Pool Resizing Opportunities Based on Utilization Metrics in Resource Group `${AZURE_RESOURCE_GROUPS}` for Subscription `${AZURE_SUBSCRIPTION_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `analyze_aks_node_pool_optimization.sh` +- **Tags**: `Azure`, `Cost`, `Optimization`, `AKS`, `Kubernetes`, `Node`, `Pools`, `Autoscaling`, `Capacity`, `Planning`, `access:read-only`, `data:config` +- **Reads**: `TIMEOUT_SECONDS` +- **Writes**: `aks_node_pool_optimization_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_SUBSCRIPTION_IDS` | string | Comma-separated list of Azure subscription IDs to analyze for AKS optimization. | `""` | no | +| `AZURE_RESOURCE_GROUPS` | string | Comma-separated list of resource groups to analyze (leave empty to analyze all resource groups in the subscription) | `""` | no | +| `AZURE_SUBSCRIPTION_NAME` | string | Azure subscription name for reporting purposes | `""` | no | +| `COST_ANALYSIS_LOOKBACK_DAYS` | string | Number of days to look back for utilization analysis (default: 30) | `30` | no | +| `LOW_COST_THRESHOLD` | string | Monthly savings threshold for LOW classification (default: 500) | `500` | no | +| `MEDIUM_COST_THRESHOLD` | string | Monthly savings threshold for MEDIUM classification (default: 2000) | `2000` | no | +| `HIGH_COST_THRESHOLD` | string | Monthly savings threshold for HIGH classification (default: 10000) | `10000` | no | +| `AZURE_DISCOUNT_PERCENTAGE` | string | Discount percentage off MSRP for Azure services (default: 0) | `0` | no | +| `TIMEOUT_SECONDS` | string | Timeout in seconds for tasks (default: 1500 = 25 minutes). 
| `1500` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- `aks_node_pool_optimization_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-aks-cost-optimization/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-aks-cost-optimization +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_RESOURCE_GROUPS=... +export AZURE_SUBSCRIPTION_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +export LOW_COST_THRESHOLD=... +export MEDIUM_COST_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-aks-cost-optimization +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_RESOURCE_GROUPS=... +export AZURE_SUBSCRIPTION_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +bash analyze_aks_node_pool_optimization.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `analyze_aks_node_pool_optimization.sh` — Bash helper script `analyze_aks_node_pool_optimization.sh`. diff --git a/codebundles/azure-aks-triage/SKILL-TEMPLATE.md b/codebundles/azure-aks-triage/SKILL-TEMPLATE.md new file mode 100644 index 000000000..b8238646d --- /dev/null +++ b/codebundles/azure-aks-triage/SKILL-TEMPLATE.md @@ -0,0 +1,178 @@ +--- +name: azure-aks-triage +kind: skill-template +description: Runs diagnostic checks against an AKS cluster. Use when triaging or monitoring Azure, AKS, Kubernetes workloads with skill template `azure-aks-triage`. 
+runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, AKS, Kubernetes, Service, Triage, Health] +resource_types: [aks_cluster] +access: read-only +--- + +# Azure AKS Triage + +## Summary + +This CodeBundle checks for AKS Cluster Health based on how Azure is reporting resource health, network configuration recommendations, activities that have occurred, and provisioning status of resources. + +See [README.md](README.md) for additional context. + +## Tools + +### Check for Resource Health Issues Affecting AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch a list of issues that might affect the AKS cluster + +- **Robot task name**: Check for Resource Health Issues Affecting AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `aks_resource_health.sh` +- **Tags**: `aks`, `config`, `access:read-only`, `data:config` +- **Reads**: `AKS_CLUSTER`, `AZ_RESOURCE_GROUP` +- **Writes**: `az_resource_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Configuration Health of AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch the config of the AKS cluster in Azure + +- **Robot task name**: Check Configuration Health of AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `aks_cluster_health.sh` +- **Tags**: `AKS`, `config`, `access:read-only`, `data:config` +- **Reads**: `AKS_CLUSTER`, `AZ_RESOURCE_GROUP` +- **Writes**: `az_cluster_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Network Configuration of AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch the network configuration, generating resource URLs and basic recommendations + 
+- **Robot task name**: Check Network Configuration of AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `aks_network.sh` +- **Tags**: `AKS`, `config`, `network`, `route`, `firewall`, `access:read-only`, `data:config` +- **Reads**: `AKS_CLUSTER`, `AZ_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Activities for AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Gets the activities for the AKS cluster set and checks for errors + +- **Robot task name**: Fetch Activities for AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `aks_activities.sh` +- **Tags**: `AKS`, `activities`, `monitor`, `events`, `errors`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AKS_CLUSTER`, `AZ_RESOURCE_GROUP` +- **Writes**: `aks_activities_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Kubernetes Version Support for AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Checks whether the AKS cluster is running an unsupported or soon-to-expire Kubernetes version. AKS supports each version for ~12 months. Running unsupported versions loses SLA coverage and security patches. Premium tier with LTS ($0.60/hr) extends support for up to 2 years. 
+ +- **Robot task name**: Check Kubernetes Version Support for AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `aks_version_support.sh` +- **Tags**: `AKS`, `Version`, `Deprecation`, `Cost`, `LTS`, `access:read-only`, `data:config` +- **Reads**: `AKS_CLUSTER`, `AZ_RESOURCE_GROUP` +- **Writes**: `aks_version_support.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze AKS Cluster Cost Optimization Opportunities for `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Analyzes 30-day utilization trends using Azure Monitor to identify underutilized node pools with cost savings opportunities. Provides Azure VM pricing-based estimates for potential monthly and annual savings with severity bands: Sev4 <$2k/month, Sev3 $2k-$10k/month, Sev2 >$10k/month. + +- **Robot task name**: Analyze AKS Cluster Cost Optimization Opportunities for `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `aks_cost_optimization.sh` +- **Tags**: `aks`, `cost-optimization`, `underutilization`, `azure-monitor`, `pricing`, `access:read-only`, `data:config` +- **Reads**: `AKS_CLUSTER`, `AZ_RESOURCE_GROUP`, `TIMEOUT_SECONDS` +- **Writes**: `aks_cost_optimization_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZ_RESOURCE_GROUP` | string | The resource group to perform actions against. | — | yes | +| `AKS_CLUSTER` | string | The Azure AKS cluster to triage. | — | yes | +| `RW_LOOKBACK_WINDOW` | string | The time period, in minutes, to look back for activities/events. | `60` | no | +| `TIMEOUT_SECONDS` | string | Timeout in seconds for tasks (default: 900). | `900` | no | +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource.
| `""` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `azure_credentials` | — | yes | + +## Outputs + +- `az_resource_health.json` +- `az_cluster_health.json` +- `aks_activities_issues.json` +- `aks_version_support.json` +- `aks_cost_optimization_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-aks-triage/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-aks-triage +export AZ_RESOURCE_GROUP=... +export AKS_CLUSTER=... +export RW_LOOKBACK_WINDOW=... +export TIMEOUT_SECONDS=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-aks-triage +export AZ_RESOURCE_GROUP=... +export AKS_CLUSTER=... +export RW_LOOKBACK_WINDOW=... +export TIMEOUT_SECONDS=... +bash aks_activities.sh +bash aks_cluster_health.sh +bash aks_cost_optimization.sh +bash aks_network.sh +bash aks_resource_health.sh +bash aks_version_support.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `aks_activities.sh` — Bash helper script `aks_activities.sh`. +- `aks_cluster_health.sh` — Bash helper script `aks_cluster_health.sh`. +- `aks_cost_optimization.sh` — Bash helper script `aks_cost_optimization.sh`. +- `aks_network.sh` — Bash helper script `aks_network.sh`. +- `aks_resource_health.sh` — Bash helper script `aks_resource_health.sh`. +- `aks_version_support.sh` — Bash helper script `aks_version_support.sh`. 
diff --git a/codebundles/azure-apim-health/SKILL-TEMPLATE.md b/codebundles/azure-apim-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..909b6b3d6 --- /dev/null +++ b/codebundles/azure-apim-health/SKILL-TEMPLATE.md @@ -0,0 +1,328 @@ +--- +name: azure-apim-health +kind: skill-template +description: Runs diagnostic checks to check the health of APIM instances. Use when triaging or monitoring Azure, APIM, Service workloads with skill template `azure-apim-health`. +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, APIM, Service, Triage, Health] +resource_types: [azure_resource] +access: read-only +--- + +# Azure APIM Health + +## Summary + +Runs diagnostic checks to check the health of Azure API Management (APIM) instances. + +See [README.md](README.md) for additional context. + +## Tools + +### Gather APIM Resource Information for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Collect fundamental details about the Azure subscription, resource group, and APIM instance. + +- **Robot task name**: Gather APIM Resource Information for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `gather_apim_resource_information.sh` +- **Tags**: `apim`, `config`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `apim_config_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check for Resource Health Issues Affecting APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch Resource Health status and evaluate any reported issues for the APIM instance.
+ +- **Robot task name**: Check for Resource Health Issues Affecting APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `apim_resource_health.sh` +- **Tags**: `apim`, `resourcehealth`, `access:read-only`, `data:config` +- **Reads**: `APIM_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `apim_resource_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Key Metrics for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Gather APIM metrics from Azure Monitor. Raises issues if thresholds are violated. + +- **Robot task name**: Fetch Key Metrics for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `apim_metrics.sh` +- **Tags**: `apim`, `metrics`, `analytics`, `access:read-only`, `data:config` +- **Reads**: `APIM_NAME` +- **Writes**: `apim_metrics.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Logs for Errors with APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Run apim_diagnostic_logs.sh, parse results, raise issues if logs exceed thresholds. 
+ +- **Robot task name**: Check Logs for Errors with APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `apim_diagnostic_logs.sh` +- **Tags**: `apim`, `logs`, `diagnostics`, `access:read-only`, `data:logs-regexp` +- **Reads**: `APIM_NAME` +- **Writes**: `apim_diagnostic_log_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Activity Logs for APIM Management Operations `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Review Azure Activity Logs for administrative operations on the APIM instance + +- **Robot task name**: Check Activity Logs for APIM Management Operations `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `apim_activity_logs.sh` +- **Tags**: `apim`, `activity-logs`, `management`, `access:read-only`, `data:logs-bulk` +- **Reads**: `APIM_NAME` +- **Writes**: `apim_activity_log_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Application Insights Integration for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Verify Application Insights integration and analyze telemetry if configured + +- **Robot task name**: Check Application Insights Integration for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_apim_appinsights.sh` +- **Tags**: `apim`, `application-insights`, `telemetry`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `apim_appinsights_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Key Vault Dependencies for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Verify Key Vault dependencies and access for certificates and secrets + +- **Robot task name**: Check Key Vault Dependencies for APIM `${APIM_NAME}` in 
Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_apim_keyvault.sh` +- **Tags**: `apim`, `keyvault`, `certificates`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `apim_keyvault_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify APIM Policy Configurations for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Validates APIM policies for malformed XML, authentication issues, and backend connectivity problems. + +- **Robot task name**: Verify APIM Policy Configurations for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `verify_apim_policies.sh` +- **Tags**: `apim`, `policy`, `xml`, `authentication`, `backend`, `access:read-only`, `data:config` +- **Reads**: `APIM_NAME` +- **Writes**: `apim_policy_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check APIM SSL Certificates for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Verify certificate validity, expiration, thumbprint, and domain matches + +- **Robot task name**: Check APIM SSL Certificates for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_apim_ssl_certs.sh` +- **Tags**: `apim`, `ssl`, `certificate`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `apim_ssl_certificate_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect Dependencies and Related Resources for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Runs inspect_apim_dependencies.sh to discover & validate Key Vault, backends, DNS, etc. 
+ +- **Robot task name**: Inspect Dependencies and Related Resources for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `inspect_apim_dependencies.sh` +- **Tags**: `apim`, `dependencies`, `external`, `keyvault`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `apim_dependencies.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Runs diagnostic checks to check the health of APIM instances + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Check for Resource Health Issues Affecting APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch Resource Health status and evaluate any reported issues for the APIM instance. + +- **Robot task name**: Check for Resource Health Issues Affecting APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `resource_health` +- **Underlying script**: `apim_resource_health.sh` +- **Tags**: `apim`, `resourcehealth`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `"${resource_health_output_json["properties"]["title"]}" == "Available"` + + +#### Fetch Key Metrics for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Gather APIM metrics from Azure Monitor. Raises issues if thresholds are violated. 
+ +- **Robot task name**: Fetch Key Metrics for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `metrics` +- **Underlying script**: `apim_metrics.sh` +- **Tags**: `apim`, `metrics`, `analytics`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `len(@{issues_list["issues"]}) == 0` + + +#### Check Logs for Errors with APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Run apim_diagnostic_logs.sh, parse results, raise issues if logs exceed thresholds. + +- **Robot task name**: Check Logs for Errors with APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `diagnostic_logs` +- **Underlying script**: `apim_diagnostic_logs.sh` +- **Tags**: `apim`, `logs`, `diagnostics`, `access:read-only`, `data:logs-regexp` +- **Reads**: — +- **Pass condition**: `len(@{issue_list["issues"]}) == 0` + + +#### Verify APIM Policy Configurations for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Runs a shell script to enumerate all APIM policies and check for missing tags. 
+ +- **Robot task name**: Verify APIM Policy Configurations for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `policy_config` +- **Underlying script**: `verify_apim_policies.sh` +- **Tags**: `apim`, `policy`, `config`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `len(@{issue_list["issues"]}) == 0` + + +#### Check APIM SSL Certificates for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Verify certificate validity, expiration, thumbprint, and domain matches + +- **Robot task name**: Check APIM SSL Certificates for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `ssl_certificates` +- **Underlying script**: `check_apim_ssl_certs.sh` +- **Tags**: `apim`, `ssl`, `certificate`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `len(@{issue_list["issues"]}) == 0` + + +#### Inspect Dependencies and Related Resources for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Runs inspect_apim_dependencies.sh to discover & validate Key Vault, backends, DNS, etc. + +- **Robot task name**: Inspect Dependencies and Related Resources for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `dependencies` +- **Underlying script**: `inspect_apim_dependencies.sh` +- **Tags**: `apim`, `dependencies`, `external`, `keyvault`, `data:config` +- **Reads**: — +- **Pass condition**: `len(@{issue_list}) == 0` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZ_RESOURCE_GROUP` | string | The resource group to perform actions against. | — | yes | +| `APIM_NAME` | string | The APIM Instance Name | — | yes | +| `RW_LOOKBACK_WINDOW` | string | The time period, in minutes, to look back for activities/events. | `60` | no | +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource.
| `""` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `apim_config_issues.json` +- `apim_resource_health.json` +- `apim_metrics.json` +- `apim_diagnostic_log_issues.json` +- `apim_activity_log_issues.json` +- `apim_appinsights_issues.json` +- `apim_keyvault_issues.json` +- `apim_policy_issues.json` +- `apim_ssl_certificate_issues.json` +- `apim_dependencies.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-apim-health/runbook.robot` +- **Monitor**: `codebundles/azure-apim-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-apim-health +export AZ_RESOURCE_GROUP=... +export APIM_NAME=... +export RW_LOOKBACK_WINDOW=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-apim-health +export AZ_RESOURCE_GROUP=... +export APIM_NAME=... +export RW_LOOKBACK_WINDOW=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... 
+bash apim_activity_logs.sh +bash apim_diagnostic_logs.sh +bash apim_metrics.sh +bash apim_policies.sh +bash apim_resource_health.sh +bash check_apim_appinsights.sh +bash check_apim_keyvault.sh +bash check_apim_ssl_certs.sh +bash gather_apim_resource_information.sh +bash inspect_apim_dependencies.sh +bash verify_apim_policies.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `apim_activity_logs.sh` — Bash helper script `apim_activity_logs.sh`. +- `apim_diagnostic_logs.sh` — Bash helper script `apim_diagnostic_logs.sh`. +- `apim_metrics.sh` — Bash helper script `apim_metrics.sh`. +- `apim_policies.sh` — Bash helper script `apim_policies.sh`. +- `apim_resource_health.sh` — Bash helper script `apim_resource_health.sh`. +- `check_apim_appinsights.sh` — Bash helper script `check_apim_appinsights.sh`. +- `check_apim_keyvault.sh` — Bash helper script `check_apim_keyvault.sh`. +- `check_apim_ssl_certs.sh` — Bash helper script `check_apim_ssl_certs.sh`. +- `gather_apim_resource_information.sh` — Bash helper script `gather_apim_resource_information.sh`. +- `inspect_apim_dependencies.sh` — Bash helper script `inspect_apim_dependencies.sh`. +- `verify_apim_policies.sh` — Bash helper script `verify_apim_policies.sh`. diff --git a/codebundles/azure-appgateway-health/SKILL-TEMPLATE.md b/codebundles/azure-appgateway-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..b21fa8d4e --- /dev/null +++ b/codebundles/azure-appgateway-health/SKILL-TEMPLATE.md @@ -0,0 +1,296 @@ +--- +name: azure-appgateway-health +kind: skill-template +description: Performs a health check on Azure Application Gateways and the backend pools used by them, generating a report of... Use when triaging or monitoring Azure, Application, Gateway workloads with skill ... 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Application, Gateway] +resource_types: [application_gateway] +access: read-only +--- + +# Azure Application Gateway Health + +## Summary + +Checks key metrics for Azure Application Gateways and queries the health status of backend pools used by the gateway. + +See [README.md](README.md) for additional context. + +## Tools + +### Check for Resource Health Issues Affecting Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch a list of issues that might affect the application gateway cluster + +- **Robot task name**: Check for Resource Health Issues Affecting Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `app_gateway_resource_health.sh` +- **Tags**: `appgateway`, `resourcehealth`, `access:read-only`, `data:config` +- **Reads**: `APP_GATEWAY_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `app_gateway_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Configuration Health of Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch the details and health of the application gateway configuration + +- **Robot task name**: Check Configuration Health of Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `app_gateway_config_health.sh` +- **Tags**: `appgateway`, `config`, `health`, `access:read-only`, `data:config` +- **Reads**: `APP_GATEWAY_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `app_gateway_config_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Backend Pool Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group 
`${AZ_RESOURCE_GROUP}` + +Fetch the health of the application gateway backend pool members + +- **Robot task name**: Check Backend Pool Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `app_gateway_backend_health.sh` +- **Tags**: `appgateway`, `logs`, `tail`, `access:read-only`, `data:config` +- **Reads**: `APP_GATEWAY_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `backend_pool_members_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Log Analytics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch log analytics for the application gateway + +- **Robot task name**: Fetch Log Analytics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `app_gateway_log_analytics.sh` +- **Tags**: `access:read-only`, `appgateway`, `logs`, `analytics`, `uri_errors`, `requests`, `ssl`, `errors`, `data:logs-regexp` +- **Reads**: — +- **Writes**: `app_gateway_log_metrics.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Metrics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch metrics for the application gateway + +- **Robot task name**: Fetch Metrics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `app_gateway_metrics.sh` +- **Tags**: `access:read-only`, `appgateway`, `metrics`, `analytics`, `data:config` +- **Reads**: `APP_GATEWAY_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `app_gateway_metrics.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check SSL Certificate Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` 
+ +Fetch SSL certificates and validate expiry dates for Azure Application Gateway instances + +- **Robot task name**: Check SSL Certificate Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `app_gateway_ssl_certs.sh` +- **Tags**: `access:read-only`, `appgateway`, `ssl`, `expiry`, `data:config` +- **Reads**: `APP_GATEWAY_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `appgw_ssl_certificate_checks.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Logs for Errors with Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Query log analytics workspace for common errors like IP mismatches or subnet issues + +- **Robot task name**: Check Logs for Errors with Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `app_gateway_log_errors.sh` +- **Tags**: `access:read-only`, `appgateway`, `logs`, `network`, `errors`, `data:logs-regexp` +- **Reads**: `APP_GATEWAY_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `appgw_diagnostic_log_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Related Azure Resources for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch a list of resources that are related to the application gateway + +- **Robot task name**: List Related Azure Resources for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `app_gateway_related_resources.sh` +- **Tags**: `access:read-only`, `appgateway`, `resources`, `azure`, `related`, `data:config` +- **Reads**: — +- **Writes**: `appgw_resource_discovery.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Queries the
health of an Azure Application Gateway, returning 1 when it's healthy and 0 when it's unhealthy. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Check for Resource Health Issues Affecting Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch a list of issues that might affect the Application Gateway as reported from Azure. + +- **Robot task name**: Check for Resource Health Issues Affecting Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `resource_health` +- **Underlying script**: `app_gateway_resource_health.sh` +- **Tags**: `appgateway`, `resource`, `health`, `service`, `azure`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `"${resource_health_output_json["properties"]["title"]}" == "Available"` + + +#### Check Configuration Health of Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch the details and health of the application gateway configuration + +- **Robot task name**: Check Configuration Health of Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `configuration` +- **Underlying script**: `app_gateway_config_health.sh` +- **Tags**: `appgateway`, `config`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `len(@{issue_list["issues"]}) == 0` + + +#### Check Backend Pool Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch the health of the application gateway backend pool members + +- **Robot task name**: Check Backend Pool Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `backend_pools` +- **Underlying script**: `app_gateway_backend_health.sh` +- **Tags**:
`appservice`, `logs`, `tail`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `len(@{issue_list["issues"]}) == 0` + + +#### Fetch Metrics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch metrics for the application gateway + +- **Robot task name**: Fetch Metrics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `metrics` +- **Underlying script**: `app_gateway_metrics.sh` +- **Tags**: `appgateway`, `metrics`, `analytics`, `data:config` +- **Reads**: — +- **Pass condition**: `len(@{issue_list["issues"]}) == 0` + + +#### Check SSL Certificate Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch SSL certificates and validate expiry dates for Azure Application Gateway instances + +- **Robot task name**: Check SSL Certificate Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `ssl_certificates` +- **Underlying script**: `app_gateway_ssl_certs.sh` +- **Tags**: `appgateway`, `ssl`, `expiry`, `data:config` +- **Reads**: — +- **Pass condition**: `len(@{issue_list["issues"]}) == 0` + + +#### Check Logs for Errors with Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Query log analytics workspace for common errors like IP mismatches or subnet issues + +- **Robot task name**: Check Logs for Errors with Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `error_logs` +- **Underlying script**: `app_gateway_log_errors.sh` +- **Tags**: `appgateway`, `logs`, `network`, `errors`, `data:logs-regexp` +- **Reads**: — +- **Pass condition**: `len(@{issue_list["issues"]}) == 0` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZ_RESOURCE_GROUP` | string | The resource group to perform actions against. 
| — | yes | +| `APP_GATEWAY_NAME` | string | The Azure Application Gateway to health check. | — | yes | +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource. | `""` | no | +| `AZURE_SUBSCRIPTION_NAME` | string | The friendly name of the subscription ID. | `subscription-01` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `app_gateway_health.json` +- `app_gateway_config_health.json` +- `backend_pool_members_health.json` +- `app_gateway_log_metrics.json` +- `app_gateway_metrics.json` +- `appgw_ssl_certificate_checks.json` +- `appgw_diagnostic_log_issues.json` +- `appgw_resource_discovery.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-appgateway-health/runbook.robot` +- **Monitor**: `codebundles/azure-appgateway-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-appgateway-health +export AZ_RESOURCE_GROUP=... +export APP_GATEWAY_NAME=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export AZURE_SUBSCRIPTION_NAME=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-appgateway-health +export AZ_RESOURCE_GROUP=... +export APP_GATEWAY_NAME=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export AZURE_SUBSCRIPTION_NAME=... 
+bash app_gateway_backend_health.sh +bash app_gateway_comprehensive_log_check.sh +bash app_gateway_config_health.sh +bash app_gateway_log_analytics.sh +bash app_gateway_log_errors.sh +bash app_gateway_metrics.sh +bash app_gateway_related_resources.sh +bash app_gateway_resource_health.sh +bash app_gateway_ssl_certs.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `app_gateway_backend_health.sh` — Bash helper script `app_gateway_backend_health.sh`. +- `app_gateway_comprehensive_log_check.sh` — Bash helper script `app_gateway_comprehensive_log_check.sh`. +- `app_gateway_config_health.sh` — Bash helper script `app_gateway_config_health.sh`. +- `app_gateway_log_analytics.sh` — Bash helper script `app_gateway_log_analytics.sh`. +- `app_gateway_log_errors.sh` — Bash helper script `app_gateway_log_errors.sh`. +- `app_gateway_metrics.sh` — Bash helper script `app_gateway_metrics.sh`. +- `app_gateway_related_resources.sh` — Bash helper script `app_gateway_related_resources.sh`. +- `app_gateway_resource_health.sh` — Bash helper script `app_gateway_resource_health.sh`. +- `app_gateway_ssl_certs.sh` — Bash helper script `app_gateway_ssl_certs.sh`. diff --git a/codebundles/azure-appservice-cost-optimization/SKILL-TEMPLATE.md b/codebundles/azure-appservice-cost-optimization/SKILL-TEMPLATE.md new file mode 100644 index 000000000..2500315b3 --- /dev/null +++ b/codebundles/azure-appservice-cost-optimization/SKILL-TEMPLATE.md @@ -0,0 +1,101 @@ +--- +name: azure-appservice-cost-optimization +kind: skill-template +description: Azure App Service Cost Optimization: Analyzes App Service Plans to identify empty plans, underutilized resources,... Use when triaging or monitoring Azure, Cost, Optimization workloads with skill t... 
+runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Cost, Optimization, App, Service, Plans, Function, Apps, Web, Apps, Rightsizing] +resource_types: [app_service] +access: read-only +--- + +# Azure App Service Cost Optimization + +## Summary + +This codebundle analyzes Azure App Service Plans to identify cost optimization opportunities. + +See [README.md](README.md) for additional context. + +## Tools + +### Analyze App Service Plan Cost Optimization in Resource Group `${AZURE_RESOURCE_GROUPS}` for Subscription `${AZURE_SUBSCRIPTION_NAME}` + +Analyzes App Service Plans across subscriptions to identify empty plans, underutilized resources, and rightsizing opportunities with cost savings estimates. Supports three optimization strategies (aggressive/balanced/conservative) and provides comprehensive options tables with risk assessments for each plan. + +- **Robot task name**: Analyze App Service Plan Cost Optimization in Resource Group `${AZURE_RESOURCE_GROUPS}` for Subscription `${AZURE_SUBSCRIPTION_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `azure_appservice_cost_optimization.sh` +- **Tags**: `Azure`, `Cost`, `Optimization`, `App`, `Service`, `Plans`, `Function`, `Apps`, `Web`, `Apps`, `Rightsizing`, `access:read-only`, `data:config` +- **Reads**: `TIMEOUT_SECONDS` +- **Writes**: `azure_appservice_cost_optimization_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_SUBSCRIPTION_IDS` | string | Comma-separated list of Azure subscription IDs to analyze for App Service optimization. 
| `""` | no | +| `AZURE_RESOURCE_GROUPS` | string | Comma-separated list of resource groups to analyze (leave empty to analyze all resource groups in the subscription) | `""` | no | +| `AZURE_SUBSCRIPTION_NAME` | string | Azure subscription name for reporting purposes | `""` | no | +| `LOW_COST_THRESHOLD` | string | Monthly savings threshold for LOW classification (default: 500) | `500` | no | +| `MEDIUM_COST_THRESHOLD` | string | Monthly savings threshold for MEDIUM classification (default: 2000) | `2000` | no | +| `HIGH_COST_THRESHOLD` | string | Monthly savings threshold for HIGH classification (default: 10000) | `10000` | no | +| `OPTIMIZATION_STRATEGY` | string | Optimization strategy: 'aggressive' (max savings, 85-90% target CPU, dev/test), 'balanced' (default, 75-80% target CPU, standard prod), or 'conservative' (safest, 60-70% target CPU, critical prod) | `balanced` | no | +| `TIMEOUT_SECONDS` | string | Timeout in seconds for tasks (default: 1500 = 25 minutes). | `1500` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- `azure_appservice_cost_optimization_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-appservice-cost-optimization/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-appservice-cost-optimization +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_RESOURCE_GROUPS=... +export AZURE_SUBSCRIPTION_NAME=... +export LOW_COST_THRESHOLD=... +export MEDIUM_COST_THRESHOLD=... +export HIGH_COST_THRESHOLD=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-appservice-cost-optimization +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_RESOURCE_GROUPS=... +export AZURE_SUBSCRIPTION_NAME=... +export LOW_COST_THRESHOLD=... +bash azure_appservice_cost_optimization.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `azure_appservice_cost_optimization.sh` — Bash helper script `azure_appservice_cost_optimization.sh`. diff --git a/codebundles/azure-appservice-functionapp-health/SKILL-TEMPLATE.md b/codebundles/azure-appservice-functionapp-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..d926174ed --- /dev/null +++ b/codebundles/azure-appservice-functionapp-health/SKILL-TEMPLATE.md @@ -0,0 +1,364 @@ +--- +name: azure-appservice-functionapp-health +kind: skill-template +description: Triages an Azure Function App and its workloads, checking its status and logs and verifying key metrics. Use when triaging or monitoring Azure, AppService, Health workloads with skill template `azu... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, AppService, Health] +resource_types: [app_service] +access: read-only +--- + +# Azure Function App Health + +## Summary + +Checks key Function App metrics, individual function invocations, service plan utilization, fetches logs, config and activities for the service and generates a report of present issues for any found. + +See [README.md](README.md) for additional context. + +## Tools + +### Check for Resource Health Issues Affecting Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch a list of issues that might affect the Function App as reported from Azure. 
+ +- **Robot task name**: Check for Resource Health Issues Affecting Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_resource_health.sh` +- **Tags**: `aks`, `resource`, `health`, `service`, `azure`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `FUNCTION_APP_NAME` +- **Writes**: `function_app_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze Function Failure Patterns for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Enhanced failure pattern analysis with temporal correlation and structured data collection. + +- **Robot task name**: Analyze Function Failure Patterns for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `function_failure_analysis.sh` +- **Tags**: `access:read-only`, `functionapp`, `failure-analysis`, `pattern-analysis`, `enhanced`, `data:logs-regexp` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Function App `${FUNCTION_APP_NAME}` Health in Resource Group `${AZ_RESOURCE_GROUP}` + +Checks the health status of an appservice workload. 
+ +- **Robot task name**: Check Function App `${FUNCTION_APP_NAME}` Health in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_health_metric.sh` +- **Tags**: `access:read-only`, `appservice`, `health`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `FUNCTION_APP_NAME` +- **Writes**: `function_app_health_check_metrics.json`, `function_app_health_check_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Function App `${FUNCTION_APP_NAME}` Plan Utilization Metrics In Resource Group `${AZ_RESOURCE_GROUP}` + +Reviews key metrics for the Function App plan and generates a report + +- **Robot task name**: Fetch Function App `${FUNCTION_APP_NAME}` Plan Utilization Metrics In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_plan_utilization_health.sh` +- **Tags**: `access:read-only`, `appservice`, `utilization`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `FUNCTION_APP_NAME` +- **Writes**: `function_app_plan_metrics.json`, `function_app_plan_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Individual Function Invocations Health for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Analyzes the health and metrics of individual function invocations, including execution counts, errors, throttles, and performance metrics. 
+ +- **Robot task name**: Check Individual Function Invocations Health for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `function_invocation_health.sh` +- **Tags**: `access:read-only`, `functionapp`, `functions`, `invocations`, `metrics`, `performance`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `FUNCTION_APP_NAME` +- **Writes**: `function_invocation_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get Function App `${FUNCTION_APP_NAME}` Logs and Analyze Errors In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch logs of appservice workload and analyze for errors + +- **Robot task name**: Get Function App `${FUNCTION_APP_NAME}` Logs and Analyze Errors In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_logs.sh` +- **Tags**: `appservice`, `logs`, `analysis`, `access:read-only`, `data:logs-regexp` +- **Reads**: `AZ_RESOURCE_GROUP`, `FUNCTION_APP_NAME` +- **Writes**: `function_app_log_issues_report.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Configuration Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch the configuration health of the Function App + +- **Robot task name**: Check Configuration Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_config_health.sh` +- **Tags**: `appservice`, `logs`, `tail`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `FUNCTION_APP_NAME` +- **Writes**: `az_function_app_config_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Deployment Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch deployment 
health of the Function App + +- **Robot task name**: Check Deployment Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_deployment_health.sh` +- **Tags**: `appservice`, `deployment`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `FUNCTION_APP_NAME` +- **Writes**: `deployment_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Function App `${FUNCTION_APP_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}` + +Gets the events of function app and checks for start/stop operations and errors + +- **Robot task name**: Fetch Function App `${FUNCTION_APP_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `functionapp_activities.sh` +- **Tags**: `appservice`, `monitor`, `events`, `errors`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZ_RESOURCE_GROUP`, `FUNCTION_APP_NAME` +- **Writes**: `function_app_activities_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Azure Recommendations and Notifications for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch Azure Advisor recommendations, Service Health notifications, and security assessments for the Function App + +- **Robot task name**: Fetch Azure Recommendations and Notifications for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_recommendations.sh` +- **Tags**: `appservice`, `recommendations`, `advisor`, `notifications`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `FUNCTION_APP_NAME` +- **Writes**: `function_app_recommendations_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Recent 
Activities for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Analyze recent Azure activities for the Function App, including critical operations and user actions. + +- **Robot task name**: Check Recent Activities for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `functionapp_activities.sh` +- **Tags**: `access:read-only`, `functionapp`, `activities`, `audit`, `data:logs-bulk` +- **Reads**: `AZ_RESOURCE_GROUP`, `FUNCTION_APP_NAME` +- **Writes**: `function_app_activities_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Diagnostic Logs for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Check for diagnostic logs configuration and search them for relevant events if they exist. + +- **Robot task name**: Check Diagnostic Logs for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `functionapp_diagnostic_logs.sh` +- **Tags**: `access:read-only`, `functionapp`, `diagnostic-logs`, `monitoring`, `data:logs-regexp` +- **Reads**: `AZ_RESOURCE_GROUP`, `FUNCTION_APP_NAME` +- **Writes**: `functionapp_diagnostic_logs.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Queries the health status of a Function App, and returns 0 when it's not healthy, and 1 when it is. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Check for Resource Health Issues Affecting Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch a list of issues that might affect the Function App as reported from Azure. 
+ +- **Robot task name**: Check for Resource Health Issues Affecting Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `resource_health` +- **Underlying script**: `appservice_resource_health.sh` +- **Tags**: `aks`, `resource`, `health`, `service`, `azure`, `data:config` +- **Reads**: — +- **Pass condition**: `"${resource_health_output_json["properties"]["title"]}" == "Available"` + + +#### Check Function App `${FUNCTION_APP_NAME}` Health Check Metrics In Resource Group `${AZ_RESOURCE_GROUP}` + +Checks the health check metric of an appservice workload. If issues are generated with severity 1 or 2, the score is 0 / unhealthy. + +- **Robot task name**: Check Function App `${FUNCTION_APP_NAME}` Health Check Metrics In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `health_checks` +- **Underlying script**: `appservice_health_metric.sh` +- **Tags**: `healthcheck`, `metric`, `appservice`, `data:config` +- **Reads**: — + + +#### Check Function App `${FUNCTION_APP_NAME}` Configuration Health In Resource Group `${AZ_RESOURCE_GROUP}` + +Checks the configuration health of an appservice workload. 1 = healthy, 0 = unhealthy. 
+ +- **Robot task name**: Check Function App `${FUNCTION_APP_NAME}` Configuration Health In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `configuration` +- **Underlying script**: `appservice_config_health.sh` +- **Tags**: `appservice`, `configuration`, `health`, `data:config` +- **Reads**: — + + +#### Check Deployment Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch deployment health of the Function App + +- **Robot task name**: Check Deployment Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `deployment_health` +- **Underlying script**: `appservice_deployment_health.sh` +- **Tags**: `appservice`, `deployment`, `data:config` +- **Reads**: — + + +#### Fetch Function App `${FUNCTION_APP_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}` + +Gets the events of appservice and checks for errors + +- **Robot task name**: Fetch Function App `${FUNCTION_APP_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `activities` +- **Underlying script**: `appservice_activities.sh` +- **Tags**: `appservice`, `monitor`, `events`, `errors`, `data:logs-bulk` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZ_RESOURCE_GROUP` | string | The resource group to perform actions against. | — | yes | +| `FUNCTION_APP_NAME` | string | The Azure AppService to triage. | — | yes | +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource. | `""` | no | +| `RW_LOOKBACK_WINDOW` | string | The time period, in minutes, to look back for activities/events. | `10` | no | +| `TIME_PERIOD_DAYS` | string | The time period, in days, to look back for recommendations and notifications. | `7` | no | +| `CPU_THRESHOLD` | string | The CPU % threshold in which to generate an issue. 
| `80` | no | +| `REQUESTS_THRESHOLD` | string | The threshold of requests/s in which to generate an issue. | `1000` | no | +| `BYTES_RECEIVED_THRESHOLD` | string | The threshold of received bytes/s in which to generate an issue. | `10485760` | no | +| `HTTP5XX_THRESHOLD` | string | The threshold of HTTP5XX/s in which to generate an issue. Higher than this value indicates a high error rate. | `5` | no | +| `HTTP2XX_THRESHOLD` | string | The threshold of HTTP2XX/s in which to generate an issue. Less than this value indicates low success rate. | `50` | no | +| `HTTP4XX_THRESHOLD` | string | The threshold of HTTP4XX/s in which to generate an issue. Higher than this value indicates high client error rate. | `200` | no | +| `DISK_USAGE_THRESHOLD` | string | The threshold of disk usage % in which to generate an issue. | `90` | no | +| `AVG_RSP_TIME` | string | The threshold of average response time (ms) in which to generate an issue. Higher than this value indicates slow response time. | `300` | no | +| `FUNCTION_ERROR_RATE_THRESHOLD` | string | The threshold of function error rate (%) in which to generate an issue. Higher than this value indicates high function error rate. | `10` | no | +| `FUNCTION_MEMORY_THRESHOLD` | string | The threshold of function memory usage (MB) in which to generate an issue. Higher than this value indicates high memory usage. | `512` | no | +| `FUNCTION_DURATION_THRESHOLD` | string | The threshold of function execution duration (ms) in which to generate an issue. Higher than this value indicates slow function execution. 
| `5000` | no | +| `EXECUTION_UNITS_COST_THRESHOLD` | string | Static threshold for execution units cost alerts - represents ~$500/month at default (default: 10000000) | `10000000` | no | +| `EXECUTION_UNITS_ANOMALY_MULTIPLIER` | string | Multiplier for anomaly detection - alerts when execution units are X times higher than baseline (default: 5) | `5` | no | +| `BASELINE_LOOKBACK_DAYS` | string | Number of days to look back for baseline calculation (default: 7) | `7` | no | +| `AZURE_SUBSCRIPTION_NAME` | string | The friendly name of the subscription ID. | `subscription-01` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `function_app_health.json` +- `function_app_health_check_metrics.json` +- `function_app_health_check_issues.json` +- `function_app_plan_metrics.json` +- `function_app_plan_issues.json` +- `function_invocation_health.json` +- `function_app_log_issues_report.json` +- `az_function_app_config_health.json` +- `deployment_health.json` +- `function_app_activities_issues.json` +- `function_app_recommendations_issues.json` +- `functionapp_diagnostic_logs.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-appservice-functionapp-health/runbook.robot` +- **Monitor**: `codebundles/azure-appservice-functionapp-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-appservice-functionapp-health +export AZ_RESOURCE_GROUP=... +export FUNCTION_APP_NAME=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export RW_LOOKBACK_WINDOW=... +export TIME_PERIOD_DAYS=... 
+export CPU_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-appservice-functionapp-health +export AZ_RESOURCE_GROUP=... +export FUNCTION_APP_NAME=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export RW_LOOKBACK_WINDOW=... +bash appservice_activities.sh +bash appservice_activities_enhanced.sh +bash appservice_config_health.sh +bash appservice_deployment_health.sh +bash appservice_health_metric.sh +bash appservice_logs.sh +bash appservice_plan_utilization_health.sh +bash appservice_recommendations.sh +bash appservice_recommendations_enhanced.sh +bash appservice_resource_health.sh +bash function_failure_analysis.sh +bash function_invocation_health.sh +# ... and 3 more scripts +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `appservice_activities.sh` — Bash helper script `appservice_activities.sh`. +- `appservice_activities_enhanced.sh` — Bash helper script `appservice_activities_enhanced.sh`. +- `appservice_config_health.sh` — Bash helper script `appservice_config_health.sh`. +- `appservice_deployment_health.sh` — Bash helper script `appservice_deployment_health.sh`. +- `appservice_health_metric.sh` — Bash helper script `appservice_health_metric.sh`. +- `appservice_logs.sh` — Bash helper script `appservice_logs.sh`. +- `appservice_plan_utilization_health.sh` — Bash helper script `appservice_plan_utilization_health.sh`. +- `appservice_recommendations.sh` — Bash helper script `appservice_recommendations.sh`. +- `appservice_recommendations_enhanced.sh` — Bash helper script `appservice_recommendations_enhanced.sh`. +- `appservice_resource_health.sh` — Bash helper script `appservice_resource_health.sh`. +- `function_failure_analysis.sh` — Bash helper script `function_failure_analysis.sh`. 
+- `function_invocation_health.sh` — Bash helper script `function_invocation_health.sh`. +- `function_invocation_logger.sh` — Bash helper script `function_invocation_logger.sh`. +- `functionapp_activities.sh` — Bash helper script `functionapp_activities.sh`. +- `functionapp_diagnostic_logs.sh` — Bash helper script `functionapp_diagnostic_logs.sh`. diff --git a/codebundles/azure-appservice-plan-health/SKILL-TEMPLATE.md b/codebundles/azure-appservice-plan-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..941426551 --- /dev/null +++ b/codebundles/azure-appservice-plan-health/SKILL-TEMPLATE.md @@ -0,0 +1,217 @@ +--- +name: azure-appservice-plan-health +kind: skill-template +description: Check Azure App Service Plan health by identifying availability issues, high usage issues, and providing scaling... Use when triaging or monitoring Azure, App, Service workloads with skill template... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, App, Service, Plan, Health] +resource_types: [app_service] +access: read-only +--- + +# Azure App Service Plan Health + +## Summary + +This codebundle runs a suite of metrics checks for App Service Plan health in Azure. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Check Azure App Service Plan Resource Health in resource group `${AZURE_RESOURCE_GROUP}` + +Check the Azure Resource Health API for any known issues affecting App Service Plans + +- **Robot task name**: Check Azure App Service Plan Resource Health in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `asp-health-check.sh` +- **Tags**: `AppServicePlan`, `Azure`, `Health`, `access:read-only`, `data:config` +- **Reads**: `AZURE_RESOURCE_GROUP`, `AZURE_SUBSCRIPTION_NAME` +- **Writes**: `asp_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check App Service Plan Capacity and Recommendations in resource group `${AZURE_RESOURCE_GROUP}` + +Check App Service Plan capacity, report high usage issues, and provide scaling recommendations + +- **Robot task name**: Check App Service Plan Capacity and Recommendations in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_appservice_plan_capacity.sh` +- **Tags**: `AppService`, `Azure`, `Capacity`, `Recommendations`, `access:read-only`, `data:config` +- **Reads**: `AZURE_RESOURCE_GROUP`, `AZURE_SUBSCRIPTION_NAME` +- **Writes**: `asp_high_usage_metrics.json`, `asp_recommendations.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze App Service Plan Cost Optimization Opportunities in resource group `${AZURE_RESOURCE_GROUP}` + +Analyzes 30-day utilization trends using Azure Monitor to identify underutilized App Service Plans with cost savings opportunities. Provides Azure pricing-based estimates for potential monthly and annual savings with severity bands: Sev4 <$2k/month, Sev3 $2k-$10k/month, Sev2 >$10k/month. 
+ +- **Robot task name**: Analyze App Service Plan Cost Optimization Opportunities in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `asp_cost_optimization.sh` +- **Tags**: `AppServicePlan`, `cost-optimization`, `underutilization`, `azure-monitor`, `pricing`, `access:read-only`, `data:config` +- **Reads**: `TIMEOUT_SECONDS` +- **Writes**: `asp_cost_optimization_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze App Service Plan Weekly Utilization Trends in resource group `${AZURE_RESOURCE_GROUP}` + +Analyzes week-over-week utilization trends for App Service Plans including CPU, memory, request counts, HTTP error rates, and response times. Detects growth patterns that may indicate scaling needs or performance issues. + +- **Robot task name**: Analyze App Service Plan Weekly Utilization Trends in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `asp_weekly_trend_analysis.sh` +- **Tags**: `AppServicePlan`, `Azure`, `Trends`, `Utilization`, `Performance`, `access:read-only`, `data:config` +- **Reads**: `TIMEOUT_SECONDS` +- **Writes**: `asp_weekly_trend_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check App Service Plan Changes in resource group `${AZURE_RESOURCE_GROUP}` + +Lists App Service Plan changes and operations from Azure Activity Log + +- **Robot task name**: Check App Service Plan Changes in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `asp-audit.sh` +- **Tags**: `AppServicePlan`, `Azure`, `Audit`, `Security`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_RESOURCE_GROUP`, `AZURE_SUBSCRIPTION_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Check Azure App Service Plan health by identifying availability 
issues, high capacity usage + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Count App Service Plans with Health Status of `Available` in resource group `${AZURE_RESOURCE_GROUP}` + +Count Azure App Service Plans with health status of `Available` + +- **Robot task name**: Count App Service Plans with Health Status of `Available` in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `availability` +- **Underlying script**: `asp-health-check.sh` +- **Tags**: `AppServicePlan`, `Azure`, `Health`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `int(${count}) >= 1` + + +#### Count App Service Plans with High Capacity Usage in resource group `${AZURE_RESOURCE_GROUP}` + +Count App Service Plans with high CPU, memory, or disk queue usage + +- **Robot task name**: Count App Service Plans with High Capacity Usage in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `capacity_usage` +- **Underlying script**: `check_appservice_plan_capacity.sh` +- **Tags**: `AppService`, `Azure`, `Health`, `access:read-only`, `data:config` +- **Reads**: `MAX_HIGH_USAGE_APP_SERVICE_PLAN` +- **Pass condition**: `int(${count}) <= int(${MAX_HIGH_USAGE_APP_SERVICE_PLAN})` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_RESOURCE_GROUP` | string | Azure resource group. | — | yes | +| `AZURE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource. | `""` | no | +| `AZURE_SUBSCRIPTION_NAME` | string | Azure subscription name. 
| `""` | no | +| `AZURE_ACTIVITY_LOG_OFFSET` | string | Time offset for activity log collection (e.g., 24h, 7d) (default: 24h) | `24h` | no | +| `CPU_THRESHOLD` | string | CPU usage threshold percentage for high usage alerts (default: 80) | `80` | no | +| `MEMORY_THRESHOLD` | string | Memory usage threshold percentage for high usage alerts (default: 80) | `80` | no | +| `DISK_QUEUE_THRESHOLD` | string | Disk queue length threshold for high usage alerts (default: 10) | `10` | no | +| `SCALE_UP_CPU_THRESHOLD` | string | CPU usage threshold percentage for scale up recommendations (default: 70) | `70` | no | +| `SCALE_UP_MEMORY_THRESHOLD` | string | Memory usage threshold percentage for scale up recommendations (default: 70) | `70` | no | +| `SCALE_DOWN_CPU_THRESHOLD` | string | CPU usage threshold percentage for scale down recommendations (default: 30) | `30` | no | +| `SCALE_DOWN_MEMORY_THRESHOLD` | string | Memory usage threshold percentage for scale down recommendations (default: 30) | `30` | no | +| `METRICS_OFFSET` | string | Time offset for metrics collection (e.g., 24h, 7d) (default: 24h) | `24h` | no | +| `METRICS_INTERVAL` | string | Metrics collection interval (e.g., PT1H, PT5M) (default: PT1H) | `PT1H` | no | +| `LOOKBACK_WEEKS` | string | Number of weeks to analyze for trend analysis (default: 4) | `4` | no | +| `TIMEOUT_SECONDS` | string | Timeout in seconds for tasks (default: 900). | `900` | no | +| `MAX_UNUSED_DISK` | string | The maximum number of unused disks allowed in the subscription. | `0` | no | +| `MAX_UNUSED_SNAPSHOT` | string | The maximum number of unused snapshots allowed in the subscription. | `0` | no | +| `UNUSED_STORAGE_ACCOUNT_TIMEFRAME` | string | The timeframe in hours to check for unused storage accounts (e.g., 720 for 30 days) | `24` | no | +| `MAX_UNUSED_STORAGE_ACCOUNT` | string | The maximum number of unused storage accounts allowed in the subscription. 
| `0` | no | +| `MAX_PUBLIC_ACCESS_STORAGE_ACCOUNT` | string | The maximum number of storage accounts with public access allowed in the subscription. | `0` | no | +| `MAX_HIGH_USAGE_APP_SERVICE_PLAN` | string | The maximum number of high usage App Service Plans allowed in the subscription. | `0` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `asp_health.json` +- `asp_high_usage_metrics.json` +- `asp_recommendations.json` +- `asp_cost_optimization_issues.json` +- `asp_weekly_trend_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-appservice-plan-health/runbook.robot` +- **Monitor**: `codebundles/azure-appservice-plan-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-appservice-plan-health +export AZURE_RESOURCE_GROUP=... +export AZURE_SUBSCRIPTION_ID=... +export AZURE_SUBSCRIPTION_NAME=... +export AZURE_ACTIVITY_LOG_OFFSET=... +export CPU_THRESHOLD=... +export MEMORY_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-appservice-plan-health +export AZURE_RESOURCE_GROUP=... +export AZURE_SUBSCRIPTION_ID=... +export AZURE_SUBSCRIPTION_NAME=... +export AZURE_ACTIVITY_LOG_OFFSET=... 
+bash asp-audit.sh +bash asp-health-check.sh +bash asp_cost_optimization.sh +bash asp_weekly_trend_analysis.sh +bash check_appservice_plan_capacity.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `asp-audit.sh` — Bash helper script `asp-audit.sh`. +- `asp-health-check.sh` — Bash helper script `asp-health-check.sh`. +- `asp_cost_optimization.sh` — Bash helper script `asp_cost_optimization.sh`. +- `asp_weekly_trend_analysis.sh` — Bash helper script `asp_weekly_trend_analysis.sh`. +- `check_appservice_plan_capacity.sh` — Bash helper script `check_appservice_plan_capacity.sh`. diff --git a/codebundles/azure-appservice-webapp-health/SKILL-TEMPLATE.md b/codebundles/azure-appservice-webapp-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..c8d02cf27 --- /dev/null +++ b/codebundles/azure-appservice-webapp-health/SKILL-TEMPLATE.md @@ -0,0 +1,337 @@ +--- +name: azure-appservice-webapp-health +kind: skill-template +description: Triages an Azure App Service and its workloads, checking its status and logs and verifying key metrics. Use when triaging or monitoring Azure, AppService, Triage workloads with skill template `azur... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, AppService, Triage] +resource_types: [app_service] +access: read-only +--- + +# Azure App Service Webapp Health + +## Summary + +Checks key App Service metrics and the service plan, fetches logs, config and activities for the service and generates a report of present issues for any found. + +See [README.md](README.md) for additional context. + +## Tools + +### Check for Resource Health Issues Affecting App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch a list of issues that might affect the APP Service as reported from Azure. 
+ +- **Robot task name**: Check for Resource Health Issues Affecting App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_resource_health.sh` +- **Tags**: `aks`, `resource`, `health`, `service`, `azure`, `access:read-only`, `data:config` +- **Reads**: `APP_SERVICE_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `app_service_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check App Service `${APP_SERVICE_NAME}` Health in Resource Group `${AZ_RESOURCE_GROUP}` + +Checks the health status of an appservice workload. + +- **Robot task name**: Check App Service `${APP_SERVICE_NAME}` Health in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_health_metric.sh` +- **Tags**: `access:read-only`, `appservice`, `health`, `data:config` +- **Reads**: `APP_SERVICE_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `app_service_health_check_metrics.json`, `app_service_health_check_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch App Service `${APP_SERVICE_NAME}` Utilization Metrics In Resource Group `${AZ_RESOURCE_GROUP}` + +Reviews all key metrics (CPU, Requests, Bandwidth, HTTP status codes, Threads, Disk, Response Time) for the last 30 minutes with 5-minute intervals + +- **Robot task name**: Fetch App Service `${APP_SERVICE_NAME}` Utilization Metrics In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_metric_health.sh` +- **Tags**: `access:read-only`, `appservice`, `utilization`, `metrics`, `data:config` +- **Reads**: `APP_SERVICE_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `app_service_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get App Service `${APP_SERVICE_NAME}` Logs In Resource Group `${AZ_RESOURCE_GROUP}` + 
+Download and display recent raw log files from App Service (last 50 lines from each log file) + +- **Robot task name**: Get App Service `${APP_SERVICE_NAME}` Logs In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_logs.sh` +- **Tags**: `appservice`, `logs`, `display`, `raw`, `access:read-only`, `data:logs-bulk` +- **Reads**: `APP_SERVICE_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Configuration Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch the configuration health of the App Service + +- **Robot task name**: Check Configuration Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_config_health.sh` +- **Tags**: `appservice`, `logs`, `tail`, `access:read-only`, `data:config` +- **Reads**: `APP_SERVICE_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `az_app_service_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Deployment Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch deployment health of the App Service + +- **Robot task name**: Check Deployment Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_deployment_health.sh` +- **Tags**: `appservice`, `deployment`, `access:read-only`, `data:config` +- **Reads**: `APP_SERVICE_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `deployment_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch App Service `${APP_SERVICE_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}` + +Gets the events of appservice and checks for errors + +- **Robot task name**: Fetch App Service 
`${APP_SERVICE_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_activities.sh` +- **Tags**: `appservice`, `monitor`, `events`, `errors`, `access:read-only`, `data:logs-bulk` +- **Reads**: `APP_SERVICE_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `app_service_activities_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Recent Activities for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Analyze recent Azure activities for the App Service, including critical operations and user actions. + +- **Robot task name**: Check Recent Activities for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_activities_enhanced.sh` +- **Tags**: `access:read-only`, `appservice`, `activities`, `audit`, `data:logs-bulk` +- **Reads**: `APP_SERVICE_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `app_service_activities_enhanced.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Recommendations and Notifications for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch Azure Advisor, Service Health, and Security Center recommendations for the App Service. 
+ +- **Robot task name**: Check Recommendations and Notifications for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_recommendations.sh` +- **Tags**: `access:read-only`, `appservice`, `recommendations`, `notifications`, `data:config` +- **Reads**: `APP_SERVICE_NAME`, `AZURE_SUBSCRIPTION_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `app_service_recommendations.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Diagnostic Logs for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Check diagnostic settings, query Log Analytics and Application Insights for errors and failed requests + +- **Robot task name**: Check Diagnostic Logs for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_diagnostic_logs.sh` +- **Tags**: `appservice`, `logs`, `diagnostics`, `analysis`, `azure-monitor`, `access:read-only`, `data:logs-regexp` +- **Reads**: `APP_SERVICE_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `app_service_diagnostic_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Logs for Errors in App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Analyze App Service logs for errors using Azure Monitor APIs and Application Insights - creates structured issues for detected problems + +- **Robot task name**: Check Logs for Errors in App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_log_analysis.sh` +- **Tags**: `appservice`, `logs`, `errors`, `analysis`, `azure-monitor`, `access:read-only`, `data:logs-regexp` +- **Reads**: `APP_SERVICE_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `app_service_log_issues_report.json` +- **Issues raised**: issues 
reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Queries the health status of an App Service, and returns 0 when it's not healthy, and 1 when it is. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Check for Resource Health Issues Affecting App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch a list of issues that might affect the APP Service as reported from Azure. + +- **Robot task name**: Check for Resource Health Issues Affecting App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `resource_health` +- **Underlying script**: `appservice_resource_health.sh` +- **Tags**: `aks`, `resource`, `health`, `service`, `azure`, `data:config` +- **Reads**: `APP_SERVICE_RUNNING` +- **Pass condition**: `"${resource_health_output_json["properties"]["title"]}" == "Available"` + + +#### Check App Service `${APP_SERVICE_NAME}` Health Check Metrics In Resource Group `${AZ_RESOURCE_GROUP}` + +Checks the health check metric of an appservice workload. If issues are generated with severity 1 or 2, the score is 0 / unhealthy. + +- **Robot task name**: Check App Service `${APP_SERVICE_NAME}` Health Check Metrics In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `health_checks` +- **Underlying script**: `appservice_health_metric.sh` +- **Tags**: `healthcheck`, `metric`, `appservice`, `data:config` +- **Reads**: `APP_SERVICE_RUNNING` + + +#### Check App Service `${APP_SERVICE_NAME}` Configuration Health In Resource Group `${AZ_RESOURCE_GROUP}` + +Checks the configuration health of an appservice workload. 1 = healthy, 0 = unhealthy. 
+ +- **Robot task name**: Check App Service `${APP_SERVICE_NAME}` Configuration Health In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `configuration` +- **Underlying script**: `appservice_config_health.sh` +- **Tags**: `appservice`, `configuration`, `health`, `data:config` +- **Reads**: `APP_SERVICE_RUNNING` + + +#### Check Deployment Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch deployment health of the App Service + +- **Robot task name**: Check Deployment Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `deployment_health` +- **Underlying script**: `appservice_deployment_health.sh` +- **Tags**: `appservice`, `deployment`, `data:config` +- **Reads**: `APP_SERVICE_RUNNING` + + +#### Fetch App Service `${APP_SERVICE_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}` + +Gets the events of appservice and checks for errors + +- **Robot task name**: Fetch App Service `${APP_SERVICE_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `activities` +- **Underlying script**: `appservice_activities.sh` +- **Tags**: `appservice`, `monitor`, `events`, `errors`, `data:logs-bulk` +- **Reads**: `APP_SERVICE_RUNNING` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZ_RESOURCE_GROUP` | string | The resource group to perform actions against. | — | yes | +| `APP_SERVICE_NAME` | string | The Azure AppService to triage. | — | yes | +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource. | `""` | no | +| `RW_LOOKBACK_WINDOW` | string | The time period, in minutes, to look back for activities/events. | `10` | no | +| `CPU_THRESHOLD` | string | The CPU % threshold in which to generate an issue. | `80` | no | +| `REQUESTS_THRESHOLD` | string | The threshold of requests/s in which to generate an issue. 
| `1000` | no | +| `BYTES_RECEIVED_THRESHOLD` | string | The threshold of received bytes/s in which to generate an issue. | `10485760` | no | +| `HTTP5XX_THRESHOLD` | string | The threshold of HTTP5XX/s in which to generate an issue. Higher than this value indicates a high error rate. | `5` | no | +| `HTTP2XX_THRESHOLD` | string | The threshold of HTTP2XX/s in which to generate an issue. Less than this value indicates low success rate. | `50` | no | +| `HTTP4XX_THRESHOLD` | string | The threshold of HTTP4XX/s in which to generate an issue. Higher than this value indicates high client error rate. | `200` | no | +| `DISK_USAGE_THRESHOLD` | string | The threshold of disk usage % in which to generate an issue. | `90` | no | +| `AVG_RSP_TIME` | string | The threshold of average response time (ms) in which to generate an issue. Higher than this value indicates slow response time. | `300` | no | +| `AZURE_SUBSCRIPTION_NAME` | string | The friendly name of the subscription ID. | `subscription-01` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `app_service_health.json` +- `app_service_health_check_metrics.json` +- `app_service_health_check_issues.json` +- `app_service_issues.json` +- `az_app_service_health.json` +- `deployment_health.json` +- `app_service_activities_issues.json` +- `app_service_activities_enhanced.json` +- `app_service_recommendations.json` +- `app_service_diagnostic_issues.json` +- `app_service_log_issues_report.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. 
+ +- **Runbook**: `codebundles/azure-appservice-webapp-health/runbook.robot` +- **Monitor**: `codebundles/azure-appservice-webapp-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-appservice-webapp-health +export AZ_RESOURCE_GROUP=... +export APP_SERVICE_NAME=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export RW_LOOKBACK_WINDOW=... +export CPU_THRESHOLD=... +export REQUESTS_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-appservice-webapp-health +export AZ_RESOURCE_GROUP=... +export APP_SERVICE_NAME=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export RW_LOOKBACK_WINDOW=... +bash appservice_activities.sh +bash appservice_activities_enhanced.sh +bash appservice_config_health.sh +bash appservice_deployment_health.sh +bash appservice_diagnostic_logs.sh +bash appservice_health_metric.sh +bash appservice_log_analysis.sh +bash appservice_logs.sh +bash appservice_metric_health.sh +bash appservice_recommendations.sh +bash appservice_resource_health.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `appservice_activities.sh` — Bash helper script `appservice_activities.sh`. +- `appservice_activities_enhanced.sh` — Bash helper script `appservice_activities_enhanced.sh`. +- `appservice_config_health.sh` — Bash helper script `appservice_config_health.sh`. +- `appservice_deployment_health.sh` — Bash helper script `appservice_deployment_health.sh`. +- `appservice_diagnostic_logs.sh` — Bash helper script `appservice_diagnostic_logs.sh`. +- `appservice_health_metric.sh` — Bash helper script `appservice_health_metric.sh`. +- `appservice_log_analysis.sh` — Bash helper script `appservice_log_analysis.sh`. 
+- `appservice_logs.sh` — Bash helper script `appservice_logs.sh`. +- `appservice_metric_health.sh` — Bash helper script `appservice_metric_health.sh`. +- `appservice_recommendations.sh` — Bash helper script `appservice_recommendations.sh`. +- `appservice_resource_health.sh` — Bash helper script `appservice_resource_health.sh`. diff --git a/codebundles/azure-appservice-webapp-ops/SKILL-TEMPLATE.md b/codebundles/azure-appservice-webapp-ops/SKILL-TEMPLATE.md new file mode 100644 index 000000000..4da5751e6 --- /dev/null +++ b/codebundles/azure-appservice-webapp-ops/SKILL-TEMPLATE.md @@ -0,0 +1,193 @@ +--- +name: azure-appservice-webapp-ops +kind: skill-template +description: Operational tasks for an Azure App Service, such as restarting, scaling or re-deploying. Use when triaging or monitoring Azure, AppService, Ops workloads with skill template `azure-appservice-weba... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, AppService, Ops] +resource_types: [app_service] +access: read-only +--- + +# Azure App Service Operations + +## Summary + +- Checks whether the plan supports deployment slots (Standard or Premium tier). + +See [README.md](README.md) for additional context. + +## Tools + +### Restart App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Restarts the Azure App Service and verifies success. 
+ +- **Robot task name**: Restart App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_restart.sh` +- **Tags**: — +- **Reads**: `APP_SERVICE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Swap Deployment Slots for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Calls the script that checks plan tier, lists slots, auto-determines source/target if only one non-prod slot + +- **Robot task name**: Swap Deployment Slots for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_slot_swap.sh` +- **Tags**: — +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Up App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Scales up the App Service to the next plan from current SKU + +- **Robot task name**: Scale Up App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_plan_scaleup.sh` +- **Tags**: — +- **Reads**: `APP_SERVICE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Down App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` + +Decreases SKU based on a predefined map (e.g. S2->S1, S1->B3, etc.) 
+ +- **Robot task name**: Scale Down App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_plan_scaledown.sh` +- **Tags**: — +- **Reads**: `APP_SERVICE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Out Instances for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` by ${SCALE_OUT_FACTOR}x + +Multiplies current worker count by SCALE_OUT_FACTOR + +- **Robot task name**: Scale Out Instances for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` by ${SCALE_OUT_FACTOR}x +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_scale_out.sh` +- **Tags**: — +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale In Instances for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` to 1/${SCALE_IN_FACTOR} + +Decreases the number of instances within the current App Service Plan + +- **Robot task name**: Scale In Instances for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` to 1/${SCALE_IN_FACTOR} +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_scale_in.sh` +- **Tags**: — +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Redeploy App Service `${APP_SERVICE_NAME}` from Latest Source in Resource Group `${AZ_RESOURCE_GROUP}` + +Forces a re-deployment of the Azure App Service from the configured code or container source. 
+ +- **Robot task name**: Redeploy App Service `${APP_SERVICE_NAME}` from Latest Source in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `appservice_redeploy.sh` +- **Tags**: — +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZ_RESOURCE_GROUP` | string | The resource group to perform actions against. | — | yes | +| `APP_SERVICE_NAME` | string | The Azure AppService to triage. | — | yes | +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource. | `""` | no | +| `RW_LOOKBACK_WINDOW` | string | The time period, in minutes, to look back for activities/events. | `10` | no | +| `SCALE_OUT_FACTOR` | string | The factor by which to increase the amount of instances within the given App Service Plan. | `2` | no | +| `SCALE_IN_FACTOR` | string | The factor by which to decrease the amount of instances within the given App Service Plan. | `2` | no | +| `SOURCE_SLOT` | string | The source slot for deployment promotion. | `""` | no | +| `TARGET_SLOT` | string | The target slot for deployment promotion. | `""` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-appservice-webapp-ops/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-appservice-webapp-ops +export AZ_RESOURCE_GROUP=... +export APP_SERVICE_NAME=... 
+export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export RW_LOOKBACK_WINDOW=... +export SCALE_OUT_FACTOR=... +export SCALE_IN_FACTOR=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-appservice-webapp-ops +export AZ_RESOURCE_GROUP=... +export APP_SERVICE_NAME=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export RW_LOOKBACK_WINDOW=... +bash appservice_logs.sh +bash appservice_plan_scaledown.sh +bash appservice_plan_scaleup.sh +bash appservice_redeploy.sh +bash appservice_restart.sh +bash appservice_scale_in.sh +bash appservice_scale_out.sh +bash appservice_slot_swap.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `appservice_logs.sh` — Bash helper script `appservice_logs.sh`. +- `appservice_plan_scaledown.sh` — Bash helper script `appservice_plan_scaledown.sh`. +- `appservice_plan_scaleup.sh` — Bash helper script `appservice_plan_scaleup.sh`. +- `appservice_redeploy.sh` — Bash helper script `appservice_redeploy.sh`. +- `appservice_restart.sh` — Bash helper script `appservice_restart.sh`. +- `appservice_scale_in.sh` — Bash helper script `appservice_scale_in.sh`. +- `appservice_scale_out.sh` — Bash helper script `appservice_scale_out.sh`. +- `appservice_slot_swap.sh` — Bash helper script `appservice_slot_swap.sh`. diff --git a/codebundles/azure-databricks-cost-optimization/SKILL-TEMPLATE.md b/codebundles/azure-databricks-cost-optimization/SKILL-TEMPLATE.md new file mode 100644 index 000000000..46710e657 --- /dev/null +++ b/codebundles/azure-databricks-cost-optimization/SKILL-TEMPLATE.md @@ -0,0 +1,102 @@ +--- +name: azure-databricks-cost-optimization +kind: skill-template +description: Azure Databricks Cost Optimization: Analyzes Databricks workspaces and clusters to identify cost optimization... Use when triaging or monitoring Azure, Cost, Optimization workloads with skill templ... 
+runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Cost, Optimization, Databricks, Spark, Clusters, Auto-Termination] +resource_types: [databricks_workspace] +access: read-only +--- + +# Azure Databricks Cost Optimization + +## Summary + +This codebundle analyzes Azure Databricks workspaces and clusters to identify cost optimization opportunities. + +See [README.md](README.md) for additional context. + +## Tools + +### Analyze Databricks Cluster Auto-Termination and Over-Provisioning Opportunities in Resource Group `${AZURE_RESOURCE_GROUPS}` for Subscription `${AZURE_SUBSCRIPTION_NAME}` + +Analyzes Azure Databricks workspaces and clusters across specified subscriptions to identify cost optimization opportunities. Focuses on: 1) Clusters without auto-termination configured or running idle, 2) Over-provisioned clusters with low CPU/memory utilization. Calculates both VM costs and DBU (Databricks Unit) costs to provide accurate savings estimates. + +- **Robot task name**: Analyze Databricks Cluster Auto-Termination and Over-Provisioning Opportunities in Resource Group `${AZURE_RESOURCE_GROUPS}` for Subscription `${AZURE_SUBSCRIPTION_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `analyze_databricks_cluster_optimization.sh` +- **Tags**: `Azure`, `Cost`, `Optimization`, `Databricks`, `Spark`, `Clusters`, `Auto-Termination`, `access:read-only`, `data:config` +- **Reads**: `TIMEOUT_SECONDS` +- **Writes**: `databricks_cluster_optimization_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_SUBSCRIPTION_IDS` | string | Comma-separated list of Azure subscription IDs to analyze for Databricks optimization. 
| `""` | no | +| `AZURE_RESOURCE_GROUPS` | string | Comma-separated list of resource groups to analyze (leave empty to analyze all resource groups in the subscription) | `""` | no | +| `AZURE_SUBSCRIPTION_NAME` | string | Azure subscription name for reporting purposes | `""` | no | +| `COST_ANALYSIS_LOOKBACK_DAYS` | string | Number of days to look back for utilization analysis (default: 30) | `30` | no | +| `LOW_COST_THRESHOLD` | string | Monthly savings threshold for LOW classification (default: 500) | `500` | no | +| `MEDIUM_COST_THRESHOLD` | string | Monthly savings threshold for MEDIUM classification (default: 2000) | `2000` | no | +| `HIGH_COST_THRESHOLD` | string | Monthly savings threshold for HIGH classification (default: 10000) | `10000` | no | +| `AZURE_DISCOUNT_PERCENTAGE` | string | Discount percentage off MSRP for Azure services (default: 0) | `0` | no | +| `TIMEOUT_SECONDS` | string | Timeout in seconds for tasks (default: 1500 = 25 minutes). | `1500` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- `databricks_cluster_optimization_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-databricks-cost-optimization/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-databricks-cost-optimization +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_RESOURCE_GROUPS=... +export AZURE_SUBSCRIPTION_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +export LOW_COST_THRESHOLD=... +export MEDIUM_COST_THRESHOLD=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-databricks-cost-optimization +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_RESOURCE_GROUPS=... +export AZURE_SUBSCRIPTION_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +bash analyze_databricks_cluster_optimization.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `analyze_databricks_cluster_optimization.sh` — Bash helper script `analyze_databricks_cluster_optimization.sh`. diff --git a/codebundles/azure-devops-organization-health/SKILL-TEMPLATE.md b/codebundles/azure-devops-organization-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..b01f72264 --- /dev/null +++ b/codebundles/azure-devops-organization-health/SKILL-TEMPLATE.md @@ -0,0 +1,188 @@ +--- +name: azure-devops-organization-health +kind: skill-template +description: Comprehensive Azure DevOps organization health monitoring focusing on platform-wide issues and shared resources. Use when triaging or monitoring AzureDevOps, CICD workloads with skill template `azu... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [AzureDevOps, CICD] +resource_types: [azure_devops] +access: read-only +--- + +# Azure DevOps Organization Health + +## Summary + +This codebundle provides comprehensive health monitoring for Azure DevOps organizations, focusing on platform-wide issues, shared resources, and organizational capacity management. + +See [README.md](README.md) for additional context. + +## Tools + +### Check Service Health Status for Azure DevOps Organization `${AZURE_DEVOPS_ORG}` + +Tests connectivity and access to core Azure DevOps APIs and services. Identifies service issues vs permission limitations. 
+ +- **Robot task name**: Check Service Health Status for Azure DevOps Organization `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `organization-service-health.sh` +- **Tags**: `Organization`, `Service`, `Health`, `Platform`, `access:read-only`, `data:logs-config` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `organization_service_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Agent Pool Capacity and Utilization for Organization `${AZURE_DEVOPS_ORG}` + +Analyzes self-hosted agent pools for capacity issues including offline agents, utilization thresholds, and configuration problems. + +- **Robot task name**: Check Agent Pool Capacity and Utilization for Organization `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `agent-pool-capacity.sh` +- **Tags**: `Organization`, `AgentPools`, `Capacity`, `Distribution`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `agent_pool_capacity.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Validate Organization Policies and Security Settings for `${AZURE_DEVOPS_ORG}` + +Examines organization security groups, user access levels, and policy configurations. Requires elevated permissions for full analysis. 
+ +- **Robot task name**: Validate Organization Policies and Security Settings for `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `organization-policies.sh` +- **Tags**: `Organization`, `Policies`, `Compliance`, `Security`, `access:read-only`, `data:logs-config` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `organization_policies.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check License Utilization and Capacity for Organization `${AZURE_DEVOPS_ORG}` + +Analyzes user license assignments for cost optimization opportunities and identifies inactive users or licensing inefficiencies. + +- **Robot task name**: Check License Utilization and Capacity for Organization `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `license-utilization.sh` +- **Tags**: `Organization`, `Licenses`, `Capacity`, `Utilization`, `access:read-only`, `data:logs-config` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `license_utilization.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Investigate Platform-wide Service Incidents for Organization `${AZURE_DEVOPS_ORG}` + +Monitors Azure DevOps platform status and detects service-wide incidents by checking official status pages and API performance. 
+ +- **Robot task name**: Investigate Platform-wide Service Incidents for Organization `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service-incident-check.sh` +- **Tags**: `Organization`, `Incidents`, `Platform`, `Service`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `service_incidents.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze Cross-Project Dependencies for Organization `${AZURE_DEVOPS_ORG}` + +Identifies shared resources between projects including agent pools, service connections, and potential naming conflicts. + +- **Robot task name**: Analyze Cross-Project Dependencies for Organization `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `cross-project-dependencies.sh` +- **Tags**: `Organization`, `Dependencies`, `Projects`, `Integration`, `access:read-only`, `data:logs-config` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `cross_project_dependencies.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Investigate Platform Issues for Organization `${AZURE_DEVOPS_ORG}` + +Performs detailed investigation of agent pool issues and analyzes recent pipeline failures across all projects. + +- **Robot task name**: Investigate Platform Issues for Organization `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `platform-issue-investigation.sh` +- **Tags**: `Organization`, `Investigation`, `Platform`, `Performance`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `platform_issue_investigation.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_DEVOPS_ORG` | string | Azure DevOps organization name. 
| — | yes | +| `AGENT_UTILIZATION_THRESHOLD` | string | Agent pool utilization threshold percentage (0-100) above which capacity issues are flagged. | `80` | no | +| `LICENSE_UTILIZATION_THRESHOLD` | string | License utilization threshold percentage (0-100) above which licensing issues are flagged. | `90` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- `organization_service_health.json` +- `agent_pool_capacity.json` +- `organization_policies.json` +- `license_utilization.json` +- `service_incidents.json` +- `cross_project_dependencies.json` +- `platform_issue_investigation.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-devops-organization-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-devops-organization-health +export AZURE_DEVOPS_ORG=... +export AGENT_UTILIZATION_THRESHOLD=... +export LICENSE_UTILIZATION_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-devops-organization-health +export AZURE_DEVOPS_ORG=... +export AGENT_UTILIZATION_THRESHOLD=... +export LICENSE_UTILIZATION_THRESHOLD=... +bash agent-pool-capacity.sh +bash cross-project-dependencies.sh +bash license-utilization.sh +bash organization-policies.sh +bash organization-service-health.sh +bash platform-issue-investigation.sh +bash service-incident-check.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `agent-pool-capacity.sh` — Bash helper script `agent-pool-capacity.sh`. 
+- `cross-project-dependencies.sh` — Bash helper script `cross-project-dependencies.sh`. +- `license-utilization.sh` — Bash helper script `license-utilization.sh`. +- `organization-policies.sh` — Bash helper script `organization-policies.sh`. +- `organization-service-health.sh` — Bash helper script `organization-service-health.sh`. +- `platform-issue-investigation.sh` — Bash helper script `platform-issue-investigation.sh`. +- `service-incident-check.sh` — Bash helper script `service-incident-check.sh`. diff --git a/codebundles/azure-devops-project-health/SKILL-TEMPLATE.md b/codebundles/azure-devops-project-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..d6a579ade --- /dev/null +++ b/codebundles/azure-devops-project-health/SKILL-TEMPLATE.md @@ -0,0 +1,229 @@ +--- +name: azure-devops-project-health +kind: skill-template +description: Comprehensive Azure DevOps project health monitoring with conditional deep investigation. Use when triaging or monitoring Azure, DevOps, Projects workloads with skill template `azure-devops-project... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, DevOps, Projects, Health] +resource_types: [azure_devops] +access: read-only +--- + +# Azure DevOps Project Health + +## Summary + +This codebundle monitors Azure DevOps project health across multiple projects, identifying issues with pipelines, agent pools, repository policies, and service connections. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Check Agent Pool Availability Across Projects in `${AZURE_DEVOPS_ORG}` + +Check agent pool health and capacity issues + +- **Robot task name**: Check Agent Pool Availability Across Projects in `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `agent-pools.sh` +- **Tags**: `DevOps`, `Azure`, `Health`, `access:read-only`, `data:logs-config` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `agent_pools_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check for Failed Pipelines Across Projects in `${AZURE_DEVOPS_ORG}` + +Identify failed pipeline runs with detailed logs + +- **Robot task name**: Check for Failed Pipelines Across Projects in `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `pipeline-logs.sh` +- **Tags**: `DevOps`, `Azure`, `Pipelines`, `Failures`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `pipeline_logs_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check for Long-Running Pipelines Across Projects in `${AZURE_DEVOPS_ORG}` (Threshold: ${DURATION_THRESHOLD}) + +Identify pipelines exceeding duration thresholds + +- **Robot task name**: Check for Long-Running Pipelines Across Projects in `${AZURE_DEVOPS_ORG}` (Threshold: ${DURATION_THRESHOLD}) +- **Robot file**: `runbook.robot` +- **Underlying script**: `long-running-pipelines.sh` +- **Tags**: `DevOps`, `Azure`, `Pipelines`, `Performance`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_ORG`, `DURATION_THRESHOLD` +- **Writes**: `long_running_pipelines.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check for Queued Pipelines Across Projects in `${AZURE_DEVOPS_ORG}` (Threshold: ${QUEUE_THRESHOLD}) + +Identify pipelines queued beyond threshold limits + +- **Robot task name**: Check for Queued Pipelines Across Projects 
in `${AZURE_DEVOPS_ORG}` (Threshold: ${QUEUE_THRESHOLD}) +- **Robot file**: `runbook.robot` +- **Underlying script**: `queued-pipelines.sh` +- **Tags**: `DevOps`, `Azure`, `Pipelines`, `Queue`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_ORG`, `QUEUE_THRESHOLD` +- **Writes**: `queued_pipelines.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Repository Branch Policies Across Projects in `${AZURE_DEVOPS_ORG}` + +Verify repository branch policies compliance + +- **Robot task name**: Check Repository Branch Policies Across Projects in `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `repo-policies.sh` +- **Tags**: `DevOps`, `Azure`, `Repository`, `Policies`, `access:read-only`, `data:logs-config` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `repo_policies_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Service Connection Health Across Projects in `${AZURE_DEVOPS_ORG}` + +Verify service connection availability and readiness + +- **Robot task name**: Check Service Connection Health Across Projects in `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service-connections.sh` +- **Tags**: `DevOps`, `Azure`, `ServiceConnections`, `access:read-only`, `data:logs-config` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `service_connections_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Investigate Pipeline Performance Issues Across Projects in `${AZURE_DEVOPS_ORG}` + +Analyze pipeline performance trends and bottlenecks + +- **Robot task name**: Investigate Pipeline Performance Issues Across Projects in `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `pipeline-performance-analysis.sh` +- **Tags**: `Investigation`, `Performance`, `Trends`, `Bottlenecks`, `access:read-only`, `data:logs-bulk` +- 
**Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `pipeline_performance_analysis.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Investigate Failed Pipeline Runs with Commit Correlation Across Projects in `${AZURE_DEVOPS_ORG}` + +Correlate failed pipeline runs with recent commits to identify what changed and caused failures + +- **Robot task name**: Investigate Failed Pipeline Runs with Commit Correlation Across Projects in `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `pipeline-failure-investigation.sh` +- **Tags**: `DevOps`, `Azure`, `Pipelines`, `Investigation`, `Commits`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `pipeline_failure_investigation.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze Recent Repository Activity Across Projects in `${AZURE_DEVOPS_ORG}` + +Summarize recent commit activity, pull request status, and branch health across all project repositories to show what changed + +- **Robot task name**: Analyze Recent Repository Activity Across Projects in `${AZURE_DEVOPS_ORG}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `repository-health-analysis.sh` +- **Tags**: `DevOps`, `Azure`, `Repository`, `Activity`, `Commits`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_ORG` +- **Writes**: `repository_health_analysis.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_DEVOPS_ORG` | string | Azure DevOps organization name. | — | yes | +| `AZURE_DEVOPS_PROJECTS` | string | Comma-separated list of Azure DevOps projects to monitor (e.g., "project1,project2,project3") or "All" to monitor all projects. 
| `All` | no | +| `DURATION_THRESHOLD` | string | Threshold for long-running pipelines (format: 60m, 2h) | `60m` | no | +| `QUEUE_THRESHOLD` | string | Threshold for queued pipelines (format: 10m, 1h) | `30m` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- `agent_pools_issues.json` +- `pipeline_logs_issues.json` +- `long_running_pipelines.json` +- `queued_pipelines.json` +- `repo_policies_issues.json` +- `service_connections_issues.json` +- `pipeline_performance_analysis.json` +- `pipeline_failure_investigation.json` +- `repository_health_analysis.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-devops-project-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-devops-project-health +export AZURE_DEVOPS_ORG=... +export AZURE_DEVOPS_PROJECTS=... +export DURATION_THRESHOLD=... +export QUEUE_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-devops-project-health +export AZURE_DEVOPS_ORG=... +export AZURE_DEVOPS_PROJECTS=... +export DURATION_THRESHOLD=... +export QUEUE_THRESHOLD=... 
+bash _az_helpers.sh +bash agent-pools.sh +bash discover-projects.sh +bash long-running-pipelines.sh +bash pipeline-failure-investigation.sh +bash pipeline-logs.sh +bash pipeline-performance-analysis.sh +bash preflight-check.sh +bash queued-pipelines.sh +bash repo-policies.sh +bash repository-health-analysis.sh +bash service-connections.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `_az_helpers.sh` — Bash helper script `_az_helpers.sh`. +- `agent-pools.sh` — Bash helper script `agent-pools.sh`. +- `discover-projects.sh` — Bash helper script `discover-projects.sh`. +- `long-running-pipelines.sh` — Bash helper script `long-running-pipelines.sh`. +- `pipeline-failure-investigation.sh` — Bash helper script `pipeline-failure-investigation.sh`. +- `pipeline-logs.sh` — Bash helper script `pipeline-logs.sh`. +- `pipeline-performance-analysis.sh` — Bash helper script `pipeline-performance-analysis.sh`. +- `preflight-check.sh` — Bash helper script `preflight-check.sh`. +- `queued-pipelines.sh` — Bash helper script `queued-pipelines.sh`. +- `repo-policies.sh` — Bash helper script `repo-policies.sh`. +- `repository-health-analysis.sh` — Bash helper script `repository-health-analysis.sh`. +- `service-connections.sh` — Bash helper script `service-connections.sh`. diff --git a/codebundles/azure-devops-repository-health/SKILL-TEMPLATE.md b/codebundles/azure-devops-repository-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..5df083d6d --- /dev/null +++ b/codebundles/azure-devops-repository-health/SKILL-TEMPLATE.md @@ -0,0 +1,202 @@ +--- +name: azure-devops-repository-health +kind: skill-template +description: Repository health monitoring for Azure DevOps focusing on code quality, security, and configuration issues that... Use when triaging or monitoring Azure, DevOps, Repository workloads with skill tem... 
+runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, DevOps, Repository, CodeQuality, Security, Troubleshooting] +resource_types: [azure_devops] +access: read-only +--- + +# Azure DevOps Repository Health + +## Summary + +This codebundle provides comprehensive repository-level health monitoring for Azure DevOps, focusing on identifying root causes of repository issues and misconfigurations that impact development workflows. + +See [README.md](README.md) for additional context. + +## Tools + +### Investigate Recent Code Changes for Repositories in Project `${AZURE_DEVOPS_PROJECT}` + +Analyze recent commits, releases, and code changes that might be causing application failures + +- **Robot task name**: Investigate Recent Code Changes for Repositories in Project `${AZURE_DEVOPS_PROJECT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `recent-changes-analysis.sh` +- **Tags**: `Repository`, `Troubleshooting`, `RecentChanges`, `Commits`, `Releases`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_PAT` +- **Writes**: `recent_changes_analysis.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze Pipeline Failures for Repositories in Project `${AZURE_DEVOPS_PROJECT}` + +Review recent CI/CD pipeline failures that might be affecting application deployments + +- **Robot task name**: Analyze Pipeline Failures for Repositories in Project `${AZURE_DEVOPS_PROJECT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `pipeline-failure-analysis.sh` +- **Tags**: `Repository`, `Troubleshooting`, `Pipelines`, `CI/CD`, `Failures`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_PAT` +- **Writes**: `pipeline_failure_analysis.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Repository Security Configuration for Repositories in 
Project `${AZURE_DEVOPS_PROJECT}` + +Check repository security settings, branch policies, and access controls for misconfigurations + +- **Robot task name**: Check Repository Security Configuration for Repositories in Project `${AZURE_DEVOPS_PROJECT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `repository-security-analysis.sh` +- **Tags**: `Repository`, `Security`, `Configuration`, `BranchPolicies`, `access:read-only`, `data:logs-config` +- **Reads**: `AZURE_DEVOPS_PROJECT` +- **Writes**: `repository_security_analysis.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze Code Quality for Repositories in Project `${AZURE_DEVOPS_PROJECT}` + +Analyze repository for code quality issues, technical debt, and maintainability problems + +- **Robot task name**: Analyze Code Quality for Repositories in Project `${AZURE_DEVOPS_PROJECT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `code-quality-analysis.sh` +- **Tags**: `Repository`, `CodeQuality`, `TechnicalDebt`, `Maintainability`, `access:read-only`, `data:logs-config` +- **Reads**: `AZURE_DEVOPS_PAT` +- **Writes**: `code_quality_analysis.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Branch Management for Repositories in Project `${AZURE_DEVOPS_PROJECT}` + +Analyze branch structure, stale branches, and merge patterns that indicate workflow issues + +- **Robot task name**: Check Branch Management for Repositories in Project `${AZURE_DEVOPS_PROJECT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `branch-management-analysis.sh` +- **Tags**: `Repository`, `BranchManagement`, `Workflow`, `GitFlow`, `access:read-only`, `data:logs-config` +- **Reads**: `AZURE_DEVOPS_PAT` +- **Writes**: `branch_management_analysis.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze Pull Request and Collaboration Patterns for Repositories in Project 
`${AZURE_DEVOPS_PROJECT}` + +Examine PR review patterns, contributor activity, and collaboration health indicators + +- **Robot task name**: Analyze Pull Request and Collaboration Patterns for Repositories in Project `${AZURE_DEVOPS_PROJECT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `collaboration-analysis.sh` +- **Tags**: `Repository`, `PullRequests`, `Collaboration`, `CodeReview`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_PAT` +- **Writes**: `collaboration_analysis.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Investigate Critical Repository Issues for Repositories in Project `${AZURE_DEVOPS_PROJECT}` + +Perform comprehensive investigation of critical repository issues that might impact operations + +- **Robot task name**: Investigate Critical Repository Issues for Repositories in Project `${AZURE_DEVOPS_PROJECT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `critical-repository-investigation.sh` +- **Tags**: `Repository`, `Critical`, `Investigation`, `Operations`, `access:read-only`, `data:logs-bulk` +- **Reads**: `AZURE_DEVOPS_PAT` +- **Writes**: `critical_repository_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_DEVOPS_ORG` | string | Azure DevOps organization name. | — | yes | +| `AZURE_DEVOPS_PROJECT` | string | Azure DevOps project name. | — | yes | +| `AZURE_DEVOPS_REPOS` | string | Repository name(s) to analyze. Can be a single repository, comma-separated list, or 'All' for all repositories in the project. | `All` | no | +| `REPO_SIZE_THRESHOLD_MB` | string | Repository size threshold in MB above which performance issues are flagged. | `500` | no | +| `STALE_BRANCH_DAYS` | string | Number of days after which branches are considered stale. 
| `90` | no | +| `MIN_CODE_COVERAGE` | string | Minimum code coverage percentage threshold. | `80` | no | +| `ANALYSIS_DAYS` | string | Number of days to look back for recent changes and pipeline failures analysis. | `7` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- `recent_changes_analysis.json` +- `pipeline_failure_analysis.json` +- `repository_security_analysis.json` +- `code_quality_analysis.json` +- `branch_management_analysis.json` +- `collaboration_analysis.json` +- `critical_repository_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-devops-repository-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-devops-repository-health +export AZURE_DEVOPS_ORG=... +export AZURE_DEVOPS_PROJECT=... +export AZURE_DEVOPS_REPOS=... +export REPO_SIZE_THRESHOLD_MB=... +export STALE_BRANCH_DAYS=... +export MIN_CODE_COVERAGE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-devops-repository-health +export AZURE_DEVOPS_ORG=... +export AZURE_DEVOPS_PROJECT=... +export AZURE_DEVOPS_REPOS=... +export REPO_SIZE_THRESHOLD_MB=... 
+bash branch-management-analysis.sh +bash code-quality-analysis.sh +bash collaboration-analysis.sh +bash critical-repository-investigation.sh +bash discover-repositories.sh +bash pipeline-failure-analysis.sh +bash recent-changes-analysis.sh +bash repository-performance-analysis.sh +bash repository-security-analysis.sh +bash security-incident-check.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `branch-management-analysis.sh` — Bash helper script `branch-management-analysis.sh`. +- `code-quality-analysis.sh` — Bash helper script `code-quality-analysis.sh`. +- `collaboration-analysis.sh` — Bash helper script `collaboration-analysis.sh`. +- `critical-repository-investigation.sh` — Bash helper script `critical-repository-investigation.sh`. +- `discover-repositories.sh` — Bash helper script `discover-repositories.sh`. +- `pipeline-failure-analysis.sh` — Bash helper script `pipeline-failure-analysis.sh`. +- `recent-changes-analysis.sh` — Bash helper script `recent-changes-analysis.sh`. +- `repository-performance-analysis.sh` — Bash helper script `repository-performance-analysis.sh`. +- `repository-security-analysis.sh` — Bash helper script `repository-security-analysis.sh`. +- `security-incident-check.sh` — Bash helper script `security-incident-check.sh`. diff --git a/codebundles/azure-kv-health/SKILL-TEMPLATE.md b/codebundles/azure-kv-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..1db371372 --- /dev/null +++ b/codebundles/azure-kv-health/SKILL-TEMPLATE.md @@ -0,0 +1,269 @@ +--- +name: azure-kv-health +kind: skill-template +description: Check Azure Key Vault health by checking availability metrics, configuration settings, expiring items... Use when triaging or monitoring Azure, Key, Vault workloads with skill template `azure-kv-he... 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Key, Vault, Health] +resource_types: [key_vault] +access: read-only +--- + +# Azure Key Vault Health + +## Summary + +This codebundle runs a suite of metrics checks for Key Vault in Azure. + +See [README.md](README.md) for additional context. + +## Tools + +### Check Key Vault Resource Health in resource group `${AZURE_RESOURCE_GROUP}` + +Check the health status of Key Vaults in the specified resource group + +- **Robot task name**: Check Key Vault Resource Health in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `kv_resource_health.sh` +- **Tags**: `KeyVault`, `Azure`, `Health`, `access:read-only`, `data:config` +- **Reads**: `AZURE_RESOURCE_GROUP` +- **Writes**: `keyvault_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Key Vault Availability in resource group `${AZURE_RESOURCE_GROUP}` + +List the number of Azure Key Vaults with availability below 100% + +- **Robot task name**: Check Key Vault Availability in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `availability.sh` +- **Tags**: `KeyVault`, `Azure`, `Health`, `Monitoring`, `access:read-only`, `data:config` +- **Reads**: `AZURE_RESOURCE_GROUP` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Key Vault Configuration in resource group `${AZURE_RESOURCE_GROUP}` + +List Key Vault misconfigurations + +- **Robot task name**: Check Key Vault Configuration in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `kv_config.sh` +- **Tags**: `KeyVault`, `Azure`, `Configuration`, `access:read-only`, `data:config` +- **Reads**: `AZURE_RESOURCE_GROUP` +- **Writes**: — 
+- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Expiring Key Vault Items in resource group `${AZURE_RESOURCE_GROUP}` + +Check for expiring secrets, certificates, and keys in Key Vaults + +- **Robot task name**: Check Expiring Key Vault Items in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `expiry-checks.sh` +- **Tags**: `KeyVault`, `Azure`, `Expiry`, `access:read-only`, `data:config` +- **Reads**: `AZURE_RESOURCE_GROUP` +- **Writes**: `kv_expiry_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Key Vault Logs for Issues in resource group `${AZURE_RESOURCE_GROUP}` + +Check Key Vault log issues + +- **Robot task name**: Check Key Vault Logs for Issues in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `log.sh` +- **Tags**: `KeyVault`, `Azure`, `Logs`, `access:read-only`, `data:logs-regexp` +- **Reads**: `AZURE_RESOURCE_GROUP` +- **Writes**: `kv_log_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Key Vault Performance Metrics in resource group `${AZURE_RESOURCE_GROUP}` + +Check Key Vault performance metrics for excessive requests and high latency + +- **Robot task name**: Check Key Vault Performance Metrics in resource group `${AZURE_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `performance_metrics.sh` +- **Tags**: `KeyVault`, `Azure`, `Metrics`, `access:read-only`, `data:config` +- **Reads**: `AZURE_RESOURCE_GROUP` +- **Writes**: `azure_keyvault_performance_metrics.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Counts Azure Key Vault health by checking availability metrics, configuration settings, expiring items (secrets/certificates/keys), log issues, and performance metrics + +- **Robot file**: `sli.robot` +- 
**Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Count Key Vault Resource Health in resource group `${AZURE_RESOURCE_GROUP}` + +Counts the health status of Key Vaults in the specified resource group + +- **Robot task name**: Count Key Vault Resource Health in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `resource_health` +- **Underlying script**: `kv_resource_health.sh` +- **Tags**: `KeyVault`, `Azure`, `Health`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `${issue_count} == 0` + + +#### Count Key Vault Availability in resource group `${AZURE_RESOURCE_GROUP}` + +Counts the number of Azure Key Vaults with availability below 100% + +- **Robot task name**: Count Key Vault Availability in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `availability` +- **Underlying script**: `availability.sh` +- **Tags**: `KeyVault`, `Azure`, `Health`, `Monitoring`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `${issue_count} == 0` + + +#### Count Key Vault configuration in resource group `${AZURE_RESOURCE_GROUP}` + +Count Key Vault misconfigurations + +- **Robot task name**: Count Key Vault configuration in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `configuration` +- **Underlying script**: `kv_config.sh` +- **Tags**: `KeyVault`, `Azure`, `Configuration`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `${issue_count} == 0` + + +#### Count Expiring Key Vault Items in resource group `${AZURE_RESOURCE_GROUP}` + +Count expiring secrets, certificates, and keys in Key Vaults + +- **Robot task name**: Count Expiring Key Vault Items in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `expiring_items` +- **Underlying script**: `expiry-checks.sh` +- **Tags**: `KeyVault`, `Azure`, `Expiry`, `access:read-only`, 
`data:config` +- **Reads**: — +- **Pass condition**: `${issue_count} == 0` + + +#### Count Key Vault Log Issues in resource group `${AZURE_RESOURCE_GROUP}` + +Count Key Vault log issues + +- **Robot task name**: Count Key Vault Log Issues in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `log_issues` +- **Underlying script**: `log.sh` +- **Tags**: `KeyVault`, `Azure`, `Logs`, `access:read-only`, `data:logs-regexp` +- **Reads**: — +- **Pass condition**: `${issue_count} == 0` + + +#### Count Key Vault Performance Metrics in resource group `${AZURE_RESOURCE_GROUP}` + +Count Key Vault performance metrics issues + +- **Robot task name**: Count Key Vault Performance Metrics in resource group `${AZURE_RESOURCE_GROUP}` +- **Sub-metric name**: `performance_metrics` +- **Underlying script**: `performance_metrics.sh` +- **Tags**: `KeyVault`, `Azure`, `Metrics`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `${issue_count} == 0` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource. | `""` | no | +| `AZURE_RESOURCE_GROUP` | string | Azure resource group. | — | yes | +| `THRESHOLD_DAYS` | integer | Number of days before expiration to trigger alerts | `31` | no | +| `REQUEST_THRESHOLD` | integer | Threshold for excessive requests (requests/hour) | `1000` | no | +| `LATENCY_THRESHOLD` | integer | Threshold for high latency (milliseconds) | `500` | no | +| `REQUEST_INTERVAL` | string | Interval for request count metrics (format: PT1H, PT30M, PT5M, etc.) | `PT1H` | no | +| `LATENCY_INTERVAL` | string | Interval for latency metrics (format: PT1H, PT30M, PT5M, etc.) | `PT1H` | no | +| `TIME_RANGE` | integer | Time range in hours to look back for metrics | `24` | no | +| `LOG_QUERY_DAYS` | string | Time range for log queries (format: 1d, 7d, 30d, etc.) 
| `1d` | no | +| `SEVERITY_REQUEST` | string | Severity level for excessive request issues (1=Low, 2=Medium, 3=High, 4=Critical) | `3` | no | +| `SEVERITY_LATENCY` | string | Severity level for high latency issues (1=Low, 2=Medium, 3=High, 4=Critical) | `3` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `keyvault_health.json` +- `kv_expiry_issues.json` +- `kv_log_issues.json` +- `azure_keyvault_performance_metrics.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-kv-health/runbook.robot` +- **Monitor**: `codebundles/azure-kv-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-kv-health +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export AZURE_RESOURCE_GROUP=... +export THRESHOLD_DAYS=... +export REQUEST_THRESHOLD=... +export LATENCY_THRESHOLD=... +export REQUEST_INTERVAL=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-kv-health +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export AZURE_RESOURCE_GROUP=... +export THRESHOLD_DAYS=... +export REQUEST_THRESHOLD=... +bash availability.sh +bash expiry-checks.sh +bash kv_config.sh +bash kv_resource_health.sh +bash log.sh +bash performance_metrics.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `availability.sh` — Bash helper script `availability.sh`. +- `expiry-checks.sh` — Bash helper script `expiry-checks.sh`. 
+- `kv_config.sh` — Bash helper script `kv_config.sh`. +- `kv_resource_health.sh` — Bash helper script `kv_resource_health.sh`. +- `log.sh` — Bash helper script `log.sh`. +- `performance_metrics.sh` — Bash helper script `performance_metrics.sh`. diff --git a/codebundles/azure-loadbalancer-triage/SKILL-TEMPLATE.md b/codebundles/azure-loadbalancer-triage/SKILL-TEMPLATE.md new file mode 100644 index 000000000..09c3da821 --- /dev/null +++ b/codebundles/azure-loadbalancer-triage/SKILL-TEMPLATE.md @@ -0,0 +1,78 @@ +--- +name: azure-loadbalancer-triage +kind: skill-template +description: Triages issues related to Azure Load Balancers and their activity logs. Use when triaging or monitoring Kubernetes, AKS, Azure workloads with skill template `azure-loadbalancer-triage`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, Azure] +resource_types: [load_balancer] +access: read-only +--- + +# Azure Internal LoadBalancer Triage + +## Summary + +Queries the activity logs of internal load balancer (AKS ingress) objects in Azure and optionally inspects internal AKS ingress objects if available. + +See [README.md](README.md) for additional context. + +## Tools + +### Check Activity Logs for Azure Load Balancer `${AZ_LB_NAME}` + +Queries an Azure Load Balancer's health probe to determine if it's in a healthy state. + +- **Robot task name**: Check Activity Logs for Azure Load Balancer `${AZ_LB_NAME}` +- **Robot file**: `runbook.robot` +- **Tags**: `loadbalancer`, `network`, `azure`, `${az_lb_name}`, `data:logs-bulk` +- **Reads**: `AZURE_RESOURCE_SUBSCRIPTION_ID` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource. 
| `""` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-loadbalancer-triage/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-loadbalancer-triage +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/azure-servicebus-health/SKILL-TEMPLATE.md b/codebundles/azure-servicebus-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..78212c145 --- /dev/null +++ b/codebundles/azure-servicebus-health/SKILL-TEMPLATE.md @@ -0,0 +1,335 @@ +--- +name: azure-servicebus-health +kind: skill-template +description: Performs a health check on Azure Service Bus instances and the components using them, generating a report of issues... Use when triaging or monitoring Azure, ServiceBus workloads with skill templat... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, ServiceBus] +resource_types: [service_bus] +access: read-only +--- + +# Azure Service Bus Health + +## Summary + +This codebundle performs a health check on Azure Service Bus resources and provides insights and recommended actions for detected issues. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Check for Resource Health Issues Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch a list of issues that might affect the service bus instance + +- **Robot task name**: Check for Resource Health Issues Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service_bus_resource_health.sh` +- **Tags**: `azure`, `servicebus`, `resourcehealth`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `SB_NAMESPACE_NAME` +- **Writes**: `service_bus_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Configuration Health for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch the details and health of the service bus configuration + +- **Robot task name**: Check Configuration Health for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service_bus_config_health.sh` +- **Tags**: `servicebus`, `logs`, `config`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `SB_NAMESPACE_NAME` +- **Writes**: `service_bus_config_health.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Metrics for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Analyze Service Bus metrics for potential issues + +- **Robot task name**: Check Metrics for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service_bus_metrics.sh` +- **Tags**: `servicebus`, `metrics`, `performance`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `SB_NAMESPACE_NAME` +- **Writes**: `service_bus_metrics.json`, `service_bus_metrics_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` 
when checks fail + + +### Check Queue Health for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Analyze Service Bus queues for health issues + +- **Robot task name**: Check Queue Health for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service_bus_queue_health.sh` +- **Tags**: `servicebus`, `queues`, `messages`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `SB_NAMESPACE_NAME` +- **Writes**: `service_bus_queues.json`, `service_bus_queue_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Topic Health for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Analyze Service Bus topics and subscriptions for health issues + +- **Robot task name**: Check Topic Health for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service_bus_topic_health.sh` +- **Tags**: `servicebus`, `topics`, `subscriptions`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `SB_NAMESPACE_NAME` +- **Writes**: `service_bus_topics.json`, `service_bus_topic_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Log Analytics for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Query Log Analytics for Service Bus related logs and errors + +- **Robot task name**: Check Log Analytics for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service_bus_log_analytics.sh` +- **Tags**: `servicebus`, `logs`, `diagnostics`, `access:read-only`, `data:logs-regexp` +- **Reads**: `AZ_RESOURCE_GROUP`, `SB_NAMESPACE_NAME` +- **Writes**: `service_bus_logs.json`, `service_bus_log_issues.json` +- **Issues raised**: issues 
reported via `RW.Core.Add Issue` when checks fail + + +### Check Capacity and Quota Headroom for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Analyze Service Bus capacity utilization and quota headroom + +- **Robot task name**: Check Capacity and Quota Headroom for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service_bus_capacity.sh` +- **Tags**: `servicebus`, `capacity`, `quota`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `SB_NAMESPACE_NAME` +- **Writes**: `service_bus_capacity.json`, `service_bus_capacity_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Geo-Disaster Recovery for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Check the geo-disaster recovery configuration and health + +- **Robot task name**: Check Geo-Disaster Recovery for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service_bus_disaster_recovery.sh` +- **Tags**: `servicebus`, `disaster-recovery`, `geo-replication`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `SB_NAMESPACE_NAME` +- **Writes**: `service_bus_dr.json`, `service_bus_dr_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Security Configuration for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Audit SAS keys and RBAC assignments for security best practices + +- **Robot task name**: Check Security Configuration for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service_bus_security_audit.sh` +- **Tags**: `servicebus`, `security`, `rbac`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, 
`SB_NAMESPACE_NAME` +- **Writes**: `service_bus_security.json`, `service_bus_security_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Discover Related Resources for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Discover and map Azure resources related to the Service Bus namespace + +- **Robot task name**: Discover Related Resources for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service_bus_related_resources.sh` +- **Tags**: `servicebus`, `related-resources`, `mapping`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `SB_NAMESPACE_NAME` +- **Writes**: `service_bus_related_resources.json`, `service_bus_related_resources_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Test Connectivity to Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Test network connectivity to the Service Bus namespace + +- **Robot task name**: Test Connectivity to Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `service_bus_connectivity_test.sh` +- **Tags**: `servicebus`, `connectivity`, `network`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `SB_NAMESPACE_NAME` +- **Writes**: `service_bus_connectivity.json`, `service_bus_connectivity_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Azure Monitor Alerts for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Check for the presence and configuration of Azure Monitor alerts + +- **Robot task name**: Check Azure Monitor Alerts for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: 
`service_bus_alerts_check.sh` +- **Tags**: `servicebus`, `alerts`, `monitoring`, `access:read-only`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `SB_NAMESPACE_NAME` +- **Writes**: `service_bus_alerts.json`, `service_bus_alerts_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Performs a health check on Azure Service Bus instances and the components using them, generating a report of issues and next steps. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Check for Resource Health Issues Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetch a list of issues that might affect the service bus instance + +- **Robot task name**: Check for Resource Health Issues Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `resource_health` +- **Underlying script**: `service_bus_resource_health.sh` +- **Tags**: `azure`, `servicebus`, `resourcehealth`, `access:read-only`, `data:config` +- **Reads**: — +- **Pass condition**: `"${sb_health_output_list["properties"]["title"]}" == "Available"` + + +#### Check Basic Connectivity for Service Bus `${SB_NAMESPACE_NAME}` + +Quick connectivity test to detect network issues + +- **Robot task name**: Check Basic Connectivity for Service Bus `${SB_NAMESPACE_NAME}` +- **Sub-metric name**: `connectivity` +- **Underlying script**: `service_bus_connectivity_test.sh` +- **Tags**: `azure`, `servicebus`, `connectivity`, `access:read-only`, `data:config` +- **Reads**: — + + +#### Check Critical Metrics for Service Bus `${SB_NAMESPACE_NAME}` + +Quick check of critical metrics that indicate immediate issues + +- **Robot task name**: Check Critical Metrics for Service Bus `${SB_NAMESPACE_NAME}` +- **Sub-metric name**: `critical_metrics` +- **Underlying 
script**: `service_bus_metrics.sh` +- **Tags**: `azure`, `servicebus`, `metrics`, `access:read-only`, `data:config` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZ_RESOURCE_GROUP` | string | The resource group to perform actions against. | — | yes | +| `SB_NAMESPACE_NAME` | string | The Azure Service Bus to health check. | — | yes | +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource. | `""` | no | +| `ACTIVE_MESSAGE_THRESHOLD` | string | Threshold for active message count alerts (default: 1000) | `1000` | no | +| `DEAD_LETTER_THRESHOLD` | string | Threshold for dead letter message count alerts (default: 100) | `100` | no | +| `SIZE_PERCENTAGE_THRESHOLD` | string | Size percentage threshold for namespace/queue/topic alerts (default: 80) | `80` | no | +| `LATENCY_THRESHOLD_MS` | string | Latency threshold in milliseconds for connectivity alerts (default: 100) | `100` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `service_bus_health.json` +- `service_bus_config_health.json` +- `service_bus_metrics.json` +- `service_bus_metrics_issues.json` +- `service_bus_queues.json` +- `service_bus_queue_issues.json` +- `service_bus_topics.json` +- `service_bus_topic_issues.json` +- `service_bus_logs.json` +- `service_bus_log_issues.json` +- `service_bus_capacity.json` +- `service_bus_capacity_issues.json` +- `service_bus_dr.json` +- `service_bus_dr_issues.json` +- `service_bus_security.json` +- `service_bus_security_issues.json` +- `service_bus_related_resources.json` +- `service_bus_related_resources_issues.json` +- `service_bus_connectivity.json` +- `service_bus_connectivity_issues.json` +- `service_bus_alerts.json` +- `service_bus_alerts_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. 
The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-servicebus-health/runbook.robot` +- **Monitor**: `codebundles/azure-servicebus-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-servicebus-health +export AZ_RESOURCE_GROUP=... +export SB_NAMESPACE_NAME=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export ACTIVE_MESSAGE_THRESHOLD=... +export DEAD_LETTER_THRESHOLD=... +export SIZE_PERCENTAGE_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-servicebus-health +export AZ_RESOURCE_GROUP=... +export SB_NAMESPACE_NAME=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +export ACTIVE_MESSAGE_THRESHOLD=... +bash service_bus_alerts_check.sh +bash service_bus_capacity.sh +bash service_bus_config_health.sh +bash service_bus_connectivity_test.sh +bash service_bus_disaster_recovery.sh +bash service_bus_log_analytics.sh +bash service_bus_metrics.sh +bash service_bus_queue_health.sh +bash service_bus_related_resources.sh +bash service_bus_resource_health.sh +bash service_bus_security_audit.sh +bash service_bus_topic_health.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `service_bus_alerts_check.sh` — Bash helper script `service_bus_alerts_check.sh`. +- `service_bus_capacity.sh` — Bash helper script `service_bus_capacity.sh`. +- `service_bus_config_health.sh` — Bash helper script `service_bus_config_health.sh`. +- `service_bus_connectivity_test.sh` — Bash helper script `service_bus_connectivity_test.sh`. 
+- `service_bus_disaster_recovery.sh` — Bash helper script `service_bus_disaster_recovery.sh`. +- `service_bus_log_analytics.sh` — Bash helper script `service_bus_log_analytics.sh`. +- `service_bus_metrics.sh` — Bash helper script `service_bus_metrics.sh`. +- `service_bus_queue_health.sh` — Bash helper script `service_bus_queue_health.sh`. +- `service_bus_related_resources.sh` — Bash helper script `service_bus_related_resources.sh`. +- `service_bus_resource_health.sh` — Bash helper script `service_bus_resource_health.sh`. +- `service_bus_security_audit.sh` — Bash helper script `service_bus_security_audit.sh`. +- `service_bus_topic_health.sh` — Bash helper script `service_bus_topic_health.sh`. diff --git a/codebundles/azure-storage-cost-optimization/SKILL-TEMPLATE.md b/codebundles/azure-storage-cost-optimization/SKILL-TEMPLATE.md new file mode 100644 index 000000000..8a760fd70 --- /dev/null +++ b/codebundles/azure-storage-cost-optimization/SKILL-TEMPLATE.md @@ -0,0 +1,105 @@ +--- +name: azure-storage-cost-optimization +kind: skill-template +description: Azure Storage Cost Optimization: Analyzes storage resources to identify cost optimization opportunities including... Use when triaging or monitoring Azure, Cost, Optimization workloads with skill t... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Cost, Optimization, Storage, Managed, Disks, Snapshots, Blob, Storage, Lifecycle, Management] +resource_types: [storage_account] +access: read-only +--- + +# Azure Storage Cost Optimization + +## Summary + +This codebundle analyzes Azure storage resources to identify cost optimization opportunities. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Analyze Azure Storage Cost Optimization Opportunities in Resource Group `${AZURE_RESOURCE_GROUPS}` for Subscription `${AZURE_SUBSCRIPTION_NAME}` + +Analyzes Azure storage resources across specified subscriptions to identify cost optimization opportunities. Focuses on: 1) Unattached/orphaned managed disks still incurring costs, 2) Old snapshots (>90 days by default) consuming storage, 3) Storage accounts without lifecycle management policies, 4) Over-provisioned redundancy (GRS/GZRS that could use LRS/ZRS), 5) Premium disks with low IOPS utilization that could be downgraded to Standard SSD. + +- **Robot task name**: Analyze Azure Storage Cost Optimization Opportunities in Resource Group `${AZURE_RESOURCE_GROUPS}` for Subscription `${AZURE_SUBSCRIPTION_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `analyze_storage_optimization.sh` +- **Tags**: `Azure`, `Cost`, `Optimization`, `Storage`, `Managed`, `Disks`, `Snapshots`, `Blob`, `Storage`, `Lifecycle`, `Management`, `access:read-only`, `data:config` +- **Reads**: `TIMEOUT_SECONDS` +- **Writes**: `storage_optimization_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_SUBSCRIPTION_IDS` | string | Comma-separated list of Azure subscription IDs to analyze for storage optimization. 
| `""` | no | +| `AZURE_RESOURCE_GROUPS` | string | Comma-separated list of resource groups to analyze (leave empty to analyze all resource groups in the subscription) | `""` | no | +| `AZURE_SUBSCRIPTION_NAME` | string | Azure subscription name for reporting purposes | `""` | no | +| `COST_ANALYSIS_LOOKBACK_DAYS` | string | Number of days to look back for utilization analysis (default: 30) | `30` | no | +| `LOW_COST_THRESHOLD` | string | Monthly savings threshold for LOW classification (default: 500) | `500` | no | +| `MEDIUM_COST_THRESHOLD` | string | Monthly savings threshold for MEDIUM classification (default: 2000) | `2000` | no | +| `HIGH_COST_THRESHOLD` | string | Monthly savings threshold for HIGH classification (default: 10000) | `10000` | no | +| `AZURE_DISCOUNT_PERCENTAGE` | string | Discount percentage off MSRP for Azure services (default: 0) | `0` | no | +| `SNAPSHOT_AGE_THRESHOLD_DAYS` | string | Age threshold in days for identifying old snapshots that may be candidates for deletion (default: 90) | `90` | no | +| `SCAN_MODE` | string | Performance mode: 'full' (detailed, actual metrics), 'quick' (fast, estimates usage), 'sample' (analyze subset and extrapolate). Default: full | `full` | no | +| `MAX_PARALLEL_JOBS` | string | Maximum parallel jobs for metrics collection in full mode (default: 10) | `10` | no | +| `TIMEOUT_SECONDS` | string | Timeout in seconds for tasks (default: 1500 = 25 minutes). | `1500` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- `storage_optimization_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. 
+ +- **Runbook**: `codebundles/azure-storage-cost-optimization/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-storage-cost-optimization +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_RESOURCE_GROUPS=... +export AZURE_SUBSCRIPTION_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +export LOW_COST_THRESHOLD=... +export MEDIUM_COST_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-storage-cost-optimization +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_RESOURCE_GROUPS=... +export AZURE_SUBSCRIPTION_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +bash analyze_storage_optimization.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `analyze_storage_optimization.sh` — Bash helper script `analyze_storage_optimization.sh`. diff --git a/codebundles/azure-subscription-cost-report/SKILL-TEMPLATE.md b/codebundles/azure-subscription-cost-report/SKILL-TEMPLATE.md new file mode 100644 index 000000000..dad4977f2 --- /dev/null +++ b/codebundles/azure-subscription-cost-report/SKILL-TEMPLATE.md @@ -0,0 +1,116 @@ +--- +name: azure-subscription-cost-report +kind: skill-template +description: Azure Cost Report: Generates historical cost breakdown reports by service and resource group using the Cost... Use when triaging or monitoring Azure, Cost, Management workloads with skill template ... 
+runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Cost, Management, Cost, Reporting, Trend, Analysis, Reserved, Instances] +resource_types: [subscription] +access: read-only +--- + +# Azure Subscription Cost Report + +## Summary + +This codebundle generates detailed cost breakdown reports for Azure subscriptions using the Cost Management API, and provides Reserved Instance purchase recommendations from Azure Advisor. + +See [README.md](README.md) for additional context. + +## Tools + +### Generate Azure Cost Report By Service and Resource Group for Subscription `${AZURE_SUBSCRIPTION_NAME}` + +Generates a detailed cost breakdown report for the last 30 days showing actual spending by resource group and Azure service using the Cost Management API. Includes period-over-period comparison and raises an issue if cost increase exceeds configured threshold. + +- **Robot task name**: Generate Azure Cost Report By Service and Resource Group for Subscription `${AZURE_SUBSCRIPTION_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `azure_cost_historical_report.sh` +- **Tags**: `Azure`, `Cost`, `Analysis`, `Cost`, `Management`, `Reporting`, `Trend`, `Analysis`, `access:read-only`, `data:config` +- **Reads**: `TIMEOUT_SECONDS` +- **Writes**: `azure_cost_trend_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze Azure Advisor Reserved Instance Recommendations for Subscription `${AZURE_SUBSCRIPTION_NAME}` + +Queries Azure Advisor and the Reservations API to identify Reserved Instance purchase opportunities. Calculates potential savings from 1-year and 3-year commitments for VMs, App Service Plans, and other eligible resources. 
+ +- **Robot task name**: Analyze Azure Advisor Reserved Instance Recommendations for Subscription `${AZURE_SUBSCRIPTION_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `azure_advisor_reservation_recommendations.sh` +- **Tags**: `Azure`, `Cost`, `Analysis`, `Reserved`, `Instances`, `Advisor`, `Savings`, `access:read-only`, `data:config` +- **Reads**: `TIMEOUT_SECONDS` +- **Writes**: `azure_advisor_ri_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_SUBSCRIPTION_IDS` | string | Comma-separated list of Azure subscription IDs to analyze for cost reporting (e.g., "sub1,sub2,sub3"). Leave empty to use current subscription. | `""` | no | +| `AZURE_SUBSCRIPTION_NAME` | string | Azure subscription name for reporting purposes | `""` | no | +| `COST_ANALYSIS_LOOKBACK_DAYS` | string | Number of days to look back for cost analysis (default: 30) | `30` | no | +| `COST_INCREASE_THRESHOLD` | string | Percentage threshold for cost increase alerts. An issue will be raised if period-over-period cost increase exceeds this value (e.g., 10 for 10% increase, default: 10) | `10` | no | +| `COST_BUDGET` | string | Budget threshold in USD for the analysis period. An issue will be raised if total costs exceed this value. Set to 0 to disable (default: 0). | `0` | no | +| `COST_CONCENTRATION_THRESHOLD` | string | Maximum percentage of total cost that any single resource group should represent. An issue will be raised if any resource group exceeds this threshold (default: 25). | `25` | no | +| `TIMEOUT_SECONDS` | string | Timeout in seconds for tasks (default: 1500 = 25 minutes). 
| `1500` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- `azure_cost_trend_issues.json` +- `azure_advisor_ri_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-subscription-cost-report/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-subscription-cost-report +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_SUBSCRIPTION_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +export COST_INCREASE_THRESHOLD=... +export COST_BUDGET=... +export COST_CONCENTRATION_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-subscription-cost-report +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_SUBSCRIPTION_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +export COST_INCREASE_THRESHOLD=... +bash azure_advisor_reservation_recommendations.sh +bash azure_cost_historical_report.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `azure_advisor_reservation_recommendations.sh` — Bash helper script `azure_advisor_reservation_recommendations.sh`. +- `azure_cost_historical_report.sh` — Bash helper script `azure_cost_historical_report.sh`. 
diff --git a/codebundles/azure-vm-cost-optimization/SKILL-TEMPLATE.md b/codebundles/azure-vm-cost-optimization/SKILL-TEMPLATE.md new file mode 100644 index 000000000..000b41e41 --- /dev/null +++ b/codebundles/azure-vm-cost-optimization/SKILL-TEMPLATE.md @@ -0,0 +1,102 @@ +--- +name: azure-vm-cost-optimization +kind: skill-template +description: Azure VM Cost Optimization: Analyzes Virtual Machines to identify cost optimization opportunities including... Use when triaging or monitoring Azure, Cost, Optimization workloads with skill templat... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Cost, Optimization, Virtual, Machines, VMs, Rightsizing, Deallocation] +resource_types: [virtual_machine] +access: read-only +--- + +# Azure VM Cost Optimization + +## Summary + +This codebundle analyzes Azure Virtual Machines to identify cost optimization opportunities. + +See [README.md](README.md) for additional context. + +## Tools + +### Analyze Virtual Machine Rightsizing and Deallocation Opportunities in Resource Group `${AZURE_RESOURCE_GROUPS}` for Subscription `${AZURE_SUBSCRIPTION_NAME}` + +Analyzes Azure Virtual Machines across specified subscriptions to identify cost optimization opportunities. Focuses on: 1) VMs that are stopped but not deallocated (still incurring compute costs), 2) Oversized VMs with low CPU utilization that can be downsized to B-series burstable instances. Examines CPU utilization metrics over the past 30 days to provide data-driven rightsizing recommendations. 
+ +- **Robot task name**: Analyze Virtual Machine Rightsizing and Deallocation Opportunities in Resource Group `${AZURE_RESOURCE_GROUPS}` for Subscription `${AZURE_SUBSCRIPTION_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `analyze_vm_optimization.sh` +- **Tags**: `Azure`, `Cost`, `Optimization`, `Virtual`, `Machines`, `VMs`, `Rightsizing`, `Deallocation`, `access:read-only`, `data:config` +- **Reads**: `AZURE_SUBSCRIPTION_NAME`, `TIMEOUT_SECONDS` +- **Writes**: `vm_optimization_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZURE_SUBSCRIPTION_IDS` | string | Comma-separated list of Azure subscription IDs to analyze for VM optimization. | `""` | no | +| `AZURE_RESOURCE_GROUPS` | string | Comma-separated list of resource groups to analyze (leave empty to analyze all resource groups in the subscription) | `""` | no | +| `AZURE_SUBSCRIPTION_NAME` | string | Azure subscription name for reporting purposes | `""` | no | +| `COST_ANALYSIS_LOOKBACK_DAYS` | string | Number of days to look back for utilization analysis (default: 30) | `30` | no | +| `LOW_COST_THRESHOLD` | string | Monthly savings threshold for LOW classification (default: 0) | `0` | no | +| `MEDIUM_COST_THRESHOLD` | string | Monthly savings threshold for MEDIUM classification (default: 2000) | `2000` | no | +| `HIGH_COST_THRESHOLD` | string | Monthly savings threshold for HIGH classification (default: 10000) | `10000` | no | +| `AZURE_DISCOUNT_PERCENTAGE` | string | Discount percentage off MSRP for Azure services (default: 0) | `0` | no | +| `TIMEOUT_SECONDS` | string | Timeout in seconds for tasks (default: 1500 = 25 minutes). 
| `1500` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- `vm_optimization_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-vm-cost-optimization/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-vm-cost-optimization +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_RESOURCE_GROUPS=... +export AZURE_SUBSCRIPTION_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +export LOW_COST_THRESHOLD=... +export MEDIUM_COST_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-vm-cost-optimization +export AZURE_SUBSCRIPTION_IDS=... +export AZURE_RESOURCE_GROUPS=... +export AZURE_SUBSCRIPTION_NAME=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +bash analyze_vm_optimization.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `analyze_vm_optimization.sh` — Bash helper script `analyze_vm_optimization.sh`. diff --git a/codebundles/azure-vm-os-health/SKILL-TEMPLATE.md b/codebundles/azure-vm-os-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..16ddd785c --- /dev/null +++ b/codebundles/azure-vm-os-health/SKILL-TEMPLATE.md @@ -0,0 +1,222 @@ +--- +name: azure-vm-os-health +kind: skill-template +description: Runs diagnostic checks against Azure VMs to monitor disk utilization, memory utilization, uptime, patch status and... Use when triaging or monitoring Azure, Virtual, Machine workloads with skill te... 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Virtual, Machine, Disk, Health, Uptime] +resource_types: [virtual_machine] +access: read-only +--- + +# Azure VM Health Check + +## Summary + +This bundle provides comprehensive health checks for Azure Virtual Machines, including disk utilization, memory usage, uptime, and patch status. + +See [README.md](README.md) for additional context. + +## Tools + +### Check Disk Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}` + +Checks disk utilization for VMs and parses each result. + +- **Robot task name**: Check Disk Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `next_steps_disk_utilization.sh` +- **Tags**: `access:read-only`, `VM`, `Azure`, `Disk`, `Health`, `data:config` +- **Reads**: `AZURE_SUBSCRIPTION_NAME`, `AZ_RESOURCE_GROUP`, `DISK_THRESHOLD`, `MAX_PARALLEL_JOBS`, `TIMEOUT_SECONDS`, `VM_INCLUDE_LIST`, `VM_OMIT_LIST` +- **Writes**: `disk_utilization_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Memory Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}` + +Checks memory utilization for VMs and parses each result. + +- **Robot task name**: Check Memory Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `next_steps_memory_check.sh` +- **Tags**: `access:read-only`, `VM`, `Azure`, `Memory`, `Health`, `data:config` +- **Reads**: `AZURE_SUBSCRIPTION_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `memory_utilization_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Uptime for VMs in Resource Group `${AZ_RESOURCE_GROUP}` + +Checks uptime for VMs and parses each result. 
+ +- **Robot task name**: Check Uptime for VMs in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `next_steps_uptime.sh` +- **Tags**: `access:read-only`, `VM`, `Azure`, `Uptime`, `Health`, `data:config` +- **Reads**: `AZURE_SUBSCRIPTION_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `uptime_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Last Patch Status for VMs in Resource Group `${AZ_RESOURCE_GROUP}` + +Checks last patch status for VMs and parses each result. + +- **Robot task name**: Check Last Patch Status for VMs in Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `next_steps_patch_time.sh` +- **Tags**: `access:read-only`, `VM`, `Azure`, `Patch`, `Health`, `data:config` +- **Reads**: `AZURE_SUBSCRIPTION_NAME`, `AZ_RESOURCE_GROUP` +- **Writes**: `patch_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Calculates Azure VM health by checking disk, memory, uptime, and patch status. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Check Disk Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}` + +Checks disk utilization for VMs and parses each result. + +- **Robot task name**: Check Disk Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `disk_utilization` +- **Underlying script**: `next_steps_disk_utilization.sh` +- **Tags**: `VM`, `Azure`, `Disk`, `Health`, `data:config` +- **Reads**: — +- **Pass condition**: `${issue_count} == 0` + + +#### Check Memory Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}` + +Checks memory utilization for VMs and parses each result. 
+ +- **Robot task name**: Check Memory Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `memory_utilization` +- **Underlying script**: `next_steps_memory_check.sh` +- **Tags**: `VM`, `Azure`, `Memory`, `Health`, `data:config` +- **Reads**: — +- **Pass condition**: `${issue_count} == 0` + + +#### Check Uptime for VMs in Resource Group `${AZ_RESOURCE_GROUP}` + +Checks uptime for VMs and parses each result. + +- **Robot task name**: Check Uptime for VMs in Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `vm_uptime` +- **Underlying script**: `next_steps_uptime.sh` +- **Tags**: `VM`, `Azure`, `Uptime`, `Health`, `data:config` +- **Reads**: — +- **Pass condition**: `${issue_count} == 0` + + +#### Check Last Patch Status for VMs in Resource Group `${AZ_RESOURCE_GROUP}` + +Checks last patch status for VMs and parses each result. + +- **Robot task name**: Check Last Patch Status for VMs in Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `patch_status` +- **Underlying script**: `next_steps_patch_time.sh` +- **Tags**: `VM`, `Azure`, `Patch`, `Health`, `data:config` +- **Reads**: — +- **Pass condition**: `${issue_count} == 0` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZ_RESOURCE_GROUP` | string | The resource group containing the VM(s). | — | yes | +| `DISK_THRESHOLD` | string | The threshold percentage for disk usage warnings. | `85` | no | +| `UPTIME_THRESHOLD` | string | The threshold in days for system uptime warnings. | `30` | no | +| `MEMORY_THRESHOLD` | string | The threshold percentage for memory usage warnings. | `90` | no | +| `MAX_PARALLEL_JOBS` | string | Maximum number of parallel VM checks to run simultaneously. | `5` | no | +| `TIMEOUT_SECONDS` | string | Timeout in seconds for Azure VM run-command operations. | `90` | no | +| `VM_INCLUDE_LIST` | string | Comma-separated list of VM name patterns to include (e.g., "web-*,app-*"). 
If empty, all VMs are processed. | — | yes | +| `VM_OMIT_LIST` | string | Comma-separated list of VM name patterns to exclude (e.g., "test-*,dev-*"). If empty, no VMs are excluded. | — | yes | +| `AZURE_SUBSCRIPTION_ID` | string | The Azure Subscription ID. | — | yes | +| `AZURE_SUBSCRIPTION_NAME` | string | The Azure Subscription Name. | `subscription-01` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `disk_utilization_issues.json` +- `memory_utilization_issues.json` +- `uptime_issues.json` +- `patch_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-vm-os-health/runbook.robot` +- **Monitor**: `codebundles/azure-vm-os-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-vm-os-health +export AZ_RESOURCE_GROUP=... +export DISK_THRESHOLD=... +export UPTIME_THRESHOLD=... +export MEMORY_THRESHOLD=... +export MAX_PARALLEL_JOBS=... +export TIMEOUT_SECONDS=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-vm-os-health +export AZ_RESOURCE_GROUP=... +export DISK_THRESHOLD=... +export UPTIME_THRESHOLD=... +export MEMORY_THRESHOLD=... 
+bash next_steps_disk_utilization.sh +bash next_steps_memory_check.sh +bash next_steps_patch_time.sh +bash next_steps_uptime.sh +bash vm_disk_utilization.sh +bash vm_last_patch_check.sh +bash vm_memory_check.sh +bash vm_uptime_check.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `next_steps_disk_utilization.sh` — Bash helper script `next_steps_disk_utilization.sh`. +- `next_steps_memory_check.sh` — Bash helper script `next_steps_memory_check.sh`. +- `next_steps_patch_time.sh` — Bash helper script `next_steps_patch_time.sh`. +- `next_steps_uptime.sh` — Bash helper script `next_steps_uptime.sh`. +- `vm_disk_utilization.sh` — Bash helper script `vm_disk_utilization.sh`. +- `vm_last_patch_check.sh` — Bash helper script `vm_last_patch_check.sh`. +- `vm_memory_check.sh` — Bash helper script `vm_memory_check.sh`. +- `vm_uptime_check.sh` — Bash helper script `vm_uptime_check.sh`. diff --git a/codebundles/azure-vmss-triage/SKILL-TEMPLATE.md b/codebundles/azure-vmss-triage/SKILL-TEMPLATE.md new file mode 100644 index 000000000..276d5d9cd --- /dev/null +++ b/codebundles/azure-vmss-triage/SKILL-TEMPLATE.md @@ -0,0 +1,150 @@ +--- +name: azure-vmss-triage +kind: skill-template +description: Runs diagnostic checks against virtual machine scaled sets and generates reports from key metrics. Use when triaging or monitoring Azure, Virtual, Machine workloads with skill template `azure-vmss-... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Azure, Virtual, Machine, Scale, Set, Triage, Health] +resource_types: [azure_resource] +access: read-only +--- + +# Azure VM Scale Set Triage + +## Summary + +This codebundle runs a suite of metrics checks for a VM Scale Set in Azure. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Check Scale Set `${VMSCALESET}` Key Metrics In Resource Group `${AZ_RESOURCE_GROUP}` + +Checks key metrics of VM Scale Set for issues. + +- **Robot task name**: Check Scale Set `${VMSCALESET}` Key Metrics In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `vmss_metrics.sh` +- **Tags**: `Scale`, `Set`, `VM`, `Azure`, `Metrics`, `Health`, `data:config` +- **Reads**: `AZ_RESOURCE_GROUP`, `VMSCALESET` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch VM Scale Set `${VMSCALESET}` Config In Resource Group `${AZ_RESOURCE_GROUP}` + +Fetches the configuration of the scale set in Azure. + +- **Robot task name**: Fetch VM Scale Set `${VMSCALESET}` Config In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `vmss_config.sh` +- **Tags**: `VM`, `Scale`, `Set`, `logs`, `tail`, `data:config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Activities for VM Scale Set `${VMSCALESET}` In Resource Group `${AZ_RESOURCE_GROUP}` + +Gets the events for the scale set and checks for errors. + +- **Robot task name**: Fetch Activities for VM Scale Set `${VMSCALESET}` In Resource Group `${AZ_RESOURCE_GROUP}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `vmss_activities.sh` +- **Tags**: `VM`, `Scale`, `Set`, `monitor`, `events`, `errors`, `data:logs-bulk` +- **Reads**: `AZ_RESOURCE_GROUP`, `VMSCALESET` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Checks VM Scale Set key metrics and returns a 1 when healthy, or 0 when not healthy. 
+ +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Check Scale Set `${VMSCALESET}` Key Metrics In Resource Group `${AZ_RESOURCE_GROUP}` + +Checks key metrics of VM Scale Set for issues. + +- **Robot task name**: Check Scale Set `${VMSCALESET}` Key Metrics In Resource Group `${AZ_RESOURCE_GROUP}` +- **Sub-metric name**: `vmss_health` +- **Underlying script**: `vmss_metrics.sh` +- **Tags**: `Scale`, `Set`, `VM`, `Azure`, `Metrics`, `Health`, `data:config` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `AZ_RESOURCE_GROUP` | string | The resource group to perform actions against. | — | yes | +| `VMSCALESET` | string | The Azure Virtual Machine Scale Set to triage. | — | yes | +| `RW_LOOKBACK_WINDOW` | string | The time period, in minutes, to look back for activities/events. | `60` | no | +| `AZURE_RESOURCE_SUBSCRIPTION_ID` | string | The Azure Subscription ID for the resource. | `""` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/azure-vmss-triage/runbook.robot` +- **Monitor**: `codebundles/azure-vmss-triage/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/azure-vmss-triage +export AZ_RESOURCE_GROUP=... +export VMSCALESET=... +export RW_LOOKBACK_WINDOW=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/azure-vmss-triage +export AZ_RESOURCE_GROUP=... +export VMSCALESET=... +export RW_LOOKBACK_WINDOW=... +export AZURE_RESOURCE_SUBSCRIPTION_ID=... +bash vmss_activities.sh +bash vmss_config.sh +bash vmss_metrics.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `vmss_activities.sh` — Bash helper script `vmss_activities.sh`. +- `vmss_config.sh` — Bash helper script `vmss_config.sh`. +- `vmss_metrics.sh` — Bash helper script `vmss_metrics.sh`. diff --git a/codebundles/curl-gmp-kong-ingress-inspection/SKILL-TEMPLATE.md b/codebundles/curl-gmp-kong-ingress-inspection/SKILL-TEMPLATE.md new file mode 100644 index 000000000..08418387a --- /dev/null +++ b/codebundles/curl-gmp-kong-ingress-inspection/SKILL-TEMPLATE.md @@ -0,0 +1,110 @@ +--- +name: curl-gmp-kong-ingress-inspection +kind: skill-template +description: Collects Kong ingress host metrics from GMP on GCP and inspects the results for ingress with a HTTP error code rate... Use when triaging or monitoring GCP, GMP, Ingress workloads with skill templat... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GCP, GMP, Ingress, Kong, Metrics] +resource_types: [ingress] +access: read-only +--- + +# GKE Kong Ingress Host Triage + +## Summary + +This code collects Kong ingress host metrics from Google Monitoring Platform (GMP) on Google Cloud Platform (GCP) and inspects the results for ingresses with a HTTP error code rate greater than zero over a configurable duration. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Check If Kong Ingress HTTP Error Rate Violates HTTP Error Threshold in GCP Project `${GCP_PROJECT_ID}` + +Fetches HTTP Error metrics for the Kong ingress host and service from GMP and performs an inspection on the results. If there are currently any results with more than the defined HTTP error threshold, their route and service names will be surfaced for further troubleshooting. + +- **Robot task name**: Check If Kong Ingress HTTP Error Rate Violates HTTP Error Threshold in GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `curl`, `http`, `ingress`, `errors`, `metrics`, `kong`, `gmp`, `access:read-only`, `data:config` +- **Reads**: `GCP_PROJECT_ID`, `HTTP_ERROR_RATE_THRESHOLD`, `TIME_SLICE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check If Kong Ingress HTTP Request Latency Violates Threshold in GCP Project `${GCP_PROJECT_ID}` + +Fetches metrics for the Kong ingress 99th percentile request latency from GMP and performs an inspection on the results. If there are currently any results with more than the defined request latency threshold, their route and service names will be surfaced for further troubleshooting. + +- **Robot task name**: Check If Kong Ingress HTTP Request Latency Violates Threshold in GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `curl`, `request`, `ingress`, `latency`, `http`, `kong`, `gmp`, `access:read-only`, `data:config` +- **Reads**: `GCP_PROJECT_ID`, `REQUEST_LATENCY_THRESHOLD`, `TIME_SLICE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check If Kong Ingress Controller Reports Upstream Errors in GCP Project `${GCP_PROJECT_ID}` + +Fetches metrics for the Kong ingress controller related to upstream healthchecks or dns errors. 
+ +- **Robot task name**: Check If Kong Ingress Controller Reports Upstream Errors in GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `curl`, `request`, `ingress`, `upstream`, `healthcheck`, `dns`, `errrors`, `http`, `kong`, `gmp`, `access:read-only`, `data:config` +- **Reads**: `GCP_PROJECT_ID` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `GCP_PROJECT_ID` | string | The GCP Project ID to scope the API to. | — | yes | +| `TIME_SLICE` | string | Specify the window of time used to measure the rate. | `1m` | no | +| `HTTP_ERROR_RATE_THRESHOLD` | string | Specify the error rate threshold that is considered unhealthy. Measured in errors/s. | `0.5` | no | +| `REQUEST_LATENCY_THRESHOLD` | string | The threshold in ms for request latency to be considered unhealthy. | — | yes | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `gcp_credentials` | GCP service account json used to authenticate with GCP APIs. | yes | + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/curl-gmp-kong-ingress-inspection/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/curl-gmp-kong-ingress-inspection +export GCP_PROJECT_ID=... +export TIME_SLICE=... +export HTTP_ERROR_RATE_THRESHOLD=... +export REQUEST_LATENCY_THRESHOLD=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/curl-gmp-nginx-ingress-inspection/SKILL-TEMPLATE.md b/codebundles/curl-gmp-nginx-ingress-inspection/SKILL-TEMPLATE.md new file mode 100644 index 000000000..1b5fbbf1b --- /dev/null +++ b/codebundles/curl-gmp-nginx-ingress-inspection/SKILL-TEMPLATE.md @@ -0,0 +1,101 @@ +--- +name: curl-gmp-nginx-ingress-inspection +kind: skill-template +description: Collects Nginx ingress host controller metrics from GMP on GCP and inspects the results for ingress with a HTTP... Use when triaging or monitoring GCP, GMP, Ingress workloads with skill template `c... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GCP, GMP, Ingress, Nginx, Metrics] +resource_types: [ingress] +access: read-only +--- + +# GKE Nginx Ingress Host Triage + +## Summary + +Runs a task which inspects the HTTP error code metrics related to your Nginx ingress controller in your GKE Kubernetes cluster and raises issues based on the number of ingresses with errors. + +See [README.md](README.md) for additional context. + +## Tools + +### Fetch Nginx HTTP Errors From GMP for Ingress `${INGRESS_OBJECT_NAME}` + +Fetches metrics for the Nginx ingress host from GMP and performs an inspection on the results. 
+ +- **Robot task name**: Fetch Nginx HTTP Errors From GMP for Ingress `${INGRESS_OBJECT_NAME}` +- **Robot file**: `runbook.robot` +- **Tags**: `curl`, `http`, `ingress`, `latency`, `errors`, `metrics`, `controller`, `nginx`, `gmp`, `500s`, `data:config` +- **Reads**: `CONTEXT`, `ERROR_CODES`, `GCP_PROJECT_ID`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `TIME_SLICE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Find Owner and Service Health for Ingress `${INGRESS_OBJECT_NAME}` + +Checks the ingress object service and endpoints. Also returns the owner of the pods that support the Ingress. + +- **Robot task name**: Find Owner and Service Health for Ingress `${INGRESS_OBJECT_NAME}` +- **Robot file**: `runbook.robot` +- **Tags**: `owner`, `ingress`, `service`, `endpoints`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the namespace to search. | — | yes | +| `GCP_PROJECT_ID` | string | The GCP Project ID to scope the API to. | — | yes | +| `TIME_SLICE` | string | The amount of time to perform aggregations over. | `60m` | no | +| `ERROR_CODES` | string | Which http status codes to look for and classify as errors. | `500\|501\|502\|503\|504` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `gcp_credentials` | GCP service account json used to authenticate with GCP APIs. 
| yes | + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/curl-gmp-nginx-ingress-inspection/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/curl-gmp-nginx-ingress-inspection +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export GCP_PROJECT_ID=... +export TIME_SLICE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/curl-http-ok/SKILL-TEMPLATE.md b/codebundles/curl-http-ok/SKILL-TEMPLATE.md new file mode 100644 index 000000000..41605adfb --- /dev/null +++ b/codebundles/curl-http-ok/SKILL-TEMPLATE.md @@ -0,0 +1,110 @@ +--- +name: curl-http-ok +kind: skill-template +description: This taskset uses curl to validate the response code of the endpoint and provides the total time of the request. Use when triaging or monitoring Linux, macOS, Windows workloads with skill template ... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Linux, macOS, Windows, HTTP] +resource_types: [] +access: read-only +--- + +# cURL HTTP OK + +## Summary + +This codebundle validates the response code of an endpoint using cURL and provides the total time of the request. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Check HTTP URL Availability and Timeliness + +Use cURL to validate single or multiple http responses + +- **Robot task name**: Check HTTP URL Availability and Timeliness +- **Robot file**: `runbook.robot` +- **Tags**: `curl`, `http`, `ingress`, `latency`, `errors`, `access:read-only`, `data:config` +- **Reads**: `URLS` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +This taskset uses curl to validate the response code of the endpoint. Returns a score of 1 if healthy, and 0 if unhealthy. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Validate HTTP URL Availability and Timeliness + +Use cURL to validate single or multiple http responses + +- **Robot task name**: Validate HTTP URL Availability and Timeliness +- **Sub-metric name**: `overall_health` +- **Tags**: `cURL`, `HTTP`, `Ingress`, `Latency`, `Errors`, `data:config` +- **Reads**: `URLS` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `URLS` | string | Comma-separated list of URLs to perform requests against. | `https://www.runwhen.com` | no | +| `TARGET_LATENCY` | string | The maximum latency in seconds as a float value allowed for requests to have. | `1.2` | no | +| `ACCEPTABLE_RESPONSE_CODES` | string | Comma-separated list of HTTP response codes that indicate success and connectivity (e.g., 200,201,202,204,301,302,307,401,403). | `200,201,202,204,301,302,307,401,403` | no | +| `OWNER_DETAILS` | string | Json list of owner details | `{"name": "my-ingress", "kind": "Ingress", "namespace": "default"}` | no | +| `VERIFY_SSL` | string | Whether to verify SSL certificates. Set to 'false' to ignore SSL certificate errors. 
| `false` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/curl-http-ok/runbook.robot` +- **Monitor**: `codebundles/curl-http-ok/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/curl-http-ok +export URLS=... +export TARGET_LATENCY=... +export ACCEPTABLE_RESPONSE_CODES=... +export OWNER_DETAILS=... +export VERIFY_SSL=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) diff --git a/codebundles/dns-health/SKILL-TEMPLATE.md b/codebundles/dns-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..c4ef7fb06 --- /dev/null +++ b/codebundles/dns-health/SKILL-TEMPLATE.md @@ -0,0 +1,190 @@ +--- +name: dns-health +kind: skill-template +description: This taskset performs comprehensive DNS health monitoring and validation tasks. Use when triaging or monitoring DNS, Azure, GCP workloads with skill template `dns-health`. +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [DNS, Azure, GCP, AWS] +resource_types: [azure_resource] +access: read-only +--- + +# DNS Health & Monitoring + +## Summary + +This taskset performs comprehensive DNS health monitoring and validation tasks. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Check DNS Zone Records + +Verifies DNS zones and their record integrity + +- **Robot task name**: Check DNS Zone Records +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `dns`, `zone-records`, `data:config` +- **Reads**: `DNS_ZONES` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Detect Broken Record Resolution + +Implements repeated DNS checks for multiple FQDNs to detect resolution failures + +- **Robot task name**: Detect Broken Record Resolution +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `dns`, `resolution`, `consistency`, `data:config` +- **Reads**: `DNS_RESOLVERS`, `TEST_FQDNS` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Test Forward Lookup Zones + +Tests forward lookup zones and conditional forwarders for proper resolution + +- **Robot task name**: Test Forward Lookup Zones +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `dns`, `forward-lookup`, `conditional-forwarders`, `data:config` +- **Reads**: `FORWARD_LOOKUP_ZONES`, `FORWARD_ZONE_TEST_SUBDOMAINS` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### External Resolution Validation + +Tests resolution of multiple public domains through multiple resolvers + +- **Robot task name**: External Resolution Validation +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `dns`, `external`, `public`, `resolvers`, `data:config` +- **Reads**: `DNS_RESOLVERS`, `PUBLIC_DOMAINS` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### DNS Latency Check + +Tests DNS query latency for configured zones + +- **Robot task name**: DNS Latency Check +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `dns`, `latency`, `performance`, `data:config` +- **Reads**: `DNS_ZONES`, `FORWARD_LOOKUP_ZONES`, 
`TEST_FQDNS` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +This SLI measures DNS health metrics, including resolution success rates, query latency, zone health, and external resolver availability. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### DNS Resolution Success Rate + +Measures the success rate of DNS resolution across all configured FQDNs and pushes a metric (0-100) + +- **Robot task name**: DNS Resolution Success Rate +- **Sub-metric name**: `resolution_success` +- **Tags**: `dns`, `resolution`, `success-rate`, `sli`, `data:config` +- **Reads**: `FORWARD_LOOKUP_ZONES`, `PUBLIC_DOMAINS`, `TEST_FQDNS` + + +#### DNS Query Latency + +Measures average DNS query latency in milliseconds across all configured FQDNs and pushes the metric + +- **Robot task name**: DNS Query Latency +- **Sub-metric name**: `latency_performance` +- **Tags**: `dns`, `latency`, `performance`, `sli`, `data:config` +- **Reads**: `FORWARD_LOOKUP_ZONES`, `PUBLIC_DOMAINS`, `TEST_FQDNS` + + +#### DNS Zone Health + +Measures the health of configured DNS zones (1 for healthy, 0 for unhealthy) + +- **Robot task name**: DNS Zone Health +- **Sub-metric name**: `zone_health` +- **Tags**: `dns`, `zone-health`, `sli`, `data:config` +- **Reads**: `DNS_ZONES` + + +#### External DNS Resolver Availability + +Measures availability of external DNS resolvers (percentage of working resolvers) + +- **Robot task name**: External DNS Resolver Availability +- **Sub-metric name**: `resolver_availability` +- **Tags**: `dns`, `external`, `resolver`, `availability`, `sli`, `data:config` +- **Reads**: `DNS_RESOLVERS` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `TEST_FQDNS` | string | Important domains/services to monitor for DNS resolution (comma-separated if multiple). 
Example: api.mycompany.com,db.mycompany.com | `google.com,example.com` | no | +| `FORWARD_LOOKUP_ZONES` | string | Internal company domains that forward to on-premises DNS (optional, for hybrid environments). Example: internal.company.com | `""` | no | +| `PUBLIC_DOMAINS` | string | Your public websites to test external DNS resolution (optional). Example: mycompany.com,blog.mycompany.com | `""` | no | +| `DNS_RESOLVERS` | string | Custom DNS servers to test against (comma-separated). Example: 10.0.0.4,10.0.1.4 or 8.8.8.8,1.1.1.1 | `8.8.8.8,1.1.1.1` | no | +| `DNS_ZONES` | string | DNS zones to check health for (comma-separated). Can be private or public zones. Example: mycompany.com,internal.corp | `""` | no | +| `FORWARD_ZONE_TEST_SUBDOMAINS` | string | Specific servers to test in forward lookup zones (optional). Example: dc01,mail,web | `""` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/dns-health/runbook.robot` +- **Monitor**: `codebundles/dns-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/dns-health +export TEST_FQDNS=... +export FORWARD_LOOKUP_ZONES=... +export PUBLIC_DOMAINS=... +export DNS_RESOLVERS=... +export DNS_ZONES=... +export FORWARD_ZONE_TEST_SUBDOMAINS=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) diff --git a/codebundles/gcloud-log-inspection/SKILL-TEMPLATE.md b/codebundles/gcloud-log-inspection/SKILL-TEMPLATE.md new file mode 100644 index 000000000..912c24997 --- /dev/null +++ b/codebundles/gcloud-log-inspection/SKILL-TEMPLATE.md @@ -0,0 +1,84 @@ +--- +name: gcloud-log-inspection +kind: skill-template +description: Fetches logs from a GCP project using a configurable query and raises an issue with details on the most common issues. Use when triaging or monitoring GCP, Gcloud, Google Monitoring workloads with skill te... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GCP, Gcloud, Google Monitoring] +resource_types: [gcp_resource] +access: read-only +--- + +# GCP Gcloud Log Inspection + +## Summary + +Runs a task which performs an inspection on your logs in a GCP project, returning results regarding common issues, counts and related Kubernetes namespaces using a filter. + +See [README.md](README.md) for additional context. + +## Tools + +### Inspect GCP Logs For Common Errors in GCP Project `${GCP_PROJECT_ID}` + +Fetches logs from a Google Cloud Project and filters for a count of common error messages. 
+ +- **Robot task name**: Inspect GCP Logs For Common Errors in GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `Logs`, `Query`, `Gcloud`, `GCP`, `Errors`, `Common`, `access:read-only`, `data:logs-regexp` +- **Reads**: `ADD_FILTERS`, `GCP_PROJECT_ID`, `SEVERITY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `SEVERITY` | string | What minimum severity to filter for. See https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry#LogSeverity for examples. | `ERROR` | no | +| `ADD_FILTERS` | string | Extra optional filters to add to the gcloud log read request. See https://cloud.google.com/logging/docs/view/logging-query-language for syntax. | `` | yes | +| `GCP_PROJECT_ID` | string | The GCP Project ID to scope the API to. | — | yes | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `gcp_credentials` | GCP service account json used to authenticate with GCP APIs. | yes | + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/gcloud-log-inspection/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/gcloud-log-inspection +export SEVERITY=... +export ADD_FILTERS=... +export GCP_PROJECT_ID=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/gcp-bucket-health/SKILL-TEMPLATE.md b/codebundles/gcp-bucket-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..eac90b527 --- /dev/null +++ b/codebundles/gcp-bucket-health/SKILL-TEMPLATE.md @@ -0,0 +1,194 @@ +--- +name: gcp-bucket-health +kind: skill-template +description: Inspect GCP Storage bucket usage and configuration. Use when triaging or monitoring GCP, GCS workloads with skill template `gcp-bucket-health`. +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GCP, GCS] +resource_types: [gcp_resource] +access: read-only +--- + +# GCP Storage Bucket Health + +## Summary + +This code checks if any GCP (Google Cloud Platform) buckets are unhealthy, focusing on: - Utilization (with a user defined threshold for issue/alert generation). + +See [README.md](README.md) for additional context. + +## Tools + +### Fetch GCP Bucket Storage Utilization for `${PROJECT_IDS}` + +Fetches all GCP buckets in each project and obtains the total size. + +- **Robot task name**: Fetch GCP Bucket Storage Utilization for `${PROJECT_IDS}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `bucket_size.sh` +- **Tags**: `gcloud`, `gcs`, `gcp`, `bucket`, `data:config` +- **Reads**: `USAGE_THRESHOLD` +- **Writes**: `bucket_report.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Add GCP Bucket Storage Configuration for `${PROJECT_IDS}` to Report + +Fetches all GCP buckets in each project and obtains the total size. 
+ +- **Robot task name**: Add GCP Bucket Storage Configuration for `${PROJECT_IDS}` to Report +- **Robot file**: `runbook.robot` +- **Underlying script**: `bucket_details.sh` +- **Tags**: `gcloud`, `gcs`, `gcp`, `bucket`, `data:config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check GCP Bucket Security Configuration for `${PROJECT_IDS}` + +Fetches all GCP buckets in each project and checks for public buckets, risky IAM permissions, and encryption configuration. + +- **Robot task name**: Check GCP Bucket Security Configuration for `${PROJECT_IDS}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_security.sh` +- **Tags**: `gcloud`, `gcs`, `gcp`, `bucket`, `security`, `data:config` +- **Reads**: `PUBLIC_ACCESS_BUCKET_THRESHOLD` +- **Writes**: `bucket_security_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch GCP Bucket Storage Operations Rate for `${PROJECT_IDS}` + +Fetches all GCP buckets in each project and obtains the read and write operations rate that incurs cost. Generates issues if the rate is above a specified threshold. + +- **Robot task name**: Fetch GCP Bucket Storage Operations Rate for `${PROJECT_IDS}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `bucket_ops_costs.sh` +- **Tags**: `gcloud`, `gcs`, `gcp`, `bucket`, `data:config` +- **Reads**: `OPS_RATE_THRESHOLD` +- **Writes**: `bucket_ops_report.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +This SLI uses the GCP API or gcloud to score bucket health. Produces a value between 0 (completely failing the test) and 1 (fully passing the test). Looks for usage above a threshold and public buckets. 
+ +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Fetch GCP Bucket Storage Utilization for `${PROJECT_IDS}` + +Fetches all GCP buckets in each project and obtains the total size. + +- **Robot task name**: Fetch GCP Bucket Storage Utilization for `${PROJECT_IDS}` +- **Sub-metric name**: `storage_utilization` +- **Underlying script**: `bucket_size.sh` +- **Tags**: `gcloud`, `gcs`, `gcp`, `bucket`, `data:config` +- **Reads**: `USAGE_THRESHOLD` +- **Pass condition**: `int(${buckets_over_threshold.stdout}) == 0` + + +#### Check GCP Bucket Security Configuration for `${PROJECT_IDS}` + +Fetches all GCP buckets in each project and checks for public buckets, risky IAM permissions, and encryption configuration. + +- **Robot task name**: Check GCP Bucket Security Configuration for `${PROJECT_IDS}` +- **Sub-metric name**: `security_config` +- **Underlying script**: `check_security.sh` +- **Tags**: `gcloud`, `gcs`, `gcp`, `bucket`, `security`, `data:config` +- **Reads**: `PUBLIC_ACCESS_BUCKET_THRESHOLD` +- **Pass condition**: `int(${total_public_access_buckets.stdout}) <= ${PUBLIC_ACCESS_BUCKET_THRESHOLD}` + + +#### Fetch GCP Bucket Storage Operations Rate for `${PROJECT_IDS}` + +Fetches all GCP buckets in each project and obtains the read and write operations rate that incurs cost. + +- **Robot task name**: Fetch GCP Bucket Storage Operations Rate for `${PROJECT_IDS}` +- **Sub-metric name**: `operations_rate` +- **Underlying script**: `bucket_ops_costs.sh` +- **Tags**: `gcloud`, `gcs`, `gcp`, `bucket`, `data:config` +- **Reads**: `OPS_RATE_THRESHOLD` +- **Pass condition**: `int(${buckets_over_ops_threshold.stdout}) == 0` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `PROJECT_IDS` | string | The GCP Project ID to scope the API to. 
Accepts multiple comma separated project IDs. | — | yes | +| `USAGE_THRESHOLD` | string | The amount of storage, in TB, to generate an issue on. | `0.5` | no | +| `OPS_RATE_THRESHOLD` | string | The rate of read+write operations, in ops/s, to generate an issue on. | `10` | no | +| `PUBLIC_ACCESS_BUCKET_THRESHOLD` | string | The amount of storage buckets that can be publicly accessible. | `0` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `gcp_credentials` | GCP service account json used to authenticate with GCP APIs. | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `bucket_report.json` +- `bucket_security_issues.json` +- `bucket_ops_report.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/gcp-bucket-health/runbook.robot` +- **Monitor**: `codebundles/gcp-bucket-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/gcp-bucket-health +export PROJECT_IDS=... +export USAGE_THRESHOLD=... +export OPS_RATE_THRESHOLD=... +export PUBLIC_ACCESS_BUCKET_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/gcp-bucket-health +export PROJECT_IDS=... +export USAGE_THRESHOLD=... +export OPS_RATE_THRESHOLD=... +bash bucket_details.sh +bash bucket_ops_costs.sh +bash bucket_size.sh +bash check_security.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `bucket_details.sh` — Bash helper script `bucket_details.sh`. 
+- `bucket_ops_costs.sh` — Bash helper script `bucket_ops_costs.sh`. +- `bucket_size.sh` — Bash helper script `bucket_size.sh`. +- `check_security.sh` — Bash helper script `check_security.sh`. diff --git a/codebundles/gcp-cloud-function-health/SKILL-TEMPLATE.md b/codebundles/gcp-cloud-function-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..816b73235 --- /dev/null +++ b/codebundles/gcp-cloud-function-health/SKILL-TEMPLATE.md @@ -0,0 +1,125 @@ +--- +name: gcp-cloud-function-health +kind: skill-template +description: Identify problems related to GCP Cloud Function deployments. Use when triaging or monitoring GCP, Cloud Functions workloads with skill template `gcp-cloud-function-health`. +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GCP, Cloud Functions] +resource_types: [gcp_resource] +access: read-only +--- + +# GCP Cloud Function Health + +## Summary + +This code checks if any GCP (Google Cloud Platform) cloud functions are unhealthy. + +See [README.md](README.md) for additional context. + +## Tools + +### List Unhealthy Cloud Functions in GCP Project `${GCP_PROJECT_ID}` + +Fetches a list of GCP Cloud Functions that are not healthy. 
+ +- **Robot task name**: List Unhealthy Cloud Functions in GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `cloud_functions_next_steps.sh` +- **Tags**: `gcloud`, `function`, `gcp`, `${GCP_PROJECT_ID}`, `access:read-only`, `data:config` +- **Reads**: `GCP_PROJECT_ID` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get Error Logs for Unhealthy Cloud Functions in GCP Project `${GCP_PROJECT_ID}` + +Fetches GCP logs related to unhealthy Cloud Functions within the last 14 days + +- **Robot task name**: Get Error Logs for Unhealthy Cloud Functions in GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `cloud_functions_next_steps.sh` +- **Tags**: `gcloud`, `function`, `gcp`, `${GCP_PROJECT_ID}`, `access:read-only`, `data:logs-regexp` +- **Reads**: `GCP_PROJECT_ID` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Count the number of Cloud Functions in an unhealthy state for a GCP Project. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Count unhealthy GCP Cloud Functions in GCP Project `${GCP_PROJECT_ID}` + +Counts all GCP Functions that are not in a Healthy state + +- **Robot task name**: Count unhealthy GCP Cloud Functions in GCP Project `${GCP_PROJECT_ID}` +- **Sub-metric name**: `function_health` +- **Tags**: `gcloud`, `function`, `gcp`, `${GCP_PROJECT_ID}`, `data:config` +- **Reads**: `GCP_PROJECT_ID` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `GCP_PROJECT_ID` | string | The GCP Project ID to scope the API to. 
| — | yes | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `gcp_credentials` | GCP service account json used to authenticate with GCP APIs. | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/gcp-cloud-function-health/runbook.robot` +- **Monitor**: `codebundles/gcp-cloud-function-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/gcp-cloud-function-health +export GCP_PROJECT_ID=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/gcp-cloud-function-health +export GCP_PROJECT_ID=... +bash cloud_functions_next_steps.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `cloud_functions_next_steps.sh` — Bash helper script `cloud_functions_next_steps.sh`. diff --git a/codebundles/gcp-project-cost-health/SKILL-TEMPLATE.md b/codebundles/gcp-project-cost-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..2ed71d76f --- /dev/null +++ b/codebundles/gcp-project-cost-health/SKILL-TEMPLATE.md @@ -0,0 +1,134 @@ +--- +name: gcp-project-cost-health +kind: skill-template +description: GCP cost management toolkit: generate historical cost reports by service/project using BigQuery billing export.... Use when triaging or monitoring GCP, Cost, Optimization workloads with skill templ... 
+runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GCP, Cost, Optimization, Cost, Management, Cost, Reporting, BigQuery, Trend, Analysis] +resource_types: [gcp_resource] +access: read-only +--- + +# GCP Project Cost Health & Reporting + +## Summary + +Comprehensive toolkit for analyzing GCP costs and spending across projects using BigQuery billing export. + +See [README.md](README.md) for additional context. + +## Tools + +### Generate GCP Cost Report By Service and Project + +Generates a detailed cost breakdown report showing actual spending by project and GCP service using BigQuery billing export. Includes month-over-month comparison across the last 3 complete calendar months with per-project and per-service trend analysis. Raises issues when cost increases exceed the configured threshold. + +- **Robot task name**: Generate GCP Cost Report By Service and Project +- **Robot file**: `runbook.robot` +- **Underlying script**: `display_top_projects.sh` +- **Tags**: `GCP`, `Cost`, `Analysis`, `Cost`, `Management`, `Reporting`, `Trend`, `Analysis`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `gcp_cost_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze GCP Network Costs By SKU + +Analyzes network-related costs broken down by SKU, showing daily spend for the last 7 days, weekly, monthly, and three-month spend. Detects cost anomalies, deviations, and projects future costs based on recent spending trends to provide early warnings. 
+ +- **Robot task name**: Analyze GCP Network Costs By SKU +- **Robot file**: `runbook.robot` +- **Underlying script**: `gcp_network_cost_analysis.sh` +- **Tags**: `GCP`, `Network`, `Cost`, `Analysis`, `Egress`, `Ingress`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `gcp_network_cost_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get GCP Cost Optimization Recommendations + +Fetches COST-RELATED recommendations from GCP Recommender API (committed use discounts, idle resources, rightsizing, etc.). Filters out non-cost recommendations like security/IAM suggestions. + +- **Robot task name**: Get GCP Cost Optimization Recommendations +- **Robot file**: `runbook.robot` +- **Underlying script**: `gcp_recommendations.sh` +- **Tags**: `GCP`, `Cost`, `Optimization`, `Recommendations`, `FinOps`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `gcp_recommendations_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `GCP_PROJECT_IDS` | string | Comma-separated list of GCP project IDs to analyze for cost optimization (e.g., "project-1,project-2,project-3"). If left blank, will assess all projects found in the billing export. | `""` | no | +| `GCP_BILLING_EXPORT_TABLE` | string | BigQuery table path for billing export in format: project-id.dataset_name.gcp_billing_export_v1_XXXXXX (optional - will auto-discover if not provided) | `""` | no | +| `COST_ANALYSIS_LOOKBACK_DAYS` | string | Number of days to look back for cost analysis (default: 30) | `30` | no | +| `GCP_COST_BUDGET` | string | Optional budget threshold in USD. A severity 3 issue will be raised if total costs exceed this amount. Leave at 0 to disable. | `10000` | no | +| `GCP_PROJECT_COST_THRESHOLD_PERCENT` | string | Optional percentage threshold (0-100). 
A severity 3 issue will be raised if any single project exceeds this percentage of total costs. Leave at 0 to disable. | `25` | no | +| `NETWORK_COST_THRESHOLD_MONTHLY` | string | Monthly network cost threshold (in USD) for severity 3 alerts. Triggers on SKUs that exceed this amount OR are projected to breach it based on recent spending trends (last 7 days). | `200` | no | +| `COST_INCREASE_THRESHOLD` | string | Percentage threshold for month-over-month cost increase alerts. An issue will be raised if total, per-project, or per-service costs increase by more than this percentage between calendar months (default: 10 for 10%). | `10` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `gcp_credentials` | GCP service account json used to authenticate with GCP APIs. | yes | + +## Outputs + +- `gcp_cost_issues.json` +- `gcp_network_cost_issues.json` +- `gcp_recommendations_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/gcp-project-cost-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/gcp-project-cost-health +export GCP_PROJECT_IDS=... +export GCP_BILLING_EXPORT_TABLE=... +export COST_ANALYSIS_LOOKBACK_DAYS=... +export GCP_COST_BUDGET=... +export GCP_PROJECT_COST_THRESHOLD_PERCENT=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/gcp-project-cost-health +export GCP_PROJECT_IDS=... +export GCP_BILLING_EXPORT_TABLE=... +export COST_ANALYSIS_LOOKBACK_DAYS=... 
+bash display_top_projects.sh +bash gcp_cost_historical_report.sh +bash gcp_network_cost_analysis.sh +bash gcp_recommendations.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `display_top_projects.sh` — Bash helper script `display_top_projects.sh`. +- `gcp_cost_historical_report.sh` — Bash helper script `gcp_cost_historical_report.sh`. +- `gcp_network_cost_analysis.sh` — Bash helper script `gcp_network_cost_analysis.sh`. +- `gcp_recommendations.sh` — Bash helper script `gcp_recommendations.sh`. diff --git a/codebundles/gcp-vertex-modelgarden-health/SKILL-TEMPLATE.md b/codebundles/gcp-vertex-modelgarden-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..d13d3dc5b --- /dev/null +++ b/codebundles/gcp-vertex-modelgarden-health/SKILL-TEMPLATE.md @@ -0,0 +1,244 @@ +--- +name: gcp-vertex-modelgarden-health +kind: skill-template +description: Troubleshooting and remediation tasks for GCP Vertex AI Model Garden using Google Cloud Monitoring Python SDK. Use when triaging or monitoring GCP, Vertex AI, Model Garden workloads with skill temp... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GCP, Vertex AI, Model Garden] +resource_types: [gcp_resource] +access: read-only +--- + +# GCP Vertex AI Model Garden Health + +## Summary + +This codebundle provides comprehensive health monitoring for Google Cloud Platform's Vertex AI Model Garden. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Discover All Deployed Vertex AI Models in `${GCP_PROJECT_ID}` + +Discovers all deployed Vertex AI models across regions to establish baseline for subsequent analysis + +- **Robot task name**: Discover All Deployed Vertex AI Models in `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `vertex-ai`, `discovery`, `models`, `endpoints`, `access:read-only`, `data:config` +- **Reads**: `DISCOVERED_ENDPOINTS`, `DISCOVERED_MODELS`, `EMPTY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze Vertex AI Model Garden Error Patterns and Response Codes in `${GCP_PROJECT_ID}` + +Analyzes error patterns and response codes from Model Garden invocations to identify issues using Python SDK + +- **Robot task name**: Analyze Vertex AI Model Garden Error Patterns and Response Codes in `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `vertex-ai`, `error-analysis`, `response-codes`, `troubleshooting`, `access:read-only`, `data:logs-regexp` +- **Reads**: `DISCOVERED_ENDPOINTS`, `DISCOVERED_MODELS`, `EMPTY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Investigate Vertex AI Model Latency Performance Issues in `${GCP_PROJECT_ID}` + +Analyzes latency metrics to identify performance bottlenecks and degradation using Python SDK + +- **Robot task name**: Investigate Vertex AI Model Latency Performance Issues in `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `vertex-ai`, `latency`, `performance`, `analysis`, `access:read-only`, `data:config` +- **Reads**: `DISCOVERED_ENDPOINTS`, `DISCOVERED_MODELS`, `EMPTY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Monitor Vertex AI Throughput and Token Consumption Patterns in `${GCP_PROJECT_ID}` + +Analyzes throughput consumption and token usage patterns for capacity planning using Python SDK + +- **Robot 
task name**: Monitor Vertex AI Throughput and Token Consumption Patterns in `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `vertex-ai`, `throughput`, `tokens`, `capacity-planning`, `access:read-only`, `data:config` +- **Reads**: `DISCOVERED_ENDPOINTS`, `DISCOVERED_MODELS`, `EMPTY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Vertex AI Model Garden API Logs for Issues in `${GCP_PROJECT_ID}` + +Analyzes recent API logs for error patterns and usage issues + +- **Robot task name**: Check Vertex AI Model Garden API Logs for Issues in `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `vertex-ai`, `logs`, `api-calls`, `monitoring`, `access:read-only`, `data:logs-regexp` +- **Reads**: `GCP_PROJECT_ID`, `LOG_FRESHNESS` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Vertex AI Model Garden Service Health and Quotas in `${GCP_PROJECT_ID}` + +Performs comprehensive health checks on Vertex AI services and quotas + +- **Robot task name**: Check Vertex AI Model Garden Service Health and Quotas in `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `vertex-ai`, `health-check`, `quotas`, `service-status`, `access:read-only`, `data:config` +- **Reads**: `DISCOVERED_ENDPOINTS`, `DISCOVERED_MODELS`, `EMPTY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Generate Vertex AI Model Garden Health Summary and Next Steps for `${GCP_PROJECT_ID}` + +Generates a comprehensive health summary with actionable recommendations + +- **Robot task name**: Generate Vertex AI Model Garden Health Summary and Next Steps for `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `summary`, `health-report`, `recommendations`, `access:read-only` +- **Reads**: `GCP_PROJECT_ID` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when 
checks fail + + +### Generate Normalized Health Report Table for `${GCP_PROJECT_ID}` + +Generates a normalized tabular health report for regular monitoring of all LLAMA models (MaaS and Self-hosted) + +- **Robot task name**: Generate Normalized Health Report Table for `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `vertex-ai`, `health-report`, `monitoring`, `table`, `access:read-only`, `data:config` +- **Reads**: `EMPTY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Calculates SLI for GCP Vertex AI Model Garden health using Google Cloud Monitoring Python SDK. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Quick Vertex AI Log Health Check for `${GCP_PROJECT_ID}` + +Performs a quick check of recent Vertex AI logs for immediate health assessment + +- **Robot task name**: Quick Vertex AI Log Health Check for `${GCP_PROJECT_ID}` +- **Sub-metric name**: `log_health` +- **Tags**: `vertex-ai`, `logs`, `health-check`, `quick`, `access:read-only`, `data:logs-regexp` +- **Reads**: `GCP_PROJECT_ID`, `SLI_LOG_LOOKBACK` + + +#### Calculate Error Rate Score for `${GCP_PROJECT_ID}` + +Calculates error rate score based on Model Garden invocation errors + +- **Robot task name**: Calculate Error Rate Score for `${GCP_PROJECT_ID}` +- **Sub-metric name**: `error_rate` +- **Tags**: `vertex-ai`, `error-rate`, `sli`, `monitoring`, `access:read-only`, `data:logs-regexp` +- **Reads**: `GCP_PROJECT_ID` + + +#### Calculate Latency Performance Score for `${GCP_PROJECT_ID}` + +Calculates latency performance score based on model response times + +- **Robot task name**: Calculate Latency Performance Score for `${GCP_PROJECT_ID}` +- **Sub-metric name**: `latency_performance` +- **Tags**: `vertex-ai`, `latency`, `performance`, `sli`, 
`access:read-only`, `data:config` +- **Reads**: `GCP_PROJECT_ID` + + +#### Calculate Throughput Usage Score for `${GCP_PROJECT_ID}` + +Calculates throughput usage score based on token consumption data + +- **Robot task name**: Calculate Throughput Usage Score for `${GCP_PROJECT_ID}` +- **Sub-metric name**: `throughput_usage` +- **Tags**: `vertex-ai`, `throughput`, `usage`, `sli`, `access:read-only`, `data:config` +- **Reads**: `GCP_PROJECT_ID` + + +#### Discover All Deployed Models for `${GCP_PROJECT_ID}` + +Proactively discovers all deployed Vertex AI models and endpoints + +- **Robot task name**: Discover All Deployed Models for `${GCP_PROJECT_ID}` +- **Sub-metric name**: `model_discovery` +- **Tags**: `vertex-ai`, `discovery`, `model-inventory`, `access:read-only`, `data:config` +- **Reads**: `GCP_PROJECT_ID` + + +#### Check Service Availability Score for `${GCP_PROJECT_ID}` + +Checks Vertex AI service availability and configuration + +- **Robot task name**: Check Service Availability Score for `${GCP_PROJECT_ID}` +- **Sub-metric name**: `service_availability` +- **Tags**: `vertex-ai`, `service-health`, `availability`, `sli`, `access:read-only`, `data:config` +- **Reads**: `GCP_PROJECT_ID` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `GCP_PROJECT_ID` | string | The GCP Project ID to scope the API to. | — | yes | +| `LOG_FRESHNESS` | string | Time window for log analysis (e.g., 1h, 30m, 2h, 1d). | `2h` | no | +| `VERTEX_AI_REGIONS` | string | Comma-separated list of regions to check for model discovery (optional). Use 'fast' for common US regions, 'us-only' for all US regions, 'priority' for worldwide common regions. | `` | yes | +| `SLI_LOG_LOOKBACK` | string | Time window for SLI log health check (e.g., 15m, 30m, 1h). | `15m` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `gcp_credentials` | GCP service account json used to authenticate with GCP APIs. 
| yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/gcp-vertex-modelgarden-health/runbook.robot` +- **Monitor**: `codebundles/gcp-vertex-modelgarden-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/gcp-vertex-modelgarden-health +export GCP_PROJECT_ID=... +export LOG_FRESHNESS=... +export VERTEX_AI_REGIONS=... +export SLI_LOG_LOOKBACK=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) diff --git a/codebundles/gh-actions-artifact-analysis/SKILL-TEMPLATE.md b/codebundles/gh-actions-artifact-analysis/SKILL-TEMPLATE.md new file mode 100644 index 000000000..576c894e1 --- /dev/null +++ b/codebundles/gh-actions-artifact-analysis/SKILL-TEMPLATE.md @@ -0,0 +1,129 @@ +--- +name: gh-actions-artifact-analysis +kind: skill-template +description: This taskset fetches the latest GitHub Actions workflow run artifact and analyzes the results with a user provided... Use when triaging or monitoring GitHub, Actions workloads with skill template `g... 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GitHub, Actions] +resource_types: [] +access: read-only +--- + +# GitHub Actions Artifact Analysis + +## Summary + +This codebundle is highly configurable and integrates with GitHub Actions and workflow artifacts. + +See [README.md](README.md) for additional context. + +## Tools + +### Analyze artifact from GitHub workflow `${WORKFLOW_NAME}` in repository `${GITHUB_REPO}` + +Check GitHub workflow status and analyze artifact with a user provided command. + +- **Robot task name**: Analyze artifact from GitHub workflow `${WORKFLOW_NAME}` in repository `${GITHUB_REPO}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `gh_actions_artifact_analysis.sh` +- **Tags**: `github`, `workflow`, `actions`, `artifact`, `report`, `access:read-only`, `data:config` +- **Reads**: `ANALYSIS_COMMAND`, `GITHUB_REPO`, `GITHUB_TOKEN`, `ISSUE_NEXT_STEPS`, `ISSUE_SEARCH_STRING`, `ISSUE_SEVERITY`, `ISSUE_TITLE`, `WORKFLOW_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +This SLI fetches the latest GitHub Actions workflow run artifact and pushes a metric based on a user provided command. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Analyze artifact from GitHub Workflow `${WORKFLOW_NAME}` in repository `${GITHUB_REPO}` and push metric + +Check GitHub workflow status, run a user provided analysis command, and push the metric. The analysis command should result in a single metric. 
+ +- **Robot task name**: Analyze artifact from GitHub Workflow `${WORKFLOW_NAME}` in repository `${GITHUB_REPO}` and push metric +- **Sub-metric name**: `artifact_analysis` +- **Underlying script**: `gh_actions_artifact_analysis.sh` +- **Tags**: `github`, `workflow`, `actions`, `artifact`, `report`, `data:config` +- **Reads**: `ANALYSIS_COMMAND`, `GITHUB_TOKEN` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `GITHUB_REPO` | string | The GitHub Repository to query | `''` | no | +| `WORKFLOW_NAME` | string | The GitHub Actions workflow name. | `''` | no | +| `ARTIFACT_NAME` | string | The artifact to inspect. | `''` | no | +| `ANALYSIS_COMMAND` | string | A command to run against the output report. Tools like jq and awk are available. | `''` | no | +| `RESULT_FILE` | string | The artifact to inspect. | `''` | no | +| `PERIOD_HOURS` | string | The number of hours to consider for a healthy last workflow run. | `24` | no | +| `ISSUE_SEARCH_STRING` | string | A string that, if found in the analysis output, will generate an Issue. | `ERROR|Error` | no | +| `ISSUE_SEVERITY` | string | The severity of the issue. 1 = Critical, 2=Major, 3=Minor, 4=Informational | `4` | no | +| `ISSUE_TITLE` | string | The title of the issue. | `The text `${ISSUE_SEARCH_STRING}` was found in GitHub Workflow `${WORKFLOW_NAME}` in repo `${GITHUB_REPO}`` | no | +| `ISSUE_NEXT_STEPS` | string | A list of next steps to take when the Issue is raised. Use `\n` to separate items in the list. | `Review the log output or escalate to the service owner.` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `GITHUB_TOKEN` | The GitHub Token used to access the repository. | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. 
The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/gh-actions-artifact-analysis/runbook.robot` +- **Monitor**: `codebundles/gh-actions-artifact-analysis/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/gh-actions-artifact-analysis +export GITHUB_REPO=... +export WORKFLOW_NAME=... +export ARTIFACT_NAME=... +export ANALYSIS_COMMAND=... +export RESULT_FILE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/gh-actions-artifact-analysis +export GITHUB_REPO=... +export WORKFLOW_NAME=... +export ARTIFACT_NAME=... +export ANALYSIS_COMMAND=... +bash gh_actions_artifact_analysis.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `gh_actions_artifact_analysis.sh` — Bash helper script `gh_actions_artifact_analysis.sh`. diff --git a/codebundles/gh-actions-health/SKILL-TEMPLATE.md b/codebundles/gh-actions-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..7c4a315e8 --- /dev/null +++ b/codebundles/gh-actions-health/SKILL-TEMPLATE.md @@ -0,0 +1,300 @@ +--- +name: gh-actions-health +kind: skill-template +description: Comprehensive health monitoring for GitHub Actions across specified repositories and organizations. Use when triaging or monitoring GitHub, Actions workloads with skill template `gh-actions-health`. 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GitHub, Actions] +resource_types: [] +access: read-only +--- + +# GitHub Actions Health Monitoring + +## Summary + +Comprehensive health monitoring for GitHub Actions across specified repositories and organizations. + +See [README.md](README.md) for additional context. + +## Tools + +### Check Recent Workflow Failures Across Specified Repositories + +Analyzes recent workflow failures across the specified repositories and identifies common failure patterns + +- **Robot task name**: Check Recent Workflow Failures Across Specified Repositories +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_workflow_failures.sh` +- **Tags**: — +- **Reads**: `GITHUB_TOKEN` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Long Running Workflows Across Specified Repositories + +Identifies workflows that have been running longer than expected thresholds across the specified repositories + +- **Robot task name**: Check Long Running Workflows Across Specified Repositories +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_long_running_workflows.sh` +- **Tags**: — +- **Reads**: `GITHUB_TOKEN`, `MAX_WORKFLOW_DURATION_MINUTES` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Repository Health Summary for Specified Repositories + +Provides a comprehensive health summary across the specified repositories + +- **Robot task name**: Check Repository Health Summary for Specified Repositories +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_repo_health_summary.sh` +- **Tags**: — +- **Reads**: `GITHUB_TOKEN`, `REPO_FAILURE_THRESHOLD` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check GitHub 
Actions Runner Health Across Specified Organizations + +Monitors the health and availability of GitHub Actions runners across the specified organizations + +- **Robot task name**: Check GitHub Actions Runner Health Across Specified Organizations +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_runner_health.sh` +- **Tags**: — +- **Reads**: `GITHUB_TOKEN`, `HIGH_RUNNER_UTILIZATION_THRESHOLD` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Security Workflow Status Across Specified Repositories + +Monitors security-related workflows and dependency scanning results across the specified repositories + +- **Robot task name**: Check Security Workflow Status Across Specified Repositories +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_security_workflows.sh` +- **Tags**: — +- **Reads**: `GITHUB_TOKEN` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check GitHub Actions Billing and Usage Across Specified Organizations + +Monitors GitHub Actions usage patterns and potential billing concerns across the specified organizations + +- **Robot task name**: Check GitHub Actions Billing and Usage Across Specified Organizations +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_billing_usage.sh` +- **Tags**: — +- **Reads**: `GITHUB_TOKEN`, `HIGH_USAGE_THRESHOLD` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check GitHub API Rate Limits + +Monitors GitHub API rate limit usage to prevent throttling during health checks + +- **Robot task name**: Check GitHub API Rate Limits +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_rate_limits.sh` +- **Tags**: — +- **Reads**: `GITHUB_TOKEN`, `RATE_LIMIT_WARNING_THRESHOLD` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Service Level 
Indicators for GitHub Actions Health Monitoring + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Calculate Workflow Success Rate Across Specified Repositories + +Calculates the success rate of workflows across the specified repositories over the specified period + +- **Robot task name**: Calculate Workflow Success Rate Across Specified Repositories +- **Sub-metric name**: `workflow_success` +- **Underlying script**: `calculate_workflow_sli.sh` +- **Tags**: `github`, `workflow`, `success-rate`, `sli`, `multi-repo` +- **Reads**: `GITHUB_TOKEN`, `MIN_WORKFLOW_SUCCESS_RATE` +- **Pass condition**: `float(${success_rate}) >= float(${MIN_WORKFLOW_SUCCESS_RATE})` + + +#### Calculate Organization Health Score Across Specified Organizations + +Calculates overall organization health score across all specified organizations + +- **Robot task name**: Calculate Organization Health Score Across Specified Organizations +- **Sub-metric name**: `org_health` +- **Underlying script**: `calculate_org_sli.sh` +- **Tags**: `github`, `organization`, `health-score`, `sli`, `multi-org` +- **Reads**: `GITHUB_TOKEN`, `MIN_ORG_HEALTH_SCORE` +- **Pass condition**: `float(${org_health_score}) >= float(${MIN_ORG_HEALTH_SCORE})` + + +#### Calculate Runner Availability Score Across Specified Organizations + +Calculates the availability score of GitHub Actions runners across the specified organizations + +- **Robot task name**: Calculate Runner Availability Score Across Specified Organizations +- **Sub-metric name**: `runner_availability` +- **Underlying script**: `calculate_runner_sli.sh` +- **Tags**: `github`, `runners`, `availability`, `sli`, `multi-org` +- **Reads**: `GITHUB_TOKEN`, `MIN_RUNNER_AVAILABILITY` +- **Pass condition**: `float(${availability_score}) >= float(${MIN_RUNNER_AVAILABILITY})` + + +#### Calculate Security Workflow 
Score Across Specified Repositories + +Calculates security workflow health score including vulnerability scanning across the specified repositories + +- **Robot task name**: Calculate Security Workflow Score Across Specified Repositories +- **Sub-metric name**: `security_workflows` +- **Underlying script**: `calculate_security_sli.sh` +- **Tags**: `github`, `security`, `vulnerability`, `sli`, `multi-repo` +- **Reads**: `GITHUB_TOKEN`, `MIN_SECURITY_SCORE` +- **Pass condition**: `float(${security_score}) >= float(${MIN_SECURITY_SCORE}) and int(${critical_vulnerabilities}) == 0` + + +#### Calculate Performance Score Across Specified Repositories + +Calculates workflow performance score based on execution times across the specified repositories + +- **Robot task name**: Calculate Performance Score Across Specified Repositories +- **Sub-metric name**: `workflow_performance` +- **Underlying script**: `calculate_performance_sli.sh` +- **Tags**: `github`, `performance`, `duration`, `sli`, `multi-repo` +- **Reads**: `GITHUB_TOKEN`, `MAX_LONG_RUNNING_WORKFLOWS`, `MIN_PERFORMANCE_SCORE` +- **Pass condition**: `float(${performance_score}) >= float(${MIN_PERFORMANCE_SCORE}) and int(${long_running_count}) <= int(${MAX_LONG_RUNNING_WORKFLOWS})` + + +#### Calculate API Rate Limit Health Score + +Calculates GitHub API rate limit utilization health score + +- **Robot task name**: Calculate API Rate Limit Health Score +- **Sub-metric name**: `api_rate_limit` +- **Underlying script**: `calculate_rate_limit_sli.sh` +- **Tags**: `github`, `api`, `rate-limit`, `sli` +- **Reads**: `GITHUB_TOKEN`, `MAX_RATE_LIMIT_USAGE` +- **Pass condition**: `float(${usage_percentage}) <= float(${MAX_RATE_LIMIT_USAGE})` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `GITHUB_REPOS` | string | Comma-separated list of GitHub repositories in format owner/repo, or 'ALL' for all org repositories | `ALL` | no | +| `GITHUB_ORGS` | string | GitHub organization names 
(single org or comma-separated list for multiple orgs) | `""` | no | +| `MAX_WORKFLOW_DURATION_MINUTES` | string | Maximum expected workflow duration in minutes | `60` | no | +| `REPO_FAILURE_THRESHOLD` | string | Maximum number of workflow failures allowed across specified repositories | `10` | no | +| `HIGH_RUNNER_UTILIZATION_THRESHOLD` | string | Threshold percentage for high runner utilization warning | `80` | no | +| `HIGH_USAGE_THRESHOLD` | string | Threshold percentage for high billing usage warning | `80` | no | +| `RATE_LIMIT_WARNING_THRESHOLD` | string | Threshold percentage for GitHub API rate limit warning | `70` | no | +| `FAILURE_LOOKBACK_DAYS` | string | Number of days to look back for workflow failures. Accepts partial numbers (e.g. 0.04 = 1h) | `1` | no | +| `MAX_REPOS_TO_ANALYZE` | string | Maximum number of repositories to analyze when GITHUB_REPOS is 'ALL' (0 for unlimited) | `0` | no | +| `MAX_REPOS_PER_ORG` | string | Maximum number of repositories to analyze per organization when using 'ALL' (0 for unlimited) | `0` | no | +| `MIN_WORKFLOW_SUCCESS_RATE` | string | Minimum acceptable workflow success rate (0.0-1.0) | `0.95` | no | +| `MIN_ORG_HEALTH_SCORE` | string | Minimum acceptable organization health score (0.0-1.0) | `0.90` | no | +| `MIN_RUNNER_AVAILABILITY` | string | Minimum acceptable runner availability score (0.0-1.0) | `0.95` | no | +| `MIN_SECURITY_SCORE` | string | Minimum acceptable security workflow score (0.0-1.0) | `0.98` | no | +| `MIN_PERFORMANCE_SCORE` | string | Minimum acceptable workflow performance score (0.0-1.0) | `0.90` | no | +| `MAX_RATE_LIMIT_USAGE` | string | Maximum acceptable API rate limit usage percentage | `70` | no | +| `MAX_LONG_RUNNING_WORKFLOWS` | string | Maximum number of long-running workflows considered healthy | `2` | no | +| `SLI_LOOKBACK_DAYS` | string | Number of days to look back for SLI calculations | `7` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| 
`GITHUB_TOKEN` | GitHub Personal Access Token with appropriate permissions | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/gh-actions-health/runbook.robot` +- **Monitor**: `codebundles/gh-actions-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/gh-actions-health +export GITHUB_REPOS=... +export GITHUB_ORGS=... +export MAX_WORKFLOW_DURATION_MINUTES=... +export REPO_FAILURE_THRESHOLD=... +export HIGH_RUNNER_UTILIZATION_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/gh-actions-health +export GITHUB_REPOS=... +export GITHUB_ORGS=... +export MAX_WORKFLOW_DURATION_MINUTES=... +bash calculate_org_sli.sh +bash calculate_performance_sli.sh +bash calculate_rate_limit_sli.sh +bash calculate_runner_sli.sh +bash calculate_security_sli.sh +bash calculate_workflow_sli.sh +bash check_billing_usage.sh +bash check_long_running_workflows.sh +bash check_org_workflow_health.sh +bash check_rate_limits.sh +bash check_repo_health_summary.sh +bash check_runner_health.sh +# ... and 2 more scripts +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `calculate_org_sli.sh` — Bash helper script `calculate_org_sli.sh`. +- `calculate_performance_sli.sh` — Bash helper script `calculate_performance_sli.sh`. +- `calculate_rate_limit_sli.sh` — Bash helper script `calculate_rate_limit_sli.sh`. 
+- `calculate_runner_sli.sh` — Bash helper script `calculate_runner_sli.sh`. +- `calculate_security_sli.sh` — Bash helper script `calculate_security_sli.sh`. +- `calculate_workflow_sli.sh` — Bash helper script `calculate_workflow_sli.sh`. +- `check_billing_usage.sh` — Bash helper script `check_billing_usage.sh`. +- `check_long_running_workflows.sh` — Bash helper script `check_long_running_workflows.sh`. +- `check_org_workflow_health.sh` — Bash helper script `check_org_workflow_health.sh`. +- `check_rate_limits.sh` — Bash helper script `check_rate_limits.sh`. +- `check_repo_health_summary.sh` — Bash helper script `check_repo_health_summary.sh`. +- `check_runner_health.sh` — Bash helper script `check_runner_health.sh`. +- `check_security_workflows.sh` — Bash helper script `check_security_workflows.sh`. +- `check_workflow_failures.sh` — Bash helper script `check_workflow_failures.sh`. diff --git a/codebundles/gke-cluster-health/SKILL-TEMPLATE.md b/codebundles/gke-cluster-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..522675812 --- /dev/null +++ b/codebundles/gke-cluster-health/SKILL-TEMPLATE.md @@ -0,0 +1,209 @@ +--- +name: gke-cluster-health +kind: skill-template +description: Identify issues affecting GKE Clusters in a GCP Project. Use when triaging or monitoring GCP, GKE workloads with skill template `gke-cluster-health`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GCP, GKE] +resource_types: [gke_cluster] +access: read-only +--- + +# GKE Cluster Health + +## Summary + +This codebundle performs comprehensive health checking for Google Kubernetes Engine (GKE) clusters, including node pool analysis, instance group evaluation, and resource optimization recommendations. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Identify GKE Service Account Issues in GCP Project `${GCP_PROJECT_ID}` + +Checks for IAM Service Account issues that can affect Cluster functionality + +- **Robot task name**: Identify GKE Service Account Issues in GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `sa_check.sh` +- **Tags**: `gcloud`, `gke`, `gcp`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch GKE Recommendations for GCP Project `${GCP_PROJECT_ID}` + +Fetch and summarize GCP Recommendations for GKE Clusters + +- **Robot task name**: Fetch GKE Recommendations for GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `gcp_recommendations.sh` +- **Tags**: `recommendations`, `gcloud`, `gke`, `gcp`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `recommendations_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Kubernetes Version Support for GKE Clusters in GCP Project `${GCP_PROJECT_ID}` + +Checks whether GKE clusters are running deprecated or extended-support Kubernetes versions and estimates cost impact. GKE charges a $0.50/hr/cluster surcharge for versions in extended support (6x standard cost). GKE Enterprise includes extended support at no additional charge. 
+ +- **Robot task name**: Check Kubernetes Version Support for GKE Clusters in GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_gke_version_support.sh` +- **Tags**: `version`, `deprecation`, `cost`, `extended-support`, `gcloud`, `gke`, `gcp`, `access:read-only`, `data:config` +- **Reads**: `GCP_PROJECT_ID` +- **Writes**: `version_support_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch GKE Cluster Health for GCP Project `${GCP_PROJECT_ID}` + +Using kubectl, fetch overall basic health of the cluster by checking unhealthy pods, overutilized nodes, and underutilized clusters with cost savings opportunities. Analyzes resource utilization and provides MSRP-based cost optimization recommendations. Useful when stackdriver is not available. Requires iam permissions to fetch cluster credentials with viewer rights. + +- **Robot task name**: Fetch GKE Cluster Health for GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `cluster_health.sh` +- **Tags**: `health`, `crashloopbackoff`, `cost-optimization`, `underutilization`, `gcloud`, `gke`, `gcp`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `cluster_health_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check for Quota Related GKE Autoscaling Issues in GCP Project `${GCP_PROJECT_ID}` + +Ensure that GKE Autoscaling will not be blocked by Quota constraints + +- **Robot task name**: Check for Quota Related GKE Autoscaling Issues in GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `quota_check.sh` +- **Tags**: `quota`, `autoscaling`, `gcloud`, `gke`, `gcp`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `region_quota_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Validate GKE Node Sizes for GCP 
Project `${GCP_PROJECT_ID}` + +Analyse live pod requests/limits, node usage, and propose suitable GKE node machine types. + +- **Robot task name**: Validate GKE Node Sizes for GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `sizing`, `gke`, `gcloud`, `access:read-only`, `node`, `autoscale`, `data:config` +- **Reads**: — +- **Writes**: `node_size_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch GKE Cluster Operations for GCP Project `${GCP_PROJECT_ID}` + +Fetches GKE Operations and identifies stuck or failed tasks. + +- **Robot task name**: Fetch GKE Cluster Operations for GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `cluster_operations.sh` +- **Tags**: `sizing`, `gke`, `gcloud`, `access:read-only`, `cluster`, `operations`, `data:config` +- **Reads**: — +- **Writes**: `cluster_operations_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Node Pool Health for GCP Project `${GCP_PROJECT_ID}` + +Performs comprehensive node pool health checking including instance group logs, compute operations, and Kubernetes events to surface hard-to-find issues like region exhaustion and quota blocking. + +- **Robot task name**: Check Node Pool Health for GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `node_pool_health.sh` +- **Tags**: `nodepool`, `health`, `events`, `quota`, `exhaustion`, `gcloud`, `gke`, `gcp`, `access:read-only`, `data:config` +- **Reads**: — +- **Writes**: `node_pool_health_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `GCP_PROJECT_ID` | string | The GCP Project ID to scope the API to. | — | yes | +| `CRITICAL_NAMESPACES` | string | A comma separated list of namespaces which are critical. 
If pods are unhealthy in these namespaces, a severity 1 issue is raised. | `kube-system,flux-system,cert-manager` | no | +| `MAX_CPU_LIMIT_OVERCOMMIT` | string | The desired Maximum CPU Limits overcommitment factored into node recommendations (e.g. 3=300% overcommitted). | `3` | no | +| `MAX_MEM_LIMIT_OVERCOMMIT` | string | The desired Maximum Memory Limits overcommitment factored into node recommendations (e.g. 2=200% overcommitted). | `2` | no | +| `OPERATIONS_LOOKBACK_HOURS` | string | The time (in hours) to fetch and analyze cluster operations. | `24` | no | +| `OPERATIONS_STUCK_HOURS` | string | The amount of time (in hours) to declare an operation as stuck. | `2` | no | +| `NODE_HEALTH_LOOKBACK_HOURS` | string | The time (in hours) to look back for node pool events and compute operations when checking node health. | `24` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `gcp_credentials` | GCP service account json used to authenticate with GCP APIs. | yes | + +## Outputs + +- `issues.json` +- `recommendations_issues.json` +- `version_support_issues.json` +- `cluster_health_issues.json` +- `region_quota_issues.json` +- `node_size_issues.json` +- `cluster_operations_issues.json` +- `node_pool_health_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/gke-cluster-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/gke-cluster-health +export GCP_PROJECT_ID=... +export CRITICAL_NAMESPACES=... +export MAX_CPU_LIMIT_OVERCOMMIT=... +export MAX_MEM_LIMIT_OVERCOMMIT=... +export OPERATIONS_LOOKBACK_HOURS=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/gke-cluster-health +export GCP_PROJECT_ID=... +export CRITICAL_NAMESPACES=... +export MAX_CPU_LIMIT_OVERCOMMIT=... +bash check_gke_version_support.sh +bash cluster_health.sh +bash cluster_operations.sh +bash gcp_recommendations.sh +bash node_pool_health.sh +bash quota_check.sh +bash sa_check.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `check_gke_version_support.sh` — Bash helper script `check_gke_version_support.sh`. +- `cluster_health.sh` — Bash helper script `cluster_health.sh`. +- `cluster_operations.sh` — Bash helper script `cluster_operations.sh`. +- `gcp_recommendations.sh` — Bash helper script `gcp_recommendations.sh`. +- `node_pool_health.sh` — Bash helper script `node_pool_health.sh`. +- `quota_check.sh` — Bash helper script `quota_check.sh`. +- `sa_check.sh` — Bash helper script `sa_check.sh`. diff --git a/codebundles/jenkins-health/SKILL-TEMPLATE.md b/codebundles/jenkins-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..59d29831a --- /dev/null +++ b/codebundles/jenkins-health/SKILL-TEMPLATE.md @@ -0,0 +1,256 @@ +--- +name: jenkins-health +kind: skill-template +description: List Jenkins health, failed builds, tests and long running builds. Use when triaging or monitoring Jenkins workloads with skill template `jenkins-health`. +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Jenkins] +resource_types: [] +access: read-only +--- + +# Jenkins Health + +## Summary + +This CodeBundle monitors and evaluates the health of Jenkins using the Jenkins REST API. The SLI produces a score of 0 (bad), 1 (good), or a value in between. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### List Failed Build Logs in Jenkins Instance `${JENKINS_INSTANCE_NAME}` + +Fetches logs from failed Jenkins builds using the Jenkins API + +- **Robot task name**: List Failed Build Logs in Jenkins Instance `${JENKINS_INSTANCE_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `${CURDIR}/failed_build_logs.sh` +- **Tags**: `Jenkins`, `Logs`, `Builds`, `data:logs-regexp` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_USERNAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Long Running Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}` + +Identifies Jenkins builds that have been running longer than a specified threshold + +- **Robot task name**: List Long Running Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `${CURDIR}/long_running_builds.sh` +- **Tags**: `Jenkins`, `Builds`, `data:config` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_USERNAME`, `LONG_RUNNING_BUILD_MAX_WAIT_TIME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Recent Failed Tests in Jenkins Instance `${JENKINS_INSTANCE_NAME}` + +List Recent Failed Tests in Jenkins Instance + +- **Robot task name**: List Recent Failed Tests in Jenkins Instance `${JENKINS_INSTANCE_NAME}` +- **Robot file**: `runbook.robot` +- **Tags**: `Jenkins`, `Tests`, `data:logs-regexp` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_URL`, `JENKINS_USERNAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Jenkins Instance `${JENKINS_INSTANCE_NAME}` Health + +Check if Jenkins instance is reachable and responding + +- **Robot task name**: Check Jenkins Instance `${JENKINS_INSTANCE_NAME}` Health +- **Robot file**: `runbook.robot` +- **Tags**: `Jenkins`, `Health`, `data:config` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_URL`, `JENKINS_USERNAME` +- **Writes**: — +- **Issues 
raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Long Queued Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}` + +Check for builds stuck in queue beyond threshold + +- **Robot task name**: List Long Queued Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}` +- **Robot file**: `runbook.robot` +- **Tags**: `Jenkins`, `Queue`, `Builds`, `data:config` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_URL`, `JENKINS_USERNAME`, `QUEUED_BUILD_MAX_WAIT_TIME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Executor Utilization in Jenkins Instance `${JENKINS_INSTANCE_NAME}` + +Check Jenkins executor utilization across nodes + +- **Robot task name**: List Executor Utilization in Jenkins Instance `${JENKINS_INSTANCE_NAME}` +- **Robot file**: `runbook.robot` +- **Tags**: `Jenkins`, `Executors`, `Utilization`, `data:config` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_URL`, `JENKINS_USERNAME`, `MAX_EXECUTOR_UTILIZATION` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Jenkins Instance `${JENKINS_INSTANCE_NAME}` Logs and Add to Report + +Fetches and displays Jenkins logs from the Atom feed + +- **Robot task name**: Fetch Jenkins Instance `${JENKINS_INSTANCE_NAME}` Logs and Add to Report +- **Robot file**: `runbook.robot` +- **Tags**: `Jenkins`, `Logs`, `data:logs-bulk` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_URL`, `JENKINS_USERNAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Check Jenkins health, failed builds, tests and long running builds + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Check For Failed Build Logs in Jenkins Instance `${JENKINS_INSTANCE_NAME}` + +Check For Failed Build Logs in Jenkins + 
+- **Robot task name**: Check For Failed Build Logs in Jenkins Instance `${JENKINS_INSTANCE_NAME}` +- **Sub-metric name**: `failed_builds` +- **Underlying script**: `${CURDIR}/failed_build_logs.sh` +- **Tags**: `Jenkins`, `Logs`, `Builds`, `data:logs-regexp` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_USERNAME`, `MAX_FAILED_BUILDS` +- **Pass condition**: `int(${failed_builds}) <= int(${MAX_FAILED_BUILDS})` + + +#### Check For Long Running Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}` + +Check Jenkins builds that have been running longer than a specified threshold + +- **Robot task name**: Check For Long Running Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}` +- **Sub-metric name**: `long_running_builds` +- **Underlying script**: `${CURDIR}/long_running_builds.sh` +- **Tags**: `Jenkins`, `Builds`, `data:config` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_USERNAME`, `LONG_RUNNING_BUILD_MAX_WAIT_TIME`, `MAX_LONG_RUNNING_BUILDS` +- **Pass condition**: `int(${long_running_count}) <= int(${MAX_LONG_RUNNING_BUILDS})` + + +#### Check For Recent Failed Tests in Jenkins Instance `${JENKINS_INSTANCE_NAME}` + +Check For Recent Failed Tests in Jenkins + +- **Robot task name**: Check For Recent Failed Tests in Jenkins Instance `${JENKINS_INSTANCE_NAME}` +- **Sub-metric name**: `failed_tests` +- **Tags**: `Jenkins`, `Tests`, `data:logs-regexp` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_URL`, `JENKINS_USERNAME`, `MAX_ALLOWED_FAILED_TESTS` +- **Pass condition**: `int(${total_failed_tests}) <= int(${MAX_ALLOWED_FAILED_TESTS})` + + +#### Check For Jenkins Instance `${JENKINS_INSTANCE_NAME}` Health + +Check if Jenkins instance is reachable and responding + +- **Robot task name**: Check For Jenkins Instance `${JENKINS_INSTANCE_NAME}` Health +- **Sub-metric name**: `instance_health` +- **Tags**: `Jenkins`, `Health`, `data:config` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_URL`, `JENKINS_USERNAME` + + +#### Check For Long Queued Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}` + 
+Check for builds stuck in queue beyond threshold and calculate SLI score + +- **Robot task name**: Check For Long Queued Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}` +- **Sub-metric name**: `queued_builds` +- **Tags**: `Jenkins`, `Queue`, `Builds`, `SLI`, `data:config` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_URL`, `JENKINS_USERNAME`, `MAX_QUEUED_BUILDS`, `QUEUED_BUILD_MAX_WAIT_TIME` +- **Pass condition**: `int(${queued_count}) <= int(${MAX_QUEUED_BUILDS})` + + +#### Check Jenkins Executor Utilization in Jenkins Instance `${JENKINS_INSTANCE_NAME}` + +Check if Jenkins executor utilization is above 80% + +- **Robot task name**: Check Jenkins Executor Utilization in Jenkins Instance `${JENKINS_INSTANCE_NAME}` +- **Sub-metric name**: `executor_utilization` +- **Tags**: `Jenkins`, `Executors`, `Utilization`, `data:config` +- **Reads**: `JENKINS_TOKEN`, `JENKINS_URL`, `JENKINS_USERNAME`, `MAX_EXECUTOR_UTILIZATION` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `JENKINS_URL` | string | The URL of your Jenkins instance | — | yes | +| `LONG_RUNNING_BUILD_MAX_WAIT_TIME` | string | The threshold for long running builds, formats like '5m', '2h', '1d' or '5min', '2h', '1d' | `"10m"` | no | +| `QUEUED_BUILD_MAX_WAIT_TIME` | string | The time threshold for builds in queue, formats like '5m', '2h', '1d' or '5min', '2h', '1d' | `"10m"` | no | +| `MAX_EXECUTOR_UTILIZATION` | string | The maximum percentage of executor utilization to consider healthy | `"80"` | no | +| `JENKINS_INSTANCE_NAME` | string | Jenkins Instance Name | `"prod-jenkins"` | no | +| `MAX_LONG_RUNNING_BUILDS` | string | The maximum number of long running builds to consider healthy | `"0"` | no | +| `MAX_FAILED_BUILDS` | string | The maximum number of failed builds allowed and consider healthy | `"0"` | no | +| `MAX_ALLOWED_FAILED_TESTS` | string | The maximum number of failed tests allowed and consider healthy | `"0"` | no | +| `MAX_QUEUED_BUILDS` | string | 
The maximum number of builds stuck in queue to consider healthy | `"0"` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `JENKINS_USERNAME` | Jenkins username for authentication | yes | +| `JENKINS_TOKEN` | Jenkins API token for authentication | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/jenkins-health/runbook.robot` +- **Monitor**: `codebundles/jenkins-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/jenkins-health +export JENKINS_URL=... +export LONG_RUNNING_BUILD_MAX_WAIT_TIME=... +export QUEUED_BUILD_MAX_WAIT_TIME=... +export MAX_EXECUTOR_UTILIZATION=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/jenkins-health +export JENKINS_URL=... +export LONG_RUNNING_BUILD_MAX_WAIT_TIME=... +bash failed_build_logs.sh +bash long_running_builds.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `failed_build_logs.sh` — Bash helper script `failed_build_logs.sh`. +- `long_running_builds.sh` — Bash helper script `long_running_builds.sh`. 
diff --git a/codebundles/k8s-app-troubleshoot/SKILL-TEMPLATE.md b/codebundles/k8s-app-troubleshoot/SKILL-TEMPLATE.md new file mode 100644 index 000000000..9ac0ebd66 --- /dev/null +++ b/codebundles/k8s-app-troubleshoot/SKILL-TEMPLATE.md @@ -0,0 +1,147 @@ +--- +name: k8s-app-troubleshoot +kind: skill-template +description: Performs application-level troubleshooting by inspecting the logs of a workload for parsable exceptions,. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-app... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Application Troubleshoot + +## Summary + +This codebundle attempts to identify issues created in application code changes recently. + +See [README.md](README.md) for additional context. + +## Tools + +### Get `${CONTAINER_NAME}` Application Logs from Workload `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` + +Collects the last approximately 300 lines of logs from the workload + +- **Robot task name**: Get `${CONTAINER_NAME}` Application Logs from Workload `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `resource`, `application`, `workload`, `logs`, `state`, `${container_name}`, `${workload_name}`, `access:read-only`, `data:logs-bulk` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scan `${CONTAINER_NAME}` Application For Misconfigured Environment + +Compares codebase to configured infra environment variables and attempts to report missing environment variables in the app + +- **Robot task name**: Scan `${CONTAINER_NAME}` Application For Misconfigured Environment +- **Robot file**: 
`runbook.robot` +- **Underlying script**: `env_check.sh` +- **Tags**: `environment`, `variables`, `env`, `infra`, `${container_name}`, `${workload_name}`, `data:config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Tail `${CONTAINER_NAME}` Application Logs For Stacktraces in Workload `${WORKLOAD_NAME}` + +Performs an inspection on container logs for exceptions/stacktraces, parsing them and attempts to find relevant source code information + +- **Robot task name**: Tail `${CONTAINER_NAME}` Application Logs For Stacktraces in Workload `${WORKLOAD_NAME}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `CREATE_ISSUES`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS`, `NAMESPACE`, `REPO_URI` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures the number of exception stacktraces present in an application's logs over a time period. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Measure Application Exceptions in `${NAMESPACE}` + +Examines recent logs for exceptions, providing a count of them. + +- **Robot task name**: Measure Application Exceptions in `${NAMESPACE}` +- **Sub-metric name**: `app_troubleshoot` +- **Tags**: `resource`, `application`, `workload`, `logs`, `state`, `exceptions`, `errors`, `data:logs-stacktrace` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS`, `NAMESPACE` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | `sock-shop` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. 
| `sandbox-cluster-1` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `REPO_URI` | string | Repo URI for the source code to inspect. | `https://github.com/runwhen-contrib/runwhen-local` | no | +| `LABELS` | string | The Kubernetes labels used to select the resource for logs. | — | yes | +| `CREATE_ISSUES` | string | Whether or not the taskset should create github issues when it finds problems. | `YES` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-app-troubleshoot/runbook.robot` +- **Monitor**: `codebundles/k8s-app-troubleshoot/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-app-troubleshoot +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export REPO_URI=... +export LABELS=... +export CREATE_ISSUES=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-app-troubleshoot +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export REPO_URI=... +bash env_check.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `env_check.sh` — Bash helper script `env_check.sh`. 
diff --git a/codebundles/k8s-argocd-application-health/SKILL-TEMPLATE.md b/codebundles/k8s-argocd-application-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..05134dfe1 --- /dev/null +++ b/codebundles/k8s-argocd-application-health/SKILL-TEMPLATE.md @@ -0,0 +1,136 @@ +--- +name: k8s-argocd-application-health +kind: skill-template +description: This taskset collects information and runs general troubleshooting checks against argocd application objects within... Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill temp... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, ArgoCD] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes ArgoCD Application Health & Troubleshoot + +## Summary + +This codebundle is used to help measure and troubleshoot the health of an ArgoCD managed application. + +See [README.md](README.md) for additional context. + +## Tools + +### Fetch ArgoCD Application Sync Status & Health for `${APPLICATION}` + +Shows the sync status and health of the ArgoCD application. + +- **Robot task name**: Fetch ArgoCD Application Sync Status & Health for `${APPLICATION}` +- **Robot file**: `runbook.robot` +- **Tags**: `Application`, `Sync`, `Health`, `ArgoCD`, `data:config` +- **Reads**: `APPLICATION`, `APPLICATION_APP_NAMESPACE`, `CONTEXT` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch ArgoCD Application Last Sync Operation Details for `${APPLICATION}` + +Fetches the last ArgoCD Application sync operation status. 
+ +- **Robot task name**: Fetch ArgoCD Application Last Sync Operation Details for `${APPLICATION}` +- **Robot file**: `runbook.robot` +- **Tags**: `Application`, `SyncOperation`, `History`, `ArgoCD`, `data:config` +- **Reads**: `APPLICATION`, `APPLICATION_APP_NAMESPACE`, `CONTEXT` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Unhealthy ArgoCD Application Resources for `${APPLICATION}` + +Displays all resources in an ArgoCD Application that are not in a healthy state. + +- **Robot task name**: Fetch Unhealthy ArgoCD Application Resources for `${APPLICATION}` +- **Robot file**: `runbook.robot` +- **Tags**: `Resources`, `Unhealthy`, `SyncStatus`, `ArgoCD`, `data:config` +- **Reads**: `APPLICATION`, `APPLICATION_APP_NAMESPACE`, `APPLICATION_TARGET_NAMESPACE`, `CONTEXT` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scan For Errors in Pod Logs Related to ArgoCD Application `${APPLICATION}` + +Greps for the error pattern across all pods managed by this Application's deployments. + +- **Robot task name**: Scan For Errors in Pod Logs Related to ArgoCD Application `${APPLICATION}` +- **Robot file**: `runbook.robot` +- **Tags**: `Error`, `Logs`, `Deployments`, `ArgoCD`, `Pods`, `data:logs-regexp` +- **Reads**: `APPLICATION`, `APPLICATION_TARGET_NAMESPACE`, `ERROR_PATTERN` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fully Describe ArgoCD Application `${APPLICATION}` + +Describe all details regarding the ArgoCD Application. Useful if reviewing all content. 
+ +- **Robot task name**: Fully Describe ArgoCD Application `${APPLICATION}` +- **Robot file**: `runbook.robot` +- **Tags**: `Application`, `Describe`, `ArgoCD`, `data:config` +- **Reads**: `APPLICATION`, `APPLICATION_APP_NAMESPACE`, `CONTEXT` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `binary_name` | string | The Kubernetes cli binary to use. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `ERROR_PATTERN` | string | The error pattern to use when grep-ing logs. | `Error|Exception` | no | +| `APPLICATION` | string | The name of the ArgoCD Application to query. Leave blank to query all applications within the namespace. | `''` | no | +| `APPLICATION_TARGET_NAMESPACE` | string | The name of the Kubernetes namespace where the application resources are deployed to. | — | yes | +| `APPLICATION_APP_NAMESPACE` | string | The name of the Kubernetes namespace in which the ArgoCD Application resource exists. | — | yes | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-argocd-application-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-argocd-application-health +export binary_name=... +export CONTEXT=... +export ERROR_PATTERN=... +export APPLICATION=... +export APPLICATION_TARGET_NAMESPACE=... +export APPLICATION_APP_NAMESPACE=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-argocd-helm-health/SKILL-TEMPLATE.md b/codebundles/k8s-argocd-helm-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..e1f602d81 --- /dev/null +++ b/codebundles/k8s-argocd-helm-health/SKILL-TEMPLATE.md @@ -0,0 +1,94 @@ +--- +name: k8s-argocd-helm-health +kind: skill-template +description: This codebundle runs a series of tasks to identify potential helm release issues related to ArgoCD managed Helm objects. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill te... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, ArgoCD] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes ArgoCD HelmRelease TaskSet + +## Summary + +This codebundle is used to help measure and troubleshoot the health of ArgoCD-managed Helm deployments. + +See [README.md](README.md) for additional context. + +## Tools + +### Fetch all available ArgoCD Helm releases in namespace `${NAMESPACE}` + +List all ArgoCD helm releases that are visible to the kubeconfig. + +- **Robot task name**: Fetch all available ArgoCD Helm releases in namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `argocd`, `helmrelease`, `available`, `list`, `health`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Installed ArgoCD Helm release versions in namespace `${NAMESPACE}` + +Fetch Installed ArgoCD Helm release Versions. 
+ +- **Robot task name**: Fetch Installed ArgoCD Helm release versions in namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `argocd`, `helmrelease`, `version`, `state`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `DISTRIBUTION` | string | Which distribution of Kubernetes to use for operations, such as: Kubernetes, OpenShift, etc. | `Kubernetes` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | `default` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-argocd-helm-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-argocd-helm-health +export DISTRIBUTION=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-artifactory-health/SKILL-TEMPLATE.md b/codebundles/k8s-artifactory-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..8b0b0cedc --- /dev/null +++ b/codebundles/k8s-artifactory-health/SKILL-TEMPLATE.md @@ -0,0 +1,89 @@ +--- +name: k8s-artifactory-health +kind: skill-template +description: Performs a triage on the Open Source version of Artifactory in a Kubernetes cluster. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-artifactory-health`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, Artifactory] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Artifactory Triage + +## Summary + +This codebundle queries the health REST endpoints of an Artifactory workload in Kubernetes, checking if the service is healthy, and raising issues if it's not. + +See [README.md](README.md) for additional context. + +## Tools + +### Check Artifactory Liveness and Readiness Endpoints in `NAMESPACE` + +Runs a set of exec commands internally in the Artifactory workloads to curl the system health endpoints. 
+ +- **Robot task name**: Check Artifactory Liveness and Readiness Endpoints in `NAMESPACE` +- **Robot file**: `runbook.robot` +- **Tags**: `Pods`, `Statefulset`, `Artifactory`, `Health`, `System`, `Curl`, `API`, `OK`, `HTTP`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `STATEFULSET_NAME` | string | The name of the Artifactory statefulset. | `artifactory-oss` | no | +| `NAMESPACE` | string | The name of the Kubernetes namespace that the Artifactory workloads reside in. | `artifactory` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `LABELS` | string | The Kubernetes labels used to fetch the first matching statefulset. | `` | yes | +| `EXPECTED_AVAILABILITY` | string | The minimum numbers of replicas allowed considered healthy. | `2` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for CLI commands | `kubectl` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `kubeconfig` | The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). | yes | + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-artifactory-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-artifactory-health +export STATEFULSET_NAME=... 
+export NAMESPACE=... +export CONTEXT=... +export LABELS=... +export EXPECTED_AVAILABILITY=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-certmanager-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-certmanager-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..1d7ef14b8 --- /dev/null +++ b/codebundles/k8s-certmanager-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,143 @@ +--- +name: k8s-certmanager-healthcheck +kind: skill-template +description: Checks the overall health of certificates in a namespace that are managed by cert-manager. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-certmanager-health... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, cert-manager] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes cert-manager Healthcheck + +## Summary + +This taskset looks into issues related to CertManager Certificates. + +See [README.md](README.md) for additional context. + +## Tools + +### Get Namespace Certificate Summary for Namespace `${NAMESPACE}` + +Gets a list of cert-manager certificates that are due for renewal and summarize their information for review. 
+ +- **Robot task name**: Get Namespace Certificate Summary for Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `tls`, `certificates`, `kubernetes`, `objects`, `expiration`, `summary`, `cert-manager`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Find Unhealthy Certificates in Namespace `${NAMESPACE}` + +Gets a list of cert-manager certificates that are not available. + +- **Robot task name**: Find Unhealthy Certificates in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `tls`, `certificates`, `kubernetes`, `cert-manager`, `failed`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Find Failed Certificate Requests and Identify Issues for Namespace `${NAMESPACE}` + +Gets a list of failed cert-manager certificates and summarizes their issues. + +- **Robot task name**: Find Failed Certificate Requests and Identify Issues for Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `certificate_next_steps.sh` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Counts the number of unhealthy cert-manager managed certificates in a namespace. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Count Unready and Expired Certificates in Namespace `${NAMESPACE}` + +Adds together the count of unready and expired certificates. A healthy SLI value is 0. 
+ +- **Robot task name**: Count Unready and Expired Certificates in Namespace `${NAMESPACE}` +- **Sub-metric name**: `cert_manager_health` +- **Tags**: `certificate`, `status`, `count`, `health`, `certmanager`, `cert`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the namespace to search. | `` | yes | +| `DISTRIBUTION` | string | Which distribution of Kubernetes to use for operations, such as: Kubernetes, OpenShift, etc. | `Kubernetes` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-certmanager-healthcheck/runbook.robot` +- **Monitor**: `codebundles/k8s-certmanager-healthcheck/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-certmanager-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export DISTRIBUTION=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-certmanager-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export DISTRIBUTION=... 
+bash certificate_next_steps.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `certificate_next_steps.sh` — Bash helper script `certificate_next_steps.sh`. diff --git a/codebundles/k8s-chaos-flux/SKILL-TEMPLATE.md b/codebundles/k8s-chaos-flux/SKILL-TEMPLATE.md new file mode 100644 index 000000000..59b098808 --- /dev/null +++ b/codebundles/k8s-chaos-flux/SKILL-TEMPLATE.md @@ -0,0 +1,141 @@ +--- +name: k8s-chaos-flux +kind: skill-template +description: This taskset is used to suspend a flux resource for the purposes of executing chaos tasks. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-chaos-flux`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [kubernetes_resource] +access: read-write +--- + +# Kubernetes Flux Chaos Testing + +## Summary + +The `k8s-chaos-flux` codebundle is built to facilitate chaos tests on Flux-managed resources. + +See [README.md](README.md) for additional context. + +## Tools + +### Suspend the Flux Resource Reconciliation for `${FLUX_RESOURCE_NAME}` in namespace `${FLUX_RESOURCE_NAMESPACE}` + +Suspends a flux resource so that it can be manipulated for chaos purposes.
+ +- **Robot task name**: Suspend the Flux Resource Reconciliation for `${FLUX_RESOURCE_NAME}` in namespace `${FLUX_RESOURCE_NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `Chaos`, `Flux`, `Kubernetes`, `Resource`, `Suspend`, `access:read-write` +- **Reads**: `CONTEXT`, `FLUX_RESOURCE_NAME`, `FLUX_RESOURCE_NAMESPACE`, `FLUX_RESOURCE_TYPE`, `KUBERNETES_DISTRIBUTION_BINARY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Select Random FluxCD Workload for Chaos Target in Namespace `${FLUX_RESOURCE_NAMESPACE}` + +Inspects the Flux resource and randomly selects a deployment to tickle. Tehe. Only runs if RANDOMIZE = Yes. + +- **Robot task name**: Select Random FluxCD Workload for Chaos Target in Namespace `${FLUX_RESOURCE_NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `Chaos`, `Flux`, `Kubernetes`, `Resource`, `Random`, `access:read-write` +- **Reads**: `CONTEXT`, `FLUX_RESOURCE_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `RANDOMIZE`, `TARGET_NAMESPACE`, `TARGET_RESOURCE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Execute Chaos Command on `${TARGET_RESOURCE}` in Namespace `${TARGET_NAMESPACE}` + +Run the desired chaos command within a targeted resource + +- **Robot task name**: Execute Chaos Command on `${TARGET_RESOURCE}` in Namespace `${TARGET_NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `Chaos`, `Flux`, `Kubernetes`, `Resource`, `Kill`, `OOM`, `access:read-write` +- **Reads**: `CHAOS_COMMAND`, `CHAOS_COMMAND_LOOP`, `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `TARGET_NAMESPACE`, `TARGET_RESOURCE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Execute Additional Chaos Command on ${FLUX_RESOURCE_TYPE} '${FLUX_RESOURCE_NAME}' in namespace '${FLUX_RESOURCE_NAMESPACE}' + +Run the additional command as input, verbatim. 
+ +- **Robot task name**: Execute Additional Chaos Command on ${FLUX_RESOURCE_TYPE} '${FLUX_RESOURCE_NAME}' in namespace '${FLUX_RESOURCE_NAMESPACE}' +- **Robot file**: `runbook.robot` +- **Tags**: `Chaos`, `Flux`, `Kubernetes`, `Resource`, `access:read-write` +- **Reads**: `ADDNL_COMMAND`, `CONTEXT`, `TARGET_NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Resume Flux Resource Reconciliation in `${TARGET_NAMESPACE}` + +Resumes Flux reconciliation on desired resource. + +- **Robot task name**: Resume Flux Resource Reconciliation in `${TARGET_NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `Chaos`, `Flux`, `Kubernetes`, `Resource`, `Resume`, `access:read-write` +- **Reads**: `CONTEXT`, `FLUX_RESOURCE_NAME`, `FLUX_RESOURCE_NAMESPACE`, `FLUX_RESOURCE_TYPE`, `KUBERNETES_DISTRIBUTION_BINARY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `RANDOMIZE` | string | Boolean to determine whether to randomly select the impacted resource. | `No` | no | +| `FLUX_RESOURCE_TYPE` | string | The type of the Flux resource to suspend. | `kustomization` | no | +| `FLUX_RESOURCE_NAME` | string | The name of the Flux resource to suspend. | `app-online-boutique` | no | +| `FLUX_RESOURCE_NAMESPACE` | string | The name of the namespace that manages the Flux resource. | `flux-system` | no | +| `TARGET_NAMESPACE` | string | The name of the namespace to target when invoking resource instability. | `online-boutique` | no | +| `TARGET_RESOURCE` | string | The name of the target resource to run chaos commands in. 
| `deployment/cartservice` | no | +| `CHAOS_COMMAND` | string | The command to run in the target pod. | `/bin/sh -c "while true; do yes > /dev/null & done"` | no | +| `CHAOS_COMMAND_LOOP` | string | The number of times to execute this command. | `1` | no | +| `ADDNL_COMMAND` | string | Run any additional chaos command - verbatim. | `kubectl get pods` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-chaos-flux/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-chaos-flux +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export RANDOMIZE=... +export FLUX_RESOURCE_TYPE=... +export FLUX_RESOURCE_NAME=... +export FLUX_RESOURCE_NAMESPACE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-chaos-namespace/SKILL-TEMPLATE.md b/codebundles/k8s-chaos-namespace/SKILL-TEMPLATE.md new file mode 100644 index 000000000..54468be98 --- /dev/null +++ b/codebundles/k8s-chaos-namespace/SKILL-TEMPLATE.md @@ -0,0 +1,150 @@ +--- +name: k8s-chaos-namespace +kind: skill-template +description: Provides chaos injection tasks for Kubernetes namespaces. These are destructive tasks and the expectation is that... Use when triaging or monitoring Kubernetes, Chaos, Engineering workloads with sk... 
+runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, Chaos, Engineering, Namespace] +resource_types: [namespace] +access: read-only +--- + +# Kubernetes Namespace Chaos Engineering + +## Summary + +This codebundle provides chaos injection for kubernetes namespaces `Test Namespace Highly Available`. + +See [README.md](README.md) for additional context. + +## Tools + +### Kill Random Pods In Namespace `${NAMESPACE}` + +Randomly selects up to 10 pods in a namespace to delete to test HA + +- **Robot task name**: Kill Random Pods In Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `delete_random_pods.sh` +- **Tags**: `Kubernetes`, `Namespace`, `Deployments`, `Pods`, `Highly`, `Available` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### OOMKill Pods In Namespace `${NAMESPACE}` + +Randomly selects n number of pods to oomkill + +- **Robot task name**: OOMKill Pods In Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `oomkill_pod.sh` +- **Tags**: `Kubernetes`, `Namespace`, `Deployments`, `Pods`, `Highly`, `Available`, `OOMkill`, `Memory` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Mangle Service Selector In Namespace `${NAMESPACE}` + +Breaks a service's label selector to cause a network disruption + +- **Robot task name**: Mangle Service Selector In Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `Kubernetes`, `networking`, `Services`, `Selector` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Mangle Service Port In Namespace `${NAMESPACE}` + +Changes a service's port to cause a network disruption + +- **Robot task name**: Mangle Service Port In 
Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `Kubernetes`, `networking`, `Services`, `Port` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fill Random Pod Tmp Directory In Namespace `${NAMESPACE}` + +Attaches to a pod and fills the /tmp directory with random data + +- **Robot task name**: Fill Random Pod Tmp Directory In Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `Kubernetes`, `pods`, `volumes`, `tmp` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `CONTEXT` | string | The kubernetes context to use in the kubeconfig provided. | — | yes | +| `NAMESPACE` | string | The namespace to target for scripts. | — | yes | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `kubeconfig` | The kubeconfig secret to use for authenticating with the cluster. | yes | + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-chaos-namespace/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-chaos-namespace +export CONTEXT=... +export NAMESPACE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-chaos-namespace +export CONTEXT=... +export NAMESPACE=... 
+bash change_service_port.sh +bash change_service_selector.sh +bash delete_random_pods.sh +bash drain_node.sh +bash expand_tmp.sh +bash oomkill_pod.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `change_service_port.sh` — Bash helper script `change_service_port.sh`. +- `change_service_selector.sh` — Bash helper script `change_service_selector.sh`. +- `delete_random_pods.sh` — Bash helper script `delete_random_pods.sh`. +- `drain_node.sh` — Bash helper script `drain_node.sh`. +- `expand_tmp.sh` — Bash helper script `expand_tmp.sh`. +- `oomkill_pod.sh` — Bash helper script `oomkill_pod.sh`. diff --git a/codebundles/k8s-chaos-workload/SKILL-TEMPLATE.md b/codebundles/k8s-chaos-workload/SKILL-TEMPLATE.md new file mode 100644 index 000000000..c2e0c8ab5 --- /dev/null +++ b/codebundles/k8s-chaos-workload/SKILL-TEMPLATE.md @@ -0,0 +1,153 @@ +--- +name: k8s-chaos-workload +kind: skill-template +description: Provides chaos injection tasks for specific workloads like your apps in a Kubernetes namespace. These are... Use when triaging or monitoring Kubernetes, Chaos, Engineering workloads with skill temp... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, Chaos, Engineering, Workload, Application, Deployments, StatefulSet] +resource_types: [kubernetes_resource] +access: read-write +--- + +# Kubernetes Workload Chaos Engineering + +## Summary + +This codebundle provides chaos injection for a specific workload within a Kubernetes namespace. + +See [README.md](README.md) for additional context. + +## Tools + +### Test `${WORKLOAD_NAME}` High Availability in Namespace `${NAMESPACE}` + +Kills a pod under this workload to test high availability. 
+ +- **Robot task name**: Test `${WORKLOAD_NAME}` High Availability in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `kill_workload_pod.sh` +- **Tags**: `Kubernetes`, `StatefulSet`, `Deployments`, `Pods`, `Highly`, `Available`, `access:read-write` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### OOMKill `${WORKLOAD_NAME}` Pod + +Kills the oldest pod running under the configured workload. + +- **Robot task name**: OOMKill `${WORKLOAD_NAME}` Pod +- **Robot file**: `runbook.robot` +- **Underlying script**: `oomkill_workload_pod.sh` +- **Tags**: `Kubernetes`, `StatefulSet`, `Deployments`, `Pods`, `Highly`, `Available`, `OOMkill`, `Memory`, `access:read-write` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Mangle Service Selector For `${WORKLOAD_NAME}` in `${NAMESPACE}` + +Breaks a service's label selector to cause a network disruption + +- **Robot task name**: Mangle Service Selector For `${WORKLOAD_NAME}` in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `change_service_selector.sh` +- **Tags**: `Kubernetes`, `networking`, `Services`, `Selector`, `access:read-only` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Mangle Service Port For `${WORKLOAD_NAME}` in `${NAMESPACE}` + +Changes a service's port to cause a network disruption + +- **Robot task name**: Mangle Service Port For `${WORKLOAD_NAME}` in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `change_service_port.sh` +- **Tags**: `Kubernetes`, `networking`, `Services`, `Port`, `access:read-write` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fill Tmp Directory Of Pod From `${WORKLOAD_NAME}` + +Attaches to a pod and fills the /tmp 
directory with random data + +- **Robot task name**: Fill Tmp Directory Of Pod From `${WORKLOAD_NAME}` +- **Robot file**: `runbook.robot` +- **Tags**: `Kubernetes`, `pods`, `volumes`, `tmp`, `access:read-write` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `CONTEXT` | string | The kubernetes context to use in the kubeconfig provided. | — | yes | +| `NAMESPACE` | string | The namespace to target for scripts. | — | yes | +| `WORKLOAD_NAME` | string | The name of the workload to perform chaos testing on. Include the kind in the name, eg: deployment/my-app | — | yes | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `kubeconfig` | The kubeconfig secret to use for authenticating with the cluster. | yes | + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-chaos-workload/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-chaos-workload +export CONTEXT=... +export NAMESPACE=... +export WORKLOAD_NAME=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-chaos-workload +export CONTEXT=... +export NAMESPACE=... +export WORKLOAD_NAME=... 
+bash change_service_port.sh +bash change_service_selector.sh +bash expand_tmp.sh +bash kill_workload_pod.sh +bash oomkill_workload_pod.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `change_service_port.sh` — Bash helper script `change_service_port.sh`. +- `change_service_selector.sh` — Bash helper script `change_service_selector.sh`. +- `expand_tmp.sh` — Bash helper script `expand_tmp.sh`. +- `kill_workload_pod.sh` — Bash helper script `kill_workload_pod.sh`. +- `oomkill_workload_pod.sh` — Bash helper script `oomkill_workload_pod.sh`. diff --git a/codebundles/k8s-cluster-node-health/SKILL-TEMPLATE.md b/codebundles/k8s-cluster-node-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..c8ff36a77 --- /dev/null +++ b/codebundles/k8s-cluster-node-health/SKILL-TEMPLATE.md @@ -0,0 +1,116 @@ +--- +name: k8s-cluster-node-health +kind: skill-template +description: Evaluate cluster node health using kubectl. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-cluster-node-health`. +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Cluster Node Health + +## Summary + +The Service Level Indicator will generate a score for the health of the nodes in the cluster. + +See [README.md](README.md) for additional context. + +## Tools + +### Check for Node Restarts in Cluster `${CONTEXT}` within Interval `${RW_LOOKBACK_WINDOW}` + +Identify nodes that are starting and stopping within the time interval. 
+ +- **Robot task name**: Check for Node Restarts in Cluster `${CONTEXT}` within Interval `${RW_LOOKBACK_WINDOW}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `node_restart_check.sh` +- **Tags**: `cluster`, `preempt`, `spot`, `reboot`, `utilization`, `saturation`, `exhaustion`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Evaluate cluster node health using kubectl. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Generate Namespace Score in Kubernetes Cluster `$${CONTEXT}` + +_No sub-check documentation in Robot source._ + +- **Robot task name**: Generate Namespace Score in Kubernetes Cluster `$${CONTEXT}` +- **Sub-metric name**: `node_health` +- **Tags**: — +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | `default` | no | +| `INTERVAL` | string | The time interval in which to look back for node events. | `10 minutes` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. 
+ +- **Runbook**: `codebundles/k8s-cluster-node-health/runbook.robot` +- **Monitor**: `codebundles/k8s-cluster-node-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-cluster-node-health +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export INTERVAL=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-cluster-node-health +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export INTERVAL=... +bash node_restart_check.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `node_restart_check.sh` — Bash helper script `node_restart_check.sh`. diff --git a/codebundles/k8s-cluster-resource-health/SKILL-TEMPLATE.md b/codebundles/k8s-cluster-resource-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..a74417790 --- /dev/null +++ b/codebundles/k8s-cluster-resource-health/SKILL-TEMPLATE.md @@ -0,0 +1,164 @@ +--- +name: k8s-cluster-resource-health +kind: skill-template +description: Identify resource constraints or issues in a cluster. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-cluster-resource-health`. +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Cluster Resource Health + +## Summary + +The Service Level Indicator will count the number of nodes that are over 90% active utilization according to `kubectl top nodes`. Create a report of all nodes that are above 90% utilization.
+ +See [README.md](README.md) for additional context. + +## Tools + +### Identify High Utilization Nodes for Cluster `${CONTEXT}` + +Identify nodes with high utilization. Requires jq. + +- **Robot task name**: Identify High Utilization Nodes for Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `get_high_use_nodes.sh` +- **Tags**: `cluster`, `resources`, `cpu`, `memory`, `utilization`, `saturation`, `exhaustion`, `starvation`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Identify Pods Causing High Node Utilization in Cluster `${CONTEXT}` + +Identify nodes with high utilization and match to pods that are significantly above their resource request configuration. Requires jq. + +- **Robot task name**: Identify Pods Causing High Node Utilization in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `pods_impacting_high_use_nodes.sh` +- **Tags**: `pods`, `resources`, `requests`, `utilization`, `cpu`, `memory`, `exhaustion`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT` +- **Writes**: `pods_exceeding_requests.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Identify Pods with Resource Limits Exceeding Node Capacity in Cluster `${CONTEXT}` + +Identify any Pods in the Cluster `${CONTEXT}` with resource limits (CPU or Memory) larger than the Node's allocatable capacity.
+ +- **Robot task name**: Identify Pods with Resource Limits Exceeding Node Capacity in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `overlimit_check.sh` +- **Tags**: `nodes`, `limits`, `utilization`, `saturation`, `exhaustion`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Counts the number of nodes above 90% CPU or Memory Utilization from kubectl top. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Identify High Utilization Nodes for Cluster `${CONTEXT}` + +Fetch utilization of each node and raise an issue if CPU or Memory is above 90% utilization. Requires jq. Requires get/list of nodes in "metrics.k8s.io". + +- **Robot task name**: Identify High Utilization Nodes for Cluster `${CONTEXT}` +- **Sub-metric name**: `node_utilization` +- **Underlying script**: `get_high_use_nodes.sh` +- **Tags**: `Cluster`, `Resources`, `CPU`, `Memory`, `Utilization`, `Saturation`, `Exhaustion`, `Starvation`, `data:config` +- **Reads**: — + + +#### Identify Pods with Resource Limits Exceeding Node Capacity in Cluster `${CONTEXT}` + +Identify any Pods in the Cluster `${CONTEXT}` with resource limits (CPU or Memory) larger than the Node's allocatable capacity.
+ +- **Robot task name**: Identify Pods with Resource Limits Exceeding Node Capacity in Cluster `${CONTEXT}` +- **Sub-metric name**: `resource_limits` +- **Underlying script**: `overlimit_check.sh` +- **Tags**: `nodes`, `limits`, `utilization`, `saturation`, `exhaustion`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | `default` | no | +| `MAX_LIMIT_PERCENTAGE` | string | The maximum % that a limit can be with regard to the underlying node capacity. | `90` | no | +| `MEM_USAGE_MIN` | string | The minimum value (in MB) at which to evaluate requests vs usage. Usage below this value is not evaluated. | `100` | no | +| `CPU_USAGE_MIN` | string | The minimum value (in millicores) at which to evaluate requests vs usage. Usage below this value is not evaluated. | `100` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `pods_exceeding_requests.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-cluster-resource-health/runbook.robot` +- **Monitor**: `codebundles/k8s-cluster-resource-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-cluster-resource-health +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export MAX_LIMIT_PERCENTAGE=... +export MEM_USAGE_MIN=... +export CPU_USAGE_MIN=...
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-cluster-resource-health +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export MAX_LIMIT_PERCENTAGE=... +export MEM_USAGE_MIN=... +bash get_high_use_nodes.sh +bash overlimit_check.sh +bash pods_impacting_high_use_nodes.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `get_high_use_nodes.sh` — Bash helper script `get_high_use_nodes.sh`. +- `overlimit_check.sh` — Bash helper script `overlimit_check.sh`. +- `pods_impacting_high_use_nodes.sh` — Bash helper script `pods_impacting_high_use_nodes.sh`. diff --git a/codebundles/k8s-daemonset-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-daemonset-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..93f259b53 --- /dev/null +++ b/codebundles/k8s-daemonset-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,225 @@ +--- +name: k8s-daemonset-healthcheck +kind: skill-template +description: Triages issues related to a DaemonSet and its pods, including node scheduling and resource constraints. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-daemo... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [daemonset] +access: read-only +--- + +# Kubernetes DaemonSet Triage + +## Summary + +This codebundle provides a suite of tasks aimed at triaging issues related to a daemonset and its replicas in Kubernetes clusters. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Analyze Application Log Patterns for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` + +Fetches and analyzes logs from the DaemonSet pods for errors, stack traces, connection issues, and other patterns that indicate application health problems. + +- **Robot task name**: Analyze Application Log Patterns for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DAEMONSET_NAME`, `LOG_AGE`, `LOG_ANALYSIS_DEPTH`, `LOG_SEVERITY_THRESHOLD`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Detect Log Anomalies for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` + +Analyzes logs for repeating patterns, anomalous behavior, and unusual log volume that may indicate underlying issues. + +- **Robot task name**: Detect Log Anomalies for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DAEMONSET_NAME`, `LOG_AGE`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Identify Recent Configuration Changes for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` + +Identifies recent configuration changes from ControllerRevision analysis that might be related to current issues. 
+ +- **Robot task name**: Identify Recent Configuration Changes for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DAEMONSET_NAME`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Liveness Probe Configuration for DaemonSet `${DAEMONSET_NAME}` + +Validates if a Liveness probe has possible misconfigurations + +- **Robot task name**: Check Liveness Probe Configuration for DaemonSet `${DAEMONSET_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `validate_probes.sh` +- **Tags**: — +- **Reads**: `CONTEXT`, `DAEMONSET_NAME`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Readiness Probe Configuration for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` + +Validates if a readiness probe has possible misconfigurations + +- **Robot task name**: Check Readiness Probe Configuration for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `validate_probes.sh` +- **Tags**: — +- **Reads**: `CONTEXT`, `DAEMONSET_NAME`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check for Container Restarts in DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` + +Analyzes container restart patterns in the DaemonSet pods to identify the root cause of restarts, distinguishing between OOM kills, liveness probe failures, and other termination causes. 
+ +- **Robot task name**: Check for Container Restarts in DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `container_restarts.sh` +- **Tags**: `access:read-only`, `containers`, `restarts`, `errors`, `oom`, `probes`, `daemonset`, `${DAEMONSET_NAME}`, `data:config` +- **Reads**: `DAEMONSET_NAME`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect DaemonSet Warning Events for `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` + +Fetches warning events related to the DaemonSet workload in the namespace and triages any issues found in the events. + +- **Robot task name**: Inspect DaemonSet Warning Events for `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `workload_issues.sh` +- **Tags**: `access:read-only`, `events`, `workloads`, `errors`, `warnings`, `get`, `daemonset`, `${DAEMONSET_NAME}`, `data:config` +- **Reads**: `CONTEXT`, `DAEMONSET_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch DaemonSet Workload Details For `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` + +Fetches the current state of the DaemonSet for future review in the report. 
+ +- **Robot task name**: Fetch DaemonSet Workload Details For `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `daemonset`, `details`, `manifest`, `info`, `${DAEMONSET_NAME}`, `data:config` +- **Reads**: `CONTEXT`, `DAEMONSET_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect DaemonSet Status for `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` + +Pulls the status information for a given DaemonSet and checks if all pods are properly scheduled and running across nodes, identifying node scheduling issues. + +- **Robot task name**: Inspect DaemonSet Status for `${DAEMONSET_NAME}` in namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `workload_next_steps.sh` +- **Tags**: — +- **Reads**: `CONTEXT`, `DAEMONSET_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Node Affinity and Tolerations for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` + +Checks the node affinity, tolerations, and scheduling constraints of the DaemonSet to identify potential scheduling issues. + +- **Robot task name**: Check Node Affinity and Tolerations for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DAEMONSET_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. 
| — | yes | +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `DAEMONSET_NAME` | string | The name of the DaemonSet to triage. | — | yes | +| `LOG_AGE` | string | The age of logs to fetch from pods, used for log analysis tasks. | `3h` | no | +| `LOG_ANALYSIS_DEPTH` | string | The depth of log analysis to perform - basic, standard, or comprehensive. | `standard` | no | +| `LOG_SEVERITY_THRESHOLD` | string | The minimum severity level for creating issues (1=critical, 2=high, 3=medium, 4=low, 5=info). | `3` | no | +| `LOG_PATTERN_CATEGORIES` | string | Comma-separated list of log pattern categories to scan for. | `GenericError,AppFailure,StackTrace,Connection,Timeout,Auth,Exceptions,Resource` | no | +| `ANOMALY_THRESHOLD` | string | The threshold for detecting event anomalies based on events per minute. | `5` | no | +| `CONTAINER_RESTART_AGE` | string | The time window (in (h) hours or (m) minutes) to search for container restarts. Only containers that restarted within this time period will be reported. | `10m` | no | +| `CONTAINER_RESTART_THRESHOLD` | string | The minimum number of restarts required to trigger an issue. Containers with restart counts below this threshold will be ignored. | `1` | no | +| `EXCLUDED_CONTAINER_NAMES` | string | Comma-separated list of container names to exclude from log analysis (e.g., linkerd-proxy, istio-proxy, vault-agent). | `linkerd-proxy,istio-proxy,vault-agent` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. 
+ +- **Runbook**: `codebundles/k8s-daemonset-healthcheck/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-daemonset-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export DAEMONSET_NAME=... +export LOG_AGE=... +export LOG_ANALYSIS_DEPTH=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-daemonset-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export DAEMONSET_NAME=... +bash container_restarts.sh +bash track_daemonset_config_changes.sh +bash validate_probes.sh +bash workload_issues.sh +bash workload_next_steps.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `container_restarts.sh` — Bash helper script `container_restarts.sh`. +- `track_daemonset_config_changes.sh` — Bash helper script `track_daemonset_config_changes.sh`. +- `validate_probes.sh` — Bash helper script `validate_probes.sh`. +- `workload_issues.sh` — Bash helper script `workload_issues.sh`. +- `workload_next_steps.sh` — Bash helper script `workload_next_steps.sh`. diff --git a/codebundles/k8s-deployment-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-deployment-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..5f4600668 --- /dev/null +++ b/codebundles/k8s-deployment-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,311 @@ +--- +name: k8s-deployment-healthcheck +kind: skill-template +description: Triages issues related to a deployment and its replicas. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-deployment-healthcheck`. 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [deployment] +access: read-only +--- + +# Kubernetes Deployment Triage + +## Summary + +This codebundle provides a suite of tasks aimed at triaging issues related to a deployment and its replicas in Kubernetes clusters. + +See [README.md](README.md) for additional context. + +## Tools + +### Analyze Application Log Patterns for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Fetches and analyzes logs from the deployment pods for errors, connection issues, and other patterns that indicate application health problems. Note: Warning messages about missing log files for excluded containers (like linkerd-proxy, istio-proxy) are expected and harmless. + +- **Robot task name**: Analyze Application Log Patterns for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `LOGS_EXCLUDE_PATTERN`, `LOG_AGE`, `LOG_ANALYSIS_DEPTH`, `LOG_SEVERITY_THRESHOLD`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Detect Event Anomalies for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Analyzes Kubernetes event patterns to identify anomalies such as sudden spikes in event rates, unusual patterns, or recurring issues that might indicate underlying problems with controllers, resources, or deployments. 
+ +- **Robot task name**: Detect Event Anomalies for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `event_anomalies.sh` +- **Tags**: — +- **Reads**: `ANOMALY_THRESHOLD`, `DEPLOYMENT_NAME`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Deployment Logs for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Fetches and displays deployment logs in the report for manual review. Note: Issues are not created by this task - see "Analyze Application Log Patterns" for automated issue detection. + +- **Robot task name**: Fetch Deployment Logs for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTAINER_NAME`, `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `LOG_AGE`, `LOG_LINES`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Liveness Probe Configuration for Deployment `${DEPLOYMENT_NAME}` + +Validates if a Liveness probe has possible misconfigurations + +- **Robot task name**: Check Liveness Probe Configuration for Deployment `${DEPLOYMENT_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `validate_probes.sh` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Readiness Probe Configuration for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Validates if a readiness probe has possible misconfigurations + +- **Robot task name**: Check Readiness Probe Configuration for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `validate_probes.sh` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `NAMESPACE` +- **Writes**: — +- **Issues 
raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect Deployment Warning Events for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Fetches warning events related to the deployment workload in the namespace and triages any issues found in the events. + +- **Robot task name**: Inspect Deployment Warning Events for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `workload_issues.sh` +- **Tags**: `access:read-only`, `events`, `workloads`, `errors`, `warnings`, `get`, `deployment`, `${DEPLOYMENT_NAME}`, `data:config` +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Deployment Replica Status for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Inspects the deployment replica status including desired vs available replicas and identifies any scaling issues. + +- **Robot task name**: Check Deployment Replica Status for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `deployment`, `replicas`, `scaling`, `status`, `${DEPLOYMENT_NAME}`, `data:config` +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect Container Restarts for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Checks for container restarts and provides details on restart patterns that might indicate application issues. 
+ +- **Robot task name**: Inspect Container Restarts for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `container_restarts.sh` +- **Tags**: `access:read-only`, `containers`, `restarts`, `pods`, `deployment`, `${DEPLOYMENT_NAME}`, `data:config` +- **Reads**: `CONTAINER_RESTART_AGE`, `CONTAINER_RESTART_THRESHOLD`, `DEPLOYMENT_NAME`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Identify Recent Configuration Changes for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Identifies recent configuration changes from ReplicaSet analysis that might be related to current issues. + +- **Robot task name**: Identify Recent Configuration Changes for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check HPA Health for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Checks if a HorizontalPodAutoscaler exists for the deployment and validates its configuration and current status. + +- **Robot task name**: Check HPA Health for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +This SLI uses kubectl to score deployment health. Produces a value between 0 (completely failing the test) and 1 (fully passing the test). Looks for container restarts, critical log errors, pods not ready, deployment status, and recent events. 
+ +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Get Container Restarts and Score for Deployment `${DEPLOYMENT_NAME}` + +Counts the total sum of container restarts within a timeframe and determines if they're beyond a threshold. + +- **Robot task name**: Get Container Restarts and Score for Deployment `${DEPLOYMENT_NAME}` +- **Sub-metric name**: `container_restarts` +- **Tags**: `Restarts`, `Pods`, `Containers`, `Count`, `Status`, `data:config` +- **Reads**: `CONTAINER_RESTART_AGE`, `CONTAINER_RESTART_THRESHOLD`, `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Pass condition**: `${restart_count} <= ${threshold}` + + +#### Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}` + +Fetches logs and checks for critical error patterns that indicate application failures. + +- **Robot task name**: Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}` +- **Sub-metric name**: `log_errors` +- **Tags**: `logs`, `errors`, `critical`, `patterns`, `data:logs-regexp` +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `LOGS_EXCLUDE_PATTERN`, `MAX_LOG_BYTES`, `MAX_LOG_LINES`, `NAMESPACE` + + +#### Get NotReady Pods Score for Deployment `${DEPLOYMENT_NAME}` + +Fetches a count of unready pods for the specific deployment. + +- **Robot task name**: Get NotReady Pods Score for Deployment `${DEPLOYMENT_NAME}` +- **Sub-metric name**: `pod_readiness` +- **Tags**: `access:read-only`, `Pods`, `Status`, `Phase`, `Ready`, `Unready`, `Running`, `data:config` +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Pass condition**: `${unready_count} == 0` + + +#### Get Deployment Replica Status and Score for `${DEPLOYMENT_NAME}` + +Checks if deployment has the expected number of ready replicas and is available. 
+ +- **Robot task name**: Get Deployment Replica Status and Score for `${DEPLOYMENT_NAME}` +- **Sub-metric name**: `replica_status` +- **Tags**: `deployment`, `replicas`, `status`, `availability`, `data:config` +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Pass condition**: `${ready_replicas} >= 1 and "${available_status}" == "True"` + + +#### Get Recent Warning Events Score for `${DEPLOYMENT_NAME}` + +Checks for recent warning events related to the deployment within a short time window, with filtering to reduce noise. + +- **Robot task name**: Get Recent Warning Events Score for `${DEPLOYMENT_NAME}` +- **Sub-metric name**: `warning_events` +- **Tags**: `events`, `warnings`, `recent`, `fast`, `data:config` +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `EVENT_AGE`, `EVENT_THRESHOLD`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Pass condition**: `${event_count} <= ${threshold}`; partial score `0.5` when `${event_count} <= ${threshold_doubled}` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `DEPLOYMENT_NAME` | string | The name of the deployment to triage. | — | yes | +| `LOG_LINES` | string | The number of log lines to fetch from the pods when inspecting logs. | `100` | no | +| `LOG_AGE` | string | The age of logs to fetch from pods, used for log analysis tasks. | `10m` | no | +| `LOG_ANALYSIS_DEPTH` | string | The depth of log analysis to perform - basic, standard, or comprehensive. | `standard` | no | +| `LOG_SEVERITY_THRESHOLD` | string | The minimum severity level for creating issues (1=critical, 2=high, 3=medium, 4=low, 5=info). 
| `3` | no | +| `LOG_PATTERN_CATEGORIES` | string | Comma-separated list of log pattern categories to scan for. | `GenericError,AppFailure,Connection,Timeout,Auth,Exceptions,Resource,HealthyRecovery` | no | +| `ANOMALY_THRESHOLD` | string | The threshold for detecting event anomalies based on events per minute. | `5` | no | +| `LOGS_ERROR_PATTERN` | string | The error pattern to use when grep-ing logs. | `error|ERROR` | no | +| `LOGS_EXCLUDE_PATTERN` | string | Pattern used to exclude entries from log analysis when searching for errors. Use regex patterns to filter out false positives like JSON structures. | `"errors":\\s*\\[\\]|\\bINFO\\b|\\bDEBUG\\b|\\bTRACE\\b|\\bSTART\\s*-\\s*|\\bSTART\\s*method\\b` | no | +| `LOG_SCAN_TIMEOUT` | string | Timeout in seconds for log scanning operations. Increase this value if log scanning times out on large log files. | `300` | no | +| `EXCLUDED_CONTAINER_NAMES` | string | Comma-separated list of container names to exclude from log analysis (e.g., linkerd-proxy, istio-proxy, vault-agent). | `linkerd-proxy,istio-proxy,vault-agent` | no | +| `CONTAINER_NAME` | string | Optional: the specific container name to fetch logs from. If not set, the primary application container is auto-detected by excluding known sidecars. | `""` | no | +| `CONTAINER_RESTART_AGE` | string | The time window (in (h) hours or (m) minutes) to search for container restarts. Only containers that restarted within this time period will be reported. | `10m` | no | +| `CONTAINER_RESTART_THRESHOLD` | string | The minimum number of restarts required to trigger an issue. Containers with restart counts below this threshold will be ignored. | `1` | no | +| `MAX_LOG_LINES` | string | Maximum number of log lines to fetch per container to prevent API overload. | `100` | no | +| `MAX_LOG_BYTES` | string | Maximum log size in bytes to fetch per container to prevent API overload. | `256000` | no | +| `EVENT_AGE` | string | The time window to check for recent warning events. 
| `10m` | no | +| `EVENT_THRESHOLD` | string | The maximum number of critical warning events allowed before scoring is reduced. | `2` | no | +| `CHECK_SERVICE_ENDPOINTS` | string | Whether to check service endpoint health. Set to 'false' if deployment doesn't have associated services. | `true` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `kubeconfig` | The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-deployment-healthcheck/runbook.robot` +- **Monitor**: `codebundles/k8s-deployment-healthcheck/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-deployment-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export DEPLOYMENT_NAME=... +export LOG_LINES=... +export LOG_AGE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-deployment-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export DEPLOYMENT_NAME=... 
+bash check_replicaset.sh +bash container_restarts.sh +bash deployment_logs.sh +bash event_anomalies.sh +bash track_deployment_config_changes.sh +bash validate_probes.sh +bash workload_issues.sh +bash workload_next_steps.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `check_replicaset.sh` — Bash helper script `check_replicaset.sh`. +- `container_restarts.sh` — Bash helper script `container_restarts.sh`. +- `deployment_logs.sh` — Bash helper script `deployment_logs.sh`. +- `event_anomalies.sh` — Bash helper script `event_anomalies.sh`. +- `track_deployment_config_changes.sh` — Bash helper script `track_deployment_config_changes.sh`. +- `validate_probes.sh` — Bash helper script `validate_probes.sh`. +- `workload_issues.sh` — Bash helper script `workload_issues.sh`. +- `workload_next_steps.sh` — Bash helper script `workload_next_steps.sh`. diff --git a/codebundles/k8s-deployment-ops/SKILL-TEMPLATE.md b/codebundles/k8s-deployment-ops/SKILL-TEMPLATE.md new file mode 100644 index 000000000..09e65077c --- /dev/null +++ b/codebundles/k8s-deployment-ops/SKILL-TEMPLATE.md @@ -0,0 +1,237 @@ +--- +name: k8s-deployment-ops +kind: skill-template +description: Perform operational tasks for a Kubernetes deployment. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-deployment-ops`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [deployment] +access: read-only +--- + +# Kubernetes Deployment Operations + +## Summary + +This codebundle provides a suite of operational tasks related to a deployment in Kubernetes clusters. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Restart Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Perform a rollout restart on the deployment + +- **Robot task name**: Restart Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Force Delete Pods in Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Force delete all pods related to the deployment + +- **Robot task name**: Force Delete Pods in Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Rollback Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` to Previous Version + +Perform a rollback to a known functional version + +- **Robot task name**: Rollback Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` to Previous Version +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Down Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Stops (or nearly stops) all running pods in a deployment to immediately halt a failing or runaway service. 
+ +- **Robot task name**: Scale Down Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `ALLOW_SCALE_TO_ZERO`, `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Up Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` by ${SCALE_UP_FACTOR}x + +Increase deployment replicas + +- **Robot task name**: Scale Up Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` by ${SCALE_UP_FACTOR}x +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `MAX_REPLICAS`, `NAMESPACE`, `SCALE_UP_FACTOR` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Clean Up Stale ReplicaSets for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Deletes all stale replicasets. + +- **Robot task name**: Clean Up Stale ReplicaSets for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Down Stale ReplicaSets for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Finds any old/stale replicasets that still have active pods and scales them down. 
+ +- **Robot task name**: Scale Down Stale ReplicaSets for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Up HPA for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` by ${HPA_SCALE_FACTOR}x + +Increase HPA min and max replicas by a scaling factor + +- **Robot task name**: Scale Up HPA for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` by ${HPA_SCALE_FACTOR}x +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `HPA_MAX_REPLICAS`, `HPA_SCALE_FACTOR`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Down HPA for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` to Min ${HPA_MIN_REPLICAS} + +Decrease HPA min and max replicas to specified minimum values or scale down by factor + +- **Robot task name**: Scale Down HPA for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` to Min ${HPA_MIN_REPLICAS} +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `HPA_MIN_REPLICAS`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Increase CPU Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Intelligently increases CPU resources for a deployment based on VPA recommendations, HPA presence, or doubles current values. Does not apply if GitOps-managed or HPA exists. 
+ +- **Robot task name**: Increase CPU Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Increase Memory Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Intelligently increases memory resources for a deployment based on VPA recommendations, HPA presence, or doubles current values. Does not apply if GitOps-managed or HPA exists. + +- **Robot task name**: Increase Memory Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Decrease CPU Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Intelligently decreases CPU resources for a deployment by dividing current values by scale down factor. Does not apply if GitOps-managed or HPA exists. + +- **Robot task name**: Decrease CPU Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_SCALE_DOWN_FACTOR` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Decrease Memory Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` + +Intelligently decreases memory resources for a deployment by dividing current values by scale down factor. Does not apply if GitOps-managed or HPA exists. 
+ +- **Robot task name**: Decrease Memory Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_SCALE_DOWN_FACTOR` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `DEPLOYMENT_NAME` | string | Used to target the resource for queries and filtering events. | — | yes | +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `SCALE_UP_FACTOR` | string | The multiple in which to increase the total amount of pods. For example, a deployment with 2 pods and a scale up factor of 2 will result in 4 pods. | `2` | no | +| `MAX_REPLICAS` | string | The Max replicas for any scaleup activity. | `10` | no | +| `ALLOW_SCALE_TO_ZERO` | string | Permit deployments to scale to 0. | `false` | no | +| `HPA_SCALE_FACTOR` | string | The multiple by which to scale HPA min/max replicas. | `2` | no | +| `HPA_MAX_REPLICAS` | string | The maximum replicas allowed for HPA max value during scale up operations. | `20` | no | +| `HPA_MIN_REPLICAS` | string | The minimum replicas to set for HPA during scale down operations. | `1` | no | +| `RESOURCE_SCALE_DOWN_FACTOR` | string | The factor by which to divide CPU/memory resources when scaling down (e.g., 2 means divide by 2). | `2` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. 
The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-deployment-ops/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-deployment-ops +export DEPLOYMENT_NAME=... +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export SCALE_UP_FACTOR=... +export MAX_REPLICAS=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-fluxcd-helm-health/SKILL-TEMPLATE.md b/codebundles/k8s-fluxcd-helm-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..97b0342f2 --- /dev/null +++ b/codebundles/k8s-fluxcd-helm-health/SKILL-TEMPLATE.md @@ -0,0 +1,134 @@ +--- +name: k8s-fluxcd-helm-health +kind: skill-template +description: This codebundle runs a series of tasks to identify potential helm release issues related to Flux managed Helm objects. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill temp... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, FluxCD] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes FluxCD HelmRelease TaskSet + +## Summary + +The `k8s-fluxcd-helm-health` codebundle checks for helm related resources within the Kubernetes cluster to surface up potential issues. + +See [README.md](README.md) for additional context. + +## Tools + +### List all available FluxCD Helmreleases in Namespace `${NAMESPACE}` + +List all FluxCD helmreleases that are visible to the kubeconfig. 
+ +- **Robot task name**: List all available FluxCD Helmreleases in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `FluxCD`, `Helmrelease`, `Available`, `List`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Installed FluxCD Helmrelease Versions in Namespace `${NAMESPACE}` + +List helmreleases and the last attempted software version and the current running version. + +- **Robot task name**: Fetch Installed FluxCD Helmrelease Versions in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `FluxCD`, `Helmrelease`, `Versions`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Mismatched FluxCD HelmRelease Version in Namespace `${NAMESPACE}` + +List helmreleases and use jq to display any releases where the last attempted software revision doesn't match the current running revision. Requires jq. + +- **Robot task name**: Fetch Mismatched FluxCD HelmRelease Version in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `FluxCD`, `Helmrelease`, `Version`, `Mismatched`, `Unhealthy`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch FluxCD HelmRelease Error Messages in Namespace `${NAMESPACE}` + +List helmreleases and display the status conditions message for any helmreleases that are not in a Ready state. 
+ +- **Robot task name**: Fetch FluxCD HelmRelease Error Messages in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `FluxCD`, `Helmrelease`, `Errors`, `Unhealthy`, `Message`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check for Available Helm Chart Updates in Namespace `${NAMESPACE}` + +List all helmreleases in namespace and check for available helmchart updates. + +- **Robot task name**: Check for Available Helm Chart Updates in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `FluxCD`, `Helmchart`, `Errors`, `Unhealthy`, `Message`, `HelmRelease`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `DISTRIBUTION` | string | Which distribution of Kubernetes to use for operations, such as: Kubernetes, OpenShift, etc. | `Kubernetes` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. Accepts a single namespace in the format `-n namespace-name` or `--all-namespaces`. | `--all-namespaces` | no | +| `RESOURCE_NAME` | string | The short or long name of the Kubernetes helmrelease resource to search for. These might vary by helm controller implementation, and are best to use full crd name. | `helmreleases` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. 
| `default` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-fluxcd-helm-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-fluxcd-helm-health +export DISTRIBUTION=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export NAMESPACE=... +export RESOURCE_NAME=... +export CONTEXT=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-fluxcd-kustomization-health/SKILL-TEMPLATE.md b/codebundles/k8s-fluxcd-kustomization-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..6c5218a4f --- /dev/null +++ b/codebundles/k8s-fluxcd-kustomization-health/SKILL-TEMPLATE.md @@ -0,0 +1,155 @@ +--- +name: k8s-fluxcd-kustomization-health +kind: skill-template +description: This codebundle runs a series of tasks to identify potential Kustomization issues related to Flux managed... Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-... 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, FluxCD] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes FluxCD Kustomization TaskSet + +## Summary + +The `k8s-fluxcd-kustomization-health` codebundle checks for Kustomization resources within the Kubernetes cluster to surface potential issues. + +See [README.md](README.md) for additional context. + +## Tools + +### List All FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}` + +List all FluxCD kustomization objects. + +- **Robot task name**: List All FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `FluxCD`, `Kustomization`, `Available`, `List`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Suspended FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}` + +List Suspended FluxCD kustomization objects. + +- **Robot task name**: List Suspended FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `FluxCD`, `Kustomization`, `Suspended`, `List`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Unready FluxCD Kustomizations in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}` + +List all Kustomizations that are not found in a ready state in namespace. 
+ +- **Robot task name**: List Unready FluxCD Kustomizations in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `workload_next_steps.sh` +- **Tags**: `access:read-only`, `FluxCD`, `Kustomization`, `Versions`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +This codebundle checks for unhealthy or suspended FluxCD Kustomization objects. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### List Suspended FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}` + +List Suspended FluxCD kustomization objects. + +- **Robot task name**: List Suspended FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}` +- **Sub-metric name**: `suspended_kustomizations` +- **Tags**: `access:read-only`, `FluxCD`, `Kustomization`, `Suspended`, `List`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_NAME` + + +#### List Unready FluxCD Kustomizations in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}` + +List all Kustomizations that are not found in a ready state in namespace. 
+ +- **Robot task name**: List Unready FluxCD Kustomizations in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}` +- **Sub-metric name**: `unready_kustomizations` +- **Tags**: `access:read-only`, `FluxCD`, `Kustomization`, `Versions`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_NAME` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `DISTRIBUTION` | string | Which distribution of Kubernetes to use for operations, such as: Kubernetes, OpenShift, etc. | `Kubernetes` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | `default` | no | +| `RESOURCE_NAME` | string | The short or long name of the Kubernetes kustomizations resource to search for. These might vary by Kustomize controller implementation, and are best to use full crd name. | `kustomizations` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | `default` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-fluxcd-kustomization-health/runbook.robot` +- **Monitor**: `codebundles/k8s-fluxcd-kustomization-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-fluxcd-kustomization-health +export DISTRIBUTION=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export NAMESPACE=... 
+export RESOURCE_NAME=... +export CONTEXT=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-fluxcd-kustomization-health +export DISTRIBUTION=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export NAMESPACE=... +export RESOURCE_NAME=... +bash workload_next_steps.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `workload_next_steps.sh` — Bash helper script `workload_next_steps.sh`. diff --git a/codebundles/k8s-fluxcd-reconcile/SKILL-TEMPLATE.md b/codebundles/k8s-fluxcd-reconcile/SKILL-TEMPLATE.md new file mode 100644 index 000000000..dcd7e9757 --- /dev/null +++ b/codebundles/k8s-fluxcd-reconcile/SKILL-TEMPLATE.md @@ -0,0 +1,114 @@ +--- +name: k8s-fluxcd-reconcile +kind: skill-template +description: Generates a report of the reconciliation errors for fluxcd in your cluster. Use when triaging or monitoring Kubernetes, Fluxcd workloads with skill template `k8s-fluxcd-reconcile`. +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, Fluxcd] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Fluxcd Reconciliation Report + +## Summary + +This codebundle measures the number of reconciliation errors in the fluxcd controllers and can generate a report of them. + +See [README.md](README.md) for additional context. + +## Tools + +### Check FluxCD Reconciliation Health in Kubernetes Namespace `${FLUX_NAMESPACE}` + +Fetches reconciliation logs for flux and creates a report for them. 
+ +- **Robot task name**: Check FluxCD Reconciliation Health in Kubernetes Namespace `${FLUX_NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `Kubernetes`, `Namespace`, `Flux`, `data:config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures failing reconciliations for fluxcd + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Health Check Flux Reconciliation + +Measures failing reconciliations for fluxcd + +- **Robot task name**: Health Check Flux Reconciliation +- **Sub-metric name**: `fluxcd_reconcile` +- **Tags**: `Kubernetes`, `Namespace`, `Flux`, `data:config` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `CONTEXT` | string | The kubernetes context to use in the kubeconfig provided. | — | yes | +| `FLUX_NAMESPACE` | string | The namespace where the flux controllers reside. Typically flux-system. | `flux-system` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `kubeconfig` | The kubeconfig secret to use for authenticating with the cluster. | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-fluxcd-reconcile/runbook.robot` +- **Monitor**: `codebundles/k8s-fluxcd-reconcile/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. 
+ +```bash +cd codebundles/k8s-fluxcd-reconcile +export CONTEXT=... +export FLUX_NAMESPACE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-fluxcd-reconcile +export CONTEXT=... +export FLUX_NAMESPACE=... +bash flux_reconcile_report.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `flux_reconcile_report.sh` — Bash helper script `flux_reconcile_report.sh`. diff --git a/codebundles/k8s-gitops-gh-remediate/SKILL-TEMPLATE.md b/codebundles/k8s-gitops-gh-remediate/SKILL-TEMPLATE.md new file mode 100644 index 000000000..dfb10ca1a --- /dev/null +++ b/codebundles/k8s-gitops-gh-remediate/SKILL-TEMPLATE.md @@ -0,0 +1,139 @@ +--- +name: k8s-gitops-gh-remediate +kind: skill-template +description: Provides a list of tasks that can remediate configuration issues with manifests in GitHub-based GitOps repositories. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill templa... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, FluxCD, ArgoCD, GitHub] +resource_types: [kubernetes_resource] +access: read-write +--- + +# Kubernetes GitOps GitHub Remediation + +## Summary + +This codebundle provides a suite of tasks aimed at remediating configuration issues related to Kubernetes deployments managed in GitHub repositories. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Remediate Readiness and Liveness Probe GitOps Manifests in Namespace `${NAMESPACE}` + +Fixes misconfigured readiness or liveness probe configurations for pods in a namespace that are managed in a GitHub GitOps repository + +- **Robot task name**: Remediate Readiness and Liveness Probe GitOps Manifests in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `update_github_manifests.sh` +- **Tags**: `access:read-write`, `readiness`, `liveness`, `probe`, `remediate`, `gitops`, `github`, `data:config` +- **Reads**: `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Increase ResourceQuota Limit for Namespace `${NAMESPACE}` in GitHub GitOps Repository + +Looks for a resourcequota object in the namespace and increases it if applicable, and if it is managed in a GitHub GitOps repository + +- **Robot task name**: Increase ResourceQuota Limit for Namespace `${NAMESPACE}` in GitHub GitOps Repository +- **Robot file**: `runbook.robot` +- **Underlying script**: `update_github_manifests.sh` +- **Tags**: `access:read-write`, `resourcequota`, `quota`, `namespace`, `remediate`, `github`, `gitops`, `data:config` +- **Reads**: `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Adjust Pod Resources to Match VPA Recommendation in `${NAMESPACE}` + +Queries the namespace for any Vertical Pod Autoscaler resource recommendations and applies them to GitOps GitHub controlled manifests. 
+ +- **Robot task name**: Adjust Pod Resources to Match VPA Recommendation in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `update_github_manifests.sh` +- **Tags**: `access:read-write`, `recommendation`, `resources`, `utilization`, `gitops`, `github`, `pods`, `cpu`, `memory`, `allocation`, `vpa`, `data:config` +- **Reads**: `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Expand Persistent Volume Claims in Namespace `${NAMESPACE}` + +Checks the disk utilization for all PVCs and updates the GitOps manifest for any that are highly utilized. + +- **Robot task name**: Expand Persistent Volume Claims in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `update_github_manifests.sh` +- **Tags**: `access:read-write`, `recommendation`, `pv`, `pvc`, `utilization`, `gitops`, `github`, `persistentvolumeclaim`, `persistentvolume`, `storage`, `capacity`, `data:config` +- **Reads**: `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `CONTEXT` | string | Which Kubernetes context to operate within. | `''` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. 
+ +- **Runbook**: `codebundles/k8s-gitops-gh-remediate/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-gitops-gh-remediate +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-gitops-gh-remediate +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +bash pvc_utilization_check.sh +bash resource_quota_check.sh +bash update_github_manifests.sh +bash validate_all_probes.sh +bash vpa_recommendations.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `pvc_utilization_check.sh` — Bash helper script `pvc_utilization_check.sh`. +- `resource_quota_check.sh` — Bash helper script `resource_quota_check.sh`. +- `update_github_manifests.sh` — Bash helper script `update_github_manifests.sh`. +- `validate_all_probes.sh` — Bash helper script `validate_all_probes.sh`. +- `vpa_recommendations.sh` — Bash helper script `vpa_recommendations.sh`. diff --git a/codebundles/k8s-image-check/SKILL-TEMPLATE.md b/codebundles/k8s-image-check/SKILL-TEMPLATE.md new file mode 100644 index 000000000..441e216b0 --- /dev/null +++ b/codebundles/k8s-image-check/SKILL-TEMPLATE.md @@ -0,0 +1,118 @@ +--- +name: k8s-image-check +kind: skill-template +description: This taskset provides detailed information about the images used in a Kubernetes namespace. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-image-check`. 
+runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Image Check + +## Summary + +Simple informational report that provides information about images in a namespace. + +See [README.md](README.md) for additional context. + +## Tools + +### Check Image Rollover Times for Namespace `${NAMESPACE}` + +Fetches and checks when images last rolled over in a namespace. + +- **Robot task name**: Check Image Rollover Times for Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Images and Tags for Every Container in Running Pods for Namespace `${NAMESPACE}` + +Display the status, image name, image tag, and container name for running pods in the namespace. + +- **Robot task name**: List Images and Tags for Every Container in Running Pods for Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `pods`, `containers`, `image`, `images`, `tag`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Images and Tags for Every Container in Failed Pods for Namespace `${NAMESPACE}` + +Display the status, image name, image tag, and container name for failed pods in the namespace. 
+ +- **Robot task name**: List Images and Tags for Every Container in Failed Pods for Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `pods`, `containers`, `image`, `images`, `tag`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List ImagePullBackOff Events and Test Path and Tags for Namespace `${NAMESPACE}` + +Search events in the last 5 minutes for BackOff events related to image pull issues. Run Skopeo to test if the image path exists and what tags are available. + +- **Robot task name**: List ImagePullBackOff Events and Test Path and Tags for Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `containers`, `image`, `images`, `tag`, `imagepullbackoff`, `skopeo`, `backoff`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the namespace to search. | `` | yes | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. 
+ +- **Runbook**: `codebundles/k8s-image-check/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-image-check +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-ingress-gce-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-ingress-gce-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..20a8e693d --- /dev/null +++ b/codebundles/k8s-ingress-gce-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,147 @@ +--- +name: k8s-ingress-gce-healthcheck +kind: skill-template +description: Troubleshoot GCE Ingress Resources related to GCP HTTP Load Balancer in GKE. Use when triaging or monitoring Kubernetes, GKE, GCE workloads with skill template `k8s-ingress-gce-healthcheck`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, GKE, GCE, GCP] +resource_types: [ingress] +access: read-only +--- + +# Kubernetes Ingress GCE & GCP HTTP Load Balancer Healthcheck + +## Summary + +Triages the GCP HTTP Load Balancer resources that are created when an ingress object is detected and created by the ingress-gce controller. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Search For GCE Ingress Warnings in GKE Context `${CONTEXT}` + +Find warning events related to GCE Ingress and services objects + +- **Robot task name**: Search For GCE Ingress Warnings in GKE Context `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `service`, `ingress`, `endpoint`, `health`, `ingress-gce`, `gke`, `data:config` +- **Reads**: `CONTEXT`, `GCP_PROJECT_ID`, `INGRESS`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Identify Unhealthy GCE HTTP Ingress Backends in GKE Namespace `${NAMESPACE}` + +Checks the backend annotations on the ingress object to determine if they are not registered as healthy + +- **Robot task name**: Identify Unhealthy GCE HTTP Ingress Backends in GKE Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `service`, `ingress`, `endpoint`, `health`, `ingress-gce`, `gke`, `data:config` +- **Reads**: `CONTEXT`, `INGRESS`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Validate GCP HTTP Load Balancer Configurations in GCP Project `${GCP_PROJECT_ID}` + +Extract GCP HTTP Load Balancer components from ingress annotations and check health of each object + +- **Robot task name**: Validate GCP HTTP Load Balancer Configurations in GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check_gce_ingress_objects.sh` +- **Tags**: `access:read-only`, `service`, `ingress`, `endpoint`, `health`, `backends`, `urlmap`, `gce`, `data:config` +- **Reads**: `INGRESS`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Network Error Logs from GCP Operations Manager for Ingress Backends in GCP Project `${GCP_PROJECT_ID}` + +Fetch logs from the last 1d that 
are specific to the HTTP Load Balancer within the last 60 minutes + +- **Robot task name**: Fetch Network Error Logs from GCP Operations Manager for Ingress Backends in GCP Project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `service`, `ingress`, `endpoint`, `health`, `data:logs-regexp` +- **Reads**: `CONTEXT`, `GCP_PROJECT_ID`, `INGRESS`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Review GCP Operations Logging Dashboard in GCP project `${GCP_PROJECT_ID}` + +Create urls that will help users obtain logs from the GCP Dashboard + +- **Robot task name**: Review GCP Operations Logging Dashboard in GCP project `${GCP_PROJECT_ID}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `service`, `ingress`, `endpoint`, `health`, `logging`, `http`, `loadbalancer`, `data:logs-regexp` +- **Reads**: `CONTEXT`, `INGRESS`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `INGRESS` | string | Which Ingress object to troubleshoot. | — | yes | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `GCP_PROJECT_ID` | string | The GCP Project ID to scope the API to. | — | yes | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `gcp_credentials` | GCP service account json used to authenticate with GCP APIs. 
| yes | + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-ingress-gce-healthcheck/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-ingress-gce-healthcheck +export NAMESPACE=... +export CONTEXT=... +export INGRESS=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export GCP_PROJECT_ID=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-ingress-gce-healthcheck +export NAMESPACE=... +export CONTEXT=... +export INGRESS=... +export KUBERNETES_DISTRIBUTION_BINARY=... +bash check_gce_ingress_objects.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `check_gce_ingress_objects.sh` — Bash helper script `check_gce_ingress_objects.sh`. diff --git a/codebundles/k8s-ingress-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-ingress-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..65a5e9ad0 --- /dev/null +++ b/codebundles/k8s-ingress-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,94 @@ +--- +name: k8s-ingress-healthcheck +kind: skill-template +description: Triages issues related to a ingress objects and services. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-ingress-healthcheck`. 
+runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [ingress] +access: read-only +--- + +# Kubernetes Ingress Healthcheck + +## Summary + +The `k8s-ingress-healthcheck` codebundle checks the health of ingress objects within a Namespace. + +See [README.md](README.md) for additional context. + +## Tools + +### Fetch Ingress Object Health in Namespace `${NAMESPACE}` + +Fetches all ingress objects in the namespace and outputs the name, health status, services, and endpoints. + +- **Robot task name**: Fetch Ingress Object Health in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `service`, `ingress`, `endpoint`, `health`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check for Ingress and Service Conflicts in Namespace `${NAMESPACE}` + +Look for conflicting configuration between service and ingress objects. + +- **Robot task name**: Check for Ingress and Service Conflicts in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `service`, `ingress`, `health`, `conflict`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. 
| `kubectl` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-ingress-healthcheck/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-ingress-healthcheck +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-istio-system-health/SKILL-TEMPLATE.md b/codebundles/k8s-istio-system-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..697e3149d --- /dev/null +++ b/codebundles/k8s-istio-system-health/SKILL-TEMPLATE.md @@ -0,0 +1,296 @@ +--- +name: k8s-istio-system-health +kind: skill-template +description: Checks istio proxy sidecar injection status, high memory and cpu usage, warnings and errors in logs, valid... Use when triaging or monitoring Kubernetes, Istio, AKS workloads with skill template `k... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, Istio, AKS, EKS, GKE, OpenShift] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Istio System Health + +## Summary + +This codebundle provides a task aimed at finding issues related to an Istio sidecar being available for the applications. 
+ +See [README.md](README.md) for additional context. + +## Tools + +### Verify Istio Sidecar Injection for Cluster `${CONTEXT}` + +Checks all deployments in specified namespaces for Istio sidecar injection status + +- **Robot task name**: Verify Istio Sidecar Injection for Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `istio_sidecar_injection_report.sh` +- **Tags**: — +- **Reads**: — +- **Writes**: `issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Istio Sidecar Resource Usage for Cluster `${CONTEXT}` + +Checks all pods in specified namespaces for Istio sidecar resources usage + +- **Robot task name**: Check Istio Sidecar Resource Usage for Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `istio_sidecar_resource_usage.sh` +- **Tags**: — +- **Reads**: — +- **Writes**: `istio_sidecar_resource_usage_issue.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Validate Istio Installation in Cluster `${CONTEXT}` + +Verify Istio Installation in cluster + +- **Robot task name**: Validate Istio Installation in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `istio_installation_verify.sh` +- **Tags**: — +- **Reads**: — +- **Writes**: `istio_installation_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Istio Controlplane Logs For Errors in Cluster `${CONTEXT}` + +Check istio controlplane logs for known errors and warnings in cluster ${CONTEXT} + +- **Robot task name**: Check Istio Controlplane Logs For Errors in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `istio_controlplane_logs.sh` +- **Tags**: — +- **Reads**: `CONTEXT` +- **Writes**: `istio_controlplane_issues.json`, `istio_controlplane_report.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + 
+### Fetch Istio Proxy Logs in Cluster `${CONTEXT}` + +Check istio proxy logs for known errors and warnings in cluster + +- **Robot task name**: Fetch Istio Proxy Logs in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `istio_proxy_logs.sh` +- **Tags**: — +- **Reads**: — +- **Writes**: `istio_proxy_issues.json`, `istio_proxy_report.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify Istio SSL Certificates in Cluster `${CONTEXT}` + +Check Istio valid Root CA and mTLS Certificates in cluster + +- **Robot task name**: Verify Istio SSL Certificates in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `istio_mtls_check.sh` +- **Tags**: — +- **Reads**: — +- **Writes**: `istio_mtls_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Istio Configuration Health in Cluster `${CONTEXT}` + +Check Istio configurations in cluster + +- **Robot task name**: Check Istio Configuration Health in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `analyze_istio_configurations.sh` +- **Tags**: — +- **Reads**: — +- **Writes**: `issues_istio_analyze.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Checks istio proxy sidecar injection status, high memory and cpu usage, warnings and errors in logs, valid certificates, configuration and verify istio installation. 
+ +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Verify Istio Sidecar Injection for Cluster `${CONTEXT}` + +Checks all deployments in specified namespaces for Istio sidecar injection status + +- **Robot task name**: Verify Istio Sidecar Injection for Cluster `${CONTEXT}` +- **Sub-metric name**: `sidecar_injection` +- **Underlying script**: `check_istio_injection.sh` +- **Tags**: — +- **Reads**: — +- **Pass condition**: `len(@{issues}) == 0` + + +#### Check Istio Sidecar Resource Usage for Cluster `${CONTEXT}` + +Checks all pods in specified namespaces for Istio sidecar resources usage + +- **Robot task name**: Check Istio Sidecar Resource Usage for Cluster `${CONTEXT}` +- **Sub-metric name**: `sidecar_resources` +- **Underlying script**: `istio_sidecar_resource_usage.sh` +- **Tags**: — +- **Reads**: — +- **Pass condition**: `len(@{issues}) == 0` + + +#### Validate Istio Installation in Cluster `${CONTEXT}` + +Verify Istio Installation + +- **Robot task name**: Validate Istio Installation in Cluster `${CONTEXT}` +- **Sub-metric name**: `installation` +- **Underlying script**: `istio_installation_verify.sh` +- **Tags**: — +- **Reads**: — +- **Pass condition**: `len(@{issues}) == 0` + + +#### Check Istio Controlplane Logs For Errors in Cluster `${CONTEXT}` + +Check controlplane logs for known errors and warnings in Cluster + +- **Robot task name**: Check Istio Controlplane Logs For Errors in Cluster `${CONTEXT}` +- **Sub-metric name**: `controlplane_logs` +- **Underlying script**: `istio_controlplane_logs.sh` +- **Tags**: — +- **Reads**: — +- **Pass condition**: `len(@{issues}) == 0` + + +#### Fetch Istio Proxy Logs in Cluster `${CONTEXT}` + +Check istio proxy logs for known errors and warnings in cluster + +- **Robot task name**: Fetch Istio Proxy Logs in Cluster `${CONTEXT}` +- **Sub-metric name**: 
`proxy_logs` +- **Underlying script**: `istio_proxy_logs.sh` +- **Tags**: — +- **Reads**: — +- **Pass condition**: `len(@{issues}) == 0` + + +#### Verify Istio SSL Certificates in Cluster `${CONTEXT}` + +Check Istio valid Root CA and mTLS Certificates in Cluster + +- **Robot task name**: Verify Istio SSL Certificates in Cluster `${CONTEXT}` +- **Sub-metric name**: `ssl_certificates` +- **Underlying script**: `istio_mtls_check.sh` +- **Tags**: — +- **Reads**: — +- **Pass condition**: `len(@{issues}) == 0` + + +#### Check Istio Configuration Health in Cluster `${CONTEXT}` + +Check Istio configurations in Cluster + +- **Robot task name**: Check Istio Configuration Health in Cluster `${CONTEXT}` +- **Sub-metric name**: `configuration` +- **Underlying script**: `analyze_istio_configurations.sh` +- **Tags**: — +- **Reads**: — +- **Pass condition**: `len(@{issues}) == 0` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `EXCLUDED_NAMESPACES` | string | Comma-separated list of namespaces to exclude from checks (e.g., kube-system,istio-system). | `kube-system` | no | +| `CPU_USAGE_THRESHOLD` | string | The Threshold for the CPU usage. | `80` | no | +| `MEMORY_USAGE_THRESHOLD` | string | The Threshold for the MEMORY usage. 
| `80` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `issues.json` +- `istio_sidecar_resource_usage_issue.json` +- `istio_installation_issues.json` +- `istio_controlplane_issues.json` +- `istio_controlplane_report.json` +- `istio_proxy_issues.json` +- `istio_proxy_report.json` +- `istio_mtls_issues.json` +- `issues_istio_analyze.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-istio-system-health/runbook.robot` +- **Monitor**: `codebundles/k8s-istio-system-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-istio-system-health +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export EXCLUDED_NAMESPACES=... +export CPU_USAGE_THRESHOLD=... +export MEMORY_USAGE_THRESHOLD=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-istio-system-health +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export EXCLUDED_NAMESPACES=... +export CPU_USAGE_THRESHOLD=... 
+bash analyze_istio_configurations.sh +bash check_istio_injection.sh +bash istio_controlplane_logs.sh +bash istio_installation_verify.sh +bash istio_mtls_check.sh +bash istio_proxy_logs.sh +bash istio_sidecar_injection_report.sh +bash istio_sidecar_resource_usage.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `analyze_istio_configurations.sh` — Bash helper script `analyze_istio_configurations.sh`. +- `check_istio_injection.sh` — Bash helper script `check_istio_injection.sh`. +- `istio_controlplane_logs.sh` — Bash helper script `istio_controlplane_logs.sh`. +- `istio_installation_verify.sh` — Bash helper script `istio_installation_verify.sh`. +- `istio_mtls_check.sh` — Bash helper script `istio_mtls_check.sh`. +- `istio_proxy_logs.sh` — Bash helper script `istio_proxy_logs.sh`. +- `istio_sidecar_injection_report.sh` — Bash helper script `istio_sidecar_injection_report.sh`. +- `istio_sidecar_resource_usage.sh` — Bash helper script `istio_sidecar_resource_usage.sh`. diff --git a/codebundles/k8s-jaeger-http-query/SKILL-TEMPLATE.md b/codebundles/k8s-jaeger-http-query/SKILL-TEMPLATE.md new file mode 100644 index 000000000..cedf5a8db --- /dev/null +++ b/codebundles/k8s-jaeger-http-query/SKILL-TEMPLATE.md @@ -0,0 +1,97 @@ +--- +name: k8s-jaeger-http-query +kind: skill-template +description: This taskset queries Jaeger API directly for trace details and parses the results. Use when triaging or monitoring GKE, EKS, AKS workloads with skill template `k8s-jaeger-http-query`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GKE, EKS, AKS, Kubernetes, HTTP] +resource_types: [kubernetes_resource] +access: read-only +--- + +# K8s Jaeger Query + +## Summary + +This codebundle is used for searching in a Jaeger instance for trace data that indicates issues with services. 
+ +See [README.md](README.md) for additional context. + +## Tools + +### Query Traces in Jaeger for Unhealthy HTTP Response Codes in Namespace `${NAMESPACE}` + +Query Jaeger for all services and report on any HTTP related trace errors + +- **Robot task name**: Query Traces in Jaeger for Unhealthy HTTP Response Codes in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `query_jaeger_http_errors.sh` +- **Tags**: `jaeger`, `http`, `ingress`, `latency`, `errors`, `traces`, `kubernetes`, `data:logs-regexp` +- **Reads**: `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `SERVICE_EXCLUSIONS` | string | Comma separated list of services to exclude from the query | `none` | no | +| `LOOKBACK` | string | The age to query for traces. Defaults to 5m. | `5m` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-jaeger-http-query/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-jaeger-http-query +export NAMESPACE=... +export CONTEXT=... +export SERVICE_EXCLUSIONS=... 
+export LOOKBACK=... +export KUBERNETES_DISTRIBUTION_BINARY=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-jaeger-http-query +export NAMESPACE=... +export CONTEXT=... +export SERVICE_EXCLUSIONS=... +export LOOKBACK=... +bash query_jaeger_http_errors.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `query_jaeger_http_errors.sh` — Bash helper script `query_jaeger_http_errors.sh`. diff --git a/codebundles/k8s-jenkins-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-jenkins-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..c14201da1 --- /dev/null +++ b/codebundles/k8s-jenkins-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,99 @@ +--- +name: k8s-jenkins-healthcheck +kind: skill-template +description: This taskset collects information about persistent volumes and persistent volume claims to. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-jenkins-healthch... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, Jenkins] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Jenkins Healthcheck + +## Summary + +This taskset performs checks against its rest api to determine if there are any stuck jobs, which will result in raised issues if any are detected. + +See [README.md](README.md) for additional context. + +## Tools + +### Query The Jenkins Kubernetes Workload HTTP Endpoint in Kubernetes StatefulSet `${STATEFULSET_NAME}` + +Performs a curl within the jenkins statefulset kubernetes workload to determine if the pod is up and healthy, and can serve requests. 
+ +- **Robot task name**: Query The Jenkins Kubernetes Workload HTTP Endpoint in Kubernetes StatefulSet `${STATEFULSET_NAME}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `HTTP`, `Curl`, `Web`, `Code`, `OK`, `Available`, `Jenkins`, `HTTP`, `Endpoint`, `API`, `data:config` +- **Reads**: `CONTEXT`, `JENKINS_SA_TOKEN`, `JENKINS_SA_USERNAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Query For Stuck Jenkins Jobs in Kubernetes Statefulset Workload `${STATEFULSET_NAME}` + +Performs a curl within the jenkins statefulset kubernetes workload to check for stuck jobs in the jenkins pipeline queue. + +- **Robot task name**: Query For Stuck Jenkins Jobs in Kubernetes Statefulset Workload `${STATEFULSET_NAME}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `HTTP`, `Curl`, `Web`, `Code`, `OK`, `Available`, `Queue`, `Stuck`, `Jobs`, `Jenkins`, `data:config` +- **Reads**: `CONTEXT`, `JENKINS_SA_TOKEN`, `JENKINS_SA_USERNAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the namespace to search. | `` | yes | +| `STATEFULSET_NAME` | string | Used to target the resource for queries and filtering events. | — | yes | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `JENKINS_SA_USERNAME` | The username associated with the API token, typically the username. 
| yes | +| `JENKINS_SA_TOKEN` | The API token generated and managed by jenkins in the user configuration settings. | yes | + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-jenkins-healthcheck/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-jenkins-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export STATEFULSET_NAME=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-job-namespace-health/SKILL-TEMPLATE.md b/codebundles/k8s-job-namespace-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..4b7357326 --- /dev/null +++ b/codebundles/k8s-job-namespace-health/SKILL-TEMPLATE.md @@ -0,0 +1,193 @@ +--- +name: k8s-job-namespace-health +kind: skill-template +description: Surfaces Kubernetes Job and CronJob health in a namespace: failed or long-running Jobs, pod events, and CronJob... Use when triaging or monitoring Kubernetes, Job, CronJob workloads with skill temp... 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, Job, CronJob, batch, Namespace, Health] +resource_types: [namespace] +access: read-only +--- + +# Kubernetes Namespace Job Health + +## Summary + +This CodeBundle surfaces Kubernetes **Job** and **CronJob** reliability in one namespace: terminal failures, long-running active Jobs, warning events on Job-owned pods, suspended or stale CronJobs, and failed latest child Jobs. + +See [README.md](README.md) for additional context. + +## Tools + +### Summarize Job Status in Namespace `${NAMESPACE}` + +Aggregates Jobs by active, succeeded, and failed completion state and flags long-running active Jobs or elevated batch concurrency in the namespace. + +- **Robot task name**: Summarize Job Status in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `summarize-jobs-in-namespace.sh` +- **Tags**: `Kubernetes`, `Job`, `Namespace`, `batch`, `summary`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT`, `NAMESPACE` +- **Writes**: `summarize_jobs_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Identify Failed Jobs and Backoff in Namespace `${NAMESPACE}` + +Lists Jobs in Failed condition, backoff exhaustion, and Job pods with container waiting or non-zero exit states. 
+ +- **Robot task name**: Identify Failed Jobs and Backoff in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `list-failed-jobs-in-namespace.sh` +- **Tags**: `Kubernetes`, `Job`, `failed`, `backoff`, `access:read-only`, `data:logs-config` +- **Reads**: `CONTEXT`, `NAMESPACE` +- **Writes**: `list_failed_jobs_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Correlate Job Failures with Recent Events in Namespace `${NAMESPACE}` + +Collects warning and failure-oriented events for pods owned by Jobs within the configured lookback window. + +- **Robot task name**: Correlate Job Failures with Recent Events in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `job-failure-events-in-namespace.sh` +- **Tags**: `Kubernetes`, `Job`, `events`, `access:read-only`, `data:logs-config` +- **Reads**: `CONTEXT`, `NAMESPACE` +- **Writes**: `job_failure_events_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check CronJob Schedule Health in Namespace `${NAMESPACE}` + +Flags suspended CronJobs, schedules that ran recently without a recorded success, and CronJobs whose latest child Job failed. + +- **Robot task name**: Check CronJob Schedule Health in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `cronjob-schedule-health-in-namespace.sh` +- **Tags**: `Kubernetes`, `CronJob`, `schedule`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT`, `NAMESPACE` +- **Writes**: `cronjob_health_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures namespace Job and CronJob health with lightweight kubectl checks. Produces a value between 0 (failing) and 1 (healthy) from the mean of binary sub-scores. 
+ +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Score Failed Jobs Dimension for Namespace `${NAMESPACE}` + +1 when no Job has a Failed=True condition; 0 otherwise. + +- **Robot task name**: Score Failed Jobs Dimension for Namespace `${NAMESPACE}` +- **Sub-metric name**: `failed_jobs` +- **Tags**: `access:read-only`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Pass condition**: `${n} == 0` + + +#### Score Long-Running Active Jobs for Namespace `${NAMESPACE}` + +1 when no active Job exceeds JOB_ACTIVE_DURATION_WARN_MINUTES based on status.startTime. + +- **Robot task name**: Score Long-Running Active Jobs for Namespace `${NAMESPACE}` +- **Sub-metric name**: `long_running_active` +- **Tags**: `access:read-only`, `data:config` +- **Reads**: `CONTEXT`, `JOB_ACTIVE_DURATION_WARN_MINUTES`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Pass condition**: `${n} == 0` + + +#### Score CronJob Reliability for Namespace `${NAMESPACE}` + +1 when no CronJob is suspended and no latest CronJob-owned Job is in Failed=True state. + +- **Robot task name**: Score CronJob Reliability for Namespace `${NAMESPACE}` +- **Sub-metric name**: `cronjob_reliability` +- **Tags**: `access:read-only`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Pass condition**: `(${ns} == 0 and ${nf} == 0)` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Kubernetes CLI binary (kubectl or oc). | `kubectl` | no | +| `CONTEXT` | string | Kubernetes context for API calls. | — | yes | +| `NAMESPACE` | string | Namespace whose Job and CronJob health is evaluated. | — | yes | +| `RW_LOOKBACK_WINDOW` | string | Lookback window for events and CronJob freshness (e.g. 24h, 30m). 
| `24h` | no | +| `JOB_ACTIVE_DURATION_WARN_MINUTES` | string | Flag active Jobs running longer than this many minutes. | `360` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `summarize_jobs_issues.json` +- `list_failed_jobs_issues.json` +- `job_failure_events_issues.json` +- `cronjob_health_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-job-namespace-health/runbook.robot` +- **Monitor**: `codebundles/k8s-job-namespace-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-job-namespace-health +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export RW_LOOKBACK_WINDOW=... +export JOB_ACTIVE_DURATION_WARN_MINUTES=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-job-namespace-health +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export RW_LOOKBACK_WINDOW=... +bash cronjob-schedule-health-in-namespace.sh +bash job-failure-events-in-namespace.sh +bash list-failed-jobs-in-namespace.sh +bash summarize-jobs-in-namespace.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `cronjob-schedule-health-in-namespace.sh` — Bash helper script `cronjob-schedule-health-in-namespace.sh`. +- `job-failure-events-in-namespace.sh` — Bash helper script `job-failure-events-in-namespace.sh`. 
+- `list-failed-jobs-in-namespace.sh` — Bash helper script `list-failed-jobs-in-namespace.sh`. +- `summarize-jobs-in-namespace.sh` — Bash helper script `summarize-jobs-in-namespace.sh`. diff --git a/codebundles/k8s-karpenter-autoscaling-health/SKILL-TEMPLATE.md b/codebundles/k8s-karpenter-autoscaling-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..1cff1c989 --- /dev/null +++ b/codebundles/k8s-karpenter-autoscaling-health/SKILL-TEMPLATE.md @@ -0,0 +1,209 @@ +--- +name: k8s-karpenter-autoscaling-health +kind: skill-template +description: Monitors Karpenter-driven autoscaling: NodePools, NodeClaims, pending workloads, controller logs, and cloud NodeClasses. Use when triaging or monitoring Kubernetes, Karpenter, Autoscaling workloads... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, Karpenter, Autoscaling, NodePool, NodeClaim, EKS, AKS, GKE] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Karpenter Autoscaling Health + +## Summary + +This CodeBundle monitors Karpenter-driven autoscaling: NodePool or legacy Provisioner status, NodeClaim or Machine readiness, Pending workloads that indicate capacity or scheduling pressure, Karpenter controller logs, cloud NodeClass conditions, stuck NodeClaims, and optional log-to-pod correlation. + +See [README.md](README.md) for additional context. + +## Tools + +### Summarize NodePool and NodeClaim Health in Cluster `${CONTEXT}` + +Lists NodePools or Provisioners and NodeClaims or Machines, parses unhealthy status conditions, and summarizes not-ready or cordoned nodes. 
+ +- **Robot task name**: Summarize NodePool and NodeClaim Health in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-karpenter-nodepool-nodeclaim-status.sh` +- **Tags**: `Kubernetes`, `Karpenter`, `NodePool`, `NodeClaim`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT` +- **Writes**: `karpenter_nodepool_nodeclaim_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Detect Workloads Blocked on Provisioning or Capacity in Cluster `${CONTEXT}` + +Finds Pending pods whose status messages indicate insufficient capacity, scheduling failures, or topology spread constraints correlated with scaling pressure. + +- **Robot task name**: Detect Workloads Blocked on Provisioning or Capacity in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-pending-provisioning-workloads.sh` +- **Tags**: `Kubernetes`, `Karpenter`, `Pending`, `scheduling`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT` +- **Writes**: `karpenter_pending_workload_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scan Karpenter Controller Logs for Errors in Namespace `${KARPENTER_NAMESPACE}` + +Aggregates recent controller pod logs for ERROR, WARN, and known failure substrings within RW_LOOKBACK_WINDOW, capped for RBAC and volume safety. 
+ +- **Robot task name**: Scan Karpenter Controller Logs for Errors in Namespace `${KARPENTER_NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `scan-karpenter-controller-logs.sh` +- **Tags**: `Kubernetes`, `Karpenter`, `logs`, `controller`, `access:read-only`, `data:logs` +- **Reads**: `CONTEXT`, `KARPENTER_NAMESPACE` +- **Writes**: `karpenter_controller_log_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Cloud NodeClass Resources for Misconfiguration Signals in Cluster `${CONTEXT}` + +Reads EC2NodeClass, legacy AWSNodeTemplate, or other provider NodeClass conditions for subnet, security group, AMI, or IAM-related failures. + +- **Robot task name**: Check Cloud NodeClass Resources for Misconfiguration Signals in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-karpenter-nodeclass-conditions.sh` +- **Tags**: `Kubernetes`, `Karpenter`, `NodeClass`, `AWS`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT` +- **Writes**: `karpenter_nodeclass_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Identify Stale or Stuck NodeClaims in Cluster `${CONTEXT}` + +Finds NodeClaims that remain non-ready past STUCK_NODECLAIM_THRESHOLD_MINUTES or show prolonged deletion, indicating consolidation or lifecycle issues. 
+ +- **Robot task name**: Identify Stale or Stuck NodeClaims in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-stuck-nodeclaims.sh` +- **Tags**: `Kubernetes`, `Karpenter`, `NodeClaim`, `stuck`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT` +- **Writes**: `karpenter_stuck_nodeclaim_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Correlate Recent Karpenter Log Patterns with Pending Pods in Cluster `${CONTEXT}` + +Optional cross-check that links controller log lines to Pending pod names when both appear together for faster triage. + +- **Robot task name**: Correlate Recent Karpenter Log Patterns with Pending Pods in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `correlate-karpenter-logs-pending-pods.sh` +- **Tags**: `Kubernetes`, `Karpenter`, `correlation`, `logs`, `access:read-only`, `data:logs` +- **Reads**: `CONTEXT`, `KARPENTER_NAMESPACE` +- **Writes**: `karpenter_correlation_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures Karpenter autoscaling health using NodePool or NodeClaim conditions, Pending capacity signals, and stuck NodeClaims. Produces a value between 0 and 1. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Measure Karpenter Autoscaling Health Score for Cluster `${CONTEXT}` + +Runs lightweight kubectl checks and averages binary dimension scores into a single 0 to 1 metric. 
+ +- **Robot task name**: Measure Karpenter Autoscaling Health Score for Cluster `${CONTEXT}` +- **Sub-metric name**: `nodepool_nodeclaim_conditions` +- **Underlying script**: `sli-karpenter-autoscaling-score.sh` +- **Tags**: `access:read-only`, `data:config` +- **Reads**: `CONTEXT` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `CONTEXT` | string | Kubernetes context name for the target cluster. | — | yes | +| `KARPENTER_NAMESPACE` | string | Namespace where the Karpenter controller runs (for log tasks). | `karpenter` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | kubectl-compatible CLI binary. | `kubectl` | no | +| `RW_LOOKBACK_WINDOW` | string | Lookback window for logs and recent transitions. | `30m` | no | +| `KARPENTER_LOG_ERROR_THRESHOLD` | string | Minimum matching controller log lines before raising an issue. | `1` | no | +| `STUCK_NODECLAIM_THRESHOLD_MINUTES` | string | Minutes after which a non-ready NodeClaim is considered stale. | `30` | no | +| `KARPENTER_LOG_MAX_LINES` | string | Maximum tail lines per controller pod for log tasks. | `500` | no | +| `SLI_PENDING_POD_MAX` | string | Maximum Pending pods with capacity-like messages before SLI fails the pending dimension. | `5` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `karpenter_nodepool_nodeclaim_issues.json` +- `karpenter_pending_workload_issues.json` +- `karpenter_controller_log_issues.json` +- `karpenter_nodeclass_issues.json` +- `karpenter_stuck_nodeclaim_issues.json` +- `karpenter_correlation_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. 
+ +- **Runbook**: `codebundles/k8s-karpenter-autoscaling-health/runbook.robot` +- **Monitor**: `codebundles/k8s-karpenter-autoscaling-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-karpenter-autoscaling-health +export CONTEXT=... +export KARPENTER_NAMESPACE=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export RW_LOOKBACK_WINDOW=... +export KARPENTER_LOG_ERROR_THRESHOLD=... +export STUCK_NODECLAIM_THRESHOLD_MINUTES=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-karpenter-autoscaling-health +export CONTEXT=... +export KARPENTER_NAMESPACE=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export RW_LOOKBACK_WINDOW=... +bash check-karpenter-nodeclass-conditions.sh +bash check-karpenter-nodepool-nodeclaim-status.sh +bash check-pending-provisioning-workloads.sh +bash check-stuck-nodeclaims.sh +bash correlate-karpenter-logs-pending-pods.sh +bash scan-karpenter-controller-logs.sh +bash sli-karpenter-autoscaling-score.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `check-karpenter-nodeclass-conditions.sh` — Bash helper script `check-karpenter-nodeclass-conditions.sh`. +- `check-karpenter-nodepool-nodeclaim-status.sh` — Bash helper script `check-karpenter-nodepool-nodeclaim-status.sh`. +- `check-pending-provisioning-workloads.sh` — Bash helper script `check-pending-provisioning-workloads.sh`. +- `check-stuck-nodeclaims.sh` — Bash helper script `check-stuck-nodeclaims.sh`. +- `correlate-karpenter-logs-pending-pods.sh` — Bash helper script `correlate-karpenter-logs-pending-pods.sh`. +- `scan-karpenter-controller-logs.sh` — Bash helper script `scan-karpenter-controller-logs.sh`. 
+- `sli-karpenter-autoscaling-score.sh` — Bash helper script `sli-karpenter-autoscaling-score.sh`. diff --git a/codebundles/k8s-karpenter-control-plane-health/SKILL-TEMPLATE.md b/codebundles/k8s-karpenter-control-plane-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..28485ee6c --- /dev/null +++ b/codebundles/k8s-karpenter-control-plane-health/SKILL-TEMPLATE.md @@ -0,0 +1,189 @@ +--- +name: k8s-karpenter-control-plane-health +kind: skill-template +description: Monitors Karpenter controller health: workload readiness, admission webhooks, warning events, CRD versions, and... Use when triaging or monitoring Kubernetes, Karpenter, cluster workloads with skil... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, Karpenter, cluster, control-plane, health] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Karpenter Control Plane Health + +## Summary + +This CodeBundle answers whether the Karpenter controller is running and wired correctly—workload readiness, admission webhooks, recent Warning events, installed CRD groups, and metrics-oriented Services—before you dig into provisioning or node claims. + +See [README.md](README.md) for additional context. + +## Tools + +### Check Karpenter Controller Workload Health in Cluster `${CONTEXT}` + +Verifies Karpenter controller pods are Ready, surfaces CrashLoopBackOff, high restarts, and replica gaps for Karpenter Deployments. 
+ +- **Robot task name**: Check Karpenter Controller Workload Health in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-karpenter-controller-pods.sh` +- **Tags**: `Kubernetes`, `Karpenter`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT`, `KARPENTER_NAMESPACE` +- **Writes**: `controller_pods_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify Karpenter Admission Webhooks in Cluster `${CONTEXT}` + +Lists ValidatingWebhookConfiguration and MutatingWebhookConfiguration objects tied to Karpenter and checks TLS client configuration and recent webhook-related warnings. + +- **Robot task name**: Verify Karpenter Admission Webhooks in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-karpenter-webhooks.sh` +- **Tags**: `Kubernetes`, `Karpenter`, `webhooks`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT`, `KARPENTER_NAMESPACE` +- **Writes**: `webhook_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect Warning Events in Karpenter Namespace `${KARPENTER_NAMESPACE}` + +Aggregates recent Warning events involving Karpenter workloads or messages, grouped by involved object for triage. + +- **Robot task name**: Inspect Warning Events in Karpenter Namespace `${KARPENTER_NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `karpenter-namespace-warning-events.sh` +- **Tags**: `Kubernetes`, `Karpenter`, `events`, `access:read-only`, `data:events` +- **Reads**: `CONTEXT`, `KARPENTER_NAMESPACE`, `RW_LOOKBACK_WINDOW` +- **Writes**: `warning_events_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Summarize Installed Karpenter API Versions and CRDs in Cluster `${CONTEXT}` + +Detects CRD API groups related to Karpenter to spot missing installs or mixed API families. 
+ +- **Robot task name**: Summarize Installed Karpenter API Versions and CRDs in Cluster `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-karpenter-crds.sh` +- **Tags**: `Kubernetes`, `Karpenter`, `crd`, `access:read-only`, `data:config` +- **Reads**: `CONTEXT` +- **Writes**: `crds_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Karpenter Service and Metrics Endpoints in Namespace `${KARPENTER_NAMESPACE}` + +Validates Services that front the controller expose ports suitable for metrics scraping and that Endpoints are populated. + +- **Robot task name**: Check Karpenter Service and Metrics Endpoints in Namespace `${KARPENTER_NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-karpenter-service-metrics.sh` +- **Tags**: `Kubernetes`, `Karpenter`, `metrics`, `access:read-only`, `data:metrics` +- **Reads**: `CONTEXT`, `KARPENTER_NAMESPACE` +- **Writes**: `service_metrics_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures Karpenter control-plane health using lightweight controller readiness, webhook presence, warning event volume, and Service endpoint checks. Produces a value between 0 (failing) and 1 (healthy). + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Score Karpenter Control Plane Dimensions in Cluster `${CONTEXT}` + +Runs a compact bash probe that returns binary scores per dimension and aggregates them into the SLI metric. 
+ +- **Robot task name**: Score Karpenter Control Plane Dimensions in Cluster `${CONTEXT}` +- **Sub-metric name**: `controller` +- **Underlying script**: `sli-karpenter-dimensions.sh` +- **Tags**: `access:read-only`, `data:config` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `CONTEXT` | string | Kubernetes context name for the target cluster. | — | yes | +| `KARPENTER_NAMESPACE` | string | Namespace where the Karpenter controller runs. | `karpenter` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | kubectl-compatible CLI binary. | `kubectl` | no | +| `RW_LOOKBACK_WINDOW` | string | Lookback window for event analysis (for example 30m or 2h). | `30m` | no | +| `SLI_WARNING_EVENT_THRESHOLD` | string | Maximum Warning events allowed in the lookback window for a passing score. | `5` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `controller_pods_issues.json` +- `webhook_issues.json` +- `warning_events_issues.json` +- `crds_issues.json` +- `service_metrics_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-karpenter-control-plane-health/runbook.robot` +- **Monitor**: `codebundles/k8s-karpenter-control-plane-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-karpenter-control-plane-health +export CONTEXT=... +export KARPENTER_NAMESPACE=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export RW_LOOKBACK_WINDOW=... +export SLI_WARNING_EVENT_THRESHOLD=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-karpenter-control-plane-health +export CONTEXT=... +export KARPENTER_NAMESPACE=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export RW_LOOKBACK_WINDOW=... +bash check-karpenter-controller-pods.sh +bash check-karpenter-crds.sh +bash check-karpenter-service-metrics.sh +bash check-karpenter-webhooks.sh +bash karpenter-namespace-warning-events.sh +bash sli-karpenter-dimensions.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `check-karpenter-controller-pods.sh` — Bash helper script `check-karpenter-controller-pods.sh`. +- `check-karpenter-crds.sh` — Bash helper script `check-karpenter-crds.sh`. +- `check-karpenter-service-metrics.sh` — Bash helper script `check-karpenter-service-metrics.sh`. +- `check-karpenter-webhooks.sh` — Bash helper script `check-karpenter-webhooks.sh`. +- `karpenter-namespace-warning-events.sh` — Bash helper script `karpenter-namespace-warning-events.sh`. +- `sli-karpenter-dimensions.sh` — Bash helper script `sli-karpenter-dimensions.sh`. diff --git a/codebundles/k8s-labeledpods-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-labeledpods-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..0013fe644 --- /dev/null +++ b/codebundles/k8s-labeledpods-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,93 @@ +--- +name: k8s-labeledpods-healthcheck +kind: skill-template +description: This codebundle fetches the number of running pods with the set of provided labels, letting you measure the number... Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill templ... 
+runtime: + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [pod] +access: read-only +--- + +# Kubernetes Labeled Pod Count + +## Summary + +This codebundle counts the running pods that match the provided set of labels, so you can track how many matching pods are running. + +See [README.md](README.md) for additional context. + +## Monitor + +This codebundle counts the running pods that match the provided set of labels, so you can track how many matching pods are running. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Measure Number of Running Pods with Label in `${NAMESPACE}` + +Counts the number of running pods with the configured labels. + +- **Robot task name**: Measure Number of Running Pods with Label in `${NAMESPACE}` +- **Sub-metric name**: `labeled_pods_health` +- **Tags**: `access:read-only`, `Pods`, `Containers`, `Running`, `Status`, `Count`, `Health`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS`, `NAMESPACE` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. Supports csv list of namespaces, or ALL. | — | yes | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `LABELS` | string | The metadata labels to use when selecting the objects to measure as running. | — | yes | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. 
| `kubectl` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `kubeconfig` | The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Monitor**: `codebundles/k8s-labeledpods-healthcheck/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-labeledpods-healthcheck +export NAMESPACE=... +export CONTEXT=... +export LABELS=... +export KUBERNETES_DISTRIBUTION_BINARY=... +ro sli.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `sli.robot` — monitor scoring (`sli.robot` runtime file) diff --git a/codebundles/k8s-litellm-proxy-health/SKILL-TEMPLATE.md b/codebundles/k8s-litellm-proxy-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..9c7cc17be --- /dev/null +++ b/codebundles/k8s-litellm-proxy-health/SKILL-TEMPLATE.md @@ -0,0 +1,221 @@ +--- +name: k8s-litellm-proxy-health +kind: skill-template +description: Exposes LiteLLM proxy health via HTTP APIs (liveness, readiness, models, optional deep checks, integrations) plus... Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill templa... 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, LiteLLM, HTTP] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes LiteLLM Proxy API Health + +## Summary + +This CodeBundle calls the LiteLLM proxy HTTP API to report health beyond pod logs: liveness and readiness endpoints, configured models, optional expensive upstream health checks, integration health, and optional kubectl correlation with the Kubernetes Service. + +See [README.md](README.md) for additional context. + +## Tools + +### Check LiteLLM Liveness Endpoint for Proxy `${LITELLM_SERVICE_NAME}` + +Calls GET /health/liveliness (or /health/live) to confirm the proxy responds without invoking upstream LLM APIs. + +- **Robot task name**: Check LiteLLM Liveness Endpoint for Proxy `${LITELLM_SERVICE_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-litellm-liveness.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `liveness`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `litellm_liveness_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check LiteLLM Readiness and Dependencies for Proxy `${LITELLM_SERVICE_NAME}` + +Calls GET /health/readiness to surface database and cache connectivity and proxy version. 
+ +- **Robot task name**: Check LiteLLM Readiness and Dependencies for Proxy `${LITELLM_SERVICE_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-litellm-readiness.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `readiness`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `litellm_readiness_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Configured Models and Routes for LiteLLM Proxy `${LITELLM_SERVICE_NAME}` + +Uses /v1/models and /v1/model/info to verify expected models are registered. + +- **Robot task name**: List Configured Models and Routes for LiteLLM Proxy `${LITELLM_SERVICE_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `list-litellm-models.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `models`, `access:read-only`, `data:logs-config` +- **Reads**: — +- **Writes**: `litellm_models_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Optional Deep Model Health for LiteLLM Proxy `${LITELLM_SERVICE_NAME}` + +When LITELLM_RUN_DEEP_HEALTH is true, calls GET /health with the master key to run upstream health checks (may incur provider cost). + +- **Robot task name**: Check Optional Deep Model Health for LiteLLM Proxy `${LITELLM_SERVICE_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-litellm-deep-health.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `deep-health`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `litellm_deep_health_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check External Integration Service Health for LiteLLM Proxy `${LITELLM_SERVICE_NAME}` + +Calls GET /health/services for configured integration names when LITELLM_INTEGRATION_SERVICES is set. 
+ +- **Robot task name**: Check External Integration Service Health for LiteLLM Proxy `${LITELLM_SERVICE_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-litellm-integration-health.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `integrations`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `litellm_integration_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify Kubernetes Service Reachability Context for `${LITELLM_SERVICE_NAME}` + +Uses kubectl to confirm the Service and Endpoints exist and align with LITELLM_HTTP_PORT for correlating API failures with cluster networking. + +- **Robot task name**: Verify Kubernetes Service Reachability Context for `${LITELLM_SERVICE_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `verify-litellm-k8s-service.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `service`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `litellm_k8s_service_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures LiteLLM proxy availability using liveness and readiness HTTP endpoints and a lightweight Kubernetes Service existence check. Produces a value between 0 (failing) and 1 (healthy). + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Collect LiteLLM Proxy Sub-Scores for Service `${LITELLM_SERVICE_NAME}` + +Fetches liveness, readiness, and Kubernetes Service scores as binary 0/1 values. 
+ +- **Robot task name**: Collect LiteLLM Proxy Sub-Scores for Service `${LITELLM_SERVICE_NAME}` +- **Sub-metric name**: `liveness` +- **Underlying script**: `sli-litellm-proxy-score.sh` +- **Tags**: `access:read-only`, `data:metrics` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `CONTEXT` | string | Kubernetes context to use for kubectl-backed checks. | — | yes | +| `NAMESPACE` | string | Namespace where the LiteLLM proxy runs. | — | yes | +| `PROXY_BASE_URL` | string | Optional base URL for the LiteLLM HTTP API (for example http://my-litellm.my-ns.svc.cluster.local:4000). Leave empty to auto port-forward to the Service via kubectl. | `""` | no | +| `LITELLM_SERVICE_NAME` | string | Kubernetes Service name for the LiteLLM proxy. | — | yes | +| `LITELLM_HTTP_PORT` | string | Service port number for the proxy HTTP listener. | `4000` | no | +| `LITELLM_RUN_DEEP_HEALTH` | string | Set to true to enable expensive GET /health upstream probes. | `false` | no | +| `LITELLM_INTEGRATION_SERVICES` | string | Comma-separated integration names for /health/services checks, or empty to skip. | `""` | no | +| `LITELLM_MASTER_KEY_SECRET_NAME` | string | Optional Kubernetes Secret name in NAMESPACE to read the master key from when the litellm_master_key secret is not provided. Leave empty to infer from the Pod env or auto-discover. | `""` | no | +| `LITELLM_MASTER_KEY_SECRET_KEY` | string | Optional data key within LITELLM_MASTER_KEY_SECRET_NAME. Leave empty to try common keys (masterkey, master_key, MASTER_KEY, LITELLM_MASTER_KEY). | `""` | no | +| `LITELLM_MASTER_KEY_INFER_FROM_POD` | string | When true (default), inspect the LiteLLM Pod env vars (e.g. LITELLM_MASTER_KEY) and follow any secretKeyRef to derive the key. Set to false to skip. 
| `true` | no | +| `LITELLM_MASTER_KEY_EXEC_FALLBACK` | string | When true (default), fall back to `kubectl exec -- printenv LITELLM_MASTER_KEY` if Pod spec inspection cannot resolve the secretKeyRef (for example due to missing RBAC on the Secret, or env wired via envFrom.secretRef). Set to false to forbid exec. | `true` | no | +| `LITELLM_MASTER_KEY_SECRET_PATTERN` | string | Regex used to auto-discover a master key Secret by name as a last-resort fallback when Pod env inference does not find anything. | `litellm` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Kubernetes CLI binary to use. | `kubectl` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `litellm_master_key` | Optional LiteLLM master or admin API key for protected routes. When omitted the codebundle will try to derive it from a Kubernetes Secret in NAMESPACE. | no | +| `kubeconfig` | The kubernetes kubeconfig yaml containing connection configuration. | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `litellm_liveness_issues.json` +- `litellm_readiness_issues.json` +- `litellm_models_issues.json` +- `litellm_deep_health_issues.json` +- `litellm_integration_issues.json` +- `litellm_k8s_service_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-litellm-proxy-health/runbook.robot` +- **Monitor**: `codebundles/k8s-litellm-proxy-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-litellm-proxy-health +export CONTEXT=... +export NAMESPACE=... +export PROXY_BASE_URL=... +export LITELLM_SERVICE_NAME=... +export LITELLM_HTTP_PORT=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-litellm-proxy-health +export CONTEXT=... +export NAMESPACE=... +export PROXY_BASE_URL=... +bash _master_key_helper.sh +bash _portforward_helper.sh +bash check-litellm-deep-health.sh +bash check-litellm-integration-health.sh +bash check-litellm-liveness.sh +bash check-litellm-readiness.sh +bash list-litellm-models.sh +bash resolve-litellm-master-key.sh +bash sli-litellm-proxy-score.sh +bash verify-litellm-k8s-service.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `_master_key_helper.sh` — Bash helper script `_master_key_helper.sh`. +- `_portforward_helper.sh` — Bash helper script `_portforward_helper.sh`. +- `check-litellm-deep-health.sh` — Bash helper script `check-litellm-deep-health.sh`. +- `check-litellm-integration-health.sh` — Bash helper script `check-litellm-integration-health.sh`. +- `check-litellm-liveness.sh` — Bash helper script `check-litellm-liveness.sh`. +- `check-litellm-readiness.sh` — Bash helper script `check-litellm-readiness.sh`. +- `list-litellm-models.sh` — Bash helper script `list-litellm-models.sh`. +- `resolve-litellm-master-key.sh` — Bash helper script `resolve-litellm-master-key.sh`. +- `sli-litellm-proxy-score.sh` — Bash helper script `sli-litellm-proxy-score.sh`. +- `verify-litellm-k8s-service.sh` — Bash helper script `verify-litellm-k8s-service.sh`. 
diff --git a/codebundles/k8s-litellm-spend-governance/SKILL-TEMPLATE.md b/codebundles/k8s-litellm-spend-governance/SKILL-TEMPLATE.md new file mode 100644 index 000000000..d150665c8 --- /dev/null +++ b/codebundles/k8s-litellm-spend-governance/SKILL-TEMPLATE.md @@ -0,0 +1,302 @@ +--- +name: k8s-litellm-spend-governance +kind: skill-template +description: Surfaces LiteLLM spend, budget, and failure signals from proxy Admin APIs for operational and cost governance. Use when triaging or monitoring Kubernetes, LiteLLM, spend workloads with skill templa... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, LiteLLM, spend, governance, metrics] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes LiteLLM Spend and Governance + +## Summary + +This CodeBundle queries the LiteLLM proxy Admin and spend APIs (not container logs alone) to surface cost pressure, budget blocks, rate limits, and provider-side failures. + +See [README.md](README.md) for additional context. + +## Tools + +### Check Spend Tracking Configuration for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` + +Hits /health/readiness and /key/list to report whether a spend-tracking DB is wired up (so later tasks can distinguish "no DB" from "transient failure") and whether admin auth is working. 
+ +- **Robot task name**: Check Spend Tracking Configuration for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-litellm-spend-config.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `spend_config_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Review Recent Spend Logs for Failures for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` + +Queries /spend/logs for the lookback window and flags rows matching budget, rate-limit, or provider failure heuristics. + +- **Robot task name**: Review Recent Spend Logs for Failures for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `review-litellm-spend-logs.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `spend_logs_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Global Spend Report Against Threshold for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` + +Calls /global/spend/report for the computed date window and compares estimated spend to LITELLM_SPEND_THRESHOLD_USD when non-zero. + +- **Robot task name**: Check Global Spend Report Against Threshold for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-litellm-global-spend.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `global_spend_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect Virtual Key Spend and Remaining Budget for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` + +Uses /key/list when available to highlight keys near max_budget or with expired credentials. 
+ +- **Robot task name**: Inspect Virtual Key Spend and Remaining Budget for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `inspect-litellm-key-budgets.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `key_budget_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Review User Budget and Rate Limit Status for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` + +Calls /user/info for configured user_ids to surface soft_budget_cooldown and spend versus limits. + +- **Robot task name**: Review User Budget and Rate Limit Status for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `review-litellm-user-budgets.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `user_budget_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Summarize Team Budgets and Limits for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` + +Queries /team/info for configured team identifiers to detect teams near max_budget or blocked traffic risk. 
+ +- **Robot task name**: Summarize Team Budgets and Limits for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `summarize-litellm-team-budgets.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `team_budget_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Summarize Spend by Model and User for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` + +Aggregates per-model and per-user spend from /spend/logs?summarize=true (OSS-compatible, compact payload) and flags groups that exceed configured LITELLM_MODEL_SPEND_THRESHOLD_USD or LITELLM_USER_SPEND_THRESHOLD_USD. + +- **Robot task name**: Summarize Spend by Model and User for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `summarize-litellm-model-spend.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `model_spend_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Aggregate Error and Blocked Request Signals for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` + +Derives triage counts for budget_exceeded, rate limits, HTTP 429, and 5xx signals from spend logs in one summary. + +- **Robot task name**: Aggregate Error and Blocked Request Signals for LiteLLM `${LITELLM_SERVICE_NAME}` in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `aggregate-litellm-failure-signals.sh` +- **Tags**: `Kubernetes`, `LiteLLM`, `access:read-only`, `data:metrics` +- **Reads**: — +- **Writes**: `aggregate_failure_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures LiteLLM proxy governance health from Admin API reachability, global spend versus threshold, and spend-log failure heuristics. 
Produces a value between 0 (failing) and 1 (healthy). + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Score LiteLLM Proxy Reachability for `${LITELLM_SERVICE_NAME}` + +Binary 1 if /health or / returns HTTP 2xx within timeout. + +- **Robot task name**: Score LiteLLM Proxy Reachability for `${LITELLM_SERVICE_NAME}` +- **Sub-metric name**: `api_reachable` +- **Underlying script**: `sli-litellm-dimension.sh` +- **Tags**: `access:read-only`, `data:metrics` +- **Reads**: — + + +#### Score Global Spend Threshold for `${LITELLM_SERVICE_NAME}` + +Binary 1 if threshold is disabled, spend is under threshold, or the report cannot be fetched. + +- **Robot task name**: Score Global Spend Threshold for `${LITELLM_SERVICE_NAME}` +- **Sub-metric name**: `global_spend_threshold` +- **Underlying script**: `sli-litellm-dimension.sh` +- **Tags**: `access:read-only`, `data:metrics` +- **Reads**: — + + +#### Score Spend Logs Cleanliness for `${LITELLM_SERVICE_NAME}` + +Binary 1 when the /spend/logs summary endpoint parses cleanly or is unavailable on OSS (neutral pass). Uses summarize=true so a >100 MB raw log response on a busy proxy cannot drop the request. + +- **Robot task name**: Score Spend Logs Cleanliness for `${LITELLM_SERVICE_NAME}` +- **Sub-metric name**: `spend_logs_clean` +- **Underlying script**: `sli-litellm-dimension.sh` +- **Tags**: `access:read-only`, `data:metrics` +- **Reads**: — + + +#### Score Spend Tracking Readiness for `${LITELLM_SERVICE_NAME}` + +Binary 1 when /health/readiness reports db=connected, so spend-governance tasks have a DB to query. This is the authoritative "is spend tracking configured" signal. 
+ +- **Robot task name**: Score Spend Tracking Readiness for `${LITELLM_SERVICE_NAME}` +- **Sub-metric name**: `spend_db_connected` +- **Underlying script**: `sli-litellm-dimension.sh` +- **Tags**: `access:read-only`, `data:metrics` +- **Reads**: — + + +#### Score Exception Rate for `${LITELLM_SERVICE_NAME}` + +Binary 1 when exception_rate across top model deployments stays under LITELLM_EXCEPTION_RATE_PCT. Uses OSS /global/activity endpoints (compact payloads). + +- **Robot task name**: Score Exception Rate for `${LITELLM_SERVICE_NAME}` +- **Sub-metric name**: `exception_rate_ok` +- **Underlying script**: `sli-litellm-dimension.sh` +- **Tags**: `access:read-only`, `data:metrics` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `CONTEXT` | string | Kubernetes context name. | — | yes | +| `NAMESPACE` | string | Namespace where the LiteLLM service runs. | — | yes | +| `PROXY_BASE_URL` | string | Optional LiteLLM proxy base URL (for example http://my-litellm.my-ns.svc.cluster.local:4000). Leave empty to auto port-forward to the Service via kubectl. | `` | yes | +| `LITELLM_SERVICE_NAME` | string | Kubernetes Service name for labeling and reports. | — | yes | +| `LITELLM_HTTP_PORT` | string | Service port number for the proxy HTTP listener (used when auto port-forwarding). | `4000` | no | +| `LITELLM_SPEND_THRESHOLD_USD` | string | Alert when global estimated spend exceeds this USD amount (0 disables). | `0` | no | +| `LITELLM_MODEL_SPEND_THRESHOLD_USD` | string | Per-model spend threshold used by the Summarize Spend by Model task. 0 disables the issue but the report still lists top models by spend. | `0` | no | +| `LITELLM_USER_SPEND_THRESHOLD_USD` | string | Per-user spend threshold used by the Summarize Spend by Model task. 0 disables the issue but the report still lists top users by spend. 
| `0` | no | +| `LITELLM_EXCEPTION_RATE_PCT` | string | Percent of requests in the lookback window that may fail before the aggregate failure task raises an issue. Default 1 = 1%. | `1` | no | +| `LITELLM_ENABLE_RAW_LOG_SCAN` | string | Opt-in flag to additionally scan the raw /spend/logs response for failure keyword heuristics. Disabled by default because the response can exceed 100 MB on busy proxies and drop through a kubectl port-forward tunnel. Set to true only when querying a proxy with modest traffic or from inside the cluster. | `false` | no | +| `LITELLM_USER_IDS` | string | Comma-separated internal user_ids for /user/info (empty skips). | `${EMPTY}` | no | +| `LITELLM_TEAM_IDS` | string | Comma-separated team ids for /team/info (empty skips). | `${EMPTY}` | no | +| `LITELLM_MASTER_KEY_SECRET_NAME` | string | Optional Kubernetes Secret name in NAMESPACE to read the master key from when the litellm_master_key secret is not provided. Leave empty to infer from the Pod env or auto-discover. | `` | yes | +| `LITELLM_MASTER_KEY_SECRET_KEY` | string | Optional data key within LITELLM_MASTER_KEY_SECRET_NAME. Leave empty to try common keys (masterkey, master_key, MASTER_KEY, LITELLM_MASTER_KEY). | `` | yes | +| `LITELLM_MASTER_KEY_INFER_FROM_POD` | string | When true (default), inspect the LiteLLM Pod env vars (e.g. LITELLM_MASTER_KEY) and follow any secretKeyRef to derive the key. Set to false to skip. | `true` | no | +| `LITELLM_MASTER_KEY_EXEC_FALLBACK` | string | When true (default), fall back to `kubectl exec -- printenv LITELLM_MASTER_KEY` if Pod spec inspection cannot resolve the secretKeyRef. Set to false to forbid exec. | `true` | no | +| `LITELLM_MASTER_KEY_SECRET_PATTERN` | string | Regex used to auto-discover a master key Secret by name as a last-resort fallback when Pod env inference does not find anything. | `litellm` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Kubernetes CLI binary for connectivity verification. 
| `kubectl` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `kubeconfig` | Kubeconfig for kubectl connectivity checks. | yes | +| `litellm_master_key` | Optional LiteLLM master or admin API key for spend/governance routes. When omitted the codebundle will try to derive it from a Kubernetes Secret in NAMESPACE. | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `spend_config_issues.json` +- `spend_logs_issues.json` +- `global_spend_issues.json` +- `key_budget_issues.json` +- `user_budget_issues.json` +- `team_budget_issues.json` +- `model_spend_issues.json` +- `aggregate_failure_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-litellm-spend-governance/runbook.robot` +- **Monitor**: `codebundles/k8s-litellm-spend-governance/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-litellm-spend-governance +export CONTEXT=... +export NAMESPACE=... +export PROXY_BASE_URL=... +export LITELLM_SERVICE_NAME=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-litellm-spend-governance +export CONTEXT=... +export NAMESPACE=... 
+bash _master_key_helper.sh +bash _portforward_helper.sh +bash aggregate-litellm-failure-signals.sh +bash check-litellm-global-spend.sh +bash check-litellm-spend-config.sh +bash inspect-litellm-key-budgets.sh +bash litellm-http-helpers.sh +bash resolve-litellm-master-key.sh +bash review-litellm-spend-logs.sh +bash review-litellm-user-budgets.sh +bash sli-litellm-dimension.sh +bash summarize-litellm-model-spend.sh +# ... and 1 more scripts +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `_master_key_helper.sh` — Bash helper script `_master_key_helper.sh`. +- `_portforward_helper.sh` — Bash helper script `_portforward_helper.sh`. +- `aggregate-litellm-failure-signals.sh` — Bash helper script `aggregate-litellm-failure-signals.sh`. +- `check-litellm-global-spend.sh` — Bash helper script `check-litellm-global-spend.sh`. +- `check-litellm-spend-config.sh` — Bash helper script `check-litellm-spend-config.sh`. +- `inspect-litellm-key-budgets.sh` — Bash helper script `inspect-litellm-key-budgets.sh`. +- `litellm-http-helpers.sh` — Bash helper script `litellm-http-helpers.sh`. +- `resolve-litellm-master-key.sh` — Bash helper script `resolve-litellm-master-key.sh`. +- `review-litellm-spend-logs.sh` — Bash helper script `review-litellm-spend-logs.sh`. +- `review-litellm-user-budgets.sh` — Bash helper script `review-litellm-user-budgets.sh`. +- `sli-litellm-dimension.sh` — Bash helper script `sli-litellm-dimension.sh`. +- `summarize-litellm-model-spend.sh` — Bash helper script `summarize-litellm-model-spend.sh`. +- `summarize-litellm-team-budgets.sh` — Bash helper script `summarize-litellm-team-budgets.sh`. 
diff --git a/codebundles/k8s-loki-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-loki-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..2cd2f9095 --- /dev/null +++ b/codebundles/k8s-loki-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,94 @@ +--- +name: k8s-loki-healthcheck +kind: skill-template +description: This taskset checks the health of Grafana Loki and its hash ring. Use when triaging or monitoring RunWhen workloads with skill template `k8s-loki-healthcheck`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [RunWhen] +resource_types: [] +access: read-only +--- + +# Kubernetes Grafana Loki Health Check + +## Summary + +A set of tasks to query the state and health of a Loki deployment in Kubernetes. + +See [README.md](README.md) for additional context. + +## Tools + +### Check Loki Ring API for Unhealthy Shards in Kubernetes Cluster `$${NAMESPACE}` + +Request and inspect the state of the Loki hash rings for non-active (potentially unhealthy) shards. + +- **Robot task name**: Check Loki Ring API for Unhealthy Shards in Kubernetes Cluster `$${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `Loki`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Loki API Ready in Kubernetes Cluster `${NAMESPACE}` + +Pings the internal Loki API to check it's ready. 
+ +- **Robot task name**: Check Loki API Ready in Kubernetes Cluster `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `Loki`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the namespace to search. | `loki` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-loki-healthcheck/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-loki-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-namespace-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-namespace-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..577f36eda --- /dev/null +++ b/codebundles/k8s-namespace-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,264 @@ +--- +name: k8s-namespace-healthcheck +kind: skill-template +description: This taskset runs general troubleshooting checks against all applicable objects in a namespace. Looks for warning... Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill templa... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [namespace] +access: read-only +--- + +# Kubernetes Namespace Inspection + +## Summary + +This codebundle is used for searching in a namespace for possible issues to triage; covering things such as scraping logs, checking for anomalies in events, looking for pod restarts, etc. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Inspect Warning Events in Namespace `${NAMESPACE}` + +Queries all warning events in a given namespace within the RW_LOOKBACK_WINDOW timeframe, + +- **Robot task name**: Inspect Warning Events in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `workload_issues.sh` +- **Tags**: `access:read-only`, `namespace`, `trace`, `error`, `pods`, `events`, `logs`, `grep`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect Container Restarts In Namespace `${NAMESPACE}` + +Fetches pods that have container restarts and provides a detailed analysis of restart causes including proper OOM vs liveness probe failure detection. + +- **Robot task name**: Inspect Container Restarts In Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `container_restarts.sh` +- **Tags**: `access:read-only`, `namespace`, `containers`, `status`, `restarts`, `${namespace}`, `data:config` +- **Reads**: `NAMESPACE` +- **Writes**: `container_restart_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect Pending Pods In Namespace `${NAMESPACE}` + +Fetches pods that are pending and provides details. + +- **Robot task name**: Inspect Pending Pods In Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `workload_issues.sh` +- **Tags**: `access:read-only`, `namespace`, `pods`, `status`, `pending`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RW_LOOKBACK_WINDOW` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect Failed Pods In Namespace `${NAMESPACE}` + +Fetches all pods which are not running (unready) in the namespace and adds them to a report for future review. 
+ +- **Robot task name**: Inspect Failed Pods In Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `workload_issues.sh` +- **Tags**: `access:read-only`, `namespace`, `pods`, `status`, `unready`, `not`, `starting`, `phase`, `failed`, `${namespace}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RW_LOOKBACK_WINDOW` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect Workload Status Conditions In Namespace `${NAMESPACE}` + +Parses all workloads in a namespace and inspects their status conditions for issues. Status conditions with a status value of False are considered an error. + +- **Robot task name**: Inspect Workload Status Conditions In Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `workload_next_steps.sh` +- **Tags**: `access:read-only`, `namespace`, `status`, `conditions`, `pods`, `reasons`, `workloads`, `${namespace}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RW_LOOKBACK_WINDOW` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get Listing Of Resources In Namespace `${NAMESPACE}` + +Simple fetch all to provide a snapshot of information about the workloads in the namespace for future review in a report. 
+ +- **Robot task name**: Get Listing Of Resources In Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `get`, `all`, `resources`, `info`, `workloads`, `namespace`, `manifests`, `${namespace}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Event Anomalies in Namespace `${NAMESPACE}` + +Fetches non warning events in a namespace within a timeframe and checks for unusual activity, raising issues for any found. + +- **Robot task name**: Check Event Anomalies in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `workload_issues.sh` +- **Tags**: `access:read-only`, `namespace`, `events`, `info`, `state`, `anomolies`, `count`, `occurences`, `${namespace}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RW_LOOKBACK_WINDOW` +- **Writes**: `events.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Missing or Risky PodDisruptionBudget Policies in Namepace `${NAMESPACE}` + +Searches through deployemnts and statefulsets to determine if PodDistruptionBudgets are missing and/or are configured in a risky way that might affect maintenance activities. 
+ +- **Robot task name**: Check Missing or Risky PodDisruptionBudget Policies in Namepace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Resource Quota Utilization in Namespace `${NAMESPACE}` + +Lists any namespace resource quotas and checks their utilization, raising issues if they are above 80% + +- **Robot task name**: Check Resource Quota Utilization in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `resource_quota_check.sh` +- **Tags**: `access:read-only`, `resourcequota`, `quota`, `availability`, `unavailable`, `policy`, `${namespace}`, `data:config` +- **Reads**: `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +This SLI uses kubectl to score namespace health. Produces a value between 0 (completely failing thet test) and 1 (fully passing the test). Looks for container restarts, events, and pods not ready. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Get Error Event Count within ${RW_LOOKBACK_WINDOW} and calculate Score + +Captures error events and counts them within the RW_LOOKBACK_WINDOW timeframe, consistent with runbook analysis. 
+ +- **Robot task name**: Get Error Event Count within ${RW_LOOKBACK_WINDOW} and calculate Score +- **Sub-metric name**: `error_events` +- **Tags**: `Event`, `Count`, `Warning`, `data:config` +- **Reads**: `CONTEXT`, `EVENT_THRESHOLD`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RW_LOOKBACK_WINDOW` +- **Pass condition**: `${event_count} <= ${threshold}` + + +#### Get Container Restarts and Score in Namespace `${NAMESPACE}` + +Counts the total sum of container restarts within a timeframe and determines if they're beyond a threshold. + +- **Robot task name**: Get Container Restarts and Score in Namespace `${NAMESPACE}` +- **Sub-metric name**: `container_restarts` +- **Tags**: `Restarts`, `Pods`, `Containers`, `Count`, `Status`, `data:config` +- **Reads**: `CONTAINER_RESTART_THRESHOLD`, `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RW_LOOKBACK_WINDOW` +- **Pass condition**: `${restart_count} <= ${threshold}` + + +#### Get NotReady Pods in `${NAMESPACE}` + +Fetches a count of unready pods. + +- **Robot task name**: Get NotReady Pods in `${NAMESPACE}` +- **Sub-metric name**: `pod_readiness` +- **Tags**: `access:read-only`, `Pods`, `Status`, `Phase`, `Ready`, `Unready`, `Running`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RW_LOOKBACK_WINDOW` +- **Pass condition**: `${unready_count} == 0` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `EVENT_AGE` | string | The time window in minutes as to when the event was last seen. | `30m` | no | +| `CONTAINER_RESTART_AGE` | string | The time window (in (h) hours or (m) minutes) as search for container restarts. 
| `4h` | no | +| `RW_LOOKBACK_WINDOW` | string | The time window (in (h) hours or (m) minutes) to look back for time-sensitive issues like failed pods, pending pods, workload status conditions, and event anomalies. Resources with issues older than this window will be ignored. | `1h` | no | +| `CONTAINER_RESTART_THRESHOLD` | string | The maximum total container restarts to be still considered healthy. | `3` | no | +| `EVENT_THRESHOLD` | string | The maximum total events to be still considered healthy. | `4` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `kubeconfig` | The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `container_restart_issues.json` +- `events.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-namespace-healthcheck/runbook.robot` +- **Monitor**: `codebundles/k8s-namespace-healthcheck/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-namespace-healthcheck +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export EVENT_AGE=... +export CONTAINER_RESTART_AGE=... +export RW_LOOKBACK_WINDOW=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-namespace-healthcheck +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export EVENT_AGE=... 
+bash container_restarts.sh +bash find_resource_owners.sh +bash resource_quota_check.sh +bash warning_events.sh +bash workload_issues.sh +bash workload_next_steps.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `container_restarts.sh` — Bash helper script `container_restarts.sh`. +- `find_resource_owners.sh` — Bash helper script `find_resource_owners.sh`. +- `resource_quota_check.sh` — Bash helper script `resource_quota_check.sh`. +- `warning_events.sh` — Bash helper script `warning_events.sh`. +- `workload_issues.sh` — Bash helper script `workload_issues.sh`. +- `workload_next_steps.sh` — Bash helper script `workload_next_steps.sh`. diff --git a/codebundles/k8s-otelcollector/SKILL-TEMPLATE.md b/codebundles/k8s-otelcollector/SKILL-TEMPLATE.md new file mode 100644 index 000000000..51d46e1a5 --- /dev/null +++ b/codebundles/k8s-otelcollector/SKILL-TEMPLATE.md @@ -0,0 +1,129 @@ +--- +name: k8s-otelcollector +kind: skill-template +description: This taskset performs diagnostic checks on a OpenTelemetry Collector to ensure it's pushing metrics. Use when triaging or monitoring GKE, EKS, AKS workloads with skill template `k8s-otelcollector`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [GKE, EKS, AKS, Kubernetes, OpenTelemetry, otel, collector] +resource_types: [kubernetes_resource] +access: read-only +--- + +# K8s OpenTelemetry Collector Health + +## Summary + +Checks the OTEL collector's logs and metrics to determine its health, such as large queues or errors. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Query Collector Queued Spans in Namespace `${NAMESPACE}` + +Query the collector metrics endpoint and inspect queue size + +- **Robot task name**: Query Collector Queued Spans in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `otel_metrics_check.sh` +- **Tags**: `access:read-only`, `otel-collector`, `metrics`, `queued`, `back`, `pressure`, `data:config` +- **Reads**: `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check OpenTelemetry Collector Logs For Errors In Namespace `${NAMESPACE}` + +Fetch logs and check for errors + +- **Robot task name**: Check OpenTelemetry Collector Logs For Errors In Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `otel_error_check.sh` +- **Tags**: `access:read-only`, `otel-collector`, `metrics`, `errors`, `logs`, `data:logs-regexp` +- **Reads**: `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Query OpenTelemetry Logs For Dropped Spans In Namespace `${NAMESPACE}` + +Query the collector logs for dropped spans from errors + +- **Robot task name**: Query OpenTelemetry Logs For Dropped Spans In Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `otel_dropped_check.sh` +- **Tags**: `access:read-only`, `otel-collector`, `metrics`, `errors`, `logs`, `dropped`, `rejected`, `data:logs-regexp` +- **Reads**: `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. 
| `kubectl` | no | +| `WORKLOAD_SERVICE` | string | The service name used to curl the otel collector metrics endpoint. | `otel-demo-otelcol` | no | +| `WORKLOAD_NAME` | string | The workload name to act as a bastion-host. The collector can be used, or a bastion host depending on networking requirements. | `deployment/otel-demo-otelcol` | no | +| `METRICS_PORT` | string | The port used by the collector to serve its metrics at. This will be scraped. | `8888` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-otelcollector/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-otelcollector +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export WORKLOAD_SERVICE=... +export WORKLOAD_NAME=... +export METRICS_PORT=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-otelcollector +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export WORKLOAD_SERVICE=... +bash otel_dropped_check.sh +bash otel_error_check.sh +bash otel_metrics_check.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `otel_dropped_check.sh` — Bash helper script `otel_dropped_check.sh`. +- `otel_error_check.sh` — Bash helper script `otel_error_check.sh`. +- `otel_metrics_check.sh` — Bash helper script `otel_metrics_check.sh`. 
diff --git a/codebundles/k8s-podresources-health/SKILL-TEMPLATE.md b/codebundles/k8s-podresources-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..6831b7ac2 --- /dev/null +++ b/codebundles/k8s-podresources-health/SKILL-TEMPLATE.md @@ -0,0 +1,140 @@ +--- +name: k8s-podresources-health +kind: skill-template +description: Inspects the resources provisioned for a given set of pods and raises issues or recommendations as necessary. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [pod] +access: read-only +--- + +# Kubernetes Pod Resources Health + +## Summary + +Inspects the resources provisioned for a given set of pods and raises issues or recommendations as necessary. + +See [README.md](README.md) for additional context. + +## Tools + +### Show Pods Without Resource Limit or Resource Requests Set in Namespace `${NAMESPACE}` + +Scans a list of pods in a namespace using labels as a selector and checks if their resources are set. + +- **Robot task name**: Show Pods Without Resource Limit or Resource Requests Set in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Pod Resource Utilization with Top in Namespace `${NAMESPACE}` + +Performs a top command on a list of labeled workloads to check pod resources. 
+ +- **Robot task name**: Check Pod Resource Utilization with Top in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `top`, `resources`, `utilization`, `pods`, `workloads`, `cpu`, `memory`, `allocation`, `labeled`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Identify VPA Pod Resource Recommendations in Namespace `${NAMESPACE}` + +Queries the namespace for any Vertical Pod Autoscaler resource recommendations. + +- **Robot task name**: Identify VPA Pod Resource Recommendations in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `vpa_recommendations.sh` +- **Tags**: `access:read-only`, `recommendation`, `resources`, `utilization`, `pods`, `cpu`, `memory`, `allocation`, `vpa`, `${NAMESPACE}`, `data:config` +- **Reads**: `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Identify Overutilized Pods in Namespace `${NAMESPACE}` + +Scans the namespace for pods that are over utilizing resources or may be experiencing resource problems like oomkills or restarts. + +- **Robot task name**: Identify Overutilized Pods in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `find_resource_owners.sh` +- **Tags**: `access:read-only`, `overutilized`, `resources`, `utilization`, `pods`, `cpu`, `memory`, `allocation`, `${NAMESPACE}`, `oomkill`, `restarts`, `data:config` +- **Reads**: `CONTEXT` +- **Writes**: `overutilized_pods.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `CONTEXT` | string | Which Kubernetes context to operate within. 
| `''` | no | +| `LABELS` | string | The metadata labels to use when selecting the objects to measure as running. | `''` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `UTILIZATION_THRESHOLD` | string | The resource usage threshold at which to identify issues. | `95` | no | +| `DEFAULT_INCREASE` | string | The percentage increase for resource recommendations. | `25` | no | +| `RESTART_AGE` | string | The age (in minutes) to consider when looking for container restarts. | `10` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- `overutilized_pods.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-podresources-health/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-podresources-health +export CONTEXT=... +export LABELS=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export UTILIZATION_THRESHOLD=... +export DEFAULT_INCREASE=... +export RESTART_AGE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-podresources-health +export CONTEXT=... +export LABELS=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export UTILIZATION_THRESHOLD=... +bash find_resource_owners.sh +bash identify_resource_contrained_pods.sh +bash vpa_recommendations.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `find_resource_owners.sh` — Bash helper script `find_resource_owners.sh`. 
+- `identify_resource_contrained_pods.sh` — Bash helper script `identify_resource_contrained_pods.sh`. +- `vpa_recommendations.sh` — Bash helper script `vpa_recommendations.sh`. diff --git a/codebundles/k8s-postgres-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-postgres-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..12eed1ca9 --- /dev/null +++ b/codebundles/k8s-postgres-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,231 @@ +--- +name: k8s-postgres-healthcheck +kind: skill-template +description: Runs a series of tasks to check the overall health of a postgres cluster and to provide detailed information useful... Use when triaging or monitoring AKS, EKS, GKE workloads with skill template `k... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [AKS, EKS, GKE, Kubernetes, Patroni, Postgres, Crunchy, Zalando] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Postgres Healthcheck + +## Summary + +Runs a series of tasks to check the overall health of a postgres cluster and to provide detailed information useful for debugging or reviewing configurations. + +See [README.md](README.md) for additional context. + +## Tools + +### List Resources Related to Postgres Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Runs a simple fetch all for the resources in the given workspace under the configured labels. 
+ +- **Robot task name**: List Resources Related to Postgres Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `postgres`, `resources`, `workloads`, `standard`, `information`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get Postgres Pod Logs & Events for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Queries Postgres-related pods for their recent logs and checks for any warning-type events. + +- **Robot task name**: Get Postgres Pod Logs & Events for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `postgres`, `events`, `warnings`, `labels`, `logs`, `errors`, `pods`, `data:logs-bulk` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get Postgres Pod Resource Utilization for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Performs a top command on a list of labeled postgres-related workloads to check pod resources. + +- **Robot task name**: Get Postgres Pod Resource Utilization for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `top`, `resources`, `utilization`, `database`, `workloads`, `cpu`, `memory`, `allocation`, `postgres`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check PostgreSQL Connection Health for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Checks connection utilization, client connection summaries, and detects connection saturation issues. Prefers running queries from replicas for safety. 
+ +- **Robot task name**: Check PostgreSQL Connection Health for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `connection_health.sh` +- **Tags**: `access:read-only`, `postgres`, `connections`, `utilization`, `health`, `clients`, `saturation`, `data:config`, `data:sql-query` +- **Reads**: `NAMESPACE`, `OBJECT_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check PostgreSQL Core Metrics for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Checks storage utilization, database sizes, table bloat, WAL usage, and other core PostgreSQL metrics. + +- **Robot task name**: Check PostgreSQL Core Metrics for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `core_metrics.sh` +- **Tags**: `access:read-only`, `postgres`, `storage`, `metrics`, `health`, `disk`, `wal`, `bloat`, `data:config`, `data:sql-query` +- **Reads**: `NAMESPACE`, `OBJECT_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get Running Postgres Configuration for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Fetches the postgres instance's configuration information. 
+ +- **Robot task name**: Get Running Postgres Configuration for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `config_health.sh` +- **Tags**: `access:read-only`, `config`, `postgres`, `file`, `show`, `path`, `setup`, `configuration`, `data:config`, `data:sql-query` +- **Reads**: `NAMESPACE`, `OBJECT_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get Patroni Output and Add to Report for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Attempts to run the patronictl CLI within the workload if it's available to check the current state of a patroni cluster, if applicable. + +- **Robot task name**: Get Patroni Output and Add to Report for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `patroni`, `patronictl`, `list`, `cluster`, `health`, `check`, `state`, `postgres`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Patroni Database Lag for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Identifies the lag using patronictl and raises issues if necessary. + +- **Robot task name**: Fetch Patroni Database Lag for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `patroni`, `patronictl`, `list`, `cluster`, `health`, `postgres`, `lag`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Database Backup Status for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Checks the status of backup operations on Kubernetes Postgres clusters. Raises issues if backups have not been completed or appear unhealthy. 
+ +- **Robot task name**: Check Database Backup Status for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `backup_health.sh` +- **Tags**: `access:read-only`, `patroni`, `cluster`, `health`, `backup`, `database`, `postgres`, `data:config`, `data:sql-query` +- **Reads**: `NAMESPACE`, `OBJECT_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Runs multiple Kubernetes and psql commands to report on the health of a postgres cluster. Produces a value between 0 (completely failing the test) and 1 (fully passing the test). Checks for database lag & backup health. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Check Patroni Database Lag in Namespace `${NAMESPACE}` on Host `${HOSTNAME}` using `patronictl` + +Identifies the lag using patronictl and raises issues if necessary. + +- **Robot task name**: Check Patroni Database Lag in Namespace `${NAMESPACE}` on Host `${HOSTNAME}` using `patronictl` +- **Sub-metric name**: `database_lag` +- **Tags**: `patroni`, `patronictl`, `list`, `cluster`, `health`, `check`, `state`, `postgres`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY` + + +#### Check Database Backup Status for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Ensure that backups are current and not stale. 
+ +- **Robot task name**: Check Database Backup Status for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Sub-metric name**: `backup_status` +- **Underlying script**: `backup_health.sh` +- **Tags**: `patroni`, `cluster`, `health`, `backup`, `database`, `postgres`, `data:config`, `data:sql-query` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-postgres-healthcheck/runbook.robot` +- **Monitor**: `codebundles/k8s-postgres-healthcheck/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-postgres-healthcheck +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-postgres-healthcheck +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +bash backup_health.sh +bash config_health.sh +bash connection_health.sh +bash core_metrics.sh +bash dbquery.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `backup_health.sh` — Bash helper script `backup_health.sh`. 
+- `config_health.sh` — Bash helper script `config_health.sh`. +- `connection_health.sh` — Bash helper script `connection_health.sh`. +- `core_metrics.sh` — Bash helper script `core_metrics.sh`. +- `dbquery.sh` — Bash helper script `dbquery.sh`. diff --git a/codebundles/k8s-postgres-operations/SKILL-TEMPLATE.md b/codebundles/k8s-postgres-operations/SKILL-TEMPLATE.md new file mode 100644 index 000000000..85a7cfcec --- /dev/null +++ b/codebundles/k8s-postgres-operations/SKILL-TEMPLATE.md @@ -0,0 +1,138 @@ +--- +name: k8s-postgres-operations +kind: skill-template +description: PostgreSQL Operations Runbook for Kubernetes clusters. Use when triaging or monitoring Kubernetes, PostgreSQL, CrunchyDB workloads with skill template `k8s-postgres-operations`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, PostgreSQL, CrunchyDB, Zalando] +resource_types: [kubernetes_resource] +access: read-write +--- + +# PostgreSQL Operations + +## Summary + +This codebundle provides **operational remediation capabilities** for PostgreSQL clusters running in Kubernetes. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Reinitialize Failed PostgreSQL Cluster Members for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Identify and reinitialize any failed cluster members + +- **Robot task name**: Reinitialize Failed PostgreSQL Cluster Members for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `cluster_operations.sh` +- **Tags**: `access:read-write`, `reinitialize`, `recovery`, `postgres`, `operations` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Perform PostgreSQL Cluster Failover Operation for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Execute failover operation to promote a specific replica or perform automatic failover + +- **Robot task name**: Perform PostgreSQL Cluster Failover Operation for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `cluster_operations.sh` +- **Tags**: `access:read-write`, `failover`, `postgres`, `operations`, `emergency` +- **Reads**: `DATABASE_CONTAINER`, `OBJECT_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Restart PostgreSQL Cluster with Rolling Update for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Perform rolling restart of all PostgreSQL cluster members + +- **Robot task name**: Restart PostgreSQL Cluster with Rolling Update for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `cluster_operations.sh` +- **Tags**: `access:read-write`, `restart`, `postgres`, `operations`, `maintenance` +- **Reads**: `DATABASE_CONTAINER`, `NAMESPACE`, `OBJECT_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify Cluster Recovery and Generate Summary for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` + +Final 
verification of cluster health after operations + +- **Robot task name**: Verify Cluster Recovery and Generate Summary for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `cluster_operations.sh` +- **Tags**: `access:read-write`, `verification`, `summary`, `postgres` +- **Reads**: `NAMESPACE`, `OBJECT_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `OBJECT_NAME` | string | The name of the PostgreSQL cluster object. | — | yes | +| `OBJECT_API_VERSION` | string | The API version of the PostgreSQL cluster object. | — | yes | +| `DATABASE_CONTAINER` | string | The name of the database container in the PostgreSQL pods. | `database` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-postgres-operations/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-postgres-operations +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export OBJECT_NAME=... +export OBJECT_API_VERSION=... +export DATABASE_CONTAINER=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-postgres-operations +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export OBJECT_NAME=... +bash cluster_operations.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `cluster_operations.sh` — Bash helper script `cluster_operations.sh`. diff --git a/codebundles/k8s-prometheus-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-prometheus-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..158d6b24b --- /dev/null +++ b/codebundles/k8s-prometheus-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,143 @@ +--- +name: k8s-prometheus-healthcheck +kind: skill-template +description: This taskset investigates the logs, state and health of Kubernetes Prometheus operator. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-prometheus-healthcheck`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, Prometheus] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubeprometheus Operator Troubleshoot + +## Summary + +A set of tasks that troubleshoot the Kubernetes Prometheus Operator for issues. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Check Prometheus Service Monitors in namespace `${NAMESPACE}` + +Checks the selector mappings of service monitors are valid in the namespace + +- **Robot task name**: Check Prometheus Service Monitors in namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `validate_servicemonitors.sh` +- **Tags**: `access:read-only`, `prometheus`, `data:config` +- **Reads**: `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check For Successful Rule Setup in Kubernetes Namespace `${NAMESPACE}` + +Inspects operator instance logs for failed rules setup + +- **Robot task name**: Check For Successful Rule Setup in Kubernetes Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `prometheys`, `data:logs-regexp` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `PROM_NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify Prometheus RBAC Can Access ServiceMonitors in Namespace `${PROM_NAMESPACE}` + +Fetch operator rbac and verify it has ServiceMonitors in rbac. 
+ +- **Robot task name**: Verify Prometheus RBAC Can Access ServiceMonitors in Namespace `${PROM_NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `prometheus`, `data:config` +- **Reads**: `KUBERNETES_DISTRIBUTION_BINARY`, `PROM_NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect Prometheus Operator Logs for Scraping Errors in Namespace `${NAMESPACE}` + +Inspect the prometheus operator logs for scraping errors and raise issues if any found + +- **Robot task name**: Inspect Prometheus Operator Logs for Scraping Errors in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `prometheus`, `data:logs-regexp` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `PROM_NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Prometheus API Healthy in Namespace `${PROM_NAMESPACE}` + +Ping Prometheus healthy API endpoint for a 200 response code. + +- **Robot task name**: Check Prometheus API Healthy in Namespace `${PROM_NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `prometheus`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `PROM_NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the namespace to search. | `loki` | no | +| `PROM_NAMESPACE` | string | The name of the namespace that kubeprometheus resides in. 
| `kube-prometheus-stack` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-prometheus-healthcheck/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-prometheus-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export PROM_NAMESPACE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-prometheus-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export PROM_NAMESPACE=... +bash validate_servicemonitors.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `validate_servicemonitors.sh` — Bash helper script `validate_servicemonitors.sh`. diff --git a/codebundles/k8s-pvc-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-pvc-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..cdca3cd1d --- /dev/null +++ b/codebundles/k8s-pvc-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,180 @@ +--- +name: k8s-pvc-healthcheck +kind: skill-template +description: This taskset collects information about storage such as PersistentVolumes and PersistentVolumeClaims. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-pvc-... 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [persistent_volume_claim] +access: read-only +--- + +# Kubernetes Persistent Volume Healthcheck + +## Summary + +This taskset provides a set of commands to troubleshoot storage-related issues in a Kubernetes cluster. + +See [README.md](README.md) for additional context. + +## Tools + +### Fetch Events for Unhealthy Kubernetes PersistentVolumeClaims in Namespace `${NAMESPACE}` + +Lists events related to PersistentVolumeClaims within the namespace that are not bound to PersistentVolumes. + +- **Robot task name**: Fetch Events for Unhealthy Kubernetes PersistentVolumeClaims in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `storage_next_steps.sh` +- **Tags**: `access:read-only`, `pvc`, `list`, `kubernetes`, `storage`, `persistentvolumeclaim`, `persistentvolumeclaims`, `events`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List PersistentVolumeClaims in Terminating State in Namespace `${NAMESPACE}` + +Lists persistentvolumeclaims in a Terminating state. + +- **Robot task name**: List PersistentVolumeClaims in Terminating State in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `pvc`, `list`, `kubernetes`, `storage`, `persistentvolumeclaim`, `terminating`, `check`, `PersistentVolumes`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List PersistentVolumes in Terminating State in Namespace `${NAMESPACE}` + +Lists events related to persistent volumes in Terminating state. 
+ +- **Robot task name**: List PersistentVolumes in Terminating State in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `pv`, `list`, `kubernetes`, `storage`, `persistentvolume`, `terminating`, `events`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### List Pods with Attached Volumes and Related PersistentVolume Details in Namespace `${NAMESPACE}` + +For each pod in a namespace, collect details on configured PersistentVolumeClaim, PersistentVolume, and node. + +- **Robot task name**: List Pods with Attached Volumes and Related PersistentVolume Details in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `pod`, `storage`, `pvc`, `pv`, `status`, `csi`, `storagereport`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch the Storage Utilization for PVC Mounts in Namespace `${NAMESPACE}` + +For each pod in a namespace, fetch the utilization of any PersistentVolumeClaims mounted using the linux df command. Requires kubectl exec permissions. 
+ +- **Robot task name**: Fetch the Storage Utilization for PVC Mounts in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `pvc_utilization_check.sh` +- **Tags**: `access:read-only`, `pod`, `storage`, `pvc`, `utilization`, `capacity`, `persistentvolumeclaims`, `persistentvolumeclaim`, `check`, `pvc`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: `pvc_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check for RWO Persistent Volume Node Attachment Issues in Namespace `${NAMESPACE}` + +For each pod in a namespace, check if it has an RWO persistent volume claim and if so, validate that the pod and the pv are on the same node. + +- **Robot task name**: Check for RWO Persistent Volume Node Attachment Issues in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `pod`, `storage`, `pvc`, `readwriteonce`, `node`, `persistentvolumeclaims`, `persistentvolumeclaim`, `scheduled`, `attachment`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +This SLI collects information about storage such as PersistentVolumes and PersistentVolumeClaims and generates an aggregated health score for the namespace. 
1 = Healthy, 0 = Failed, >0 <1 = Degraded + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Generate Namespace Score for Namespace `${NAMESPACE}` + +_No sub-check documentation in Robot source._ + +- **Robot task name**: Generate Namespace Score for Namespace `${NAMESPACE}` +- **Sub-metric name**: `pvc_health` +- **Tags**: — +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the namespace to search. | `` | yes | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `pvc_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-pvc-healthcheck/runbook.robot` +- **Monitor**: `codebundles/k8s-pvc-healthcheck/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-pvc-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-pvc-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... 
+bash pvc_utilization_check.sh +bash storage_next_steps.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `pvc_utilization_check.sh` — Bash helper script `pvc_utilization_check.sh`. +- `storage_next_steps.sh` — Bash helper script `storage_next_steps.sh`. diff --git a/codebundles/k8s-redis-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-redis-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..2ca004f41 --- /dev/null +++ b/codebundles/k8s-redis-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,98 @@ +--- +name: k8s-redis-healthcheck +kind: skill-template +description: This taskset collects information on your redis workload in your Kubernetes cluster and raises issues if any health... Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill temp... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, Redis] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Redis Healthcheck + +## Summary + +A set of tasks which performs a health check and read/write verification on a Redis workload running in a Kubernetes cluster. + +See [README.md](README.md) for additional context. + +## Tools + +### Ping `${DEPLOYMENT_NAME}` Redis Workload + +Verifies that a PING can be performed against the redis workload. 
+ +- **Robot task name**: Ping `${DEPLOYMENT_NAME}` Redis Workload +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `redis`, `cli`, `ping`, `pong`, `alive`, `probe`, `ready`, `data:config` +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify `${DEPLOYMENT_NAME}` Redis Read Write Operation in Kubernetes + +Attempts to perform a write and read operation on the redis workload, checking that a key can be set, incremented, and read from. + +- **Robot task name**: Verify `${DEPLOYMENT_NAME}` Redis Read Write Operation in Kubernetes +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `redis`, `cli`, `increment`, `health`, `check`, `read`, `write`, `data:config` +- **Reads**: `CONTEXT`, `DEPLOYMENT_NAME`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `REDIS_HEALTHCHECK_KEY` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the namespace to search. | `` | yes | +| `DEPLOYMENT_NAME` | string | Used to target the redis resource for the health check. | — | yes | +| `REDIS_HEALTHCHECK_KEY` | string | The key used to perform read/write operations on to validate storage. | `runwhen_task_rw_healthcheck` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. 
The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-redis-healthcheck/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-redis-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export DEPLOYMENT_NAME=... +export REDIS_HEALTHCHECK_KEY=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-restart-resource/SKILL-TEMPLATE.md b/codebundles/k8s-restart-resource/SKILL-TEMPLATE.md new file mode 100644 index 000000000..1f6c8ad60 --- /dev/null +++ b/codebundles/k8s-restart-resource/SKILL-TEMPLATE.md @@ -0,0 +1,108 @@ +--- +name: k8s-restart-resource +kind: skill-template +description: This taskset restarts a resource with a given set of labels, typically used with other tasksets. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-restart-reso... +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [kubernetes_resource] +access: read-write +--- + +# Kubernetes Restart resource + +## Summary + +Restarts a kubernetes resource in an attempt to get it out of a bad state. + +See [README.md](README.md) for additional context. + +## Tools + +### Get Current Resource State with Labels `${LABELS}` + +Gets the current state of the resource before applying the restart for report review. 
+ +- **Robot task name**: Get Current Resource State with Labels `${LABELS}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `resource`, `application`, `restart`, `state`, `yaml`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get Resource Logs with Labels `${LABELS}` + +Collects the last approximately 200 lines of logs from the resource before restarting it. + +- **Robot task name**: Get Resource Logs with Labels `${LABELS}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `resource`, `application`, `workload`, `logs`, `state`, `data:logs-bulk` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Restart Resource with Labels `${LABELS}` in `${CONTEXT}` + +Restarts the labeled resource in an attempt to get it out of a bad state. + +- **Robot task name**: Restart Resource with Labels `${LABELS}` in `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-write`, `resource`, `application`, `restart`, `pod`, `kill`, `rollout`, `revision` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the namespace to search. | `` | yes | +| `LABELS` | string | The kubectl label string to use for selecting the resource. 
| `` | yes | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-restart-resource/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-restart-resource +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export LABELS=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-serviceaccount-check/SKILL-TEMPLATE.md b/codebundles/k8s-serviceaccount-check/SKILL-TEMPLATE.md new file mode 100644 index 000000000..cf2c5e88a --- /dev/null +++ b/codebundles/k8s-serviceaccount-check/SKILL-TEMPLATE.md @@ -0,0 +1,84 @@ +--- +name: k8s-serviceaccount-check +kind: skill-template +description: This taskset provides tasks to troubleshoot service accounts in a Kubernetes namespace. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-serviceaccount-check`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Service Account Check + +## Summary + +Tasks that help debug or validate service accounts and their access. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Test Service Account Access to Kubernetes API Server in Namespace `${NAMESPACE}` + +Runs a curl pod as a specific serviceaccount and attempts to call the Kubernetes API server with the mounted token. + +- **Robot task name**: Test Service Account Access to Kubernetes API Server in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `ServiceAccount`, `Curl`, `APIServer`, `RBAC`, `${SERVICE_ACCOUNT}`, `${NAMESPACE}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `SERVICE_ACCOUNT` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the namespace to search. | `` | yes | +| `SERVICE_ACCOUNT` | string | The name of the service account to run the test pod as. | `default` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-serviceaccount-check/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-serviceaccount-check +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export SERVICE_ACCOUNT=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-stacktrace-health/SKILL-TEMPLATE.md b/codebundles/k8s-stacktrace-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..6868be9ef --- /dev/null +++ b/codebundles/k8s-stacktrace-health/SKILL-TEMPLATE.md @@ -0,0 +1,119 @@ +--- +name: k8s-stacktrace-health +kind: skill-template +description: Detects and analyzes stacktraces/tracebacks in Kubernetes workload logs for troubleshooting application issues. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Workload Stacktrace Analysis + +## Summary + +This codebundle provides comprehensive stacktrace/traceback detection and analysis for Kubernetes workloads (deployments, statefulsets, and daemonsets). + +See [README.md](README.md) for additional context. + +## Tools + +### Analyze Workload Stacktraces for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` + +Collects and analyzes stacktraces/tracebacks from all pods in the workload for troubleshooting application issues. + +- **Robot task name**: Analyze Workload Stacktraces for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `EXCLUDED_CONTAINER_NAMES`, `LOG_AGE`, `LOG_LINES`, `LOG_SIZE`, `NAMESPACE`, `WORKLOAD_NAME`, `WORKLOAD_TYPE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +This SLI monitors stacktrace health in kubernetes workload application logs. 
Produces a value between 0 (stacktraces detected) and 1 (no stacktraces found). Focuses specifically on application error detection through stacktrace analysis. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Get Stacktrace Health Score for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` + +Checks for recent stacktraces/tracebacks related to the workload within a short time window, with filtering to reduce noise. + +- **Robot task name**: Get Stacktrace Health Score for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` +- **Sub-metric name**: `stacktrace_score` +- **Tags**: `stacktraces`, `tracebacks`, `errors`, `recent`, `fast`, `data:logs-stacktrace` +- **Reads**: `CONTEXT`, `MAX_LOG_BYTES`, `MAX_LOG_LINES`, `NAMESPACE`, `WORKLOAD_NAME`, `WORKLOAD_TYPE` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `WORKLOAD_NAME` | string | The name of the workload (deployment, statefulset, or daemonset) to analyze for stacktraces. | — | yes | +| `WORKLOAD_TYPE` | string | The type of Kubernetes workload to analyze. | `deployment` | no | +| `LOG_LINES` | string | The number of log lines to fetch from the pods when inspecting logs. | `2000` | no | +| `LOG_AGE` | string | The age of logs to fetch from pods, used for log analysis tasks. | `15m` | no | +| `LOG_SIZE` | string | The maximum size of logs in bytes to fetch from pods, used for log analysis tasks. Defaults to 2MB. 
| `2097152` | no | +| `EXCLUDED_CONTAINER_NAMES` | string | Comma-separated string of keywords used to identify and skip container names containing any of these substrings. | `linkerd-proxy,istio-proxy,vault-agent` | no | +| `MAX_LOG_LINES` | string | Maximum number of log lines to fetch per container to prevent API overload. | `2000` | no | +| `MAX_LOG_BYTES` | string | Maximum log size in bytes to fetch per container to prevent API overload. | `256000` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `kubeconfig` | The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-stacktrace-health/runbook.robot` +- **Monitor**: `codebundles/k8s-stacktrace-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-stacktrace-health +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export WORKLOAD_NAME=... +export WORKLOAD_TYPE=... +export LOG_LINES=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) diff --git a/codebundles/k8s-statefulset-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-statefulset-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..7e29511e6 --- /dev/null +++ b/codebundles/k8s-statefulset-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,309 @@ +--- +name: k8s-statefulset-healthcheck +kind: skill-template +description: Triages issues related to a StatefulSet and its pods, including persistent volumes and ordered deployment... Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [statefulset] +access: read-only +--- + +# Kubernetes StatefulSet Triage + +## Summary + +This codebundle ships two robots that work together to keep an eye on a single Kubernetes StatefulSet: `runbook.robot`, which runs the triage tools and raises issues, and `sli.robot`, a lightweight health score (0.0 – 1.0) published as a RunWhen SLI. + +See [README.md](README.md) for additional context. + +## Tools + +### Analyze Application Log Patterns for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Fetches and analyzes logs from the StatefulSet pods for errors, stack traces, connection issues, and other patterns that indicate application health problems. 
+ +- **Robot task name**: Analyze Application Log Patterns for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `LOG_AGE`, `LOG_ANALYSIS_DEPTH`, `LOG_SEVERITY_THRESHOLD`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Detect Log Anomalies for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Analyzes logs for repeating patterns, anomalous behavior, and unusual log volume that may indicate underlying issues. + +- **Robot task name**: Detect Log Anomalies for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `LOG_AGE`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Liveness Probe Configuration for StatefulSet `${STATEFULSET_NAME}` + +Validates if a Liveness probe has possible misconfigurations + +- **Robot task name**: Check Liveness Probe Configuration for StatefulSet `${STATEFULSET_NAME}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `validate_probes.sh` +- **Tags**: — +- **Reads**: `CONTEXT`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Readiness Probe Configuration for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Validates if a readiness probe has possible misconfigurations + +- **Robot task name**: Check Readiness Probe Configuration for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `validate_probes.sh` +- **Tags**: — +- **Reads**: `CONTEXT`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check for 
Container Restarts in StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Analyzes container restart patterns in the StatefulSet pods to identify the root cause of restarts, distinguishing between OOM kills, liveness probe failures, and other termination causes. + +- **Robot task name**: Check for Container Restarts in StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `container_restarts.sh` +- **Tags**: `access:read-only`, `containers`, `restarts`, `errors`, `oom`, `probes`, `statefulset`, `${STATEFULSET_NAME}`, `data:config` +- **Reads**: `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect StatefulSet Warning Events for `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Fetches warning events related to the StatefulSet workload in the namespace and triages any issues found in the events. + +- **Robot task name**: Inspect StatefulSet Warning Events for `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `workload_issues.sh` +- **Tags**: `access:read-only`, `events`, `workloads`, `errors`, `warnings`, `get`, `statefulset`, `${STATEFULSET_NAME}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch StatefulSet Workload Details For `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Fetches the current state of the StatefulSet for future review in the report. 
+ +- **Robot task name**: Fetch StatefulSet Workload Details For `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `statefulset`, `details`, `manifest`, `info`, `${STATEFULSET_NAME}`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Inspect StatefulSet Replicas for `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` + +Pulls the replica information for a given StatefulSet and checks if it's highly available, if the replica counts are the expected / healthy values, and raises issues if it is not progressing and is missing pods. Includes StatefulSet-specific checks for ordered deployment. + +- **Robot task name**: Inspect StatefulSet Replicas for `${STATEFULSET_NAME}` in namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Underlying script**: `workload_next_steps.sh` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check StatefulSet PersistentVolumeClaims for `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Checks the status of PersistentVolumeClaims associated with the StatefulSet and identifies storage-related issues. 
+ +- **Robot task name**: Check StatefulSet PersistentVolumeClaims for `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Identify Recent Configuration Changes for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Identifies recent configuration changes from ControllerRevision analysis that might be related to current issues. + +- **Robot task name**: Identify Recent Configuration Changes for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +This SLI uses kubectl to score StatefulSet health. Produces a value between 0 (completely failing the test) and 1 (fully passing the test). Looks for container restarts, critical log errors, pods not ready, StatefulSet replica/revision status, PersistentVolumeClaim binding, and recent warning events. + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Get Container Restarts and Score for StatefulSet `${STATEFULSET_NAME}` + +Counts the total sum of container restarts within a timeframe and determines if they're beyond a threshold. 
+ +- **Robot task name**: Get Container Restarts and Score for StatefulSet `${STATEFULSET_NAME}` +- **Sub-metric name**: `container_restarts` +- **Tags**: `Restarts`, `Pods`, `Containers`, `Count`, `Status`, `data:config` +- **Reads**: `CONTAINER_RESTART_AGE`, `CONTAINER_RESTART_THRESHOLD`, `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Pass condition**: `${restart_count} <= ${threshold}` + + +#### Get Critical Log Errors and Score for StatefulSet `${STATEFULSET_NAME}` + +Fetches logs and checks for critical error patterns that indicate application failures. + +- **Robot task name**: Get Critical Log Errors and Score for StatefulSet `${STATEFULSET_NAME}` +- **Sub-metric name**: `log_errors` +- **Tags**: `logs`, `errors`, `critical`, `patterns`, `data:logs-regexp` +- **Reads**: `CONTEXT`, `LOGS_EXCLUDE_PATTERN`, `MAX_LOG_BYTES`, `MAX_LOG_LINES`, `NAMESPACE`, `STATEFULSET_NAME` + + +#### Get NotReady Pods Score for StatefulSet `${STATEFULSET_NAME}` + +Fetches a count of unready pods for the specific StatefulSet. + +- **Robot task name**: Get NotReady Pods Score for StatefulSet `${STATEFULSET_NAME}` +- **Sub-metric name**: `pod_readiness` +- **Tags**: `access:read-only`, `Pods`, `Status`, `Phase`, `Ready`, `Unready`, `Running`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Pass condition**: `${unready_count} == 0` + + +#### Get StatefulSet Replica Status and Score for `${STATEFULSET_NAME}` + +Checks if the StatefulSet has the expected number of ready replicas and that all pods are on the latest revision. 
+ +- **Robot task name**: Get StatefulSet Replica Status and Score for `${STATEFULSET_NAME}` +- **Sub-metric name**: `replica_status` +- **Tags**: `statefulset`, `replicas`, `revisions`, `status`, `availability`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` + + +#### Get PersistentVolumeClaim Status and Score for StatefulSet `${STATEFULSET_NAME}` + +Checks that PersistentVolumeClaims associated with the StatefulSet are Bound. Unbound PVCs commonly keep StatefulSet pods from starting. + +- **Robot task name**: Get PersistentVolumeClaim Status and Score for StatefulSet `${STATEFULSET_NAME}` +- **Sub-metric name**: `pvc_status` +- **Tags**: `statefulset`, `pvc`, `storage`, `persistent`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` + + +#### Get Recent Warning Events Score for StatefulSet `${STATEFULSET_NAME}` + +Checks for recent warning events related to the StatefulSet, its pods, and its PersistentVolumeClaims within a short time window. + +- **Robot task name**: Get Recent Warning Events Score for StatefulSet `${STATEFULSET_NAME}` +- **Sub-metric name**: `warning_events` +- **Tags**: `events`, `warnings`, `recent`, `fast`, `data:config` +- **Reads**: `CONTEXT`, `EVENT_AGE`, `EVENT_THRESHOLD`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Pass condition**: `${event_count} <= ${threshold}` (otherwise `0.5` if `${event_count} <= ${threshold_doubled}`) + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `STATEFULSET_NAME` | string | The name of the StatefulSet to triage. 
| — | yes | +| `LOG_AGE` | string | The age of logs to fetch from pods, used for log analysis tasks. | `3h` | no | +| `LOG_ANALYSIS_DEPTH` | string | The depth of log analysis to perform - basic, standard, or comprehensive. | `standard` | no | +| `LOG_SEVERITY_THRESHOLD` | string | The minimum severity level for creating issues (1=critical, 2=high, 3=medium, 4=low, 5=info). | `3` | no | +| `LOG_PATTERN_CATEGORIES` | string | Comma-separated list of log pattern categories to scan for. | `GenericError,AppFailure,StackTrace,Connection,Timeout,Auth,Exceptions,Resource` | no | +| `ANOMALY_THRESHOLD` | string | The threshold for detecting event anomalies based on events per minute. | `5` | no | +| `CONTAINER_RESTART_AGE` | string | The time window (in (h) hours or (m) minutes) to search for container restarts. Only containers that restarted within this time period will be reported. | `10m` | no | +| `CONTAINER_RESTART_THRESHOLD` | string | The minimum number of restarts required to trigger an issue. Containers with restart counts below this threshold will be ignored. | `1` | no | +| `EXCLUDED_CONTAINER_NAMES` | string | Comma-separated list of container names to exclude from log analysis (e.g., linkerd-proxy, istio-proxy, vault-agent). | `linkerd-proxy,istio-proxy,vault-agent` | no | +| `MAX_LOG_LINES` | string | Maximum number of log lines to fetch per container to prevent API overload. | `100` | no | +| `MAX_LOG_BYTES` | string | Maximum log size in bytes to fetch per container to prevent API overload. | `256000` | no | +| `EVENT_AGE` | string | The time window to check for recent warning events. | `10m` | no | +| `EVENT_THRESHOLD` | string | The maximum number of critical warning events allowed before scoring is reduced. | `2` | no | +| `LOGS_EXCLUDE_PATTERN` | string | Pattern used to exclude entries from log analysis when searching for errors. Use regex patterns to filter out false positives like JSON structures. 
| `"errors":\s*\[\]|\\bINFO\\b|\\bDEBUG\\b|\\bTRACE\\b|\\bSTART\\s*-\\s*|\\bSTART\\s*method\\b` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `kubeconfig` | The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s). | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-statefulset-healthcheck/runbook.robot` +- **Monitor**: `codebundles/k8s-statefulset-healthcheck/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-statefulset-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export STATEFULSET_NAME=... +export LOG_AGE=... +export LOG_ANALYSIS_DEPTH=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/k8s-statefulset-healthcheck +export KUBERNETES_DISTRIBUTION_BINARY=... +export CONTEXT=... +export NAMESPACE=... +export STATEFULSET_NAME=... +bash container_restarts.sh +bash track_statefulset_config_changes.sh +bash validate_probes.sh +bash workload_issues.sh +bash workload_next_steps.sh +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `container_restarts.sh` — Bash helper script `container_restarts.sh`. +- `track_statefulset_config_changes.sh` — Bash helper script `track_statefulset_config_changes.sh`. +- `validate_probes.sh` — Bash helper script `validate_probes.sh`. 
+- `workload_issues.sh` — Bash helper script `workload_issues.sh`. +- `workload_next_steps.sh` — Bash helper script `workload_next_steps.sh`. diff --git a/codebundles/k8s-statefulset-ops/SKILL-TEMPLATE.md b/codebundles/k8s-statefulset-ops/SKILL-TEMPLATE.md new file mode 100644 index 000000000..375c96ab7 --- /dev/null +++ b/codebundles/k8s-statefulset-ops/SKILL-TEMPLATE.md @@ -0,0 +1,213 @@ +--- +name: k8s-statefulset-ops +kind: skill-template +description: Perform operational tasks for a Kubernetes StatefulSet. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-statefulset-ops`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift] +resource_types: [statefulset] +access: read-only +--- + +# Kubernetes StatefulSet Operations + +## Summary + +This codebundle provides StatefulSet-scoped operational tasks so operators can restart workloads, recycle pods, roll back, scale replicas, tune HPA bounds, and adjust CPU or memory resources—similar to `k8s-deployment-ops`, but for the StatefulSet API. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Restart StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Perform a rollout restart on the StatefulSet + +- **Robot task name**: Restart StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Force Delete Pods for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Force delete all pods related to the StatefulSet using pod template labels + +- **Robot task name**: Force Delete Pods for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Rollback StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` to Previous Version + +Perform a rollback to a known functional version + +- **Robot task name**: Rollback StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` to Previous Version +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Down StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Stops (or nearly stops) all running pods in a StatefulSet to immediately halt a failing or runaway service. 
+ +- **Robot task name**: Scale Down StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `ALLOW_SCALE_TO_ZERO`, `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Up StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` by ${SCALE_UP_FACTOR}x + +Increase StatefulSet replicas by multiplying current count by SCALE_UP_FACTOR (capped by MAX_REPLICAS). + +- **Robot task name**: Scale Up StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` by ${SCALE_UP_FACTOR}x +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `MAX_REPLICAS`, `NAMESPACE`, `SCALE_UP_FACTOR`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Up HPA for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` by ${HPA_SCALE_FACTOR}x + +Increase HPA min and max replicas by a scaling factor + +- **Robot task name**: Scale Up HPA for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` by ${HPA_SCALE_FACTOR}x +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `HPA_MAX_REPLICAS`, `HPA_SCALE_FACTOR`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Scale Down HPA for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` to Min ${HPA_MIN_REPLICAS} + +Decrease HPA min and max replicas to specified minimum values or scale down by factor + +- **Robot task name**: Scale Down HPA for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` to Min ${HPA_MIN_REPLICAS} +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `HPA_MIN_REPLICAS`, 
`KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Increase CPU Resources for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Intelligently increases CPU resources for a StatefulSet based on VPA recommendations, HPA presence, or doubles current values. Does not apply if GitOps-managed or HPA exists. + +- **Robot task name**: Increase CPU Resources for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Increase Memory Resources for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Intelligently increases memory resources for a StatefulSet based on VPA recommendations, HPA presence, or doubles current values. Does not apply if GitOps-managed or HPA exists. + +- **Robot task name**: Increase Memory Resources for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Decrease CPU Resources for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Intelligently decreases CPU resources for a StatefulSet by dividing current values by scale down factor. Does not apply if GitOps-managed or HPA exists. 
+ +- **Robot task name**: Decrease CPU Resources for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_SCALE_DOWN_FACTOR`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Decrease Memory Resources for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` + +Intelligently decreases memory resources for a StatefulSet by dividing current values by scale down factor. Does not apply if GitOps-managed or HPA exists. + +- **Robot task name**: Decrease Memory Resources for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE`, `RESOURCE_SCALE_DOWN_FACTOR`, `STATEFULSET_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `STATEFULSET_NAME` | string | Used to target the StatefulSet for queries and filtering events. | — | yes | +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | — | yes | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `SCALE_UP_FACTOR` | string | The multiple by which to increase replica count. For example, a StatefulSet with 2 pods and a scale up factor of 2 will target 4 pods. | `2` | no | +| `MAX_REPLICAS` | string | The maximum number of replicas allowed for any scale-up activity. | `10` | no | +| `ALLOW_SCALE_TO_ZERO` | string | Permit StatefulSets to scale to 0. | `false` | no | +| `HPA_SCALE_FACTOR` | string | The multiple by which to scale HPA min/max replicas. 
| `2` | no | +| `HPA_MAX_REPLICAS` | string | The maximum replicas allowed for HPA max value during scale up operations. | `20` | no | +| `HPA_MIN_REPLICAS` | string | The minimum replicas to set for HPA during scale down operations. | `1` | no | +| `RESOURCE_SCALE_DOWN_FACTOR` | string | The factor by which to divide CPU/memory resources when scaling down (e.g., 2 means divide by 2). | `2` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-statefulset-ops/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-statefulset-ops +export STATEFULSET_NAME=... +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export SCALE_UP_FACTOR=... +export MAX_REPLICAS=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/k8s-tail-logs-dynamic/SKILL-TEMPLATE.md b/codebundles/k8s-tail-logs-dynamic/SKILL-TEMPLATE.md new file mode 100644 index 000000000..cadc2b23c --- /dev/null +++ b/codebundles/k8s-tail-logs-dynamic/SKILL-TEMPLATE.md @@ -0,0 +1,124 @@ +--- +name: k8s-tail-logs-dynamic +kind: skill-template +description: Performs application-level troubleshooting by inspecting the logs of a workload for parsable exceptions. Use when triaging or monitoring Kubernetes, AKS, EKS workloads with skill template `k8s-tai... 
+runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Kubernetes, AKS, EKS, GKE, OpenShift, GoLang, Json, Python, CSharp, Django, Node, Java, FastAPI] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Tail Application Logs + +## Summary + +This codebundle measures stack traces as they appear in your application logs and can produce reports for a breakdown of stack traces. + +See [README.md](README.md) for additional context. + +## Tools + +### Get `${CONTAINER_NAME}` Application Logs in Namespace `${NAMESPACE}` + +Collects the last approximately 300 lines of logs from the workload + +- **Robot task name**: Get `${CONTAINER_NAME}` Application Logs in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `resource`, `application`, `workload`, `logs`, `state`, `${container_name}`, `${workload_name}`, `data:logs-bulk` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Tail `${CONTAINER_NAME}` Application Logs For Stacktraces + +Performs an inspection on container logs for exceptions/stacktraces, parsing them and attempting to find relevant source code information + +- **Robot task name**: Tail `${CONTAINER_NAME}` Application Logs For Stacktraces +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `INPUT_MODE`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS`, `NAMESPACE`, `STACKTRACE_PARSER` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures the number of exception stacktraces present in an application's logs over a time period. 
+ +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Tail `${CONTAINER_NAME}` Application Logs For Stacktraces + +Tails logs and organizes output for measuring counts. + +- **Robot task name**: Tail `${CONTAINER_NAME}` Application Logs For Stacktraces +- **Sub-metric name**: `log_analysis` +- **Tags**: `resource`, `application`, `workload`, `logs`, `state`, `exceptions`, `errors`, `data:logs-stacktrace` +- **Reads**: `CONTEXT`, `INPUT_MODE`, `KUBERNETES_DISTRIBUTION_BINARY`, `LABELS`, `NAMESPACE`, `STACKTRACE_PARSER` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `NAMESPACE` | string | The name of the Kubernetes namespace to scope actions and searching to. | `sock-shop` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | `sandbox-cluster-1` | no | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. | `kubectl` | no | +| `LABELS` | string | The Kubernetes labels used to select the resource for logs. | — | yes | +| `STACKTRACE_PARSER` | string | What parser implementation to use when going through logs. Dynamic will use the first successful parser which is more computationally expensive. | `Dynamic` | no | +| `INPUT_MODE` | string | Changes ingestion style of logs, typically split (1 log per line) works best. | `SPLIT` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. 
+ +- **Runbook**: `codebundles/k8s-tail-logs-dynamic/runbook.robot` +- **Monitor**: `codebundles/k8s-tail-logs-dynamic/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-tail-logs-dynamic +export NAMESPACE=... +export CONTEXT=... +export KUBERNETES_DISTRIBUTION_BINARY=... +export LABELS=... +export STACKTRACE_PARSER=... +export INPUT_MODE=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) diff --git a/codebundles/k8s-vault-healthcheck/SKILL-TEMPLATE.md b/codebundles/k8s-vault-healthcheck/SKILL-TEMPLATE.md new file mode 100644 index 000000000..cd4e8fa51 --- /dev/null +++ b/codebundles/k8s-vault-healthcheck/SKILL-TEMPLATE.md @@ -0,0 +1,182 @@ +--- +name: k8s-vault-healthcheck +kind: skill-template +description: A suite of tasks that can be used to triage potential issues in your vault namespace. Use when triaging or monitoring AKS, EKS, GKE workloads with skill template `k8s-vault-healthcheck`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [AKS, EKS, GKE, Kubernetes, Vault] +resource_types: [kubernetes_resource] +access: read-only +--- + +# Kubernetes Vault Triage + +## Summary + +A taskset which checks the status of a Vault workload in Kubernetes. + +See [README.md](README.md) for additional context. + +## Tools + +### Fetch Vault CSI Driver Logs in Namespace `${NAMESPACE}` + +Fetches the last 100 lines of logs for the vault CSI driver. 
+ +- **Robot task name**: Fetch Vault CSI Driver Logs in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `fetch`, `log`, `pod`, `container`, `errors`, `inspect`, `trace`, `info`, `vault`, `csi`, `driver`, `data:logs-bulk` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get Vault CSI Driver Warning Events in `${NAMESPACE}` + +Fetches warning-type events related to the vault CSI driver. + +- **Robot task name**: Get Vault CSI Driver Warning Events in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `events`, `errors`, `warnings`, `get`, `vault`, `csi`, `driver`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Vault CSI Driver Replicas + +Performs an inspection on the replicas of the vault CSI driver daemonset. + +- **Robot task name**: Check Vault CSI Driver Replicas +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Vault Pod Workload Logs in Namespace `${NAMESPACE}` with Labels `${LABELS}` + +Fetches the last 100 lines of logs for all vault pod workloads in the vault namespace. 
+ +- **Robot task name**: Fetch Vault Pod Workload Logs in Namespace `${NAMESPACE}` with Labels `${LABELS}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `fetch`, `log`, `pod`, `container`, `errors`, `inspect`, `trace`, `info`, `statefulset`, `vault`, `data:logs-bulk` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Get Related Vault Events in Namespace `${NAMESPACE}` + +Fetches all warning-type events related to vault in the vault namespace. + +- **Robot task name**: Get Related Vault Events in Namespace `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `events`, `workloads`, `errors`, `warnings`, `get`, `statefulset`, `vault`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Vault StatefulSet Manifest Details in `${NAMESPACE}` + +Fetches the current state of the vault statefulset manifest for inspection. + +- **Robot task name**: Fetch Vault StatefulSet Manifest Details in `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `statefulset`, `details`, `manifest`, `info`, `vault`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Fetch Vault DaemonSet Manifest Details in Kubernetes Cluster `${NAMESPACE}` + +Fetches the current state of the vault daemonset manifest for inspection. 
+ +- **Robot task name**: Fetch Vault DaemonSet Manifest Details in Kubernetes Cluster `${NAMESPACE}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `statefulset`, `details`, `manifest`, `info`, `vault`, `data:config` +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify Vault Availability in Namespace `${NAMESPACE}` and Context `${CONTEXT}` + +Curls the vault endpoint and checks the HTTP response code. + +- **Robot task name**: Verify Vault Availability in Namespace `${NAMESPACE}` and Context `${CONTEXT}` +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `http`, `curl`, `vault`, `web`, `code`, `ok`, `available`, `data:config` +- **Reads**: `NAMESPACE`, `VAULT_URL` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Vault StatefulSet Replicas in `NAMESPACE` + +Pulls the replica information for the Vault statefulset and checks if it's highly available + +- **Robot task name**: Check Vault StatefulSet Replicas in `NAMESPACE` +- **Robot file**: `runbook.robot` +- **Tags**: — +- **Reads**: `CONTEXT`, `KUBERNETES_DISTRIBUTION_BINARY`, `NAMESPACE` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `NAMESPACE` | string | The namespace that your vault workloads reside in. Typically 'vault'. | `vault` | no | +| `CONTEXT` | string | Which Kubernetes context to operate within. | — | yes | +| `LABELS` | string | Additional labels to use when selecting vault resources during triage. | — | yes | +| `VAULT_URL` | string | The URL of the vault instance to check. | — | yes | +| `KUBERNETES_DISTRIBUTION_BINARY` | string | Which binary to use for Kubernetes CLI commands. 
| `kubectl` | no | + +## Secrets + +_No secrets imported in Robot source._ + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/k8s-vault-healthcheck/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/k8s-vault-healthcheck +export NAMESPACE=... +export CONTEXT=... +export LABELS=... +export VAULT_URL=... +export KUBERNETES_DISTRIBUTION_BINARY=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/mailgun-sending-domain-health/SKILL-TEMPLATE.md b/codebundles/mailgun-sending-domain-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..165d3caa2 --- /dev/null +++ b/codebundles/mailgun-sending-domain-health/SKILL-TEMPLATE.md @@ -0,0 +1,340 @@ +--- +name: mailgun-sending-domain-health +kind: skill-template +description: Validates Mailgun sending domain verification state, delivery metrics, and DNS (SPF, DKIM, DMARC, optional MX) for... Use when triaging or monitoring Mailgun, email, DNS workloads with skill templa... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Mailgun, email, DNS, delivery, domain] +resource_types: [] +access: read-only +--- + +# Mailgun Sending Domain Delivery & DNS Health + +## Summary + +This CodeBundle validates Mailgun sending domains using the regional Mailgun HTTP API and public DNS (`dig`). 
+ +See [README.md](README.md) for additional context. + +## Tools + +### Validate Mailgun Domain Scope Configuration + +Confirms at least one Mailgun sending domain is in scope before running deeper checks. + +- **Robot task name**: Validate Mailgun Domain Scope Configuration +- **Robot file**: `runbook.robot` +- **Tags**: `Mailgun`, `email`, `domain`, `access:read-only`, `data:config` +- **Reads**: `DOMAIN_LIST` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify Mailgun Domain Registration and State for Domains in Scope + +Calls Mailgun Domains API to confirm each domain exists, is active, and required DNS records are verified. + +- **Robot task name**: Verify Mailgun Domain Registration and State for Domains in Scope +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-mailgun-domain-state.sh` +- **Tags**: `Mailgun`, `email`, `domain`, `access:read-only`, `data:logs-config` +- **Reads**: `DOMAIN` +- **Writes**: `mailgun_domain_state_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Delivery Success Rate for Mailgun Domains in Scope + +Aggregates delivered vs failed stats over MAILGUN_STATS_WINDOW_HOURS and compares to MAILGUN_MIN_DELIVERY_SUCCESS_PCT. + +- **Robot task name**: Check Delivery Success Rate for Mailgun Domains in Scope +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-mailgun-delivery-success-rate.sh` +- **Tags**: `Mailgun`, `email`, `metrics`, `delivery`, `access:read-only`, `data:metrics` +- **Reads**: `DOMAIN` +- **Writes**: `mailgun_delivery_success_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Bounce and Complaint Rates for Mailgun Domains in Scope + +Evaluates bounce and complaint ratios from Mailgun stats against MAILGUN_MAX_BOUNCE_RATE_PCT and MAILGUN_MAX_COMPLAINT_RATE_PCT. 
+ +- **Robot task name**: Check Bounce and Complaint Rates for Mailgun Domains in Scope +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-mailgun-bounce-complaint-rates.sh` +- **Tags**: `Mailgun`, `email`, `metrics`, `reputation`, `access:read-only`, `data:metrics` +- **Reads**: `DOMAIN` +- **Writes**: `mailgun_bounce_complaint_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Sample Recent Delivered Messages for Mailgun Domains in Scope + +Retrieves a sample of recently delivered messages showing recipients, subjects, and delivery details. + +- **Robot task name**: Sample Recent Delivered Messages for Mailgun Domains in Scope +- **Robot file**: `runbook.robot` +- **Underlying script**: `sample-mailgun-delivered.sh` +- **Tags**: `Mailgun`, `email`, `events`, `delivery`, `access:read-only`, `data:logs-config` +- **Reads**: `DOMAIN` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Analyze 30-Day Volume Trends for Mailgun Domains in Scope + +Fetches 30 days of daily metrics, compares week-over-week volume, and flags cliff drops exceeding MAILGUN_VOLUME_DROP_THRESHOLD_PCT. + +- **Robot task name**: Analyze 30-Day Volume Trends for Mailgun Domains in Scope +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-mailgun-volume-trends.sh` +- **Tags**: `Mailgun`, `email`, `metrics`, `trends`, `access:read-only`, `data:metrics` +- **Reads**: `DOMAIN` +- **Writes**: `mailgun_volume_trend_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check Recent Permanent Failures in Mailgun Events for Domains in Scope + +Samples recent failed events to surface DNS, policy, or authentication-related failures. 
+ +- **Robot task name**: Check Recent Permanent Failures in Mailgun Events for Domains in Scope +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-mailgun-recent-failures.sh` +- **Tags**: `Mailgun`, `email`, `events`, `failures`, `access:read-only`, `data:logs-config` +- **Reads**: `DOMAIN` +- **Writes**: `mailgun_recent_failures_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Check for Rejected Messages in Mailgun for Domains in Scope + +Samples messages Mailgun refused to process (suppressions, policy blocks, invalid recipients) to diagnose volume drops. + +- **Robot task name**: Check for Rejected Messages in Mailgun for Domains in Scope +- **Robot file**: `runbook.robot` +- **Underlying script**: `check-mailgun-rejected-events.sh` +- **Tags**: `Mailgun`, `email`, `events`, `rejected`, `access:read-only`, `data:logs-config` +- **Reads**: `DOMAIN` +- **Writes**: `mailgun_rejected_events_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify SPF Record for Mailgun Sending Domains in Scope + +Resolves TXT/SPF and checks Mailgun include expectations using API-ground truth when available. + +- **Robot task name**: Verify SPF Record for Mailgun Sending Domains in Scope +- **Robot file**: `runbook.robot` +- **Underlying script**: `verify-mailgun-spf-dns.sh` +- **Tags**: `Mailgun`, `email`, `DNS`, `SPF`, `access:read-only`, `data:logs-config` +- **Reads**: `DOMAIN` +- **Writes**: `mailgun_spf_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify DKIM DNS Records for Mailgun Domains in Scope + +Confirms DKIM TXT records in DNS match Mailgun-reported expectations for each selector. 
+ +- **Robot task name**: Verify DKIM DNS Records for Mailgun Domains in Scope +- **Robot file**: `runbook.robot` +- **Underlying script**: `verify-mailgun-dkim-dns.sh` +- **Tags**: `Mailgun`, `email`, `DNS`, `DKIM`, `access:read-only`, `data:logs-config` +- **Reads**: `DOMAIN` +- **Writes**: `mailgun_dkim_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify DMARC Policy for Mailgun Sending Domains in Scope + +Checks _dmarc TXT presence for the organizational domain used in From headers. + +- **Robot task name**: Verify DMARC Policy for Mailgun Sending Domains in Scope +- **Robot file**: `runbook.robot` +- **Underlying script**: `verify-mailgun-dmarc-dns.sh` +- **Tags**: `Mailgun`, `email`, `DNS`, `DMARC`, `access:read-only`, `data:logs-config` +- **Reads**: `DOMAIN` +- **Writes**: `mailgun_dmarc_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify MX Records for Mailgun Domains When MX Verification Is Enabled + +When MAILGUN_VERIFY_MX is true, validates published MX against Mailgun receiving hints for inbound routing. + +- **Robot task name**: Verify MX Records for Mailgun Domains When MX Verification Is Enabled +- **Robot file**: `runbook.robot` +- **Underlying script**: `verify-mailgun-mx-dns.sh` +- **Tags**: `Mailgun`, `email`, `DNS`, `MX`, `access:read-only`, `data:logs-config` +- **Reads**: `DOMAIN` +- **Writes**: `mailgun_mx_issues.json` +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures Mailgun sending domain health from domain state, delivery success, and SPF alignment. Produces a score between 0 and 1. 
+ +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Score Mailgun Domain Active State + +Binary 1/0 score from Mailgun Domains API active state. + +- **Robot task name**: Score Mailgun Domain Active State +- **Sub-metric name**: `domain_active` +- **Underlying script**: `sli-mailgun-domain-score.sh` +- **Tags**: `Mailgun`, `email`, `sli`, `access:read-only`, `data:metrics` +- **Reads**: — + + +#### Score Mailgun Delivery Success Threshold + +Binary 1/0 score comparing delivery success to MAILGUN_MIN_DELIVERY_SUCCESS_PCT. + +- **Robot task name**: Score Mailgun Delivery Success Threshold +- **Sub-metric name**: `delivery_success` +- **Underlying script**: `sli-mailgun-delivery-score.sh` +- **Tags**: `Mailgun`, `email`, `sli`, `access:read-only`, `data:metrics` +- **Reads**: — + + +#### Score Mailgun SPF Alignment + +Binary 1/0 score when SPF authorizes Mailgun. + +- **Robot task name**: Score Mailgun SPF Alignment +- **Sub-metric name**: `spf_mailgun` +- **Underlying script**: `sli-mailgun-spf-score.sh` +- **Tags**: `Mailgun`, `email`, `sli`, `access:read-only`, `data:metrics` +- **Reads**: — + + +#### Score Mailgun Volume Trend + +Binary 1/0 score comparing current-week volume to 30-day historical weekly average. + +- **Robot task name**: Score Mailgun Volume Trend +- **Sub-metric name**: `volume_trend` +- **Underlying script**: `sli-mailgun-volume-trend-score.sh` +- **Tags**: `Mailgun`, `email`, `sli`, `access:read-only`, `data:metrics` +- **Reads**: — + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `MAILGUN_SENDING_DOMAIN` | string | FQDN of the Mailgun sending domain to assess. | — | yes | +| `MAILGUN_API_REGION` | string | Mailgun API region (us or eu). | — | yes | +| `RESOURCES` | string | Specific domain FQDN or All to list domains via the Mailgun API. 
| `All` | no | +| `MAILGUN_STATS_WINDOW_HOURS` | string | Rolling window in hours for Mailgun stats queries. | `24` | no | +| `MAILGUN_MIN_DELIVERY_SUCCESS_PCT` | string | Minimum acceptable delivery success percentage, computed as delivered / (delivered + failed). | `95` | no | +| `MAILGUN_MAX_BOUNCE_RATE_PCT` | string | Maximum acceptable bounce rate percentage vs accepted volume. | `5` | no | +| `MAILGUN_MAX_COMPLAINT_RATE_PCT` | string | Maximum acceptable complaint rate percentage vs accepted volume. | `0.1` | no | +| `MAILGUN_VERIFY_MX` | string | Set true to enforce MX checks for inbound routing. | `false` | no | +| `MAILGUN_VOLUME_DROP_THRESHOLD_PCT` | string | Week-over-week volume decline percentage that triggers an alert (e.g. 80 means a drop of 80%+). | `80` | no | +| `MAILGUN_DELIVERED_SAMPLE_SIZE` | string | Number of recent delivered messages to sample in the report. | `10` | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `mailgun_api_key` | Mailgun private API key (HTTP Basic user=api, password=key) | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` +- `mailgun_domain_state_issues.json` +- `mailgun_delivery_success_issues.json` +- `mailgun_bounce_complaint_issues.json` +- `mailgun_volume_trend_issues.json` +- `mailgun_recent_failures_issues.json` +- `mailgun_rejected_events_issues.json` +- `mailgun_spf_issues.json` +- `mailgun_dkim_issues.json` +- `mailgun_dmarc_issues.json` +- `mailgun_mx_issues.json` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. 
+ +- **Runbook**: `codebundles/mailgun-sending-domain-health/runbook.robot` +- **Monitor**: `codebundles/mailgun-sending-domain-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/mailgun-sending-domain-health +export MAILGUN_SENDING_DOMAIN=... +export MAILGUN_API_REGION=... +export RESOURCES=... +export MAILGUN_STATS_WINDOW_HOURS=... +export MAILGUN_MIN_DELIVERY_SUCCESS_PCT=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/mailgun-sending-domain-health +export MAILGUN_SENDING_DOMAIN=... +export MAILGUN_API_REGION=... +export RESOURCES=... +bash check-mailgun-bounce-complaint-rates.sh +bash check-mailgun-delivery-success-rate.sh +bash check-mailgun-domain-state.sh +bash check-mailgun-recent-failures.sh +bash check-mailgun-rejected-events.sh +bash check-mailgun-volume-trends.sh +bash discover-mailgun-domains.sh +bash sample-mailgun-delivered.sh +bash sli-mailgun-delivery-score.sh +bash sli-mailgun-domain-score.sh +bash sli-mailgun-spf-score.sh +bash sli-mailgun-volume-trend-score.sh +# ... and 4 more scripts +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `check-mailgun-bounce-complaint-rates.sh` — Bash helper script `check-mailgun-bounce-complaint-rates.sh`. +- `check-mailgun-delivery-success-rate.sh` — Bash helper script `check-mailgun-delivery-success-rate.sh`. +- `check-mailgun-domain-state.sh` — Bash helper script `check-mailgun-domain-state.sh`. +- `check-mailgun-recent-failures.sh` — Bash helper script `check-mailgun-recent-failures.sh`. +- `check-mailgun-rejected-events.sh` — Bash helper script `check-mailgun-rejected-events.sh`. +- `check-mailgun-volume-trends.sh` — Bash helper script `check-mailgun-volume-trends.sh`. 
+- `discover-mailgun-domains.sh` — Bash helper script `discover-mailgun-domains.sh`. +- `sample-mailgun-delivered.sh` — Bash helper script `sample-mailgun-delivered.sh`. +- `sli-mailgun-delivery-score.sh` — Bash helper script `sli-mailgun-delivery-score.sh`. +- `sli-mailgun-domain-score.sh` — Bash helper script `sli-mailgun-domain-score.sh`. +- `sli-mailgun-spf-score.sh` — Bash helper script `sli-mailgun-spf-score.sh`. +- `sli-mailgun-volume-trend-score.sh` — Bash helper script `sli-mailgun-volume-trend-score.sh`. +- `verify-mailgun-dkim-dns.sh` — Bash helper script `verify-mailgun-dkim-dns.sh`. +- `verify-mailgun-dmarc-dns.sh` — Bash helper script `verify-mailgun-dmarc-dns.sh`. +- `verify-mailgun-mx-dns.sh` — Bash helper script `verify-mailgun-mx-dns.sh`. +- `verify-mailgun-spf-dns.sh` — Bash helper script `verify-mailgun-spf-dns.sh`. diff --git a/codebundles/terraform-cloud-workspace-lock-check/SKILL-TEMPLATE.md b/codebundles/terraform-cloud-workspace-lock-check/SKILL-TEMPLATE.md new file mode 100644 index 000000000..fd9e5a7a2 --- /dev/null +++ b/codebundles/terraform-cloud-workspace-lock-check/SKILL-TEMPLATE.md @@ -0,0 +1,84 @@ +--- +name: terraform-cloud-workspace-lock-check +kind: skill-template +description: Check whether the Terraform Cloud Workspace is in a locked state. Use when triaging or monitoring Terraform, Cloud workloads with skill template `terraform-cloud-workspace-lock-check`. +runtime: + runbook: runbook.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Terraform, Cloud] +resource_types: [] +access: read-only +--- + +# Terraform Cloud Workspace Lock Check + +## Summary + +Check whether the Terraform Cloud Workspace is in a locked state. + +See [README.md](README.md) for additional context. 
+ +## Tools + +### Checking whether the Terraform Cloud Workspace '${TERRAFORM_WORKSPACE_NAME}' is in a locked state + +Use curl to check whether the Terraform Cloud Workspace is in a locked state. + +- **Robot task name**: Checking whether the Terraform Cloud Workspace '${TERRAFORM_WORKSPACE_NAME}' is in a locked state +- **Robot file**: `runbook.robot` +- **Tags**: `access:read-only`, `terraform`, `cloud`, `workspace`, `lock`, `data:config` +- **Reads**: `TERRAFORM_API_TOKEN`, `TERRAFORM_API_URL`, `TERRAFORM_ORGANIZATION_NAME`, `TERRAFORM_WORKSPACE_NAME` +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `TERRAFORM_API_URL` | string | Base URL of the Terraform Cloud API that requests are sent to. | `https://app.terraform.io/api/v2` | no | +| `TERRAFORM_ORGANIZATION_NAME` | string | Name of the organization in Terraform Cloud. | _(empty)_ | yes | +| `TERRAFORM_WORKSPACE_NAME` | string | Name of the workspace in Terraform Cloud. | _(empty)_ | yes | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `TERRAFORM_API_TOKEN` | Bearer Token to use for authentication to Terraform Cloud API | yes | + +## Outputs + +_See Robot run output and platform report artifacts._ + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/terraform-cloud-workspace-lock-check/runbook.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/terraform-cloud-workspace-lock-check +export TERRAFORM_API_URL=... +export TERRAFORM_ORGANIZATION_NAME=... +export TERRAFORM_WORKSPACE_NAME=... 
+ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +_No standalone shell scripts in this bundle._ + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues diff --git a/codebundles/vercel-project-health/SKILL-TEMPLATE.md b/codebundles/vercel-project-health/SKILL-TEMPLATE.md new file mode 100644 index 000000000..9b9b7d1af --- /dev/null +++ b/codebundles/vercel-project-health/SKILL-TEMPLATE.md @@ -0,0 +1,317 @@ +--- +name: vercel-project-health +kind: skill-template +description: Vercel project health — project configuration snapshot, recent deployments with git branches, and unhealthy HTTP... Use when triaging or monitoring Vercel, HTTP, logs workloads with skill template ... +runtime: + runbook: runbook.robot + monitor: sli.robot + executor: worker + entrypoint: /home/runwhen/robot-runtime/runrobot.sh + base_image: rw-base-runtime +platforms: [Vercel, HTTP, logs, runtime, project, deployments] +resource_types: [] +access: read-only +--- + +# Vercel Project Health + +## Summary + +This CodeBundle inspects a Vercel project end-to-end: **project configuration** (sanitized), **recent deployments with git branches** and production readiness hints, **failed-deployment diagnostics** (real build error reasons), **production domain verification**, **historical HTTP traffic** (4xx, 5xx, optional codes) by route over a lookback window, and a complementary **synthetic HTTP probe**.... + +See [README.md](README.md) for additional context. + +## Tools + +### Fetch Vercel Project Configuration for Configured Project(s) + +GET /v9/projects — writes sanitized project metadata per project under VERCEL_ARTIFACT_DIR (see suite vars). 
+ +- **Robot task name**: Fetch Vercel Project Configuration for Configured Project(s) +- **Robot file**: `runbook.robot` +- **Tags**: `Vercel`, `config`, `access:read-only`, `data:logs-config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Report Vercel Deployment Branches and Status for Configured Project(s) + +Lists recent production and preview deployments (all targets), git branch and commit metadata, and summary hints such as latest production READY state. + +- **Robot task name**: Report Vercel Deployment Branches and Status for Configured Project(s) +- **Robot file**: `runbook.robot` +- **Tags**: `Vercel`, `deployments`, `git`, `access:read-only`, `data:logs-config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Diagnose Recent Failed Vercel Deployments for Configured Project(s) + +For each ERROR/CANCELED entry in the deployment-branches snapshot (capped by MAX_FAILED_DEPLOYMENTS_TO_DIAGNOSE), pulls GET /v13/deployments/{id} and surfaces the actual errorCode + errorMessage + branch + commit so on-call sees the real failure reason instead of just a count. + +- **Robot task name**: Diagnose Recent Failed Vercel Deployments for Configured Project(s) +- **Robot file**: `runbook.robot` +- **Tags**: `Vercel`, `deployments`, `diagnose`, `access:read-only`, `data:logs-config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Verify Vercel Project Production Domains for Configured Project(s) + +Calls GET /v9/projects/{id}/domains, separates production-bound hostnames from preview/custom-environment aliases, reports verification + redirect state, and raises one issue per unverified production domain (with the TXT/CNAME records the user needs to add). 
+ +- **Robot task name**: Verify Vercel Project Production Domains for Configured Project(s) +- **Robot file**: `runbook.robot` +- **Tags**: `Vercel`, `domains`, `access:read-only`, `data:logs-config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Resolve Vercel Deployments in Time Window for Configured Project(s) + +Lists deployments whose active interval overlaps the lookback window so log queries use relevant deployment IDs and warns when none cover the window. + +- **Robot task name**: Resolve Vercel Deployments in Time Window for Configured Project(s) +- **Robot file**: `runbook.robot` +- **Tags**: `Vercel`, `deployment`, `access:read-only`, `data:logs-config` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Collect Vercel Request Logs for Configured Project(s) + +Hits Vercel's historical request-logs endpoint (the same one the dashboard's "Logs" page uses) for the lookback window, paginates rows, and writes vercel_request_log_rows.json. The 4xx / 5xx / other aggregate tasks read this file directly instead of issuing more API calls. Filtered to VERCEL_REQUEST_LOGS_ENV (default: production) so we only score what real users hit. + +- **Robot task name**: Collect Vercel Request Logs for Configured Project(s) +- **Robot file**: `runbook.robot` +- **Tags**: `Vercel`, `HTTP`, `logs`, `access:read-only`, `data:logs` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Aggregate 4xx Paths from Vercel Request Logs for Configured Project(s) + +Reads the shared request-log rows and aggregates ALL 4xx responses (400-499) by code, path, and method. Surfaces 401/403/422/etc. that a 404-only filter would drop. 
+ +- **Robot task name**: Aggregate 4xx Paths from Vercel Request Logs for Configured Project(s) +- **Robot file**: `runbook.robot` +- **Tags**: `Vercel`, `HTTP`, `4xx`, `access:read-only`, `data:logs` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Aggregate 5xx Paths from Vercel Request Logs for Configured Project(s) + +Aggregates server-side HTTP errors (5xx) by code, path, and method from the shared request-log rows. + +- **Robot task name**: Aggregate 5xx Paths from Vercel Request Logs for Configured Project(s) +- **Robot file**: `runbook.robot` +- **Tags**: `Vercel`, `HTTP`, `5xx`, `access:read-only`, `data:logs` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Aggregate Other Unhealthy HTTP Codes from Vercel Request Logs for Configured Project(s) + +Aggregates additional client error codes configured in UNHEALTHY_HTTP_CODES (for example 408 and 429) by code, path, and method from the shared request-log rows. + +- **Robot task name**: Aggregate Other Unhealthy HTTP Codes from Vercel Request Logs for Configured Project(s) +- **Robot file**: `runbook.robot` +- **Tags**: `Vercel`, `HTTP`, `errors`, `access:read-only`, `data:logs` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Build Consolidated Vercel HTTP Error Summary for Configured Project(s) + +Merges per-code summaries, applies MIN_REQUEST_COUNT_THRESHOLD for noise reduction, and emits consolidated JSON plus a top-routes table for reporting. 
+ +- **Robot task name**: Build Consolidated Vercel HTTP Error Summary for Configured Project(s) +- **Robot file**: `runbook.robot` +- **Tags**: `Vercel`, `HTTP`, `summary`, `access:read-only`, `data:logs` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +### Probe Production URL Paths for Configured Project(s) + +Synthetic HTTP GET probe against configurable paths on the latest production URL. Catches what historical logs miss (DNS / cert / cold-start timeouts, regional CDN issues, no-traffic blind spots) and complements the request-logs aggregations. Configure VERCEL_PROBE_PATHS, VERCEL_PROBE_BASE_URL (optional override), VERCEL_PROBE_TIMEOUT_SECONDS, VERCEL_PROBE_SLOW_MS. + +- **Robot task name**: Probe Production URL Paths for Configured Project(s) +- **Robot file**: `runbook.robot` +- **Tags**: `Vercel`, `HTTP`, `probe`, `access:read-only`, `data:probe` +- **Reads**: — +- **Writes**: — +- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail + + +## Monitor + +Measures Vercel project health across eight binary sub-signals — API reachability, latest production deployment READY, recent deployment failure ratio, production-branch match, latest production deployment fresh, production alias is current (no rollback in progress), production domains verified, and a capped runtime HTTP error sample. Averages them into a primary score between 0 (failing) and 1 (healthy). + +- **Robot file**: `sli.robot` +- **Score range**: `0.0` (failing) to `1.0` (healthy) +- **Aggregation**: arithmetic mean of the sub-checks below +- **Recommended interval**: `180s` + +### Sub-checks + +#### Score Vercel Project API Reachability + +Binary score: 1 when GET /v9/projects/{id} returns the configured project for the current token + team scope, 0 otherwise. 
+ +- **Robot task name**: Score Vercel Project API Reachability +- **Sub-metric name**: `vercel_api_ok` +- **Tags**: `Vercel`, `sli`, `access:read-only`, `data:metrics` +- **Reads**: `VERCEL_PROJECT_ID`, `VERCEL_TEAM_ID` + + +#### Score Vercel Deployment Health Signals + +Five lightweight signals derived from a single GET /v9/projects/{id} call: latest production deployment is READY; recent ERROR/CANCELED count is at or below SLI_MAX_RECENT_FAILED_DEPLOYMENTS; link.productionBranch matches EXPECTED_PRODUCTION_BRANCH (when configured); the latest production deployment is fresher than SLI_MAX_PRODUCTION_AGE_HOURS; and project.targets.production points at the newest READY production deployment (alias-current / no rollback in progress). Pushes five sub-metrics from one API call. + +- **Robot task name**: Score Vercel Deployment Health Signals +- **Sub-metric name**: `production_deployment_ready` +- **Underlying script**: `sli-vercel-deployment-health-score.sh` +- **Tags**: `Vercel`, `sli`, `access:read-only`, `data:metrics` +- **Reads**: `EXPECTED_PRODUCTION_BRANCH`, `SLI_MAX_PRODUCTION_AGE_HOURS`, `SLI_MAX_RECENT_FAILED_DEPLOYMENTS`, `VERCEL_PROJECT_ID` + + +#### Score Vercel Domain Verification + +Binary score: 1 when every production-bound domain attached to the project is verified, 0 if any production domain has verified=false. Calls GET /v9/projects/{id}/domains once per SLI run. Branch-bound preview aliases and custom-environment domains are excluded. + +- **Robot task name**: Score Vercel Domain Verification +- **Sub-metric name**: `domains_verified_ok` +- **Tags**: `Vercel`, `sli`, `access:read-only`, `data:metrics` +- **Reads**: `VERCEL_PROJECT_ID` +- **Pass condition**: `(${total} == 0 or len(${unverified}) == 0)` + + +#### Score Vercel Runtime Error Sample + +Binary score: 1 when error-class (status >= 400) rows in a capped sample of the historical request-logs endpoint stay at or below SLI_MAX_ERROR_EVENTS, 0 otherwise. 
Backed by GET https://vercel.com/api/logs/request-logs (the same endpoint the dashboard's Logs page uses) — NOT the live-tail /v1/runtime-logs endpoint. + +- **Robot task name**: Score Vercel Runtime Error Sample +- **Sub-metric name**: `runtime_error_sample` +- **Tags**: `Vercel`, `sli`, `access:read-only`, `data:metrics` +- **Reads**: `SLI_LOOKBACK_HOURS`, `SLI_MAX_ERROR_EVENTS`, `SLI_MAX_ROWS`, `VERCEL_PROJECT_ID`, `VERCEL_REQUEST_LOGS_ENV` +- **Pass condition**: `${count} <= ${threshold}` + + +## Inputs + +| Name | Type | Description | Default | Required | +|---|---|---|---|---| +| `VERCEL_TEAM_ID` | string | Vercel team slug or ID; leave empty for hobby projects scoped to the token owner | _(empty)_ | no | +| `VERCEL_PROJECT_ID` | string | Single Vercel project ID (prj_...); ignored when VERCEL_PROJECT_IDS is non-empty | _(empty)_ | yes (unless `VERCEL_PROJECT_IDS` is set) | +| `VERCEL_PROJECT_IDS` | string | Optional comma-separated project IDs for multi-project runs (overrides single ID when set) | _(empty)_ | no | +| `VERCEL_ARTIFACT_ROOT` | string | Parent directory for per-project JSON outputs when multiple projects are configured | `.vercel-health-projects` | no | +| `TIME_WINDOW_HOURS` | string | Lookback hours for log aggregation | `24` | no | +| `DEPLOYMENT_ENVIRONMENT` | string | production, preview, or all deployments when resolving IDs | `production` | no | +| `UNHEALTHY_HTTP_CODES` | string | Comma-separated extra HTTP status codes for the other-errors task | `408,429` | no | +| `MIN_REQUEST_COUNT_THRESHOLD` | string | Minimum requests per path before treating counts as high-severity in the summary | `5` | no | +| `VERCEL_REQUEST_LOGS_ENV` | string | Filter passed to the historical request-logs endpoint. Use 'production' (default) to score only what real users hit, 'preview' for branch deployments, or 'all' to combine. | `production` | no | +| `VERCEL_REQUEST_LOGS_MAX_ROWS` | string | Cap on rows fetched from the historical request-logs endpoint per project per run. Stops paginating once reached. 
| `5000` | no | +| `VERCEL_REQUEST_LOGS_MAX_PAGES` | string | Hard cap on pages walked even when hasMoreRows=true; bounds wall-clock for very busy projects. | `20` | no | +| `VERCEL_PROBE_PATHS` | string | Comma-separated paths to synthetic-probe against the production URL. Empty disables the probe task. | `/` | no | +| `VERCEL_PROBE_BASE_URL` | string | Optional explicit base URL for the synthetic probe; auto-resolved from the latest READY production deployment when empty. | _(empty)_ | no | +| `VERCEL_PROBE_TIMEOUT_SECONDS` | string | Per-request timeout for the synthetic probe (seconds). | `10` | no | +| `VERCEL_PROBE_SLOW_MS` | string | Probe latency threshold in ms; requests slower than this raise an informational issue. | `2000` | no | +| `DEPLOYMENT_SNAPSHOT_LIMIT` | string | Maximum deployments to include in the branch/status snapshot (most recent first) | `25` | no | +| `MAX_FAILED_DEPLOYMENTS_TO_DIAGNOSE` | string | Maximum recent ERROR/CANCELED deployments to enrich with build-error reason via GET /v13/deployments/{id}. Each adds one API call, so keep this small. | `2` | no | +| `MAX_DEPLOYMENTS_TO_SCAN` | string | Maximum READY deployments to keep when resolving the lookback window for log scans. | `10` | no | +| `SLI_LOOKBACK_HOURS` | string | Lookback window (hours) for the error-sample SLI. Defaults to TIME_WINDOW_HOURS when unset. | `24` | no | +| `SLI_MAX_ROWS` | string | Cap on rows fetched from the request-logs endpoint per SLI run. Bounds wall-clock for very busy projects. | `200` | no | +| `SLI_MAX_ERROR_EVENTS` | string | Maximum allowed HTTP 4xx/5xx events in the request-logs sample before the runtime_error_sample sub-score drops to 0. 
| `25` | no | +| `SLI_MAX_RECENT_FAILED_DEPLOYMENTS` | string | Allowed ERROR/CANCELED deployments in project.latestDeployments before the recent-failures SLI scores 0 | `1` | no | +| `SLI_MAX_PRODUCTION_AGE_HOURS` | string | Maximum hours since the latest production deployment before the production_deployment_fresh sub-score drops to 0 (default 168h / 7 days). Catches projects whose main branch has drifted far ahead of what is actually live. | `168` | no | +| `EXPECTED_PRODUCTION_BRANCH` | string | Optional expected production branch; when set, the production-branch SLI scores 0 if Vercel's link.productionBranch differs. Leave blank to skip the check. | _(empty)_ | no | + +## Secrets + +| Name | Description | Required | +|---|---|---| +| `vercel_token` | Vercel API bearer token with read access to project and deployment logs | yes | + +## Outputs + +- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot` + +## How to invoke + +### Production (RunWhen runner / worker) + +The platform **runner** schedules work on a location **worker**. The worker +image (`rw-base-runtime`) executes Robot via `runrobot.sh` with +`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`. + +- **Runbook**: `codebundles/vercel-project-health/runbook.robot` +- **Monitor**: `codebundles/vercel-project-health/sli.robot` + +### Local development (devcontainer only) + +`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime. + +```bash +cd codebundles/vercel-project-health +export VERCEL_TEAM_ID=... +export VERCEL_PROJECT_ID=... +export VERCEL_PROJECT_IDS=... +export VERCEL_ARTIFACT_ROOT=... +export TIME_WINDOW_HOURS=... +ro runbook.robot +``` + +### Standalone scripts (no Robot) + + +Set the input variables above, then run the matching script: + +```bash +cd codebundles/vercel-project-health +export VERCEL_TEAM_ID=... +export VERCEL_PROJECT_ID=... +export VERCEL_PROJECT_IDS=... 
+bash aggregate-vercel-4xx-paths.sh +bash aggregate-vercel-5xx-paths.sh +bash aggregate-vercel-other-error-paths.sh +bash collect-vercel-request-logs.sh +bash diagnose-recent-failed-deployments.sh +bash probe-vercel-production-urls.sh +bash report-vercel-deployment-branches.sh +bash report-vercel-http-error-summary.sh +bash report-vercel-project-config.sh +bash report-vercel-project-domains.sh +bash resolve-vercel-deployments-in-window.sh +bash sli-vercel-deployment-health-score.sh +# ... and 1 more script +``` + +## Source files + +- `runbook.robot` — orchestrates tools and raises issues +- `sli.robot` — monitor scoring (`sli.robot` runtime file) +- `aggregate-vercel-4xx-paths.sh` — Bash helper script `aggregate-vercel-4xx-paths.sh`. +- `aggregate-vercel-5xx-paths.sh` — Bash helper script `aggregate-vercel-5xx-paths.sh`. +- `aggregate-vercel-other-error-paths.sh` — Bash helper script `aggregate-vercel-other-error-paths.sh`. +- `collect-vercel-request-logs.sh` — Bash helper script `collect-vercel-request-logs.sh`. +- `diagnose-recent-failed-deployments.sh` — Bash helper script `diagnose-recent-failed-deployments.sh`. +- `probe-vercel-production-urls.sh` — Bash helper script `probe-vercel-production-urls.sh`. +- `report-vercel-deployment-branches.sh` — Bash helper script `report-vercel-deployment-branches.sh`. +- `report-vercel-http-error-summary.sh` — Bash helper script `report-vercel-http-error-summary.sh`. +- `report-vercel-project-config.sh` — Bash helper script `report-vercel-project-config.sh`. +- `report-vercel-project-domains.sh` — Bash helper script `report-vercel-project-domains.sh`. +- `resolve-vercel-deployments-in-window.sh` — Bash helper script `resolve-vercel-deployments-in-window.sh`. +- `sli-vercel-deployment-health-score.sh` — Bash helper script `sli-vercel-deployment-health-score.sh`. +- `vercel-helpers.sh` — Bash helper script `vercel-helpers.sh`.
diff --git a/scripts/generate_skill_md.py b/scripts/generate_skill_md.py new file mode 100755 index 000000000..2576cbd6c --- /dev/null +++ b/scripts/generate_skill_md.py @@ -0,0 +1,736 @@ +#!/usr/bin/env python3 +"""Generate SKILL-TEMPLATE.md manifests for CodeBundles from runbook.robot / sli.robot.""" + +from __future__ import annotations + +import argparse +import re +import sys +from dataclasses import dataclass, field +from pathlib import Path + +OUTPUT_FILENAME = "SKILL-TEMPLATE.md" +LEGACY_FILENAME = "SKILL.md" + + +@dataclass +class ImportField: + name: str + kind: str # "variable" | "secret" + type_: str = "string" + description: str = "" + default: str | None = None + + @property + def required(self) -> bool: + if self.kind == "secret": + return True + return self.default is None or self.default == "" + + +@dataclass +class RobotTask: + name: str + robot_file: str + documentation: str = "" + tags: list[str] = field(default_factory=list) + bash_file: str | None = None + sub_metric: str | None = None + pass_condition: str | None = None + json_writes: list[str] = field(default_factory=list) + env_reads: set[str] = field(default_factory=set) + + +@dataclass +class RobotFile: + path: Path + documentation: str = "" + display_name: str = "" + supports: list[str] = field(default_factory=list) + tasks: list[RobotTask] = field(default_factory=list) + imports: list[ImportField] = field(default_factory=list) + + +SECTION_RE = re.compile(r"^\*\*\* (.+?) \*\*\*$") +METADATA_RE = re.compile(r"^Metadata\s+(\S+)\s+(.*)$") +DOC_LINE_RE = re.compile(r"^\s+\[Documentation\]\s+(.*)$") +TAGS_RE = re.compile(r"^\s+\[Tags\]\s+(.*)$") +BASH_FILE_RE = re.compile(r"^\s+\.\.\.\s+bash_file=([^\s]+)") +SUB_NAME_RE = re.compile(r"RW\.Core\.Push Metric.*sub_name=([^\s\]]+)") +PASS_EVAL_RE = re.compile( + r"\$\{[^}]+\}=\s+Evaluate\s+1 if (.+?) 
else 0", re.IGNORECASE +) +CAT_JSON_RE = re.compile(r"cat\s+([a-zA-Z0-9_.-]+\.json)") +ENV_VAR_RE = re.compile(r"\$\{([A-Z][A-Z0-9_]*)\}") +IMPORT_VAR_RE = re.compile(r"RW\.Core\.Import User Variable\s+(\S+)") +IMPORT_SECRET_RE = re.compile(r"RW\.Core\.Import Secret\s+(\S+)") +CONTINUATION_KV = re.compile(r"^\s+\.\.\.\s+(\w+)=(.*)$") + + +def _split_sections(text: str) -> dict[str, list[str]]: + sections: dict[str, list[str]] = {} + current = "_preamble" + sections[current] = [] + for line in text.splitlines(): + m = SECTION_RE.match(line.strip()) + if m: + current = m.group(1) + sections.setdefault(current, []) + else: + sections.setdefault(current, []).append(line) + return sections + + +def _parse_settings(lines: list[str]) -> tuple[str, str, list[str]]: + documentation = "" + display_name = "" + supports: list[str] = [] + for line in lines: + stripped = line.strip() + if stripped.startswith("Documentation"): + documentation = stripped.split("Documentation", 1)[-1].strip() + continue + if not stripped.startswith("Metadata"): + continue + rest = stripped[len("Metadata") :].strip() + if rest.startswith("Display Name"): + display_name = rest[len("Display Name") :].strip() + elif rest.startswith("Supports"): + raw = rest[len("Supports") :].strip() + if "," in raw: + supports = [s.strip() for s in raw.split(",") if s.strip()] + else: + supports = [s for s in raw.split() if s] + return documentation, display_name, supports + + +def _parse_import_blocks(lines: list[str]) -> list[ImportField]: + imports: list[ImportField] = [] + i = 0 + while i < len(lines): + line = lines[i] + var_m = IMPORT_VAR_RE.search(line) + sec_m = IMPORT_SECRET_RE.search(line) + if not var_m and not sec_m: + i += 1 + continue + name = (var_m or sec_m).group(1) + kind = "variable" if var_m else "secret" + type_ = "string" + description = "" + default: str | None = None + i += 1 + while i < len(lines): + cont = CONTINUATION_KV.match(lines[i]) + if not cont: + break + key, val = cont.group(1), 
cont.group(2).strip() + if key == "type": + type_ = val + elif key == "description": + description = val + elif key == "default": + default = val + i += 1 + imports.append( + ImportField( + name=name, + kind=kind, + type_=type_, + description=description, + default=default, + ) + ) + return imports + + +def _parse_tasks(lines: list[str], robot_file: str) -> list[RobotTask]: + tasks: list[RobotTask] = [] + i = 0 + while i < len(lines): + line = lines[i] + stripped = line.strip() + if ( + not stripped + or stripped.startswith("[") + or stripped.startswith("IF") + or stripped.startswith("FOR") + or stripped.startswith("END") + or stripped.startswith("${") + or stripped.startswith("...") + or stripped.startswith("#") + or stripped.startswith("RW.") + or stripped.startswith("Run ") + or stripped.startswith("Log") + or stripped.startswith("Set ") + or stripped.startswith("RETURN") + ): + i += 1 + continue + if line.startswith(" ") or line.startswith("\t"): + i += 1 + continue + if SECTION_RE.match(stripped): + i += 1 + continue + + task_name = stripped + doc = "" + tags: list[str] = [] + bash_file: str | None = None + sub_metric: str | None = None + pass_condition: str | None = None + json_writes: list[str] = [] + env_reads: set[str] = set() + i += 1 + block: list[str] = [] + while i < len(lines): + nxt = lines[i] + if ( + nxt.strip() + and not nxt.startswith(" ") + and not nxt.startswith("\t") + and not SECTION_RE.match(nxt.strip()) + and not nxt.strip().startswith("[") + ): + break + block.append(nxt) + i += 1 + + block_text = "\n".join(block) + for bline in block: + dm = DOC_LINE_RE.match(bline) + if dm: + doc = dm.group(1).strip() + tm = TAGS_RE.match(bline) + if tm: + tags = [t.strip() for t in tm.group(1).split() if t.strip()] + bm = BASH_FILE_RE.match(bline) + if bm: + bash_file = bm.group(1) + pm = PASS_EVAL_RE.search(bline) + if pm: + pass_condition = pm.group(1).strip() + for jm in CAT_JSON_RE.findall(bline): + if jm not in json_writes: + json_writes.append(jm) 
+ env_reads.update(ENV_VAR_RE.findall(bline)) + sm = SUB_NAME_RE.search(block_text) + if sm: + sub_metric = sm.group(1) + + tasks.append( + RobotTask( + name=task_name, + robot_file=robot_file, + documentation=doc, + tags=tags, + bash_file=bash_file, + sub_metric=sub_metric, + pass_condition=pass_condition, + json_writes=json_writes, + env_reads=env_reads, + ) + ) + return tasks + + +def parse_robot(path: Path) -> RobotFile: + text = path.read_text(encoding="utf-8", errors="replace") + sections = _split_sections(text) + settings = sections.get("Settings", []) + doc, display, supports = _parse_settings(settings) + keywords = sections.get("Keywords", []) + tasks_lines = sections.get("Tasks", []) + imports = _parse_import_blocks(keywords + settings) + tasks = _parse_tasks(tasks_lines, path.name) + return RobotFile( + path=path, + documentation=doc, + display_name=display, + supports=supports, + tasks=tasks, + imports=imports, + ) + + +def _infer_resource_types(bundle_name: str, supports: list[str]) -> list[str]: + name = bundle_name.lower() + mapping = [ + ("aks", "aks_cluster"), + ("eks", "eks_cluster"), + ("gke", "gke_cluster"), + ("deployment", "deployment"), + ("namespace", "namespace"), + ("ingress", "ingress"), + ("statefulset", "statefulset"), + ("daemonset", "daemonset"), + ("pvc", "persistent_volume_claim"), + ("pod", "pod"), + ("lambda", "lambda_function"), + ("sqs", "sqs_queue"), + ("elasticache", "elasticache_cluster"), + ("s3", "s3_bucket"), + ("ec2", "ec2_instance"), + ("keyvault", "key_vault"), + ("key-vault", "key_vault"), + ("kv-", "key_vault"), + ("appservice", "app_service"), + ("appgateway", "application_gateway"), + ("loadbalancer", "load_balancer"), + ("servicebus", "service_bus"), + ("storage", "storage_account"), + ("acr", "container_registry"), + ("adf", "data_factory"), + ("databricks", "databricks_workspace"), + ("devops", "azure_devops"), + ("subscription", "subscription"), + ("vm-", "virtual_machine"), + ("virtual-machine", 
"virtual_machine"), + ] + found: list[str] = [] + for needle, rtype in mapping: + if needle in name and rtype not in found: + found.append(rtype) + if not found: + if any(p.lower() in ("kubernetes", "aks", "eks", "gke", "openshift") for p in supports): + found.append("kubernetes_resource") + elif any(p.lower() == "azure" for p in supports): + found.append("azure_resource") + elif any(p.lower() == "aws" for p in supports): + found.append("aws_resource") + elif any(p.lower() == "gcp" for p in supports): + found.append("gcp_resource") + return found[:3] + + +def _infer_access(all_tags: list[list[str]]) -> str: + flat = [t for tags in all_tags for t in tags] + if not flat: + return "read-only" + if any("access:read-write" in t or "access:write" in t for t in flat): + return "read-write" + if any("access:read-only" in t for t in flat): + return "read-only" + return "read-write" if any("write" in t.lower() for t in flat) else "read-only" + + +def _readme_summary(readme: Path) -> str: + if not readme.exists(): + return "" + lines = readme.read_text(encoding="utf-8", errors="replace").splitlines() + body: list[str] = [] + for line in lines: + if line.startswith("#"): + continue + if line.strip(): + body.append(line.strip()) + if len(body) >= 2: + break + return " ".join(body)[:500] + + +def _build_description( + bundle_name: str, + display_name: str, + robot_doc: str, + readme_summary: str, + supports: list[str], +) -> str: + base = robot_doc or (readme_summary[:120] if readme_summary else "") or display_name + base = re.sub(r"\s+", " ", base).strip() + if len(base) > 120: + base = base[:117].rsplit(" ", 1)[0] + "..." + if base and not base.endswith("."): + base += "." + platform_hint = ", ".join(supports[:3]) if supports else "RunWhen" + trigger = ( + f"Use when triaging or monitoring {platform_hint} workloads " + f"with skill template `{bundle_name}`." + ) + desc = f"{base} {trigger}".strip() + if len(desc) > 200: + desc = desc[:197] + "..." 
+ return desc + + +def _robot_name_field(task_name: str) -> str: + escaped = task_name.replace("|", "\\|") + return f'{escaped}' + + +def _dedupe_imports(imports: list[ImportField]) -> list[ImportField]: + seen: set[str] = set() + out: list[ImportField] = [] + for imp in imports: + if imp.name in seen: + continue + seen.add(imp.name) + out.append(imp) + return out + + +def _collect_json_outputs(runbook: RobotFile | None, monitor: RobotFile | None) -> list[str]: + files: list[str] = [] + for rf in (runbook, monitor): + if not rf: + continue + for task in rf.tasks: + for j in task.json_writes: + if j not in files: + files.append(j) + return files + + +def _list_shell_scripts(bundle_dir: Path) -> list[Path]: + return sorted( + p + for p in bundle_dir.iterdir() + if p.is_file() + and p.suffix == ".sh" + and p.name != "auth.sh" + ) + + +def _script_purpose(name: str) -> str: + return f"Bash helper script `{name}`." + + +def _format_tool(task: RobotTask, import_names: set[str]) -> str: + reads = sorted(task.env_reads & import_names) or sorted(task.env_reads) + writes = ", ".join(f"`{j}`" for j in task.json_writes) or "—" + tags = ", ".join(f"`{t}`" for t in task.tags) or "—" + script_line = ( + f"- **Underlying script**: `{task.bash_file}`\n" + if task.bash_file + else "" + ) + return ( + f"### {task.name}\n\n" + f"{task.documentation or '_No task documentation in Robot source._'}\n\n" + f"- **Robot task name**: {_robot_name_field(task.name)}\n" + f"- **Robot file**: `{task.robot_file}`\n" + f"{script_line}" + f"- **Tags**: {tags}\n" + f"- **Reads**: {', '.join(f'`{r}`' for r in reads) or '—'}\n" + f"- **Writes**: {writes}\n" + f"- **Issues raised**: issues reported via `RW.Core.Add Issue` when checks fail\n" + ) + + +def _format_monitor_subcheck(task: RobotTask, import_names: set[str]) -> str: + reads = sorted(task.env_reads & import_names) or sorted(task.env_reads) + tags = ", ".join(f"`{t}`" for t in task.tags) or "—" + script_line = ( + f"- **Underlying script**: 
def generate_skill_md(bundle_dir: Path) -> str | None:
    """Render the skill-template Markdown document for one codebundle.

    The document is YAML front matter followed by the sections Summary,
    Tools (runbook tasks), Monitor (``sli.robot`` sub-checks), Inputs,
    Secrets, Outputs, How to invoke and Source files.

    Args:
        bundle_dir: Bundle directory; ``runbook.robot``, ``sli.robot`` and
            ``README.md`` are looked up directly inside it.

    Returns:
        The document text ending in exactly one newline, or ``None`` when
        the bundle has neither ``runbook.robot`` nor ``sli.robot``.
    """
    import json  # local import: only needed to quote the YAML description

    bundle_name = bundle_dir.name
    runbook_path = bundle_dir / "runbook.robot"
    monitor_path = bundle_dir / "sli.robot"

    if not runbook_path.exists() and not monitor_path.exists():
        return None

    runbook = parse_robot(runbook_path) if runbook_path.exists() else None
    monitor = parse_robot(monitor_path) if monitor_path.exists() else None

    # Runbook metadata takes precedence over monitor metadata throughout.
    display_name = (
        (runbook.display_name if runbook else "")
        or (monitor.display_name if monitor else "")
        or bundle_name.replace("-", " ").title()
    )
    supports = (runbook.supports if runbook else []) or (
        monitor.supports if monitor else []
    )
    robot_doc = (runbook.documentation if runbook else "") or (
        monitor.documentation if monitor else ""
    )
    readme_summary = _readme_summary(bundle_dir / "README.md")
    if readme_summary and len(readme_summary) > 400:
        readme_summary = readme_summary[:397].rsplit(" ", 1)[0] + "..."
    description = _build_description(
        bundle_name, display_name, robot_doc, readme_summary, supports
    )

    all_imports = _dedupe_imports(
        (runbook.imports if runbook else []) + (monitor.imports if monitor else [])
    )
    import_names = {i.name for i in all_imports}
    variables = [i for i in all_imports if i.kind == "variable"]

    all_tags: list[list[str]] = []
    if runbook:
        all_tags.extend(t.tags for t in runbook.tasks)
    if monitor:
        all_tags.extend(t.tags for t in monitor.tasks)

    # Drop repeated platform names while keeping their first-seen order.
    platforms = list(dict.fromkeys(supports)) or ["RunWhen"]
    resource_types = _infer_resource_types(bundle_name, supports)
    access = _infer_access(all_tags)

    # First README sentence, terminated exactly once.
    summary = ""
    if readme_summary:
        summary = readme_summary.split(". ")[0].rstrip()
        if summary and not summary.endswith((".", "!", "?")):
            summary += "."
    summary = (
        summary
        or robot_doc
        or f"Skill template `{bundle_name}` for RunWhen agents."
    )

    lines: list[str] = [
        "---",
        f"name: {bundle_name}",
        "kind: skill-template",
        # The description routinely contains ": ", which is illegal in a plain
        # YAML scalar. A JSON string is a valid YAML double-quoted scalar.
        f"description: {json.dumps(description, ensure_ascii=False)}",
        "runtime:",
    ]
    # NOTE(review): nested keys are written with two-space indentation; the
    # reviewed copy of this file had its whitespace collapsed — confirm.
    if runbook_path.exists():
        lines.append("  runbook: runbook.robot")
    if monitor_path.exists():
        lines.append("  monitor: sli.robot")
    lines.extend(
        [
            "  executor: worker",
            "  entrypoint: /home/runwhen/robot-runtime/runrobot.sh",
            "  base_image: rw-base-runtime",
            f"platforms: [{', '.join(platforms)}]",
            f"resource_types: [{', '.join(resource_types)}]",
            f"access: {access}",
            "---",
            "",
            f"# {display_name}",
            "",
            "## Summary",
            "",
            summary,
            "",
            "See [README.md](README.md) for additional context.",
            "",
        ]
    )

    if runbook and runbook.tasks:
        lines.append("## Tools")
        lines.append("")
        for task in runbook.tasks:
            lines.append(_format_tool(task, import_names))
            lines.append("")

    if monitor:
        lines.extend(["## Monitor", ""])
        if monitor.documentation:
            lines.append(monitor.documentation)
            lines.append("")
        lines.extend(
            [
                "- **Robot file**: `sli.robot`",
                "- **Score range**: `0.0` (failing) to `1.0` (healthy)",
                "- **Aggregation**: arithmetic mean of the sub-checks below",
                "- **Recommended interval**: `180s`",
                "",
                "### Sub-checks",
                "",
            ]
        )
        # Prefer tasks that push a named sub-metric; otherwise fall back to
        # any task that carries documentation, tags or a script.
        subchecks = [
            t
            for t in monitor.tasks
            if t.sub_metric or "Push Metric" in " ".join([t.documentation, t.name])
        ]
        if not subchecks:
            subchecks = [
                t
                for t in monitor.tasks
                if t.documentation or t.tags or t.bash_file
            ]
        for task in subchecks:
            if not task.sub_metric:
                # skip aggregate score task (no sub_name)
                if "Generate" in task.name and "Score" in task.name:
                    continue
            lines.append(_format_monitor_subcheck(task, import_names))
            lines.append("")
        if not subchecks:
            lines.append(
                "_Monitor tasks are defined in `sli.robot`; see source for sub-check details._\n"
            )

    lines.extend(["## Inputs", ""])
    # Keyed on variables, not on all imports, so a bundle that imports only
    # secrets does not get a table header with no rows.
    if variables:
        lines.extend(
            [
                "| Name | Type | Description | Default | Required |",
                "|---|---|---|---|---|",
            ]
        )
        for imp in variables:
            if imp.default is not None:
                default = "`" + str(imp.default).replace("|", "\\|") + "`"
            else:
                default = "—"
            req = "no" if not imp.required else "yes"
            desc = imp.description.replace("|", "\\|")
            lines.append(
                f"| `{imp.name}` | {imp.type_} | {desc} | {default} | {req} |"
            )
    else:
        lines.append("_No user variables imported in Robot source._")
    lines.append("")

    lines.extend(["## Secrets", ""])
    secrets = [i for i in all_imports if i.kind == "secret"]
    if secrets:
        lines.extend(
            ["| Name | Description | Required |", "|---|---|---|"]
        )
        for imp in secrets:
            desc = imp.description.replace("|", "\\|") or "—"
            lines.append(f"| `{imp.name}` | {desc} | yes |")
    else:
        lines.append("_No secrets imported in Robot source._")
    lines.append("")

    outputs = _collect_json_outputs(runbook, monitor)
    lines.extend(["## Outputs", ""])
    if monitor:
        lines.append("- Monitor health score (`0.0`–`1.0`) pushed by `sli.robot`")
    for j in outputs:
        lines.append(f"- `{j}`")
    if not outputs and not monitor:
        lines.append("_See Robot run output and platform report artifacts._")
    lines.append("")

    invoke_paths: list[str] = []
    if runbook_path.exists():
        invoke_paths.append(
            f"- **Runbook**: `codebundles/{bundle_name}/runbook.robot`"
        )
    if monitor_path.exists():
        invoke_paths.append(
            f"- **Monitor**: `codebundles/{bundle_name}/sli.robot`"
        )
    lines.extend(
        [
            "## How to invoke",
            "",
            "### Production (RunWhen runner / worker)",
            "",
            "The platform **runner** schedules work on a location **worker**. The worker",
            "image (`rw-base-runtime`) executes Robot via `runrobot.sh` with",
            "`RW_PATH_TO_ROBOT` set to the bound path under `/home/runwhen/collection/`.",
            "",
            *invoke_paths,
            "",
            "### Local development (devcontainer only)",
            "",
            "`ro` is a dev-time wrapper in `codecollection-devtools` — not the enterprise runtime.",
            "",
            "```bash",
            f"cd codebundles/{bundle_name}",
        ]
    )
    for imp in all_imports[:6]:
        if imp.kind == "variable":
            lines.append(f"export {imp.name}=...")
    if runbook_path.exists():
        lines.append("ro runbook.robot")
    elif monitor_path.exists():
        lines.append("ro sli.robot")
    lines.extend(["```", "", "### Standalone scripts (no Robot)", "", ""])
    scripts = _list_shell_scripts(bundle_dir)
    if scripts:
        lines.append(
            "Set the input variables above, then run the matching script:"
        )
        lines.append("")
        lines.append("```bash")
        lines.append(f"cd codebundles/{bundle_name}")
        for imp in all_imports[:4]:
            if imp.kind == "variable":
                lines.append(f"export {imp.name}=...")
        for script in scripts[:12]:
            lines.append(f"bash {script.name}")
        if len(scripts) > 12:
            lines.append(f"# ... and {len(scripts) - 12} more scripts")
        lines.append("```")
    else:
        lines.append("_No standalone shell scripts in this bundle._")
    lines.append("")

    lines.extend(["## Source files", ""])
    if runbook_path.exists():
        lines.append("- `runbook.robot` — orchestrates tools and raises issues")
    if monitor_path.exists():
        lines.append("- `sli.robot` — monitor scoring (`sli.robot` runtime file)")
    for script in scripts:
        lines.append(f"- `{script.name}` — {_script_purpose(script.name)}")
    lines.append("")

    return "\n".join(lines).rstrip() + "\n"
def main() -> int:
    """Generate the skill-template file for every bundle under ``codebundles/``.

    Command line:
        codecollection: Repository root (default ``/home/runwhen/codecollection``).
        --dry-run: Report what would be written or removed; touch nothing.
        --bundle NAME: Restrict the run to the named bundle (repeatable).

    Returns:
        ``1`` when the ``codebundles`` directory is missing or any bundle
        failed, otherwise ``0``. A failing bundle is reported on stderr and
        does not stop the remaining bundles.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "codecollection",
        type=Path,
        nargs="?",
        default=Path("/home/runwhen/codecollection"),
    )
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--bundle", action="append", help="Only process named bundles")
    args = parser.parse_args()

    root = args.codecollection / "codebundles"
    if not root.is_dir():
        print(f"Not found: {root}", file=sys.stderr)
        return 1

    bundles = sorted(
        d for d in root.iterdir() if d.is_dir() and d.name != "CURSOR_RULES_README.md"
    )
    if args.bundle:
        names = set(args.bundle)
        bundles = [d for d in bundles if d.name in names]
        # A mistyped --bundle would otherwise look like a successful empty run.
        for missing in sorted(names - {d.name for d in bundles}):
            print(f"WARNING no such bundle: {missing}", file=sys.stderr)

    written = skipped = errors = 0
    for bundle_dir in bundles:
        try:
            content = generate_skill_md(bundle_dir)
            if content is None:
                skipped += 1
                continue
            out = bundle_dir / OUTPUT_FILENAME
            legacy = bundle_dir / LEGACY_FILENAME
            if args.dry_run:
                # Report encoded size: the text contains non-ASCII dashes, so
                # the character count is not the byte count.
                size = len(content.encode("utf-8"))
                print(f"would write {out} ({size} bytes)")
                # Same condition as the real run, so the preview is accurate.
                if legacy.exists() and legacy != out:
                    print(f"would remove legacy {legacy}")
            else:
                out.write_text(content, encoding="utf-8")
                if legacy.exists() and legacy != out:
                    legacy.unlink()
                print(f"wrote {out}")
            written += 1
        except Exception as exc:  # noqa: BLE001
            # Top-level boundary: keep going so one bad bundle does not block
            # the rest; the failure is reflected in the exit status.
            print(f"ERROR {bundle_dir.name}: {exc}", file=sys.stderr)
            errors += 1

    print(f"\nDone: {written} written, {skipped} skipped, {errors} errors")
    return 1 if errors else 0


if __name__ == "__main__":
    sys.exit(main())