23seriy · 23seriy · May 15, 2026 · May 14, 2026 · May 15, 2026
diff --git a/.github/.markdownlint.json b/.github/.markdownlint.json
@@ -0,0 +1,8 @@
+{
+  "default": true,
+  "MD013": false,
+  "MD033": false,
+  "MD041": false,
+  "MD024": { "siblings_only": true },
+  "MD046": { "style": "fenced" }
+}
diff --git a/.github/mlc-config.json b/.github/mlc-config.json
@@ -0,0 +1,11 @@
+{
+  "ignorePatterns": [
+    {
+      "pattern": "^http://localhost"
+    },
+    {
+      "pattern": "^http://prometheus"
+    }
+  ],
+  "aliveStatusCodes": [200, 206, 301, 302, 403]
+}
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,78 @@
+name: CI
+
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+# Least privilege: the steps below only read the checked-out repository, so the
+# GITHUB_TOKEN exposed to the third-party actions is limited to read access.
+permissions:
+  contents: read
+
+jobs:
+  lint:
+    name: Lint & Validate
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      # Advisory: continue-on-error keeps a failure in this step from failing the job.
+      - name: Check markdown links
+        uses: gaurav-nelson/github-action-markdown-link-check@v1
+        with:
+          use-quiet-mode: 'yes'
+          config-file: '.github/mlc-config.json'
+        continue-on-error: true
+
+      # Advisory: continue-on-error keeps a failure in this step from failing the job.
+      - name: Lint markdown
+        uses: DavidAnson/markdownlint-cli2-action@v19
+        with:
+          globs: '**/*.md'
+          config: '.github/.markdownlint.json'
+        continue-on-error: true
+
+      - name: Validate workflow frontmatter
+        run: |
+          echo "Checking all workflows have frontmatter..."
+          errors=0
+          # Walk with find: without globstar, ** only matches a single directory level.
+          while IFS= read -r f; do
+            head -1 "$f" | grep -q '^---$' && continue
+            echo "❌ Missing frontmatter: $f"
+            errors=$((errors + 1))
+          done < <(find workflows -name '*.md')
+          echo "Checked $(find workflows -name '*.md' | wc -l) workflows, $errors missing frontmatter"
+          [ "$errors" -eq 0 ] && echo "✅ All workflows have frontmatter"  # last command: its status fails the step
+
+      - name: Check README workflow table matches files
+        run: |
+          echo "Checking README links match actual files..."
+          errors=0 total=0
+          for f in $(grep -oE '\./workflows/[^)]+\.md' README.md); do
+            total=$((total + 1))  # counted per link; grep -c would count lines and ignore -o
+            [ -f "$f" ] && continue
+            echo "❌ README links to $f but file doesn't exist"
+            errors=$((errors + 1))
+          done
+          echo "Checked $total README links, $errors broken"
+          [ "$errors" -eq 0 ] && echo "✅ All README links are valid"
+
+      - name: Check scripts are executable
+        run: |
+          # List every non-executable script first, then fail, so the step can actually go red.
+          missing=$(find scripts -maxdepth 1 -type f -name '*.sh' ! -executable 2>/dev/null || true)
+          if [ -n "$missing" ]; then
+            echo "$missing" | sed 's/^/❌ Not executable: /'
+            exit 1
+          fi
+
+      - name: Shellcheck scripts
+        run: |
+          # Advisory only: findings are printed, but `|| true` keeps the step from failing.
+          if ! command -v shellcheck >/dev/null; then
+            echo "shellcheck not available, skipping"; exit 0
+          fi
+          shellcheck scripts/*.sh || true
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,64 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+## [Unreleased]
+
+### Added — Workflows
+- **`/helm-chart-review`** — review Helm charts for security, reliability, and best practices (kubernetes/)
+- **`/secrets-leak-scan`** — scan git repos for leaked secrets using gitleaks, trufflehog, or regex (security/)
+- **`/incident-triage`** — guided first 15 minutes of a production incident (observability/)
+
+### Added — Prompts
+- **`pr-description.md`** — generate PR descriptions from diffs
+- **`explain-like-a-senior.md`** — explain infrastructure code to junior engineers
+
+### Added — Scripts
+- **`aws-whoami.sh`** — quick AWS identity and account context check
+- **`stale-branches.sh`** — list git branches older than N days
+
+### Added — CI
+- GitHub Actions CI: markdown lint, link check, frontmatter validation, README link verification, script executable-bit check, shellcheck
+
+### Improved
+- **`/aws-account-audit`** — added `FAST=yes` input to skip slow per-policy IAM loops on large accounts
+- **`/aws-cost-quickscan`** — added `DEEP=yes` input for per-instance CPU utilization analysis
+- **`/terraform-plan-review`** — added Step 0 with plan generation commands (including Terragrunt)
+- **`/k8s-debug`** — enhanced log analysis (Step 5) with init container logs, structured error extraction, severity classification, and "noisiest pods" scan; added restart timeline analysis (Step 6a) and HPA health check (Step 6b); expanded triage cheat-sheet with startup-order, Redis, autoscaling, and webhook patterns
+
+---
+
+## [0.1.0] — 2026-05-04
+
+### Added — Workflows
+- **`/k8s-debug`** — general-purpose Kubernetes cluster debugger (kubernetes/)
+- **`/k8s-workload-debug`** — deep-dive on a single workload (kubernetes/)
+- **`/k8s-rbac-audit`** — RBAC security audit (kubernetes/)
+- **`/k8s-cost-hotspots`** — cost and waste analysis (kubernetes/)
+- **`/k8s-upgrade-readiness`** — pre-flight checks for K8s upgrades (kubernetes/)
+- **`/helm-release-debug`** — diagnose stuck or failed Helm releases (kubernetes/)
+- **`/aws-account-audit`** — AWS account security audit (aws/)
+- **`/aws-cost-quickscan`** — AWS cost waste analysis (aws/)
+- **`/aws-vpc-debug`** — VPC connectivity triage (aws/)
+- **`/aws-iam-policy-review`** — IAM policy risk analysis (aws/)
+- **`/terraform-plan-review`** — Terraform plan risk analysis (iac/)
+- **`/ci-debug`** — CI/CD pipeline failure diagnosis (cicd/)
+- **`/jenkins-pipeline-review`** — Jenkinsfile code review (cicd/)
+- **`/dockerfile-review`** — Dockerfile security and optimization review (containers/)
+
+### Added — Prompts
+- **`incident-commander.md`** — incident commander system prompt
+- **`postmortem-writer.md`** — blameless post-mortem generator
+- **`code-review-devops.md`** — DevOps code review prompt
+
+### Added — Rules
+- **`devops-agent.windsurfrules`** — AI safety guardrails for DevOps repos
+
+### Added — Scripts
+- **`k8s-snapshot.sh`** — cluster state snapshot to Markdown
+
+### Added — Repo
+- Repository structure: workflows/, prompts/, rules/, scripts/
+- README.md with full documentation
+- CONTRIBUTING.md with workflow design rules
+- MIT License
diff --git a/README.md b/README.md
@@ -25,6 +25,7 @@ A growing collection of **AI-agent workflows, prompts, and rules** for day-to-da
 | [k8s-cost-hotspots](./workflows/kubernetes/k8s-cost-hotspots.md) | `/k8s-cost-hotspots` | Find waste: over-provisioned workloads, missing requests/limits, idle workloads, orphan PVCs/PVs, idle LoadBalancers. | `kubectl`, `jq`, metrics-server. |
 | [k8s-upgrade-readiness](./workflows/kubernetes/k8s-upgrade-readiness.md) | `/k8s-upgrade-readiness` | Pre-flight before a control-plane / node upgrade: deprecated APIs, version skew, PDB gaps, expiring certs, broken webhooks. | `kubectl`. Optional: `kubent` or `pluto`, `helm`. |
 | [helm-release-debug](./workflows/kubernetes/helm-release-debug.md) | `/helm-release-debug` | Diagnose a stuck or failed Helm release: history, values diff, hook failures, rendered manifest vs cluster, workload health. | `helm` v3, `kubectl`. Optional: `jq`, `yq`. |
+| [helm-chart-review](./workflows/kubernetes/helm-chart-review.md) | `/helm-chart-review` | Review a Helm chart for security, reliability, and best practices: resource specs, probes, security context, PDBs, anti-affinity, RBAC. | Helm chart source. Optional: `helm` CLI. |
 
 ### AWS / Cloud
 
@@ -49,6 +50,18 @@ A growing collection of **AI-agent workflows, prompts, and rules** for day-to-da
 | [jenkins-pipeline-review](./workflows/cicd/jenkins-pipeline-review.md) | `/jenkins-pipeline-review` | Review Jenkinsfile / shared-library Groovy for security risks, anti-patterns, missing error handling, credential leaks, CPS issues, and build config cross-references. | Jenkinsfile(s) or `vars/*.groovy`. Optional: `repositories_v2.json`. |
 | [dockerfile-review](./workflows/containers/dockerfile-review.md) | `/dockerfile-review` | Review Dockerfiles for security, size, caching, and best practices. Flags CVE-prone bases, leaked secrets, missing health checks. | Dockerfile(s). Optional: `docker`, `trivy`. |
 
+### Security
+
+| Workflow | Slash command | Description | Prerequisites |
+|---|---|---|---|
+| [secrets-leak-scan](./workflows/security/secrets-leak-scan.md) | `/secrets-leak-scan` | Scan git repo history for leaked secrets: API keys, passwords, tokens, private keys. Uses gitleaks, trufflehog, or regex fallback. | Git repo. Optional: `gitleaks`, `trufflehog`. |
+
+### Observability & Incident
+
+| Workflow | Slash command | Description | Prerequisites |
+|---|---|---|---|
+| [incident-triage](./workflows/observability/incident-triage.md) | `/incident-triage` | Guided first 15 minutes of a production incident: timeline, blast radius, evidence gathering, mitigation suggestions. | Access to affected environment. |
+
 More on the way — see [Roadmap](#roadmap).
 
 ## Prompts
@@ -60,6 +73,8 @@ Reusable system prompts you can paste into any AI agent for common DevOps tasks:
 | [incident-commander](./prompts/incident-commander.md) | Puts the AI in incident-commander mode: timeline, blast radius, action tracking, status updates. |
 | [postmortem-writer](./prompts/postmortem-writer.md) | Generates a blameless post-mortem from incident notes: timeline, root cause, impact, action items. |
 | [code-review-devops](./prompts/code-review-devops.md) | Reviews IaC / pipeline / Docker / K8s code with a security-first DevOps lens. |
+| [pr-description](./prompts/pr-description.md) | Generates a PR description from a diff: what, why, how, testing, risk, rollback plan. |
+| [explain-like-a-senior](./prompts/explain-like-a-senior.md) | Explains infrastructure code to junior engineers: what it does, why, gotchas, and how it fits together. |
 
 ## Rules
 
@@ -76,6 +91,8 @@ Standalone shell utilities referenced by workflows or useful on their own:
 | Script | Usage |
 |---|---|
 | [k8s-snapshot.sh](./scripts/k8s-snapshot.sh) | `./k8s-snapshot.sh [namespace\|all] [output-dir]` — dump cluster state (nodes, pods, events, services, top) to a timestamped Markdown file. |
+| [aws-whoami.sh](./scripts/aws-whoami.sh) | `./aws-whoami.sh [profile]` — quick AWS identity check: caller, region, account alias, org, SSO role. |
+| [stale-branches.sh](./scripts/stale-branches.sh) | `./stale-branches.sh [days] [--remote]` — list git branches older than N days with last commit info. |
 
 ## Using a workflow
 
@@ -100,7 +117,9 @@ devops-ai-workflows/
 │   ├── aws/                 # AWS / cloud workflow definitions
 │   ├── iac/                 # Infrastructure as Code workflows
 │   ├── cicd/                # CI/CD pipeline workflows
-│   └── containers/          # Container & image workflows
+│   ├── containers/          # Container & image workflows
+│   ├── security/            # Security & repo hygiene workflows
+│   └── observability/       # Observability & incident workflows
 ├── prompts/                 # Reusable LLM prompts
 ├── rules/                   # Editor/agent rule files
 ├── scripts/                 # Standalone shell helpers
@@ -127,12 +146,10 @@ Ideas I plan to add (PRs welcome):
 - [ ] `/image-cve-triage` — prioritise CVE scanner output by exploitability + fix availability
 - [ ] `/github-actions-review` — security review of GitHub Actions workflow files
 - [ ] `/release-checklist` — pre-release gate
-- [ ] `/helm-chart-review` — review Helm chart for missing resources/limits, PDB, anti-affinity, template issues
 
 **Observability & incident**
 - [ ] `/prometheus-query-helper` — intent → PromQL with rationale
 - [ ] `/log-pattern-extract` — cluster repeated errors out of a log dump
-- [ ] `/incident-triage` — guided first 15 minutes of an incident
 - [ ] `/postmortem` — blameless post-mortem from a transcript
 - [ ] `/runbook-from-incident` — turn a resolved incident into a reusable runbook
 
@@ -144,7 +161,6 @@ Ideas I plan to add (PRs welcome):
 - [ ] `/db-migration-review` — flag risky migration patterns
 
 **Security & repo hygiene**
-- [ ] `/secrets-leak-scan` — gitleaks/trufflehog over full git history
 - [ ] `/cve-impact-assessment` — given a CVE, check whether your stack is affected
 - [ ] `/repo-health` — README, license, CI, branch protection, stale branches
 - [ ] `/dependency-upgrade-plan` — group outdated deps by risk and suggest batching

diff --git a/prompts/explain-like-a-senior.md b/prompts/explain-like-a-senior.md
@@ -0,0 +1,56 @@
+# Explain Like a Senior — System Prompt
+
+Paste this into any AI agent when you want a clear, educational explanation of infrastructure code for a junior engineer or new team member.
+
+---
+
+## System prompt
+
+You are a **senior DevOps/SRE engineer** explaining infrastructure code to a junior team member. Your goal is to build understanding, not just describe syntax.
+
+### For each piece of code, explain
+
+1. **What it does** — plain English, no jargon. If jargon is unavoidable, define it.
+2. **Why it's designed this way** — what problem does this solve? What trade-offs were made?
+3. **What could go wrong** — common failure modes, misconfigurations, and gotchas.
+4. **How it connects** — how does this piece fit into the bigger picture? What depends on it? What does it depend on?
+5. **What you'd change** — if anything looks suboptimal, explain what a senior would do differently and why.
+
+### Explanation style
+
+- **Start with the big picture**, then zoom in. "This Terraform module creates a VPC with public and private subnets. Here's how each piece works..."
+- **Use analogies** where they help. "A NAT Gateway is like a mail forwarding service — private instances send mail through it so they can reach the internet without being directly addressable."
+- **Show the mental model.** How would a senior engineer think about this? What questions would they ask?
+- **Point out non-obvious things.** "This `depends_on` might look unnecessary, but without it, the Lambda function can be created before the policy is attached to its IAM role, and it fails on first deploy."
+- **Be honest about complexity.** If something is genuinely confusing or poorly designed, say so — don't pretend it's simple.
+
+### Format
+
+```markdown
+## Overview
+<big picture: what this code does and why it exists>
+
+## Walk-through
+<section by section explanation>
+
+### <section name>
+**What:** <what this block does>
+**Why:** <why it's needed>
+**Gotcha:** <what could go wrong>
+
+## How it fits together
+<architecture context — what calls this, what this calls>
+
+## Things to watch out for
+<list of common mistakes or misconfigurations>
+
+## If I were reviewing this
+<what a senior would suggest improving>
+```
+
+### Rules
+
+- **No condescension.** Junior doesn't mean stupid. Explain clearly without being patronizing.
+- **No hand-waving.** If you don't know why something is done a certain way, say "I'm not sure why this specific choice was made — it might be historical. Here's what I'd investigate."
+- **Use the actual code.** Reference specific lines, variables, and resource names.
+- **Encourage questions.** End with "Good questions to ask your team about this: ..."
diff --git a/prompts/pr-description.md b/prompts/pr-description.md
@@ -0,0 +1,54 @@
+# PR Description Generator — System Prompt
+
+Paste this into any AI agent along with your `git diff` or list of changes to generate a PR description.
+
+---
+
+## System prompt
+
+You are a **PR description writer** for a DevOps/infrastructure team. Given a diff, commit list, or description of changes, generate a clear, reviewable pull request description.
+
+### Output format
+
+```markdown
+## What
+
+<1-3 sentences: what this PR does in plain English>
+
+## Why
+
+<1-3 sentences: why this change is needed — the problem, feature request, or improvement>
+
+## How
+
+<bullet list of the key changes, grouped by concern or area>
+
+## Testing
+
+<what was tested and how — manual steps, CI results, environments used>
+
+## Risk
+
+<what could go wrong, blast radius, rollback plan>
+- **Risk level:** Low / Medium / High
+- **Rollback:** <how to revert if needed>
+- **Affected environments:** <which envs will be impacted>
+
+## Checklist
+
+- [ ] Code follows project conventions
+- [ ] Tests added/updated
+- [ ] Documentation updated (if applicable)
+- [ ] No secrets or credentials in the diff
+- [ ] Reviewed for security implications
+```
+
+### Rules
+
+- **Be specific.** Don't say "updated the config" — say "changed the RDS instance class from `db.t3.medium` to `db.t3.large` to handle increased query load."
+- **Group changes logically.** If the PR touches 5 files across 2 concerns, group by concern, not by file.
+- **Flag breaking changes** prominently with ⚠️.
+- **Mention dependencies** — does this PR need to be merged/deployed before or after another PR?
+- **Include the diff context.** If the user provides a diff, reference specific file paths and line changes.
+- **Never include secret values** from the diff. If the diff contains credentials, flag it as a blocker.
+- **For infrastructure PRs**, always include: what resources are created/modified/destroyed, blast radius, and rollback plan.
diff --git a/scripts/aws-whoami.sh b/scripts/aws-whoami.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# ────────────────────────────────────────────────────────────────
+# aws-whoami.sh — Quick AWS identity and account context
+# ────────────────────────────────────────────────────────────────
+# Usage: ./aws-whoami.sh [profile]
+#
+# Shows: caller identity, account alias, region, organization,
+# and the assumed role and session name (e.g. when using AWS SSO).
+# ────────────────────────────────────────────────────────────────
+set -euo pipefail
+
+# Optional profile, held in an array so a name containing spaces stays one argument.
+PROFILE_ARGS=()
+[ -n "${1:-}" ] && PROFILE_ARGS=(--profile "$1")
+
+# Run the AWS CLI with the profile arguments appended. The ${arr[@]+...} form
+# expands to nothing for an empty array, which bash < 4.4 rejects under `set -u`.
+aws_cli() { aws "$@" ${PROFILE_ARGS[@]+"${PROFILE_ARGS[@]}"}; }
+
+echo "🔍 AWS Identity Check"
+echo "====================="
+echo ""
+
+echo "--- Caller Identity ---"
+# Not guarded: if this call fails, `set -e` stops the script here with the CLI's error shown.
+aws_cli sts get-caller-identity --output table 2>&1
+
+echo ""
+echo "--- Region ---"
+REGION=$(aws_cli configure get region 2>/dev/null || echo "not set")
+echo "Region: $REGION"
+
+echo ""
+echo "--- Account Aliases ---"
+aws_cli iam list-account-aliases --query 'AccountAliases[]' --output text 2>/dev/null || echo "(none or no permission)"
+
+echo ""
+echo "--- Organization ---"
+aws_cli organizations describe-organization --query 'Organization.{Id:Id,Master:MasterAccountId,Email:MasterAccountEmail}' --output table 2>/dev/null || echo "Not in an org (or no permission)"
+
+echo ""
+echo "--- SSO Role (if applicable) ---"
+ARN=$(aws_cli sts get-caller-identity --query 'Arn' --output text 2>/dev/null)
+if [[ "$ARN" == *assumed-role* ]]; then
+  # Split on "/": <prefix>/<role>/<session>. SESSION avoids clobbering the environment's USER.
+  IFS=/ read -r _ ROLE SESSION <<<"$ARN"
+  echo "Role: $ROLE"
+  echo "User: $SESSION"
+else
+  echo "Not using assumed role"
+fi