From 73652186e2d407ff8e99c1ebb13a6234fc5f2499 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 1 Oct 2025 06:19:00 +0000 Subject: [PATCH] feat: Add OpenTelemetry Astronomy Shop resilience demo Co-authored-by: adil --- astronomy-demo/README.md | 103 ++++++++++++++++++ .../provisioning/alerting/contact-points.yaml | 17 +++ .../grafana/provisioning/alerting/rules.yaml | 73 +++++++++++++ astronomy-demo/k6/checkout.js | 55 ++++++++++ .../cart-remediation/kustomization.yaml | 14 +++ .../cart-remediation/patch-replicas.yaml | 6 + .../cart-remediation/patch-resources.yaml | 16 +++ .../litmus/engines/cart-cpu-hog.yaml | 22 ++++ .../litmus/engines/cart-network-latency.yaml | 24 ++++ .../litmus/experiments/pod-cpu-hog.yaml | 34 ++++++ .../experiments/pod-network-latency.yaml | 42 +++++++ 11 files changed, 406 insertions(+) create mode 100644 astronomy-demo/README.md create mode 100644 astronomy-demo/grafana/provisioning/alerting/contact-points.yaml create mode 100644 astronomy-demo/grafana/provisioning/alerting/rules.yaml create mode 100644 astronomy-demo/k6/checkout.js create mode 100644 astronomy-demo/kustomize/overlays/cart-remediation/kustomization.yaml create mode 100644 astronomy-demo/kustomize/overlays/cart-remediation/patch-replicas.yaml create mode 100644 astronomy-demo/kustomize/overlays/cart-remediation/patch-resources.yaml create mode 100644 astronomy-demo/litmus/engines/cart-cpu-hog.yaml create mode 100644 astronomy-demo/litmus/engines/cart-network-latency.yaml create mode 100644 astronomy-demo/litmus/experiments/pod-cpu-hog.yaml create mode 100644 astronomy-demo/litmus/experiments/pod-network-latency.yaml diff --git a/astronomy-demo/README.md b/astronomy-demo/README.md new file mode 100644 index 0000000..b4b0156 --- /dev/null +++ b/astronomy-demo/README.md @@ -0,0 +1,103 @@ +### Astronomy Shop Resilience Demo Kit + +This kit demonstrates an end-to-end incident on the OpenTelemetry Astronomy Shop: baseline SLOs, induce failure with LitmusChaos, detect via 
Grafana + SigNoz, triage in Calmo, and remediate on Kubernetes with Kustomize. + +#### Components +- k6 steady traffic to checkout +- Grafana alerting (p95 latency, error-rate) with webhook to Calmo +- LitmusChaos experiments: pod-network-latency (cart → datastore), pod-cpu-hog (cart) +- Kustomize remediation overlay to scale and resource-bump `cartservice` +- Optional misconfig scenario: wrong image to trigger CrashLoopBackOff + +#### Prerequisites +- Kubernetes cluster (GKE recommended) with the OpenTelemetry Demo deployed (namespace `otel-demo` assumed) +- SigNoz or Prometheus-compatible metrics endpoint connected to Grafana +- Grafana v9+ with provisioning enabled +- LitmusChaos installed and a target ServiceAccount with permissions in `otel-demo` +- Calmo ingestion endpoint URL and optional API key +- kubectl, kustomize, and k6 installed locally + +#### Environment +Export these before running: + +```bash +export ASTRONOMY_NS=otel-demo +export FRONTEND_BASE_URL="http://frontend.${ASTRONOMY_NS}.svc.cluster.local:8080" +export CALMO_WEBHOOK_URL="https://ingest.getcalmo.com/webhook/" +export CALMO_WEBHOOK_SECRET="" +# Grafana: set your Prometheus/SigNoz datasource UID (from Grafana > Connections > Data sources) +export GRAFANA_PROM_DS_UID="prometheus" +``` + +### 1) Baseline: generate steady checkout traffic + +```bash +k6 run ./k6/checkout.js \ + -e BASE_URL="$FRONTEND_BASE_URL" \ + -e CHECKOUT_RATE_PER_SEC=3 \ + -e TEST_DURATION="10m" +``` + +Verify traces/metrics in Grafana/SigNoz are healthy; note baseline p95 and error-rate. + +### 2) Arm alerts and route to Calmo +Provision Grafana contact point and alert rules via ConfigMaps/volumes or by copying files from `grafana/provisioning/alerting/*` into Grafana's provisioning directory. Ensure the datasource UID is set to `$GRAFANA_PROM_DS_UID` and Calmo webhook URL is set. 
+ +Files: +- `grafana/provisioning/alerting/contact-points.yaml` +- `grafana/provisioning/alerting/rules.yaml` + +These configure: +- Alert A: checkout p95 latency > 2s for 5m +- Alert B: checkout error-rate > 3% for 5m + +### 3) Inject failures with LitmusChaos + +Set the app label and namespace in the engines if needed. Apply experiments and engines to the same namespace (a ChaosEngine resolves its ChaosExperiment in its own namespace): + +```bash +kubectl apply -n "$ASTRONOMY_NS" -f ./litmus/experiments/pod-network-latency.yaml +kubectl apply -n "$ASTRONOMY_NS" -f ./litmus/experiments/pod-cpu-hog.yaml + +kubectl apply -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-network-latency.yaml +kubectl apply -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-cpu-hog.yaml +``` + +Observe: increased `cart` span latency, possible throttling/restarts; Grafana alerts should fire within 5–7 minutes; Calmo receives webhooks and correlates with K8s events and recent deploys. + +### 4) Remediate with Kustomize overlay + +Apply the remediation overlay to scale and resource-bump `cartservice`: + +```bash +kubectl kustomize ./kustomize/overlays/cart-remediation | kubectl apply -n "$ASTRONOMY_NS" -f - +``` + +Validate SLOs recover, then roll back chaos: + +```bash +kubectl delete -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-network-latency.yaml || true +kubectl delete -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-cpu-hog.yaml || true +``` + +### 5) Optional simple scenarios + +- ImagePullBackOff: apply `kubernetes/misconfig/cart-bad-image.yaml` to simulate a non-existent image for `cartservice`, then revert. +- NodeSelector misplacement: add a strict `nodeSelector` to `cartservice` to schedule onto non-matching nodes and observe Pending pods. +- pod-cpu-hog: run only the CPU hog engine. + +### 6) Grafana → Calmo webhook payload + +Grafana contact point is configured to send JSON including `title`, `state`, `labels`, `evalMatches`, `startsAt`. Calmo can enrich with SLO metadata and correlate. 
+ +### Clean-up + +```bash +# Remove remediation overlay changes (if you used a dedicated overlay, you can roll back by re-applying base manifests) +kubectl rollout restart deploy/cartservice -n "$ASTRONOMY_NS" + +# Delete engines (experiments can remain installed) +kubectl delete -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-network-latency.yaml || true +kubectl delete -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-cpu-hog.yaml || true +``` + diff --git a/astronomy-demo/grafana/provisioning/alerting/contact-points.yaml b/astronomy-demo/grafana/provisioning/alerting/contact-points.yaml new file mode 100644 index 0000000..0c70e40 --- /dev/null +++ b/astronomy-demo/grafana/provisioning/alerting/contact-points.yaml @@ -0,0 +1,17 @@ +apiVersion: 1 +contactPoints: + - orgId: 1 + name: calmo-webhook + receivers: + - uid: calmo-webhook-receiver + type: webhook + settings: + url: ${CALMO_WEBHOOK_URL} + httpMethod: POST + sendResolved: true + username: "" + password: "" + maxAlerts: 0 + secureFields: + password: ${CALMO_WEBHOOK_SECRET} + disableResolveMessage: false diff --git a/astronomy-demo/grafana/provisioning/alerting/rules.yaml b/astronomy-demo/grafana/provisioning/alerting/rules.yaml new file mode 100644 index 0000000..fc29d80 --- /dev/null +++ b/astronomy-demo/grafana/provisioning/alerting/rules.yaml @@ -0,0 +1,74 @@ +apiVersion: 1 +groups: + - orgId: 1 + name: astronomy-shop-slo + folder: Astronomy Shop # required by Grafana rule-group provisioning + interval: 1m + rules: + - uid: checkout-p95-latency + title: checkout p95 latency > 2s + condition: C + for: 5m + labels: + service: checkout + slo: p95-latency + env: staging + annotations: + runbook_url: https://git/ops/runbooks/checkout-latency + data: + - refId: A + datasourceUid: ${GRAFANA_PROM_DS_UID} + model: + interval: "" + intervalFactor: 2 + legendFormat: p95 + refId: A + expr: | + histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="checkout"}[5m])) by (le)) + instant: true # the alert condition needs one value per series, not a time range + datasource: {uid: "${GRAFANA_PROM_DS_UID}"} + - refId: B + 
datasourceUid: __expr__ + model: + type: math + refId: B + expression: "2" # latency threshold in seconds, as a constant math expression + - refId: C + datasourceUid: __expr__ + model: + type: math + refId: C + expression: "$A > $B" + - uid: checkout-error-rate + title: checkout error-rate > 3% + condition: C + for: 5m + labels: + service: checkout + slo: error-rate + env: staging + annotations: + runbook_url: https://git/ops/runbooks/checkout-latency + data: + - refId: A + datasourceUid: ${GRAFANA_PROM_DS_UID} + model: + refId: A + expr: | + sum(rate(http_requests_total{service="checkout",status=~"5.."}[5m])) + instant: true + datasource: {uid: "${GRAFANA_PROM_DS_UID}"} + - refId: B + datasourceUid: ${GRAFANA_PROM_DS_UID} + model: + refId: B + expr: | + clamp_min(sum(rate(http_requests_total{service="checkout"}[5m])), 1) + instant: true + datasource: {uid: "${GRAFANA_PROM_DS_UID}"} + - refId: C + datasourceUid: __expr__ + model: + type: math + refId: C + expression: "($A / $B) > 0.03" # $B is floored at 1 req/s in PromQL; clamp_min is not a Grafana math function diff --git a/astronomy-demo/k6/checkout.js b/astronomy-demo/k6/checkout.js new file mode 100644 index 0000000..1ed6442 --- /dev/null +++ b/astronomy-demo/k6/checkout.js @@ -0,0 +1,55 @@ +import http from 'k6/http'; +import { check, sleep } from 'k6'; + +// Env vars +const BASE_URL = __ENV.BASE_URL || 'http://localhost:8080'; +const RATE = Number(__ENV.CHECKOUT_RATE_PER_SEC || 2); // requests per second +const DURATION = __ENV.TEST_DURATION || '10m'; + +export const options = { + scenarios: { + steady_checkout: { + executor: 'constant-arrival-rate', + rate: RATE, + timeUnit: '1s', + duration: DURATION, + preAllocatedVUs: Math.max(10, RATE * 2), + maxVUs: Math.max(50, RATE * 4), + tags: { service: 'checkout', route: '/api/checkout' }, + }, + }, + thresholds: { + http_req_duration: ['p(95)<2000'], + http_req_failed: ['rate<0.03'], + }, +}; + +export default function () { + // Minimal flow: add-to-cart then checkout endpoint + // Adjust endpoints to match your frontend/cart routes + const headers = { 'Content-Type': 'application/json' }; + + // Add to 
cart + const addRes = http.post(`${BASE_URL}/api/cart`, JSON.stringify({ productId: 'extreme-astronomy-binoculars', quantity: 1 }), { headers }); + check(addRes, { + 'add-to-cart status is 2xx': (r) => r.status >= 200 && r.status < 300, + }); + + // Checkout + const payload = { + email: 'demo@example.com', + address: { + street: '1 Space Way', city: 'Andromeda', state: 'OT', zip: '424242', country: 'US', + }, + creditCard: { + number: '4111111111111111', ccv: '737', expMonth: 12, expYear: 2030, + }, + }; + const res = http.post(`${BASE_URL}/api/checkout`, JSON.stringify(payload), { headers }); + check(res, { + 'checkout status is 2xx/3xx': (r) => r.status >= 200 && r.status < 400, + }); + + sleep(0.5); +} + diff --git a/astronomy-demo/kustomize/overlays/cart-remediation/kustomization.yaml b/astronomy-demo/kustomize/overlays/cart-remediation/kustomization.yaml new file mode 100644 index 0000000..ab1e97e --- /dev/null +++ b/astronomy-demo/kustomize/overlays/cart-remediation/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +# NOTE(review): resources is empty, so the patches below have nothing to match and the build emits no objects; list the cartservice base manifest here. +resources: [] + +patches: + - target: + kind: Deployment + name: cartservice + path: patch-replicas.yaml + - target: + kind: Deployment + name: cartservice + path: patch-resources.yaml diff --git a/astronomy-demo/kustomize/overlays/cart-remediation/patch-replicas.yaml b/astronomy-demo/kustomize/overlays/cart-remediation/patch-replicas.yaml new file mode 100644 index 0000000..6279f46 --- /dev/null +++ b/astronomy-demo/kustomize/overlays/cart-remediation/patch-replicas.yaml @@ -0,0 +1,6 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cartservice +spec: + replicas: 4 diff --git a/astronomy-demo/kustomize/overlays/cart-remediation/patch-resources.yaml b/astronomy-demo/kustomize/overlays/cart-remediation/patch-resources.yaml new file mode 100644 index 0000000..3f93d0a --- /dev/null +++ b/astronomy-demo/kustomize/overlays/cart-remediation/patch-resources.yaml @@ -0,0 +1,16 @@ 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: cartservice +spec: + template: + spec: + containers: + - name: server # NOTE(review): must equal the container name in the deployed cartservice pod spec, otherwise the merge adds a new container - confirm + resources: + requests: + cpu: "300m" + memory: "256Mi" + limits: + cpu: "1000m" + memory: "512Mi" diff --git a/astronomy-demo/litmus/engines/cart-cpu-hog.yaml b/astronomy-demo/litmus/engines/cart-cpu-hog.yaml new file mode 100644 index 0000000..e7f7f7f --- /dev/null +++ b/astronomy-demo/litmus/engines/cart-cpu-hog.yaml @@ -0,0 +1,23 @@ +apiVersion: litmuschaos.io/v1alpha1 +kind: ChaosEngine +metadata: + name: cart-cpu-hog + namespace: otel-demo +spec: + engineState: active # the chaos operator only starts engines whose state is active + appinfo: + appns: otel-demo + applabel: "app=cartservice" + appkind: deployment + chaosServiceAccount: litmus-admin + experiments: + - name: pod-cpu-hog + spec: + components: + env: + - name: TOTAL_CHAOS_DURATION + value: '600' + - name: CPU_CORES + value: '1' + - name: PODS_AFFECTED_PERC + value: '100' diff --git a/astronomy-demo/litmus/engines/cart-network-latency.yaml b/astronomy-demo/litmus/engines/cart-network-latency.yaml new file mode 100644 index 0000000..d1bfdc7 --- /dev/null +++ b/astronomy-demo/litmus/engines/cart-network-latency.yaml @@ -0,0 +1,25 @@ +apiVersion: litmuschaos.io/v1alpha1 +kind: ChaosEngine +metadata: + name: cart-network-latency + namespace: otel-demo +spec: + engineState: active # the chaos operator only starts engines whose state is active + appinfo: + appns: otel-demo + applabel: "app=cartservice" + appkind: deployment + chaosServiceAccount: litmus-admin + experiments: + - name: pod-network-latency + spec: + components: + env: + - name: TOTAL_CHAOS_DURATION + value: '600' + - name: NETWORK_LATENCY + value: '400' + - name: JITTER + value: '0' + - name: PODS_AFFECTED_PERC + value: '100' diff --git a/astronomy-demo/litmus/experiments/pod-cpu-hog.yaml b/astronomy-demo/litmus/experiments/pod-cpu-hog.yaml new file mode 100644 index 0000000..20bee6b --- /dev/null +++ b/astronomy-demo/litmus/experiments/pod-cpu-hog.yaml @@ -0,0 +1,34 @@ +apiVersion: litmuschaos.io/v1alpha1 +kind: ChaosExperiment +metadata: + name: pod-cpu-hog + labels: + 
litmuschaos.io/name: pod-cpu-hog +spec: + definition: + scope: Namespaced + permissions: + - apiGroups: [''] + resources: ['pods', 'pods/log'] + verbs: ['create', 'list', 'get', 'update', 'patch', 'delete'] + - apiGroups: [''] + resources: ['events'] + verbs: ['create', 'list', 'get', 'update', 'patch'] + - apiGroups: ['apps'] + resources: ['deployments'] + verbs: ['list', 'get'] + image: litmuschaos/go-runner:latest + imagePullPolicy: IfNotPresent + args: + - -c + - ./experiments -name pod-cpu-hog + command: ["/bin/bash"] + env: + - name: TOTAL_CHAOS_DURATION + value: '600' + - name: CPU_CORES + value: '1' + - name: PODS_AFFECTED_PERC + value: '100' + - name: SEQUENCE + value: parallel diff --git a/astronomy-demo/litmus/experiments/pod-network-latency.yaml b/astronomy-demo/litmus/experiments/pod-network-latency.yaml new file mode 100644 index 0000000..9fc346b --- /dev/null +++ b/astronomy-demo/litmus/experiments/pod-network-latency.yaml @@ -0,0 +1,42 @@ +apiVersion: litmuschaos.io/v1alpha1 +kind: ChaosExperiment +metadata: + name: pod-network-latency + labels: + litmuschaos.io/name: pod-network-latency +spec: + definition: + scope: Namespaced + permissions: + - apiGroups: [''] + resources: ['pods', 'pods/log'] + verbs: ['create', 'list', 'get', 'update', 'patch', 'delete'] + - apiGroups: [''] + resources: ['events'] + verbs: ['create', 'list', 'get', 'update', 'patch'] + - apiGroups: ['apps'] + resources: ['deployments'] + verbs: ['list', 'get'] + image: litmuschaos/go-runner:latest + imagePullPolicy: IfNotPresent + args: + - -c + - ./experiments -name pod-network-latency + command: ["/bin/bash"] + env: + - name: TOTAL_CHAOS_DURATION + value: '600' + - name: NETWORK_INTERFACE + value: eth0 + - name: LIB_IMAGE + value: litmuschaos/go-runner:latest + - name: TC_IMAGE + value: gaiadocker/iproute2 + - name: TARGET_CONTAINER + value: '' + - name: JITTER + value: '0' + - name: NETWORK_LATENCY + value: '400' + - name: PODS_AFFECTED_PERC + value: '100'