From 73652186e2d407ff8e99c1ebb13a6234fc5f2499 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 1 Oct 2025 06:19:00 +0000
Subject: [PATCH] feat: Add OpenTelemetry Astronomy Shop resilience demo

Co-authored-by: adil <adil@getcalmo.com>
---
 astronomy-demo/README.md                      | 103 ++++++++++++++++++
 .../provisioning/alerting/contact-points.yaml |  17 +++
 .../grafana/provisioning/alerting/rules.yaml  |  73 +++++++++++++
 astronomy-demo/k6/checkout.js                 |  55 ++++++++++
 .../cart-remediation/kustomization.yaml       |  14 +++
 .../cart-remediation/patch-replicas.yaml      |   6 +
 .../cart-remediation/patch-resources.yaml     |  16 +++
 .../litmus/engines/cart-cpu-hog.yaml          |  22 ++++
 .../litmus/engines/cart-network-latency.yaml  |  24 ++++
 .../litmus/experiments/pod-cpu-hog.yaml       |  34 ++++++
 .../experiments/pod-network-latency.yaml      |  42 +++++++
 11 files changed, 406 insertions(+)
 create mode 100644 astronomy-demo/README.md
 create mode 100644 astronomy-demo/grafana/provisioning/alerting/contact-points.yaml
 create mode 100644 astronomy-demo/grafana/provisioning/alerting/rules.yaml
 create mode 100644 astronomy-demo/k6/checkout.js
 create mode 100644 astronomy-demo/kustomize/overlays/cart-remediation/kustomization.yaml
 create mode 100644 astronomy-demo/kustomize/overlays/cart-remediation/patch-replicas.yaml
 create mode 100644 astronomy-demo/kustomize/overlays/cart-remediation/patch-resources.yaml
 create mode 100644 astronomy-demo/litmus/engines/cart-cpu-hog.yaml
 create mode 100644 astronomy-demo/litmus/engines/cart-network-latency.yaml
 create mode 100644 astronomy-demo/litmus/experiments/pod-cpu-hog.yaml
 create mode 100644 astronomy-demo/litmus/experiments/pod-network-latency.yaml
diff --git a/astronomy-demo/README.md b/astronomy-demo/README.md
new file mode 100644
index 0000000..b4b0156
--- /dev/null
+++ b/astronomy-demo/README.md
@@ -0,0 +1,103 @@
+### Astronomy Shop Resilience Demo Kit
+
+This kit demonstrates an end-to-end incident on the OpenTelemetry Astronomy Shop: baseline SLOs, induce failure with LitmusChaos, detect via Grafana + SigNoz, triage in Calmo, and remediate on Kubernetes with Kustomize.
+
+#### Components
+- k6 steady traffic to checkout
+- Grafana alerting (p95 latency, error-rate) with webhook to Calmo
+- LitmusChaos experiments: pod-network-latency (cart → datastore), pod-cpu-hog (cart)
+- Kustomize remediation overlay to scale and resource-bump `cartservice`
+- Optional misconfig scenario: non-existent image to trigger ImagePullBackOff (the new pods never start)
+
+#### Prerequisites
+- Kubernetes cluster (GKE recommended) with the OpenTelemetry Demo deployed (namespace `otel-demo` assumed)
+- SigNoz or Prometheus-compatible metrics endpoint connected to Grafana
+- Grafana v9+ with provisioning enabled
+- LitmusChaos installed and a target ServiceAccount with permissions in `otel-demo`
+- Calmo ingestion endpoint URL and optional API key
+- kubectl, kustomize, and k6 installed locally
+
+#### Environment
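+Export these before running (the default `FRONTEND_BASE_URL` is a cluster-internal DNS name; when k6 runs on your workstation, port-forward the frontend and use `http://localhost:8080` instead):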
+
+```bash
+export ASTRONOMY_NS=otel-demo
+export FRONTEND_BASE_URL="http://frontend.${ASTRONOMY_NS}.svc.cluster.local:8080"
+export CALMO_WEBHOOK_URL="https://ingest.getcalmo.com/webhook/<your-source>"
+export CALMO_WEBHOOK_SECRET="<optional-shared-secret>"
+# Grafana: set your Prometheus/SigNoz datasource UID (from Grafana > Connections > Data sources)
+export GRAFANA_PROM_DS_UID="prometheus"
+```
+
+### 1) Baseline: generate steady checkout traffic
+
+```bash
+k6 run ./k6/checkout.js \
+  -e BASE_URL="$FRONTEND_BASE_URL" \
+  -e CHECKOUT_RATE_PER_SEC=3 \
+  -e TEST_DURATION="10m"
+```
+
+Verify traces/metrics in Grafana/SigNoz are healthy; note baseline p95 and error-rate.
+
+### 2) Arm alerts and route to Calmo
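+Provision the Grafana contact point and alert rules via ConfigMaps/volumes or by copying the files from `grafana/provisioning/alerting/*` into Grafana's provisioning directory. Grafana expands the `${GRAFANA_PROM_DS_UID}`, `${CALMO_WEBHOOK_URL}`, and `${CALMO_WEBHOOK_SECRET}` placeholders from its own process environment, so set these variables on the Grafana container/process; exporting them only in your local shell is not enough.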
+
+Files:
+- `grafana/provisioning/alerting/contact-points.yaml`
+- `grafana/provisioning/alerting/rules.yaml`
+
+These configure:
+- Alert A: checkout p95 latency > 2s for 5m
+- Alert B: checkout error-rate > 3% for 5m
+
+### 3) Inject failures with LitmusChaos
+
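+Set the app label and namespace in the engines if needed. ChaosExperiment resources are namespaced and are looked up in the namespace of the ChaosEngine that references them, so install both into the application namespace:
+
+```bash
+kubectl apply -n "$ASTRONOMY_NS" -f ./litmus/experiments/pod-network-latency.yaml
+kubectl apply -n "$ASTRONOMY_NS" -f ./litmus/experiments/pod-cpu-hog.yaml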
+
+kubectl apply -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-network-latency.yaml
+kubectl apply -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-cpu-hog.yaml
+```
+
+Observe: increased `cart` span latency, possible throttling/restarts; Grafana alerts should fire within 5–7 minutes; Calmo receives webhooks and correlates with K8s events and recent deploys.
+
+### 4) Remediate with Kustomize overlay
+
+Apply the remediation overlay to scale and resource-bump `cartservice`:
+
+```bash
+kubectl kustomize ./kustomize/overlays/cart-remediation | kubectl apply -n "$ASTRONOMY_NS" -f -
+```
+
+Validate SLOs recover, then roll back chaos:
+
+```bash
+kubectl delete -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-network-latency.yaml || true
+kubectl delete -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-cpu-hog.yaml || true
+```
+
+### 5) Optional simple scenarios
+
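+- ImagePullBackOff: point `cartservice` at a non-existent image, observe the stuck rollout, then revert. This kit does not ship `kubernetes/misconfig/cart-bad-image.yaml`; create it yourself or use `kubectl set image deploy/cartservice '*=example.invalid/cart:missing' -n "$ASTRONOMY_NS"` and revert with `kubectl rollout undo deploy/cartservice -n "$ASTRONOMY_NS"`.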
+- NodeSelector misplacement: add a strict `nodeSelector` to `cartservice` to schedule onto non-matching nodes and observe Pending pods.
+- pod-cpu-hog: run only the CPU hog engine.
+
+### 6) Grafana → Calmo webhook payload
+
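+The Grafana webhook contact point posts JSON that includes top-level `title`, `state`, and `message`, plus an `alerts` array whose entries carry `status`, `labels`, `annotations`, `values`, `startsAt`, and `endsAt` (the legacy `evalMatches` field is not part of unified alerting). Calmo can enrich with SLO metadata and correlate.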
+
+### Clean-up
+
+```bash
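+# Only if you applied the remediation overlay: "rollout undo" restores the previous pod template (resources); the replica count is not part of a revision, so re-apply your base manifests (or use "kubectl scale") to reset it
+kubectl rollout undo deploy/cartservice -n "$ASTRONOMY_NS"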
+
+# Delete engines (the ChaosExperiment definitions can remain installed)
+kubectl delete -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-network-latency.yaml || true
+kubectl delete -n "$ASTRONOMY_NS" -f ./litmus/engines/cart-cpu-hog.yaml || true
+```
+
diff --git a/astronomy-demo/grafana/provisioning/alerting/contact-points.yaml b/astronomy-demo/grafana/provisioning/alerting/contact-points.yaml
new file mode 100644
index 0000000..0c70e40
--- /dev/null
+++ b/astronomy-demo/grafana/provisioning/alerting/contact-points.yaml
@@ -0,0 +1,22 @@
+# Grafana alerting contact point that forwards alert notifications to Calmo.
+# ${VAR} placeholders are expanded by Grafana from its own environment.
+apiVersion: 1
+contactPoints:
+  - orgId: 1
+    name: calmo-webhook
+    receivers:
+      - uid: calmo-webhook-receiver
+        type: webhook
+        # Set per receiver, not on the contact point; false keeps "resolved"
+        # notifications flowing to Calmo.
+        disableResolveMessage: false
+        settings:
+          url: "${CALMO_WEBHOOK_URL}"
+          httpMethod: POST
+          # 0 = no cap on the number of alerts included per webhook call.
+          maxAlerts: 0
+          # Secrets belong under `settings` (Grafana encrypts them on load).
+          # Sent as `Authorization: Bearer <secret>`.
+          # NOTE(review): confirm this is the auth scheme Calmo expects.
+          authorization_scheme: Bearer
+          authorization_credentials: "${CALMO_WEBHOOK_SECRET}"
diff --git a/astronomy-demo/grafana/provisioning/alerting/rules.yaml b/astronomy-demo/grafana/provisioning/alerting/rules.yaml
new file mode 100644
index 0000000..fc29d80
--- /dev/null
+++ b/astronomy-demo/grafana/provisioning/alerting/rules.yaml
@@ -0,0 +1,104 @@
+# Grafana-managed SLO alert rules for the checkout service (file provisioning).
+# Both rules share one pipeline: A = PromQL range query, B = reduce A to its
+# latest sample, C = threshold on B (the alert condition).
+# NOTE(review): metric and label names are assumed; confirm they exist in your
+# Prometheus/SigNoz datasource before relying on these alerts.
+apiVersion: 1
+groups:
+  - orgId: 1
+    name: astronomy-shop-slo
+    # Provisioned rule groups must name the folder that stores them.
+    folder: astronomy-demo
+    interval: 1m
+    rules:
+      - uid: checkout-p95-latency
+        title: checkout p95 latency > 2s
+        condition: C
+        for: 5m
+        labels:
+          service: checkout
+          slo: p95-latency
+          env: staging
+        annotations:
+          runbook_url: https://git/ops/runbooks/checkout-latency
+        data:
+          - refId: A
+            datasourceUid: "${GRAFANA_PROM_DS_UID}"
+            # Query window in seconds before evaluation time; must be non-empty.
+            relativeTimeRange:
+              from: 600
+              to: 0
+            model:
+              interval: ""
+              intervalFactor: 2
+              legendFormat: p95
+              refId: A
+              expr: |
+                histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="checkout"}[5m])) by (le))
+              range: true
+              # Keep quoted and in block form: `${...}` inside a flow mapping
+              # (`{uid: ${...}}`) is a YAML syntax error.
+              datasource:
+                uid: "${GRAFANA_PROM_DS_UID}"
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              refId: B
+              expression: A
+              reducer: last
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              type: threshold
+              refId: C
+              expression: B
+              conditions:
+                - evaluator:
+                    type: gt
+                    params: [2]
+      - uid: checkout-error-rate
+        title: checkout error-rate > 3%
+        condition: C
+        for: 5m
+        labels:
+          service: checkout
+          slo: error-rate
+          env: staging
+        annotations:
+          runbook_url: https://git/ops/runbooks/checkout-error-rate
+        data:
+          - refId: A
+            datasourceUid: "${GRAFANA_PROM_DS_UID}"
+            relativeTimeRange:
+              from: 600
+              to: 0
+            model:
+              refId: A
+              # Share of checkout requests answered with 5xx. `or vector(0)`
+              # keeps the ratio defined when no 5xx series exists; clamp_min
+              # (PromQL, not a Grafana math function) floors the denominator
+              # at 1 req/s so an idle service cannot divide by zero.
+              expr: |
+                (sum(rate(http_requests_total{service="checkout",status=~"5.."}[5m])) or vector(0))
+                / clamp_min(sum(rate(http_requests_total{service="checkout"}[5m])), 1)
+              range: true
+              datasource:
+                uid: "${GRAFANA_PROM_DS_UID}"
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              refId: B
+              expression: A
+              reducer: last
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              type: threshold
+              refId: C
+              expression: B
+              conditions:
+                - evaluator:
+                    type: gt
+                    params: [0.03]
diff --git a/astronomy-demo/k6/checkout.js b/astronomy-demo/k6/checkout.js
new file mode 100644
index 0000000..1ed6442
--- /dev/null
+++ b/astronomy-demo/k6/checkout.js
@@ -0,0 +1,55 @@
+import http from 'k6/http';
+import { check, sleep } from 'k6';
+
+// Tunables, all overridable with `k6 run -e NAME=value`.
+const BASE_URL = __ENV.BASE_URL || 'http://localhost:8080';
+const RATE = Number(__ENV.CHECKOUT_RATE_PER_SEC || 2); // iterations started per second
+const DURATION = __ENV.TEST_DURATION || '10m';
+
+// Scenario definition: iterations are started at a constant rate for DURATION,
+// drawing on a VU pool sized from RATE.
+const steadyCheckout = {
+  executor: 'constant-arrival-rate',
+  rate: RATE,
+  timeUnit: '1s',
+  duration: DURATION,
+  preAllocatedVUs: Math.max(10, RATE * 2),
+  maxVUs: Math.max(50, RATE * 4),
+  tags: { service: 'checkout', route: '/api/checkout' },
+};
+
+// Thresholds mirror the SLOs: p95 request duration below 2000 ms, failures below 3%.
+export const options = {
+  scenarios: { steady_checkout: steadyCheckout },
+  thresholds: { http_req_duration: ['p(95)<2000'], http_req_failed: ['rate<0.03'] },
+};
+
+// VU entry point: one add-to-cart request followed by one checkout request.
+// Adjust endpoints and payloads to match your frontend/cart routes.
+export default function () {
+  const headers = { 'Content-Type': 'application/json' };
+
+  // Add to cart
+  const addRes = http.post(`${BASE_URL}/api/cart`, JSON.stringify({ productId: 'extreme-astronomy-binoculars', quantity: 1 }), { headers });
+  check(addRes, {
+    'add-to-cart status is 2xx': (r) => r.status >= 200 && r.status < 300,
+  });
+
+  // Checkout
+  const payload = {
+    email: 'demo@example.com',
+    address: {
+      street: '1 Space Way', city: 'Andromeda', state: 'OT', zip: '424242', country: 'US',
+    },
+    creditCard: {
+      number: '4111111111111111', ccv: '737', expMonth: 12, expYear: 2030,
+    },
+  };
+  const res = http.post(`${BASE_URL}/api/checkout`, JSON.stringify(payload), { headers });
+  check(res, {
+    'checkout status is 2xx/3xx': (r) => r.status >= 200 && r.status < 400,
+  });
+
+  // Deliberately no sleep(): the arrival-rate executor paces iterations; a pause only pins VUs.
+}
+
diff --git a/astronomy-demo/kustomize/overlays/cart-remediation/kustomization.yaml b/astronomy-demo/kustomize/overlays/cart-remediation/kustomization.yaml
new file mode 100644
index 0000000..ab1e97e
--- /dev/null
+++ b/astronomy-demo/kustomize/overlays/cart-remediation/kustomization.yaml
@@ -0,0 +1,14 @@
+# Remediation overlay: sets cartservice replicas and CPU/memory requests/limits.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+# NOTE(review): no base is listed; the patches presumably match nothing until the demo manifests are added here.
+resources: []
+patches:
+  - path: patch-replicas.yaml
+    target:
+      kind: Deployment
+      name: cartservice
+  - path: patch-resources.yaml
+    target:
+      kind: Deployment
+      name: cartservice
diff --git a/astronomy-demo/kustomize/overlays/cart-remediation/patch-replicas.yaml b/astronomy-demo/kustomize/overlays/cart-remediation/patch-replicas.yaml
new file mode 100644
index 0000000..6279f46
--- /dev/null
+++ b/astronomy-demo/kustomize/overlays/cart-remediation/patch-replicas.yaml
@@ -0,0 +1,6 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: cartservice  # Deployment this patch is written for
+spec:
+  replicas: 4  # replica count set by the cart-remediation overlay
diff --git a/astronomy-demo/kustomize/overlays/cart-remediation/patch-resources.yaml b/astronomy-demo/kustomize/overlays/cart-remediation/patch-resources.yaml
new file mode 100644
index 0000000..3f93d0a
--- /dev/null
+++ b/astronomy-demo/kustomize/overlays/cart-remediation/patch-resources.yaml
@@ -0,0 +1,16 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: cartservice
+spec:
+  template:
+    spec:
+      containers:
+        - name: server  # NOTE(review): presumably must equal the base container name - confirm
+          resources:
+            limits:
+              cpu: '1000m'
+              memory: '512Mi'
+            requests:
+              cpu: '300m'
+              memory: '256Mi'
diff --git a/astronomy-demo/litmus/engines/cart-cpu-hog.yaml b/astronomy-demo/litmus/engines/cart-cpu-hog.yaml
new file mode 100644
index 0000000..e7f7f7f
--- /dev/null
+++ b/astronomy-demo/litmus/engines/cart-cpu-hog.yaml
@@ -0,0 +1,30 @@
+# ChaosEngine that runs the pod-cpu-hog experiment against the cart pods.
+# NOTE(review): `applabel` must match the labels on the target Deployment;
+# confirm with `kubectl get deploy -n otel-demo --show-labels`.
+apiVersion: litmuschaos.io/v1alpha1
+kind: ChaosEngine
+metadata:
+  name: cart-cpu-hog
+  namespace: otel-demo
+spec:
+  # Litmus documents this field as mandatory: `active` runs the experiment,
+  # `stop` aborts it.
+  engineState: active
+  appinfo:
+    appns: otel-demo
+    applabel: "app=cartservice"
+    appkind: deployment
+  chaosServiceAccount: litmus-admin
+  experiments:
+    - name: pod-cpu-hog
+      spec:
+        components:
+          env:
+            # Chaos duration in seconds.
+            - name: TOTAL_CHAOS_DURATION
+              value: '600'
+            - name: CPU_CORES
+              value: '1'
+            # Percentage of matching pods to target.
+            - name: PODS_AFFECTED_PERC
+              value: '100'
diff --git a/astronomy-demo/litmus/engines/cart-network-latency.yaml b/astronomy-demo/litmus/engines/cart-network-latency.yaml
new file mode 100644
index 0000000..d1bfdc7
--- /dev/null
+++ b/astronomy-demo/litmus/engines/cart-network-latency.yaml
@@ -0,0 +1,35 @@
+# ChaosEngine that runs the pod-network-latency experiment against the cart pods.
+# NOTE(review): `applabel` must match the labels on the target Deployment;
+# confirm with `kubectl get deploy -n otel-demo --show-labels`.
+apiVersion: litmuschaos.io/v1alpha1
+kind: ChaosEngine
+metadata:
+  name: cart-network-latency
+  namespace: otel-demo
+spec:
+  # Litmus documents this field as mandatory: `active` runs the experiment,
+  # `stop` aborts it.
+  engineState: active
+  appinfo:
+    appns: otel-demo
+    applabel: "app=cartservice"
+    appkind: deployment
+  chaosServiceAccount: litmus-admin
+  experiments:
+    - name: pod-network-latency
+      spec:
+        components:
+          env:
+            # Chaos duration in seconds.
+            - name: TOTAL_CHAOS_DURATION
+              value: '600'
+            # Injected delay in milliseconds.
+            # NOTE(review): no destination filter is set here, so the delay
+            # presumably applies to all cart traffic, not only cart -> datastore.
+            - name: NETWORK_LATENCY
+              value: '400'
+            - name: JITTER
+              value: '0'
+            # Percentage of matching pods to target.
+            - name: PODS_AFFECTED_PERC
+              value: '100'
diff --git a/astronomy-demo/litmus/experiments/pod-cpu-hog.yaml b/astronomy-demo/litmus/experiments/pod-cpu-hog.yaml
new file mode 100644
index 0000000..20bee6b
--- /dev/null
+++ b/astronomy-demo/litmus/experiments/pod-cpu-hog.yaml
@@ -0,0 +1,38 @@
+# ChaosExperiment definition for pod-cpu-hog. The `env` entries are defaults;
+# a ChaosEngine can override them under `components.env`.
+apiVersion: litmuschaos.io/v1alpha1
+kind: ChaosExperiment
+metadata:
+  name: pod-cpu-hog
+  labels:
+    litmuschaos.io/name: pod-cpu-hog
+spec:
+  definition:
+    scope: Namespaced
+    permissions:
+      - apiGroups: ['']
+        resources: ['pods', 'pods/log']
+        verbs: ['create', 'list', 'get', 'update', 'patch', 'delete']
+      - apiGroups: ['']
+        resources: ['events']
+        verbs: ['create', 'list', 'get', 'update', 'patch']
+      - apiGroups: ['apps']
+        resources: ['deployments']
+        verbs: ['list', 'get']
+    image: litmuschaos/go-runner:latest
+    # A floating `latest` tag with IfNotPresent pins each node to whatever it
+    # cached first; always pull so every node runs the same runner build.
+    imagePullPolicy: Always
+    args:
+      - -c
+      - ./experiments -name pod-cpu-hog
+    command: ["/bin/bash"]
+    env:
+      - name: TOTAL_CHAOS_DURATION
+        value: '600'
+      - name: CPU_CORES
+        value: '1'
+      - name: PODS_AFFECTED_PERC
+        value: '100'
+      - name: SEQUENCE
+        value: parallel
diff --git a/astronomy-demo/litmus/experiments/pod-network-latency.yaml b/astronomy-demo/litmus/experiments/pod-network-latency.yaml
new file mode 100644
index 0000000..9fc346b
--- /dev/null
+++ b/astronomy-demo/litmus/experiments/pod-network-latency.yaml
@@ -0,0 +1,46 @@
+# ChaosExperiment definition for pod-network-latency. The `env` entries are
+# defaults; a ChaosEngine can override them under `components.env`.
+apiVersion: litmuschaos.io/v1alpha1
+kind: ChaosExperiment
+metadata:
+  name: pod-network-latency
+  labels:
+    litmuschaos.io/name: pod-network-latency
+spec:
+  definition:
+    scope: Namespaced
+    permissions:
+      - apiGroups: ['']
+        resources: ['pods', 'pods/log']
+        verbs: ['create', 'list', 'get', 'update', 'patch', 'delete']
+      - apiGroups: ['']
+        resources: ['events']
+        verbs: ['create', 'list', 'get', 'update', 'patch']
+      - apiGroups: ['apps']
+        resources: ['deployments']
+        verbs: ['list', 'get']
+    image: litmuschaos/go-runner:latest
+    # A floating `latest` tag with IfNotPresent pins each node to whatever it
+    # cached first; always pull so every node runs the same runner build.
+    imagePullPolicy: Always
+    args:
+      - -c
+      - ./experiments -name pod-network-latency
+    command: ["/bin/bash"]
+    env:
+      - name: TOTAL_CHAOS_DURATION
+        value: '600'
+      - name: NETWORK_INTERFACE
+        value: eth0
+      - name: LIB_IMAGE
+        value: litmuschaos/go-runner:latest
+      - name: TC_IMAGE
+        value: gaiadocker/iproute2
+      - name: TARGET_CONTAINER
+        value: ''
+      - name: JITTER
+        value: '0'
+      - name: NETWORK_LATENCY
+        value: '400'
+      - name: PODS_AFFECTED_PERC
+        value: '100'