AlpNuhoglu · AlpNuhoglu · Jun 20, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/cmd/outbox-relay/main.go b/cmd/outbox-relay/main.go
@@ -68,6 +68,7 @@ func main() {
 		BatchSize:    cfg.OutboxBatchSize,
 		PollInterval: cfg.OutboxPollInterval,
 		Workers:      cfg.OutboxWorkers,
+		MaxAttempts:  cfg.OutboxMaxAttempts,
 	}, m, log)
 
 	ctx, stop := server.ShutdownContext()

diff --git a/config/prometheus/alerts.yml b/config/prometheus/alerts.yml
@@ -0,0 +1,53 @@
+# Alerting rules for the transactional outbox relay. These turn the relay's
+# instruments (see docs/outbox.md §9) into actionable signals: a growing backlog
+# means the relay is falling behind or NATS is unreachable; a dead-lettered row
+# means an event will never be delivered without operator action.
+groups:
+  - name: outbox-relay
+    rules:
+      # Any dead-lettered row is page-worthy: the event is permanently stuck in
+      # FAILED and a human must inspect last_error and re-queue or drop it.
+      - alert: OutboxEventsDeadLettered
+        expr: increase(gamemesh_outbox_events_dead_lettered_total[5m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+          component: outbox-relay
+        annotations:
+          summary: "Outbox rows dead-lettered (service {{ $labels.service }})"
+          description: >
+            {{ $value | printf "%.0f" }} outbox row(s) moved to FAILED in the last
+            5m. They will never be published without operator action — inspect
+            last_error and re-queue or drop them.
+
+      # A persistently rising backlog means the relay cannot keep up or NATS is
+      # down. Rows are durable (no loss), so this is a warning, not a page.
+      - alert: OutboxBacklogGrowing
+        expr: gamemesh_outbox_events_pending > 500
+        for: 10m
+        labels:
+          severity: warning
+          component: outbox-relay
+        annotations:
+          summary: "Outbox backlog growing (service {{ $labels.service }})"
+          description: >
+            {{ $value | printf "%.0f" }} PENDING outbox rows for over 10m. The
+            relay is falling behind or NATS is unreachable. Events are safe but
+            delayed — check NATS connectivity and relay throughput/replicas.
+
+      # Publishes failing while nothing is succeeding = NATS is down. Distinct
+      # from a backlog: this fires fast on the failure rate, not the queue depth.
+      - alert: OutboxPublishStalled
+        expr: |
+          rate(gamemesh_outbox_publish_failures_total[5m]) > 0
+          and rate(gamemesh_outbox_events_published_total[5m]) == 0
+        for: 5m
+        labels:
+          severity: critical
+          component: outbox-relay
+        annotations:
+          summary: "Outbox publishing stalled (service {{ $labels.service }})"
+          description: >
+            Publish attempts are failing and nothing is succeeding for 5m — NATS
+            is likely down. Rows stay PENDING and retry; if this persists they
+            will eventually dead-letter (see OutboxEventsDeadLettered).
diff --git a/config/prometheus/prometheus.yml b/config/prometheus/prometheus.yml
@@ -5,6 +5,10 @@ global:
   scrape_interval: 5s
   evaluation_interval: 5s
 
+# Alerting rules (outbox relay SLOs). Evaluated every evaluation_interval.
+rule_files:
+  - /etc/prometheus/alerts.yml
+
 scrape_configs:
   - job_name: gateway
     static_configs:

diff --git a/deployments/k8s/15-outbox-relay.yaml b/deployments/k8s/15-outbox-relay.yaml
@@ -0,0 +1,90 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: outbox-relay
+  namespace: gamemesh
+spec:
+  # Three replicas for HA. The relay claims rows with FOR UPDATE SKIP LOCKED
+  # (internal/outbox/store.go), so replicas never publish the same outbox row —
+  # running more than one removes the relay as a single point of failure between
+  # persistence and event transport.
+  replicas: 3
+  selector:
+    matchLabels:
+      app: outbox-relay
+  template:
+    metadata:
+      labels:
+        app: outbox-relay
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8085"
+        prometheus.io/path: /metrics
+    spec:
+      containers:
+        - name: outbox-relay
+          image: gamemesh/outbox-relay:latest
+          imagePullPolicy: IfNotPresent
+          ports:
+            - containerPort: 8085
+          env:
+            - name: HTTP_PORT
+              value: "8085"
+            - name: NATS_URL
+              value: nats://nats:4222
+            - name: OUTBOX_ENABLED
+              value: "true"
+            - name: OUTBOX_BATCH_SIZE
+              value: "100"
+            - name: OUTBOX_POLL_INTERVAL
+              value: "1s"
+            - name: OUTBOX_WORKERS
+              value: "4"
+            # Dead-letter a row after this many failed publish attempts
+            # (0 = retry forever). Caps poison rows from wedging a worker.
+            - name: OUTBOX_MAX_ATTEMPTS
+              value: "5"
+            - name: POSTGRES_DSN
+              value: host=postgres user=$(POSTGRES_USER) password=$(POSTGRES_PASSWORD) dbname=$(POSTGRES_DB) port=5432 sslmode=disable
+          envFrom:
+            - configMapRef:
+                name: gamemesh-config
+            - secretRef:
+                name: gamemesh-secrets
+          readinessProbe:
+            httpGet: { path: /healthz, port: 8085 }
+            initialDelaySeconds: 3
+            periodSeconds: 5
+          livenessProbe:
+            httpGet: { path: /healthz, port: 8085 }
+            initialDelaySeconds: 10
+            periodSeconds: 10
+          resources:
+            requests: { cpu: 100m, memory: 64Mi }
+            limits: { cpu: 500m, memory: 128Mi }
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: outbox-relay
+  namespace: gamemesh
+spec:
+  selector:
+    app: outbox-relay
+  ports:
+    - port: 8085
+      targetPort: 8085
+---
+# Keep at least 2 relays running during voluntary disruptions (node drains,
+# rollouts). Without this a cluster autoscaler or upgrade could evict all three
+# replicas at once and stall outbox publishing until they reschedule.
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: outbox-relay
+  namespace: gamemesh
+spec:
+  minAvailable: 2
+  selector:
+    matchLabels:
+      app: outbox-relay
diff --git a/deployments/k8s/30-monitoring.yaml b/deployments/k8s/30-monitoring.yaml
@@ -37,6 +37,9 @@ data:
   prometheus.yml: |
     global:
       scrape_interval: 5s
+      evaluation_interval: 5s
+    rule_files:
+      - /etc/prometheus/alerts.yml
     scrape_configs:
       - job_name: gamemesh-pods
         kubernetes_sd_configs:
@@ -54,6 +57,39 @@ data:
             target_label: __address__
           - source_labels: [__meta_kubernetes_pod_label_app]
             target_label: app
+  alerts.yml: |
+    # Outbox relay alerting rules (mirror of config/prometheus/alerts.yml).
+    groups:
+      - name: outbox-relay
+        rules:
+          - alert: OutboxEventsDeadLettered
+            expr: increase(gamemesh_outbox_events_dead_lettered_total[5m]) > 0
+            for: 0m
+            labels: { severity: critical, component: outbox-relay }
+            annotations:
+              summary: "Outbox rows dead-lettered (service {{ $labels.service }})"
+              description: >
+                {{ $value | printf "%.0f" }} outbox row(s) moved to FAILED in the
+                last 5m. They will never publish without operator action.
+          - alert: OutboxBacklogGrowing
+            expr: gamemesh_outbox_events_pending > 500
+            for: 10m
+            labels: { severity: warning, component: outbox-relay }
+            annotations:
+              summary: "Outbox backlog growing (service {{ $labels.service }})"
+              description: >
+                {{ $value | printf "%.0f" }} PENDING rows for over 10m — relay
+                behind or NATS unreachable. Events safe but delayed.
+          - alert: OutboxPublishStalled
+            expr: |
+              rate(gamemesh_outbox_publish_failures_total[5m]) > 0
+              and rate(gamemesh_outbox_events_published_total[5m]) == 0
+            for: 5m
+            labels: { severity: critical, component: outbox-relay }
+            annotations:
+              summary: "Outbox publishing stalled (service {{ $labels.service }})"
+              description: >
+                Publishes failing and none succeeding for 5m — NATS likely down.
 ---
 apiVersion: apps/v1
 kind: Deployment

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -202,6 +202,9 @@ services:
       OUTBOX_BATCH_SIZE: ${OUTBOX_BATCH_SIZE:-100}
       OUTBOX_POLL_INTERVAL: ${OUTBOX_POLL_INTERVAL:-1s}
       OUTBOX_WORKERS: ${OUTBOX_WORKERS:-4}
+      # Dead-letter a row after this many failed publish attempts (0 = retry
+      # forever). 5 lets transient NATS blips self-heal but caps poison rows.
+      OUTBOX_MAX_ATTEMPTS: ${OUTBOX_MAX_ATTEMPTS:-5}
     depends_on:
       postgres:
         condition: service_healthy
@@ -250,7 +253,8 @@ services:
     ports:
       - "9090:9090"
     volumes:
-      - ./config/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      # Mount the whole dir so prometheus.yml and alerts.yml are both available.
+      - ./config/prometheus:/etc/prometheus:ro
     depends_on:
       - gateway
 

diff --git a/docs/outbox.md b/docs/outbox.md
@@ -126,7 +126,9 @@ CREATE INDEX idx_outbox_pending
 - **Partial index on `PENDING`.** The relay's only hot query is "oldest
   unpublished rows." A partial index stays small as `PUBLISHED` rows accumulate,
   keeping the poll `O(batch)` regardless of total table size.
-- **Two states, no `FAILED`.** See §6.
+- **Three states (`PENDING` → `PUBLISHED`, or `PENDING` → `FAILED`).** A failed
+  publish stays `PENDING` and retries; a row that exhausts `OUTBOX_MAX_ATTEMPTS`
+  is dead-lettered to `FAILED`. See §6.
 
 ---
 
@@ -150,7 +152,9 @@ transaction**:
 2. **Publish** the batch across a **bounded worker pool** of `OUTBOX_WORKERS`
    (`internal/outbox/relay.go: publishAll`) — never one goroutine per event.
 3. **Mark** successfully published rows `PUBLISHED` (stamping `published_at`) and
-   **bump `attempt_count`** on the failures, which stay `PENDING`.
+   **bump `attempt_count`** on the failures, which stay `PENDING` — unless a row
+   reaches `OUTBOX_MAX_ATTEMPTS`, in which case it is moved to `FAILED`
+   (dead-lettered) in the same statement so it is never polled again.
 4. **Commit.** The rows stayed locked for the whole cycle, so no other replica
    touched them.
 
@@ -167,6 +171,7 @@ in-flight batch's transaction completes or rolls back cleanly.
 | `OUTBOX_BATCH_SIZE`    | `100`   | Rows claimed per poll.               |
 | `OUTBOX_POLL_INTERVAL` | `1s`    | Delay between polls when idle.       |
 | `OUTBOX_WORKERS`       | `4`     | Bounded publish concurrency / batch. |
+| `OUTBOX_MAX_ATTEMPTS`  | `0`     | Dead-letter a row to `FAILED` after this many failed publishes (`0` = retry forever). |
 
 The player service always **writes** to the outbox regardless of
 `OUTBOX_ENABLED`; the flag only governs the relay.
@@ -191,20 +196,26 @@ other event. The outbox is invisible past the publish.
         insert (in business tx)
 PENDING ───────────────────────► (relay publishes) ──success──► PUBLISHED
    ▲                                     │
-   └───────────── failure ───────────────┘  (attempt_count++ , stays PENDING)
+   ├───────── failure (attempt < max) ───┘  (attempt_count++ , stays PENDING)
+   │
+   └───────── failure (attempt = max) ──────────────────────────► FAILED (dead-letter)
 ```
 
-**Only two states.** A failed publish is simply a `PENDING` row that is retried
-on the next poll, so a transient NATS outage **self-heals** with no operator
-action and nothing to reconcile. There is intentionally **no `FAILED` state**:
-it would add a manual recovery step for a condition that resolves itself.
-`attempt_count` is enough to alert on a genuinely poisoned row (e.g. one that
-keeps failing) without a state machine.
+**Self-healing, with a poison-row backstop.** A failed publish is normally just a
+`PENDING` row that is retried on the next poll, so a transient NATS outage
+**self-heals** with no operator action. To stop a genuinely poisoned row (one
+that can *never* publish — e.g. malformed payload) from being retried forever and
+tying up a worker, a row is moved to `FAILED` once `attempt_count` reaches
+`OUTBOX_MAX_ATTEMPTS`. `FAILED` rows drop out of the `PENDING` poll, carry a
+`last_error`, and increment `gamemesh_outbox_events_dead_lettered_total` so an
+operator can alert, inspect, and re-queue them. Setting `OUTBOX_MAX_ATTEMPTS=0`
+disables the backstop and restores the original retry-forever behaviour.
 
 | Failure                                          | Outcome                                                                 |
 | ------------------------------------------------ | ----------------------------------------------------------------------- |
 | Crash between `COMMIT` and any publish           | Row is durable & `PENDING`; relay publishes it on next poll. **No loss.** |
 | NATS down when relay polls                       | `Publish` fails; row stays `PENDING`, `attempt_count++`; retried later. |
+| Row fails `OUTBOX_MAX_ATTEMPTS` times            | Row moved to `FAILED` (dead-letter); no longer polled; metric + `last_error` for triage. |
 | Crash **after** NATS publish, **before** mark    | Row stays `PENDING`; relay republishes → **duplicate** (see §7).        |
 | Business `INSERT` fails (e.g. duplicate username)| Whole tx rolls back; **no orphan outbox row**.                          |
 | Two relay replicas poll simultaneously           | `SKIP LOCKED` hands each row to exactly one replica.                    |
@@ -278,11 +289,25 @@ following the existing `gamemesh_*` convention with a `service` const label:
 | `gamemesh_outbox_events_pending`         | Gauge     | Current `PENDING` backlog (per poll).    |
 | `gamemesh_outbox_events_published_total` | Counter   | Rows successfully relayed.               |
 | `gamemesh_outbox_publish_failures_total` | Counter   | Publish attempts that failed (retried).  |
+| `gamemesh_outbox_events_dead_lettered_total`| Counter| Rows moved to `FAILED` after `OUTBOX_MAX_ATTEMPTS` (poison rows). |
 | `gamemesh_outbox_publish_duration_seconds`| Histogram| Per-row publish latency.                 |
 
 **Operational signals:** a steadily rising `pending` gauge means the relay is
 falling behind or NATS is unreachable; a rising `publish_failures_total` with a
-flat `published_total` means NATS is down (rows are safe, just delayed).
+flat `published_total` means NATS is down (rows are safe, just delayed). Any
+increase in `dead_lettered_total` is a **page-worthy** signal — an event will
+never be delivered without operator action, so alert on
+`increase(gamemesh_outbox_events_dead_lettered_total[5m]) > 0`.
+
+**Alert rules.** These signals are shipped as Prometheus rules in
+`config/prometheus/alerts.yml` (compose) and the `prometheus-config` ConfigMap in
+`deployments/k8s/30-monitoring.yaml` (k8s):
+
+| Alert                     | Severity | Fires when                                                       |
+| ------------------------- | -------- | ---------------------------------------------------------------- |
+| `OutboxEventsDeadLettered`| critical | any row dead-letters in 5m (permanent loss without action).      |
+| `OutboxBacklogGrowing`    | warning  | `pending > 500` for 10m (relay behind or NATS unreachable).      |
+| `OutboxPublishStalled`    | critical | publishes failing and none succeeding for 5m (NATS down).        |
 
 ---
 
@@ -297,9 +322,12 @@ flat `published_total` means NATS is down (rows are safe, just delayed).
   throughput.
 - **Ordering.** The relay processes oldest-first but does not guarantee strict
   global ordering under concurrency. Consumers must not assume cross-event order.
-- **Kubernetes.** The compose deployment runs the relay; the k8s manifests under
-  `deployments/k8s/` are not updated in this change and would need an analogous
-  Deployment to run the relay in-cluster.
+- **Kubernetes.** `deployments/k8s/15-outbox-relay.yaml` runs the relay in-cluster
+  with **3 replicas** and a `PodDisruptionBudget` of `minAvailable: 2`, so node
+  drains and rollouts never evict the whole relay at once. The replicas are safe
+  by construction — `FOR UPDATE SKIP LOCKED` hands each outbox row to exactly one
+  of them — which removes the relay as a single point of failure between
+  persistence and event transport.
 
 ---
 

diff --git a/internal/outbox/publisher.go b/internal/outbox/publisher.go
@@ -4,9 +4,7 @@ import (
 	"context"
 
 	"github.com/google/uuid"
-	"go.opentelemetry.io/otel"
 	"go.opentelemetry.io/otel/attribute"
-	"go.opentelemetry.io/otel/propagation"
 	"go.opentelemetry.io/otel/trace"
 	"gorm.io/gorm"
 
@@ -56,10 +54,7 @@ func (p *Publisher) Publish(ctx context.Context, topic string, e events.Event) e
 	)
 	defer span.End()
 
-	if e.Carrier == nil {
-		e.Carrier = make(map[string]string)
-	}
-	otel.GetTextMapPropagator().Inject(ctx, propagation.MapCarrier(e.Carrier))
+	e.Carrier = tracing.InjectCarrier(ctx, e.Carrier)
 
 	id, err := uuid.Parse(e.ID)
 	if err != nil {