From e9905ca6995dc7d63d4588a8ce0a9d0303a65b76 Mon Sep 17 00:00:00 2001 From: rajashish147 Date: Sat, 4 Apr 2026 03:24:35 +0530 Subject: [PATCH 1/8] Refactor code structure for improved readability and maintainability --- .github/pull_request_template.md | 2 +- .github/workflows/deploy.yml | 325 ++-- .github/workflows/pr.yml | 54 +- .gitignore | 11 - CHANGELOG.md | 20 +- CONTRIBUTING.md | 1 - README.md | 223 ++- docs/ARCHITECTURE.md | 51 +- docs/DEPLOYMENT.md | 143 +- docs/OBSERVABILITY_ARCHITECTURE.md | 309 +--- docs/ROLLBACK_QUICKREF.md | 43 +- docs/ROLLBACK_SYSTEM.md | 38 +- docs/SLO.md | 8 +- docs/env-contract.md | 28 +- docs/infra-contract.md | 20 + docs/walkthrough.md | 116 +- infra/.env.monitoring.example | 61 - infra/alertmanager/alertmanager.yml | 65 - infra/blackbox/blackbox.yml | 21 - infra/docker-compose.monitoring.yml | 264 --- infra/docker-compose.nginx.yml | 44 + infra/docker-compose.redis.yml | 38 + infra/grafana/dashboards/fieldtrack.json | 680 -------- .../provisioning/dashboards/dashboard.yml | 15 - .../provisioning/datasources/prometheus.yml | 13 - infra/loki/loki-config.yaml | 47 - infra/nginx/api.conf | 274 --- infra/prometheus/alerts.yml | 559 ------ infra/prometheus/prometheus.yml | 101 -- infra/promtail/promtail.yml | 62 - infra/scripts/render-alertmanager.sh | 133 -- infra/scripts/verify-alertmanager.sh | 192 -- infra/tempo/tempo.yml | 43 - package.json | 3 +- scripts/analytics-backfill.ts | 242 --- scripts/deploy-bluegreen.sh | 1539 ----------------- scripts/deploy.sh | 1217 +++++++++++++ scripts/load-env.sh | 97 -- scripts/load-testing/README.md | 127 -- scripts/load-testing/dashboard-load-test.js | 124 -- scripts/load-testing/expenses-load-test.js | 134 -- scripts/load-testing/map-load-test.js | 92 - scripts/load-testing/queue-impact-test.js | 146 -- scripts/monitoring-sync.sh | 344 ---- scripts/rollback.sh | 114 -- scripts/smoke-test.sh | 445 ----- scripts/validate-env.sh | 289 ---- scripts/verify-stabilization.sh | 372 +--- 
scripts/vps-readiness-check.sh | 35 +- scripts/vps-setup.sh | 528 ------ src/config/env.ts | 7 +- src/routes/events.routes.ts | 2 +- src/routes/health.ts | 20 +- src/server.ts | 54 +- src/tracing.ts | 9 +- tests/setup/env-setup.ts | 3 + vitest.config.ts | 4 + 57 files changed, 1826 insertions(+), 8125 deletions(-) create mode 100644 docs/infra-contract.md delete mode 100644 infra/.env.monitoring.example delete mode 100644 infra/alertmanager/alertmanager.yml delete mode 100644 infra/blackbox/blackbox.yml delete mode 100644 infra/docker-compose.monitoring.yml create mode 100644 infra/docker-compose.nginx.yml create mode 100644 infra/docker-compose.redis.yml delete mode 100644 infra/grafana/dashboards/fieldtrack.json delete mode 100644 infra/grafana/provisioning/dashboards/dashboard.yml delete mode 100644 infra/grafana/provisioning/datasources/prometheus.yml delete mode 100644 infra/loki/loki-config.yaml delete mode 100644 infra/nginx/api.conf delete mode 100644 infra/prometheus/alerts.yml delete mode 100644 infra/prometheus/prometheus.yml delete mode 100644 infra/promtail/promtail.yml delete mode 100644 infra/scripts/render-alertmanager.sh delete mode 100644 infra/scripts/verify-alertmanager.sh delete mode 100644 infra/tempo/tempo.yml delete mode 100644 scripts/analytics-backfill.ts delete mode 100644 scripts/deploy-bluegreen.sh create mode 100644 scripts/deploy.sh delete mode 100644 scripts/load-env.sh delete mode 100644 scripts/load-testing/README.md delete mode 100644 scripts/load-testing/dashboard-load-test.js delete mode 100644 scripts/load-testing/expenses-load-test.js delete mode 100644 scripts/load-testing/map-load-test.js delete mode 100644 scripts/load-testing/queue-impact-test.js delete mode 100644 scripts/monitoring-sync.sh delete mode 100644 scripts/rollback.sh delete mode 100644 scripts/smoke-test.sh delete mode 100644 scripts/validate-env.sh delete mode 100644 scripts/vps-setup.sh diff --git a/.github/pull_request_template.md 
b/.github/pull_request_template.md index 93109a4..d599dfe 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -52,7 +52,7 @@ Closes # ## Final Checklist - [ ] PR title follows conventional commit format (`type(scope): description`) -- [ ] Branch name follows convention (`feat/*`, `fix/*`, `infra/*`, etc.) +- [ ] Branch name follows convention (`feat/*`, `fix/*`, `docs/*`, etc.) - [ ] No debug logs, commented-out code, or `TODO` / `FIXME` left in diff - [ ] No secrets or credentials committed - [ ] Relevant documentation updated (if applicable) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 66a7dd5..66c54ea 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -20,9 +20,9 @@ # ┘ │ # api-health-gate ◄────────┘ # │ -# sync-infra ─► sync-monitoring ─► health-and-smoke -# │ -# rollback ◄──────────────────────────────┘ (on failure) +# health-and-smoke +# │ +# rollback ◄───────────────── (on failure) name: Deploy to Production @@ -211,6 +211,69 @@ jobs: - name: Run all tests run: npm test + # --------------------------------------------------------------------------- + # JOB: infra-leakage-guard + # + # Pre-deploy safety gate: ensures the API repo has not re-introduced + # references to infra concerns (monitoring stack, /ready in deploy path). + # Runs in parallel with validate and test-api. + # + # Guards: + # 1. No alertmanager/docker-compose.monitoring client code in src/ or tests/ + # 2. No docker-compose.monitoring references in deploy.sh or deploy.yml executable steps + # 3. 
No /ready usage in scripts/deploy.sh (health gate must use /health only) + # --------------------------------------------------------------------------- + infra-leakage-guard: + name: Infra Leakage Guard + runs-on: ubuntu-latest + needs: [codeql-gate] + timeout-minutes: 5 + steps: + - name: Checkout + uses: actions/checkout@v5 + with: + ref: ${{ needs.codeql-gate.outputs.deploy_sha }} + + - name: Block monitoring infra client references in API source + run: | + # The API legitimately uses prom-client (prometheus.ts plugin) and emits + # OTLP traces. What must NOT appear is external infra client code — + # i.e., direct references to alertmanager, loki push clients, or + # docker-compose.monitoring in the application source. + # Exclude comment-only lines (-h suppresses filenames for grep -Ev). + LEAKS=$(grep -rhE "(alertmanager|docker-compose\.monitoring)" src/ tests/ 2>/dev/null \ + | grep -Ev '^\s*(//|#|\*|/\*)') + if [ -n "$LEAKS" ]; then + echo "::error::Infra client references found in src/ or tests/" + echo "$LEAKS" + exit 1 + fi + echo "✓ No alertmanager/monitoring-compose references in src/ or tests/" + + - name: Block docker-compose.monitoring references in deploy path + run: | + # deploy.sh is the only script in the deploy path. + if grep -E "docker-compose\.monitoring" scripts/deploy.sh 2>/dev/null | grep -Ev '^\s*#'; then + echo "::error::deploy.sh references docker-compose.monitoring — deploy must be monitoring-independent" + exit 1 + fi + # Verify deploy.yml does not execute monitoring compose commands. + # Guard comments are allowed; executable command lines are not. 
+ if grep -E "docker-compose\.monitoring|docker compose.*monitoring" .github/workflows/deploy.yml \ + | grep -Ev '(infra-leakage-guard|Block docker|No docker)'; then + echo "::error::deploy.yml workflow references docker-compose.monitoring outside guard comments" + exit 1 + fi + echo "✓ No docker-compose.monitoring references in deploy path" + + - name: Block /ready in deploy path (deploy.sh) + run: | + if grep -E "(/ready)" scripts/deploy.sh | grep -Ev '^\s*#'; then + echo "::error::deploy.sh references /ready — deploy gate must use /health only" + exit 1 + fi + echo "✓ deploy.sh does not reference /ready" + # --------------------------------------------------------------------------- # JOB: build-scan-push # @@ -231,7 +294,7 @@ jobs: build-scan-push: name: Build, Scan & Push Docker Image runs-on: ubuntu-latest - needs: [codeql-gate, validate, test-api] + needs: [codeql-gate, validate, test-api, infra-leakage-guard] timeout-minutes: 25 permissions: contents: read @@ -607,7 +670,7 @@ jobs: script: | set -euo pipefail export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT — run vps-setup.sh first"; exit 1; } + [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" # Pull latest scripts without full deploy git fetch origin master --depth=1 @@ -619,7 +682,7 @@ jobs: # JOB: deploy # # Blue-Green deployment to VPS via SSH. - # The deploy-bluegreen.sh script manages slot switching and container health. + # deploy.sh manages slot switching and container health. 
# # DEPENDENCY GATES (both must pass): # - vps-readiness-check: ensures VPS can accept the deployment @@ -661,27 +724,6 @@ jobs: } >> "$GITHUB_STEP_SUMMARY" echo "[DEPLOY] Deployment initiated — SHA=${{ github.sha }} EVENT=${{ github.event_name }} ACTOR=${{ github.actor }}" - - name: Validate environment contract before deploy - uses: appleboy/ssh-action@v1.0.3 - with: - host: ${{ secrets.DO_HOST }} - username: ${{ secrets.DO_USER }} - key: ${{ secrets.DO_SSH_KEY }} - script: | - set -euo pipefail - export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } - cd "$DEPLOY_ROOT" - # Pin repo to the exact SHA that was built and scanned by CodeQL. - # Prevents stale scripts from running if concurrent commits landed. - git fetch origin - git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }} - chmod +x scripts/*.sh - echo "::group::Environment validation" - ./scripts/validate-env.sh --check-monitoring - echo "::endgroup::" - echo "[DEPLOY] Environment contract validated" - - name: Blue-Green deploy via SSH uses: appleboy/ssh-action@v1.0.3 with: @@ -699,8 +741,7 @@ jobs: git fetch origin git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }} chmod +x scripts/*.sh - # Environment already validated in previous step - ./scripts/deploy-bluegreen.sh "${{ needs.build-scan-push.outputs.sha_short }}" + ./scripts/deploy.sh "${{ needs.build-scan-push.outputs.sha_short }}" echo "[DEPLOY] Deploy completed in $(($(date +%s) - T0))s" - name: Log deployment state (slot + SHA for debugging) @@ -734,9 +775,9 @@ jobs: # --------------------------------------------------------------------------- # JOB: api-health-gate (Step E+) # - # Early API health validation — runs AFTER deploy but BEFORE infra sync. - # Ensures the API container is truly healthy before we sync monitoring/nginx. - # If the API is not healthy at this point, STOP before touching infra. 
+ # Validates the API container is healthy after deploy. + # Ensures /health returns 200 before proceeding to smoke tests. + # If the API is not healthy at this point, rollback is triggered. # --------------------------------------------------------------------------- api-health-gate: name: API Health Gate @@ -744,7 +785,7 @@ jobs: needs: [deploy] timeout-minutes: 5 steps: - - name: Verify API container is healthy before infra sync + - name: Verify API container is healthy after deploy uses: appleboy/ssh-action@v1.0.3 with: host: ${{ secrets.DO_HOST }} @@ -755,7 +796,6 @@ jobs: export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" - source scripts/load-env.sh ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "blue") ACTIVE_CONTAINER="api-$ACTIVE_SLOT" @@ -769,181 +809,53 @@ jobs: for i in $(seq 1 15); do STATUS=$(docker run --rm --network api_network "$FT_CURL_IMG" \ -s -o /dev/null -w "%{http_code}" \ - "http://$ACTIVE_CONTAINER:3000/ready" 2>/dev/null || echo "000") + "http://$ACTIVE_CONTAINER:3000/health" 2>/dev/null || echo "000") if [ "$STATUS" = "200" ]; then - echo "[DEPLOY] API ready (slot=$ACTIVE_SLOT attempt=$i)" + echo "[DEPLOY] API healthy (slot=$ACTIVE_SLOT attempt=$i)" exit 0 fi sleep 2 done - echo "::error::API /ready did not return 200 after 30s" + echo "::error::API /health did not return 200 after 30s" docker logs "$ACTIVE_CONTAINER" --tail 30 >&2 2>/dev/null || true exit 1 - # --------------------------------------------------------------------------- - # JOB: sync-infra - # - # Syncs Nginx config (with slot-aware port substitution). - # Monitoring restarts are handled exclusively by deploy-bluegreen.sh. 
- # --------------------------------------------------------------------------- - sync-infra: - name: Sync Infrastructure (nginx) - runs-on: ubuntu-latest - needs: [api-health-gate] - timeout-minutes: 10 - steps: - - name: Sync infrastructure configs via SSH - uses: appleboy/ssh-action@v1.0.3 - with: - host: ${{ secrets.DO_HOST }} - username: ${{ secrets.DO_USER }} - key: ${{ secrets.DO_SSH_KEY }} - script: | - set -euo pipefail - T0=$(date +%s) - export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } - cd "$DEPLOY_ROOT" - INFRA_DIR="$DEPLOY_ROOT/infra" - NGINX_LIVE="$DEPLOY_ROOT/infra/nginx/live/api.conf" - NGINX_BACKUP_DIR="$DEPLOY_ROOT/infra/nginx/backup" - ACTIVE_SLOT_FILE="/var/run/api/active-slot" - - ACTIVE_SLOT=$(cat "$ACTIVE_SLOT_FILE" 2>/dev/null || echo "blue") - ACTIVE_CONTAINER="api-$ACTIVE_SLOT" - - # Load env from .env — exports DEPLOY_ROOT, API_HOSTNAME, and all - # app variables. DEPLOY_ROOT is already exported above; load-env.sh uses it. 
- source "$DEPLOY_ROOT/scripts/load-env.sh" - - # Ensure live/backup dirs exist - mkdir -p "$(dirname "$NGINX_LIVE")" "$NGINX_BACKUP_DIR" - - echo "::group::Nginx sync (slot=$ACTIVE_SLOT)" - cp "$NGINX_LIVE" "$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" 2>/dev/null || true - NGINX_TMP=$(mktemp /tmp/fieldtrack-nginx.XXXXXX.conf) - sed \ - -e "s|__ACTIVE_CONTAINER__|$ACTIVE_CONTAINER|g" \ - -e "s|__API_HOSTNAME__|$API_HOSTNAME|g" \ - "$INFRA_DIR/nginx/api.conf" > "$NGINX_TMP" - cp "$NGINX_TMP" "$NGINX_LIVE" - rm -f "$NGINX_TMP" - - NGINX_TEST_OUT=$(docker exec nginx nginx -t 2>&1) || { - echo "::error::Nginx config test failed — restoring backup" - printf '%s\n' "$NGINX_TEST_OUT" >&2 - LATEST_BAK=$(ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | head -1 || true) - [ -n "$LATEST_BAK" ] && cp "$LATEST_BAK" "$NGINX_LIVE" - exit 1 - } - docker exec nginx nginx -s reload >/dev/null 2>&1 - echo "[DEPLOY] Nginx reloaded → upstream=$ACTIVE_CONTAINER" - echo "::endgroup::" - - # ROUTING VALIDATION — in-network (source of truth) - sleep 2 - ROUTE_STATUS=$(docker run --rm --network api_network \ - curlimages/curl:8.7.1 -sk -o /dev/null -w "%{http_code}" \ - --max-time 10 https://nginx/health 2>/dev/null || echo "000") - - if [ "$ROUTE_STATUS" = "200" ]; then - echo "[DEPLOY] Nginx routing verified (HTTP $ROUTE_STATUS)" - else - echo "::error::Nginx routing check failed (HTTP $ROUTE_STATUS) — restoring backup" - LATEST_BAK=$(ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | head -1 || true) - [ -n "$LATEST_BAK" ] && cp "$LATEST_BAK" "$NGINX_LIVE" - docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1 || true - exit 1 - fi - - # HTTPS advisory check (non-blocking) - HTTPS_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \ - --resolve "$API_HOSTNAME:443:127.0.0.1" \ - -H "Host: $API_HOSTNAME" \ - "https://127.0.0.1/health" --insecure 2>/dev/null || echo "000") - [ "$HTTPS_STATUS" != "200" ] && \ - echo "[DEPLOY] HTTPS advisory 
status=$HTTPS_STATUS (host→Docker TCP routing; in-network check is authoritative)" - - echo "[DEPLOY] Infra sync completed in $(($(date +%s) - T0))s" - - # --------------------------------------------------------------------------- - # JOB: sync-monitoring (Step F) - # - # Idempotent monitoring stack sync — runs after every deploy. - # Delegates to scripts/monitoring-sync.sh which: - # - Self-heals missing .env.monitoring from example - # - Creates api_network if absent - # - Renders alertmanager.rendered.yml - # - Runs docker compose up -d - # - Validates prometheus / alertmanager / grafana health - # Monitoring is REQUIRED — deploy fails if any required container is unhealthy. - # --------------------------------------------------------------------------- - sync-monitoring: - name: Sync Monitoring Stack - runs-on: ubuntu-latest - needs: [sync-infra] - timeout-minutes: 15 - steps: - - name: Sync and validate monitoring stack via SSH - uses: appleboy/ssh-action@v1.0.3 - with: - host: ${{ secrets.DO_HOST }} - username: ${{ secrets.DO_USER }} - key: ${{ secrets.DO_SSH_KEY }} - script: | - set -euo pipefail - export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } - cd "$DEPLOY_ROOT" - chmod +x scripts/monitoring-sync.sh - ./scripts/monitoring-sync.sh - - - name: Monitoring sync summary - if: always() - run: | - { - echo "## Monitoring Sync" - echo "| Container | Required |" - echo "|---|---|" - echo "| prometheus | ✅ |" - echo "| alertmanager | ✅ |" - echo "| grafana | ✅ |" - } >> "$GITHUB_STEP_SUMMARY" - - - name: Deployment artifact traceability - if: always() - run: | - { - echo "## Deployment Artifacts" - echo "| Field | Value |" - echo "|---|---|" - echo "| Deployment SHA | \`${{ github.sha }}\` |" - echo "| Image Tag | \`fieldtrack-api:${{ needs.get-metadata.outputs.sha_short || github.sha }}\` |" - echo "| Workflow Run | [\#${{ github.run_number }}](${{ github.server_url }}/${{ 
github.repository }}/actions/runs/${{ github.run_id }}) |" - echo "| Triggered By | \`${{ github.event_name }}\` |" - echo "| Commit Message | \`${{ github.event.head_commit.message }}\` |" - } >> "$GITHUB_STEP_SUMMARY" - - # Also output to logs for audit trail - echo "DEPLOYMENT_COMPLETE: SHA=${{ github.sha }} IMAGE=ghcr.io/${{ github.repository_owner }}/api:${{ github.sha }} RUN=${{ github.run_id }}" - # --------------------------------------------------------------------------- # JOB: health-and-smoke # - # Step 1: Poll /health and /ready until they return 200 (up to 60 s each). - # Step 2: Run the full smoke test suite (login + core API flows). + # Post-deploy health verification and CI coupling guard. # Failure here triggers the rollback job automatically. # --------------------------------------------------------------------------- health-and-smoke: name: Health Checks & Smoke Tests runs-on: ubuntu-latest - needs: [sync-infra, sync-monitoring] + needs: [api-health-gate] timeout-minutes: 15 steps: - name: Checkout uses: actions/checkout@v5 + - name: CI guard — deploy.sh must not reference /ready or monitoring stack + run: | + set -euo pipefail + echo "Checking deploy.sh for forbidden references..." + # Exclude comment lines (starting with optional whitespace then #) + if grep -E "(/ready)" scripts/deploy.sh | grep -Ev '^\s*#'; then + echo "::error::deploy.sh references /ready — deploy gate must only use /health" + exit 1 + fi + if grep -E "(prometheus|grafana|alertmanager|loki)" scripts/deploy.sh | grep -Ev '^\s*#'; then + echo "::error::deploy.sh references monitoring stack — deploy must be monitoring-independent" + exit 1 + fi + echo "Validating no local infra coupling..." + if grep -R "infra/" . 
| grep -v "docs/infra-contract.md"; then + echo "::error::Local infra coupling detected" + exit 1 + fi + echo "✓ CI guards passed: no /ready or monitoring references in deploy.sh" + - name: Wait for /health endpoint (via VPS) uses: appleboy/ssh-action@v1.0.3 with: @@ -955,7 +867,8 @@ jobs: export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" - source scripts/load-env.sh + API_BASE_URL=$(grep -E '^API_BASE_URL=' .env | head -1 | cut -d'=' -f2-) + API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1) for i in $(seq 1 30); do # Phase 1: in-network (source of truth) if docker run --rm --network api_network \ @@ -988,7 +901,8 @@ jobs: export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" - source scripts/load-env.sh + API_BASE_URL=$(grep -E '^API_BASE_URL=' .env | head -1 | cut -d'=' -f2-) + API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1) for i in $(seq 1 10); do # Phase 1: in-network (source of truth) if docker run --rm --network api_network \ @@ -1010,45 +924,24 @@ jobs: echo "::error::Final health check failed after 10 attempts" exit 1 - - name: Run smoke tests - env: - API_BASE_URL: ${{ secrets.API_BASE_URL }} - FT_EMP_EMAIL: ${{ secrets.FT_EMP_EMAIL }} - FT_EMP_PASSWORD: ${{ secrets.FT_EMP_PASSWORD }} - FT_ADMIN_EMAIL: ${{ secrets.FT_ADMIN_EMAIL }} - FT_ADMIN_PASSWORD: ${{ secrets.FT_ADMIN_PASSWORD }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} - SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY }} - run: | - chmod +x scripts/smoke-test.sh - ./scripts/smoke-test.sh - - - name: Upload smoke test report - if: always() - uses: actions/upload-artifact@v4 - with: - name: smoke-test-report-${{ github.sha }} - path: smoke-report.json - retention-days: 30 - - name: Deployment summary run: | echo "[DEPLOY] Production 
deployment complete" echo " Commit : ${{ github.sha }}" echo " Health : OK" - echo " Smoke : passed" + echo " Post-deploy checks : passed" # --------------------------------------------------------------------------- # JOB: rollback # - # Triggered automatically when deploy, sync-infra, OR health-and-smoke fails. + # Triggered automatically when deploy or health-and-smoke fails. # Restores the previously healthy Blue-Green slot via the rollback script. # 'if: always()' ensures this job can evaluate even if upstream jobs failed. # --------------------------------------------------------------------------- rollback: name: Rollback Deployment (auto) runs-on: ubuntu-latest - needs: [vps-readiness-check, deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke] + needs: [vps-readiness-check, deploy, api-health-gate, health-and-smoke] timeout-minutes: 10 if: | always() && @@ -1056,8 +949,6 @@ jobs: needs.vps-readiness-check.result == 'failure' || needs.deploy.result == 'failure' || needs.api-health-gate.result == 'failure' || - needs.sync-infra.result == 'failure' || - needs.sync-monitoring.result == 'failure' || needs.health-and-smoke.result == 'failure' ) steps: @@ -1067,8 +958,6 @@ jobs: [ "${{ needs.vps-readiness-check.result }}" = "failure" ] && echo " [ERROR] failed job: vps-readiness-check" || true [ "${{ needs.deploy.result }}" = "failure" ] && echo " [ERROR] failed job: deploy" || true [ "${{ needs.api-health-gate.result }}" = "failure" ] && echo " [ERROR] failed job: api-health-gate" || true - [ "${{ needs.sync-infra.result }}" = "failure" ] && echo " [ERROR] failed job: sync-infra" || true - [ "${{ needs.sync-monitoring.result }}" = "failure" ] && echo " [ERROR] failed job: sync-monitoring" || true [ "${{ needs.health-and-smoke.result }}" = "failure" ] && echo " [ERROR] failed job: health-and-smoke" || true - name: Rollback on VPS @@ -1083,7 +972,7 @@ jobs: [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } 
cd "$DEPLOY_ROOT" chmod +x scripts/*.sh - ./scripts/rollback.sh --auto + ./scripts/deploy.sh --rollback --auto ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "unknown") echo "[DEPLOY] Rollback complete — slot=$ACTIVE_SLOT sha=${{ github.sha }}" diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 89ebacb..cf568a1 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -22,7 +22,6 @@ jobs: timeout-minutes: 5 outputs: api: ${{ steps.filter.outputs.api }} - infra: ${{ steps.filter.outputs.infra }} steps: - uses: actions/checkout@v5 @@ -37,9 +36,6 @@ jobs: - 'package-lock.json' - 'tsconfig.json' - 'vitest.config.ts' - infra: - - 'infra/**' - - '.github/workflows/**' api-ci: name: API CI @@ -229,58 +225,10 @@ jobs: docker network rm ci_api_net docker rmi fieldtrack-api:ci-validation - infra-ci: - name: Infra CI - runs-on: ubuntu-latest - needs: detect-changes - timeout-minutes: 10 - if: always() - steps: - - name: Abort if change detection failed - if: needs.detect-changes.result != 'success' - run: | - echo "❌ Change detection did not succeed (result: ${{ needs.detect-changes.result }}) — cannot safely skip checks" - exit 1 - - - name: Skip if no infra changes - if: needs.detect-changes.outputs.infra != 'true' - run: | - echo "No infra changes — skipping all infra validation" - echo "✓ Infra CI (skipped)" - exit 0 - - - uses: actions/checkout@v5 - if: needs.detect-changes.outputs.infra == 'true' - - - name: Validate nginx config - if: needs.detect-changes.outputs.infra == 'true' - run: | - sed \ - -e 's/__ACTIVE_CONTAINER__/api-blue/g' \ - -e 's/__API_HOSTNAME__/api.test.local/g' \ - infra/nginx/api.conf > /tmp/nginx.conf - - if grep -q '__[A-Z_]*__' /tmp/nginx.conf; then - echo "❌ Unreplaced placeholders" - exit 1 - fi - - mkdir -p /tmp/ssl - openssl req -x509 -nodes -days 1 \ - -newkey rsa:2048 \ - -keyout /tmp/ssl/origin.key \ - -out /tmp/ssl/origin.crt \ - -subj "/CN=localhost" - - docker run --rm \ - -v 
/tmp/nginx.conf:/etc/nginx/conf.d/default.conf:ro \ - -v /tmp/ssl:/etc/ssl/api:ro \ - nginx:1.27-alpine nginx -t - # --------------------------------------------------------------------------- # JOB: codeql-lite # - # Lightweight CodeQL security scan — runs in PARALLEL with api-ci and infra-ci. + # Lightweight CodeQL security scan — runs in PARALLEL with api-ci. # Uses security-extended queries (OWASP Top-10 class) for fast PR feedback. # This job is REQUIRED in branch protection; PRs cannot merge until it passes. # diff --git a/.gitignore b/.gitignore index 789b129..11fb191 100644 --- a/.gitignore +++ b/.gitignore @@ -2,16 +2,6 @@ # .gitignore for FieldTrack API # ============================================ -# ---------------- -# Infrastructure -# ---------------- -# Monitoring data -infra/tempo/data/ -infra/prometheus/data/ -infra/grafana/data/ -# Rendered Alertmanager config (contains real webhook URL — VPS only) -infra/alertmanager/alertmanager.rendered.yml - # Deployment history (VPS-side file, never committed) .deploy_history .last_deploy @@ -44,7 +34,6 @@ packages/*/node_modules/ .env.test.local .env.production.local !.env.example -!.env.monitoring.example # ---------------- # Build Output diff --git a/CHANGELOG.md b/CHANGELOG.md index f5399bd..309b088 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,9 +62,9 @@ All significant changes to FieldTrack 2.0 are documented here by development pha - Every image is tagged with both `latest` and a 7-character SHA ### Rollback System (commits `35db851`, `23e7720`) -- Added `backend/scripts/rollback.sh` — reads `.deploy_history`, validates ≥ 2 deployments, displays history table with current/target markers, prompts for confirmation, redeploys previous image using `deploy-bluegreen.sh` -- Updated `backend/scripts/deploy-bluegreen.sh` to prepend the deployed SHA to `.deploy_history` (rolling window of 5) after every successful deploy -- Added `backend/.gitignore` entry for `.deploy_history` +- Added rollback mode to 
`scripts/deploy.sh` — reads `.deploy_history`, validates ≥ 2 deployments, displays history table with current/target markers, prompts for confirmation, and redeploys the previous image +- Updated `scripts/deploy.sh` to prepend the deployed SHA to `.deploy_history` (rolling window of 5) after every successful deploy +- Added `.gitignore` entry for `.deploy_history` - Added `docs/ROLLBACK_SYSTEM.md` and `docs/ROLLBACK_QUICKREF.md` --- @@ -137,20 +137,14 @@ All significant changes to FieldTrack 2.0 are documented here by development pha - Added `otelMixin` in `src/config/logger.ts` — injects `trace_id`, `span_id`, `trace_flags` into every Pino log line - Added OTel span enrichment in `app.ts` `onRequest` hook — sets `http.route`, `http.client_ip`, `request.id`, `enduser.id` on every request - Upgraded Prometheus histogram to `observeWithExemplar()` with `traceId` on every observation -- Updated `infra/docker-compose.monitoring.yml` — Tempo ports 4317/4318; Prometheus `--enable-feature=exemplar-storage` -- Updated `infra/prometheus/prometheus.yml` — OpenMetrics scrape format for exemplar ingestion +- Updated standalone infra repository monitoring config — Tempo ports 4317/4318; Prometheus `--enable-feature=exemplar-storage` +- Updated standalone infra repository Prometheus config — OpenMetrics scrape format for exemplar ingestion --- ## [Phase 13] — Production Infrastructure: VPS, Nginx & Monitoring Stack — 2026 -- Added `backend/scripts/vps-setup.sh` — idempotent VPS provisioning (Docker, Nginx, systemd, certbot, ufw) -- Added `infra/nginx/api.conf` — TLS termination, HTTP→HTTPS redirect, proxy headers, WebSocket upgrade, gzip -- Added `infra/docker-compose.monitoring.yml` — Prometheus, Grafana, Loki, Promtail, Tempo on `api_network` -- Added `infra/grafana/dashboards/fieldtrack.json` — pre-built dashboard (HTTP rate, latency, queue depth, heap, Redis) -- Added `infra/grafana/provisioning/` — auto-provisioned dashboard and Prometheus datasource -- Added 
`infra/prometheus/alerts.yml` — alert rules for API latency, queue depth, Redis connectivity, host metrics -- Added `infra/promtail/promtail.yml` — Docker log discovery and shipping to Loki +- Added VPS setup and infra assets for production infrastructure (later extracted into standalone infra repository) --- @@ -167,7 +161,7 @@ All significant changes to FieldTrack 2.0 are documented here by development pha ## [Phase 11] — CI/CD Deployment Hardening — 2025 - Added initial GitHub Actions workflow for automated deployment -- Added `backend/scripts/deploy-bluegreen.sh` — blue-green zero-downtime deployment using Docker port-swap and Nginx upstream switch +- Added blue-green zero-downtime deployment script (later unified into `scripts/deploy.sh`) - Health-check validation before traffic switch - Old container removed only after successful switchover diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1c1a8b1..a654b37 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,7 +23,6 @@ cp .env.example .env |---------|---------|---------| | New feature | `feature/` | `feature/expense-attachments` | | Bug fix | `fix/` | `fix/session-double-close` | -| Infrastructure | `infra/` | `infra/add-redis-tls` | | Documentation | `docs/` | `docs/update-api-reference` | | Tests | `test/` | `test/analytics-edge-cases` | | Chores / deps | `chore/` | `chore/bump-fastify-5` | diff --git a/README.md b/README.md index 6a731f6..d0d8650 100644 --- a/README.md +++ b/README.md @@ -11,25 +11,25 @@ ## Overview -FieldTrack 2.0 is a production-ready REST API backend for managing field workforce operations. It provides secure, multi-tenant APIs for tracking employee attendance, real-time GPS location, expense workflows, and aggregate analytics — all with a full observability stack, automated CI/CD, and zero-downtime blue-green deployments. +FieldTrack 2.0 is a production-ready REST API for managing field workforce operations. 
It provides secure, multi-tenant APIs for tracking employee attendance, real-time GPS location, expense workflows, and aggregate analytics. + +**Boundaries:** This repository is the API only. Infrastructure (nginx, monitoring stack, VPS provisioning) lives in the infra repository. --- ## Features -- **Multi-tenant isolation** — every data query is scoped to the authenticated organization; cross-tenant access is architecturally impossible -- **Attendance sessions** — check-in / check-out lifecycle with state machine enforcement (`EmployeeAlreadyCheckedIn`, `SessionAlreadyClosed`) +- **Multi-tenant isolation** — every query is scoped to the authenticated organization; cross-tenant access is architecturally impossible +- **Attendance sessions** — check-in / check-out lifecycle with state machine enforcement - **Real-time GPS ingestion** — single and batch endpoints (up to 100 points), idempotent upsert, per-user rate limiting - **Async distance calculation** — BullMQ background worker computes Haversine distance after check-out; never blocks the HTTP response -- **Expense workflow** — PENDING → APPROVED / REJECTED lifecycle, ADMIN review endpoints, re-review guard -- **Admin analytics** — org-wide summaries, per-user breakdowns, configurable leaderboard (distance / duration / sessions) -- **Redis-backed rate limiting** — per-JWT-sub limits on write endpoints survive corporate NAT and horizontal scaling -- **Security plugins** — Helmet, CORS, Redis rate limiter, brute-force detection with Prometheus counters -- **Distributed tracing** — OpenTelemetry → Tempo; trace IDs injected into every Pino log line -- **One-click metric-to-trace** — Prometheus exemplars link latency spikes directly to Tempo traces in Grafana -- **Blue-green zero-downtime deployments** — Nginx upstream swap, health-check gate, 5-SHA rollback history -- **Automated rollback** — `rollback.sh` restores the previous version in under 10 seconds -- **Full test suite** — 124 tests (8 files) with Vitest; 
unit + integration coverage; CI blocks deploy on failure +- **Expense workflow** — PENDING → APPROVED / REJECTED lifecycle, with re-review guard +- **Admin analytics** — org-wide summaries, per-user breakdowns, configurable leaderboard +- **Redis-backed rate limiting** — per-JWT-sub limits survive corporate NAT and horizontal scaling +- **Security** — Helmet, CORS, Redis rate limiter, brute-force detection +- **Distributed tracing** — OpenTelemetry → OTLP; trace IDs injected into every Pino log line +- **Blue-green zero-downtime deployments** — nginx upstream swap, health-check gate, 5-SHA rollback history +- **Full test suite** — Vitest unit + integration coverage; CI blocks deploy on failure --- @@ -37,139 +37,185 @@ FieldTrack 2.0 is a production-ready REST API backend for managing field workfor | Layer | Technology | |-------|------------| -| **Runtime** | Node.js 24 (Alpine) | +| **Runtime** | Node.js 24 (Debian slim / distroless) | | **Language** | TypeScript 5.9 (strict, ESM) | | **Framework** | Fastify 5 | | **Database** | PostgreSQL via [Supabase](https://supabase.com) | | **Auth** | JWT (`@fastify/jwt`) — Supabase-issued tokens | | **Job Queue** | [BullMQ](https://docs.bullmq.io/) + Redis | | **Validation** | [Zod 4](https://zod.dev/) | -| **Observability** | Prometheus · Grafana · Loki · Tempo · Promtail · OpenTelemetry | +| **Tracing** | OpenTelemetry (OTLP export) | | **Security** | `@fastify/helmet` · `@fastify/cors` · `@fastify/rate-limit` · `@fastify/compress` | | **Testing** | [Vitest](https://vitest.dev/) | | **CI/CD** | GitHub Actions → GHCR → Blue-Green VPS Deploy | --- -## Architecture +## Local Development -### System Overview +**Prerequisites:** Node.js ≥ 24, npm, a running Redis instance, a Supabase project -``` -┌─────────────────────────────────────────────────────────────────┐ -│ CLIENT LAYER │ -│ Mobile App → Web Dashboard → Desktop Client │ -└────────────────────────────┬────────────────────────────────────┘ - │ HTTPS / REST API - ▼ 
-┌─────────────────────────────────────────────────────────────────┐ -│ APPLICATION LAYER │ -│ │ -│ Nginx (TLS · Blue-Green Routing) │ -│ │ │ -│ ▼ │ -│ Fastify 5 API Server │ -│ ├─ Auth Middleware (JWT) │ -│ ├─ Security (Helmet · CORS · Rate Limit) │ -│ ├─ Validation (Zod) │ -│ └─ Business Logic │ -└────────────────────────────┬────────────────────────────────────┘ - │ - ┌────────────┼────────────┐ - │ │ │ - ▼ ▼ ▼ -┌──────────────────┐ ┌──────────────┐ ┌──────────────────┐ -│ Supabase │ │ Redis │ │ BullMQ Worker │ -│ PostgreSQL │ │ Job Queue │ │ (Distance Calc) │ -│ (Multi-tenant) │ │ │ │ │ -└──────────────────┘ └──────────────┘ └──────────────────┘ - -┌─────────────────────────────────────────────────────────────────┐ -│ OBSERVABILITY LAYER │ -│ │ -│ Prometheus → Grafana ← Loki ← Tempo │ -│ (Metrics) (Dashboards) (Logs) (Traces) │ -└─────────────────────────────────────────────────────────────────┘ +```bash +# Install dependencies +npm install + +# Configure environment +cp .env.example .env +# Edit .env — fill in SUPABASE_URL, keys, REDIS_URL, and CORS_ORIGIN + +# Start in development mode (hot reload) +npm run dev ``` -**📊 For detailed architecture diagrams, data flows, and deployment topology see [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)** +The API will start on `http://localhost:3000`. --- -## Quick Start +## Environment Variables -**Prerequisites:** Node.js ≥ 24, npm, Redis, a Supabase project +All variables are validated at startup by `src/config/env.ts` (Zod schema, fail-fast). -```bash -# 1. Install dependencies -npm install +### URLs -# 2. 
Configure environment -cp .env.example .env -# Edit .env — fill in Supabase URL, keys, Redis URL, and ALLOWED_ORIGINS +| Variable | Required | Purpose | +|----------|:---:|---------| +| `API_BASE_URL` | ✅ | Canonical public URL of this API (`https://…`, no trailing slash) | +| `APP_BASE_URL` | ✅ | Root URL of the application — used in email footers and redirects | +| `FRONTEND_BASE_URL` | ✅ prod | URL of the web frontend — used to build email links | -# 3. Run in development mode -npm run dev +### Runtime -# 4. Run the test suite -npm run test -``` +| Variable | Required | Default | Purpose | +|----------|:---:|---------|---------| +| `CONFIG_VERSION` | ✅ | `"1"` | Schema version guard — must be `"1"` | +| `APP_ENV` | ✅ | `development` | Application environment — drives all app-level logic | +| `PORT` | ✅ | `3000` | Container listen port | + +### Auth & Data + +| Variable | Required | Purpose | +|----------|:---:|---------| +| `SUPABASE_URL` | ✅ | Supabase project URL | +| `SUPABASE_ANON_KEY` | ✅ | Supabase public/anon key | +| `SUPABASE_SERVICE_ROLE_KEY` | ✅ | Service role key — bypasses RLS, never expose to clients | +| `SUPABASE_JWT_SECRET` | ✅ | JWT signing secret (≥ 32 chars, HS256) | +| `REDIS_URL` | ✅ | Redis connection URL (`redis://` or `rediss://`) | + +### Security + +| Variable | Required in Prod | Default | Purpose | +|----------|:---:|---------|---------| +| `CORS_ORIGIN` | ✅ | `""` | Comma-separated allowed CORS origins. Empty activates localhost fallback in dev | +| `METRICS_SCRAPE_TOKEN` | ✅ | — | Token required to scrape `/metrics`. Unset = open in dev/test | +| `TEMPO_ENDPOINT` | — | `http://tempo:4318` | OTLP HTTP endpoint for trace export | + +> **Observability variables (`METRICS_SCRAPE_TOKEN`, `TEMPO_ENDPOINT`) are optional for standalone operation.** The API starts and handles requests without them. `METRICS_SCRAPE_TOKEN` gates the `/metrics` endpoint (unset = endpoint is open, safe in dev/test). 
`TEMPO_ENDPOINT` controls where traces are exported; if the Tempo collector is unreachable, traces are silently dropped with no impact to request handling. The monitoring stack that scrapes these endpoints is managed in the infra repository.
+
+---
+
+## Scripts
+
+| Command | Purpose |
+|---------|---------|
+| `npm run dev` | Start development server with hot reload |
+| `npm run typecheck` | TypeScript type check (no emit) |
+| `npm test` | Run full test suite (Vitest) |
+| `npm run build` | Compile TypeScript to `dist/` |
+| `npm start` | Start compiled production server |
+| `./scripts/deploy.sh <sha>` | Blue-green deploy a specific image SHA |
+| `./scripts/deploy.sh --rollback` | Interactive rollback to previous SHA |
+| `./scripts/deploy.sh --rollback --auto` | Non-interactive rollback (CI) |
+
+---
+
+## Health Endpoints
+
+| Endpoint | Purpose | Deploy Gate |
+|----------|---------|-------------|
+| `GET /health` | Liveness check — returns `{"status":"ok"}` once the server bootstraps | **YES** — used by deploy.sh and CI |
+| `GET /ready` | Dependency check — verifies Redis and Supabase connectivity | NO — informational only, not a deploy gate |
+
+`/health` returns 200 after server bootstrap regardless of dependency status. `/ready` failing does not block a deployment; a degraded-but-running API is preferred over a stuck deploy.

---

-## Deployment
+## Deployment Overview
+
+> **First-deployment requirement:** The API container joins `api_network`. On a fresh VPS, **nginx** (reverse-proxy) and **Redis** must already be running and attached to that network via the infra repository before the first `deploy.sh` run. Subsequent deploys are fully self-contained.
+
+## Infra Requirement
+
+This API requires an external infra repository.
+
+Expected on server:
+- nginx (connected to `api_network`)
+- Redis (`redis:6379`)
+
+Default path:
+- `INFRA_ROOT=/opt/infra`

-FieldTrack 2.0 deploys automatically via GitHub Actions on every push to `master`.
+Deployments run automatically via GitHub Actions on every push to `master` (after CodeQL scan passes). ``` -Push to master - → test job (npm ci · tsc · vitest) — blocks on failure - → build-and-deploy job (Docker Buildx with GHA cache → GHCR → VPS SSH) +CodeQL deep scan (master) + → validate (typecheck + audit) ──┐ + → test-api ─────────────────────┼──► build-scan-push ──► vps-readiness-check ──► deploy + ┘ │ + api-health-gate ◄────────────────┘ + │ + health-and-smoke ──► rollback (on failure) ``` -### Manual deploy / rollback +**Blue-green strategy:** The VPS always runs two containers (`api-blue`, `api-green`). On each deploy, the inactive slot is updated and nginx is reloaded to point at it. The previous slot is stopped only after the health gate passes. +**nginx is managed by the infra repository.** The API container joins `api_network`; nginx is expected to already be running and configured. + +**Manual deploy:** ```bash -# On the VPS -./scripts/deploy-bluegreen.sh # Deploy a specific image -./scripts/rollback.sh # Restore previous version (~10 s) +./scripts/deploy.sh ``` -See [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md) for full setup instructions including VPS provisioning, Nginx config, and CI/CD secret configuration. +**Rollback:** +```bash +./scripts/deploy.sh --rollback +``` + +See [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md) for full deployment details. 
--- ## Project Structure -> **Note:** The web frontend is maintained in a separate repository: [fieldtrack-tech/web](https://github.com/fieldtrack-tech/web) - ``` api/ ├── src/ # Application source │ ├── modules/ # Domain modules (attendance · locations · expenses · analytics) -│ ├── plugins/ # Fastify plugins (JWT · Prometheus · security stack) +│ ├── plugins/ # Fastify plugins (JWT · metrics · security) │ ├── workers/ # BullMQ distance calculation worker │ ├── middleware/ # Auth + role guard -│ └── utils/ # Shared utilities (errors · response · tenant · metrics) +│ └── utils/ # Shared utilities (errors · response · tenant) ├── tests/ # Vitest unit and integration tests -├── scripts/ # Blue-green deploy + rollback scripts -├── infra/ # Monitoring stack (Prometheus · Grafana · Loki · Tempo) +├── scripts/ # Deploy, rollback, and utility scripts ├── docs/ # Project documentation └── .github/workflows/ # GitHub Actions CI/CD ``` +> The web frontend is in a separate repository: [fieldtrack-tech/web](https://github.com/fieldtrack-tech/web) +> Infrastructure (nginx, monitoring, VPS setup) is in a separate infra repository. 
+ --- ## Documentation | Document | Description | |----------|-------------| -| [Architecture](docs/ARCHITECTURE.md) | System design, component diagrams, data flows, deployment topology, security layers | +| [Architecture](docs/ARCHITECTURE.md) | System design, component diagrams, data flows | | [API Reference](docs/API_REFERENCE.md) | All endpoints, auth requirements, request/response schemas, error codes | | [Deployment Guide](docs/DEPLOYMENT.md) | VPS provisioning, CI/CD setup, blue-green deploy, troubleshooting | | [Rollback System](docs/ROLLBACK_SYSTEM.md) | Rollback architecture, deployment history, safety features | -| [Rollback Quick Reference](docs/ROLLBACK_QUICKREF.md) | Fast operator reference card for deployments | -| [Walkthrough](docs/walkthrough.md) | Phase-by-phase build history and deep-dives | +| [Rollback Quick Reference](docs/ROLLBACK_QUICKREF.md) | Fast operator reference card | +| [Environment Contract](docs/env-contract.md) | All environment variables, naming rules | +| [Infra Contract](docs/infra-contract.md) | External infra responsibilities and path contract (`INFRA_ROOT`) | | [Changelog](CHANGELOG.md) | Full history of every phase | | [Contributing](CONTRIBUTING.md) | Contribution workflow, branching, code conventions | | [Security Policy](SECURITY.md) | How to report vulnerabilities | @@ -184,7 +230,6 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for setup instructions, branch naming con ``` feature/ # new functionality fix/ # bug fixes -infra/ # infrastructure changes docs/ # documentation test/ # test additions chore/ # maintenance / deps @@ -194,12 +239,6 @@ chore/ # maintenance / deps ``` type(scope): short imperative description ``` -Allowed types: `feat` `fix` `refactor` `ci` `infra` `docs` `test` `chore` +Allowed types: `feat` `fix` `refactor` `ci` `docs` `test` `chore` All PRs require review from CODEOWNERS and must pass CI before merge. 
- ---- - -## License - -[MIT](LICENSE) © 2026 FieldTrack diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 61fb68e..d425872 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -99,33 +99,10 @@ │ │ └───────────────────────────────────────────────────────────────────────────┘ -┌─────────────────────────────────────────────────────────────────────────┐ -│ OBSERVABILITY LAYER │ -├─────────────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │ Prometheus │───▶│ Grafana │◀───│ Loki │ │ -│ │ (Metrics) │ │ (Dashboard) │ │ (Logs) │ │ -│ └──────────────┘ └──────────────┘ └──────────────┘ │ -│ ▲ ▲ │ -│ │ │ │ -│ │ ┌──────────────┐ │ │ -│ └────────────│ Tempo │────────────┘ │ -│ │ (Traces) │ │ -│ └──────────────┘ │ -│ ▲ │ -│ │ │ -│ │ OpenTelemetry │ -│ │ │ -└─────────────────────────────┼────────────────────────────────────────────┘ - │ - │ - ┌─────────┴─────────┐ - │ │ - │ Fastify API │ - │ (Instrumented) │ - │ │ - └───────────────────┘ +``` + +> Monitoring stack (Prometheus, Grafana, Loki, Tempo) is managed by the **infra repository**. +> The API exposes `/metrics` and OTLP traces, which the infra repo consumes. 
``` ## Component Details @@ -158,12 +135,9 @@ - Configurable concurrency (`WORKER_CONCURRENCY` env var) - Job retention limits: 1 000 completed, 5 000 failed (prevents Redis memory growth) -### Observability Layer -- **Prometheus**: Metrics collection and alerting -- **Grafana**: Visualization dashboards -- **Loki**: Log aggregation and querying -- **Tempo**: Distributed tracing -- **OpenTelemetry**: Unified instrumentation +### Observability +- The API emits metrics (Prometheus format on `/metrics`), structured logs (Pino/JSON), and traces (OpenTelemetry OTLP) +- Collection, dashboards, and alerting are handled by the **infra repository** ## Data Flow @@ -337,8 +311,8 @@ Fastify API │ Layer 4: Monitoring & Response │ │ ┌──────────────────────────────────────────────────────────────┐ │ │ │ • Abuse detection logging │ │ -│ │ • Prometheus alerting │ │ -│ │ • Distributed tracing │ │ + │ • Alerting (handled by infra repository) │ │ + │ • Distributed tracing (OpenTelemetry OTLP) │ │ │ │ • Error tracking │ │ │ └──────────────────────────────────────────────────────────────┘ │ │ │ @@ -367,10 +341,9 @@ Fastify API - **Compression**: @fastify/compress ### Observability -- **Metrics**: Prometheus + prom-client -- **Logs**: Pino + Loki -- **Traces**: OpenTelemetry 2.x + Tempo -- **Dashboards**: Grafana +- **Metrics**: prom-client (exposed on `/metrics`, scraped by infra repo) +- **Logs**: Pino (structured JSON, collected by infra repo) +- **Traces**: OpenTelemetry 2.x (exported via OTLP to `TEMPO_ENDPOINT`) ### DevOps - **Containerization**: Docker (node:24-alpine) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index a21d390..76249b2 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -2,6 +2,8 @@ This document covers deploying FieldTrack API to a Linux VPS using the included blue-green deployment system. +> **Scope:** This document covers the API only. Nginx configuration, TLS, and the monitoring stack are managed by the **infra repository**. 
+ --- ## Prerequisites @@ -9,84 +11,48 @@ This document covers deploying FieldTrack API to a Linux VPS using the included - A Linux VPS (Ubuntu 22.04 recommended) accessible via SSH - A GitHub Container Registry (GHCR) account with push access to the repository - GitHub Actions secrets configured (see [CI/CD Setup](#cicd-setup)) -- Docker and Docker Compose installed on the VPS (handled by `vps-setup.sh`) +- Docker installed on the VPS +- Nginx already running and configured via the **infra repository** --- -## Initial VPS Provisioning - -The `vps-setup.sh` script handles the full first-time setup of a fresh VPS: - -```bash -# Copy the script to the VPS and run as root -scp scripts/vps-setup.sh root@your-server:/tmp/ -ssh root@your-server 'bash /tmp/vps-setup.sh' -``` - -This script: +## API Deployment -1. Installs Docker, Docker Compose, Nginx, and system dependencies -2. Creates a dedicated `deploy` OS user with limited permissions -3. Clones the repository and initialises the directory structure -4. Obtains a TLS certificate via Let's Encrypt (`certbot`) -5. Configures Nginx as a reverse proxy (TLS termination + blue-green upstream switching) -6. Sets up a `systemd` service for auto-restart on boot -7. Configures log rotation and minimal `ufw` firewall rules -8. Starts the monitoring stack (Prometheus, Grafana, Loki, Tempo) +1. SSH into VPS +2. Ensure nginx is running (managed via infra repository) +3. Copy `.env.example` to `.env` and fill in all values +4. Deploy: `./scripts/deploy.sh ` +5. 
Confirm health: `curl https://<domain>/health`

-Before running, update the variables at the top of the script:

+## Rollback

```bash
-DOMAIN="yourdomain.com" # Your server's domain
-DEPLOY_USER="fieldtrack" # OS user to run the service
-GH_USER="your-github-username" # GitHub username (for GHCR)
-REPO_URL="https://github.com/your-username/api.git"
+./scripts/deploy.sh --rollback # interactive
+./scripts/deploy.sh --rollback --auto # non-interactive (CI)
```

----

+## Monitoring

-## API Deployment
-1. SSH into VPS
-2. Run `scripts/vps-setup.sh` from workspace root
-3. Set `.env` and `.env.monitoring` in workspace root
-4. Start monitoring stack: `docker-compose -f infra/docker-compose.monitoring.yml up -d`
-5. Deploy API: `scripts/deploy-bluegreen.sh`
-6. Confirm readiness: `curl https:///ready`
-7. Confirm Prometheus target status is UP

+The observability stack (Prometheus, Grafana, Loki, Tempo) is **handled by the infra repository**. The API exposes:
+- `GET /metrics` — Prometheus-format metrics (protected by `METRICS_SCRAPE_TOKEN`)
+- Traces exported via OTLP to `TEMPO_ENDPOINT`

-## Rollback
-1. API: `scripts/rollback.sh`

+---

-## Monitoring
-1. Set `.env.monitoring` in workspace root
-2. Start stack: `docker-compose -f infra/docker-compose.monitoring.yml up -d`
-3. Grafana: `http://:3000`
-4. Prometheus: `http://:9090`
-5. Loki: `http://:3100`
-6. Tempo: `http://:3200`
-
-## Nginx
-1. Config: `infra/nginx/api.conf`
-2. Canonical path: `/etc/nginx/conf.d/api.conf`
-3. TLS bootstrap: two-stage via Certbot
+## Blue-Green Deployment

-## Troubleshooting
-1. Logs: `infra/promtail/promtail.yml`
-2. Alerts: `infra/prometheus/alerts.yml`
-3. Config: `infra/prometheus/prometheus.yml`
-4. Grafana dashboards: `infra/grafana/dashboards/`
-5. Nginx config: `infra/nginx/api.conf`

The deployment uses a blue-green strategy for zero-downtime releases.

### How It Works

-The VPS always runs **two containers** (`api-blue` on port 3001, `api-green` on port 3002).
Nginx routes all traffic to whichever is currently active. +The VPS keeps **two named slots** (`api-blue`, `api-green`). Only the active slot receives traffic through nginx over `api_network`. +The API containers do **not** bind host ports. On each deploy: 1. The new image is pulled from GHCR 2. The **inactive** container is replaced with the new image -3. Readiness checks poll `GET /ready` until the new container is ready (up to 60 s) +3. The new container is health-checked via `GET /health` 4. Nginx upstream is switched to the new container (`nginx -s reload`) 5. The previously active container is stopped and removed 6. The deployed SHA is prepended to `.deploy_history` (keeps last 5) @@ -95,13 +61,10 @@ On each deploy: ```bash # SSH into the VPS -cd /home/ashish/api +cd $HOME/api # Deploy a specific image SHA (e.g. from CI output) -./scripts/deploy-bluegreen.sh a4f91c2 - -# Deploy the latest tag -./scripts/deploy-bluegreen.sh latest +./scripts/deploy.sh a4f91c2 ``` --- @@ -111,66 +74,38 @@ cd /home/ashish/api To instantly revert to the previous deployment: ```bash -cd /home/ashish/api -./scripts/rollback.sh +cd $HOME/api +./scripts/deploy.sh --rollback ``` The script: 1. Reads `.deploy_history` (requires at least 2 recorded deployments) 2. Displays the full history with current/target markers 3. Prompts for confirmation before proceeding -4. Calls `deploy-bluegreen.sh ` — no rebuild, image already in GHCR +4. Redeploys the previous SHA — no rebuild, image already in GHCR **Typical rollback time: under 10 seconds.** -To deploy any specific historical SHA: - -```bash -./scripts/deploy-bluegreen.sh 7b3e9f1 -``` - For full rollback system documentation, see [ROLLBACK_SYSTEM.md](ROLLBACK_SYSTEM.md). 
--- -## Monitoring Stack - -The observability stack runs alongside the application on the same VPS: - -```bash -cd infra -docker compose -f docker-compose.monitoring.yml up -d -``` - -| Service | Default Port | Access | -|---------|-------------|--------| -| Grafana | 3001 (internal) | Via Nginx proxy or direct | -| Prometheus | 9090 (internal) | Internal only | -| Loki | 3100 (internal) | Internal only | -| Tempo | 3200 / 4318 | Internal only | - -The pre-built Grafana dashboard (`infra/grafana/dashboards/fieldtrack.json`) is auto-provisioned and covers HTTP metrics, queue depth, latency, and Redis health. - ---- - ## Environment Variables Copy `.env.example` to `.env` on the VPS and fill in all values before the first deploy. -See [README.md](../README.md) for the full variable reference. +See [README.md](../README.md) and [env-contract.md](env-contract.md) for the full variable reference. --- -## Health Check +## Health Endpoints -The application exposes a public health endpoint: +| Endpoint | Purpose | Deploy gate | +|----------|---------|-------------| +| `GET /health` | Liveness — returns `{"status":"ok"}` after bootstrap | **YES** | +| `GET /ready` | Dependency check (Redis + Supabase) | NO — informational only | -```bash -curl https://yourdomain.com/health -# {"status":"ok","timestamp":"2026-03-10T12:00:00.000Z"} -``` - -The deployment script now uses `/ready` to validate dependency readiness before switching Nginx traffic. +The deploy script uses `/health` exclusively. `/ready` failing does not block a deployment. --- @@ -185,7 +120,7 @@ docker logs api-green # or api-blue **Rollback fails: "insufficient deployment history"** Only one deployment has been recorded. 
Deploy manually with a known-good SHA:
```bash
-./scripts/deploy-bluegreen.sh <sha>
+./scripts/deploy.sh <sha>
```

**Container image not found in GHCR**
docker pull ghcr.io/fieldtrack-tech/api:<sha>
```

**Nginx fails to reload**
-Check the Nginx config syntax:
-```bash
-nginx -t
-```
+Nginx is managed by the infra repository. Check its configuration and reload there.
+
+**API starts but /ready fails**
+Acceptable — Redis or Supabase may be temporarily unavailable. The deploy is still considered successful if `/health` returns 200.
diff --git a/docs/OBSERVABILITY_ARCHITECTURE.md b/docs/OBSERVABILITY_ARCHITECTURE.md
index 2edb05a..9a23f1f 100644
--- a/docs/OBSERVABILITY_ARCHITECTURE.md
+++ b/docs/OBSERVABILITY_ARCHITECTURE.md
@@ -1,299 +1,24 @@
-# FieldTrack API — Observability Architecture
+# FieldTrack — Observability Architecture

-This document describes the monitoring, logging, and metrics systems in FieldTrack API and how they fit together in production.
+> **Handled by infra repository.**
+>
+> The monitoring stack (Prometheus, Grafana, Loki, Tempo, Promtail, Alertmanager) is
+> configured and operated out of the infra repository, not this one.
---

+## What this API exposes

-## Stack Topology

| Endpoint | Purpose |
|----------|---------|
| GET /metrics | Prometheus-format metrics (protected by `METRICS_SCRAPE_TOKEN`) |
| OTLP traces | Exported to `TEMPO_ENDPOINT` (default: `http://tempo:4318`) |
| Structured logs | JSON via Pino, written to stdout — collected by infra's Promtail |

-```
- ┌─────────────────────────────────────────────────┐
- │ VPS (single host) │
- │ │
- Browser / Client │ Nginx (public) │
- │ │ ├─ / → api-blue:3000 │
- │ HTTPS │ │ or api-green:3000 │
- └─────────────────►│ └─ /monitor/ → 127.0.0.1:3333 (Grafana) │
- │ │
- │ ┌──────────────────────────────────────────┐ │
- │ │ api_network (Docker) │ │
- │ │ │ │
- │ │ api-blue:3000 ──────────────────┐ │ │
- │ │ api-green:3000 ── /metrics ──────┼──┼──►│ Prometheus
- │ │ │ │ │ 127.0.0.1:9090
- │ │ node-exporter:9100 ─── /metrics ───┘ │ │
- │ │ │ │
- │ │ Promtail ──── push ──► Loki:3100 │ │
- │ │ │ │ │ │
- │ │ │ reads │ │ │
- │ │ /var/log/* ▼ │ │
- │ │ /var/lib/docker/ Grafana │ │
- │ │ containers/ :3000 → │ │
- │ │ 127.0.0.1:3333 │ │
- │ └──────────────────────────────────────────┘ │
- └─────────────────────────────────────────────────┘
-```

+## Environment variables (API side)

----

| Variable | Purpose |
|----------|---------|
| `METRICS_SCRAPE_TOKEN` | Token that Prometheus must send when scraping `/metrics` |
| `TEMPO_ENDPOINT` | OTLP HTTP endpoint for trace export |

-## Metrics Flow

+See [env-contract.md](env-contract.md) for full details.

-### Scrape chain
-
-```
-Prometheus (every 15 s)
- ├─ GET api-blue:3000/metrics [x-metrics-token: <token>]
- ├─ GET api-green:3000/metrics [x-metrics-token: <token>] ← inactive = DOWN (expected)
- ├─ GET node-exporter:9100/metrics [no auth — host-internal only]
- └─ GET localhost:9090/metrics [self-monitoring]
-```
-
-### Endpoint
-
-The Fastify API exposes `/metrics` in [OpenMetrics](https://openmetrics.io/) format via the `@fastify/metrics` plugin.
The endpoint is **not** reachable through Nginx (blocked by `location /metrics { return 403; }`). - -### Authentication - -Prometheus sends a custom header on every scrape: - -``` -x-metrics-token: -``` - -The API validates this header in its metrics middleware. Requests without a matching token receive `403 Forbidden`. - -`METRICS_SCRAPE_TOKEN` is injected into the Prometheus container via the `METRICS_SCRAPE_TOKEN` environment variable, which Prometheus expands when loading `prometheus.yml` -(`headers: { x-metrics-token: ${METRICS_SCRAPE_TOKEN} }`). - -### Prometheus config file - -[infra/prometheus/prometheus.yml](../infra/prometheus/prometheus.yml) - -### Retention - -- Time-based: **30 days** -- Size-based: **5 GB** - Prometheus evicts oldest data first when the size limit is reached. - ---- - -## Logs Flow - -### Collection chain - -``` -Container stdout/stderr - │ - ▼ -Docker JSON log files - /var/lib/docker/containers//*-json.log - │ - ▼ (Promtail reads, parses, labels) - │ - ▼ -Loki:3100/loki/api/v1/push - │ - ▼ -Grafana (Loki datasource) → Explore / Dashboard panels -``` - -### Promtail config file - -[infra/promtail/promtail.yml](../infra/promtail/promtail.yml) - -### Log sources - -| Source | Path | Labels added | -|--------|------|--------------| -| Docker containers | `/var/lib/docker/containers/*/*-json.log` | `job=docker`, `container_id`, `level`, `trace_id` | -| Host syslog | `/var/log/*.log` | `job=syslog` | - -### Log parsing pipeline (Docker) - -Promtail applies a multi-stage pipeline to container logs: - -1. **`docker: {}`** — unwraps Docker's JSON envelope (`log`, `stream`, `time`) -2. **regex** — extracts `container_id` from the file path -3. **json** — extracts `level`, `msg`, `trace_id`, `span_id` from Pino structured logs -4. 
**labels** — promotes `level` and `trace_id` as Loki stream labels - -### Positions persistence - -Promtail records log offsets in: - -``` -/data/positions.yaml (inside promtail_data Docker volume → fieldtrack_promtail_data) -``` - -This file survives container restarts so Promtail never re-ingests already-processed logs. - -### Loki retention - -Loki is configured via [infra/loki/loki-config.yaml](../infra/loki/loki-config.yaml). - -| Setting | Value | Location | -|---------|-------|----------| -| `limits_config.retention_period` | `30d` | `loki-config.yaml` | -| `compactor.retention_enabled` | `true` | `loki-config.yaml` | -| Compaction interval | every 10 minutes | `loki-config.yaml` | -| Deletion delay | 2 hours | `loki-config.yaml` | - -The compactor process runs inside the single-binary Loki container. It scans the index every 10 minutes, marks chunks older than 30 days for deletion, and removes them 2 hours later. The `loki_data` Docker volume (stored in `/loki/chunks`, `/loki/rules`, `/loki/compactor`) must have enough disk space for at most 30 days of logs. - ---- - -## Grafana - -| Property | Value | -|----------|-------| -| Bound to | `127.0.0.1:3333` | -| Public URL | `https:///monitor/` | -| Served via | Nginx `location /monitor/` → `proxy_pass http://127.0.0.1:3333` | -| Auth | Admin credentials from `GRAFANA_ADMIN_PASSWORD` secret | -| Sign-up | Disabled (`GF_USERS_ALLOW_SIGN_UP=false`) | - -### Datasources (provisioned) - -Configured under [infra/grafana/provisioning/datasources/](../infra/grafana/provisioning/datasources/). - -| Name | Type | URL | -|------|------|-----| -| Prometheus | prometheus | `http://prometheus:9090` | -| Loki | loki | `http://loki:3100` | - -### Dashboards (provisioned) - -Pre-built dashboards are stored in [infra/grafana/dashboards/](../infra/grafana/dashboards/) and automatically loaded at startup. - ---- - -## Container Services - -All services run inside the `api_network` Docker bridge network. 
- -| Container | Image | Bound port | Role | -|-----------|-------|------------|------| -| `prometheus` | `prom/prometheus:v2.52.0` | `127.0.0.1:9090` | Metrics scraper & TSDB | -| `grafana` | `grafana/grafana:10.4.2` | `127.0.0.1:3333` | Dashboards | -| `loki` | `grafana/loki:2.9.6` | internal `:3100` | Log aggregation | -| `promtail` | `grafana/promtail:2.9.6` | — | Log shipper | -| `node-exporter` | `prom/node-exporter:v1.8.1` | internal `:9100` | Host metrics | - -All images are **pinned** to exact versions to ensure deterministic restarts. - -### Resource limits - -Each monitoring container has a Docker-managed memory ceiling enforced via `deploy.resources.limits`: - -| Container | Memory limit | -|-----------|--------------| -| `loki` | 1 GB | -| `prometheus` | 1 GB | -| `grafana` | 512 MB | -| `promtail` | 128 MB | -| `node-exporter` | *(no limit — minimal footprint)* | - ---- - -## Persistent Volumes - -| Docker Volume | Named Volume | Contents | -|---------------|-------------|----------| -| `prometheus_data` | `fieldtrack_prometheus_data` | Prometheus TSDB | -| `grafana_data` | `fieldtrack_grafana_data` | Grafana DB, plugins | -| `loki_data` | `fieldtrack_loki_data` | Loki chunks & index | -| `promtail_data` | `fieldtrack_promtail_data` | Log offset positions file | - ---- - -## Monitoring Stack Restart Policy - -The deploy script ([scripts/deploy-bluegreen.sh](../scripts/deploy-bluegreen.sh)) and the CI sync-infra job only restart the monitoring stack when monitoring configuration has actually changed. - -Change detection uses a SHA-256 hash over all files matching: - -``` -infra/**/*.{yml,yaml,conf,toml,json} -``` - -with the `infra/nginx/` subtree excluded (nginx is rendered on every deploy and does not require a monitoring restart). - -The last-known hash is stored at `~/.fieldtrack-monitoring-hash`. If the new hash matches, the monitoring stack is left running untouched. 
- ---- - -## Security Notes - -| Control | Detail | -|---------|--------| -| `/metrics` blocked at Nginx | `location /metrics { return 403; }` — scraping is only possible from inside `api_network` | -| Prometheus token auth | `x-metrics-token` header required; value stored in `METRICS_SCRAPE_TOKEN` env var | -| Grafana not publicly listed | Accessible only at `/monitor/`; no signup | -| Monitoring ports loopback-bound | Prometheus `:9090` and Grafana `:3333` bound to `127.0.0.1`; not accessible externally | -| Image versions pinned | No `latest` tags — prevents silent breaking changes on container restart | -| Container log limits | All monitoring containers use `json-file` driver with `max-size: 10m` / `max-file: 3` | - ---- - -## Alerting (Deployed) - -The [infra/prometheus/alerts.yml](../infra/prometheus/alerts.yml) file defines alerting rules. Prometheus loads it via: - -```yaml -rule_files: - - alerts.yml -``` - -Alertmanager is now deployed in [infra/docker-compose.monitoring.yml](../infra/docker-compose.monitoring.yml) and configured in [infra/prometheus/prometheus.yml](../infra/prometheus/prometheus.yml): - -```yaml -alerting: - alertmanagers: - - static_configs: - - targets: - - alertmanager:9093 -``` - -Alertmanager is configured at [infra/alertmanager/alertmanager.yml](../infra/alertmanager/alertmanager.yml), and Slack webhook is loaded from `infra/.env.monitoring` (ALERTMANAGER_SLACK_WEBHOOK). - -Alerting now uses Slack only. Set this in `infra/.env.monitoring` with a valid Slack incoming webhook endpoint: - -- `ALERTMANAGER_SLACK_WEBHOOK` - -Then redeploy the monitoring stack. - - ---- - -## Certbot Bootstrap (Fresh VPS) - -Nginx references LetsEncrypt certificates at `/etc/letsencrypt/live//`. On a fresh VPS these do not exist yet, so a full SSL config causes Nginx to refuse to start. - -**Safe bootstrap sequence:** - -1. Deploy a temporary HTTP-only Nginx config that only serves `/.well-known/acme-challenge/` and your `server_name`. 
Comment out the `listen 443` server block and all `ssl_*` directives. - -2. Start Nginx with the HTTP-only config: - ```bash - sudo nginx -t && sudo systemctl start nginx - ``` - -3. Obtain the certificate: - ```bash - sudo certbot certonly --webroot -w /var/www/certbot -d $API_HOSTNAME - ``` - -4. Render and install the full SSL config from the template: - ```bash - sed \ - -e "s|__ACTIVE_CONTAINER__|api-blue|g" \ - -e "s|__API_HOSTNAME__|$API_HOSTNAME|g" \ - infra/nginx/api.conf > infra/nginx/live/api.conf - # nginx runs in Docker — reload via docker exec (no host nginx service): - docker exec nginx nginx -t && docker exec nginx nginx -s reload - ``` - -5. Enable auto-renewal (Certbot installs a systemd timer automatically on Ubuntu): - ```bash - sudo systemctl status certbot.timer - ``` diff --git a/docs/ROLLBACK_QUICKREF.md b/docs/ROLLBACK_QUICKREF.md index d130b7b..db05a05 100644 --- a/docs/ROLLBACK_QUICKREF.md +++ b/docs/ROLLBACK_QUICKREF.md @@ -5,18 +5,23 @@ ### Deploy Latest Version ```bash cd "$HOME/api" -./scripts/deploy-bluegreen.sh +./scripts/deploy.sh ``` ### Rollback to Previous Version ```bash cd "$HOME/api" -./scripts/rollback.sh +./scripts/deploy.sh --rollback +``` + +### Rollback (non-interactive, for CI) +```bash +./scripts/deploy.sh --rollback --auto ``` ### Deploy Specific Version ```bash -./scripts/deploy-bluegreen.sh 7b3e9f1 +./scripts/deploy.sh 7b3e9f1 ``` ## How It Works @@ -61,7 +66,8 @@ cd "$HOME/api" ``` ┌──────────────────┐ -│ ./rollback.sh │ +│ ./deploy.sh │ +│ --rollback │ └──────┬───────────┘ │ ▼ @@ -97,8 +103,7 @@ cd "$HOME/api" ``` /api/ ├── scripts/ -│ ├── deploy-bluegreen.sh -│ └── rollback.sh +│ └── deploy.sh # Deploy and rollback └── .deploy_history (last 5 SHAs) ``` @@ -106,19 +111,15 @@ cd "$HOME/api" ```bash # Deploy new version -$ ./scripts/deploy-bluegreen.sh b8c4d2e -[1/7] Pulling image... -[2/7] Detecting active container... -[3/7] Starting inactive container... -[4/7] Waiting for health check... 
-[5/7] Switching nginx upstream... -[6/7] Reloading nginx... -[7/7] Cleaning old container... -Deployment successful. -Deployment history updated: b8c4d2e +$ ./scripts/deploy.sh b8c4d2e +[DEPLOY] state=PULL_IMAGE ... +[DEPLOY] state=START_INACTIVE ... +[DEPLOY] state=HEALTH_CHECK_INTERNAL ... +[DEPLOY] state=SWITCH_NGINX ... +[DEPLOY] state=SUCCESS duration_sec=18 # Issue discovered - rollback -$ ./scripts/rollback.sh +$ ./scripts/deploy.sh --rollback Current deployment : b8c4d2e Previous deployment: a4f91c2 @@ -131,11 +132,7 @@ Deployment history: Current production will be replaced with: a4f91c2 Continue with rollback? (yes/no): yes - -Starting rollback to image: a4f91c2 -[1/7] Pulling image... -... -Rollback completed successfully. +[DEPLOY] state=SUCCESS duration_sec=9 msg=DEPLOY_SUCCESS Production is now running: a4f91c2 ``` @@ -143,7 +140,7 @@ Production is now running: a4f91c2 | Issue | Solution | |-------|----------| -| Script not executable | `chmod +x scripts/rollback.sh` | +| Script not executable | `chmod +x scripts/deploy.sh` | | No deployment history | Deploy at least once before rollback | | Insufficient history | Need at least 2 deployments to rollback | | Image not found | Verify SHA exists in GHCR | diff --git a/docs/ROLLBACK_SYSTEM.md b/docs/ROLLBACK_SYSTEM.md index 0aad560..56f9a8d 100644 --- a/docs/ROLLBACK_SYSTEM.md +++ b/docs/ROLLBACK_SYSTEM.md @@ -8,9 +8,8 @@ The rollback system provides instant production recovery by redeploying previous ### Components -1. **deploy-bluegreen.sh** - Blue-green deployment script with deployment tracking -2. **rollback.sh** - Automated rollback to previous deployment -3. **.deploy_history** - Deployment history file storing the last 5 deployed image SHAs +1. **deploy.sh** - Unified blue-green deployment and rollback script +2. 
**.deploy_history** - Deployment history file storing the last 5 deployed image SHAs ### How It Works @@ -23,8 +22,8 @@ The rollback system provides instant production recovery by redeploying previous 2. Deploy script pulls image and performs blue-green deployment 3. After successful deployment → prepends "a4f91c2" to .deploy_history 4. History maintains last 5 deployments -5. If deployment fails → rollback.sh reads line 2 from .deploy_history -6. Rollback redeploys previous image using deploy-bluegreen.sh +5. If deployment fails → `deploy.sh --rollback --auto` is triggered by CI +6. Rollback redeploys previous image using `deploy.sh ` ``` ### Deployment Tracking @@ -63,7 +62,7 @@ Deploy the latest image from CI: ```bash cd "$HOME/api" -./scripts/deploy-bluegreen.sh a4f91c2 +./scripts/deploy.sh a4f91c2 ``` ### Rollback to Previous Version @@ -72,7 +71,7 @@ Instantly restore the last working deployment: ```bash cd "$HOME/api" -./scripts/rollback.sh +./scripts/deploy.sh --rollback ``` **Interactive output with history:** @@ -99,10 +98,7 @@ Manually deploy any historical image: ```bash # Deploy a specific commit SHA -./scripts/deploy-bluegreen.sh 7b3e9f1 - -# Deploy a specific tag -./scripts/deploy-bluegreen.sh v1.2.3 +./scripts/deploy.sh 7b3e9f1 ``` ## Safety Features @@ -147,7 +143,7 @@ sudo systemctl reload nginx # Reload only if valid ```bash # Deploy new version -./scripts/deploy-bluegreen.sh b8c4d2e +./scripts/deploy.sh b8c4d2e # Health check fails → deployment aborted # Production still running previous version @@ -158,7 +154,7 @@ sudo systemctl reload nginx # Reload only if valid ```bash # Deploy succeeds but issue discovered later -./scripts/rollback.sh +./scripts/deploy.sh --rollback # Confirms rollback # Redeploys previous image in <10 seconds @@ -169,11 +165,11 @@ sudo systemctl reload nginx # Reload only if valid ```bash # Need to deploy a specific older version -./scripts/deploy-bluegreen.sh 7b3e9f1 +./scripts/deploy.sh 7b3e9f1 # Pulls specific image 
from GHCR # Performs blue-green deployment -# Updates .last_deploy to 7b3e9f1 +# Prepends SHA to .deploy_history (rolling last 5) ``` ## Integration with CI/CD @@ -185,7 +181,7 @@ sudo systemctl reload nginx # Reload only if valid run: | ssh ${{ secrets.VPS_USER }}@${{ secrets.VPS_HOST }} \ "cd \"$HOME/api\" && \ - ./scripts/deploy-bluegreen.sh ${{ env.SHA_SHORT }}" + ./scripts/deploy.sh ${{ env.SHA_SHORT }}" ``` ### Deployment History @@ -206,10 +202,9 @@ The history maintains the last 5 deployments in chronological order (newest firs ``` $HOME/api/ ├── scripts/ -│ ├── deploy-bluegreen.sh # Blue-green deployment -│ └── rollback.sh # Rollback automation -├── .deploy_history # Last 5 deployment SHAs -└── .env # Environment configuration +│ └── deploy.sh # Unified deploy + rollback +├── .deploy_history # Last 5 deployment SHAs +└── .env # Environment configuration ``` ## Troubleshooting @@ -218,7 +213,7 @@ $HOME/api/ ```bash # Make script executable -chmod +x scripts/rollback.sh +chmod +x scripts/deploy.sh ``` ### No Deployment History @@ -278,4 +273,3 @@ Potential improvements (not currently implemented): - [Blue-Green Deployment](./DEPLOYMENT.md) - [CI/CD Pipeline](.github/workflows/deploy.yml) -- [VPS Setup](../scripts/vps-setup.sh) diff --git a/docs/SLO.md b/docs/SLO.md index b3daab9..88683bc 100644 --- a/docs/SLO.md +++ b/docs/SLO.md @@ -1,6 +1,6 @@ # FieldTrack Service Level Objectives (SLOs) -This document defines the service-level objectives for FieldTrack production services. Each SLO has a corresponding error budget and alert rules in `infra/prometheus/alerts.yml`. +This document defines the service-level objectives for FieldTrack production services. Alert rules are implemented in the standalone infra repository. 
--- @@ -19,7 +19,7 @@ This document defines the service-level objectives for FieldTrack production ser | | | |---|---| -| **SLI** | `up{job=~"fieldtrack-api.*"}` | +| **SLI** | HTTP availability measured from `up` metric on API containers | | **Target** | 99.9% monthly availability | | **Error budget** | 43.8 minutes / month | | **Window** | 30-day rolling | @@ -33,7 +33,7 @@ Sub-1h monthly downtime budget is appropriate for a B2B scheduling SaaS. Breach | | | |---|---| -| **SLI** | `histogram_quantile(0.95, ...)` over `http_request_duration_seconds_bucket` | +| **SLI** | p95 HTTP request duration (measured via `http_request_duration_seconds_bucket`) | | **Target p95** | < 500 ms | | **Target p99** | < 2 000 ms | | **Error budget** | 5% of requests may exceed the p95 threshold | @@ -48,7 +48,7 @@ Sub-1h monthly downtime budget is appropriate for a B2B scheduling SaaS. Breach | | | |---|---| -| **SLI** | `rate(http_requests_total{status_code=~"5.."}[5m]) / rate(http_requests_total[5m])` | +| **SLI** | Ratio of 5xx responses to total HTTP requests | | **Target** | < 1% 5xx error rate | | **Error budget** | 1% of requests may fail with 5xx | | **Window** | 5-minute rolling | diff --git a/docs/env-contract.md b/docs/env-contract.md index 922bc1d..7e1d63d 100644 --- a/docs/env-contract.md +++ b/docs/env-contract.md @@ -35,8 +35,8 @@ | `*_BASE_URL` | Full URL — scheme + host, **no trailing slash** | `https://api.getfieldtrack.app` | | `*_HOSTNAME` | Bare domain — **no scheme, no path** | `api.getfieldtrack.app` | -**`API_HOSTNAME` is always DERIVED from `API_BASE_URL` at deploy-time by `load-env.sh`.** -It must **never** be set in `.env` — set it only in `infra/.env.monitoring`. +**`API_HOSTNAME` is always DERIVED from `API_BASE_URL` at deploy-time by the deployment workflow/script.** +It must **never** be set in `.env` — set it only in the infra repository's `.env.monitoring`. --- @@ -48,7 +48,7 @@ Validated by `src/config/env.ts` (Zod schema, fail-fast). 
| Variable | Required in Prod | Type | Purpose | |----------|:---:|------|---------| -| `API_BASE_URL` | ✅ | `https://…` URL | **The canonical public URL of this API.** Used in OpenAPI server definitions and any server-generated links referencing the API itself. Also used by all deploy scripts and CI smoke tests. | +| `API_BASE_URL` | ✅ | `https://…` URL | **The canonical public URL of this API.** Used in OpenAPI server definitions and any server-generated links referencing the API itself. Also used by deploy scripts and CI health checks. | | `APP_BASE_URL` | ✅ | `https://…` URL | Canonical root URL for the whole application. Used in email footers, OpenGraph canonical tags, and generic redirects that don't need to distinguish API vs frontend. | | `FRONTEND_BASE_URL` | ✅ | `https://…` URL | Public URL of the web frontend (maintained in a separate repository: `fieldtrack-tech/web`). Used to build password-reset and invitation email links. | @@ -124,28 +124,22 @@ Validated by `src/config/env.ts` (Zod schema, fail-fast). ## CI / Scripts — GitHub Actions + Shell Scripts -Variables consumed by `smoke-test.sh`, deploy scripts, and workflows. +Variables consumed by deploy scripts and workflows. Stored as **GitHub repository secrets**. 
| Secret Name | Purpose | Used By | |------------|---------|---------| -| `API_BASE_URL` | Full public URL of the API for health probes and smoke tests | `deploy.yml`, `smoke-test.sh` | +| `API_BASE_URL` | Full public URL of the API for health probes | `deploy.yml`, `deploy.sh` | | `CORS_ORIGIN` | Allowed CORS origins for the deployed container | `deploy.yml` (pre-flight validation) | | `DO_HOST` | DigitalOcean VPS IP / hostname | SSH deploy steps | | `DO_USER` | SSH username on VPS | SSH deploy steps | | `DO_SSH_KEY` | SSH private key (PEM) | SSH deploy steps | -| `FT_EMP_EMAIL` | Employee test account email | `smoke-test.sh` | -| `FT_EMP_PASSWORD` | Employee test account password | `smoke-test.sh` | -| `FT_ADMIN_EMAIL` | Admin test account email | `smoke-test.sh` | -| `FT_ADMIN_PASSWORD` | Admin test account password | `smoke-test.sh` | -| `SUPABASE_URL` | Supabase project URL (for smoke test auth) | `smoke-test.sh` | -| `SUPABASE_ANON_KEY` | Supabase anon key (for smoke test auth) | `smoke-test.sh` | > **Renamed:** `FT_API_BASE_URL` → `API_BASE_URL`. Update the GitHub repo secret accordingly. --- -## Infra — `infra/.env.monitoring` +## Infra (standalone infra repository) — `.env.monitoring` Used by Docker Compose for Prometheus, Grafana, Nginx, Blackbox Exporter. @@ -206,7 +200,7 @@ FRONTEND_BASE_URL=https://app.getfieldtrack.app CORS_ORIGIN=https://app.getfieldtrack.app METRICS_SCRAPE_TOKEN= -# Infra (infra/.env.monitoring on VPS) +# Infra (.env.monitoring in infra repo on VPS) API_HOSTNAME=api.getfieldtrack.app METRICS_SCRAPE_TOKEN= GRAFANA_ADMIN_PASSWORD= @@ -217,12 +211,6 @@ CORS_ORIGIN=https://app.getfieldtrack.app DO_HOST= DO_USER=ashish DO_SSH_KEY= -FT_EMP_EMAIL= -FT_EMP_PASSWORD= -FT_ADMIN_EMAIL= -FT_ADMIN_PASSWORD= -SUPABASE_URL=https://your-project.supabase.co -SUPABASE_ANON_KEY=eyJ... 
``` --- @@ -248,7 +236,7 @@ The following variables were **renamed** as part of the env contract cleanup (Ma | Old Name | New Name | Where | |----------|----------|-------| -| `FT_API_BASE_URL` | `API_BASE_URL` | GitHub secrets, `smoke-test.sh`, `deploy.yml` | +| `FT_API_BASE_URL` | `API_BASE_URL` | GitHub secrets, `deploy.yml` | **Action required:** 1. Rename the GitHub repository secret `FT_API_BASE_URL` → `API_BASE_URL` diff --git a/docs/infra-contract.md b/docs/infra-contract.md new file mode 100644 index 0000000..63425bd --- /dev/null +++ b/docs/infra-contract.md @@ -0,0 +1,20 @@ +# Infra Contract + +This API repository expects an external infra repository to provide runtime infrastructure. + +Required external services: +- nginx container attached to `api_network` +- Redis reachable at `redis:6379` + +Required external paths under `INFRA_ROOT`: +- `$INFRA_ROOT/nginx/live` +- `$INFRA_ROOT/nginx/backup` +- `$INFRA_ROOT/nginx/api.conf` + +Default on server: +- `INFRA_ROOT=/opt/infra` + +Deployment assumptions: +- API deploy script (`scripts/deploy.sh`) never starts infra services +- API deploy script only renders and reloads nginx config via paths under `INFRA_ROOT` +- API and infra share the Docker bridge network `api_network` diff --git a/docs/walkthrough.md b/docs/walkthrough.md index 2f07725..c80d357 100644 --- a/docs/walkthrough.md +++ b/docs/walkthrough.md @@ -1784,101 +1784,18 @@ Result: **0 errors** across all 3 utility files, 3 repositories, 2 services, 5 c --- -## Phase 13 — Production Infrastructure: VPS, Nginx & Monitoring Stack +## Phase 13 — Production Infrastructure (Moved) -### Overview - -Phase 13 moved FieldTrack 2.0 from a locally-runnable service to a fully operational production deployment. It introduced the VPS setup automation, Nginx reverse proxy, and a complete self-hosted observability stack (Prometheus + Grafana + Loki + Tempo). 
- ---- - -### 13.1 — VPS Setup Script - -**File:** `scripts/vps-setup.sh` - -A single idempotent script provisions a fresh Ubuntu VPS from zero to production-ready: - -- Installs Docker, Docker Compose, Nginx, and dependencies -- Creates the `fieldtrack` OS user with limited permissions -- Clones the repository and creates the directory structure -- Configures the `systemd` service for auto-restart -- Issues and renews TLS certificates via Let's Encrypt (`certbot`) -- Sets up log rotation and minimal firewall rules (`ufw`) -- Starts the monitoring stack alongside the application - ---- +Infrastructure ownership was extracted from this API repository. -### 13.2 — Nginx Reverse Proxy +The following are now managed in the standalone infra repository: +- VPS bootstrap and host setup +- nginx reverse proxy configuration +- Redis runtime service +- monitoring stack (Prometheus, Grafana, Loki, Alertmanager, Promtail) -**File:** `infra/nginx/api.conf` - -- Terminates TLS (HTTPS → HTTP to backend containers) -- Upstream block points to the active blue/green container port -- HTTP → HTTPS redirect on port 80 -- Proxy headers: `X-Real-IP`, `X-Forwarded-For`, `X-Forwarded-Proto` -- WebSocket upgrade support (`Upgrade`, `Connection` headers) -- Gzip compression for JSON responses -- Security headers: `X-Frame-Options`, `X-Content-Type-Options`, `HSTS` - ---- - -### 13.3 — Monitoring Stack - -**File:** `infra/docker-compose.monitoring.yml` - -Five services on the `api_network` Docker network: - -| Service | Port | Role | -|---------|------|------| -| `prometheus` | 9090 | Scrapes `/metrics` every 15 s; stores time-series | -| `grafana` | 3001 | Dashboards, alerting, data-source wiring | -| `loki` | 3100 | Log aggregation backend | -| `promtail` | — | Reads Docker container logs; ships to Loki | -| `tempo` | 3200 / 4317 / 4318 | Distributed trace storage; OTLP ingest | - ---- +This API repository now focuses on application code and deployment orchestration only. 
-### 13.4 — Grafana Dashboard - -**File:** `infra/grafana/dashboards/fieldtrack.json` - -A provisioned Grafana dashboard covering: - -- HTTP request rate and error rate by route -- p50/p95/p99 latency per endpoint -- Node.js heap usage and event-loop lag -- BullMQ queue depth and recalculation throughput -- Active session count -- Redis memory usage - -Dashboard is automatically loaded on container start via `infra/grafana/provisioning/`. - ---- - -### Files Created - -| File | Purpose | -|------|----------| -| `scripts/vps-setup.sh` | Full VPS provisioning from scratch | -| `infra/docker-compose.monitoring.yml` | Prometheus, Grafana, Loki, Promtail, Tempo | -| `infra/grafana/dashboards/fieldtrack.json` | Application dashboard (auto-provisioned) | -| `infra/grafana/provisioning/dashboards/dashboard.yml` | Dashboard provisioning config | -| `infra/grafana/provisioning/datasources/prometheus.yml` | Prometheus datasource provisioning | -| `infra/nginx/api.conf` | Nginx reverse proxy and TLS termination | -| `infra/prometheus/prometheus.yml` | Scrape config targeting backend `/metrics` | - ---- - -### Verification Results - -| Check | Result | -|-------|--------| -| VPS setup script idempotent | Can be re-run safely on existing VPS | -| Nginx serves HTTPS | TLS via Let's Encrypt certbot | -| Grafana auto-provisioned | Dashboard loads on container start | -| Prometheus scrapes backend | `http_requests_total` visible in Grafana | - ---- ## Phase 14 — Distributed Tracing, Log Correlation & Metric Exemplars @@ -1961,7 +1878,7 @@ httpRequestDuration.labels(labels).observeWithExemplar( Exemplars make individual high-latency data points "clickable" in Grafana: clicking a spike in the latency graph jumps directly to the Tempo trace for that exact request. 
-Infrastructure requirements enabled in `docker-compose.monitoring.yml`: +Infrastructure requirements enabled in the standalone infra repository: - Prometheus `--enable-feature=exemplar-storage` flag - Backend scraped with `Content-Type: application/openmetrics-text` (required for exemplar ingestion) @@ -1972,11 +1889,9 @@ Infrastructure requirements enabled in `docker-compose.monitoring.yml`: | File | Action | |------|--------| | `src/tracing.ts` | **NEW** — OpenTelemetry SDK bootstrap; OTLP exporter to Tempo | -| `src/server.ts` | **MODIFIED** — `import "./tracing.js"` as the very first import | +| `src/server.ts` | **MODIFIED** — calls `initTelemetry()` at startup before app bootstrap | | `src/config/logger.ts` | **MODIFIED** — `otelMixin` injects trace/span IDs into every log line | | `src/plugins/prometheus.ts` | **MODIFIED** — exemplar support on duration histogram | -| `infra/docker-compose.monitoring.yml` | **MODIFIED** — Tempo OTLP ports 4317/4318; Prometheus exemplar storage | -| `infra/prometheus/prometheus.yml` | **MODIFIED** — OpenMetrics scrape protocol for backend jobs | | `src/app.ts` | **MODIFIED** — `onRequest` hook enriches active span with route pattern and request ID | --- @@ -2372,7 +2287,7 @@ The pipeline is split into two jobs: ### Multi-Version Rollback System -**Files:** `scripts/deploy-bluegreen.sh`, `scripts/rollback.sh` +**File:** `scripts/deploy.sh` #### Deployment History @@ -2391,19 +2306,19 @@ The history window is capped at the **last 5 deployments**. #### Rollback Procedure ```bash -./scripts/rollback.sh +./scripts/deploy.sh --rollback ``` 1. Reads `.deploy_history` — requires ≥ 2 entries 2. Displays current and target versions with the full history 3. Prompts for interactive confirmation: `Continue with rollback? (yes/no)` -4. Calls `deploy-bluegreen.sh ` to redeploy the previous image +4. Calls `deploy.sh ` to redeploy the previous image 5. 
The previous image is already in GHCR — no rebuild, **< 10 seconds** end-to-end #### Deploy a Specific Historical Version ```bash -./scripts/deploy-bluegreen.sh 7b3e9f1 +./scripts/deploy.sh 7b3e9f1 ``` Any SHA from `.deploy_history` (or any valid GHCR tag) can be targeted directly. @@ -2415,8 +2330,7 @@ Any SHA from `.deploy_history` (or any valid GHCR tag) can be targeted directly. | File | Action | |------|--------| | `.github/workflows/deploy.yml` | **MODIFIED** — Split into `test` + `build-and-deploy` jobs; `npm ci`; `tsc --noEmit`; GHA cache | -| `scripts/deploy-bluegreen.sh` | **MODIFIED** — Appends SHA to `.deploy_history`; maintains 5-entry window | -| `scripts/rollback.sh` | **NEW** — Reads history, confirms, re-deploys previous image | +| `scripts/deploy.sh` | **MODIFIED** — Unified deploy + rollback, appends SHA to `.deploy_history`; maintains 5-entry window | | `.gitignore` | **MODIFIED** — `.deploy_history` excluded | | `docs/ROLLBACK_SYSTEM.md` | **NEW** — Architecture, usage, troubleshooting guide | | `docs/ROLLBACK_QUICKREF.md` | **NEW** — Fast reference card for operators | diff --git a/infra/.env.monitoring.example b/infra/.env.monitoring.example deleted file mode 100644 index 711716d..0000000 --- a/infra/.env.monitoring.example +++ /dev/null @@ -1,61 +0,0 @@ -# ============================================================================= -# FieldTrack 2.0 — Monitoring Stack Environment -# -# Copy to infra/.env.monitoring on the VPS and fill in values. -# Do NOT commit this file with real secrets — keep it on the VPS only. 
-# -# Usage: -# docker compose --env-file infra/.env.monitoring \ -# -f infra/docker-compose.monitoring.yml up -d -# -# Validate before deploy: -# bash scripts/validate-env.sh --check-monitoring -# ============================================================================= - -# ── ENV CONTRACT ────────────────────────────────────────────────────────────── -# -# APP layer → API_BASE_URL lives in .env (full URL) -# INFRA layer → API_HOSTNAME lives here (hostname only) -# -# API_HOSTNAME MUST match the hostname portion of API_BASE_URL: -# .env: API_BASE_URL=https://api.example.com -# this file: API_HOSTNAME=api.example.com -# -# Verify consistency before every deploy: -# bash scripts/validate-env.sh --check-monitoring -# -# API_DOMAIN IS REMOVED — do not add it here or anywhere else. -# ============================================================================= - -# ── Infra layer (hostname only — no scheme, no trailing slash) ──────────────── -# -# Derived from API_BASE_URL in .env. -# Prometheus uses this for the readiness probe target. -# Grafana uses this for GF_SERVER_ROOT_URL. -# -# Example: API_BASE_URL=https://api.getfieldtrack.app -# → API_HOSTNAME=api.getfieldtrack.app -API_HOSTNAME=api.getfieldtrack.app - -# ── Grafana ─────────────────────────────────────────────────────────────────── -# Strong password for the Grafana admin account (min 12 chars). -GRAFANA_ADMIN_PASSWORD=change-me-use-a-strong-password - -# ── Prometheus scrape authentication ────────────────────────────────────────── -# Bearer token for the /metrics endpoint. -# MUST be identical to METRICS_SCRAPE_TOKEN in .env. -# Mismatch → Prometheus receives 401s → all metric alerts go blind. -# -# Generate: openssl rand -hex 32 -METRICS_SCRAPE_TOKEN=change-me-generate-with-openssl-rand-hex-32 - -# ── Alertmanager Slack notification target ──────────────────────────────────── -# Used by infra/scripts/render-alertmanager.sh to render the Alertmanager config -# template before container start. 
Alertmanager does NOT support env vars natively. -# -# Generate from: Slack → Your App → Incoming Webhooks → Add New Webhook -# Must start with: https://hooks.slack.com/ -# -# IMPORTANT: Do NOT add FRONTEND_DOMAIN here — it has been removed from the -# env contract. The render script will exit 1 if it detects that variable. -ALERTMANAGER_SLACK_WEBHOOK=YOUR_SLACK_INCOMING_WEBHOOK_URL diff --git a/infra/alertmanager/alertmanager.yml b/infra/alertmanager/alertmanager.yml deleted file mode 100644 index db267a1..0000000 --- a/infra/alertmanager/alertmanager.yml +++ /dev/null @@ -1,65 +0,0 @@ -# Alertmanager route and receiver configuration for Slack-only alerting. -# -# NOTE: -# This file is a TEMPLATE and MUST be rendered via envsubst before use. -# Alertmanager does NOT support environment variables natively. -# Render this file by running: -# bash infra/scripts/render-alertmanager.sh -# The rendered output is written to: infra/alertmanager/alertmanager.rendered.yml -# docker-compose mounts ONLY the rendered file — never this template directly. -# -# No email, SMTP, or PagerDuty configurations are present. 
- -route: - receiver: ops-slack-warning - group_by: ["alertname", "severity"] - group_wait: 30s - group_interval: 5m - repeat_interval: 4h - routes: - - match: - severity: critical - receiver: ops-slack-critical - - - match: - severity: warning - receiver: ops-slack-warning - -receivers: - # Critical alerts: dedicated Slack channel for immediate response - - name: ops-slack-critical - slack_configs: - - api_url: "${ALERTMANAGER_SLACK_WEBHOOK}" - channel: "#critical-alerts" - send_resolved: true - title: "[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}" - text: | - *Severity:* {{ .CommonLabels.severity }} - *Instance:* {{ .CommonLabels.instance }} - *Summary:* {{ .CommonAnnotations.summary }} - *Description:* {{ .CommonAnnotations.description }} - - # Warning alerts: standard alerts channel - - name: ops-slack-warning - slack_configs: - - api_url: "${ALERTMANAGER_SLACK_WEBHOOK}" - channel: "#alerts" - send_resolved: true - title: "[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}" - text: | - *Severity:* {{ .CommonLabels.severity }} - *Instance:* {{ .CommonLabels.instance }} - *Summary:* {{ .CommonAnnotations.summary }} - *Description:* {{ .CommonAnnotations.description }} - -# Silence rules: suppress expected noise during planned maintenance. -# Add entries here before a deployment rather than disabling alerting entirely. -inhibit_rules: - # If the backend container is down (DeploymentFailure), suppress the - # dependent high-latency and error-rate alerts — they are all downstream - # effects of the same root cause and would produce redundant notifications. 
- - source_matchers: - - alertname="DeploymentFailure" - target_matchers: - - alertname=~"FieldTrackHighErrorRate|FieldTrackHighLatency|FieldTrackAvgLatencyHigh|ReadinessCheckFailing" - equal: ["job"] diff --git a/infra/blackbox/blackbox.yml b/infra/blackbox/blackbox.yml deleted file mode 100644 index 6e114d0..0000000 --- a/infra/blackbox/blackbox.yml +++ /dev/null @@ -1,21 +0,0 @@ -# FieldTrack 2.0 — Blackbox Exporter Configuration -# -# Modules used by Prometheus scrape jobs (see prometheus.yml). -# fieldtrack-readiness: probes HTTPS /ready, also exposes TLS cert expiry metrics. - -modules: - - # HTTP probe used for readiness check and TLS certificate monitoring. - # probe_success == 1 when /ready returns HTTP 200. - # probe_ssl_earliest_cert_expiry exposes the TLS certificate expiry timestamp. - http_2xx: - prober: http - timeout: 10s - http: - valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] - valid_status_codes: [200] - method: GET - tls_config: - insecure_skip_verify: false - preferred_ip_protocol: "ip4" - ip_protocol_fallback: false diff --git a/infra/docker-compose.monitoring.yml b/infra/docker-compose.monitoring.yml deleted file mode 100644 index 4d973e9..0000000 --- a/infra/docker-compose.monitoring.yml +++ /dev/null @@ -1,264 +0,0 @@ -services: - - loki: - image: grafana/loki:2.9.6 - container_name: loki - restart: unless-stopped - expose: - - "3100" - volumes: - - loki_data:/loki - - ./loki/loki-config.yaml:/etc/loki/local-config.yaml:ro - command: -config.file=/etc/loki/local-config.yaml - networks: - - api_network - deploy: - resources: - limits: - memory: 1g - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 15s - - promtail: - image: grafana/promtail:2.9.6 - container_name: promtail - restart: unless-stopped - volumes: - - /var/log:/var/log:ro - - 
/var/lib/docker/containers:/var/lib/docker/containers:ro - - ./promtail/promtail.yml:/etc/promtail/promtail.yml:ro - - promtail_data:/data - command: -config.file=/etc/promtail/promtail.yml - networks: - - api_network - depends_on: - loki: - condition: service_healthy - deploy: - resources: - limits: - memory: 128m - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - alertmanager: - image: prom/alertmanager:v0.27.0 - container_name: alertmanager - restart: unless-stopped - expose: - - "9093" - - volumes: - - ./alertmanager/alertmanager.rendered.yml:/etc/alertmanager/alertmanager.yml:ro - - alertmanager_data:/alertmanager - - command: - - "--config.file=/etc/alertmanager/alertmanager.yml" - - "--storage.path=/alertmanager" - - "--web.listen-address=:9093" - - networks: - - api_network - - deploy: - resources: - limits: - memory: 128m - - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:9093/-/healthy"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 10s - - prometheus: - image: prom/prometheus:v2.52.0 - container_name: prometheus - restart: unless-stopped - expose: - - "9090" - - environment: - - METRICS_SCRAPE_TOKEN=${METRICS_SCRAPE_TOKEN} - - API_HOSTNAME=${API_HOSTNAME} - - volumes: - - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro - - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro - - prometheus_data:/prometheus - - command: - - "--config.file=/etc/prometheus/prometheus.yml" - - "--storage.tsdb.retention.time=30d" - - "--storage.tsdb.retention.size=5GB" - - "--web.enable-lifecycle" - - networks: - - api_network - - depends_on: - alertmanager: - condition: service_healthy - - deploy: - resources: - limits: - memory: 512m - - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"] - interval: 
30s - timeout: 5s - retries: 3 - start_period: 10s - - grafana: - image: grafana/grafana:10.4.2 - container_name: grafana - restart: unless-stopped - expose: - - "3000" - - environment: - - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD} - - GF_SERVER_ROOT_URL=https://${API_HOSTNAME}/grafana - - volumes: - - grafana_data:/var/lib/grafana - - networks: - - api_network - - depends_on: - prometheus: - condition: service_healthy - - deploy: - resources: - limits: - memory: 256m - - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 30s - - node-exporter: - image: prom/node-exporter:v1.8.1 - container_name: node-exporter - restart: unless-stopped - expose: - - "9100" - - command: - - "--path.rootfs=/host" - - volumes: - - /:/host:ro,rslave - - networks: - - api_network - - deploy: - resources: - limits: - memory: 64m - - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - nginx: - image: nginx:1.25-alpine - container_name: nginx - restart: unless-stopped - ports: - - "80:80" - - "443:443" - - volumes: - # Rendered nginx config — written by deploy script on each deploy - - ./nginx/live:/etc/nginx/conf.d:ro - # SSL certificates (managed by certbot on the host) - - /etc/ssl/api:/etc/ssl/api:ro - # ACME challenge webroot for certbot renewal - - /var/www/certbot:/var/www/certbot:ro - # Nginx access logs shared with promtail - - /var/log/nginx:/var/log/nginx - - networks: - - api_network - - # nginx can start as soon as the grafana *container* exists. - # Waiting for service_healthy would create a blocking chain: - # nginx → grafana → prometheus → alertmanager - # which delays the ingress layer on fresh deployments by minutes. - # nginx uses deferred Docker DNS ($api_backend variable + resolver 127.0.0.11) - # so it starts cleanly before any backend container is ready. 
- depends_on: - grafana: - condition: service_started - - deploy: - resources: - limits: - memory: 64m - - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - healthcheck: - test: ["CMD", "wget", "--no-check-certificate", "--spider", "-q", "https://localhost/health"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 10s - -networks: - api_network: - external: true - -volumes: - prometheus_data: - alertmanager_data: - grafana_data: - loki_data: - promtail_data: \ No newline at end of file diff --git a/infra/docker-compose.nginx.yml b/infra/docker-compose.nginx.yml new file mode 100644 index 0000000..82e4cba --- /dev/null +++ b/infra/docker-compose.nginx.yml @@ -0,0 +1,44 @@ +services: + + nginx: + image: nginx:1.25-alpine + container_name: nginx + restart: unless-stopped + ports: + - "80:80" + - "443:443" + + volumes: + # Rendered nginx config — written by deploy script on each deploy + - ./nginx/live:/etc/nginx/conf.d:ro + # SSL certificates (managed by certbot on the host) + - /etc/ssl/api:/etc/ssl/api:ro + # ACME challenge webroot for certbot renewal + - /var/www/certbot:/var/www/certbot:ro + # Nginx access logs shared with promtail + - /var/log/nginx:/var/log/nginx + + networks: + - api_network + + deploy: + resources: + limits: + memory: 64m + + logging: + driver: json-file + options: + max-size: "10m" + max-file: "3" + + healthcheck: + test: ["CMD", "wget", "--no-check-certificate", "--spider", "-q", "https://localhost/health"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 10s + +networks: + api_network: + external: true diff --git a/infra/docker-compose.redis.yml b/infra/docker-compose.redis.yml new file mode 100644 index 0000000..7043cc3 --- /dev/null +++ b/infra/docker-compose.redis.yml @@ -0,0 +1,38 @@ +services: + + redis: + image: redis:7-alpine + container_name: redis + restart: unless-stopped + command: redis-server --save 60 1 --loglevel warning + + volumes: + - redis_data:/data + + networks: + - 
api_network + + deploy: + resources: + limits: + memory: 256m + + logging: + driver: json-file + options: + max-size: "10m" + max-file: "3" + + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 10s + +networks: + api_network: + external: true + +volumes: + redis_data: diff --git a/infra/grafana/dashboards/fieldtrack.json b/infra/grafana/dashboards/fieldtrack.json deleted file mode 100644 index 48e11ac..0000000 --- a/infra/grafana/dashboards/fieldtrack.json +++ /dev/null @@ -1,680 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 1, - "id": null, - "links": [], - "panels": [ - { - "title": "HTTP Request Rate (req/s)", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisLabel": "req/s", - "drawStyle": "line", - "fillOpacity": 20, - "lineWidth": 2, - "pointSize": 5, - "showPoints": "auto", - "stacking": { - "mode": "none" - } - }, - "unit": "reqps" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "sum(rate(http_requests_total{job=~\"fieldtrack-api.*\"}[5m])) by (status_code)", - "legendFormat": "{{status_code}}", - "refId": "A" - } - ] - }, - { - "title": "HTTP Request Latency (p50 / p95 / p99)", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - 
"axisCenteredZero": false, - "axisLabel": "seconds", - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 2, - "pointSize": 5, - "showPoints": "auto", - "stacking": { - "mode": "none" - } - }, - "unit": "s" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{job=~\"fieldtrack-api.*\"}[5m])) by (le))", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~\"fieldtrack-api.*\"}[5m])) by (le))", - "legendFormat": "p95", - "refId": "B" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job=~\"fieldtrack-api.*\"}[5m])) by (le))", - "legendFormat": "p99", - "refId": "C" - } - ] - }, - { - "title": "In-Flight Requests", - "type": "stat", - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 8 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "red", - "value": 100 - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "http_requests_in_flight{job=~\"fieldtrack-api.*\"}", - "legendFormat": "In-Flight", - "refId": "A" - } - ] - }, - { - "title": "Total Requests (24h)", - "type": "stat", - "gridPos": { - "h": 4, - "w": 6, - "x": 6, - "y": 8 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "blue", - "value": null - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "sum(increase(http_requests_total{job=~\"fieldtrack-api.*\"}[24h]))", - "legendFormat": "Total", - "refId": "A" - } - ] - }, - { - "title": "Error Rate (5xx)", - "type": "stat", - "gridPos": { - "h": 4, - "w": 6, - "x": 12, - "y": 8 
- }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "percentunit", - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 0.01 - }, - { - "color": "red", - "value": 0.05 - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "sum(rate(http_requests_total{job=~\"fieldtrack-api.*\", status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=~\"fieldtrack-api.*\"}[5m]))", - "legendFormat": "5xx Rate", - "refId": "A" - } - ] - }, - { - "title": "Uptime", - "type": "stat", - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 8 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "s", - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "process_uptime_seconds{job=~\"fieldtrack-api.*\"}", - "legendFormat": "Uptime", - "refId": "A" - } - ] - }, - { - "title": "Request Rate by Route", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 12 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 15, - "lineWidth": 2 - }, - "unit": "reqps" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "sum(rate(http_requests_total{job=~\"fieldtrack-api.*\"}[5m])) by (route)", - "legendFormat": "{{route}}", - "refId": "A" - } - ] - }, - { - "title": "Node.js Heap Memory", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 12 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 20, - "lineWidth": 2 
- }, - "unit": "bytes" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "nodejs_heap_size_used_bytes{job=~\"fieldtrack-api.*\"}", - "legendFormat": "Heap Used", - "refId": "A" - }, - { - "expr": "nodejs_heap_size_total_bytes{job=~\"fieldtrack-api.*\"}", - "legendFormat": "Heap Total", - "refId": "B" - }, - { - "expr": "process_resident_memory_bytes{job=~\"fieldtrack-api.*\"}", - "legendFormat": "RSS", - "refId": "C" - } - ] - }, - { - "title": "CPU Usage (%)", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 20 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 20, - "lineWidth": 2 - }, - "unit": "percentunit", - "max": 1 - }, - "overrides": [] - }, - "targets": [ - { - "expr": "1 - avg(rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\"}[5m]))", - "legendFormat": "CPU Usage", - "refId": "A" - } - ] - }, - { - "title": "System Memory Usage", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 20 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 20, - "lineWidth": 2 - }, - "unit": "bytes" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "node_memory_MemTotal_bytes{job=\"node-exporter\"} - node_memory_MemAvailable_bytes{job=\"node-exporter\"}", - "legendFormat": "Used", - "refId": "A" - }, - { - "expr": "node_memory_MemTotal_bytes{job=\"node-exporter\"}", - "legendFormat": "Total", - "refId": "B" - } - ] - }, - { - "title": "Disk Usage", - "type": "gauge", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "percentunit", - "max": 1, - 
"thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 0.7 - }, - { - "color": "red", - "value": 0.9 - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "1 - (node_filesystem_avail_bytes{job=\"node-exporter\", mountpoint=\"/\", fstype!=\"tmpfs\"} / node_filesystem_size_bytes{job=\"node-exporter\", mountpoint=\"/\", fstype!=\"tmpfs\"})", - "legendFormat": "Disk Used", - "refId": "A" - } - ] - }, - { - "title": "Network I/O", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 15, - "lineWidth": 2 - }, - "unit": "Bps" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "sum(rate(node_network_receive_bytes_total{job=\"node-exporter\", device!~\"lo|veth.*|docker.*|br-.*\"}[5m]))", - "legendFormat": "Receive", - "refId": "A" - }, - { - "expr": "sum(rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!~\"lo|veth.*|docker.*|br-.*\"}[5m]))", - "legendFormat": "Transmit", - "refId": "B" - } - ] - }, - { - "title": "API Error Budget Remaining (30d)", - "type": "stat", - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 36 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "percentunit", - "thresholds": { - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "yellow", - "value": 0.5 - }, - { - "color": "green", - "value": 0.9 - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "fieldtrack:api_error_budget_remaining_30d", - "legendFormat": "Remaining", - "refId": "A" - } - ] - }, - { - "title": "API Error Burn Rate (1h / 6h)", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 36 - }, - "datasource": { - 
"type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 15, - "lineWidth": 2 - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "fieldtrack:api_error_rate_1h", - "legendFormat": "1h", - "refId": "A" - }, - { - "expr": "fieldtrack:api_error_rate_6h", - "legendFormat": "6h", - "refId": "B" - } - ] - }, - { - "title": "Webhook Permanent Failure Rate (5m)", - "type": "stat", - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 36 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "percentunit", - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 0.1 - }, - { - "color": "red", - "value": 0.3 - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "fieldtrack:webhook_failure_rate_5m", - "legendFormat": "Failure Rate", - "refId": "A" - } - ] - } - ], - "schemaVersion": 39, - "tags": [ - "fieldtrack", - "backend", - "monitoring" - ], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": {}, - "timezone": "browser", - "title": "FieldTrack 2.0 — Backend & System", - "uid": "fieldtrack-api", - "version": 2 -} \ No newline at end of file diff --git a/infra/grafana/provisioning/dashboards/dashboard.yml b/infra/grafana/provisioning/dashboards/dashboard.yml deleted file mode 100644 index ddd035f..0000000 --- a/infra/grafana/provisioning/dashboards/dashboard.yml +++ /dev/null @@ -1,15 +0,0 @@ -# FieldTrack 2.0 — Grafana Dashboard Provisioning -apiVersion: 1 - -providers: - - name: "FieldTrack Dashboards" - orgId: 1 - folder: "FieldTrack" - type: file - disableDeletion: false - editable: true - updateIntervalSeconds: 30 - allowUiUpdates: true - options: - path: /var/lib/grafana/dashboards - 
foldersFromFilesStructure: false diff --git a/infra/grafana/provisioning/datasources/prometheus.yml b/infra/grafana/provisioning/datasources/prometheus.yml deleted file mode 100644 index ce07def..0000000 --- a/infra/grafana/provisioning/datasources/prometheus.yml +++ /dev/null @@ -1,13 +0,0 @@ -# FieldTrack 2.0 — Grafana Datasource Provisioning -apiVersion: 1 - -datasources: - - name: Prometheus - type: prometheus - access: proxy - url: http://prometheus:9090 - isDefault: true - editable: false - jsonData: - timeInterval: "15s" - httpMethod: POST diff --git a/infra/loki/loki-config.yaml b/infra/loki/loki-config.yaml deleted file mode 100644 index e283b99..0000000 --- a/infra/loki/loki-config.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# FieldTrack 2.0 — Loki Configuration -# -# Single-binary mode (grafana/loki:2.9.6) with filesystem storage. -# Retention is enforced by the compactor (retention_enabled: true). - -auth_enabled: false - -server: - http_listen_port: 3100 - grpc_listen_port: 9095 - log_level: warn - -common: - path_prefix: /loki - storage: - filesystem: - chunks_directory: /loki/chunks - rules_directory: /loki/rules - replication_factor: 1 - ring: - kvstore: - store: inmemory - -schema_config: - configs: - - from: 2024-01-01 - store: tsdb - object_store: filesystem - schema: v13 - index: - prefix: index_ - period: 24h - -limits_config: - # Retain logs for 30 days; compactor enforces deletion - retention_period: 30d - -compactor: - working_directory: /loki/compactor - compaction_interval: 10m - retention_enabled: true - retention_delete_delay: 2h - retention_delete_worker_count: 150 - delete_request_store: filesystem - -analytics: - reporting_enabled: false diff --git a/infra/nginx/api.conf b/infra/nginx/api.conf deleted file mode 100644 index 1128f26..0000000 --- a/infra/nginx/api.conf +++ /dev/null @@ -1,274 +0,0 @@ -# ============================================================================ -# FieldTrack API — Production Nginx Reverse Proxy -# 
============================================================================ - -map $http_upgrade $connection_upgrade { - default upgrade; - '' close; -} - -# NOTE: No upstream block for api_backend. -# upstream blocks resolve server hostnames at config-load time, which fails -# for Docker service names (api-blue / api-green) that may not exist yet. -# Instead, use a variable + proxy_pass to defer resolution to request time via -# the resolver 127.0.0.11 directive defined in the server block below. - -limit_req_zone $binary_remote_addr zone=api_rate:10m rate=60r/s; -limit_req_zone $binary_remote_addr zone=api_health:10m rate=5r/s; - -# Cloudflare IPs -set_real_ip_from 103.21.244.0/22; -set_real_ip_from 103.22.200.0/22; -set_real_ip_from 103.31.4.0/22; -set_real_ip_from 104.16.0.0/13; -set_real_ip_from 104.24.0.0/14; -set_real_ip_from 108.162.192.0/18; -set_real_ip_from 131.0.72.0/22; -set_real_ip_from 141.101.64.0/18; -set_real_ip_from 162.158.0.0/15; -set_real_ip_from 172.64.0.0/13; -set_real_ip_from 173.245.48.0/20; -set_real_ip_from 188.114.96.0/20; -set_real_ip_from 190.93.240.0/20; -set_real_ip_from 197.234.240.0/22; -set_real_ip_from 198.41.128.0/17; - -real_ip_header CF-Connecting-IP; -real_ip_recursive on; - -# --------------------------------------------------------------------------- -# Trusted-source detection via $realip_remote_addr -# -# $realip_remote_addr = the original TCP-connecting IP before the real_ip -# module rewrites $remote_addr to the end-user IP (from CF-Connecting-IP). -# For Cloudflare-proxied requests: $realip_remote_addr = CF edge IP. -# For VPS-local requests: $realip_remote_addr = 127.0.0.1. -# -# Used ONLY for the /monitor/ (Grafana) proxy — an internal dashboard that -# should not be directly reachable from arbitrary IPs. -# -# API routes (/, /admin/events, etc.) 
are intentionally NOT restricted here: -# - Application layer enforces all auth (JWT + RBAC) -# - CI, debugging, and direct-origin access must work without going through Cloudflare -# - Cloudflare still proxies all production user traffic (no change in UX) -# --------------------------------------------------------------------------- -geo $realip_remote_addr $is_trusted_source { - default 0; - 127.0.0.1/32 1; - ::1/128 1; - # Cloudflare IPv4 (https://www.cloudflare.com/ips-v4/) - 103.21.244.0/22 1; - 103.22.200.0/22 1; - 103.31.4.0/22 1; - 104.16.0.0/13 1; - 104.24.0.0/14 1; - 108.162.192.0/18 1; - 131.0.72.0/22 1; - 141.101.64.0/18 1; - 162.158.0.0/15 1; - 172.64.0.0/13 1; - 173.245.48.0/20 1; - 188.114.96.0/20 1; - 190.93.240.0/20 1; - 197.234.240.0/22 1; - 198.41.128.0/17 1; -} - -# HTTP → HTTPS (with ACME challenge passthrough for certbot renewal) -server { - listen 80; - listen [::]:80; - server_name __API_HOSTNAME__; - - # Let certbot serve ACME challenges for certificate renewal - location /.well-known/acme-challenge/ { - root /var/www/certbot; - } - - # Nginx-level liveness probe — answered by nginx directly, no upstream needed. - # Used by: in-network post-switch routing checks, CI health gates, and - # monitoring probes. Returns 200 even when the API container is down so that - # nginx infrastructure health never depends on backend readiness. - # This endpoint intentionally does NOT proxy to the API backend. 
- location = /health { - access_log off; - add_header Content-Type 'application/json; charset=utf-8' always; - return 200 '{"status":"ok"}'; - } - - location / { - return 301 https://$host$request_uri; - } -} - -# HTTPS SERVER -server { - - listen 443 ssl; - listen [::]:443 ssl; - - server_name __API_HOSTNAME__; - - ssl_certificate /etc/ssl/api/origin.crt; - ssl_certificate_key /etc/ssl/api/origin.key; - - ssl_protocols TLSv1.2 TLSv1.3; - ssl_prefer_server_ciphers on; - - server_tokens off; - - # ───────────────────────────────────────────────────────────────────────────── - # Docker DNS Resolution (CRITICAL for service name upstreams) - # - # Enables runtime DNS resolution for Docker service names (e.g., grafana:3000). - # Without this, Nginx fails at config-load with: "host not found in upstream". - # Docker's embedded resolver is at 127.0.0.11:53. - # valid=5s caches DNS lookups for 5 seconds — short enough that after a - # blue-green switch nginx re-resolves the new container within one health - # check cycle. ipv6=off stops AAAA queries that Docker bridge networks do - # not answer, which can add latency or cause spurious resolution failures. - # ───────────────────────────────────────────────────────────────────────────── - resolver 127.0.0.11 valid=5s ipv6=off; - resolver_timeout 5s; - - # Variable-based backend URL — resolved at request time via Docker DNS (127.0.0.11). - # __ACTIVE_CONTAINER__ is substituted with api-blue or api-green by deploy script. - set $api_backend "http://__ACTIVE_CONTAINER__:3000"; - - # safer host validation (still simple) - if ($host !~* ^(__API_HOSTNAME__|localhost|127\.0\.0\.1)$) { - return 444; - } - - # No server-level IP restrictions on API routes. - # All application endpoints are secured by JWT + RBAC in Fastify. - # IP-level access control is limited to /metrics, /internal (hard 403) - # and /monitor/ (Grafana dashboard, Cloudflare + localhost only). 
- - # Headers - add_header X-Frame-Options "SAMEORIGIN" always; - add_header X-Content-Type-Options "nosniff" always; - add_header X-XSS-Protection "1; mode=block" always; - add_header Referrer-Policy "strict-origin-when-cross-origin" always; - add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; - - add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; img-src 'self' data: https:; font-src 'self' data:; connect-src 'self' https://__API_HOSTNAME__; frame-ancestors 'self';" always; - - access_log /var/log/nginx/api_access.log; - error_log /var/log/nginx/api_error.log; - - client_max_body_size 10M; - client_body_timeout 30s; - send_timeout 30s; - - # Upstream timeout defaults for non-streaming traffic. - # SSE keeps a longer read timeout in its location block. - proxy_connect_timeout 5s; - proxy_send_timeout 60s; - proxy_read_timeout 60s; - - gzip on; - gzip_comp_level 5; - gzip_min_length 256; - gzip_proxied any; - gzip_vary on; - gzip_types application/json application/javascript text/css text/plain text/xml application/xml; - - # Block sensitive endpoints - location /metrics { - allow 127.0.0.1; - deny all; - } - location /internal { return 403; } - location ~ ^/internal/ { return 403; } - location /prometheus { return 403; } - - # Health — publicly accessible liveness probe (no dependencies). - # Reachable from CI runners, monitoring probes (Blackbox, uptime services), - # load balancers, and deploy scripts. Rate-limited to prevent abuse. - # Explicit upstream URIs (proxy_pass .../health) guard against URI-rewrite - # regressions — nginx won't silently change the upstream path. 
- location = /health { - limit_req zone=api_health burst=10 nodelay; - proxy_pass $api_backend$request_uri; - proxy_buffering off; - proxy_set_header Host __API_HOSTNAME__; - proxy_set_header X-Forwarded-Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_connect_timeout 5s; - proxy_read_timeout 30s; - } - - location = /ready { - # INTERNAL ONLY — requires dependency checks (Redis, Supabase, BullMQ) - # Expensive operations; only reachable from within VPS or localhost. - allow 127.0.0.1; - allow ::1; - deny all; - limit_req zone=api_health burst=10 nodelay; - proxy_pass $api_backend$request_uri; - proxy_buffering off; - proxy_set_header Host __API_HOSTNAME__; - proxy_set_header X-Forwarded-Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_connect_timeout 5s; - proxy_read_timeout 30s; - } - - # SSE — open to all origins; application enforces JWT auth - location = /admin/events { - limit_req zone=api_rate burst=10 nodelay; - proxy_pass $api_backend$request_uri; - proxy_http_version 1.1; - proxy_set_header Connection ''; - proxy_set_header Host __API_HOSTNAME__; - proxy_set_header X-Forwarded-Host $host; - proxy_set_header Authorization $http_authorization; - proxy_buffering off; - proxy_cache off; - add_header X-Accel-Buffering no; - proxy_read_timeout 3600s; - } - - # MAIN API — open to all origins; application enforces JWT + RBAC - location / { - limit_req zone=api_rate burst=50 nodelay; - proxy_pass $api_backend$request_uri; - proxy_http_version 1.1; - proxy_set_header Host __API_HOSTNAME__; - proxy_set_header X-Forwarded-Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_set_header X-Request-ID $request_id; 
- proxy_set_header Authorization $http_authorization; - proxy_set_header Accept-Encoding ""; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - proxy_connect_timeout 10s; - proxy_send_timeout 30s; - proxy_read_timeout 30s; - proxy_buffering on; - } - - # Grafana (Cloudflare + localhost only, via Docker service DNS) - location /monitor/ { - if ($is_trusted_source = 0) { return 403; } - set $grafana_upstream "http://grafana:3000"; - proxy_pass $grafana_upstream; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - proxy_set_header Host __API_HOSTNAME__; - proxy_set_header X-Forwarded-Host $host; - proxy_buffering off; - } - - location = /monitor { - return 301 $scheme://$host/monitor/; - } -} \ No newline at end of file diff --git a/infra/prometheus/alerts.yml b/infra/prometheus/alerts.yml deleted file mode 100644 index 0d23a9d..0000000 --- a/infra/prometheus/alerts.yml +++ /dev/null @@ -1,559 +0,0 @@ -groups: - -# --------------------------------------------------------- -# RECORDING RULES -# --------------------------------------------------------- - -- name: fieldtrack_recording_rules - rules: - - record: fieldtrack:api_requests_rate_5m - expr: sum(rate(http_requests_total{job=~"fieldtrack-api.*"}[5m])) - - - record: fieldtrack:api_errors_5xx_rate_5m - expr: sum(rate(http_requests_total{job=~"fieldtrack-api.*",status_code=~"5.."}[5m])) - - - record: fieldtrack:api_error_rate_5m - expr: fieldtrack:api_errors_5xx_rate_5m / clamp_min(fieldtrack:api_requests_rate_5m, 1e-9) - - - record: fieldtrack:api_error_rate_1h - expr: | - sum(rate(http_requests_total{job=~"fieldtrack-api.*",status_code=~"5.."}[1h])) - / - clamp_min(sum(rate(http_requests_total{job=~"fieldtrack-api.*"}[1h])), 1e-9) - - - record: fieldtrack:api_error_rate_6h - expr: | - sum(rate(http_requests_total{job=~"fieldtrack-api.*",status_code=~"5.."}[6h])) - / - 
clamp_min(sum(rate(http_requests_total{job=~"fieldtrack-api.*"}[6h])), 1e-9) - - - record: fieldtrack:webhook_failure_rate_5m - expr: | - sum(rate(webhook_failures_total[5m])) - / - clamp_min(sum(rate(webhook_deliveries_total[5m])), 1e-9) - - - record: fieldtrack:api_error_budget_remaining_30d - expr: | - 1 - ( - sum(increase(http_requests_total{job=~"fieldtrack-api.*",status_code=~"5.."}[30d])) - / - clamp_min(sum(increase(http_requests_total{job=~"fieldtrack-api.*"}[30d])), 1) - ) - -# --------------------------------------------------------- -# API HEALTH -# --------------------------------------------------------- - -- name: fieldtrack_api_alerts - rules: - - - alert: FieldTrackHighErrorRate - expr: fieldtrack:api_error_rate_5m > 0.05 - for: 2m - labels: - severity: critical - annotations: - summary: "High API error rate" - description: "5xx errors exceed 5%" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" - runbook: | - Cause: Application throwing unhandled errors or DB/dependency failures. - Actions: - 1. Check container logs: docker logs fieldtrack-api --tail 200 - 2. Check /system-health endpoint from VPS - 3. Review recent deployments: git log --oneline -10 - 4. If DB: check Supabase dashboard for connection pool saturation - 5. If memory: check HostMemoryPressure alert and restart container - 6. Rollback if needed: see docs/ROLLBACK_QUICKREF.md - - - alert: FieldTrackHighLatency - expr: | - histogram_quantile( - 0.95, - sum(rate(http_request_duration_seconds_bucket{job=~"fieldtrack-api.*"}[5m])) by (le) - ) > 1 - for: 5m - labels: - severity: warning - annotations: - summary: "High API latency" - description: "p95 latency above 1 second" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" - runbook: | - Cause: Slow DB queries, queue contention, or upstream dependency latency. - Actions: - 1. Open Grafana latency panel (p95/p99) and identify spike start time - 2. 
Check slow-response logs in Loki (`slow_response` and `very_slow_response`) - 3. Check DB load and connection saturation in Supabase dashboard - 4. Inspect queue backlogs via GET /admin/system-health - 5. Roll back recent deployment if latency regression started post-release - - - alert: FieldTrackAvgLatencyHigh - expr: | - sum(rate(http_request_duration_seconds_sum{job=~"fieldtrack-api.*"}[5m])) - / - sum(rate(http_request_duration_seconds_count{job=~"fieldtrack-api.*"}[5m])) - > 0.5 - for: 5m - labels: - severity: warning - annotations: - summary: "FieldTrack API latency exceeded threshold" - description: "Average response time exceeded 500 ms for 5 minutes" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" - runbook: | - Cause: Sustained performance degradation across many routes. - Actions: - 1. Compare avg latency with p95/p99 to identify broad vs tail issue - 2. Review top routes by request rate and latency in Grafana - 3. Inspect backend logs for DB timeout and retry patterns - 4. Validate Redis and Supabase health via /ready and /system-health - 5. Trigger rollback if regression is tied to latest deploy - -# --------------------------------------------------------- -# WORKER ALERTS -# --------------------------------------------------------- - -- name: fieldtrack_worker_alerts - rules: - - - alert: DistanceWorkerJobFailuresHigh - expr: increase(distance_jobs_total{status="failed"}[5m]) > 3 - for: 1m - labels: - severity: critical - annotations: - summary: "Distance worker jobs failing at high rate" - description: "More than 3 distance recalculation jobs permanently failed in the last 5 minutes. Check Redis connectivity and the distance-engine queue." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" - runbook: | - Cause: Redis connectivity failure, Supabase query errors, or malformed GPS data. - Actions: - 1. Check Redis: redis-cli -u $REDIS_URL ping - 2. 
Check worker logs: docker logs fieldtrack-api | grep "Distance worker" - 3. Inspect failed queue: GET /admin/system-health (worker section) - 4. Replay stuck sessions via queue_retry_intents if needed - 5. Check for GPS point anomalies (MAX_POINTS_PER_SESSION exceeded) - - - alert: AnalyticsQueueBacklogGrowing - expr: analytics_queue_depth > 500 - for: 5m - labels: - severity: warning - annotations: - summary: "Analytics queue backlog high" - description: "Analytics queue depth exceeded 500 for 5 minutes" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" - runbook: | - Cause: Worker throughput below enqueue rate or downstream DB contention. - Actions: - 1. Check analytics worker logs for repeated errors/timeouts - 2. Inspect queue depth in GET /admin/system-health - 3. Validate Redis latency and connection health - 4. Check Supabase CPU/connection pressure - 5. Temporarily scale worker concurrency if safe - - # Phase 22: Fire if more than 5 analytics jobs permanently fail within 5 minutes. - # This indicates a systemic problem (bad DB schema change, Supabase outage, etc.) - # that retries alone cannot recover from — requires operator intervention. - - alert: AnalyticsJobFailuresHigh - expr: increase(analytics_job_failures_total[5m]) > 5 - for: 1m - labels: - severity: critical - annotations: - summary: "Analytics jobs failing at high rate" - description: "More than 5 analytics jobs permanently failed (exhausted all retries) in the last 5 minutes. Check the analytics-failed dead letter queue and worker logs." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" - runbook: | - Cause: Supabase schema change, DB connection exhaustion, or analytics aggregation bug. - Actions: - 1. Check worker logs: docker logs fieldtrack-api | grep "analytics" - 2. Inspect dead letter queue via GET /admin/system-health - 3. 
Verify DB schema: check employee_daily_metrics and org_daily_metrics tables - 4. If transient: failed jobs auto-expire after 72 h; monitor retry_intents_dead metric - 5. If persistent: hotfix deployment required — see docs/ROLLBACK_QUICKREF.md - -# --------------------------------------------------------- -# HOST ALERTS -# --------------------------------------------------------- - -- name: fieldtrack_host_alerts - rules: - - - alert: HostHighCPU - expr: 100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 85 - for: 5m - labels: - severity: warning - annotations: - summary: "High CPU usage" - description: "Host CPU usage above 85% for 5 minutes" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" - runbook: | - Cause: Traffic surge, runaway process, or expensive query loops. - Actions: - 1. Check top CPU consumers on host (`top`/`htop`) - 2. Correlate with request rate and queue depth in Grafana - 3. Inspect container logs for retry storms or hot loops - 4. Scale out backend replicas or reduce noisy traffic source - 5. Roll back if a recent deploy caused the spike - - - alert: HostMemoryPressure - expr: | - (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) - / - node_memory_MemTotal_bytes - > 0.85 - for: 5m - labels: - severity: warning - annotations: - summary: "High memory usage" - description: "Host memory usage above 85% for 5 minutes" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" - runbook: | - Cause: Memory leak, oversized cache, or traffic burst. - Actions: - 1. Inspect container RSS and heap charts in Grafana - 2. Check process logs for OOM warnings and GC pressure - 3. Restart affected container if memory does not recover - 4. If recurring post-deploy, roll back and open incident - 5. 
Confirm host swap/disk not under pressure simultaneously - - - alert: DiskAlmostFull - expr: | - (node_filesystem_size_bytes{mountpoint="/"} - - node_filesystem_free_bytes{mountpoint="/"}) - / - node_filesystem_size_bytes{mountpoint="/"} - > 0.85 - for: 5m - labels: - severity: critical - annotations: - summary: "Disk usage above 85%" - description: "Root filesystem usage exceeded 85% for 5 minutes" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/DEPLOYMENT.md" - runbook: | - Cause: Log growth, artifact buildup, or runaway temp files. - Actions: - 1. Identify large directories (`du -sh /*` on host) - 2. Rotate/prune Docker images and logs - 3. Verify Loki/Promtail retention settings - 4. Free space before deployment operations - 5. Increase disk capacity if growth trend persists - -# --------------------------------------------------------- -# DEPLOYMENT & INFRASTRUCTURE ALERTS -# --------------------------------------------------------- - -- name: fieldtrack_infrastructure_alerts - rules: - - - alert: RedisDown - expr: up{job="redis"} == 0 - for: 2m - labels: - severity: critical - annotations: - summary: "Redis is unreachable" - description: "Redis has been down for more than 2 minutes. BullMQ workers, rate limiting, and the auth context cache will all degrade until Redis recovers." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ARCHITECTURE.md" - runbook: | - Cause: Redis container crash, OOM kill, or network partition. - Actions: - 1. Check container: docker ps | grep redis; docker logs redis --tail 50 - 2. Restart if crashed: docker restart redis (or docker compose up -d redis) - 3. Verify BullMQ reconnects: check worker logs after Redis recovery - 4. Rate limiting degrades gracefully (requests allowed through) during outage - 5. 
Circuit-breaker state is DB-backed and survives Redis restart - - - alert: DeploymentFailure - expr: up{job=~"fieldtrack-api.*"} == 0 - for: 2m - labels: - severity: critical - annotations: - summary: "Backend container is down" - description: "{{ $labels.job }} has been down for more than 2 minutes. Check deployment logs and container status." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" - runbook: | - Cause: Container OOM, crash loop, failed deployment, or host issue. - Actions: - 1. Check status: docker ps -a | grep fieldtrack - 2. Inspect last 100 lines: docker logs fieldtrack-api --tail 100 - 3. Check exit code: docker inspect fieldtrack-api | jq '.[0].State' - 4. Restart if safe: docker restart fieldtrack-api - 5. Rollback if bad deploy: see docs/ROLLBACK_QUICKREF.md - 6. Check host memory/disk: node_memory and node_filesystem alerts - - - alert: ReadinessCheckFailing - expr: probe_success{job="fieldtrack-readiness"} == 0 - for: 3m - labels: - severity: critical - annotations: - summary: "Readiness check failing" - description: "/ready endpoint has been failing for 3 minutes. Check DB, Redis, and Supabase connectivity." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" - runbook: | - Cause: One or more hard dependencies unhealthy (Redis/Supabase/BullMQ). - Actions: - 1. Hit /ready and /health manually from VPS - 2. Check Redis ping and Supabase connectivity - 3. Inspect container logs for startup/recovery errors - 4. Check worker state in /admin/system-health - 5. 
Roll back if issue began immediately after deployment - -# --------------------------------------------------------- -# TLS CERTIFICATE ALERTS -# --------------------------------------------------------- - -- name: fieldtrack_tls_alerts - rules: - - - alert: TLSCertExpiringSoon - expr: probe_ssl_earliest_cert_expiry{job="fieldtrack-readiness"} - time() < 14 * 24 * 3600 - for: 1h - labels: - severity: warning - annotations: - summary: "TLS certificate expiring within 14 days" - description: "Certificate for {{ $labels.instance }} expires in less than 14 days. Renew via certbot." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/DEPLOYMENT.md" - runbook: | - Cause: Certificate nearing expiry date. - Actions: - 1. Verify expiry date using blackbox panel and `openssl s_client` - 2. Renew certificate (certbot or managed provider) - 3. Reload NGINX and confirm certificate chain - 4. Recheck probe_ssl_earliest_cert_expiry metric - 5. Confirm no stale cert served via CDN edge - - - alert: TLSCertExpired - expr: probe_ssl_earliest_cert_expiry{job="fieldtrack-readiness"} - time() < 0 - for: 5m - labels: - severity: critical - annotations: - summary: "TLS certificate has expired" - description: "Certificate for {{ $labels.instance }} has expired. All HTTPS traffic is failing." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/DEPLOYMENT.md" - runbook: | - Cause: Certificate renewal failed or cert not reloaded. - Actions: - 1. Renew certificate immediately - 2. Reload NGINX and verify HTTPS handshake - 3. Validate Cloudflare/full-chain configuration - 4. Confirm /health and /ready are reachable over HTTPS - 5. Open incident and track customer impact window - -# --------------------------------------------------------- -# WEBHOOK DELIVERY SLOs (SLO 4 + SLO 5) -# See docs/SLO.md for full SLO definitions and error-budget -# burn-rate strategy. 
-# --------------------------------------------------------- - -- name: fieldtrack_webhook_slo_alerts - rules: - - # --- SLO 4: Webhook delivery permanent failure rate > 10% for 5 m (warning) -- - - alert: WebhookDeliveryFailureRateWarning - expr: fieldtrack:webhook_failure_rate_5m > 0.10 - for: 5m - labels: - severity: warning - annotations: - summary: "Webhook permanent failure rate above 10%" - description: >- - More than 10% of webhook deliveries are permanently failing (all retries - exhausted) over the last 5 minutes. Check receiver endpoints and circuit - breaker status. DLQ jobs can be replayed via POST /admin/webhook-dlq/:id/replay. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/WEBHOOK_SIGNATURES.md" - runbook: | - Cause: Elevated webhook failures for one or more receivers. - Actions: - 1. Check webhook worker logs for dominant error patterns - 2. Inspect DLQ depth and recent failed deliveries - 3. Confirm receiver endpoints are reachable and returning 2xx - 4. Check circuit breaker status in webhooks table - 5. Replay DLQ jobs after root cause is fixed - - # --- SLO 4: Webhook delivery permanent failure rate > 30% for 2 m (critical) - - - alert: WebhookDeliveryFailureRateHigh - expr: fieldtrack:webhook_failure_rate_5m > 0.30 - for: 2m - labels: - severity: critical - annotations: - summary: "Webhook permanent failure rate critically high (>30%)" - description: >- - Over 30% of webhook deliveries are permanently failing. This is a - customer-visible outage for all orgs with active webhooks. Investigate - immediately: check DB connectivity, receiver endpoints, and circuit breaker - state. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/WEBHOOK_SIGNATURES.md" - runbook: | - Cause: Mass endpoint failures, DB outage, or a code bug in the delivery worker. - Actions: - 1. Check worker logs: docker logs fieldtrack-api | grep "webhook.worker" - 2. Inspect DLQ: GET /admin/webhook-dlq (admin token required) - 3. 
Check circuit breaker state: query webhooks table for circuit_open_until IS NOT NULL - 4. Replay DLQ entries after fixing root cause: POST /admin/webhook-dlq/:id/replay - 5. If DB issue: check Supabase dashboard, verify webhook_deliveries writes - 6. If code bug: rollback deployment — see docs/ROLLBACK_QUICKREF.md - - # --- SLO 5: DLQ depth above 100 for 30 min -------------------------------- - - alert: WebhookDlqGrowing - expr: dlq_size{queue="webhook-delivery-dlq"} > 100 - for: 30m - labels: - severity: warning - annotations: - summary: "Webhook DLQ depth above 100 for 30 minutes" - description: >- - The webhook dead-letter queue has had more than 100 unprocessed jobs for - 30 minutes. This indicates sustained delivery failures that exceed the - normal transient-failure pattern. Review DLQ via GET /admin/webhook-dlq - and replay or purge stale entries. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" - runbook: | - Cause: Persistent downstream delivery failures. - Actions: - 1. Review DLQ entries and identify repeated endpoint failures - 2. Confirm webhook receiver health and DNS/TLS validity - 3. Inspect retry/error metrics and circuit breaker audit entries - 4. Purge stale DLQ entries after archival is confirmed - 5. Replay jobs only after receivers are healthy - - # --- Circuit breaker: any webhook circuit opened (leading indicator) ------- - # - # webhook_failures_total counts permanent failures; a sudden spike often - # indicates a circuit breaker tripped. A short `for: 0m` (fires immediately) - # gives the earliest possible signal to investigate the affected endpoint. - - alert: WebhookCircuitBreakerOpened - expr: increase(webhook_failures_total[2m]) > 5 - for: 0m - labels: - severity: warning - annotations: - summary: "Webhook failure spike — possible circuit breaker activation" - description: >- - More than 5 permanent webhook failures occurred in the last 2 minutes. 
- A circuit breaker may have opened, pausing delivery to one or more - endpoints. Check circuit breaker state in webhook_deliveries and the - webhooks.circuit_open_until column. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/WEBHOOK_SIGNATURES.md" - runbook: | - Cause: Rapid repeated delivery failures triggered circuit breaker protection. - Actions: - 1. Query webhooks with circuit_open_until > now() - 2. Validate receiver status codes and timeout behavior - 3. Confirm auto-recovery scanner is running in worker logs - 4. Check whether failures are payload/size related vs network - 5. Re-enable/replay once endpoint stability is restored - - # --- Rate limit burst spike ----------------------------------------------- - - alert: RateLimitBurstSpike - expr: increase(security_rate_limit_hits_total[5m]) > 500 - for: 2m - labels: - severity: warning - annotations: - summary: "Rate limiter blocking unusually high request volume" - description: >- - More than 500 requests were rate-limited in the last 5 minutes. This - may indicate a misconfigured client, a burst from a single org, or the - start of a DoS attempt. Review the rate-limit logs to identify the - offending org / IP. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ARCHITECTURE.md" - runbook: | - Cause: Burst traffic beyond per-user/per-org sliding window limits. - Actions: - 1. Inspect rate-limit logs for top offending keys - 2. Confirm traffic is expected (batch job) vs malicious - 3. Check Redis health to ensure limiter is functioning correctly - 4. Apply temporary edge-level mitigation if attack suspected - 5. 
Tune per-org/per-user thresholds only with incident review - -# --------------------------------------------------------- -# API ERROR BUDGET BURN RATE (SLO 3 multi-window alerting) -# See docs/SLO.md §Error Budget Alert Strategy -# --------------------------------------------------------- - -- name: fieldtrack_slo_error_budget - rules: - - # Fast burn: 1 h window at 14x burn rate (>14% error rate) - # exhausts monthly error budget in ~2 days if sustained. - - alert: FieldTrackSloErrorBudgetBurnFast - expr: fieldtrack:api_error_rate_1h > 0.14 - for: 5m - labels: - severity: critical - annotations: - summary: "API error budget burning fast (14x rate)" - description: >- - The 1-hour error rate exceeds 14% (14x normal budget burn). At this - rate the monthly error budget will be exhausted in under 2 days. - Investigate 5xx errors immediately - check logs, DB connectivity, and - recent deployments. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" - runbook: | - Cause: Sustained high error rate burning error budget at 14x the normal rate. - Actions: - 1. Identify failing routes: check Grafana → FieldTrack API dashboard - 2. Check container logs for exceptions: docker logs fieldtrack-api --tail 500 - 3. Check DB connectivity: /ready endpoint from VPS - 4. If recent deploy: rollback immediately — see docs/ROLLBACK_QUICKREF.md - 5. Open an incident; notify stakeholders if budget < 50% - - # Slow burn: 6 h window at 6x burn rate (>6% error rate) - # exhausts monthly error budget in ~5 days if sustained. - - alert: FieldTrackSloErrorBudgetBurnSlow - expr: fieldtrack:api_error_rate_6h > 0.06 - for: 15m - labels: - severity: warning - annotations: - summary: "API error budget burning (6x rate over 6 h)" - description: >- - The 6-hour error rate exceeds 6% (6x normal budget burn). Open a ticket - and investigate the root cause before the error budget is exhausted. 
- runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" - runbook: | - Cause: Sustained elevated 5xx errors over a long window. - Actions: - 1. Review error budget remaining metric on Grafana dashboard - 2. Identify top failing routes and error classes - 3. Correlate with deployments and infra incidents - 4. Open reliability ticket and assign owner - 5. Plan mitigations before entering critical burn threshold - - # p99 latency SLO breach - 2 s threshold (SLO 2) - - alert: FieldTrackLatencyP99High - expr: | - histogram_quantile( - 0.99, - sum(rate(http_request_duration_seconds_bucket{job=~"fieldtrack-api.*"}[10m])) by (le) - ) > 2 - for: 10m - labels: - severity: warning - annotations: - summary: "API p99 latency above 2 s (SLO 2 breach)" - description: >- - The 99th-percentile API response time has been above 2 seconds for 10 - minutes. This breaches the p99 latency SLO defined in docs/SLO.md. - Check slow queries, worker queue depths, and DB connection pool saturation. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" - runbook: | - Cause: Tail-latency degradation affecting a subset of requests. - Actions: - 1. Inspect p99 panel and compare with p95 for tail amplification - 2. Review very_slow_response logs for route-level concentration - 3. Check DB wait events and queue backlog growth - 4. Reduce load or scale services if saturation detected - 5. Roll back if latency regression tracks a release \ No newline at end of file diff --git a/infra/prometheus/prometheus.yml b/infra/prometheus/prometheus.yml deleted file mode 100644 index 4ce26f2..0000000 --- a/infra/prometheus/prometheus.yml +++ /dev/null @@ -1,101 +0,0 @@ -# FieldTrack API \u2014 Prometheus Configuration -# -# All containers share api_network, so we scrape by Docker service name. -# -# Scrape targets: -# 1. api-blue (port 3000 inside container, accessible as api-blue:3000) -# 2. 
api-green (port 3000 inside container, accessible as api-green:3000) -# 3. node-exporter (port 9100, accessible as node-exporter:9100) -# 4. prometheus (self-monitoring) -# -# NOTE: Both blue and green are listed. Only the active container will be -# running at any time. Prometheus will mark the stopped one as DOWN — this -# is expected and harmless. - -global: - scrape_interval: 15s - evaluation_interval: 15s - scrape_timeout: 10s -rule_files: - - alerts.yml - -# Route fired alerts to Alertmanager for delivery (email, etc.). -# Alertmanager runs on the shared Docker network at alertmanager:9093. -alerting: - alertmanagers: - - static_configs: - - targets: - - "alertmanager:9093" - timeout: 10s - -scrape_configs: - # ── Fastify Backend (Blue) ────────────────────────────────────────────────── - - job_name: "api-blue" - scrape_protocols: [OpenMetricsText1.0.0, PrometheusText0.0.4] - metrics_path: /metrics - scrape_interval: 15s - authorization: - credentials: ${METRICS_SCRAPE_TOKEN} - static_configs: - - targets: - - "api-blue:3000" - labels: - app: "fieldtrack" - component: "backend" - service: "fieldtrack-api" - slot: "blue" - - # ── Fastify Backend (Green) ───────────────────────────────────────────────── - - job_name: "api-green" - scrape_protocols: [OpenMetricsText1.0.0, PrometheusText0.0.4] - metrics_path: /metrics - scrape_interval: 15s - authorization: - credentials: ${METRICS_SCRAPE_TOKEN} - static_configs: - - targets: - - "api-green:3000" - labels: - app: "fieldtrack" - component: "backend" - service: "fieldtrack-api" - slot: "green" - - # ── Node Exporter ─────────────────────────────────────────────────────────── - - job_name: "node-exporter" - scrape_interval: 30s - static_configs: - - targets: - - "node-exporter:9100" - labels: - app: "fieldtrack" - component: "host" - - # ── Prometheus self-monitoring ────────────────────────────────────────────── - - job_name: "prometheus" - scrape_interval: 60s - static_configs: - - targets: - - "localhost:9090" - 
- # ── Blackbox: readiness probe + TLS certificate monitoring ────────────────── - # probe_success{job="fieldtrack-readiness"} → used by ReadinessCheckFailing alert. - # probe_ssl_earliest_cert_expiry → used by TLSCertExpiringSoon/TLSCertExpired alerts. - - job_name: "fieldtrack-readiness" - metrics_path: /probe - params: - module: [http_2xx] - scrape_interval: 60s - static_configs: - - targets: - - "https://${API_HOSTNAME}/ready" - labels: - app: "fieldtrack" - component: "tls" - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - target_label: __address__ - replacement: "blackbox:9115" - - source_labels: [__param_target] - target_label: instance diff --git a/infra/promtail/promtail.yml b/infra/promtail/promtail.yml deleted file mode 100644 index 415d550..0000000 --- a/infra/promtail/promtail.yml +++ /dev/null @@ -1,62 +0,0 @@ -server: - http_listen_port: 9080 - grpc_listen_port: 0 - -positions: - # Persisted to the promtail_data volume — survives container restarts. - # Must NOT use /tmp (ephemeral) so log offsets are never lost. 
- filename: /data/positions.yaml - -clients: - - url: http://loki:3100/loki/api/v1/push - -scrape_configs: - - - job_name: docker-containers - - static_configs: - - targets: - - localhost - labels: - job: docker - __path__: /var/lib/docker/containers/*/*-json.log - - pipeline_stages: - - # Docker JSON log parsing - - docker: {} - - # Extract container id from file path - - regex: - expression: '/var/lib/docker/containers/(?P[a-f0-9]+)/.*' - - # Attach container id as label - - labels: - container_id: - - # Extract trace_id if present - - regex: - expression: 'trace_id":"(?P[a-f0-9]+)' - - - labels: - trace_id: - - # Parse JSON logs from Pino - - json: - expressions: - level: level - msg: msg - trace_id: trace_id - span_id: span_id - - - labels: - level: - - - job_name: syslog - - static_configs: - - targets: - - localhost - labels: - job: syslog - __path__: /var/log/*.log \ No newline at end of file diff --git a/infra/scripts/render-alertmanager.sh b/infra/scripts/render-alertmanager.sh deleted file mode 100644 index 692d54f..0000000 --- a/infra/scripts/render-alertmanager.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# infra/scripts/render-alertmanager.sh -# -# Renders infra/alertmanager/alertmanager.yml (template) into -# infra/alertmanager/alertmanager.rendered.yml by substituting -# ${ALERTMANAGER_SLACK_WEBHOOK} from infra/.env.monitoring. -# -# MUST be run before `docker compose up` for the monitoring stack. -# Alertmanager does NOT support environment variables natively — rendering -# the config before container start is the only safe approach. 
-# -# Usage (from any directory): -# bash infra/scripts/render-alertmanager.sh -# -# Exit codes: -# 0 — rendered file written successfully -# 1 — validation or rendering failure -# ============================================================================= -set -euo pipefail - -# --------------------------------------------------------------------------- -# Resolve absolute paths relative to this script's location. -# This makes the script safe to call from any working directory. -# --------------------------------------------------------------------------- -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -INFRA_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)" - -ENV_FILE="${INFRA_DIR}/.env.monitoring" -TEMPLATE_FILE="${INFRA_DIR}/alertmanager/alertmanager.yml" -OUTPUT_FILE="${INFRA_DIR}/alertmanager/alertmanager.rendered.yml" - -log_info() { printf '[render-alertmanager] INFO %s\n' "$*" >&2; } -log_error() { printf '[render-alertmanager] ERROR %s\n' "$*" >&2; } - -# --------------------------------------------------------------------------- -# Pre-flight: ensure required tools exist -# --------------------------------------------------------------------------- -if ! command -v envsubst &>/dev/null; then - log_error "envsubst not found. Install gettext (apt install gettext / yum install gettext)." - exit 1 -fi - -# --------------------------------------------------------------------------- -# Validate env file -# --------------------------------------------------------------------------- -if [ ! -f "${ENV_FILE}" ]; then - log_error "Env file not found: ${ENV_FILE}" - log_error "This file must exist on the VPS and must NOT be committed to the repo." - exit 1 -fi - -# Load env file via `source` under `set -a` so every assignment is exported. -# This correctly handles values containing special characters (e.g. https://). -# DO NOT replace this with `export $(grep ... 
| xargs)` — xargs splits on -# whitespace and breaks URLs, quoted strings, and any value with spaces. -set -a -# shellcheck source=/dev/null -source "${ENV_FILE}" -set +a - -# Warn loudly if stale / removed variables are still present in the env file. -# FRONTEND_DOMAIN was removed from the env contract — its presence here is a -# sign the file is out of date and should be cleaned up on the VPS. -if [ -n "${FRONTEND_DOMAIN:-}" ]; then - log_error "FRONTEND_DOMAIN is set in ${ENV_FILE} but is no longer part of the env contract." - log_error "Remove that line from .env.monitoring on the VPS, then re-run this script." - exit 1 -fi - -# --------------------------------------------------------------------------- -# Validate ALERTMANAGER_SLACK_WEBHOOK -# --------------------------------------------------------------------------- -if [ -z "${ALERTMANAGER_SLACK_WEBHOOK:-}" ]; then - log_error "ALERTMANAGER_SLACK_WEBHOOK is not set or empty in ${ENV_FILE}." - exit 1 -fi - -case "${ALERTMANAGER_SLACK_WEBHOOK}" in - https://hooks.slack.com/*) - : # valid prefix - ;; - *) - log_error "ALERTMANAGER_SLACK_WEBHOOK does not start with 'https://hooks.slack.com/'." - log_error "Value prefix: ***masked*** (redacted to prevent webhook exposure in logs)" - exit 1 - ;; -esac - -# --------------------------------------------------------------------------- -# Validate template file -# --------------------------------------------------------------------------- -if [ ! -f "${TEMPLATE_FILE}" ]; then - log_error "Template file not found: ${TEMPLATE_FILE}" - exit 1 -fi - -if ! grep -qF '${ALERTMANAGER_SLACK_WEBHOOK}' "${TEMPLATE_FILE}"; then - log_error "Template file does not contain '\${ALERTMANAGER_SLACK_WEBHOOK}' placeholder." - log_error "Check that ${TEMPLATE_FILE} is the correct template." 
- exit 1 -fi - -# --------------------------------------------------------------------------- -# Render: substitute ONLY ALERTMANAGER_SLACK_WEBHOOK (avoid clobbering any -# other ${...} placeholders that Alertmanager Go template syntax might use). -# --------------------------------------------------------------------------- -log_info "Rendering ${TEMPLATE_FILE} -> ${OUTPUT_FILE}" - -envsubst '${ALERTMANAGER_SLACK_WEBHOOK}' \ - < "${TEMPLATE_FILE}" \ - > "${OUTPUT_FILE}" - -# --------------------------------------------------------------------------- -# Post-render sanity check: no unsubstituted placeholder must remain -# --------------------------------------------------------------------------- -if grep -qF '${ALERTMANAGER_SLACK_WEBHOOK}' "${OUTPUT_FILE}"; then - log_error "Rendered file still contains the unsubstituted placeholder. Aborting." - rm -f "${OUTPUT_FILE}" - exit 1 -fi - -# Verify the rendered URL looks real (not a placeholder stub) -if grep -qF 'YOUR/WEBHOOK/URL' "${OUTPUT_FILE}"; then - log_error "Rendered file contains placeholder stub URL. Check your .env.monitoring." - rm -f "${OUTPUT_FILE}" - exit 1 -fi - -# Print a redacted preview so operators can confirm the URL was injected. -WEBHOOK_PREVIEW=$(grep 'api_url' "${OUTPUT_FILE}" | head -1 | sed 's|\(https://hooks.slack.com/services/[^/]*/[^/]*/\).*|\1***|') -log_info "Webhook preview (redacted): ${WEBHOOK_PREVIEW}" -log_info "Success. Rendered file: ${OUTPUT_FILE}" diff --git a/infra/scripts/verify-alertmanager.sh b/infra/scripts/verify-alertmanager.sh deleted file mode 100644 index efd472c..0000000 --- a/infra/scripts/verify-alertmanager.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# infra/scripts/verify-alertmanager.sh -# -# Verifies that the Alertmanager integration is healthy and that alert routing -# works end-to-end. 
-# -# Usage: -# cd /path/to/fieldtrack/infra -# bash scripts/verify-alertmanager.sh -# -# Requirements: -# - Docker Compose monitoring stack must be running -# - curl, jq must be available in PATH -# - ALERTMANAGER_URL defaults to http://localhost:9093 (exposed by docker-compose) -# -# Exit codes: -# 0 — all checks passed -# 1 — one or more checks failed - -set -euo pipefail - -ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://localhost:9093}" -PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:9090}" -PASS=0 -FAIL=0 - -# ── Helper functions ────────────────────────────────────────────────────────── - -log_pass() { echo "[PASS] $*"; PASS=$((PASS + 1)); } -log_fail() { echo "[FAIL] $*"; FAIL=$((FAIL + 1)); } -log_info() { echo "[INFO] $*"; } - -require_cmd() { - if ! command -v "$1" &>/dev/null; then - echo "[ERROR] Required command '$1' not found. Install it and retry." - exit 1 - fi -} - -# ── Pre-flight ──────────────────────────────────────────────────────────────── - -require_cmd curl -require_cmd jq - -# ── Step 1: Alertmanager health check ───────────────────────────────────────── - -log_info "Checking Alertmanager health at ${ALERTMANAGER_URL}/-/healthy" - -HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \ - --max-time 10 \ - "${ALERTMANAGER_URL}/-/healthy" || echo "000") - -if [ "$HTTP_STATUS" = "200" ]; then - log_pass "Alertmanager is healthy (HTTP 200)" -else - log_fail "Alertmanager health check returned HTTP ${HTTP_STATUS} (expected 200)" -fi - -# ── Step 2: Alertmanager ready check ────────────────────────────────────────── - -log_info "Checking Alertmanager ready state at ${ALERTMANAGER_URL}/-/ready" - -READY_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \ - --max-time 10 \ - "${ALERTMANAGER_URL}/-/ready" || echo "000") - -if [ "$READY_STATUS" = "200" ]; then - log_pass "Alertmanager is ready (HTTP 200)" -else - log_fail "Alertmanager ready check returned HTTP ${READY_STATUS} (expected 200)" -fi - -# ── Step 3: Alertmanager API — list current 
alerts ──────────────────────────── - -log_info "Fetching current alerts from Alertmanager API" - -ALERTS_RESPONSE=$(curl -s --max-time 10 \ - "${ALERTMANAGER_URL}/api/v2/alerts" \ - -H "Accept: application/json" || echo "") - -if echo "$ALERTS_RESPONSE" | jq empty 2>/dev/null; then - ALERT_COUNT=$(echo "$ALERTS_RESPONSE" | jq 'length') - log_pass "Alertmanager API responded with valid JSON (${ALERT_COUNT} active alerts)" -else - log_fail "Alertmanager API did not return valid JSON" -fi - -# ── Step 4: Prometheus → Alertmanager connection ────────────────────────────── - -log_info "Checking Prometheus alertmanager targets at ${PROMETHEUS_URL}/api/v1/alertmanagers" - -PROM_AM=$(curl -s --max-time 10 \ - "${PROMETHEUS_URL}/api/v1/alertmanagers" || echo "") - -if echo "$PROM_AM" | jq -e '.data.activeAlertmanagers | length > 0' &>/dev/null; then - ACTIVE=$(echo "$PROM_AM" | jq -r '.data.activeAlertmanagers[0].url // "unknown"') - log_pass "Prometheus is connected to Alertmanager at ${ACTIVE}" -else - log_fail "Prometheus has no active Alertmanager targets — check prometheus.yml alerting block" -fi - -# ── Step 5: Fire a test alert and verify it appears ─────────────────────────── - -log_info "Sending test alert to Alertmanager" - -TEST_ALERT_PAYLOAD=$(cat <<'EOF' -[{ - "labels": { - "alertname": "ApiAlertmanagerVerification", - "severity": "warning", - "job": "fieldtrack-api" - }, - "annotations": { - "summary": "Alertmanager verification test — safe to ignore", - "description": "This alert was fired by verify-alertmanager.sh to confirm end-to-end routing. It will auto-resolve in 5 minutes." 
- }, - "startsAt": "'"$(date -u +"%Y-%m-%dT%H:%M:%SZ")"'", - "endsAt": "'"$(date -u -d "+5 minutes" +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -v+5M +"%Y-%m-%dT%H:%M:%SZ")"'" -}] -EOF -) - -POST_STATUS=$(curl -s -o /tmp/am_post_response.txt -w "%{http_code}" \ - --max-time 10 \ - -X POST "${ALERTMANAGER_URL}/api/v2/alerts" \ - -H "Content-Type: application/json" \ - --data "$TEST_ALERT_PAYLOAD" || echo "000") - -if [ "$POST_STATUS" = "200" ]; then - log_pass "Test alert accepted by Alertmanager (HTTP 200)" -else - log_fail "Alertmanager rejected test alert (HTTP ${POST_STATUS})" - cat /tmp/am_post_response.txt 2>/dev/null || true -fi - -# ── Step 6: Confirm test alert is visible in active alerts ──────────────────── - -log_info "Waiting 2 seconds for alert to be indexed..." -sleep 2 - -ACTIVE_ALERTS=$(curl -s --max-time 10 \ - "${ALERTMANAGER_URL}/api/v2/alerts?filter=alertname%3DApiAlertmanagerVerification" \ - -H "Accept: application/json" || echo "[]") - -if echo "$ACTIVE_ALERTS" | jq -e 'length > 0' &>/dev/null; then - log_pass "Test alert is visible in Alertmanager active alerts list" -else - log_fail "Test alert not found in active alerts — check Alertmanager configuration" -fi - -# ── Step 7: Verify Prometheus rule files load without errors ────────────────── - -log_info "Checking Prometheus rule files are loaded correctly" - -RULES_RESPONSE=$(curl -s --max-time 10 \ - "${PROMETHEUS_URL}/api/v1/rules" || echo "") - -if echo "$RULES_RESPONSE" | jq -e '.data.groups | length > 0' &>/dev/null; then - GROUP_COUNT=$(echo "$RULES_RESPONSE" | jq '.data.groups | length') - log_pass "Prometheus loaded ${GROUP_COUNT} rule group(s) from alerts.yml" -else - log_fail "No rule groups found in Prometheus — check alerts.yml path in prometheus.yml" -fi - -# ── Summary ─────────────────────────────────────────────────────────────────── - -echo "" -echo "─────────────────────────────────────" -echo " Alertmanager Verification Summary" -echo 
"─────────────────────────────────────" -echo " PASS: ${PASS}" -echo " FAIL: ${FAIL}" -echo "─────────────────────────────────────" -echo "" - -if [ "$FAIL" -gt 0 ]; then - echo "One or more checks failed. Review the output above." - echo "" - echo "Common fixes:" - echo " • Not running? Start with:" - echo " docker compose -f infra/docker-compose.monitoring.yml up -d alertmanager prometheus" - echo " • Slack webhook missing? Add to infra/.env.monitoring:" - echo " ALERTMANAGER_SLACK_WEBHOOK" - echo " • Prometheus can't reach Alertmanager? Verify they share api_network." - exit 1 -fi - -echo "All checks passed. Alertmanager is operational." -echo "" -echo "NOTE: The test alert 'ApiAlertmanagerVerification' will auto-resolve in 5 minutes." -echo " You can silence it early via: ${ALERTMANAGER_URL}/#/silences" -exit 0 diff --git a/infra/tempo/tempo.yml b/infra/tempo/tempo.yml deleted file mode 100644 index 8c2b943..0000000 --- a/infra/tempo/tempo.yml +++ /dev/null @@ -1,43 +0,0 @@ -server: - http_listen_port: 3200 - -distributor: - receivers: - otlp: - protocols: - http: - grpc: - -ingester: - trace_idle_period: 10s - max_block_bytes: 1_000_000 - max_block_duration: 5m - -compactor: - compaction: - block_retention: 24h - -storage: - trace: - backend: local - local: - path: /var/tempo/traces - -metrics_generator: - storage: - path: /var/tempo/generator - remote_write: - - url: http://prometheus:9090/api/v1/write - send_exemplars: true - processor: - service_graphs: - wait: 10s - max_items: 10000 - span_metrics: - -overrides: - defaults: - metrics_generator: - processors: - - service-graphs - - span-metrics \ No newline at end of file diff --git a/package.json b/package.json index 1c87358..7fb4f99 100644 --- a/package.json +++ b/package.json @@ -13,8 +13,7 @@ "lint": "eslint src/modules --ext .ts", "start": "node dist/server.js", "test": "vitest run", - "test:watch": "vitest", - "analytics:backfill": "tsx scripts/analytics-backfill.ts" + "test:watch": "vitest" }, 
"keywords": [], "author": "", diff --git a/scripts/analytics-backfill.ts b/scripts/analytics-backfill.ts deleted file mode 100644 index 84260c8..0000000 --- a/scripts/analytics-backfill.ts +++ /dev/null @@ -1,242 +0,0 @@ -/** - * analytics-backfill.ts — Phase 21 backfill script. - * - * Scans historical attendance_sessions and populates employee_daily_metrics - * and org_daily_metrics for any dates that have missing or incomplete rows. - * - * Usage: - * npm run analytics:backfill - * - * The script is additive and idempotent: running it multiple times produces - * the same result. Existing rows are updated via UPSERT (SET, not increment), - * so it is safe to re-run after data corrections. - * - * Processing: - * - Fetches all completed sessions (checkout_at IS NOT NULL AND - * total_distance_km IS NOT NULL) in batches of BATCH_SIZE. - * - Groups by (organization_id, employee_id, date). - * - UPSERTs employee_daily_metrics for each group. - * - UPSERTs org_daily_metrics by aggregating the just-written employee rows. - * - * Skips sessions where total_distance_km is NULL (distance worker not yet run). - */ - -import dotenv from "dotenv"; -dotenv.config(); - -import { createClient } from "@supabase/supabase-js"; - -// ─── Configuration ──────────────────────────────────────────────────────────── - -const SUPABASE_URL = process.env["SUPABASE_URL"]; -const SUPABASE_SERVICE_ROLE_KEY = process.env["SUPABASE_SERVICE_ROLE_KEY"]; - -if (!SUPABASE_URL || !SUPABASE_SERVICE_ROLE_KEY) { - console.error( - "ERROR: SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY must be set in environment", - ); - process.exit(1); -} - -const supabase = createClient(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY); - -/** Number of sessions fetched per paginated round-trip. */ -const BATCH_SIZE = 500; - -/** Pause between batches to avoid overwhelming the DB connection pool. 
*/ -const BATCH_DELAY_MS = 100; - -// ─── Types ──────────────────────────────────────────────────────────────────── - -interface SessionRow { - id: string; - employee_id: string; - organization_id: string; - checkin_at: string; - total_distance_km: number; - total_duration_seconds: number; -} - -interface DailyKey { - orgId: string; - empId: string; - date: string; -} - -interface DailyAggregate { - sessions: number; - distance_km: number; - duration_seconds: number; -} - -// ─── Helpers ────────────────────────────────────────────────────────────────── - -function sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)); -} - -/** - * Group session rows by (organization_id, employee_id, date) and accumulate - * totals. Returns a map keyed by `orgId|empId|date`. - */ -function groupByEmployeeDay( - sessions: SessionRow[], -): Map { - const map = new Map(); - - for (const s of sessions) { - const date = s.checkin_at.substring(0, 10); - const mapKey = `${s.organization_id}|${s.employee_id}|${date}`; - const existing = map.get(mapKey); - if (existing) { - existing.agg.sessions++; - existing.agg.distance_km += s.total_distance_km ?? 0; - existing.agg.duration_seconds += s.total_duration_seconds ?? 0; - } else { - map.set(mapKey, { - key: { orgId: s.organization_id, empId: s.employee_id, date }, - agg: { - sessions: 1, - distance_km: s.total_distance_km ?? 0, - duration_seconds: s.total_duration_seconds ?? 
0, - }, - }); - } - } - - return map; -} - -// ─── Backfill Logic ─────────────────────────────────────────────────────────── - -async function backfill(): Promise { - console.log("=== FieldTrack Analytics Backfill ==="); - console.log(`Batch size: ${BATCH_SIZE}`); - console.log("Fetching completed, distance-computed sessions...\n"); - - let page = 0; - let totalSessions = 0; - let totalEmployeeDays = 0; - let totalErrors = 0; - let hasMore = true; - - while (hasMore) { - const from = page * BATCH_SIZE; - const to = from + BATCH_SIZE - 1; - - const { data, error } = await supabase - .from("attendance_sessions") - .select( - "id, employee_id, organization_id, checkin_at, total_distance_km, total_duration_seconds", - ) - .not("checkout_at", "is", null) - .not("total_distance_km", "is", null) - .order("checkin_at", { ascending: true }) - .range(from, to); - - if (error) { - console.error(`Batch ${page + 1}: fetch error — ${error.message}`); - totalErrors++; - break; - } - - const batch = (data ?? 
[]) as SessionRow[]; - if (batch.length === 0) { - break; - } - - console.log( - `Batch ${page + 1}: processing ${batch.length} sessions (offset ${from})...`, - ); - - // ── Group sessions by (org, employee, date) ─────────────────────────────── - - const employeeDayMap = groupByEmployeeDay(batch); - totalSessions += batch.length; - totalEmployeeDays += employeeDayMap.size; - - // ── UPSERT employee_daily_metrics ───────────────────────────────────────── - - const empUpsertRows = [...employeeDayMap.values()].map(({ key, agg }) => ({ - organization_id: key.orgId, - employee_id: key.empId, - date: key.date, - sessions: agg.sessions, - distance_km: Math.round(agg.distance_km * 1000) / 1000, - duration_seconds: agg.duration_seconds, - })); - - const { error: empErr } = await supabase - .from("employee_daily_metrics") - .upsert(empUpsertRows, { onConflict: "employee_id,date" }); - - if (empErr) { - console.error(` employee_daily_metrics upsert failed: ${empErr.message}`); - totalErrors++; - } else { - console.log(` employee_daily_metrics: upserted ${empUpsertRows.length} rows`); - } - - // ── Compute org-level aggregates from the employee rows we just wrote ───── - - // Group the same batch by (org, date) - const orgDayMap = new Map(); - for (const { key, agg } of employeeDayMap.values()) { - const mapKey = `${key.orgId}|${key.date}`; - const existing = orgDayMap.get(mapKey); - if (existing) { - existing.agg.sessions += agg.sessions; - existing.agg.distance_km += agg.distance_km; - existing.agg.duration_seconds += agg.duration_seconds; - } else { - orgDayMap.set(mapKey, { - orgId: key.orgId, - date: key.date, - agg: { ...agg }, - }); - } - } - - const orgUpsertRows = [...orgDayMap.values()].map(({ orgId, date, agg }) => ({ - organization_id: orgId, - date, - total_sessions: agg.sessions, - total_distance_km: Math.round(agg.distance_km * 1000) / 1000, - total_duration_seconds: agg.duration_seconds, - })); - - const { error: orgErr } = await supabase - 
.from("org_daily_metrics") - .upsert(orgUpsertRows, { onConflict: "organization_id,date" }); - - if (orgErr) { - console.error(` org_daily_metrics upsert failed: ${orgErr.message}`); - totalErrors++; - } else { - console.log(` org_daily_metrics: upserted ${orgUpsertRows.length} rows`); - } - - if (batch.length < BATCH_SIZE) { - // Last page — no more rows - hasMore = false; - } else { - page++; - await sleep(BATCH_DELAY_MS); - } - } - - console.log("\n=== Backfill Complete ==="); - console.log(`Sessions processed : ${totalSessions}`); - console.log(`Employee-day rows : ${totalEmployeeDays}`); - console.log(`Errors : ${totalErrors}`); - - if (totalErrors > 0) { - console.error("Backfill completed with errors — check output above."); - process.exit(1); - } -} - -backfill().catch((err: unknown) => { - console.error("Backfill failed:", err instanceof Error ? err.message : String(err)); - process.exit(1); -}); diff --git a/scripts/deploy-bluegreen.sh b/scripts/deploy-bluegreen.sh deleted file mode 100644 index 8771110..0000000 --- a/scripts/deploy-bluegreen.sh +++ /dev/null @@ -1,1539 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# deploy-bluegreen.sh — API Blue-Green Deployment -# -# State machine: -# INIT -# -> PRE_FLIGHT (preflight.sh + env validation) -# -> PULL_IMAGE (with timeout guard) -# -> RESOLVE_SLOT (recovery-aware slot detection) -# -> IDEMPOTENCY (skip if same SHA already running) -# -> START_INACTIVE (with timeout + image immutability check) -# -> HEALTH_CHECK_INTERNAL (connectivity pre-check + readiness loop) -# -> SWITCH_NGINX (nginx -t gate + atomic slot write) -# -> HEALTH_CHECK_PUBLIC (DNS/TLS/CDN end-to-end) -# -> STABILITY_CHECK (post-switch re-verify after settle window) -# -> CLEANUP (graceful shutdown of old container) -# -> SUCCESS (truth check + last-known-good snapshot) -# -# Deployment classification states emitted via _ft_state: -# DEPLOY_SUCCESS -- zero-downtime deploy 
completed -# DEPLOY_FAILED_SAFE -- deploy failed, old container still healthy -# DEPLOY_FAILED_ROLLBACK -- deploy failed AND rollback was triggered -# DEPLOY_FAILED_FATAL -- deploy AND rollback both failed (manual needed) -# -# On failure: -# -> if active container still running -> DEPLOY_FAILED_SAFE exit 1 -# -> if active container gone -> rollback triggered -# -> rollback succeeded -> DEPLOY_FAILED_ROLLBACK exit 1 -# -> rollback failed -> DEPLOY_FAILED_FATAL exit 2 -# -# Slot state file: /var/run/api/active-slot -# /var/run is a tmpfs (cleared on reboot). The _ft_resolve_slot() recovery -# function handles a missing file by inspecting running containers and the -# live nginx config, then re-writing the file. No manual step needed after -# a reboot or unexpected /run eviction. -# -# Exit codes: -# 0 DEPLOY_SUCCESS -- zero-downtime deploy succeeded -# 1 DEPLOY_FAILED_SAFE -- deploy failed, old container still serving -# or DEPLOY_FAILED_ROLLBACK -- deploy failed, rollback succeeded -# 2 DEPLOY_FAILED_FATAL -- deploy AND rollback both failed (rare) -# 3 DEPLOY_FAILED_FATAL -- fatal guard (active container missing, race condition) -# -# Observability features: -# DEPLOY_ID -- unique deploy identifier for log correlation (YYYYMMDD_HHMMSS_PID) -# deploy_id label -- container labeled with deploy ID for instant traceability -# api.sha -- container labeled with image SHA for quick version lookup -# api.slot -- container labeled with slot name (blue/green) -# duration_sec -- all exits logged with deploy duration for performance tracking -# PREFLIGHT_STRICT -- optional strict mode: enforces preflight checks, fails if missing -# -# ============================================================================= -set -euo pipefail -# Enable explicit debugging when DEBUG=true, otherwise suppress xtrace -if [ "${DEBUG:-false}" = "true" ]; then - set -x -fi -trap '_ft_trap_err "$LINENO"' ERR - -# --------------------------------------------------------------------------- -# 
STRUCTURED LOGGING [DEPLOY] ts= state= -# ALL logging writes to stderr (>&2) so that functions returning values via -# stdout are never contaminated. stdout = data only; stderr = logs. -# { set +x; } 2>/dev/null suppresses xtrace noise inside helpers. -# --------------------------------------------------------------------------- -_FT_STATE="INIT" -DEPLOY_LOG_FILE="${DEPLOY_LOG_FILE:-/var/log/api/deploy.log}" - -# Ensure log directory exists with fallback to home directory -LOG_DIR="$(dirname "$DEPLOY_LOG_FILE")" -if ! mkdir -p "$LOG_DIR" 2>/dev/null; then - LOG_DIR="$HOME/api/logs" - DEPLOY_LOG_FILE="$LOG_DIR/deploy.log" - mkdir -p "$LOG_DIR" -fi - -_ft_log() { - { set +x; } 2>/dev/null - local log_entry - log_entry=$(printf '[DEPLOY] deploy_id=%s ts=%s state=%s %s' "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*") - printf '%s\n' "$log_entry" | tee -a "$DEPLOY_LOG_FILE" >&2 - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -_ft_state() { - { set +x; } 2>/dev/null - _FT_STATE="$1"; shift - printf '[DEPLOY] deploy_id=%s ts=%s state=%s %s\n' "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*" >&2 - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -_ft_trap_err() { - { set +x; } 2>/dev/null - printf '[ERROR] deploy_id=%s ts=%s state=%s msg="unexpected failure at line %s"\n' \ - "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$1" >&2 - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -# --------------------------------------------------------------------------- -# ERROR HELPER -- [ERROR]-prefixed log for failure paths -# --------------------------------------------------------------------------- -_ft_error() { - { set +x; } 2>/dev/null - local log_entry - log_entry=$(printf '[ERROR] deploy_id=%s ts=%s state=%s %s' "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*") - printf '%s\n' "$log_entry" | tee -a "$DEPLOY_LOG_FILE" >&2 - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -# 
--------------------------------------------------------------------------- -# PHASE TIMING HELPER -- wrap phases to measure wall-clock duration -# Usage: -# _ft_phase_start "PHASE_NAME" -# ... phase work ... -# _ft_phase_end "PHASE_NAME" -# --------------------------------------------------------------------------- -_ft_phase_start() { - eval "_${1}_START=\$(date +%s)" -} - -_ft_phase_end() { - local phase="$1" - local start_var="_${phase}_START" - local start_ts=${!start_var:-0} - if [ "$start_ts" -gt 0 ]; then - local duration=$(($(date +%s) - start_ts)) - _ft_log "msg='phase_complete' phase=$phase duration_sec=$duration" - fi -} - -# --------------------------------------------------------------------------- -# GITHUB ACTIONS SUMMARY -- writes deployment summary to Actions UI -# Called at end of deploy (success or failure) -# --------------------------------------------------------------------------- -_ft_github_summary() { - local status="$1" - local container="${2:-unknown}" - local image="${3:-unknown}" - local reason="${4:-}" - - if [ -z "$GITHUB_STEP_SUMMARY" ]; then - return 0 # Not running in GitHub Actions - fi - - { - echo "### 🚀 Deployment Summary" - echo "" - echo "| Field | Value |" - echo "|-------|-------|" - echo "| Status | **$status** |" - echo "| Deploy ID | \`$DEPLOY_ID\` |" - echo "| Duration | $(($(date +%s) - START_TS))s |" - echo "| Active Container | \`$container\` |" - echo "| Image SHA | \`${image:0:12}...\` |" - if [ -n "$reason" ]; then - echo "| Reason | $reason |" - fi - echo "| Timestamp | $(date -u +'%Y-%m-%d %H:%M:%S UTC') |" - } >> "$GITHUB_STEP_SUMMARY" -} - -# --------------------------------------------------------------------------- -# FINAL SYSTEM STATE SNAPSHOT -- records ground truth on success -# --------------------------------------------------------------------------- -_ft_final_state() { - local active_container="$1" - local image_sha="$2" - local nginx_upstream - nginx_upstream=$(grep -oE 
'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo 'unknown') - _ft_log "msg='final_state' deploy_id=$DEPLOY_ID active=$active_container sha=${image_sha:0:12} nginx_upstream=$nginx_upstream" -} - -# --------------------------------------------------------------------------- -# DOCKER HEALTH GATE -# Waits for the container's HEALTHCHECK to reach "healthy" before allowing -# nginx to switch. If the container has no HEALTHCHECK defined, this returns -# immediately (status="none") to avoid blocking on unconfigured containers. -# --------------------------------------------------------------------------- -_ft_wait_docker_health() { - local name="$1" - local i=1 - local STATUS - while [ "$i" -le 30 ]; do - STATUS=$(docker inspect --format='{{.State.Health.Status}}' "$name" 2>/dev/null || echo "none") - if [ "$STATUS" = "healthy" ]; then - _ft_log "msg='docker health check passed' container=$name" - return 0 - fi - if [ "$STATUS" = "unhealthy" ]; then - _ft_error "msg='docker health check failed' container=$name status=unhealthy" - return 1 - fi - # "none" means the image has no HEALTHCHECK — skip gate (return 0 immediately) - if [ "$STATUS" = "none" ]; then - _ft_log "msg='docker health gate skipped (no HEALTHCHECK defined)' container=$name" - return 0 - fi - [ $(( i % 5 )) -eq 0 ] && _ft_log "msg='waiting for docker health' attempt=$i/30 status=$STATUS container=$name" - sleep 2 - i=$(( i + 1 )) - done - _ft_error "msg='docker health timeout' container=$name last_status=$STATUS" - return 1 -} - -# --------------------------------------------------------------------------- -# SYSTEM SNAPSHOT -- emitted on any unrecoverable failure -# --------------------------------------------------------------------------- -_ft_snapshot() { - { set +x; } 2>/dev/null - printf '[DEPLOY] -- SYSTEM SNAPSHOT ----------------------------------------\n' >&2 - printf '[DEPLOY] slot_file = %s\n' "$(cat 
"${ACTIVE_SLOT_FILE:-/var/run/api/active-slot}" 2>/dev/null || echo 'MISSING')" >&2 - printf '[DEPLOY] nginx_upstream = %s\n' "$(grep -oE 'http://(api-blue|api-green):3000' "${NGINX_CONF:-$HOME/api/infra/nginx/live/api.conf}" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo 'unreadable')" >&2 - printf '[DEPLOY] containers =\n' >&2 - docker ps --format '[DEPLOY] {{.Names}} -> {{.Status}} ({{.Ports}})' 1>&2 2>/dev/null \ - || printf '[DEPLOY] (docker ps unavailable)\n' >&2 - printf '[DEPLOY] -----------------------------------------------------------\n' >&2 - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -# --------------------------------------------------------------------------- -# DEPLOYMENT CLASSIFICATION -- single-source exit helper -# All terminal exit paths MUST go through _ft_exit to avoid state drift. -# -# _ft_exit [key=value ...] -# code 0 -> DEPLOY_SUCCESS -# code 1 -> DEPLOY_FAILED_SAFE | DEPLOY_FAILED_ROLLBACK -# code 2 -> DEPLOY_FAILED_FATAL -# -# DEPLOY_SUCCESS zero-downtime deploy completed -# DEPLOY_FAILED_SAFE deploy failed, old container still serving -# DEPLOY_FAILED_ROLLBACK deploy failed, rollback triggered (system restored) -# DEPLOY_FAILED_FATAL deploy AND rollback both failed (manual needed) -# --------------------------------------------------------------------------- -_ft_exit() { - local code="$1"; shift - local duration=$(( $(date +%s) - START_TS )) - _ft_state "$@" "duration_sec=$duration" - exit "$code" -} - -# Kept for compatibility; delegates to _ft_exit for a final classify+exit in one line. 
-_ft_classify() { - local outcome="$1"; shift - _ft_state "$outcome" "outcome=$outcome $*" -} - -# --------------------------------------------------------------------------- -# DEPLOYMENT TIMING & IDENTIFIERS -# --------------------------------------------------------------------------- -START_TS=$(date +%s) -DEPLOY_ID=$(date +%Y%m%d_%H%M%S)_$$ -PREFLIGHT_STRICT="${PREFLIGHT_STRICT:-false}" - -_ft_log "msg='deploy started' deploy_id=$DEPLOY_ID pid=$$ start_ts=$START_TS" -if [ "$PREFLIGHT_STRICT" = "true" ]; then - _ft_log "msg='PREFLIGHT_STRICT=true -- will enforce preflight checks'" -fi - -# --------------------------------------------------------------------------- -# CONSTANTS -# --------------------------------------------------------------------------- -# Immutable SHA tags ONLY — 'latest' is forbidden in production. -# Reject empty and 'latest' before any Docker operation so failures are -# loud and attributed to the caller rather than appearing as pull errors. -IMAGE_SHA="${1:-}" -if [ -z "$IMAGE_SHA" ] || [ "$IMAGE_SHA" = "latest" ]; then - printf '[DEPLOY] ts=%s state=INIT level=ERROR msg="image SHA required -- latest tag is forbidden in production" sha=%s\n' \ - "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "${IMAGE_SHA:-}" >&2 - exit 2 -fi -IMAGE="ghcr.io/fieldtrack-tech/api:$IMAGE_SHA" - -BLUE_NAME="api-blue" -GREEN_NAME="api-green" -APP_PORT=3000 -NETWORK="api_network" -# Pinned curl container for in-network health probes. -# Running on api_network exercises Docker DNS + bridge routing — the same -# path that nginx uses — catching connectivity issues that docker exec -# localhost bypasses (docker exec goes direct to the container loopback). -_FT_CURL_IMG="curlimages/curl:8.7.1" -# In-network curl helper with local fallback. -# -# PRIMARY CURL HELPERS — use docker run on api_network (reliable DNS + routing) -# -# Primary: short-lived curlimages/curl container on api_network. -# Exercises Docker DNS + bridge routing (same path nginx uses). 
-# Works with distroless containers (no curl binary available). -# -# Usage: _ft_net_curl -# The first argument is the container name — not used (kept for signature compat). -# Remaining arguments are passed verbatim to curl. -_ft_net_curl() { - local _target_container="$1"; shift - # Primary: in-network (Docker DNS + bridge routing) - docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" >/dev/null 2>&1 -} -# Variant that captures the response body or HTTP status code instead of -# just testing. Used where we need the response text for status checks. -# Usage: _ft_net_curl_out -_ft_net_curl_out() { - local _target_container="$1"; shift - local _out - _out=$(docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" 2>/dev/null) || _out="" - printf '%s' "$_out" -} - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" -[ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } -REPO_DIR="$DEPLOY_ROOT" - -# Slot state directory and file. -# /var/run/api/ is chosen over /tmp (world-writable, cleaned by tmpwatch) -# and $HOME (variable path, not auditable as runtime state). -# /var/run IS a tmpfs -- the _ft_resolve_slot() recovery handles missing files. -SLOT_DIR="/var/run/api" -ACTIVE_SLOT_FILE="$SLOT_DIR/active-slot" - -NGINX_CONF="$REPO_DIR/infra/nginx/live/api.conf" -NGINX_LIVE_DIR="$REPO_DIR/infra/nginx/live" -NGINX_BACKUP_DIR="$REPO_DIR/infra/nginx/backup" -NGINX_TEMPLATE="$REPO_DIR/infra/nginx/api.conf" -MAX_HISTORY=5 -MAX_HEALTH_ATTEMPTS=40 -HEALTH_INTERVAL=3 -LOCK_FILE="$SLOT_DIR/deploy.lock" -SNAP_DIR="$SLOT_DIR" -LAST_GOOD_FILE="$SNAP_DIR/last-good" - -_ft_ensure_log_dir() { - local log_dir - log_dir=$(dirname "$DEPLOY_LOG_FILE") - if [ ! 
-d "$log_dir" ]; then - mkdir -p "$log_dir" 2>/dev/null || sudo mkdir -p "$log_dir" || true - [ -d "$log_dir" ] && chmod 755 "$log_dir" 2>/dev/null || true - fi -} - -# --------------------------------------------------------------------------- -# DEPLOYMENT LOCK -- prevent concurrent deploys -# --------------------------------------------------------------------------- -_ft_acquire_lock() { - _ft_ensure_slot_dir - _ft_ensure_log_dir - _ft_log "msg='acquiring deployment lock' pid=$$ file=$LOCK_FILE" - exec 200>"$LOCK_FILE" - if ! flock -n 200; then - _ft_log "level=ERROR msg='another deployment already in progress -- aborting' pid=$$" - exit 1 - fi - _ft_log "msg='deployment lock acquired' pid=$$ file=$LOCK_FILE" - # Ensure lock is released on exit - trap '_ft_release_lock' EXIT -} - -_ft_release_lock() { - { set +x; } 2>/dev/null - printf '[DEPLOY] ts=%s state=%s msg="releasing deployment lock" pid=%s\n' \ - "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$$" >&2 - # Close FD 200 unconditionally; closing the FD releases the flock. - exec 200>&- 2>/dev/null || true - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -# --------------------------------------------------------------------------- -# EXTERNAL ENDPOINT CHECK WITH RETRY + BACKOFF -# Smooths transient CDN/TLS edge jitter while maintaining strict semantics -# -# NOTE: Uses localhost (127.0.0.1) with Host header instead of external hostname. -# Rationale: nginx is protected by Cloudflare IP allowlist. Requests from the -# VPS itself (not through Cloudflare) would be blocked with 403. Using localhost -# + Host header allows the deploy script to: -# - Validate full nginx routing stack (localhost → nginx → backend) -# - Bypass Cloudflare IP restriction safely -# - Use --insecure to accept self-signed/origin certs (nginx rewrite) -# Security: unchanged. Cloudflare still protects production access; only -# localhost requests (VPS-internal) bypass the IP filter. 
-# --------------------------------------------------------------------------- -_ft_check_external_ready() { - # -f: fail on 4xx/5xx so HTML error pages never match the grep - docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" -sfk --max-time 5 "https://nginx/health" 2>/dev/null \ - | grep -q '"status":"ok"' -} - -# --------------------------------------------------------------------------- -# RETRY CURL -- wraps curl -sf with retries + 1s backoff -# _ft_retry_curl [max_attempts=10] [extra curl flags...] -# Returns 0 on first 2xx success, 1 after all attempts exhausted. -# --------------------------------------------------------------------------- -_ft_retry_curl() { - { set +x; } 2>/dev/null - local url="$1" - local max="${2:-10}" - shift 2 || shift $# - local i=0 - while [ "$i" -lt "$max" ]; do - i=$((i + 1)) - if curl -sf --max-time 5 "$@" "$url" >/dev/null 2>&1; then - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi - return 0 - fi - sleep 1 - done - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi - return 1 -} - -# --------------------------------------------------------------------------- -# SILENT EXECUTION WRAPPER -# All inherently noisy commands (docker pull, docker compose, etc.) go through -# run(). Output is suppressed unless DEBUG=true. -# On failure: surfaces the command name and captured output to stderr so -# failures are never silently swallowed. -# --------------------------------------------------------------------------- -run() { - if [ "${DEBUG:-false}" = "true" ]; then - "$@" - else - local _run_out - if ! _run_out=$("$@" 2>&1); then - printf '[ERROR] Command failed: %s\n' "$*" >&2 - printf '%s\n' "$_run_out" >&2 - return 1 - fi - fi -} - -# Like run() but always forwards stderr so error messages are never swallowed. 
-run_show_err() { - if [ "${DEBUG:-false}" = "true" ]; then - "$@" - else - "$@" >/dev/null - fi -} - -# --------------------------------------------------------------------------- -# SLOT DIRECTORY AND FILE MANAGEMENT -# --------------------------------------------------------------------------- -_ft_ensure_slot_dir() { - if [ ! -d "$SLOT_DIR" ]; then - _ft_log "msg='slot dir missing, creating' path=$SLOT_DIR" - sudo mkdir -p "$SLOT_DIR" - # Owned by the deploy user so subsequent writes do not need sudo. - sudo chown "$(id -un):$(id -gn)" "$SLOT_DIR" - sudo chmod 750 "$SLOT_DIR" - fi -} - -# Single authoritative validator. Returns 0 for "blue"|"green", 1 otherwise. -# Logs to stderr on failure so every call site gets a structured error for free. -_ft_validate_slot() { - case "$1" in - blue|green) return 0 ;; - *) _ft_log "level=ERROR msg='invalid slot value' slot='${1:0:80}'" - return 1 ;; - esac -} - -_ft_write_slot() { - local slot="$1" - _ft_validate_slot "$slot" || return 1 - _ft_ensure_slot_dir - local slot_tmp - slot_tmp=$(mktemp "${SLOT_DIR}/active-slot.XXXXXX") - printf '%s\n' "$slot" > "$slot_tmp" - mv "$slot_tmp" "$ACTIVE_SLOT_FILE" - _ft_log "msg='slot file updated (atomic)' slot=$slot path=$ACTIVE_SLOT_FILE" -} - -# _ft_resolve_slot -- returns the active slot name, recovering from a missing -# or corrupt slot file by inspecting running containers and the live nginx config. -# -# Recovery precedence: -# 1. slot file value (happy path) -# 2. only blue running -> blue -# 3. only green running -> green -# 4. both running -> nginx upstream port as tiebreaker -# 5. neither running -> green (first deploy; inactive = blue) -_ft_resolve_slot() { - _ft_ensure_slot_dir - - # Happy path -- slot file exists and is valid. - if [ -f "$ACTIVE_SLOT_FILE" ]; then - local current_slot - current_slot=$(tr -d '[:space:]' < "$ACTIVE_SLOT_FILE") - # Guard: detect log contamination in the file (pre-fix corruption defense). 
- # A valid slot is ONLY the literal string "blue" or "green". - if [[ "$current_slot" == *DEPLOY* ]] || [[ "$current_slot" == *\[* ]]; then - _ft_log "level=WARN msg='slot file contains log contamination -- treating as corrupt, recovering' value=${current_slot:0:80}" - elif _ft_validate_slot "$current_slot"; then - _ft_log "msg='slot file read' slot=$current_slot" - echo "$current_slot" - return 0 - else - # _ft_validate_slot already logged the invalid value; fall through to recovery. - _ft_log "level=WARN msg='slot file invalid, falling through to container recovery'" - fi - else - _ft_log "level=WARN msg='slot file missing, recovering from container state' path=$ACTIVE_SLOT_FILE" - fi - - # Try to recover from last-known-good snapshot first - if [ -f "$LAST_GOOD_FILE" ]; then - local last_good_state - last_good_state=$(head -1 "$LAST_GOOD_FILE" 2>/dev/null | tr -d '[:space:]') - if _ft_validate_slot "$last_good_state" 2>/dev/null; then - _ft_log "msg='recovered slot from last-known-good snapshot' slot=$last_good_state file=$LAST_GOOD_FILE" - echo "$last_good_state" - return 0 - fi - fi - - # Recovery -- infer from running containers, then nginx config. - local blue_running=false green_running=false recovered_slot="" - docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${BLUE_NAME}$" && blue_running=true || true - docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${GREEN_NAME}$" && green_running=true || true - - if [ "$blue_running" = "true" ] && [ "$green_running" = "false" ]; then - recovered_slot="blue" - _ft_log "msg='recovery: only blue running' slot=blue" - elif [ "$green_running" = "true" ] && [ "$blue_running" = "false" ]; then - recovered_slot="green" - _ft_log "msg='recovery: only green running' slot=green" - elif [ "$blue_running" = "true" ] && [ "$green_running" = "true" ]; then - # Both running -- read nginx upstream container as authoritative tiebreaker. 
- local nginx_upstream - nginx_upstream=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") - if [ "$nginx_upstream" = "api-blue" ]; then recovered_slot="blue" - elif [ "$nginx_upstream" = "api-green" ]; then recovered_slot="green" - else - recovered_slot="blue" - _ft_log "level=WARN msg='both containers running and nginx upstream ambiguous, defaulting to blue' nginx_upstream=${nginx_upstream}" - fi - _ft_log "msg='recovery: both containers running, nginx tiebreaker' nginx_upstream=${nginx_upstream} slot=${recovered_slot}" - else - # Neither running -- first deploy. - recovered_slot="green" - _ft_log "msg='recovery: no containers running, assuming first deploy' slot=green" - fi - - # Validate before writing -- recovered_slot must be blue or green. - # (_ft_validate_slot logs the error; we just fail the subshell.) - _ft_validate_slot "$recovered_slot" || return 1 - - # Persist the recovered value (atomic write). - local slot_tmp - slot_tmp=$(mktemp "${SLOT_DIR}/active-slot.XXXXXX") - printf '%s\n' "$recovered_slot" > "$slot_tmp" - mv "$slot_tmp" "$ACTIVE_SLOT_FILE" - _ft_log "msg='slot file recreated (atomic)' slot=$recovered_slot" - echo "$recovered_slot" -} - -# --------------------------------------------------------------------------- -# ACQUIRE DEPLOYMENT LOCK -# --------------------------------------------------------------------------- -_ft_acquire_lock - -# --------------------------------------------------------------------------- -# PRE-FLIGHT: load environment + validate contract -# --------------------------------------------------------------------------- -_ft_state "PRE_FLIGHT" "msg='loading and validating environment'" - -# Log last-known-good state for faster triage -_LAST_GOOD=$(cat "$LAST_GOOD_FILE" 2>/dev/null || echo "none") -_ft_log "msg='startup recovery info' last_good=$_LAST_GOOD" - -# Disable xtrace while sourcing .env to prevent secrets in logs. 
-set +x -source "$SCRIPT_DIR/load-env.sh" -if [ "${DEBUG:-false}" = "true" ]; then set -x; fi - -# DEPLOY_ROOT is now exported by load-env.sh. -DEPLOY_HISTORY="$DEPLOY_ROOT/.deploy_history" - -_ft_log "msg='environment loaded' api_hostname=$API_HOSTNAME" - -set +x -"$SCRIPT_DIR/validate-env.sh" --check-monitoring -if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -# Harden monitoring env file permissions on every deploy (defense-in-depth). -chmod 600 "$DEPLOY_ROOT/infra/.env.monitoring" 2>/dev/null || true - -_ft_log "msg='env contract validated'" - -# Ensure api_network exists (idempotent). All containers MUST be on this network. -docker network create --driver bridge "$NETWORK" 2>/dev/null \ - && _ft_log "msg='api_network created'" \ - || _ft_log "msg='api_network already exists'" - -# GLOBAL PORT-LEAK GUARD -- api-blue/api-green MUST NOT bind host ports. -# All API traffic flows: Cloudflare → nginx (binds 80/443) → api_network. -# nginx is exempt; api containers with host ports bypass the nginx layer -# and would expose the API without TLS or rate-limiting. -_API_PORT_LEAKS=$(docker ps --format '{{.Names}} {{.Ports}}' 2>/dev/null \ - | grep -E '^api-(blue|green)' \ - | grep -E '(0\.0\.0\.0:|127\.0\.0\.1:)[0-9]+->') || true -if [ -n "${_API_PORT_LEAKS:-}" ]; then - _ft_log "level=ERROR msg='API container has host port bindings — forbidden. Remove and recreate without -p.' leaks=${_API_PORT_LEAKS}" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=api_port_leak_detected" -fi -unset _API_PORT_LEAKS -_ft_log "msg='port-leak guard passed — no API containers with host port bindings'" - -# NGINX CONTAINER GUARD -- nginx MUST run as a Docker container on api_network. -# With container-name upstreams (server api-blue:3000), Docker's embedded DNS -# (127.0.0.11) is required for name resolution. This only works from WITHIN -# Docker containers on the same network -- not from a host systemd nginx service. 
-# -# BOOTSTRAP MODE: If nginx is missing, start it via docker compose --no-deps so -# the monitoring dependency chain (nginx→grafana→prometheus→alertmanager) does -# NOT block a first-deploy. nginx starts immediately; monitoring catches up. -if ! docker inspect nginx >/dev/null 2>&1; then - _ft_log "msg='nginx container missing — bootstrapping via docker compose --no-deps'" - mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" - # Write a bootstrap config pointing at api-blue (default first-deploy slot) - # so nginx can start without waiting for an API container. - if [ ! -f "$NGINX_CONF" ]; then - # Permission check: ensure deploy user can write to nginx live dir - if [ ! -w "$(dirname "$NGINX_CONF")" ]; then - sudo chown -R "$(id -un):$(id -gn)" "$(dirname "$NGINX_CONF")" - fi - _NGINX_GUARD_TMP="$(mktemp /tmp/api-nginx-guard.XXXXXX.conf)" - sed \ - -e "s|__ACTIVE_CONTAINER__|api-blue|g" \ - -e "s|__API_HOSTNAME__|${API_HOSTNAME}|g" \ - "$NGINX_TEMPLATE" > "$_NGINX_GUARD_TMP" - mv "$_NGINX_GUARD_TMP" "$NGINX_CONF" - _ft_log "msg='bootstrap nginx config written (atomic)' target=api-blue path=$NGINX_CONF" - fi - # Kill any ghost docker-proxy holdind host ports before starting nginx - pkill docker-proxy 2>/dev/null || true - cd "$DEPLOY_ROOT/infra" - _COMPOSE_OUT=$(docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml \ - up -d --no-deps nginx 2>&1) || { - printf '%s\n' "$_COMPOSE_OUT" >&2 - _ft_log "level=ERROR msg='docker compose up --no-deps nginx failed'" - cd "$DEPLOY_ROOT" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_bootstrap_compose_failed" - } - unset _COMPOSE_OUT - cd "$DEPLOY_ROOT" - # Wait up to 30 s for the nginx container to become available - _NGINX_STARTED=false - for _ni in $(seq 1 10); do - if docker inspect nginx >/dev/null 2>&1; then - _ft_log "msg='nginx bootstrap complete' attempt=$_ni" - _NGINX_STARTED=true - break - fi - sleep 3 - done - if [ "$_NGINX_STARTED" != "true" ]; then - _ft_log "level=ERROR msg='nginx container 
failed to start after bootstrap'" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_bootstrap_timeout" - fi - unset _NGINX_STARTED _ni -fi -_NGINX_NETWORK=$(docker inspect nginx --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") -if ! echo "$_NGINX_NETWORK" | grep -q "$NETWORK"; then - _ft_log "level=ERROR msg='nginx container not on api_network -- container DNS will fail' networks=${_NGINX_NETWORK}" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_not_on_api_network networks=${_NGINX_NETWORK}" -fi -unset _NGINX_NETWORK -_ft_log "msg='nginx container guard passed' container=nginx network=$NETWORK" - -# Ensure nginx live and backup directories exist (deploy user owns them) -mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" - -# --------------------------------------------------------------------------- -# PREFLIGHT CHECK (policy=warn: missing preflight logs a warning, does not abort) -# --------------------------------------------------------------------------- -if [ "$PREFLIGHT_STRICT" = "true" ]; then - [ -x "$SCRIPT_DIR/preflight.sh" ] || _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=preflight_missing_strict_mode path=$SCRIPT_DIR/preflight.sh" - _ft_state "PREFLIGHT" "msg='running preflight checks (STRICT mode)'" - if ! "$SCRIPT_DIR/preflight.sh" 2>&1; then - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=preflight_failed_strict_mode" - fi - _ft_log "msg='preflight checks passed (strict mode)'" -elif [ -x "$SCRIPT_DIR/preflight.sh" ]; then - _ft_state "PREFLIGHT" "msg='running preflight checks'" - if ! 
"$SCRIPT_DIR/preflight.sh" 2>&1; then - _ft_log "level=ERROR msg='preflight checks failed -- aborting deploy'" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=preflight_failed" - fi - _ft_log "msg='preflight checks passed'" -else - _ft_log "level=WARN msg='preflight.sh not found or not executable -- continuing (policy=warn)' path=$SCRIPT_DIR/preflight.sh" -fi - -# --------------------------------------------------------------------------- -# DEPLOY METADATA -- structured log emitted once per deploy for observability -# --------------------------------------------------------------------------- -_ft_log "msg='deploy metadata' sha=$IMAGE_SHA image=$IMAGE script_dir=$SCRIPT_DIR repo_dir=$REPO_DIR app_env=${APP_ENV:-unset}" - -# --------------------------------------------------------------------------- -# [1/7] PULL IMAGE -# --------------------------------------------------------------------------- -_ft_state "PULL_IMAGE" "msg='pulling container image' sha=$IMAGE_SHA" -_ft_phase_start "PULL_IMAGE" - -# Explicit pull with hard error. -# Without this guard a missing image would cause docker run to attempt a -# background pull inside a 60-s timeout, racing the readiness loop. -if ! run timeout 120 docker pull "$IMAGE"; then - _ft_log "level=ERROR msg='image pull failed' image=$IMAGE" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=image_pull_failed image=$IMAGE" -fi -_ft_log "msg='image pulled' image=$IMAGE" -_ft_phase_end "PULL_IMAGE" - -# --------------------------------------------------------------------------- -# BOOTSTRAP GUARD -- no API containers exist (first deploy or full restart) -# -# When no api-blue or api-green containers are present, the normal slot -# recovery path works but is implicit. This guard makes first-deploy -# explicit: start api-blue directly, wait for readiness, write nginx config, -# write slot file, and exit cleanly with BOOTSTRAP_SUCCESS. 
-# -# WHY THIS IS NECESSARY: -# - nginx starts (via the guard above) with bootstrap config pointing at api-blue -# - Without this guard, nginx is serving 502 until the normal START_INACTIVE -# path eventually starts api-blue. This can be 30-60s of errors. -# - Explicit bootstrap gives a deterministic, logged, traceable first-deploy. -# -# SKIPPED when any api container already exists (normal redeploy path). -# --------------------------------------------------------------------------- -if ! docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then - _ft_state "BOOTSTRAP" "msg='no api containers found — first deploy, starting api-blue directly'" - - # Remove stale container if left in a stopped state somehow - docker rm -f api-blue 2>/dev/null || true - - _CID=$(timeout 60 docker run -d \ - --name api-blue \ - --network "$NETWORK" \ - --restart unless-stopped \ - --label "api.sha=$IMAGE_SHA" \ - --label "api.slot=blue" \ - --label "api.deploy_id=$DEPLOY_ID" \ - --env-file "$ENV_FILE" \ - "$IMAGE" 2>&1) || { - printf '%s\n' "$_CID" >&2 - _ft_error "msg='bootstrap: container start failed' name=api-blue" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=bootstrap_container_start_failed" - } - unset _CID - - _ft_log "msg='bootstrap: api-blue started' image=$IMAGE" - - # Grace window: give the process time to bind and initialise workers. - # /ready can lag the HTTP server bind by ~1–3 s while workers start. - sleep 2 - - # Bootstrap readiness: use docker run (works with distroless containers). 
- _BOOT_OK=false - for _bi in $(seq 1 20); do - if docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" -sf --max-time 4 "http://api-blue:${APP_PORT}/ready" >/dev/null 2>&1; then - _ft_log "msg='bootstrap: api-blue ready' attempt=$_bi" - _BOOT_OK=true - break - fi - [ $((_bi % 10)) -eq 0 ] && _ft_log "msg='bootstrap: still waiting for api-blue readiness' attempt=$_bi/20" - sleep 2 - done - - if [ "$_BOOT_OK" != "true" ]; then - _ft_log "level=ERROR msg='bootstrap: api-blue did not become ready after 60s — container PRESERVED for debugging'" - # DO NOT remove the container on bootstrap failure: - # - Preserves logs and state for post-mortem: docker logs api-blue - # - Removing here loses all debugging visibility - # - Operator can inspect and restart manually - docker logs api-blue --tail 50 >&2 || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=bootstrap_api_ready_timeout" - fi - unset _bi _BOOT_OK - - # Write nginx config pointing at api-blue (same sed logic as SWITCH_NGINX) - mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" - NGINX_BOOT_TMP="$(mktemp /tmp/api-nginx-boot.XXXXXX.conf)" - sed \ - -e "s|__ACTIVE_CONTAINER__|api-blue|g" \ - -e "s|__API_HOSTNAME__|${API_HOSTNAME}|g" \ - "$NGINX_TEMPLATE" > "$NGINX_BOOT_TMP" - cp "$NGINX_BOOT_TMP" "$NGINX_CONF" - rm -f "$NGINX_BOOT_TMP" - - # Nginx network attachment guard — must be on api_network before reload. - _NGINX_BOOT_NET=$(docker inspect nginx \ - --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") - if ! echo "$_NGINX_BOOT_NET" | grep -q "$NETWORK"; then - _ft_log "level=ERROR msg='bootstrap: nginx not attached to api_network' networks=${_NGINX_BOOT_NET}" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_network_mismatch_bootstrap" - fi - unset _NGINX_BOOT_NET - - # Fail-fast: any nginx test/reload failure is a hard error at bootstrap. 
- _NGINX_TEST_OUT=$(docker exec nginx nginx -t 2>&1) || { - printf '%s\n' "$_NGINX_TEST_OUT" >&2 - _ft_log "level=ERROR msg='bootstrap: nginx config test failed'" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_config_test_failed_bootstrap" - } - unset _NGINX_TEST_OUT - docker exec nginx nginx -s reload >/dev/null 2>&1 \ - || { _ft_log "level=ERROR msg='bootstrap: nginx reload failed'"; _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_reload_failed_bootstrap"; } - _ft_log "msg='bootstrap: nginx reloaded to api-blue'" - - # Persist slot state (atomic write already in _ft_write_slot) - _ft_write_slot "blue" - - # Snapshot last-known-good - _SNAP_BOOT_TMP=$(mktemp "${SNAP_DIR}/last-good.XXXXXX") - printf 'slot=blue container=api-blue ts=%s\n' "$(date -Iseconds)" > "$_SNAP_BOOT_TMP" - mv "$_SNAP_BOOT_TMP" "$LAST_GOOD_FILE" - unset _SNAP_BOOT_TMP - - _ft_exit 0 "BOOTSTRAP_SUCCESS" "slot=blue image=$IMAGE" -fi - -# --------------------------------------------------------------------------- -# [2/7] RESOLVE ACTIVE SLOT (with recovery) -# --------------------------------------------------------------------------- -_ft_state "RESOLVE_SLOT" "msg='determining active slot'" - -ACTIVE=$(_ft_resolve_slot) || { - _ft_log "level=ERROR msg='_ft_resolve_slot failed or exited non-zero -- cannot continue safely'" - exit 1 -} -ACTIVE=$(printf '%s' "$ACTIVE" | tr -d '[:space:]') -_ft_validate_slot "$ACTIVE" || exit 1 - -# SLOT REPAIR — heal slot file drift from reality. -# If the slot file says "green" but api-green is gone (OOM/manual removal), -# flip the effective slot to whatever container IS actually running. -# This prevents a deploy from treating a missing container as the "active" one. -if [ "$ACTIVE" = "green" ] && ! docker inspect api-green >/dev/null 2>&1; then - _ft_log "msg='slot repair: green missing — switching effective slot to blue' original_slot=green" - ACTIVE="blue" - _ft_write_slot "blue" -elif [ "$ACTIVE" = "blue" ] && ! 
docker inspect api-blue >/dev/null 2>&1; then - # Both containers may be missing on a clean restart; this is ok — the - # BOOTSTRAP GUARD above will catch it. Here we only switch when the - # opposite slot is actually running. - if docker inspect api-green >/dev/null 2>&1; then - _ft_log "msg='slot repair: blue missing but green running — switching effective slot to green' original_slot=blue" - ACTIVE="green" - _ft_write_slot "green" - else - _ft_log "level=WARN msg='slot repair: neither container running — first deploy or crash; slot kept as blue'" - fi -fi -_ft_validate_slot "$ACTIVE" || exit 1 - -if [ "$ACTIVE" = "blue" ]; then - ACTIVE_NAME=$BLUE_NAME - INACTIVE="green"; INACTIVE_NAME=$GREEN_NAME -else - ACTIVE_NAME=$GREEN_NAME - INACTIVE="blue"; INACTIVE_NAME=$BLUE_NAME -fi - -_ft_log "msg='slot resolved' active=$ACTIVE active_name=$ACTIVE_NAME inactive=$INACTIVE inactive_name=$INACTIVE_NAME" - -# --------------------------------------------------------------------------- -# ACTIVE CONTAINER EXISTENCE GUARD -# Protect against race: active slot file says "blue" but container doesn't exist. -# This catches crash/OOM scenarios before any deploy logic runs. -# --------------------------------------------------------------------------- -if docker ps -a --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then - if ! 
docker inspect "$ACTIVE_NAME" >/dev/null 2>&1; then - _ft_log "level=ERROR msg='active container listed by docker ps but inspect failed -- possible race' container=$ACTIVE_NAME" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=active_container_inspect_race container=$ACTIVE_NAME" - fi - _ft_log "msg='active container existence guard passed' container=$ACTIVE_NAME" -else - _ft_log "level=WARN msg='active container not running (first deploy or crash recovery)' container=$ACTIVE_NAME" -fi - -# --------------------------------------------------------------------------- -# IDEMPOTENCY GUARD -- skip deploy if this exact SHA is already the active container -# --------------------------------------------------------------------------- -_ft_state "IDEMPOTENCY" "msg='checking if target SHA already deployed' sha=$IMAGE_SHA" - -_RUNNING_IMAGE=$(docker inspect --format '{{.Config.Image}}' "$ACTIVE_NAME" 2>/dev/null || echo "") -if [ "$_RUNNING_IMAGE" = "$IMAGE" ]; then - # In-network health check: exercises Docker DNS + bridge routing. 
- _IDEMPOTENT_HEALTH=$(_ft_net_curl_out "$ACTIVE_NAME" \ - -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/ready") - if echo "$_IDEMPOTENT_HEALTH" | grep -q '"status":"ready"' 2>/dev/null; then - _ft_log "msg='target SHA already running and healthy -- nothing to do' container=$ACTIVE_NAME image=$IMAGE" - _ft_final_state "$ACTIVE_NAME" "$IMAGE_SHA" - _ft_github_summary "✅ IDEMPOTENT (no change)" "$ACTIVE_NAME" "$IMAGE_SHA" "SHA already deployed" - _ft_exit 0 "DEPLOY_SUCCESS" "reason=idempotent_noop sha=$IMAGE_SHA container=$ACTIVE_NAME" - else - _ft_log "msg='idempotent SHA match but active container not healthy -- proceeding with deploy' container=$ACTIVE_NAME" - fi - unset _IDEMPOTENT_HEALTH - else - _ft_log "msg='SHA differs from running image -- proceeding' running=${_RUNNING_IMAGE:-none} target=$IMAGE" - fi - unset _RUNNING_IMAGE - -# --------------------------------------------------------------------------- -# [3/7] START INACTIVE CONTAINER -# --------------------------------------------------------------------------- -_ft_state "START_INACTIVE" "msg='starting inactive container' name=$INACTIVE_NAME" - -if docker ps -a --format '{{.Names}}' | grep -Eq "^${INACTIVE_NAME}$"; then - _ft_log "msg='renaming stale container for audit trail' name=$INACTIVE_NAME" - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - # Rename instead of hard-rm so a post-mortem can inspect the old container - # state. The -old- suffix lets the zombie purge below collect it. 
- _STALE_TS=$(date +%s) - docker rename "$INACTIVE_NAME" "${INACTIVE_NAME}-old-${_STALE_TS}" 2>/dev/null \ - || docker rm "$INACTIVE_NAME" -fi - -_CID=$(timeout 60 docker run -d \ - --name "$INACTIVE_NAME" \ - --network "$NETWORK" \ - --restart unless-stopped \ - --label "api.sha=$IMAGE_SHA" \ - --label "api.slot=$INACTIVE" \ - --label "api.deploy_id=$DEPLOY_ID" \ - --env-file "$ENV_FILE" \ - "$IMAGE" 2>&1) || { - printf '%s\n' "$_CID" >&2 - _ft_error "msg='container start failed' name=$INACTIVE_NAME" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=container_start_failed name=$INACTIVE_NAME" -} -unset _CID - -_ft_log "msg='container started' name=$INACTIVE_NAME" - -# IMAGE IMMUTABILITY CHECK -- confirm running container image matches target SHA. -_ACTUAL_IMAGE=$(docker inspect --format '{{.Config.Image}}' "$INACTIVE_NAME" 2>/dev/null || echo "") -if [ "$_ACTUAL_IMAGE" != "$IMAGE" ]; then - _ft_log "level=ERROR msg='image immutability check failed: running image does not match target' expected=$IMAGE actual=${_ACTUAL_IMAGE:-unknown}" - docker logs "$INACTIVE_NAME" --tail 50 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=image_immutability_check_failed expected=$IMAGE actual=${_ACTUAL_IMAGE:-unknown}" -fi -_ft_log "msg='image immutability check passed' image=$_ACTUAL_IMAGE" -unset _ACTUAL_IMAGE -_ft_log "msg='phase_complete' state=START_INACTIVE status=success container=$INACTIVE_NAME" -# [4/7] INTERNAL HEALTH CHECK -# Uses /ready to validate Redis, Supabase, and BullMQ before traffic switch. 
-# --------------------------------------------------------------------------- -_ft_state "HEALTH_CHECK_INTERNAL" "msg='waiting for container readiness'" - -sleep 5 -HEALTH_ENDPOINT="/ready" - -# CONNECTIVITY PRE-CHECK (in-network) -# Probe /health via a short-lived curl container on api_network to verify: -# - Docker DNS resolution of $INACTIVE_NAME -# - Bridge routing to the container -# - HTTP server is bound and responding -# This exercises the same network path nginx uses, catching issues that -# docker exec localhost would silently skip. -_CONN_ATTEMPTS=0 -_CONN_OK=false -while [ "$_CONN_ATTEMPTS" -lt 5 ]; do - _CONN_ATTEMPTS=$((_CONN_ATTEMPTS + 1)) - if _ft_net_curl "$INACTIVE_NAME" \ - -sf --max-time 3 "http://$INACTIVE_NAME:$APP_PORT/health"; then - _CONN_OK=true - break - fi - sleep 2 -done -if [ "$_CONN_OK" = "false" ]; then - _ft_log "level=ERROR msg='container not reachable after connectivity pre-check' container=$INACTIVE_NAME" - docker logs "$INACTIVE_NAME" --tail 100 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_log "msg='active container still serving -- deploy failed non-destructively' container=$ACTIVE_NAME" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=container_not_reachable container=$INACTIVE_NAME" -fi -unset _CONN_ATTEMPTS _CONN_OK -_ft_log "msg='connectivity pre-check passed' container=$INACTIVE_NAME" - -ATTEMPT=0 -until true; do - ATTEMPT=$((ATTEMPT + 1)) - STATUS=$(_ft_net_curl_out "$INACTIVE_NAME" \ - --max-time 4 -s -o /dev/null -w "%{http_code}" \ - "http://$INACTIVE_NAME:$APP_PORT${HEALTH_ENDPOINT}" || echo "000") - - if [ "$STATUS" = "200" ]; then - _ft_log "msg='internal health check passed' endpoint=$HEALTH_ENDPOINT attempts=$ATTEMPT" - break - fi - - if ! 
docker ps --format '{{.Names}}' | grep -q "^${INACTIVE_NAME}$"; then - _ft_log "level=ERROR msg='container exited unexpectedly' name=$INACTIVE_NAME" - docker logs "$INACTIVE_NAME" --tail 100 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_log "msg='active container still serving -- deploy failed non-destructively' container=$ACTIVE_NAME" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=new_container_crashed" - fi - - if [ "$ATTEMPT" -ge "$MAX_HEALTH_ATTEMPTS" ]; then - _ft_log "level=ERROR msg='internal health check timed out' attempts=$ATTEMPT status=$STATUS endpoint=http://$INACTIVE_NAME:$APP_PORT${HEALTH_ENDPOINT}" - docker logs "$INACTIVE_NAME" --tail 100 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_log "msg='active container still serving -- deploy failed non-destructively' container=$ACTIVE_NAME" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=new_container_health_timeout attempts=$ATTEMPT" - fi - - # Only log progress every 10 attempts to avoid spamming; failure threshold logs always appear above - [ $((ATTEMPT % 10)) -eq 0 ] && _ft_log "msg='still waiting for readiness' attempt=$ATTEMPT/$MAX_HEALTH_ATTEMPTS status=$STATUS" - # Add up to 2s of jitter to prevent synchronized retries under contention. - sleep $((HEALTH_INTERVAL + RANDOM % 3)) -done - -_ft_log "msg='phase_complete' phase=HEALTH_CHECK_INTERNAL status=success container=$INACTIVE_NAME" -_ft_phase_end "HEALTH_CHECK_INTERNAL" - -# --------------------------------------------------------------------------- -# DOCKER HEALTH GATE -# Ensures the container's HEALTHCHECK has settled to "healthy" before -# switching nginx. Prevents routing to a container that is "starting". -# --------------------------------------------------------------------------- -if ! 
_ft_wait_docker_health "$INACTIVE_NAME"; then - docker logs "$INACTIVE_NAME" --tail 50 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=docker_health_failed container=$INACTIVE_NAME" -fi - -# STABILIZATION DELAY -- brief pause after docker health gate to let -# any in-flight connection setup settle (TLS session init, worker warm-up). -_ft_log "msg='stabilization delay' container=$INACTIVE_NAME" -sleep 3 - -# PRE-SWITCH CONNECTIVITY CHECK -# Direct in-network probe of the new container BEFORE touching nginx. -# Validates Docker DNS resolution + bridge routing work for the new container -# one final time with a clean, fresh curl invocation. -if ! docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \ - -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready" >/dev/null 2>&1; then - _ft_error "msg='pre-switch connectivity check failed' container=$INACTIVE_NAME" - docker logs "$INACTIVE_NAME" --tail 50 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=pre_switch_connectivity_failed container=$INACTIVE_NAME" -fi -_ft_log "msg='pre-switch connectivity check passed' container=$INACTIVE_NAME" -# --------------------------------------------------------------------------- -_ft_state "SWITCH_NGINX" "msg='switching nginx upstream' container=$INACTIVE_NAME" - -# Deterministic stabilization window: give the new container a moment before -# switching nginx (complements the jitter already in the health check loop). -sleep 2 - -# Backup stored in NGINX_BACKUP_DIR (under the repo) — consistent with the -# pruning logic below. Avoids creating files in /etc/nginx/ (host-side) -# which is not guaranteed to exist when nginx runs only inside Docker. 
-mkdir -p "$NGINX_BACKUP_DIR" -NGINX_BACKUP="$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" -NGINX_TMP="$(mktemp /tmp/api-nginx.XXXXXX.conf)" - -# PRE-RELOAD GATE (in-network with fallback): confirm container is still ready -# before pointing nginx at it. -if ! _ft_net_curl "$INACTIVE_NAME" \ - -sf --max-time 4 "http://$INACTIVE_NAME:$APP_PORT/ready"; then - _ft_log "level=ERROR msg='pre-reload gate failed: container not ready' container=$INACTIVE_NAME" - docker logs "$INACTIVE_NAME" --tail 50 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=pre_reload_gate_failed container=$INACTIVE_NAME" -fi -_ft_log "msg='pre-reload gate passed' container=$INACTIVE_NAME" - -sed \ - -e "s|__ACTIVE_CONTAINER__|$INACTIVE_NAME|g" \ - -e "s|__API_HOSTNAME__|$API_HOSTNAME|g" \ - "$NGINX_TEMPLATE" > "$NGINX_TMP" - -cp "$NGINX_CONF" "$NGINX_BACKUP" -cp "$NGINX_TMP" "$NGINX_CONF" -rm -f "$NGINX_TMP" -# Prune old backups (keep last 5) to avoid unbounded growth -ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | tail -n +6 | xargs rm -f 2>/dev/null || true - -# Nginx network attachment guard: verify nginx is on api_network before every -# reload. If nginx was accidentally disconnected, Docker DNS resolution of -# api-blue/api-green will silently fail inside nginx. -_NGINX_RELOAD_NET=$(docker inspect nginx \ - --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") -if ! 
echo "$_NGINX_RELOAD_NET" | grep -q "$NETWORK"; then - _ft_log "level=ERROR msg='nginx not attached to api_network at reload time' networks=${_NGINX_RELOAD_NET}" - cp "$NGINX_BACKUP" "$NGINX_CONF" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_network_mismatch" -fi -unset _NGINX_RELOAD_NET - -_NGINX_TEST_OUT=$(docker exec nginx nginx -t 2>&1) || { - printf '%s\n' "$_NGINX_TEST_OUT" >&2 - _ft_log "level=ERROR msg='nginx config test failed -- restoring backup'" - cp "$NGINX_BACKUP" "$NGINX_CONF" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_config_test_failed" -} -unset _NGINX_TEST_OUT -docker exec nginx nginx -s reload >/dev/null 2>&1 \ - || { cp "$NGINX_BACKUP" "$NGINX_CONF"; _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_reload_failed"; } -_ft_log "msg='nginx reloaded' upstream=$INACTIVE_NAME:$APP_PORT" - -# Upstream sanity check -- confirm nginx config actually points at the new container. -# Catches template substitution failures before traffic is affected. -# Upstream sanity: live config must contain http://INACTIVE_NAME:3000 (set $api_backend format) -_RELOAD_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") -if [ "$_RELOAD_CONTAINER" != "$INACTIVE_NAME" ]; then - _ft_log "level=ERROR msg='nginx upstream sanity check failed after reload' expected=$INACTIVE_NAME actual=${_RELOAD_CONTAINER:-unreadable}" - cp "$NGINX_BACKUP" "$NGINX_CONF" - docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1 || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_upstream_mismatch expected=$INACTIVE_NAME actual=${_RELOAD_CONTAINER:-unreadable}" -fi -unset _RELOAD_CONTAINER -_ft_log "msg='nginx upstream sanity check passed' container=$INACTIVE_NAME" -_ft_log "msg='phase_complete' phase=SWITCH_NGINX status=success container=$INACTIVE_NAME" -_ft_phase_end "SWITCH_NGINX" - -# Write the slot file AFTER nginx reload so it always reflects what nginx -# is 
currently serving. If the public health check then fails and we roll -# back, we restore nginx AND overwrite this file back to $ACTIVE. -_ft_write_slot "$INACTIVE" - -# Observability hook — log the traffic switch for monitoring/tracking -_ft_log "msg='TRAFFIC_SWITCH' active=$INACTIVE_NAME sha=$IMAGE_SHA deploy_id=$DEPLOY_ID" - -# Nginx warm-up delay — prevents race condition where reload completes before -# upstream connections are fully established and TLS sessions negotiated. -# Longer than typical TLS handshake + connection setup. -sleep $((RANDOM % 3 + 5)) - -# POST-SWITCH ROUTING VERIFICATION (in-network) -# Run a short-lived curl container on api_network to probe nginx/health. -# This exercises: Docker DNS resolution of 'nginx', bridge routing nginx→container, -# nginx upstream substitution, and proxy-pass to $INACTIVE_NAME:$APP_PORT. -# Same network path that real client traffic takes after the slot switch. -_ft_log "msg='post-switch nginx routing verification (in-network)'" -_POST_SWITCH_OK=false -for _ps in 1 2 3 4 5; do - if docker run --rm --network api_network curlimages/curl:8.7.1 \ - -sfk --max-time 5 "https://nginx/health" >/dev/null 2>&1; then - _POST_SWITCH_OK=true - break - fi - sleep $((RANDOM % 2 + 2)) -done -if [ "$_POST_SWITCH_OK" != "true" ]; then - _ft_error "msg='post-switch routing verification failed — nginx cannot reach new container'" - _ft_error "msg='ROLLBACK triggered → restoring $ACTIVE_NAME (post-switch restore)'" - _ft_snapshot - cp "$NGINX_BACKUP" "$NGINX_CONF" - if docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1; then - _ft_log "msg='nginx restored (post-switch routing failure)'" - else - _ft_log "level=ERROR msg='nginx restore failed during post-switch rollback'" - fi - _ft_write_slot "$ACTIVE" - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_routing_failed 
container=$INACTIVE_NAME" -fi -unset _POST_SWITCH_OK _ps -_ft_log "msg='post-switch routing verification passed'" - -# POST-SWITCH UPSTREAM VERIFICATION -# Directly probe the new container via its in-network address after nginx -# has confirmed routing. Ensures the upstream backend itself is still -# responding — nginx routing healthy does NOT imply backend healthy. -if ! docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \ - -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready" >/dev/null 2>&1; then - _ft_error "msg='post-switch upstream verification failed' container=$INACTIVE_NAME" - _ft_snapshot - cp "$NGINX_BACKUP" "$NGINX_CONF" - if docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1; then - _ft_log "msg='nginx restored (post-switch upstream failure)'" - else - _ft_log "level=ERROR msg='nginx restore failed during upstream verification rollback'" - fi - _ft_write_slot "$ACTIVE" - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_upstream_failed container=$INACTIVE_NAME" -fi -_ft_log "msg='post-switch upstream verification passed' container=$INACTIVE_NAME" - -# --------------------------------------------------------------------------- -# [6/7] PUBLIC HEALTH CHECK (end-to-end nginx routing) -# Validates: -# 1. HTTP 200 -- nginx routing, TLS, Host header matching -# 2. Body "status":"ready" -- backend /ready endpoint, external services -# 3. Container alignment -- live nginx config points at $INACTIVE_NAME -# -# NOTE: Uses localhost (127.0.0.1) + Host header to validate nginx routing -# while avoiding Cloudflare IP allowlist block (see _ft_check_external_ready). -# --------------------------------------------------------------------------- -_ft_state "HEALTH_CHECK_PUBLIC" "msg='validating nginx routing + backend health (localhost)' host=$API_HOSTNAME" - -# Give nginx a moment to apply the reloaded config cleanly. 
-sleep 3 - -_PUB_PASSED=false -_PUB_STATUS="000" - -# Public health check — single source of truth via docker network -# HTTPS with -k because nginx redirects HTTP to HTTPS -# -f: fail on 4xx/5xx so HTML error pages never match the grep -if docker run --rm --network api_network curlimages/curl:8.7.1 \ - -sfk --max-time 10 "https://nginx/health" 2>/dev/null | grep -q '"status":"ok"'; then - _PUB_PASSED=true - _PUB_STATUS="200" - _ft_log "msg='public health check passed' container=$INACTIVE_NAME" -else - _PUB_PASSED=false - _PUB_STATUS="000" - _ft_log "msg='public health check failed' container=$INACTIVE_NAME" -fi - -# Container alignment check -- live nginx config MUST contain http://INACTIVE_NAME:3000. -_NGINX_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") -if [ -n "$_NGINX_CONTAINER" ] && [ "$_NGINX_CONTAINER" != "$INACTIVE_NAME" ]; then - _ft_log "level=ERROR msg='nginx container mismatch -- slot switch did not take effect' expected=$INACTIVE_NAME actual=$_NGINX_CONTAINER" - _PUB_PASSED=false -fi - -if [ "$_PUB_PASSED" != "true" ]; then - _ft_state "ROLLBACK" "reason='public health check failed' status=$_PUB_STATUS" - _ft_snapshot - - _ft_log "msg='restoring previous nginx config'" - cp "$NGINX_BACKUP" "$NGINX_CONF" - if docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1; then - _ft_log "msg='nginx restored to previous config'" - else - _ft_log "level=ERROR msg='nginx restore failed -- check manually'" - fi - - # Restore slot file to the slot that was active before this deploy attempt. 
- _ft_write_slot "$ACTIVE" - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - - unset _PUB_PASSED _attempt _PUB_STATUS _PUB_BODY _NGINX_CONTAINER - - if docker ps --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then - _ACTIVE_HEALTH=$(_ft_net_curl_out "$ACTIVE_NAME" \ - -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/ready") - if echo "$_ACTIVE_HEALTH" | grep -q '"status":"ready"' 2>/dev/null; then - _ft_log "msg='deploy failed but active container healthy -- skipping rollback' container=$ACTIVE_NAME" - unset _ACTIVE_HEALTH - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=public_health_check_failed active_container_healthy=true" - fi - unset _ACTIVE_HEALTH - _ft_log "msg='active container running but NOT healthy -- treating as degraded, rollback needed' container=$ACTIVE_NAME" - fi - - _ft_log "msg='system degraded -- triggering rollback' container=$ACTIVE_NAME" - if [ "${API_ROLLBACK_IN_PROGRESS:-0}" != "1" ]; then - _ft_log "msg='triggering image rollback to previous stable SHA'" - _ft_error "msg='ROLLBACK triggered → restoring $ACTIVE_NAME'" - export API_ROLLBACK_IN_PROGRESS=1 - _ft_release_lock - if ! 
"$SCRIPT_DIR/rollback.sh" --auto; then - _ft_snapshot - _ft_exit 2 "DEPLOY_FAILED_FATAL" "reason=deploy_and_rollback_both_failed" - fi - _ft_exit 1 "DEPLOY_FAILED_ROLLBACK" "reason=public_health_check_failed msg='rollback succeeded, system restored'" - else - _ft_log "msg='nested rollback guard reached -- stopping to prevent infinite loop'" - _ft_exit 1 "DEPLOY_FAILED_FATAL" "reason=nested_rollback_guard" - fi -fi - -unset _PUB_PASSED _PUB_STATUS _NGINX_CONTAINER -_ft_log "msg='public health check passed' container=$INACTIVE_NAME" - -# --------------------------------------------------------------------------- -# [6.5/7] STABILITY_CHECK -- re-verify external endpoint after a settle window -# Catches flapping services that pass the initial check then regress rapidly -# --------------------------------------------------------------------------- -_ft_state "STABILITY_CHECK" "msg='post-switch stability check' settle_seconds=5" -_ft_phase_start "STABILITY_CHECK" - -sleep 5 -_STABLE=false -if _ft_check_external_ready; then - _STABLE=true - _ft_log "msg='stability check passed' url=https://$API_HOSTNAME/ready" - _ft_log "msg='phase_complete' phase=STABILITY_CHECK status=success" - _ft_phase_end "STABILITY_CHECK" -fi - -if [ "$_STABLE" = "false" ]; then - _ft_log "level=ERROR msg='stability check failed -- service regressed after initial pass'" - _ft_snapshot - - # Restore nginx + slot - _ft_log "msg='restoring previous nginx config (stability failure)'" - cp "$NGINX_BACKUP" "$NGINX_CONF" - if docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1; then - _ft_log "msg='nginx restored (stability failure)'" - else - _ft_log "level=ERROR msg='nginx restore failed during stability rollback -- check manually'" - fi - _ft_write_slot "$ACTIVE" - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - - if docker ps --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then - 
_ACTIVE_HEALTH=$(_ft_net_curl_out "$ACTIVE_NAME" \ - -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/ready") - if echo "$_ACTIVE_HEALTH" | grep -q '"status":"ready"' 2>/dev/null; then - _ft_log "msg='active container healthy after stability failure -- skipping rollback' container=$ACTIVE_NAME" - unset _ACTIVE_HEALTH - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=public_health_check_failed active_container_healthy=true" - fi - unset _ACTIVE_HEALTH - _ft_log "msg='active container running but NOT healthy after stability failure -- rollback needed'" - fi - - _ft_log "msg='triggering rollback after stability failure'" - if [ "${API_ROLLBACK_IN_PROGRESS:-0}" != "1" ]; then - _ft_error "msg='ROLLBACK triggered → restoring $ACTIVE_NAME'" - export API_ROLLBACK_IN_PROGRESS=1 - _ft_release_lock - if ! "$SCRIPT_DIR/rollback.sh" --auto; then - _ft_snapshot - _ft_exit 2 "DEPLOY_FAILED_FATAL" "reason=stability_check_and_rollback_both_failed" - fi - _ft_exit 1 "DEPLOY_FAILED_ROLLBACK" "reason=stability_check_failed msg='rollback succeeded'" - else - _ft_exit 1 "DEPLOY_FAILED_FATAL" "reason=stability_nested_rollback_guard" - fi -fi -unset _STABLE - -# --------------------------------------------------------------------------- -# [7/7] CLEANUP + SUCCESS -# --------------------------------------------------------------------------- -_ft_state "CLEANUP" "msg='validating active container exists before cleanup' name=$ACTIVE_NAME" - -# ACTIVE CONTAINER GUARD -- handle missing container gracefully (e.g., first deploy or crash) -if ! docker ps --format '{{.Names}}' | grep -q "^$ACTIVE_NAME$"; then - _ft_log "msg='active container missing — treating as first deploy, skipping cleanup' name=$ACTIVE_NAME" - SKIP_CLEANUP=true -else - _ft_log "msg='active container guard passed' name=$ACTIVE_NAME" -fi - -# Graceful shutdown: allow in-flight requests to drain before forcing removal. 
-if [ "${SKIP_CLEANUP:-false}" != "true" ]; then - docker stop --time 10 "$ACTIVE_NAME" 2>/dev/null || true - # Rename instead of hard-rm: keeps the previous-active container available - # for 60 s of post-mortem inspection. The -old- suffix is used by - # the zombie purge block below. - _CLEANUP_TS=$(date +%s) - docker rename "$ACTIVE_NAME" "${ACTIVE_NAME}-old-${_CLEANUP_TS}" 2>/dev/null \ - || docker rm "$ACTIVE_NAME" || true - _ft_log "msg='previous container renamed (graceful)' name=$ACTIVE_NAME rename=${ACTIVE_NAME}-old-${_CLEANUP_TS}" -else - _ft_log "msg='cleanup skipped (first deploy scenario or container already removed)'" -fi - -_ft_state "SUCCESS" "msg='deployment complete' container=$INACTIVE_NAME sha=$IMAGE_SHA slot=$INACTIVE" - -# --------------------------------------------------------------------------- -# FINAL TRUTH CHECK -- verify state matches deployment intent -# Compares internal (localhost) vs external (DNS/Cloudflare) endpoint health -# to catch routing, TLS, and proxy anomalies -# --------------------------------------------------------------------------- -_FT_TRUTH_CHECK_PASSED=true - -# (1) Verify slot file is correctly written -if [ -f "$ACTIVE_SLOT_FILE" ]; then - _SLOT_VALUE=$(cat "$ACTIVE_SLOT_FILE" | tr -d '[:space:]') - if [ "$_SLOT_VALUE" != "$INACTIVE" ]; then - _ft_log "level=ERROR msg='truth check failed: slot file mismatch' expected=$INACTIVE actual=$_SLOT_VALUE" - _FT_TRUTH_CHECK_PASSED=false - else - _ft_log "msg='truth check: slot file correct' slot=$_SLOT_VALUE" - fi -else - _ft_log "level=ERROR msg='truth check failed: slot file missing'" - _FT_TRUTH_CHECK_PASSED=false -fi - -# (2) Verify nginx upstream container matches target (set $api_backend format) -_NGINX_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") -if [ -n "$_NGINX_CONTAINER" ]; then - if [ "$_NGINX_CONTAINER" != "$INACTIVE_NAME" ]; then - _ft_log "level=ERROR msg='truth 
check failed: nginx container mismatch' expected=$INACTIVE_NAME actual=$_NGINX_CONTAINER" - _FT_TRUTH_CHECK_PASSED=false - else - _ft_log "msg='truth check: nginx upstream correct' container=$_NGINX_CONTAINER" - fi -else - _ft_log "level=WARN msg='truth check: could not read nginx upstream'" -fi - -# (3) Compare internal vs external endpoint health -# Internal: direct container endpoint (http://$INACTIVE_NAME:$APP_PORT/ready) -# External: production DNS/Cloudflare (https://$API_HOSTNAME/ready) -# Mismatch indicates routing, TLS, or proxy issues -if command -v curl >/dev/null 2>&1; then - sleep 2 - - # Check internal endpoint via in-network curl with fallback. - _INT_READY=$(_ft_net_curl_out "$INACTIVE_NAME" \ - -s --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready") - _INT_READY_OK=false - if echo "$_INT_READY" | grep -q '"status":"ready"' 2>/dev/null; then - _INT_READY_OK=true - _ft_log "msg='truth check: internal endpoint ready' url=http://$INACTIVE_NAME:$APP_PORT/ready" - else - _ft_log "level=WARN msg='truth check: internal endpoint not ready' url=http://$INACTIVE_NAME:$APP_PORT/ready response=${_INT_READY:0:100}" - fi - - # Check external endpoint via docker network (deterministic, no host routing issues) - # Uses retry + backoff to smooth transient edge jitter - _EXT_READY_OK=false - _EXT_LATENCY_MS=0 - _slo_start=0 - _slo_end=0 - _slo_attempt=0 - for _slo_attempt in 1 2 3; do - _slo_start=$(date +%s%3N) - if docker run --rm --network api_network curlimages/curl:8.7.1 -sk --max-time 3 "https://nginx/health" 2>/dev/null | grep -q '"status":"ok"'; then - _slo_end=$(date +%s%3N) - _EXT_LATENCY_MS=$((_slo_end - _slo_start)) - _EXT_READY_OK=true - break - fi - if [ $_slo_attempt -lt 3 ]; then sleep $((RANDOM % 3 + 5)); fi - done - - if [ "$_EXT_READY_OK" = "true" ]; then - _ft_log "msg='truth check: external endpoint ready (retry succeeded)' url=https://$API_HOSTNAME/ready latency_ms=$_EXT_LATENCY_MS" - # SLO warning: latency threshold (500ms) - if [ 
"$_EXT_LATENCY_MS" -gt 500 ]; then - _ft_log "level=WARN msg='SLO warning: high latency detected on external endpoint' latency_ms=$_EXT_LATENCY_MS threshold_ms=500 url=https://$API_HOSTNAME/ready" - fi - else - _ft_log "level=ERROR msg='truth check: external endpoint not ready after 3 retries' url=https://$API_HOSTNAME/ready" - fi - - # Consistency check: if internal is ready but external is not, something is wrong - # (DNS/Cloudflare/TLS/nginx proxy layer) - if [ "$_INT_READY_OK" = "true" ] && [ "$_EXT_READY_OK" = "false" ]; then - _ft_log "level=ERROR msg='truth check FAILED: internal ready but external not reachable -- nginx/proxy/DNS/TLS issue' int_ok=$_INT_READY_OK ext_ok=$_EXT_READY_OK" - _FT_TRUTH_CHECK_PASSED=false - fi - - # Also fail if both are down (service actually not ready) - if [ "$_INT_READY_OK" = "false" ] || [ "$_EXT_READY_OK" = "false" ]; then - if [ "$_FT_TRUTH_CHECK_PASSED" = "true" ]; then - _ft_log "level=ERROR msg='truth check FAILED: endpoint(s) not returning ready status' int_ok=$_INT_READY_OK ext_ok=$_EXT_READY_OK" - _FT_TRUTH_CHECK_PASSED=false - fi - fi -else - _ft_log "level=WARN msg='truth check: curl not available, skipping endpoint checks'" -fi - -if [ "$_FT_TRUTH_CHECK_PASSED" != "true" ]; then - _ft_state "FAILURE" "reason='post_deployment_truth_check_failed'" - _ft_snapshot - exit 2 -fi - -# Persist last-known-good snapshot for fast recovery triage (atomic write) -_ft_log "msg='recording last-known-good state' slot=$INACTIVE container=$INACTIVE_NAME" -_SNAP_TMP=$(mktemp "${SNAP_DIR}/last-good.XXXXXX") -printf 'slot=%s container=%s ts=%s\n' "$INACTIVE" "$INACTIVE_NAME" "$(date -Iseconds)" > "$_SNAP_TMP" -mv "$_SNAP_TMP" "$LAST_GOOD_FILE" -_ft_log "msg='last-known-good snapshot recorded (atomic)' file=$LAST_GOOD_FILE" - -# Record deployment history (atomic write: temp file then mv). 
-DEPLOY_HISTORY_TMP="${DEPLOY_HISTORY}.tmp.$$" -if [ -f "$DEPLOY_HISTORY" ]; then - (echo "$IMAGE_SHA"; head -n $((MAX_HISTORY - 1)) "$DEPLOY_HISTORY") > "$DEPLOY_HISTORY_TMP" -else - echo "$IMAGE_SHA" > "$DEPLOY_HISTORY_TMP" -fi -mv "$DEPLOY_HISTORY_TMP" "$DEPLOY_HISTORY" -_ft_log "msg='deploy history updated' sha=$IMAGE_SHA" - -# Alertmanager config rendering: always render before monitoring stack operations. -# Alertmanager does NOT support env vars natively; the rendered file must exist -# before docker compose up. This is idempotent and safe to run on every deploy. -bash "$REPO_DIR/infra/scripts/render-alertmanager.sh" -_ft_log "msg='alertmanager config rendered' file=$REPO_DIR/infra/alertmanager/alertmanager.rendered.yml" - -# Monitoring stack: restart only when infra configs have actually changed. -# Hashes cover all infra config files EXCEPT the nginx template (re-rendered on -# every deploy) to avoid spurious monitoring restarts. -MONITORING_HASH=$(find "$REPO_DIR/infra" -readable \ - -not -path "$REPO_DIR/infra/nginx/*" \ - \( -name '*.yml' -o -name '*.yaml' -o -name '*.conf' -o -name '*.toml' -o -name '*.json' \) \ - | sort | xargs -r sha256sum 2>/dev/null | sha256sum | cut -d' ' -f1 || echo "changed") -MONITORING_HASH_FILE="$HOME/.api-monitoring-hash" - -if [ -f "$MONITORING_HASH_FILE" ] && [ "$(cat "$MONITORING_HASH_FILE")" = "$MONITORING_HASH" ]; then - _ft_log "msg='monitoring config unchanged -- skipping restart'" -else - _ft_log "msg='monitoring config changed -- restarting monitoring stack'" - cd "$REPO_DIR/infra" - run docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml pull --quiet - run docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml up -d --remove-orphans - cd "$REPO_DIR" - echo "$MONITORING_HASH" > "$MONITORING_HASH_FILE" - _ft_log "msg='monitoring stack restarted'" -fi - -# --------------------------------------------------------------------------- -# ZOMBIE PURGE: remove any 
api-(blue|green)-old- containers that have -# accumulated from previous deploys. Runs unconditionally so the Docker engine -# does not fill up with stopped containers across multiple deployments. -# --------------------------------------------------------------------------- -_ft_log "msg='running zombie purge'" -docker ps -a --format '{{.Names}}' \ - | grep -E '^api-(blue|green)-old-[0-9]+$' \ - | xargs -r docker rm -f 2>/dev/null || true - -# Final state snapshot and GitHub Actions summary -_ft_final_state "$INACTIVE_NAME" "$IMAGE_SHA" -_ft_github_summary "✅ SUCCESS" "$INACTIVE_NAME" "$IMAGE_SHA" - -_ft_exit 0 "DEPLOY_SUCCESS" "sha=$IMAGE_SHA container=$INACTIVE_NAME slot=$INACTIVE" diff --git a/scripts/deploy.sh b/scripts/deploy.sh new file mode 100644 index 0000000..90da016 --- /dev/null +++ b/scripts/deploy.sh @@ -0,0 +1,1217 @@ +#!/usr/bin/env bash +# ============================================================================= +# deploy.sh — FieldTrack API Deploy + Rollback (unified) +# +# Usage: +# deploy.sh # deploy a specific image SHA +# deploy.sh --rollback # interactive rollback to previous SHA +# deploy.sh --rollback --auto # non-interactive rollback (CI) +# +# State machine: +# INIT -> PRE_FLIGHT -> PULL_IMAGE -> RESOLVE_SLOT -> IDEMPOTENCY +# -> START_INACTIVE -> HEALTH_CHECK_INTERNAL -> SWITCH_NGINX +# -> HEALTH_CHECK_PUBLIC -> STABILITY_CHECK -> CLEANUP -> SUCCESS +# +# Deploy outcomes (via _ft_exit): +# DEPLOY_SUCCESS -- zero-downtime deploy completed +# BOOTSTRAP_SUCCESS -- first-ever deploy completed +# DEPLOY_FAILED_SAFE -- deploy failed, old container still serving +# DEPLOY_FAILED_ROLLBACK -- deploy failed, rollback succeeded (system restored) +# DEPLOY_FAILED_FATAL -- deploy AND rollback both failed (manual needed) +# +# Exit codes: +# 0 DEPLOY_SUCCESS / BOOTSTRAP_SUCCESS +# 1 DEPLOY_FAILED_SAFE / DEPLOY_FAILED_ROLLBACK +# 2 DEPLOY_FAILED_FATAL +# +# Invariants: +# - Success DEPENDS ONLY ON: container start + /health=200 + nginx routing +# - 
NEVER depends on: Redis, Supabase, BullMQ, monitoring stack +# - No /ready usage anywhere in this script +# - All nginx reloads flow through switch_nginx() — exactly once per deploy +# ============================================================================= +set -euo pipefail +if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +trap '_ft_trap_err "$LINENO"' ERR + +# --------------------------------------------------------------------------- +# ARGUMENT PARSING +# MODE is set before helper functions are loaded so _ft_log can reference it. +# --------------------------------------------------------------------------- +MODE="deploy" +AUTO_MODE=false +IMAGE_SHA="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --rollback) MODE="rollback"; shift ;; + --auto) AUTO_MODE=true; shift ;; + -*) + printf '[ERROR] Unknown option: %s\n' "$1" >&2 + printf 'Usage: deploy.sh | deploy.sh --rollback [--auto]\n' >&2 + exit 2 + ;; + *) IMAGE_SHA="$1"; shift ;; + esac +done + +# --------------------------------------------------------------------------- +# DEPLOY ID + TIMING (set here so all functions and log lines share them) +# --------------------------------------------------------------------------- +START_TS=$(date +%s) +DEPLOY_ID=$(date +%Y%m%d_%H%M%S)_$$ +PREFLIGHT_STRICT="${PREFLIGHT_STRICT:-false}" + +# --------------------------------------------------------------------------- +# STRUCTURED LOGGING +# ALL logging writes to stderr so stdout is data-only (subshell returns safe). +# --------------------------------------------------------------------------- +_FT_STATE="INIT" +DEPLOY_LOG_FILE="${DEPLOY_LOG_FILE:-/var/log/api/deploy.log}" +LOG_DIR="$(dirname "$DEPLOY_LOG_FILE")" +if ! 
mkdir -p "$LOG_DIR" 2>/dev/null; then + LOG_DIR="$HOME/api/logs" + DEPLOY_LOG_FILE="$LOG_DIR/deploy.log" + mkdir -p "$LOG_DIR" +fi + +_ft_log() { + { set +x; } 2>/dev/null + local entry + entry=$(printf '[DEPLOY] deploy_id=%s ts=%s state=%s %s' \ + "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*") + printf '%s\n' "$entry" | tee -a "$DEPLOY_LOG_FILE" >&2 + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +_ft_state() { + { set +x; } 2>/dev/null + _FT_STATE="$1"; shift + printf '[DEPLOY] deploy_id=%s ts=%s state=%s %s\n' \ + "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*" >&2 + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +_ft_error() { + { set +x; } 2>/dev/null + local entry + entry=$(printf '[ERROR] deploy_id=%s ts=%s state=%s %s' \ + "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*") + printf '%s\n' "$entry" | tee -a "$DEPLOY_LOG_FILE" >&2 + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +_ft_trap_err() { + { set +x; } 2>/dev/null + printf '[ERROR] deploy_id=%s ts=%s state=%s msg="unexpected failure at line %s"\n' \ + "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$1" >&2 + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +_ft_exit() { + local code="$1"; shift + local duration=$(( $(date +%s) - START_TS )) + _ft_state "$@" "duration_sec=$duration" + exit "$code" +} + +# --------------------------------------------------------------------------- +# PHASE TIMING +# --------------------------------------------------------------------------- +_ft_phase_start() { eval "_${1}_START=\$(date +%s)"; } +_ft_phase_end() { + local phase="$1" + local start_var="_${phase}_START" + local start_ts=${!start_var:-0} + if [ "$start_ts" -gt 0 ]; then + _ft_log "msg='phase_complete' phase=$phase duration_sec=$(( $(date +%s) - start_ts ))" + fi +} + +# --------------------------------------------------------------------------- +# SYSTEM SNAPSHOT (emitted on unrecoverable failure) +# 
--------------------------------------------------------------------------- +_ft_snapshot() { + { set +x; } 2>/dev/null + printf '[DEPLOY] -- SYSTEM SNAPSHOT ----------------------------------------\n' >&2 + printf '[DEPLOY] slot_file = %s\n' \ + "$(cat "${ACTIVE_SLOT_FILE:-/var/run/api/active-slot}" 2>/dev/null || echo 'MISSING')" >&2 + printf '[DEPLOY] backup_file = %s\n' \ + "$(cat "${SLOT_BACKUP_FILE:-/var/lib/api/active-slot.backup}" 2>/dev/null || echo 'MISSING')" >&2 + printf '[DEPLOY] nginx_upstream = %s\n' \ + "$(grep -oE 'http://(api-blue|api-green):3000' \ + "${NGINX_CONF:-/opt/infra/nginx/live/api.conf}" 2>/dev/null \ + | grep -oE 'api-blue|api-green' | head -1 || echo 'unreadable')" >&2 + printf '[DEPLOY] containers =\n' >&2 + docker ps --format '[DEPLOY] {{.Names}} -> {{.Status}} ({{.Ports}})' 1>&2 2>/dev/null \ + || printf '[DEPLOY] (docker ps unavailable)\n' >&2 + printf '[DEPLOY] -----------------------------------------------------------\n' >&2 + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +# --------------------------------------------------------------------------- +# GITHUB ACTIONS SUMMARY +# --------------------------------------------------------------------------- +_ft_github_summary() { + local status="$1" container="${2:-unknown}" image="${3:-unknown}" reason="${4:-}" + [ -z "$GITHUB_STEP_SUMMARY" ] && return 0 + { + echo "### 🚀 Deployment Summary" + echo "| Field | Value |" + echo "|-------|-------|" + echo "| Status | **$status** |" + echo "| Deploy ID | \`$DEPLOY_ID\` |" + echo "| Duration | $(($(date +%s) - START_TS))s |" + echo "| Active Container | \`$container\` |" + echo "| Image SHA | \`${image:0:12}...\` |" + [ -n "$reason" ] && echo "| Reason | $reason |" + echo "| Timestamp | $(date -u +'%Y-%m-%d %H:%M:%S UTC') |" + } >> "$GITHUB_STEP_SUMMARY" +} + +_ft_final_state() { + local active_container="$1" image_sha="$2" nginx_upstream + nginx_upstream=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null 
\ + | grep -oE 'api-blue|api-green' | head -1 || echo 'unknown') + _ft_log "msg='final_state' deploy_id=$DEPLOY_ID active=$active_container sha=${image_sha:0:12} nginx_upstream=$nginx_upstream" +} + +# --------------------------------------------------------------------------- +# DOCKER HEALTH GATE +# --------------------------------------------------------------------------- +_ft_wait_docker_health() { + local name="$1" i=1 STATUS + while [ "$i" -le 30 ]; do + STATUS=$(docker inspect --format='{{.State.Health.Status}}' "$name" 2>/dev/null || echo "none") + case "$STATUS" in + healthy) _ft_log "msg='docker health check passed' container=$name"; return 0 ;; + unhealthy) _ft_error "msg='docker health check failed' container=$name status=unhealthy"; return 1 ;; + none) _ft_log "msg='docker health gate skipped (no HEALTHCHECK)' container=$name"; return 0 ;; + esac + [ $(( i % 5 )) -eq 0 ] && _ft_log "msg='waiting for docker health' attempt=$i/30 status=$STATUS container=$name" + sleep 2; i=$(( i + 1 )) + done + _ft_error "msg='docker health timeout' container=$name last_status=$STATUS" + return 1 +} + +# --------------------------------------------------------------------------- +# IN-NETWORK CURL HELPERS (via curlimages/curl on api_network) +# --------------------------------------------------------------------------- +_ft_net_curl() { + local _c="$1"; shift + docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" >/dev/null 2>&1 +} + +_ft_net_curl_out() { + local _c="$1"; shift + local _out + _out=$(docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" 2>/dev/null) || _out="" + printf '%s' "$_out" +} + +_ft_check_external_ready() { + docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \ + -sfk --max-time 5 "https://nginx/health" 2>/dev/null | grep -q '"status":"ok"' +} + +# --------------------------------------------------------------------------- +# ENV LOADER (inlined) +# Avoids coupling deploy.sh to auxiliary scripts. 
+# --------------------------------------------------------------------------- +_ft_load_env() { + ENV_FILE="$DEPLOY_ROOT/.env" + if [ ! -f "$ENV_FILE" ]; then + _ft_error "msg='required .env not found' path=$ENV_FILE" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=missing_env_file" + fi + + set +x + set -o allexport + # shellcheck source=/dev/null + source "$ENV_FILE" + set +o allexport + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi + + if [ -z "${API_BASE_URL:-}" ]; then + _ft_error "msg='API_BASE_URL missing in .env'" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=missing_api_base_url" + fi + if [ -z "${CORS_ORIGIN:-}" ]; then + _ft_error "msg='CORS_ORIGIN missing in .env'" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=missing_cors_origin" + fi + + API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1) + if [ -z "$API_HOSTNAME" ] || printf '%s' "$API_HOSTNAME" | grep -qE '[[:space:]/@?#]'; then + _ft_error "msg='invalid API_HOSTNAME derived from API_BASE_URL' api_base_url=$API_BASE_URL derived=$API_HOSTNAME" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=invalid_api_hostname" + fi + export ENV_FILE API_HOSTNAME +} + +# --------------------------------------------------------------------------- +# SILENT EXECUTION WRAPPERS +# --------------------------------------------------------------------------- +run() { + if [ "${DEBUG:-false}" = "true" ]; then + "$@" + else + local _out + if ! _out=$("$@" 2>&1); then + printf '[ERROR] Command failed: %s\n' "$*" >&2 + printf '%s\n' "$_out" >&2 + return 1 + fi + fi +} + +# --------------------------------------------------------------------------- +# SLOT DIRECTORY AND FILE MANAGEMENT +# --------------------------------------------------------------------------- +_ft_ensure_slot_dir() { + if [ ! 
-d "$SLOT_DIR" ]; then + _ft_log "msg='slot dir missing, creating' path=$SLOT_DIR" + sudo mkdir -p "$SLOT_DIR" + sudo chown "$(id -un):$(id -gn)" "$SLOT_DIR" + sudo chmod 750 "$SLOT_DIR" + fi +} + +_ft_ensure_slot_backup_dir() { + local backup_dir + backup_dir="$(dirname "$SLOT_BACKUP_FILE")" + if [ ! -d "$backup_dir" ]; then + sudo mkdir -p "$backup_dir" 2>/dev/null || mkdir -p "$backup_dir" || true + sudo chown "$(id -un):$(id -gn)" "$backup_dir" 2>/dev/null || true + fi +} + +_ft_validate_slot() { + case "$1" in + blue|green) return 0 ;; + *) _ft_log "level=ERROR msg='invalid slot value' slot='${1:0:80}'"; return 1 ;; + esac +} + +_ft_write_slot() { + local slot="$1" + _ft_validate_slot "$slot" || return 1 + _ft_ensure_slot_dir + local tmp + tmp=$(mktemp "${SLOT_DIR}/active-slot.XXXXXX") + printf '%s\n' "$slot" > "$tmp" + mv "$tmp" "$ACTIVE_SLOT_FILE" + _ft_log "msg='slot file updated (atomic)' slot=$slot" + # Mirror to persistent backup (survives reboots — /var/run is tmpfs) + _ft_ensure_slot_backup_dir + local btmp + btmp=$(mktemp "$(dirname "$SLOT_BACKUP_FILE")/slot-backup.XXXXXX") + printf '%s\n' "$slot" > "$btmp" + mv "$btmp" "$SLOT_BACKUP_FILE" + _ft_log "msg='slot backup updated' slot=$slot path=$SLOT_BACKUP_FILE" +} + +# --------------------------------------------------------------------------- +# DEPLOYMENT LOCK +# --------------------------------------------------------------------------- +_ft_acquire_lock() { + _ft_ensure_slot_dir + _ft_log "msg='acquiring deployment lock' pid=$$ file=$LOCK_FILE" + exec 200>"$LOCK_FILE" + if ! 
flock -n 200; then + _ft_log "level=ERROR msg='another deployment already in progress -- aborting' pid=$$" + exit 1 + fi + _ft_log "msg='deployment lock acquired' pid=$$ file=$LOCK_FILE" + trap '_ft_release_lock' EXIT +} + +_ft_release_lock() { + { set +x; } 2>/dev/null + printf '[DEPLOY] ts=%s state=%s msg="releasing deployment lock" pid=%s\n' \ + "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$$" >&2 + exec 200>&- 2>/dev/null || true + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +# =========================================================================== +# PHASE FUNCTIONS +# =========================================================================== + +# --------------------------------------------------------------------------- +# preflight — load env, validate contract, port-leak guard +# --------------------------------------------------------------------------- +preflight() { + _ft_state "PRE_FLIGHT" "msg='loading and validating environment'" + + local last_good + last_good=$(cat "$LAST_GOOD_FILE" 2>/dev/null || echo "none") + _ft_log "msg='startup recovery info' last_good=$last_good" + + _ft_load_env + + DEPLOY_HISTORY="$DEPLOY_ROOT/.deploy_history" + _ft_log "msg='environment loaded' api_hostname=$API_HOSTNAME" + + # GLOBAL PORT-LEAK GUARD — api containers MUST NOT bind host ports + local leaks + leaks=$(docker ps --format '{{.Names}} {{.Ports}}' 2>/dev/null \ + | grep -E '^api-(blue|green)' \ + | grep -E '(0\.0\.0\.0:|127\.0\.0\.1:)[0-9]+->') || true + if [ -n "${leaks:-}" ]; then + _ft_log "level=ERROR msg='API container has host port bindings — forbidden' leaks=${leaks}" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=api_port_leak_detected" + fi + _ft_log "msg='port-leak guard passed'" +} + +# --------------------------------------------------------------------------- +# ensure_network — create api_network if absent (idempotent) +# --------------------------------------------------------------------------- +ensure_network() { + docker network 
create --driver bridge "$NETWORK" 2>/dev/null \ + && _ft_log "msg='api_network created'" \ + || _ft_log "msg='api_network already exists'" + mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" +} + +# --------------------------------------------------------------------------- +# ensure_nginx — nginx MUST exist and be on api_network; hard fail otherwise +# --------------------------------------------------------------------------- +ensure_nginx() { + if [ ! -d "$INFRA_ROOT/nginx/live" ]; then + _ft_error "msg='infra not initialized at expected path' infra_root=$INFRA_ROOT required=$INFRA_ROOT/nginx/live" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=infra_not_initialized" + fi + if [ ! -d "$INFRA_ROOT/nginx/backup" ]; then + _ft_error "msg='infra not initialized at expected path' infra_root=$INFRA_ROOT required=$INFRA_ROOT/nginx/backup" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=infra_not_initialized" + fi + if [ ! -f "$INFRA_ROOT/nginx/api.conf" ]; then + _ft_error "msg='infra template missing' path=$INFRA_ROOT/nginx/api.conf" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=infra_template_missing" + fi + + if ! docker inspect nginx >/dev/null 2>&1; then + _ft_error "msg='nginx container not found — nginx is managed by the infra repo' hint='docker compose -f docker-compose.nginx.yml up -d'" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_missing" + fi + local net + net=$(docker inspect nginx \ + --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") + if ! 
echo "$net" | grep -q "$NETWORK"; then + _ft_error "msg='nginx not on api_network' networks=${net}" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_not_on_api_network" + fi + _ft_log "msg='nginx guard passed' network=$NETWORK" +} + +# --------------------------------------------------------------------------- +# pull_image — explicit pull; fails fast so docker run never races a pull +# --------------------------------------------------------------------------- +pull_image() { + _ft_state "PULL_IMAGE" "msg='pulling container image' sha=$IMAGE_SHA" + _ft_phase_start "PULL_IMAGE" + if ! run timeout 120 docker pull "$IMAGE"; then + _ft_log "level=ERROR msg='image pull failed' image=$IMAGE" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=image_pull_failed image=$IMAGE" + fi + _ft_log "msg='image pulled' image=$IMAGE" + _ft_phase_end "PULL_IMAGE" +} + +# --------------------------------------------------------------------------- +# resolve_slot — determine ACTIVE/INACTIVE slots with full recovery +# +# Reads slot from (in precedence order): +# 1. /var/run/api/active-slot (primary, tmpfs) +# 2. /var/lib/api/active-slot.backup (persistent, survives reboots) +# 3. nginx config upstream (tiebreaker when both containers run) +# 4. running containers (recovery when slot files missing) +# 5. default "green" / inactive "blue" (first deploy) +# +# Sets globals: ACTIVE, ACTIVE_NAME, INACTIVE, INACTIVE_NAME +# --------------------------------------------------------------------------- +resolve_slot() { + _ft_state "RESOLVE_SLOT" "msg='determining active slot'" + _ft_ensure_slot_dir + + local recovered_slot="" + + # 1. 
Primary slot file + if [ -f "$ACTIVE_SLOT_FILE" ]; then + local val + val=$(tr -d '[:space:]' < "$ACTIVE_SLOT_FILE") + if [[ "$val" == *DEPLOY* ]] || [[ "$val" == *\[* ]]; then + _ft_log "level=WARN msg='slot file contaminated — treating as corrupt' value=${val:0:80}" + elif _ft_validate_slot "$val" 2>/dev/null; then + _ft_log "msg='slot file read' slot=$val" + recovered_slot="$val" + fi + fi + + # 2. Persistent backup slot file (survives /var/run tmpfs wipe on reboot) + if [ -z "$recovered_slot" ] && [ -f "$SLOT_BACKUP_FILE" ]; then + local bval + bval=$(tr -d '[:space:]' < "$SLOT_BACKUP_FILE") + if _ft_validate_slot "$bval" 2>/dev/null; then + _ft_log "msg='recovered slot from backup file' slot=$bval file=$SLOT_BACKUP_FILE" + recovered_slot="$bval" + fi + fi + + # 3. Last-known-good snapshot + if [ -z "$recovered_slot" ] && [ -f "$LAST_GOOD_FILE" ]; then + local lgval + lgval=$(awk -F= '/^slot=/{print $2}' "$LAST_GOOD_FILE" 2>/dev/null | tr -d '[:space:]') + if _ft_validate_slot "$lgval" 2>/dev/null; then + _ft_log "msg='recovered slot from last-good snapshot' slot=$lgval" + recovered_slot="$lgval" + fi + fi + + # 4+5. 
Container state + nginx tiebreaker + if [ -z "$recovered_slot" ]; then + local blue_running=false green_running=false + docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${BLUE_NAME}$" && blue_running=true || true + docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${GREEN_NAME}$" && green_running=true || true + + if [ "$blue_running" = "true" ] && [ "$green_running" = "false" ]; then + recovered_slot="blue"; _ft_log "msg='recovery: only blue running'" + elif [ "$green_running" = "true" ] && [ "$blue_running" = "false" ]; then + recovered_slot="green"; _ft_log "msg='recovery: only green running'" + elif [ "$blue_running" = "true" ] && [ "$green_running" = "true" ]; then + local upstream + upstream=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null \ + | grep -oE 'api-blue|api-green' | head -1 || echo "") + recovered_slot="${upstream#api-}" + [ -z "$recovered_slot" ] && recovered_slot="blue" + _ft_log "msg='recovery: both running, nginx tiebreaker' nginx_upstream=${upstream:-none} slot=$recovered_slot" + else + recovered_slot="green" + _ft_log "msg='recovery: no containers running — first deploy, starting with blue' slot=green" + fi + fi + + _ft_validate_slot "$recovered_slot" || exit 1 + + # Persist recovered value (atomic) + local tmp + tmp=$(mktemp "${SLOT_DIR}/active-slot.XXXXXX") + printf '%s\n' "$recovered_slot" > "$tmp" + mv "$tmp" "$ACTIVE_SLOT_FILE" + + ACTIVE="$recovered_slot" + if [ "$ACTIVE" = "blue" ]; then + ACTIVE_NAME=$BLUE_NAME; INACTIVE="green"; INACTIVE_NAME=$GREEN_NAME + else + ACTIVE_NAME=$GREEN_NAME; INACTIVE="blue"; INACTIVE_NAME=$BLUE_NAME + fi + + _ft_log "msg='slot resolved' active=$ACTIVE active_name=$ACTIVE_NAME inactive=$INACTIVE inactive_name=$INACTIVE_NAME" + + # SLOT REPAIR — heal slot/container drift + if [ "$ACTIVE" = "green" ] && ! 
docker inspect api-green >/dev/null 2>&1; then + if docker inspect api-blue >/dev/null 2>&1; then + _ft_log "msg='slot repair: green missing but blue running → switching to blue'" + ACTIVE="blue"; ACTIVE_NAME=$BLUE_NAME; INACTIVE="green"; INACTIVE_NAME=$GREEN_NAME + _ft_write_slot "blue" + fi + elif [ "$ACTIVE" = "blue" ] && ! docker inspect api-blue >/dev/null 2>&1; then + if docker inspect api-green >/dev/null 2>&1; then + _ft_log "msg='slot repair: blue missing but green running → switching to green'" + ACTIVE="green"; ACTIVE_NAME=$GREEN_NAME; INACTIVE="blue"; INACTIVE_NAME=$BLUE_NAME + _ft_write_slot "green" + fi + fi + _ft_validate_slot "$ACTIVE" || exit 1 +} + +# --------------------------------------------------------------------------- +# idempotency_check — skip deploy if target SHA already running + healthy +# --------------------------------------------------------------------------- +idempotency_check() { + _ft_state "IDEMPOTENCY" "msg='checking if target SHA already deployed' sha=$IMAGE_SHA" + local running_image + running_image=$(docker inspect --format '{{.Config.Image}}' "$ACTIVE_NAME" 2>/dev/null || echo "") + if [ "$running_image" = "$IMAGE" ]; then + local health + health=$(_ft_net_curl_out "$ACTIVE_NAME" \ + -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/health") + if echo "$health" | grep -q '"status":"ok"' 2>/dev/null; then + _ft_log "msg='target SHA already running and healthy — nothing to do' container=$ACTIVE_NAME" + _ft_final_state "$ACTIVE_NAME" "$IMAGE_SHA" + _ft_github_summary "✅ IDEMPOTENT (no change)" "$ACTIVE_NAME" "$IMAGE_SHA" "SHA already deployed" + _ft_exit 0 "DEPLOY_SUCCESS" "reason=idempotent_noop sha=$IMAGE_SHA" + fi + _ft_log "msg='SHA matches but container not healthy — proceeding' container=$ACTIVE_NAME" + else + _ft_log "msg='SHA differs — proceeding' running=${running_image:-none} target=$IMAGE" + fi +} + +# --------------------------------------------------------------------------- +# start_inactive — start new container 
on api_network (no host ports) +# --------------------------------------------------------------------------- +start_inactive() { + _ft_state "START_INACTIVE" "msg='starting inactive container' name=$INACTIVE_NAME" + + # Rename any stale container for audit trail (graceful rename→purge later) + if docker ps -a --format '{{.Names}}' | grep -Eq "^${INACTIVE_NAME}$"; then + _ft_log "msg='renaming stale container' name=$INACTIVE_NAME" + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + local ts + ts=$(date +%s) + docker rename "$INACTIVE_NAME" "${INACTIVE_NAME}-old-${ts}" 2>/dev/null \ + || docker rm "$INACTIVE_NAME" + fi + + local cid + cid=$(timeout 60 docker run -d \ + --name "$INACTIVE_NAME" \ + --network "$NETWORK" \ + --restart unless-stopped \ + --label "api.sha=$IMAGE_SHA" \ + --label "api.slot=$INACTIVE" \ + --label "api.deploy_id=$DEPLOY_ID" \ + --env-file "$ENV_FILE" \ + "$IMAGE" 2>&1) || { + printf '%s\n' "$cid" >&2 + _ft_error "msg='container start failed' name=$INACTIVE_NAME" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=container_start_failed" + } + _ft_log "msg='container started' name=$INACTIVE_NAME" + + # Image immutability check + local actual + actual=$(docker inspect --format '{{.Config.Image}}' "$INACTIVE_NAME" 2>/dev/null || echo "") + if [ "$actual" != "$IMAGE" ]; then + _ft_log "level=ERROR msg='image immutability check failed' expected=$IMAGE actual=${actual:-unknown}" + docker logs "$INACTIVE_NAME" --tail 50 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=image_immutability_check_failed" + fi + _ft_log "msg='image immutability check passed'" +} + +# --------------------------------------------------------------------------- +# health_check_internal — wait for /health=200 via in-network curl +# NO /ready usage. NO Redis/Supabase dependency. 
+# --------------------------------------------------------------------------- +health_check_internal() { + _ft_state "HEALTH_CHECK_INTERNAL" "msg='waiting for container readiness'" + _ft_phase_start "HEALTH_CHECK_INTERNAL" + sleep 5 + + # Connectivity pre-check (5 short probes before main loop) + local conn_ok=false conn_attempts=0 + while [ "$conn_attempts" -lt 5 ]; do + conn_attempts=$(( conn_attempts + 1 )) + if _ft_net_curl "$INACTIVE_NAME" \ + -sf --max-time 3 "http://$INACTIVE_NAME:$APP_PORT/health"; then + conn_ok=true; break + fi + sleep 2 + done + + if [ "$conn_ok" = "false" ]; then + _ft_log "level=ERROR msg='container not reachable after connectivity pre-check' container=$INACTIVE_NAME" + docker logs "$INACTIVE_NAME" --tail 100 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=container_not_reachable" + fi + _ft_log "msg='connectivity pre-check passed' container=$INACTIVE_NAME" + + # Main readiness loop — waits for HTTP 200 on /health + local attempt=0 + until true; do + attempt=$(( attempt + 1 )) + local status + status=$(_ft_net_curl_out "$INACTIVE_NAME" \ + --max-time 4 -s -o /dev/null -w "%{http_code}" \ + "http://$INACTIVE_NAME:$APP_PORT/health" || echo "000") + + if [ "$status" = "200" ]; then + _ft_log "msg='health check passed' endpoint=/health attempts=$attempt" + break + fi + + if ! 
docker ps --format '{{.Names}}' | grep -q "^${INACTIVE_NAME}$"; then + _ft_log "level=ERROR msg='container exited unexpectedly' name=$INACTIVE_NAME" + docker logs "$INACTIVE_NAME" --tail 100 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=new_container_crashed" + fi + + if [ "$attempt" -ge "$MAX_HEALTH_ATTEMPTS" ]; then + _ft_log "level=ERROR msg='health check timed out' attempts=$attempt status=$status" + docker logs "$INACTIVE_NAME" --tail 100 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=health_timeout attempts=$attempt" + fi + + [ $(( attempt % 10 )) -eq 0 ] && _ft_log "msg='still waiting' attempt=$attempt/$MAX_HEALTH_ATTEMPTS status=$status" + sleep $(( HEALTH_INTERVAL + RANDOM % 3 )) + done + + _ft_phase_end "HEALTH_CHECK_INTERNAL" + + # Docker HEALTHCHECK gate (must be healthy, not just starting) + if ! _ft_wait_docker_health "$INACTIVE_NAME"; then + docker logs "$INACTIVE_NAME" --tail 50 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=docker_health_failed" + fi + + sleep 3 # brief stabilization after healthcheck gate + + # Pre-switch final connectivity check (fresh curl invocation, same net path as nginx) + if ! 
docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \ + -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/health" >/dev/null 2>&1; then + _ft_error "msg='pre-switch connectivity check failed' container=$INACTIVE_NAME" + docker logs "$INACTIVE_NAME" --tail 50 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=pre_switch_connectivity_failed" + fi + _ft_log "msg='pre-switch connectivity check passed' container=$INACTIVE_NAME" +} + +# --------------------------------------------------------------------------- +# switch_nginx — render config, test, reload ONCE; write slot file after reload +# --------------------------------------------------------------------------- +switch_nginx() { + _ft_state "SWITCH_NGINX" "msg='switching nginx upstream' container=$INACTIVE_NAME" + sleep 2 # brief stabilization window before touching nginx + + mkdir -p "$NGINX_BACKUP_DIR" + local backup tmp + backup="$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" + tmp="$(mktemp /tmp/api-nginx.XXXXXX.conf)" + + # Pre-reload gate — one final health probe before writing nginx config + if ! 
_ft_net_curl "$INACTIVE_NAME" \ + -sf --max-time 4 "http://$INACTIVE_NAME:$APP_PORT/health"; then + _ft_log "level=ERROR msg='pre-reload gate failed' container=$INACTIVE_NAME" + docker logs "$INACTIVE_NAME" --tail 50 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=pre_reload_gate_failed" + fi + + sed \ + -e "s|__ACTIVE_CONTAINER__|$INACTIVE_NAME|g" \ + -e "s|__API_HOSTNAME__|$API_HOSTNAME|g" \ + "$NGINX_TEMPLATE" > "$tmp" + + cp "$NGINX_CONF" "$backup" + cp "$tmp" "$NGINX_CONF" + rm -f "$tmp" + ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | tail -n +6 | xargs rm -f 2>/dev/null || true + + # nginx network guard before every reload + local net + net=$(docker inspect nginx \ + --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") + if ! echo "$net" | grep -q "$NETWORK"; then + _ft_log "level=ERROR msg='nginx not on api_network at reload time' networks=${net}" + cp "$backup" "$NGINX_CONF" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_network_mismatch" + fi + + local test_out + test_out=$(docker exec nginx nginx -t 2>&1) || { + printf '%s\n' "$test_out" >&2 + _ft_log "level=ERROR msg='nginx config test failed — restoring backup'" + cp "$backup" "$NGINX_CONF" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_config_test_failed" + } + + # === SINGLE nginx reload per deploy === + docker exec nginx nginx -s reload >/dev/null 2>&1 \ + || { cp "$backup" "$NGINX_CONF"; _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_reload_failed"; } + _ft_log "msg='nginx reloaded (once)' upstream=$INACTIVE_NAME:$APP_PORT" + + # Upstream sanity: live config must match INACTIVE_NAME + local actual_upstream + actual_upstream=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null \ + | grep -oE 'api-blue|api-green' | head -1 || echo "") + if [ "$actual_upstream" != "$INACTIVE_NAME" ]; then + _ft_log "level=ERROR msg='nginx upstream 
sanity failed' expected=$INACTIVE_NAME actual=${actual_upstream:-unreadable}" + cp "$backup" "$NGINX_CONF" + docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1 || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_upstream_mismatch" + fi + _ft_log "msg='nginx upstream sanity passed' container=$INACTIVE_NAME" + + # Write slot AFTER nginx reload — slot always reflects what nginx serves + _ft_write_slot "$INACTIVE" + _ft_log "msg='TRAFFIC_SWITCH' active=$INACTIVE_NAME sha=$IMAGE_SHA deploy_id=$DEPLOY_ID" + _ft_phase_end "SWITCH_NGINX" + + # Store backup path in global for rollback use in verify_routing / stability + NGINX_BACKUP="$backup" +} + +# --------------------------------------------------------------------------- +# verify_routing — validate nginx→backend end-to-end via api_network +# Rolls back (with rollback logic inline) on failure. +# --------------------------------------------------------------------------- +verify_routing() { + _ft_state "HEALTH_CHECK_PUBLIC" "msg='validating nginx routing + backend health'" + sleep $(( RANDOM % 3 + 5 )) # nginx warm-up + + # Post-switch routing verification (5 retries) + local ps_ok=false + for _ps in 1 2 3 4 5; do + if docker run --rm --network api_network "$_FT_CURL_IMG" \ + -sfk --max-time 5 "https://nginx/health" >/dev/null 2>&1; then + ps_ok=true; break + fi + sleep $(( RANDOM % 2 + 2 )) + done + if [ "$ps_ok" != "true" ]; then + _ft_error "msg='post-switch routing verification failed'" + _ft_snapshot + _restore_nginx_and_slot "$ACTIVE" + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_routing_failed" + fi + _ft_log "msg='post-switch routing verification passed'" + + # Post-switch upstream verification (direct container probe) + if ! 
docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \ + -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/health" >/dev/null 2>&1; then + _ft_error "msg='post-switch upstream verification failed' container=$INACTIVE_NAME" + _ft_snapshot + _restore_nginx_and_slot "$ACTIVE" + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_upstream_failed" + fi + _ft_log "msg='post-switch upstream verified' container=$INACTIVE_NAME" + + # Public health check via nginx + local pub_passed=false + if docker run --rm --network api_network "$_FT_CURL_IMG" \ + -sfk --max-time 10 "https://nginx/health" 2>/dev/null | grep -q '"status":"ok"'; then + pub_passed=true + _ft_log "msg='public health check passed' container=$INACTIVE_NAME" + else + _ft_log "msg='public health check failed' container=$INACTIVE_NAME" + fi + + # Container alignment check + local nginx_container + nginx_container=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null \ + | grep -oE 'api-blue|api-green' | head -1 || echo "") + if [ -n "$nginx_container" ] && [ "$nginx_container" != "$INACTIVE_NAME" ]; then + _ft_log "level=ERROR msg='nginx container mismatch' expected=$INACTIVE_NAME actual=$nginx_container" + pub_passed=false + fi + + if [ "$pub_passed" != "true" ]; then + _ft_state "ROLLBACK" "reason='public health check failed'" + _ft_snapshot + _restore_nginx_and_slot "$ACTIVE" + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + + # If ACTIVE_NAME still healthy, no need for image rollback + if docker ps --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then + local ah + ah=$(_ft_net_curl_out "$ACTIVE_NAME" \ + -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/health") + if echo "$ah" | grep -q '"status":"ok"' 2>/dev/null; then + _ft_log "msg='active container still healthy — no image rollback needed' container=$ACTIVE_NAME" + _ft_exit 1 
"DEPLOY_FAILED_SAFE" "reason=public_health_check_failed active_healthy=true" + fi + fi + + _ft_log "msg='system degraded — triggering image rollback'" + _trigger_internal_rollback "public_health_check_failed" + fi + + # Stability check (post-switch settle verification) + _ft_state "STABILITY_CHECK" "msg='post-switch stability check'" + _ft_phase_start "STABILITY_CHECK" + sleep 5 + + if _ft_check_external_ready; then + _ft_log "msg='stability check passed' url=https://$API_HOSTNAME/health" + _ft_phase_end "STABILITY_CHECK" + else + _ft_log "level=ERROR msg='stability check failed — service regressed after initial pass'" + _ft_snapshot + _restore_nginx_and_slot "$ACTIVE" + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + + if docker ps --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then + local ah + ah=$(_ft_net_curl_out "$ACTIVE_NAME" \ + -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/health") + if echo "$ah" | grep -q '"status":"ok"' 2>/dev/null; then + _ft_log "msg='active container healthy after stability failure' container=$ACTIVE_NAME" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=stability_check_failed active_healthy=true" + fi + fi + _trigger_internal_rollback "stability_check_failed" + fi +} + +# Restore nginx to backup config and write the previous slot. +# Called from verify_routing on route/stability failure. +_restore_nginx_and_slot() { + local prev_slot="$1" + _ft_log "msg='restoring previous nginx config' slot=$prev_slot" + cp "$NGINX_BACKUP" "$NGINX_CONF" + if docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1; then + _ft_log "msg='nginx restored'" + else + _ft_log "level=ERROR msg='nginx restore failed — check manually'" + fi + _ft_write_slot "$prev_slot" +} + +# Release lock and exec deploy.sh --rollback --auto as a subprocess. +# This is the internal failure path — separate from the user-facing rollback(). 
+_trigger_internal_rollback() { + local reason="$1" + if [ "${API_ROLLBACK_IN_PROGRESS:-0}" != "1" ]; then + _ft_error "msg='ROLLBACK triggered' reason=$reason" + export API_ROLLBACK_IN_PROGRESS=1 + _ft_release_lock + if ! "$SCRIPT_DIR/deploy.sh" --rollback --auto; then + _ft_snapshot + _ft_exit 2 "DEPLOY_FAILED_FATAL" "reason=${reason}_and_rollback_failed" + fi + _ft_exit 1 "DEPLOY_FAILED_ROLLBACK" "reason=$reason msg='rollback succeeded'" + else + _ft_log "msg='nested rollback guard reached — stopping'" + _ft_exit 1 "DEPLOY_FAILED_FATAL" "reason=nested_rollback_guard" + fi +} + +# --------------------------------------------------------------------------- +# cleanup_old — gracefully stop and rename the previously-active container +# --------------------------------------------------------------------------- +cleanup_old() { + _ft_state "CLEANUP" "msg='stopping previous container' name=$ACTIVE_NAME" + + if ! docker ps --format '{{.Names}}' | grep -q "^$ACTIVE_NAME$"; then + _ft_log "msg='previous container already gone — skipping cleanup' name=$ACTIVE_NAME" + return 0 + fi + + docker stop --time 10 "$ACTIVE_NAME" 2>/dev/null || true + local ts + ts=$(date +%s) + docker rename "$ACTIVE_NAME" "${ACTIVE_NAME}-old-${ts}" 2>/dev/null \ + || docker rm "$ACTIVE_NAME" || true + _ft_log "msg='previous container stopped + renamed' name=$ACTIVE_NAME rename=${ACTIVE_NAME}-old-${ts}" +} + +# --------------------------------------------------------------------------- +# success — truth check, last-known-good snapshot, deploy history +# --------------------------------------------------------------------------- +success() { + _ft_state "SUCCESS" "msg='deployment complete' container=$INACTIVE_NAME sha=$IMAGE_SHA slot=$INACTIVE" + + # Truth check + local truth_ok=true + + # 1. 
Slot file + if [ -f "$ACTIVE_SLOT_FILE" ]; then + local sv + sv=$(tr -d '[:space:]' < "$ACTIVE_SLOT_FILE") + if [ "$sv" != "$INACTIVE" ]; then + _ft_log "level=ERROR msg='truth check: slot mismatch' expected=$INACTIVE actual=$sv" + truth_ok=false + else + _ft_log "msg='truth check: slot correct' slot=$sv" + fi + else + _ft_log "level=ERROR msg='truth check: slot file missing'" + truth_ok=false + fi + + # 2. nginx upstream + local nginx_up + nginx_up=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null \ + | grep -oE 'api-blue|api-green' | head -1 || echo "") + if [ -n "$nginx_up" ] && [ "$nginx_up" != "$INACTIVE_NAME" ]; then + _ft_log "level=ERROR msg='truth check: nginx upstream mismatch' expected=$INACTIVE_NAME actual=$nginx_up" + truth_ok=false + else + _ft_log "msg='truth check: nginx upstream correct' container=${nginx_up:-unknown}" + fi + + # 3. Internal + external endpoint health + sleep 2 + local int_ok=false ext_ok=false + + local int_resp + int_resp=$(_ft_net_curl_out "$INACTIVE_NAME" \ + -s --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/health") + echo "$int_resp" | grep -q '"status":"ok"' 2>/dev/null && int_ok=true + _ft_log "msg='truth check: internal endpoint' ok=$int_ok url=http://$INACTIVE_NAME:$APP_PORT/health" + + local ext_latency_ms=0 + for _sa in 1 2 3; do + local t0 t1 + t0=$(date +%s%3N) + if docker run --rm --network api_network "$_FT_CURL_IMG" \ + -sk --max-time 3 "https://nginx/health" 2>/dev/null | grep -q '"status":"ok"'; then + t1=$(date +%s%3N) + ext_latency_ms=$(( t1 - t0 )) + ext_ok=true; break + fi + [ "$_sa" -lt 3 ] && sleep $(( RANDOM % 3 + 5 )) + done + + _ft_log "msg='truth check: external endpoint' ok=$ext_ok latency_ms=$ext_latency_ms url=https://$API_HOSTNAME/health" + if [ "$ext_latency_ms" -gt 500 ]; then + _ft_log "level=WARN msg='SLO warning: high latency' latency_ms=$ext_latency_ms threshold_ms=500" + fi + + if [ "$int_ok" = "true" ] && [ "$ext_ok" = "false" ]; then + _ft_log "level=ERROR msg='truth 
check FAILED: internal ok but external unreachable (nginx/proxy/DNS/TLS issue)'" + truth_ok=false + fi + if [ "$int_ok" = "false" ] || [ "$ext_ok" = "false" ]; then + [ "$truth_ok" = "true" ] && _ft_log "level=ERROR msg='truth check FAILED: endpoint(s) not healthy' int=$int_ok ext=$ext_ok" + truth_ok=false + fi + + if [ "$truth_ok" != "true" ]; then + _ft_state "FAILURE" "reason='post_deployment_truth_check_failed'" + _ft_snapshot + exit 2 + fi + + # Last-known-good snapshot (atomic) + _ft_log "msg='recording last-known-good' slot=$INACTIVE container=$INACTIVE_NAME" + local snap_tmp + snap_tmp=$(mktemp "${SNAP_DIR}/last-good.XXXXXX") + printf 'slot=%s container=%s ts=%s\n' "$INACTIVE" "$INACTIVE_NAME" "$(date -Iseconds)" > "$snap_tmp" + mv "$snap_tmp" "$LAST_GOOD_FILE" + + # Deploy history (rolling, atomic) + local hist_tmp="${DEPLOY_HISTORY}.tmp.$$" + if [ -f "$DEPLOY_HISTORY" ]; then + (echo "$IMAGE_SHA"; head -n $(( MAX_HISTORY - 1 )) "$DEPLOY_HISTORY") > "$hist_tmp" + else + echo "$IMAGE_SHA" > "$hist_tmp" + fi + mv "$hist_tmp" "$DEPLOY_HISTORY" + _ft_log "msg='deploy history updated' sha=$IMAGE_SHA" + + # Zombie purge + _ft_log "msg='running zombie purge'" + docker ps -a --format '{{.Names}}' \ + | grep -E '^api-(blue|green)-old-[0-9]+$' \ + | xargs -r docker rm -f 2>/dev/null || true + + _ft_final_state "$INACTIVE_NAME" "$IMAGE_SHA" + _ft_github_summary "✅ SUCCESS" "$INACTIVE_NAME" "$IMAGE_SHA" +} + +# --------------------------------------------------------------------------- +# main — full blue-green deploy flow +# --------------------------------------------------------------------------- +main() { + _ft_acquire_lock + + # Validate SHA in deploy mode (not needed for rollback — resolved before calling main) + if [ -z "$IMAGE_SHA" ] || [ "$IMAGE_SHA" = "latest" ]; then + printf '[DEPLOY] ts=%s state=INIT level=ERROR msg="image SHA required"\n' \ + "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2 + exit 2 + fi + + IMAGE="ghcr.io/fieldtrack-tech/api:$IMAGE_SHA" + 
_ft_log "msg='deploy started' mode=$MODE sha=$IMAGE_SHA deploy_id=$DEPLOY_ID pid=$$ start_ts=$START_TS" + + preflight + ensure_network + ensure_nginx + pull_image + + # BOOTSTRAP: first deploy when no api containers exist + if ! docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then + _ft_state "BOOTSTRAP" "msg='no api containers — first deploy'" + # Initialize globals required by downstream functions + ACTIVE="green"; ACTIVE_NAME=$GREEN_NAME; INACTIVE="blue"; INACTIVE_NAME=$BLUE_NAME + DEPLOY_HISTORY="${DEPLOY_HISTORY:-$DEPLOY_ROOT/.deploy_history}" + NGINX_BACKUP="$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" + + docker rm -f api-blue 2>/dev/null || true + start_inactive + health_check_internal + # Write nginx config directly for first deploy (no backup to restore) + mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" + local boot_tmp; boot_tmp="$(mktemp /tmp/api-nginx-boot.XXXXXX.conf)" + sed -e "s|__ACTIVE_CONTAINER__|$INACTIVE_NAME|g" \ + -e "s|__API_HOSTNAME__|${API_HOSTNAME}|g" \ + "$NGINX_TEMPLATE" > "$boot_tmp" + cp "$boot_tmp" "$NGINX_CONF" + rm -f "$boot_tmp" + local net_check + net_check=$(docker inspect nginx \ + --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") + if ! 
echo "$net_check" | grep -q "$NETWORK"; then + _ft_log "level=ERROR msg='bootstrap: nginx not on api_network' networks=${net_check}" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_network_mismatch_bootstrap" + fi + local nt_out + nt_out=$(docker exec nginx nginx -t 2>&1) || { + printf '%s\n' "$nt_out" >&2 + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_config_test_failed_bootstrap" + } + docker exec nginx nginx -s reload >/dev/null 2>&1 \ + || _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_reload_failed_bootstrap" + _ft_log "msg='bootstrap: nginx reloaded'" + _ft_write_slot "blue" + local snap_tmp; snap_tmp=$(mktemp "${SNAP_DIR}/last-good.XXXXXX") + printf 'slot=blue container=api-blue ts=%s\n' "$(date -Iseconds)" > "$snap_tmp" + mv "$snap_tmp" "$LAST_GOOD_FILE" + # Deploy history + DEPLOY_HISTORY="${DEPLOY_HISTORY:-$DEPLOY_ROOT/.deploy_history}" + local hist_tmp="${DEPLOY_HISTORY}.tmp.$$" + echo "$IMAGE_SHA" > "$hist_tmp" + mv "$hist_tmp" "$DEPLOY_HISTORY" + _ft_exit 0 "BOOTSTRAP_SUCCESS" "slot=blue image=$IMAGE" + fi + + # Normal deploy path + resolve_slot + idempotency_check + start_inactive + health_check_internal + switch_nginx + verify_routing + cleanup_old + success + + _ft_exit 0 "DEPLOY_SUCCESS" "sha=$IMAGE_SHA container=$INACTIVE_NAME slot=$INACTIVE" +} + +# --------------------------------------------------------------------------- +# rollback — restore previous SHA from deploy history +# --------------------------------------------------------------------------- +rollback() { + _ft_log "msg='rollback initiated' mode=${MODE} auto=$AUTO_MODE" + + if [ ! -f "$DEPLOY_HISTORY" ] || [ ! 
-s "$DEPLOY_HISTORY" ]; then + printf '[ERROR] No deployment history found: %s\n' "$DEPLOY_HISTORY" >&2 + exit 1 + fi + + mapfile -t HISTORY < "$DEPLOY_HISTORY" + if [ "${#HISTORY[@]}" -lt 2 ]; then + printf '[ERROR] Need at least two deployments to rollback (history has %d entries)\n' \ + "${#HISTORY[@]}" >&2 + exit 1 + fi + + local current_sha="${HISTORY[0]}" + local previous_sha="${HISTORY[1]}" + + printf '=========================================\n' + printf 'FieldTrack Rollback\n' + printf '=========================================\n' + printf 'Current deployment : %s\n' "$current_sha" + printf 'Rollback target : %s\n' "$previous_sha" + printf '\n' + + printf 'Validating rollback image exists...\n' + if ! docker manifest inspect "ghcr.io/fieldtrack-tech/api:$previous_sha" >/dev/null 2>&1; then + printf '[ERROR] Rollback image not found in registry: ghcr.io/fieldtrack-tech/api:%s\n' "$previous_sha" >&2 + exit 1 + fi + printf '✓ Rollback image verified.\n\n' + + if [ "$AUTO_MODE" = "false" ]; then + printf '⚠️ WARNING: This will replace the current deployment.\n' + read -r -p "Continue with rollback? (yes/no): " REPLY + if [[ ! 
$REPLY =~ ^[Yy][Ee][Ss]$ ]]; then + printf 'Rollback cancelled.\n' + exit 0 + fi + else + printf 'Auto rollback mode (CI).\n' + fi + + printf '\nStarting rollback to: %s\n\n' "$previous_sha" + export API_ROLLBACK_IN_PROGRESS=1 + IMAGE_SHA="$previous_sha" + main + + printf '\n=========================================\n' + printf 'Rollback completed: %s\n' "$previous_sha" + printf '=========================================\n' +} + +# =========================================================================== +# CONSTANTS (loaded after function definitions but before execution) +# =========================================================================== +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" +[ -d "$DEPLOY_ROOT" ] || { printf '[ERROR] DEPLOY_ROOT not found: %s\n' "$DEPLOY_ROOT" >&2; exit 1; } +REPO_DIR="$DEPLOY_ROOT" +INFRA_ROOT="${INFRA_ROOT:-/opt/infra}" + +BLUE_NAME="api-blue" +GREEN_NAME="api-green" +APP_PORT=3000 +NETWORK="api_network" +_FT_CURL_IMG="curlimages/curl:8.7.1" + +SLOT_DIR="/var/run/api" +ACTIVE_SLOT_FILE="$SLOT_DIR/active-slot" +SLOT_BACKUP_FILE="/var/lib/api/active-slot.backup" # persistent, survives reboots + +NGINX_CONF="$INFRA_ROOT/nginx/live/api.conf" +NGINX_LIVE_DIR="$INFRA_ROOT/nginx/live" +NGINX_BACKUP_DIR="$INFRA_ROOT/nginx/backup" +NGINX_TEMPLATE="$INFRA_ROOT/nginx/api.conf" +NGINX_BACKUP="" # set inside switch_nginx() + +MAX_HISTORY=5 +MAX_HEALTH_ATTEMPTS=40 +HEALTH_INTERVAL=3 + +LOCK_FILE="$SLOT_DIR/deploy.lock" +SNAP_DIR="$SLOT_DIR" +LAST_GOOD_FILE="$SNAP_DIR/last-good" + +# DEPLOY_HISTORY is set inside preflight() after _ft_load_env() +DEPLOY_HISTORY="" + +# ACTIVE/INACTIVE are set inside resolve_slot() +ACTIVE="" ACTIVE_NAME="" INACTIVE="" INACTIVE_NAME="" + +# IMAGE is set inside main() +IMAGE="" + +# =========================================================================== +# ENTRY POINT +# =========================================================================== 
+_ft_log "msg='deploy.sh invoked' mode=$MODE auto=$AUTO_MODE sha=${IMAGE_SHA:-} pid=$$" + +if [ "$MODE" = "rollback" ]; then + # For rollback we need env loaded early to find DEPLOY_HISTORY + _ft_load_env + DEPLOY_HISTORY="$DEPLOY_ROOT/.deploy_history" + rollback +else + main +fi diff --git a/scripts/load-env.sh b/scripts/load-env.sh deleted file mode 100644 index a00b5af..0000000 --- a/scripts/load-env.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env bash -# --------------------------------------------------------------------------- -# load-env.sh — Centralised environment loader for FieldTrack deploy scripts -# -# Source this file at the start of every deploy/rollback script: -# source "$(dirname "${BASH_SOURCE[0]}")/load-env.sh" -# -# After sourcing, the following are exported into the caller's environment: -# DEPLOY_ROOT — absolute path to the repository root on the VPS -# ENV_FILE — absolute path to .env -# API_HOSTNAME — bare hostname derived from API_BASE_URL (no scheme/path) -# -# All KEY=VALUE pairs from .env are also exported into the caller's -# process, so downstream scripts can reference any app env var directly. -# --------------------------------------------------------------------------- -set -euo pipefail - -# Disable trace to prevent secrets from leaking into logs -set +x 2>/dev/null || true - -# Derive repo root from this script's own location so the loader works -# regardless of the current working directory when it is sourced. -_LES_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -_LES_REPO="$(cd "$_LES_DIR/.." && pwd)" - -# ── DEPLOY_ROOT ───────────────────────────────────────────────────────────── -# Prefer an already-exported value (e.g. set explicitly by the CI SSH step); -# default to the canonical VPS deployment path under the current user's home. 
-export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - -[ -d "$DEPLOY_ROOT" ] || { - echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT" - echo " Expected repository root at: $HOME/api" - echo " If your repo is elsewhere, export DEPLOY_ROOT before running scripts." - if [ -d "$_LES_REPO" ]; then - echo " Detected script-relative repo candidate: $_LES_REPO" - fi - exit 1 -} - -# ── ENV_FILE ───────────────────────────────────────────────────────────────── -export ENV_FILE="$DEPLOY_ROOT/.env" - -if [ ! -f "$ENV_FILE" ]; then - echo "❌ Required .env file not found: $ENV_FILE" - echo " Create it from .env.example and populate all required values." - exit 1 -fi -echo "✓ .env file exists: $ENV_FILE" - -# ── Load all variables from .env ───────────────────────────────────────────── -# allexport is enabled so every KEY=VALUE assignment is automatically exported; -# disabled immediately after to avoid exporting any later shell variables. -set -o allexport -# shellcheck source=/dev/null -source "$ENV_FILE" -set +o allexport - -# ── Validate required variables ────────────────────────────────────────────── -_LES_MISSING="" -for _LES_VAR in API_BASE_URL CORS_ORIGIN; do - eval "_LES_VAL=\"\${${_LES_VAR}:-}\"" - if [ -z "$_LES_VAL" ]; then - _LES_MISSING="${_LES_MISSING} - ${_LES_VAR}\n" - fi -done - -if [ -n "$_LES_MISSING" ]; then - echo "❌ Missing required variables in $ENV_FILE:" - printf "%b" "$_LES_MISSING" - exit 1 -fi - -echo "✓ API_BASE_URL is set" -echo "✓ CORS_ORIGIN is set" - -# ── Derive API_HOSTNAME from API_BASE_URL ──────────────────────────────────── -# Use bash-safe parsing (no Node.js dependency for VPS compatibility) -# Strip protocol (http:// or https://) and take first path segment -API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1) - -# Validate: result must be a non-empty bare hostname (or host:port). 
-# Reject if it contains whitespace, path separators, credential markers (@), -# or query/fragment characters — any of these indicate a malformed API_BASE_URL. -if [ -z "$API_HOSTNAME" ] || printf '%s' "$API_HOSTNAME" | grep -qE '[[:space:]/@?#]'; then - echo "❌ Invalid API_HOSTNAME derived from API_BASE_URL='$API_BASE_URL'" - echo " Expected a bare hostname or host:port — e.g.: api.example.com" - echo " Got: '$API_HOSTNAME'" - echo " Check that API_BASE_URL has no embedded credentials, spaces, or bare paths." - exit 1 -fi - -export API_HOSTNAME -echo "✓ API_HOSTNAME: $API_HOSTNAME" - -# Clean up internal variables so they do not leak into the caller's scope. -unset _LES_DIR _LES_REPO _LES_VAR _LES_VAL _LES_MISSING diff --git a/scripts/load-testing/README.md b/scripts/load-testing/README.md deleted file mode 100644 index e212f9a..0000000 --- a/scripts/load-testing/README.md +++ /dev/null @@ -1,127 +0,0 @@ -# FieldTrack Phase 24 — Load Testing - -Load tests are written for [k6](https://k6.io/) — a modern open-source load testing tool. 
- -## Prerequisites - -Install k6: https://k6.io/docs/getting-started/installation/ - -```bash -# macOS -brew install k6 - -# Windows (winget) -winget install k6 - -# Linux (Debian/Ubuntu) -sudo gpg -k && sudo gpg --no-default-keyring --keyring /usr/share/keyrings/k6-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys C5AD17C747E3415A3642D57D77C6C491D6AC1D69 -echo "deb [signed-by=/usr/share/keyrings/k6-archive-keyring.gpg] https://dl.k6.io/deb stable main" | sudo tee /etc/apt/sources.list.d/k6.list -sudo apt-get update && sudo apt-get install k6 -``` - -## Environment Variables - -| Variable | Description | -|------------------|------------------------------------| -| `BASE_URL` | API base URL (default: prod) | -| `ADMIN_TOKEN` | Valid admin JWT | -| `EMPLOYEE_TOKEN` | Valid employee JWT | - -## Scripts - -### `dashboard-load-test.js` -Simulates **50 concurrent admins** polling `/admin/dashboard` and `/admin/sessions`. - -**Targets:** dashboard p95 < 1000 ms · sessions p95 < 800 ms · error rate < 1% - -> **Phase 24 note:** The dashboard now uses a single indexed `org_dashboard_snapshot` PK lookup. -> The tighter p95 < 100 ms target from Phase 22 has been replaced with a realistic 1000 ms budget -> that accounts for cold-cache misses and network latency. - -```bash -k6 run dashboard-load-test.js \ - -e BASE_URL=https://api.fieldtrack.meowsician.tech \ - -e ADMIN_TOKEN= -``` - ---- - -### `map-load-test.js` -Simulates **20 concurrent monitoring clients** polling `/admin/monitoring/map` every 30 seconds. - -**Target:** p95 < 200 ms · error rate < 1% - -```bash -k6 run map-load-test.js \ - -e BASE_URL=https://api.fieldtrack.meowsician.tech \ - -e ADMIN_TOKEN= -``` - ---- - -### `expenses-load-test.js` -Simulates **100 concurrent employees** submitting expense claims and listing their expenses. - -**Targets:** POST p95 < 300 ms · GET p95 < 200 ms · error rate < 1% - -> **Warning:** writes real data — use a staging environment or clean up afterward. 
- -```bash -k6 run expenses-load-test.js \ - -e BASE_URL=https://api.fieldtrack.meowsician.tech \ - -e EMPLOYEE_TOKEN= -``` - ---- - -### `queue-impact-test.js` -Simulates a **burst of 30 concurrent checkouts** to stress the BullMQ worker queues, then monitors `/admin/queues` for 2 minutes to verify the backlog drains. - -**Targets:** checkout p95 < 400 ms · analytics queue depth < 500 · DLQ < 10 - -```bash -k6 run queue-impact-test.js \ - -e BASE_URL=https://api.fieldtrack.meowsician.tech \ - -e EMPLOYEE_TOKEN= \ - -e ADMIN_TOKEN= -``` - -## API Response Structure - -All scripts parse JSON bodies. The API always returns an envelope: - -| Endpoint | Shape | -|---|---| -| `GET /admin/dashboard` | `{ success: true, data: { activeEmployeeCount, recentEmployeeCount, ... } }` | -| `GET /admin/sessions` | `{ success: true, data: SessionDTO[], pagination: { page, limit, total } }` | -| `GET /admin/monitoring/map` | `{ success: true, data: EmployeeMapMarker[] }` | -| `POST /expenses` | `{ success: true, data: { id, amount, description, ... } }` | -| `GET /expenses/my` | `{ success: true, data: Expense[], pagination: { page, limit, total } }` | -| `GET /admin/queues` | `{ success: true, queues: { analytics: { waiting, active, completed, failed, dlq }, distance: { ... } } }` | - -> **Note:** `pagination` appears at the response root alongside `data`, not nested inside `data`. -> The `/admin/queues` endpoint uses a `queues` key instead of `data`. - -## Metrics and Error Rate - -All scripts maintain two categories of checks: - -- **Correctness checks** (feed `error_rate`): HTTP status code + `success === true` + required body fields. - A request only increments `error_rate` when the API returns the wrong status or a malformed body. -- **Latency checks** (observability only): Response time assertions inside a separate `check()` call - that does **not** feed `error_rate`. Slow-but-correct responses do not inflate the error counter. 
- -This means `error_rate < 0.01` measures real API failures, not congestion. - -## Running All Tests Sequentially - -```bash -BASE_URL=https://api.fieldtrack.meowsician.tech -ADMIN_TOKEN= -EMPLOYEE_TOKEN= - -for script in dashboard-load-test.js map-load-test.js expenses-load-test.js queue-impact-test.js; do - echo "=== Running $script ===" - k6 run "$script" -e BASE_URL="$BASE_URL" -e ADMIN_TOKEN="$ADMIN_TOKEN" -e EMPLOYEE_TOKEN="$EMPLOYEE_TOKEN" -done -``` diff --git a/scripts/load-testing/dashboard-load-test.js b/scripts/load-testing/dashboard-load-test.js deleted file mode 100644 index 4fa8934..0000000 --- a/scripts/load-testing/dashboard-load-test.js +++ /dev/null @@ -1,124 +0,0 @@ -/** - * FieldTrack Phase 23 — Dashboard Load Test - * - * Simulates 50 concurrent admin users polling the dashboard and sessions - * endpoints over a 2-minute steady state period. - * - * Run: - * k6 run dashboard-load-test.js \ - * -e BASE_URL=https://api.getfieldtrack.app \ - * -e ADMIN_TOKEN= - * - * Performance targets: - * p95 latency < 1000 ms (/admin/dashboard) - * p95 latency < 800 ms (/admin/sessions) - * error rate < 1 % - * - * NOTE on rate limiting: - * All 50 VUs share a single ADMIN_TOKEN, so they appear as ONE user to the - * per-token rate limiter (1200 req/min). 50 VUs × ~12 req/min ≈ 600 req/min - * — comfortably within budget. In production, 50 real admins would each hold - * their own token and each get the full 1200 req/min quota. 
- */ - -import http from "k6/http"; -import { check, sleep } from "k6"; -import { Trend, Rate, Counter } from "k6/metrics"; - -// ─── Custom metrics ───────────────────────────────────────────────────────── - -const dashboardDuration = new Trend("dashboard_duration_ms", true); -const sessionsDuration = new Trend("sessions_duration_ms", true); -const errorRate = new Rate("error_rate"); -const requestsTotal = new Counter("requests_total"); - -// ─── Test options ──────────────────────────────────────────────────────────── - -export const options = { - scenarios: { - dashboard_polling: { - executor: "constant-vus", - vus: 50, - duration: "2m", - }, - }, - thresholds: { - // Performance targets updated in Phase 24 (O(1) snapshot query) - dashboard_duration_ms: ["p(95)<1000"], - sessions_duration_ms: ["p(95)<800"], - error_rate: ["rate<0.01"], - http_req_failed: ["rate<0.01"], - }, -}; - -// ─── Helpers ───────────────────────────────────────────────────────────────── - -const BASE_URL = __ENV.BASE_URL || "https://api.getfieldtrack.app"; -const ADMIN_TOKEN = __ENV.ADMIN_TOKEN || ""; - -function authHeaders() { - return { - Authorization: `Bearer ${ADMIN_TOKEN}`, - "Content-Type": "application/json", - "Accept-Encoding": "gzip, br", - }; -} - -// ─── Default scenario ───────────────────────────────────────────────────────── - -export default function () { - const headers = authHeaders(); - - // ── /admin/dashboard ───────────────────────────────────────────────────── - const dashRes = http.get(`${BASE_URL}/admin/dashboard`, { headers, tags: { name: "admin_dashboard" } }); - requestsTotal.add(1); - dashboardDuration.add(dashRes.timings.duration); - - // Correctness check — only logical failures increment error_rate - const dashOk = check(dashRes, { - "dashboard status 200": (r) => r.status === 200, - "dashboard response is success": (r) => { - try { return JSON.parse(r.body).success === true; } catch { return false; } - }, - "dashboard has activeEmployeeCount": (r) => { - 
try { - const body = JSON.parse(r.body); - return typeof body.data?.activeEmployeeCount === "number"; - } catch { - return false; - } - }, - }); - // Latency check — observability only, does not affect error_rate - check(dashRes, { "dashboard response time < 500ms": (r) => r.timings.duration < 500 }); - errorRate.add(!dashOk); - - sleep(0.5); - - // ── /admin/sessions ────────────────────────────────────────────────────── - const sessRes = http.get(`${BASE_URL}/admin/sessions?limit=50`, { headers, tags: { name: "admin_sessions" } }); - requestsTotal.add(1); - sessionsDuration.add(sessRes.timings.duration); - - // Correctness check — only logical failures increment error_rate - const sessOk = check(sessRes, { - "sessions status 200": (r) => r.status === 200, - "sessions response is success": (r) => { - try { return JSON.parse(r.body).success === true; } catch { return false; } - }, - "sessions has pagination": (r) => { - try { - const body = JSON.parse(r.body); - return typeof body.pagination?.total === "number"; - } catch { - return false; - } - }, - }); - // Latency check — observability only, does not affect error_rate - check(sessRes, { "sessions response time < 500ms": (r) => r.timings.duration < 500 }); - errorRate.add(!sessOk); - - // Simulate realistic admin polling cadence — 5 s between full refreshes - sleep(5); -} diff --git a/scripts/load-testing/expenses-load-test.js b/scripts/load-testing/expenses-load-test.js deleted file mode 100644 index c839411..0000000 --- a/scripts/load-testing/expenses-load-test.js +++ /dev/null @@ -1,134 +0,0 @@ -/** - * FieldTrack Phase 23 — Expense Workflow Load Test - * - * Simulates 100 concurrent employees submitting expense claims and then - * retrieving their expense list. Validates that the API remains responsive - * under realistic bulk-submission conditions (e.g. end-of-month expense flush). 
- * - * Run: - * k6 run expenses-load-test.js \ - * -e BASE_URL=https://api.getfieldtrack.app \ - * -e EMPLOYEE_TOKEN= - * - * NOTE: This test writes real data. Run against a staging environment or clean - * up submitted expenses afterwards via the Supabase dashboard / admin API. - * - * Performance targets: - * POST /expenses p95 < 300 ms - * GET /expenses/my p95 < 200 ms - * error rate < 1 % - */ - -import http from "k6/http"; -import { check, sleep } from "k6"; -import { Trend, Rate, Counter } from "k6/metrics"; - -// ─── Custom metrics ───────────────────────────────────────────────────────── - -const submitDuration = new Trend("expense_submit_duration_ms", true); -const listDuration = new Trend("expense_list_duration_ms", true); -const errorRate = new Rate("error_rate"); -const requestsTotal = new Counter("requests_total"); - -// ─── Test options ──────────────────────────────────────────────────────────── - -export const options = { - scenarios: { - expense_submission: { - executor: "constant-vus", - vus: 100, - duration: "2m", - }, - }, - thresholds: { - expense_submit_duration_ms: ["p(95)<300"], - expense_list_duration_ms: ["p(95)<200"], - error_rate: ["rate<0.01"], - http_req_failed: ["rate<0.01"], - }, -}; - -// ─── Helpers ───────────────────────────────────────────────────────────────── - -const BASE_URL = __ENV.BASE_URL || "https://api.getfieldtrack.app"; -// Each VU can use the same employee token in a load test (shared org context) -const EMPLOYEE_TOKEN = __ENV.EMPLOYEE_TOKEN || ""; - -function authHeaders() { - return { - Authorization: `Bearer ${EMPLOYEE_TOKEN}`, - "Content-Type": "application/json", - "Accept-Encoding": "gzip, br", - }; -} - -// ─── Default scenario ───────────────────────────────────────────────────────── - -export default function () { - const headers = authHeaders(); - const vu = __VU; - const iter = __ITER; - - // ── POST /expenses — submit a new expense claim ─────────────────────────── - const payload = JSON.stringify({ - 
amount: Math.round((10 + Math.random() * 490) * 100) / 100, - description: `Load test expense — VU ${vu} iteration ${iter}`, - }); - - const submitRes = http.post(`${BASE_URL}/expenses`, payload, { - headers, - tags: { name: "expense_submit" }, - }); - - requestsTotal.add(1); - submitDuration.add(submitRes.timings.duration); - - // Correctness check — only logical failures increment error_rate - const submitOk = check(submitRes, { - "expense submit 201": (r) => r.status === 201, - "expense response is success": (r) => { - try { return JSON.parse(r.body).success === true; } catch { return false; } - }, - "expense has id": (r) => { - try { - const body = JSON.parse(r.body); - return typeof body.data?.id === "string"; - } catch { - return false; - } - }, - }); - // Latency check — observability only, does not affect error_rate - check(submitRes, { "expense submit < 1s": (r) => r.timings.duration < 1000 }); - errorRate.add(!submitOk); - - sleep(1); - - // ── GET /expenses/my — list own expenses ────────────────────────────────── - const listRes = http.get(`${BASE_URL}/expenses/my?limit=20`, { - headers, - tags: { name: "expense_list" }, - }); - - requestsTotal.add(1); - listDuration.add(listRes.timings.duration); - - const listOk = check(listRes, { - "expense list 200": (r) => r.status === 200, - "expense list response is success": (r) => { - try { return JSON.parse(r.body).success === true; } catch { return false; } - }, - "expense list has pagination": (r) => { - try { - const body = JSON.parse(r.body); - return typeof body.pagination?.total === "number"; - } catch { - return false; - } - }, - }); - errorRate.add(!listOk); - - // Simulate realistic inter-request think time - sleep(2 + Math.random() * 3); -} diff --git a/scripts/load-testing/map-load-test.js b/scripts/load-testing/map-load-test.js deleted file mode 100644 index 79a3584..0000000 --- a/scripts/load-testing/map-load-test.js +++ /dev/null @@ -1,92 +0,0 @@ -/** - * FieldTrack Phase 23 — Monitoring Map Load 
Test - * - * Simulates 20 concurrent monitoring clients that poll the live map endpoint - * every 30 seconds, mirroring the production frontend SSE/polling cadence. - * - * Run: - * k6 run map-load-test.js \ - * -e BASE_URL=https://api.getfieldtrack.app \ - * -e ADMIN_TOKEN= - * - * Performance target: - * p95 latency < 200 ms - * error rate < 1 % - */ - -import http from "k6/http"; -import { check, sleep } from "k6"; -import { Trend, Rate, Counter } from "k6/metrics"; - -// ─── Custom metrics ───────────────────────────────────────────────────────── - -const mapDuration = new Trend("map_duration_ms", true); -const errorRate = new Rate("error_rate"); -const requestsTotal = new Counter("requests_total"); - -// ─── Test options ──────────────────────────────────────────────────────────── - -export const options = { - scenarios: { - live_map_polling: { - executor: "constant-vus", - vus: 20, - duration: "3m", - }, - }, - thresholds: { - map_duration_ms: ["p(95)<200"], - error_rate: ["rate<0.01"], - http_req_failed: ["rate<0.01"], - }, -}; - -// ─── Helpers ───────────────────────────────────────────────────────────────── - -const BASE_URL = __ENV.BASE_URL || "https://api.getfieldtrack.app"; -const ADMIN_TOKEN = __ENV.ADMIN_TOKEN || ""; - -function authHeaders() { - return { - Authorization: `Bearer ${ADMIN_TOKEN}`, - "Accept-Encoding": "gzip, br", - }; -} - -// ─── Default scenario ───────────────────────────────────────────────────────── - -export default function () { - const headers = authHeaders(); - - const res = http.get(`${BASE_URL}/admin/monitoring/map`, { - headers, - tags: { name: "monitoring_map" }, - }); - - requestsTotal.add(1); - mapDuration.add(res.timings.duration); - - // Correctness check — only logical failures increment error_rate - const ok = check(res, { - "map status 200": (r) => r.status === 200, - "map response is success": (r) => { - try { return JSON.parse(r.body).success === true; } catch { return false; } - }, - "map has markers array": (r) 
=> { - try { - const body = JSON.parse(r.body); - return Array.isArray(body.data); - } catch { - return false; - } - }, - "map content-encoding compressed": (r) => - r.headers["Content-Encoding"] !== undefined || r.body.length > 0, - }); - // Latency check — observability only, does not affect error_rate - check(res, { "map response time < 500ms": (r) => r.timings.duration < 500 }); - errorRate.add(!ok); - - // Simulate 30-second polling interval (realistic monitoring cadence) - sleep(30); -} diff --git a/scripts/load-testing/queue-impact-test.js b/scripts/load-testing/queue-impact-test.js deleted file mode 100644 index 36e304c..0000000 --- a/scripts/load-testing/queue-impact-test.js +++ /dev/null @@ -1,146 +0,0 @@ -/** - * FieldTrack Phase 23 — Queue Impact Load Test - * - * Simulates a burst of session checkouts to stress the distance and analytics - * worker queues. After the burst, the script polls /admin/queues to watch the - * backlog drain and verify the queues recover within the target SLA. - * - * Run: - * k6 run queue-impact-test.js \ - * -e BASE_URL=https://api.getfieldtrack.app \ - * -e EMPLOYEE_TOKEN= \ - * -e ADMIN_TOKEN= - * - * NOTE: This test checks out real sessions. Pre-create checked-in sessions - * in a staging environment or use the smoke-test helper to seed data first. 
- * - * Metrics monitored: - * analytics_queue_depth — Prometheus gauge via /admin/queues - * checkout latency — POST /attendance/check-out p95 - * queue drain time — how quickly depth returns to 0 - */ - -import http from "k6/http"; -import { check, sleep } from "k6"; -import { Trend, Rate, Counter, Gauge } from "k6/metrics"; - -// ─── Custom metrics ───────────────────────────────────────────────────────── - -const checkoutDuration = new Trend("checkout_duration_ms", true); -const queueDepth = new Gauge("analytics_queue_depth_observed"); -const errorRate = new Rate("error_rate"); -const requestsTotal = new Counter("requests_total"); - -// ─── Test options ──────────────────────────────────────────────────────────── - -export const options = { - scenarios: { - // Phase 1: burst checkout load (simulates end-of-day mass checkout) - checkout_burst: { - executor: "constant-vus", - vus: 30, - duration: "30s", - tags: { phase: "burst" }, - }, - // Phase 2: queue drain monitoring — starts after the burst ends - queue_drain_monitor: { - executor: "constant-vus", - vus: 1, - startTime: "35s", - duration: "2m", - tags: { phase: "monitor" }, - }, - }, - thresholds: { - // Checkout must stay fast even under queue pressure - checkout_duration_ms: ["p(95)<400"], - error_rate: ["rate<0.05"], - http_req_failed: ["rate<0.05"], - }, -}; - -// ─── Helpers ───────────────────────────────────────────────────────────────── - -const BASE_URL = __ENV.BASE_URL || "https://api.getfieldtrack.app"; -const EMPLOYEE_TOKEN = __ENV.EMPLOYEE_TOKEN || ""; -const ADMIN_TOKEN = __ENV.ADMIN_TOKEN || ""; - -function empHeaders() { - return { - Authorization: `Bearer ${EMPLOYEE_TOKEN}`, - "Content-Type": "application/json", - }; -} - -function adminHeaders() { - return { - Authorization: `Bearer ${ADMIN_TOKEN}`, - "Content-Type": "application/json", - }; -} - -// ─── Checkout burst scenario ────────────────────────────────────────────────── - -export function checkoutBurst() { - // POST check-out 
triggers distance + analytics job enqueue - const res = http.post( - `${BASE_URL}/attendance/check-out`, - "{}", - { headers: empHeaders(), tags: { name: "checkout" } }, - ); - - requestsTotal.add(1); - checkoutDuration.add(res.timings.duration); - - // Correctness check — only logical failures increment error_rate - const ok = check(res, { - // 200 = checked out successfully; 409 = no open session (idempotent) - "checkout accepted": (r) => r.status === 200 || r.status === 409, - }); - // Latency check — observability only, does not affect error_rate - check(res, { "checkout < 1s": (r) => r.timings.duration < 1000 }); - errorRate.add(!ok); - - sleep(1); -} - -// ─── Queue drain monitor scenario ───────────────────────────────────────────── - -export function queueDrainMonitor() { - const res = http.get(`${BASE_URL}/admin/queues`, { - headers: adminHeaders(), - tags: { name: "queue_stats" }, - }); - - requestsTotal.add(1); - - if (res.status === 200) { - try { - const body = JSON.parse(res.body); - const analyticsWaiting = body.queues?.analytics?.waiting ?? -1; - const distanceWaiting = body.queues?.distance?.waiting ?? -1; - - queueDepth.add(analyticsWaiting + distanceWaiting); - - check(res, { - "queue depth within SLA (<500)": () => - analyticsWaiting + distanceWaiting < 500, - "no DLQ overflow (<10)": () => - (body.queues?.analytics?.dlq?.waiting ?? 0) < 10, - }); - } catch { /* parse error — log as failure */ } - } - - // Poll every 10 seconds - sleep(10); -} - -// ─── Default function — routes to correct scenario function ─────────────────── -// k6 uses exec tags to map VUs to named functions when using "scenarios" config. -// The default export is only called when no `exec` is specified on a scenario. -// Since we have two named scenarios above, we point each one at its function. - -export default function () { - // Fallback: if run without scenarios config, execute the checkout burst. 
- checkoutBurst(); -} diff --git a/scripts/monitoring-sync.sh b/scripts/monitoring-sync.sh deleted file mode 100644 index 4273f0a..0000000 --- a/scripts/monitoring-sync.sh +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# monitoring-sync.sh — Self-Healing Monitoring Stack Sync -# -# Called by the CI sync-monitoring job after every production deploy. -# -# Responsibilities: -# 1. SELF-HEAL — create missing .env.monitoring from example if absent -# 2. BOOTSTRAP — detect placeholder values and warn (cold-start mode) -# 3. ENSURE NETWORK — create api_network if it does not exist -# 4. SYNC — idempotent `docker compose up -d` (starts if down, no-ops if healthy) -# 5. VALIDATE — confirm prometheus / grafana / alertmanager are running + healthy -# 6. ENFORCE — exit 1 if any required container is not healthy after timeout -# -# Self-healing rules (safe defaults): -# - .env.monitoring missing → copy from infra/.env.monitoring.example + warn -# - .env.monitoring has placeholders (change-me) → skip health wait, warn operator -# - api_network missing → create it -# - alertmanager rendered config missing → render it -# -# Timeouts: -# - Per-container health check: 60 seconds max (20 attempts × 3 s) -# - Polling interval: 3 seconds -# - Total wait tracked to prevent cascading timeouts -# -# Exit codes: -# 0 All required monitoring containers are healthy -# 1 One or more required containers failed to become healthy (deploy must fail) -# -# Required env (exported by load-env.sh / present in DEPLOY_ROOT): -# DEPLOY_ROOT — absolute path to the repository root on the VPS -# ============================================================================= -set -euo pipefail -trap '_ft_mon_trap "$LINENO"' ERR - -# ───────────────────────────────────────────────────────────────────────── -# STATE CLASSIFICATION -# ───────────────────────────────────────────────────────────────────────── 
-DEPLOY_STATE="SUCCESS" -trap '[ $? -ne 0 ] && DEPLOY_STATE="FAILED" || true' EXIT - -# --------------------------------------------------------------------------- -# LOGGING -# --------------------------------------------------------------------------- -_FT_MON_LOG_FILE="${DEPLOY_LOG_FILE:-/var/log/api/deploy.log}" -_LOG_DIR="$(dirname "$_FT_MON_LOG_FILE")" -if ! mkdir -p "$_LOG_DIR" 2>/dev/null; then - _LOG_DIR="$HOME/api/logs" - _FT_MON_LOG_FILE="$_LOG_DIR/deploy.log" - mkdir -p "$_LOG_DIR" -fi - -_log() { - printf '[MON-SYNC] ts=%s %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$*" \ - | tee -a "$_FT_MON_LOG_FILE" >&2 -} - -_ft_mon_trap() { - printf '[MON-SYNC] ts=%s level=ERROR msg="unexpected failure at line %s"\n' \ - "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$1" >&2 -} - -# --------------------------------------------------------------------------- -# RESOLVE PATHS -# --------------------------------------------------------------------------- -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - -if [ ! -d "$DEPLOY_ROOT" ]; then - _log "level=ERROR msg='DEPLOY_ROOT not found' path=$DEPLOY_ROOT" - exit 1 -fi - -INFRA_DIR="$DEPLOY_ROOT/infra" -MON_ENV="$INFRA_DIR/.env.monitoring" -MON_ENV_EXAMPLE="$INFRA_DIR/.env.monitoring.example" -MON_COMPOSE="$INFRA_DIR/docker-compose.monitoring.yml" -ALERTMANAGER_RENDERED="$INFRA_DIR/alertmanager/alertmanager.rendered.yml" -RENDER_SCRIPT="$INFRA_DIR/scripts/render-alertmanager.sh" - -_log "msg='monitoring-sync started' deploy_root=$DEPLOY_ROOT state=$DEPLOY_STATE" - -# --------------------------------------------------------------------------- -# STEP 1 — SELF-HEAL: .env.monitoring -# Create from example if missing instead of failing hard. -# The user MUST still fill in real values after first-time creation. -# --------------------------------------------------------------------------- -BOOTSTRAP_MODE=false -if [ ! 
-f "$MON_ENV" ]; then - if [ -f "$MON_ENV_EXAMPLE" ]; then - cp "$MON_ENV_EXAMPLE" "$MON_ENV" - chmod 600 "$MON_ENV" - BOOTSTRAP_MODE=true - _log "level=WARN msg='monitoring env file missing — created from example' path=$MON_ENV" - _log "level=WARN msg='ACTION REQUIRED: edit $MON_ENV with real GRAFANA_ADMIN_PASSWORD, METRICS_SCRAPE_TOKEN, ALERTMANAGER_SLACK_WEBHOOK'" - else - _log "level=ERROR msg='monitoring env file and example both missing' path=$MON_ENV" - DEPLOY_STATE="FAILED" - exit 1 - fi -else - chmod 600 "$MON_ENV" - _log "msg='monitoring env file exists' path=$MON_ENV" -fi - -# ───────────────────────────────────────────────────────────────────────── -# STEP 1B — BOOTSTRAP MODE: Detect placeholders -# If .env.monitoring contains default 'change-me' values, we're in cold-start. -# Skip health polling to avoid timeout on misconfigured system. -# ───────────────────────────────────────────────────────────────────────── -if grep -q "change-me" "$MON_ENV" 2>/dev/null; then - BOOTSTRAP_MODE=true - _log "level=WARN msg='bootstrap mode detected: .env.monitoring contains placeholder values' action='skipping health check'" - _log "level=WARN msg='OPERATOR ACTION: edit infra/.env.monitoring and set real values, then re-run deploy'" -fi - -# --------------------------------------------------------------------------- -# STEP 2 — SELF-HEAL: Docker network api_network -# --------------------------------------------------------------------------- -if ! docker network ls --format '{{.Name}}' | grep -Eq '^api_network$'; then - _log "msg='api_network missing — creating' driver=bridge" - docker network create --driver bridge api_network - _log "msg='api_network created'" -else - _log "msg='api_network exists'" -fi - -# --------------------------------------------------------------------------- -# STEP 3 — SELF-HEAL: Render alertmanager config -# render-alertmanager.sh is idempotent; always safe to run. 
-# --------------------------------------------------------------------------- -if [ -x "$RENDER_SCRIPT" ]; then - _log "msg='rendering alertmanager config'" - bash "$RENDER_SCRIPT" - _log "msg='alertmanager config rendered' file=$ALERTMANAGER_RENDERED" -elif [ ! -f "$ALERTMANAGER_RENDERED" ]; then - _log "level=ERROR msg='render-alertmanager.sh not found AND rendered config missing' script=$RENDER_SCRIPT" - exit 1 -else - _log "level=WARN msg='render-alertmanager.sh not found but rendered config exists — continuing' script=$RENDER_SCRIPT" -fi - -# --------------------------------------------------------------------------- -# STEP 4 — SYNC: docker compose up -d (idempotent) -# Creates containers that are missing; leaves healthy containers untouched. -# --------------------------------------------------------------------------- -_log "msg='starting monitoring stack (idempotent)'" -cd "$INFRA_DIR" -docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml up -d --remove-orphans -cd "$DEPLOY_ROOT" -_log "msg='docker compose up -d complete'" - -# --------------------------------------------------------------------------- -# STEP 5 — VALIDATE: wait for required containers to become healthy -# -# Required containers (must be healthy for deploy to succeed): -# prometheus — metrics collection (health: http://prometheus:9090/-/healthy) -# alertmanager — alert routing (health: http://alertmanager:9093/-/healthy) -# grafana — dashboards (health: http://grafana:3000/api/health) -# -# Strategy: poll docker inspect for Health.Status via Docker service DNS. -# Times out at 60 s per container (20 attempts × 3 s). -# Note: Using service names (not localhost) because containers are in Docker network only. 
-# --------------------------------------------------------------------------- - -_wait_container_healthy() { - local name="$1" - local max_wait_sec="${2:-60}" - local interval="${3:-3}" - - _log "msg='waiting for container health' container=$name max_wait_sec=$max_wait_sec interval=$interval" - - local waited=0 - while [ $waited -lt $max_wait_sec ]; do - # Explicit container name enforcement: use docker inspect directly. - # Avoids fragile grep patterns; fails fast if container name is wrong. - if ! docker inspect "$name" >/dev/null 2>&1; then - _log "level=WARN msg='container does not exist or inspect failed' container=$name waited_sec=$waited" - sleep "$interval" - waited=$((waited + interval)) - continue - fi - - # Container exists — check health status - local health_status - health_status=$(docker inspect --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}no-healthcheck{{end}}' "$name" 2>/dev/null || echo "inspect-failed") - - case "$health_status" in - healthy) - _log "msg='container healthy' container=$name waited_sec=$waited" - return 0 - ;; - no-healthcheck) - # Container has no Docker healthcheck — verify it is at least running. 
- local running - running=$(docker inspect --format='{{.State.Running}}' "$name" 2>/dev/null || echo "false") - if [ "$running" = "true" ]; then - _log "msg='container running (no healthcheck configured)' container=$name" - return 0 - fi - ;; - starting) - _log "msg='container starting' container=$name waited_sec=$waited/$max_wait_sec" - ;; - unhealthy) - _log "level=WARN msg='container unhealthy' container=$name waited_sec=$waited/$max_wait_sec" - ;; - inspect-failed) - _log "level=WARN msg='docker inspect failed' container=$name waited_sec=$waited" - ;; - *) - _log "level=WARN msg='unknown health status' container=$name status=$health_status waited_sec=$waited" - ;; - esac - - sleep "$interval" - waited=$((waited + interval)) - done - - _log "level=ERROR msg='container did not become healthy within timeout' container=$name max_wait_sec=$max_wait_sec" - docker logs "$name" --tail 30 >&2 2>/dev/null || true - return 1 -} - -_check_endpoint() { - # Execute the health check INSIDE the container via docker exec. - # Monitoring containers live only on api_network and are NOT reachable via - # host-side DNS — their names (prometheus, alertmanager, grafana) only - # resolve from other containers on the same Docker network. - # Prefer wget (present in prom/* alpine images); fall back to curl (grafana). 
- local name="$1" - local url="$2" - - if docker exec "$name" wget --spider -q "$url" >/dev/null 2>&1; then - _log "msg='endpoint healthy' container=$name url=$url" - return 0 - elif docker exec "$name" curl -sf --max-time 5 "$url" >/dev/null 2>&1; then - _log "msg='endpoint healthy (curl)' container=$name url=$url" - return 0 - else - _log "level=ERROR msg='endpoint unhealthy' container=$name url=$url" - return 1 - fi -} - -# ───────────────────────────────────────────────────────────────────────── -# SKIP HEALTH CHECKS IN BOOTSTRAP MODE -# ───────────────────────────────────────────────────────────────────────── -if [ "$BOOTSTRAP_MODE" = "true" ]; then - DEPLOY_STATE="BOOTSTRAP" - _log "level=WARN msg='bootstrap mode detected — skipping container health checks' state=$DEPLOY_STATE" - _log "level=WARN msg='ACTION: configure infra/.env.monitoring with real values and re-run deploy to enable monitoring'" - exit 0 -fi - -# ───────────────────────────────────────────────────────────────────────── -# ENFORCE: Container name validation + health checks -# ───────────────────────────────────────────────────────────────────────── -# Exact container name enforcement: fail fast if any required container is missing -REQUIRED_CONTAINERS=("prometheus" "alertmanager" "grafana") -for c in "${REQUIRED_CONTAINERS[@]}"; do - if ! 
docker inspect "$c" >/dev/null 2>&1; then - _log "level=ERROR msg='required container missing' container=$c" - DEPLOY_STATE="FAILED" - docker ps --format 'table {{.Names}}\t{{.Status}}' 2>/dev/null >&2 || true - exit 1 - fi -done - -MONITORING_ERRORS=0 - -# ── Prometheus ────────────────────────────────────────────────────────────── -if _wait_container_healthy "prometheus" 60 3; then - _check_endpoint "prometheus" "http://prometheus:9090/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -else - MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -fi - -# ── Alertmanager ───────────────────────────────────────────────────────────── -if _wait_container_healthy "alertmanager" 60 3; then - _check_endpoint "alertmanager" "http://alertmanager:9093/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -else - MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -fi - -# ── Grafana ────────────────────────────────────────────────────────────────── -# Grafana may take longer to start; allow 60s timeout. -if _wait_container_healthy "grafana" 60 3; then - # Grafana health endpoint returns 200 with JSON when ready. - _check_endpoint "grafana" "http://grafana:3000/api/health" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -else - MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -fi - -# --------------------------------------------------------------------------- -# STABILITY WINDOW — Verify containers remain healthy after initial pass -# This catches "flaky startup" where containers pass health check but crash -# immediately after. Wait settle window then re-verify all containers. 
-# --------------------------------------------------------------------------- -_log "msg='entering stability window (5s settle + re-check)'" -sleep 5 - -for c in "${REQUIRED_CONTAINERS[@]}"; do - STABLE_STATUS=$(docker inspect --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}running{{end}}' "$c" 2>/dev/null || echo "inspect-failed") - if [ "$STABLE_STATUS" != "healthy" ] && [ "$STABLE_STATUS" != "running" ]; then - _log "level=ERROR msg='container became unhealthy during stability window' container=$c status=$STABLE_STATUS" - MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) - fi -done - -# --------------------------------------------------------------------------- -# PROMETHEUS SCRAPING VALIDATION — Ensure Prometheus is actually working -# A healthy Prometheus container is useless if it's not scraping targets. -# Query the Prometheus API to verify targets are UP. -# --------------------------------------------------------------------------- -_log "msg='validating prometheus scraping targets'" -# Use docker exec to query the Prometheus API from inside the container. -# The prometheus container name is only resolvable within api_network, not from the host. -PROM_TARGETS=$(docker exec prometheus wget -qO- "http://localhost:9090/api/v1/targets" 2>/dev/null || echo "") - -if [ -z "$PROM_TARGETS" ]; then - _log "level=WARN msg='prometheus API query failed — cannot validate scraping (proceeding with caution)'" -elif ! 
echo "$PROM_TARGETS" | grep -q '"health":"up"' 2>/dev/null; then - _log "level=ERROR msg='prometheus has no healthy scrape targets' curl_response=${PROM_TARGETS:0:200}" - MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -else - # Count active targets - ACTIVE_TARGETS=$(echo "$PROM_TARGETS" | grep -o '"health":"up"' | wc -l) - _log "msg='prometheus scraping targets' active_count=$ACTIVE_TARGETS" -fi - -# --------------------------------------------------------------------------- -# FINAL ENFORCEMENT -# --------------------------------------------------------------------------- -if [ "$MONITORING_ERRORS" -gt 0 ]; then - _log "level=ERROR msg='monitoring validation failed' errors=$MONITORING_ERRORS state=$DEPLOY_STATE" - _log "level=ERROR msg='container state at failure:'" - docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Ports}}' 2>/dev/null >&2 || true - DEPLOY_STATE="FAILED" - exit 1 -fi - -_log "msg='monitoring-sync complete' state=$DEPLOY_STATE containers=healthy required=3" -exit 0 diff --git a/scripts/rollback.sh b/scripts/rollback.sh deleted file mode 100644 index 2f2ef11..0000000 --- a/scripts/rollback.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x -trap '[[ "${BASH_COMMAND}" != _ft_log* ]] && printf "[DEPLOY] ts=%s state=ROLLBACK level=ERROR msg=\"rollback script failed at line %s\"\n" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$LINENO"' ERR - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Load and validate environment. -# Sets: DEPLOY_ROOT, ENV_FILE, API_HOSTNAME. -# Exports all variables from .env into this process. -# Disable trace to prevent secrets from leaking into logs. 
-set +x -source "$SCRIPT_DIR/load-env.sh" -set -x - -DEPLOY_HISTORY="$DEPLOY_ROOT/.deploy_history" - -AUTO_MODE=false - -if [[ "${1:-}" == "--auto" ]]; then - AUTO_MODE=true -fi - -echo "=========================================" -echo "FieldTrack Rollback System" -echo "=========================================" - -# Check if deployment history exists and validate checksum -if [ ! -f "$DEPLOY_HISTORY" ]; then - echo "ERROR: No deployment history found." - echo "File not found: $DEPLOY_HISTORY" - exit 1 -fi - -# Validate deployment history file integrity -if [ ! -s "$DEPLOY_HISTORY" ]; then - echo "ERROR: Deployment history file is empty or corrupted." - exit 1 -fi - -mapfile -t HISTORY < "$DEPLOY_HISTORY" - -if [ ${#HISTORY[@]} -lt 2 ]; then - echo "ERROR: Need at least two deployments to rollback." - exit 1 -fi - -CURRENT_SHA="${HISTORY[0]}" -PREVIOUS_SHA="${HISTORY[1]}" - -echo "Current deployment : $CURRENT_SHA" -echo "Rollback target : $PREVIOUS_SHA" -echo "" - -# Validate that the rollback image exists in the registry -echo "Validating rollback image exists..." -if ! docker manifest inspect "ghcr.io/fieldtrack-tech/api:$PREVIOUS_SHA" >/dev/null 2>&1; then - echo "ERROR: Rollback image not found in registry." - echo "Image: ghcr.io/fieldtrack-tech/api:$PREVIOUS_SHA" - echo "Cannot proceed with rollback to non-existent image." - exit 1 -fi -echo "✓ Rollback image verified in registry." -echo "" - -if [ "$AUTO_MODE" = false ]; then - echo "⚠️ WARNING: This will replace the current deployment." - read -p "Continue with rollback? (yes/no): " -r - - if [[ ! $REPLY =~ ^[Yy][Ee][Ss]$ ]]; then - echo "Rollback cancelled." - exit 0 - fi -else - echo "Auto rollback mode enabled (CI)." -fi - -echo "" -echo "Starting rollback to: $PREVIOUS_SHA" -echo "" - -# Set guard to prevent infinite rollback loops -export API_ROLLBACK_IN_PROGRESS=1 - -# Attempt rollback deploy -if ! 
"$SCRIPT_DIR/deploy-bluegreen.sh" "$PREVIOUS_SHA"; then - echo "" - echo "=========================================" - echo "❌ CRITICAL: ROLLBACK FAILED" - echo "=========================================" - echo "Both deployment and rollback have failed." - echo "" - echo "SYSTEM STATE SNAPSHOT:" - echo " Active containers:" - docker ps --format ' {{.Names}} → {{.Status}} ({{.Ports}})' 2>/dev/null || echo " (docker ps failed)" - echo " Active slot file: $(cat "/var/run/api/active-slot" 2>/dev/null || echo 'MISSING')" - echo " Nginx config test: $(docker exec nginx nginx -t 2>&1)" - echo "" - echo "Target SHA: $PREVIOUS_SHA" - echo "" - echo "Action required:" - echo " 1. Check container status: docker ps -a" - echo " 2. Check nginx config: docker exec nginx nginx -t" - echo " 3. Review logs: docker logs api-blue api-green" - echo " 4. Manually restore last known good state" - echo "=========================================" - exit 2 -fi - -echo "" -echo "=========================================" -echo "Rollback completed successfully" -echo "Production now running: $PREVIOUS_SHA" -echo "=========================================" diff --git a/scripts/smoke-test.sh b/scripts/smoke-test.sh deleted file mode 100644 index 977cc0c..0000000 --- a/scripts/smoke-test.sh +++ /dev/null @@ -1,445 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -BASE_URL="${API_BASE_URL:-https://api.getfieldtrack.app}" -API="${BASE_URL}" - -EMP_EMAIL="${FT_EMP_EMAIL:-}" -EMP_PASSWORD="${FT_EMP_PASSWORD:-}" -ADMIN_EMAIL="${FT_ADMIN_EMAIL:-}" -ADMIN_PASSWORD="${FT_ADMIN_PASSWORD:-}" - -SUPABASE_URL="${SUPABASE_URL:-}" -SUPABASE_ANON="${SUPABASE_ANON_KEY:-}" - -PASS=0 -FAIL=0 -TMP_HEADERS=$(mktemp) -TMP_BODY=$(mktemp) - -# ---------------------------------------------------------------- -# Decode the payload section of a JWT (base64url → JSON string). 
-# Usage: decode_jwt_payload -# ---------------------------------------------------------------- -decode_jwt_payload() { - local token=$1 - local payload - payload=$(echo "$token" | cut -d'.' -f2) - # Restore standard base64 alphabet and add required padding - local mod=$(( ${#payload} % 4 )) - case $mod in - 2) payload="${payload}==" ;; - 3) payload="${payload}=" ;; - esac - echo "$payload" | tr '_-' '/+' | base64 -d 2>/dev/null -} - -# ---------------------------------------------------------------- -# Assert that a JWT contains the required hook-injected claims. -# Exits with code 1 if any required claim is missing. -# Usage: assert_hook_claims