From 1aeddcee74dd28c1d1c1d154bcd0c7d11758f3e6 Mon Sep 17 00:00:00 2001
From: rajashish147 <rajashish147@gmail.com>
Date: Fri, 3 Apr 2026 15:09:07 +0530
Subject: [PATCH 1/2] feat(ci): enhance CodeQL workflows and add deep scan for
 post-merge analysis feat(deploy): implement API health gate and improve
 monitoring stack sync fix(vps-setup): create runtime state directories for
 blue-green deployment

---
 .github/workflows/codeql-deep.yml |  86 ++++++++
 .github/workflows/codeql.yml      |  30 +--
 .github/workflows/deploy.yml      | 157 +++++++++++++-
 scripts/monitoring-sync.sh        | 336 ++++++++++++++++++++++++++++++
 scripts/vps-setup.sh              |  24 ++-
 5 files changed, 613 insertions(+), 20 deletions(-)
 create mode 100644 .github/workflows/codeql-deep.yml
 create mode 100644 scripts/monitoring-sync.sh

diff --git a/.github/workflows/codeql-deep.yml b/.github/workflows/codeql-deep.yml
new file mode 100644
index 0000000..364cc2b
--- /dev/null
+++ b/.github/workflows/codeql-deep.yml
@@ -0,0 +1,86 @@
+name: "CodeQL — Deep Scan (post-merge)"
+# Runs after every merge to master AND on a weekly schedule.
+# Uses the full security-and-quality query suite — significantly more thorough
+# than the PR lightweight scan.
+#
+# DOES NOT block the Deploy pipeline.  Both workflows trigger independently on
+# a master push; deploy.yml never depends on this workflow.  Results are
+# uploaded to the GitHub Security tab for async review.
+#
+# If critical issues are found, the security team should open a tracking issue
+# and gate the next deployment manually.  This workflow itself never fails the
+# deploy unless an operator explicitly adds it as a required check.
+
+on:
+  push:
+    branches: ["master"]
+  schedule:
+    # Every Monday at 03:15 UTC — offset from midnight to avoid GHA congestion.
+    - cron: "15 3 * * 1"
+
+# Never cancel an in-progress deep scan; note that a queued (pending) run can still be replaced by a newer one.
+concurrency:
+  group: codeql-deep-${{ github.ref }}
+  cancel-in-progress: false
+
+permissions:
+  actions: read
+  contents: read
+  security-events: write
+
+jobs:
+  analyze-deep:
+    name: Deep Analyze (CodeQL)
+    runs-on: ubuntu-latest
+    timeout-minutes: 40
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: ["javascript"]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Setup Node.js (match production)
+        uses: actions/setup-node@v5
+        with:
+          node-version: 24
+          cache: npm
+          cache-dependency-path: package-lock.json
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Build API
+        run: npm run build || true
+
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v4
+        with:
+          languages: ${{ matrix.language }}
+          # Full suite: security + quality + style rules.
+          # Catches OWASP Top-10 plus code-quality issues that may hide security risks.
+          queries: security-and-quality
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v4
+        with:
+          category: "codeql-deep"
+          # Upload unconditionally — results land in the Security tab regardless
+          # of whether any alerts are found.
+          upload: always
+
+      - name: Write deep-scan summary
+        if: always()
+        run: |
+          {
+            echo "## CodeQL Deep Scan"
+            printf '%s\n' "| Field | Value |" "|---|---|"
+            echo "| Commit | \`${{ github.sha }}\` |"
+            echo "| Ref | \`${{ github.ref }}\` |"
+            echo "| Query suite | \`security-and-quality\` |"
+            # Absolute URL built from github.server_url so the link resolves wherever the summary is rendered.
+            echo "| Results | [Security tab](${{ github.server_url }}/${{ github.repository }}/security/code-scanning) |"
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 36ebca9..888623f 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -1,15 +1,15 @@
-name: "CodeQL Security Scan"
+name: "CodeQL — PR Scan (lightweight)"
+# Runs on every PR to master.  Fast feedback: security-extended queries only.
+# The deep security-and-quality scan runs separately in codeql-deep.yml after
+# a merge lands on master and does NOT block this pipeline.
 
 on:
-  push:
-    branches: ["master"]
   pull_request:
     branches: ["master"]
-  schedule:
-    - cron: "0 3 * * 1"
 
+# Cancel in-flight scans for the same PR when new commits are pushed.
 concurrency:
-  group: codeql-${{ github.ref }}
+  group: codeql-pr-${{ github.event.pull_request.number }}
   cancel-in-progress: true
 
 permissions:
@@ -21,7 +21,7 @@ jobs:
   analyze:
     name: Analyze (CodeQL)
     runs-on: ubuntu-latest
-    timeout-minutes: 25
+    timeout-minutes: 15
 
     strategy:
       fail-fast: false
@@ -32,7 +32,6 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v5
 
-      # ✅ Match production runtime
       - name: Setup Node.js (match production)
         uses: actions/setup-node@v5
         with:
@@ -40,21 +39,24 @@ jobs:
           cache: npm
           cache-dependency-path: package-lock.json
 
-      # ✅ Install ALL dependencies
       - name: Install dependencies
         run: npm ci
 
-      # ✅ Build API (critical for CodeQL flow analysis)
+      # Build so CodeQL can trace data flows through compiled output.
       - name: Build API
         run: npm run build || true
 
-      # ✅ Initialize CodeQL AFTER dependencies
+      # Initialize AFTER install + build so the database includes all sources.
       - name: Initialize CodeQL
         uses: github/codeql-action/init@v4
         with:
           languages: ${{ matrix.language }}
-          queries: security-and-quality
+          # security-extended: broader than the default security set but
+          # significantly faster than security-and-quality (no style/quality rules).
+          # Catches OWASP Top-10 class issues without slowing PR feedback.
+          queries: security-extended
 
-      # ✅ Analyze
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v4
\ No newline at end of file
+        uses: github/codeql-action/analyze@v4
+        with:
+          category: "codeql-pr"
\ No newline at end of file
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 07458d9..627e35b 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -14,8 +14,8 @@
 #
 # Parallel stages:
 #   validate ─┐
-#   test-api  ├─► build-scan-push ─► deploy ─► sync-infra ─► health-and-smoke
-#             ┘                                                      │
+#   test-api  ├─► build-scan-push ─► deploy ─► api-health-gate ─► sync-infra ─► sync-monitoring ─► health-and-smoke
+#             ┘                                                                                             │
 #                                              rollback ◄────────────┘ (on failure)
 
 name: Deploy to Production
@@ -517,6 +517,21 @@ jobs:
           fi
           echo "✓ CORS_ORIGIN is set"
 
+      - name: Log deployment metadata and trigger info
+        env:  # commit text is untrusted; expose it as data instead of inlining it into the script
+          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
+        run: |
+          {
+            printf '%s\n' "## Deployment Initiated" "| Field | Value |" "|---|---|"
+            echo "| Commit SHA | \`${{ github.sha }}\` |"
+            echo "| Trigger event | ${{ github.event_name }} |"
+            echo "| Triggered by | ${{ github.actor }} |"
+            echo "| Branch | ${{ github.ref_name }} |"
+            echo "| Workflow run | [${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) |"
+            echo "| Commit message | \`$COMMIT_MESSAGE\` |"
+          } >> "$GITHUB_STEP_SUMMARY"
+          echo "📋 Deployment initiated — SHA=${{ github.sha }} EVENT=${{ github.event_name }} ACTOR=${{ github.actor }} RUN=${{ github.run_id }}"
+
       - name: Validate environment contract before deploy
         uses: appleboy/ssh-action@v1.0.3
         with:
@@ -581,6 +596,53 @@ jobs:
             
             echo "DEPLOY_STATE=$DEPLOY_STATUS| SLOT=$ACTIVE_SLOT | SHA=${{ github.sha }}"
 
+  # ---------------------------------------------------------------------------
+  # JOB: api-health-gate  (Step E+)
+  #
+  # Early API health validation — runs AFTER deploy but BEFORE infra sync.
+  # Ensures the API container is truly healthy before we sync monitoring/nginx.
+  # If the API is not healthy at this point, STOP before touching infra.
+  # ---------------------------------------------------------------------------
+  api-health-gate:
+    name: API Health Gate
+    runs-on: ubuntu-latest
+    needs: [deploy]
+    timeout-minutes: 5
+    steps:
+      - name: Verify API container is healthy before infra sync
+        uses: appleboy/ssh-action@v1.0.3
+        with:
+          host: ${{ secrets.DO_HOST }}
+          username: ${{ secrets.DO_USER }}
+          key: ${{ secrets.DO_SSH_KEY }}
+          script: |
+            set -euo pipefail
+            export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
+            [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
+            cd "$DEPLOY_ROOT"
+            source scripts/load-env.sh
+
+            # Determine active slot (blue/green); defaults to blue when the slot file is unreadable
+            ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "blue")
+            if [ "$ACTIVE_SLOT" = "green" ]; then BACKEND_PORT=3002; else BACKEND_PORT=3001; fi
+
+            echo "=== API Health Gate (slot: $ACTIVE_SLOT, port: $BACKEND_PORT) ==="
+
+            # Poll /ready endpoint (internal readiness probe); --max-time bounds each attempt so a hung socket cannot stall the gate
+            for i in $(seq 1 15); do
+              STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "http://127.0.0.1:$BACKEND_PORT/ready" 2>/dev/null || echo "000")
+              if [ "$STATUS" = "200" ]; then
+                echo "✓ API ready on port $BACKEND_PORT (attempt $i)"
+                exit 0
+              fi
+              echo "  Attempt $i: HTTP $STATUS — waiting..."
+              sleep 2
+            done
+            # Diagnostics: 2>&1 keeps the container's stderr stream, which the previous 2>/dev/null discarded.
+            echo "❌ API /ready did not return 200 after 15 attempts — monitoring sync would fail anyway"
+            docker logs "api-$ACTIVE_SLOT" --tail 30 2>&1 || true
+            exit 1
+
   # ---------------------------------------------------------------------------
   # JOB: sync-infra
   #
@@ -590,7 +652,7 @@ jobs:
   sync-infra:
     name: Sync Infrastructure (nginx)
     runs-on: ubuntu-latest
-    needs: [deploy]
+    needs: [api-health-gate]
     timeout-minutes: 10
     steps:
       - name: Sync infrastructure configs via SSH
@@ -641,8 +703,89 @@ jobs:
             sudo systemctl reload nginx
             echo "✓ Nginx reloaded."
 
+            # ROUTING VALIDATION — Test actual traffic through Nginx
+            # Config syntax is valid (nginx -t) but routing may still be broken.
+            # Hit /health on the real hostname, pinned to the local Nginx via --resolve.
+            echo "=== Testing Nginx routing (localhost + Host header) ==="
+            sleep 2  # Give Nginx a moment to fully apply reload
+            # The URL must name $API_HOSTNAME: with a bare IP the --resolve entry is never consulted and no SNI is sent.
+            ROUTE_STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 \
+              --resolve "$API_HOSTNAME:443:127.0.0.1" \
+              -H "Host: $API_HOSTNAME" \
+              "https://$API_HOSTNAME/health" --insecure 2>/dev/null || echo "000")
+
+            if [ "$ROUTE_STATUS" = "200" ]; then
+              echo "✓ Nginx routing verified (HTTP $ROUTE_STATUS)"
+            else
+              echo "❌ Nginx routing broken (HTTP $ROUTE_STATUS expected 200) — restoring backup..."
+              sudo cp /tmp/api.conf.bak "$NGINX_LIVE"
+              sudo nginx -t 2>&1 && sudo systemctl reload nginx || true
+              exit 1
+            fi
+
             echo "✓ Infra sync completed in $(($(date +%s) - T0))s"
 
+  # ---------------------------------------------------------------------------
+  # JOB: sync-monitoring  (Step F)
+  #
+  # Idempotent monitoring stack sync — runs after every deploy.
+  # Delegates to scripts/monitoring-sync.sh which:
+  #   - Self-heals missing .env.monitoring from example
+  #   - Creates api_network if absent
+  #   - Renders alertmanager.rendered.yml
+  #   - Runs docker compose up -d
+  #   - Validates prometheus / alertmanager / grafana health
+  # Monitoring is REQUIRED — deploy fails if any required container is unhealthy.
+  # ---------------------------------------------------------------------------
+  sync-monitoring:
+    name: Sync Monitoring Stack
+    runs-on: ubuntu-latest
+    needs: [sync-infra]
+    timeout-minutes: 15
+    steps:
+      - name: Sync and validate monitoring stack via SSH
+        uses: appleboy/ssh-action@v1.0.3
+        with:
+          host: ${{ secrets.DO_HOST }}
+          username: ${{ secrets.DO_USER }}
+          key: ${{ secrets.DO_SSH_KEY }}
+          script: |
+            set -euo pipefail
+            export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
+            [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
+            cd "$DEPLOY_ROOT"
+            chmod +x scripts/monitoring-sync.sh
+            ./scripts/monitoring-sync.sh
+
+      - name: Monitoring sync summary
+        if: always()
+        run: |
+          {
+            echo "## Monitoring Sync"
+            echo "| Container | Required |"
+            echo "|---|---|"
+            echo "| prometheus | ✅ |"
+            echo "| alertmanager | ✅ |"
+            echo "| grafana | ✅ |"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Deployment artifact traceability
+        if: always()
+        env:  # commit text is untrusted; expose it as data instead of inlining it into the script
+          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
+        run: |
+          {
+            printf '%s\n' "## Deployment Artifacts" "| Field | Value |" "|---|---|"
+            echo "| Deployment SHA | \`${{ github.sha }}\` |"
+            echo "| Image Tag | \`fieldtrack-api:${{ needs.get-metadata.outputs.sha_short || github.sha }}\` |"
+            echo "| Workflow Run | [\#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) |"
+            echo "| Triggered By | \`${{ github.event_name }}\` |"
+            echo "| Commit Message | \`$COMMIT_MESSAGE\` |"
+          } >> "$GITHUB_STEP_SUMMARY"
+          # NOTE(review): needs.get-metadata resolves only if get-metadata is in this job's `needs` (currently [sync-infra]); otherwise the full SHA is shown — confirm intent.
+          # Also output to logs for audit trail; STATUS is included because this step runs even when the sync failed
+          echo "DEPLOYMENT_COMPLETE: SHA=${{ github.sha }} IMAGE=ghcr.io/${{ github.repository_owner }}/api:${{ github.sha }} RUN=${{ github.run_id }} STATUS=${{ job.status }}"
+
   # ---------------------------------------------------------------------------
   # JOB: health-and-smoke
   #
@@ -653,7 +796,7 @@ jobs:
   health-and-smoke:
     name: Health Checks & Smoke Tests
     runs-on: ubuntu-latest
-    needs: [sync-infra]
+    needs: [sync-infra, sync-monitoring]
     timeout-minutes: 15
     steps:
       - name: Checkout
@@ -767,13 +910,15 @@ jobs:
   rollback:
     name: Rollback Deployment (auto)
     runs-on: ubuntu-latest
-    needs: [deploy, sync-infra, health-and-smoke]
+    needs: [deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke]
     timeout-minutes: 10
     if: |
       always() &&
       (
         needs.deploy.result == 'failure' ||
+        needs.api-health-gate.result == 'failure' ||
         needs.sync-infra.result == 'failure' ||
+        needs.sync-monitoring.result == 'failure' ||
         needs.health-and-smoke.result == 'failure'
       )
     steps:
@@ -781,7 +926,9 @@ jobs:
         run: |
           echo "ROLLBACK_TRIGGERED=TRUE | FAILED_JOBS:"
           [ "${{ needs.deploy.result }}" = "failure" ] && echo "  - deploy"
+          [ "${{ needs.api-health-gate.result }}" = "failure" ] && echo "  - api-health-gate"
           [ "${{ needs.sync-infra.result }}" = "failure" ] && echo "  - sync-infra"
+          [ "${{ needs.sync-monitoring.result }}" = "failure" ] && echo "  - sync-monitoring"
           [ "${{ needs.health-and-smoke.result }}" = "failure" ] && echo "  - health-and-smoke"
           echo "SHA=${{ github.sha }}"
 
diff --git a/scripts/monitoring-sync.sh b/scripts/monitoring-sync.sh
new file mode 100644
index 0000000..26c5b62
--- /dev/null
+++ b/scripts/monitoring-sync.sh
@@ -0,0 +1,336 @@
+#!/usr/bin/env bash
+# =============================================================================
+# monitoring-sync.sh — Self-Healing Monitoring Stack Sync
+#
+# Called by the CI sync-monitoring job after every production deploy.
+#
+# Responsibilities:
+#   1. SELF-HEAL  — create missing .env.monitoring from example if absent
+#   2. BOOTSTRAP  — detect placeholder values and warn (cold-start mode)
+#   3. ENSURE NETWORK — create api_network if it does not exist
+#   4. SYNC       — idempotent `docker compose up -d` (starts if down, no-ops if healthy)
+#   5. VALIDATE   — confirm prometheus / grafana / alertmanager are running + healthy
+#   6. ENFORCE    — exit 1 if any required container is not healthy after timeout
+#
+# Self-healing rules (safe defaults):
+#   - .env.monitoring missing  → copy from infra/.env.monitoring.example + warn
+#   - .env.monitoring has placeholders (change-me) → skip health wait, warn operator
+#   - api_network missing      → create it
+#   - alertmanager rendered config missing → render it
+#
+# Timeouts:
+#   - Per-container health check: 60 seconds max (20 attempts × 3 s)
+#   - Polling interval: 3 seconds
+#   - Total wait tracked to prevent cascading timeouts
+#
+# Exit codes:
+#   0  All required monitoring containers are healthy
+#   1  One or more required containers failed to become healthy (deploy must fail)
+#
+# Required env (exported by load-env.sh / present in DEPLOY_ROOT):
+#   DEPLOY_ROOT   — absolute path to the repository root on the VPS
+# =============================================================================
+set -euo pipefail
+trap '_ft_mon_trap "$LINENO"' ERR
+
+# ─────────────────────────────────────────────────────────────────────────
+# STATE CLASSIFICATION
+# ─────────────────────────────────────────────────────────────────────────
+DEPLOY_STATE="SUCCESS"
+trap '[ $? -ne 0 ] && DEPLOY_STATE="FAILED" || true' EXIT
+
+# ---------------------------------------------------------------------------
+# LOGGING
+# ---------------------------------------------------------------------------
+_FT_MON_LOG_FILE="${DEPLOY_LOG_FILE:-/var/log/api/deploy.log}"
+_LOG_DIR="$(dirname "$_FT_MON_LOG_FILE")"
+if ! mkdir -p "$_LOG_DIR" 2>/dev/null; then
+    _LOG_DIR="$HOME/api/logs"
+    _FT_MON_LOG_FILE="$_LOG_DIR/deploy.log"
+    mkdir -p "$_LOG_DIR"
+fi
+
+_log() {
+    printf '[MON-SYNC] ts=%s %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$*" \
+        | tee -a "$_FT_MON_LOG_FILE" >&2 || true  # best-effort: an unwritable log file must not abort the sync under set -e/pipefail
+}
+
+_ft_mon_trap() {
+    printf '[MON-SYNC] ts=%s level=ERROR msg="unexpected failure at line %s"\n' \
+        "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$1" >&2
+}
+
+# ---------------------------------------------------------------------------
+# RESOLVE PATHS
+# ---------------------------------------------------------------------------
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
+
+if [ ! -d "$DEPLOY_ROOT" ]; then
+    _log "level=ERROR msg='DEPLOY_ROOT not found' path=$DEPLOY_ROOT"
+    exit 1
+fi
+
+INFRA_DIR="$DEPLOY_ROOT/infra"
+MON_ENV="$INFRA_DIR/.env.monitoring"
+MON_ENV_EXAMPLE="$INFRA_DIR/.env.monitoring.example"
+MON_COMPOSE="$INFRA_DIR/docker-compose.monitoring.yml"
+ALERTMANAGER_RENDERED="$INFRA_DIR/alertmanager/alertmanager.rendered.yml"
+RENDER_SCRIPT="$INFRA_DIR/scripts/render-alertmanager.sh"
+
+_log "msg='monitoring-sync started' deploy_root=$DEPLOY_ROOT state=$DEPLOY_STATE"
+
+# ---------------------------------------------------------------------------
+# STEP 1 — SELF-HEAL: .env.monitoring
+# Create from example if missing instead of failing hard.
+# The user MUST still fill in real values after first-time creation.
+# ---------------------------------------------------------------------------
+BOOTSTRAP_MODE=false
+if [ ! -f "$MON_ENV" ]; then
+    if [ -f "$MON_ENV_EXAMPLE" ]; then
+        cp "$MON_ENV_EXAMPLE" "$MON_ENV"
+        chmod 600 "$MON_ENV"
+        BOOTSTRAP_MODE=true
+        _log "level=WARN msg='monitoring env file missing — created from example' path=$MON_ENV"
+        _log "level=WARN msg='ACTION REQUIRED: edit $MON_ENV with real GRAFANA_ADMIN_PASSWORD, METRICS_SCRAPE_TOKEN, ALERTMANAGER_SLACK_WEBHOOK'"
+    else
+        _log "level=ERROR msg='monitoring env file and example both missing' path=$MON_ENV"
+        DEPLOY_STATE="FAILED"
+        exit 1
+    fi
+else
+    chmod 600 "$MON_ENV"
+    _log "msg='monitoring env file exists' path=$MON_ENV"
+fi
+
+# ─────────────────────────────────────────────────────────────────────────
+# STEP 1B — BOOTSTRAP MODE: Detect placeholders
+# If .env.monitoring contains default 'change-me' values, we're in cold-start.
+# Skip health polling to avoid timeout on misconfigured system.
+# ─────────────────────────────────────────────────────────────────────────
+if grep -q "change-me" "$MON_ENV" 2>/dev/null; then
+    BOOTSTRAP_MODE=true
+    _log "level=WARN msg='bootstrap mode detected: .env.monitoring contains placeholder values' action='skipping health check'"
+    _log "level=WARN msg='OPERATOR ACTION: edit infra/.env.monitoring and set real values, then re-run deploy'"
+fi
+
+# ---------------------------------------------------------------------------
+# STEP 2 — SELF-HEAL: Docker network api_network
+# ---------------------------------------------------------------------------
+if ! docker network inspect api_network >/dev/null 2>&1; then  # direct lookup; no ls|grep pipeline for pipefail to trip on
+    _log "msg='api_network missing — creating' driver=bridge"
+    docker network create --driver bridge api_network
+    _log "msg='api_network created'"
+else
+    _log "msg='api_network exists'"
+fi
+
+# ---------------------------------------------------------------------------
+# STEP 3 — SELF-HEAL: Render alertmanager config
+# render-alertmanager.sh is idempotent; always safe to run.
+# ---------------------------------------------------------------------------
+if [ -f "$RENDER_SCRIPT" ]; then  # invoked through bash below, so the executable bit is not required
+    _log "msg='rendering alertmanager config'"
+    bash "$RENDER_SCRIPT"
+    _log "msg='alertmanager config rendered' file=$ALERTMANAGER_RENDERED"
+elif [ ! -f "$ALERTMANAGER_RENDERED" ]; then
+    _log "level=ERROR msg='render-alertmanager.sh not found AND rendered config missing' script=$RENDER_SCRIPT"
+    exit 1
+else
+    _log "level=WARN msg='render-alertmanager.sh not found but rendered config exists — continuing' script=$RENDER_SCRIPT"
+fi
+
+# ---------------------------------------------------------------------------
+# STEP 4 — SYNC: docker compose up -d (idempotent)
+# Creates containers that are missing; leaves healthy containers untouched.
+# ---------------------------------------------------------------------------
+_log "msg='starting monitoring stack (idempotent)'"
+cd "$INFRA_DIR"
+docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml up -d --remove-orphans
+cd "$DEPLOY_ROOT"
+_log "msg='docker compose up -d complete'"
+
+# ---------------------------------------------------------------------------
+# STEP 5 — VALIDATE: wait for required containers to become healthy
+#
+# Required containers (must be healthy for deploy to succeed):
+#   prometheus   — metrics collection (health: http://localhost:9090/-/healthy)
+#   alertmanager — alert routing      (health: http://localhost:9093/-/healthy)
+#   grafana      — dashboards         (health: http://localhost:3001/api/health)
+#
+# Strategy: poll docker inspect for Health.Status.
+# Times out at 60 s per container (20 attempts × 3 s).
+# ---------------------------------------------------------------------------
+
+_wait_container_healthy() {
+    # Usage: _wait_container_healthy NAME [MAX_WAIT_SEC=60] [INTERVAL=3] — returns 0 once healthy (or running with no healthcheck), 1 on timeout.
+    local name="$1"
+    local max_wait_sec="${2:-60}"
+    local interval="${3:-3}"
+    _log "msg='waiting for container health' container=$name max_wait_sec=$max_wait_sec interval=$interval"
+
+    local waited=0
+    while [ "$waited" -lt "$max_wait_sec" ]; do
+        # Address the container by exact name via docker inspect (no grep patterns).
+        # A missing container is retried every interval until the timeout expires.
+        if ! docker inspect "$name" >/dev/null 2>&1; then
+            _log "level=WARN msg='container does not exist or inspect failed' container=$name waited_sec=$waited"
+            sleep "$interval"
+            waited=$((waited + interval))
+            continue
+        fi
+
+        # Container exists — read its Docker health status (or a marker when none is configured)
+        local health_status
+        health_status=$(docker inspect --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}no-healthcheck{{end}}' "$name" 2>/dev/null || echo "inspect-failed")
+
+        case "$health_status" in
+            healthy)
+                _log "msg='container healthy' container=$name waited_sec=$waited"
+                return 0
+                ;;
+            no-healthcheck)
+                # Container has no Docker healthcheck — verify it is at least running.
+                local running
+                running=$(docker inspect --format='{{.State.Running}}' "$name" 2>/dev/null || echo "false")
+                if [ "$running" = "true" ]; then
+                    _log "msg='container running (no healthcheck configured)' container=$name"
+                    return 0
+                fi
+                ;;
+            starting)
+                _log "msg='container starting' container=$name waited_sec=$waited/$max_wait_sec"
+                ;;
+            unhealthy)
+                _log "level=WARN msg='container unhealthy' container=$name waited_sec=$waited/$max_wait_sec"
+                ;;
+            inspect-failed)
+                _log "level=WARN msg='docker inspect failed' container=$name waited_sec=$waited"
+                ;;
+            *)
+                _log "level=WARN msg='unknown health status' container=$name status=$health_status waited_sec=$waited"
+                ;;
+        esac
+
+        sleep "$interval"
+        waited=$((waited + interval))
+    done
+
+    _log "level=ERROR msg='container did not become healthy within timeout' container=$name max_wait_sec=$max_wait_sec"
+    docker logs "$name" --tail 30 >&2 || true  # keep the container's stderr stream too; the old 2>/dev/null discarded it
+    return 1
+}
+
+_check_endpoint() {
+    # Probe URL ($2) once on behalf of container NAME ($1) and log the outcome.
+    # Returns 0 only when the HTTP status is exactly 200; anything else,
+    # including a curl failure, is logged as unhealthy and returns 1.
+    local name="$1" url="$2" code
+
+    code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url" 2>/dev/null || echo "000")
+
+    if [ "$code" != "200" ]; then
+        _log "level=ERROR msg='endpoint unhealthy' container=$name url=$url status=$code"
+        return 1
+    fi
+    _log "msg='endpoint healthy' container=$name url=$url status=200"
+    return 0
+}
+
+# ─────────────────────────────────────────────────────────────────────────
+# SKIP HEALTH CHECKS IN BOOTSTRAP MODE
+# ─────────────────────────────────────────────────────────────────────────
+if [ "$BOOTSTRAP_MODE" = "true" ]; then
+    DEPLOY_STATE="BOOTSTRAP"
+    _log "level=WARN msg='bootstrap mode detected — skipping container health checks' state=$DEPLOY_STATE"
+    _log "level=WARN msg='ACTION: configure infra/.env.monitoring with real values and re-run deploy to enable monitoring'"
+    exit 0
+fi
+
+# ─────────────────────────────────────────────────────────────────────────
+# ENFORCE: Container name validation + health checks
+# ─────────────────────────────────────────────────────────────────────────
+# Exact container name enforcement: fail fast if any required container is missing
+REQUIRED_CONTAINERS=("prometheus" "alertmanager" "grafana")
+for c in "${REQUIRED_CONTAINERS[@]}"; do
+    if ! docker inspect "$c" >/dev/null 2>&1; then
+        _log "level=ERROR msg='required container missing' container=$c"
+        DEPLOY_STATE="FAILED"
+        docker ps --format 'table {{.Names}}\t{{.Status}}' >&2 2>/dev/null || true  # order matters: stdout→stderr first, then silence docker's own errors
+        exit 1
+    fi
+done
+
+MONITORING_ERRORS=0
+
+# ── Prometheus ──────────────────────────────────────────────────────────────
+if _wait_container_healthy "prometheus" 60 3; then
+    _check_endpoint "prometheus" "http://localhost:9090/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+else
+    MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+fi
+
+# ── Alertmanager ─────────────────────────────────────────────────────────────
+if _wait_container_healthy "alertmanager" 60 3; then
+    _check_endpoint "alertmanager" "http://localhost:9093/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+else
+    MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+fi
+
+# ── Grafana ──────────────────────────────────────────────────────────────────
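+# Grafana may take longer to start; allow 60s timeout. NOTE(review): host port 3001 is also the blue API slot's port in deploy.yml — confirm there is no clash.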
+if _wait_container_healthy "grafana" 60 3; then
+    # Grafana health endpoint returns 200 with JSON when ready.
+    _check_endpoint "grafana" "http://localhost:3001/api/health" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+else
+    MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+fi
+
+# ---------------------------------------------------------------------------
+# STABILITY WINDOW — Verify containers remain healthy after initial pass
+# This catches "flaky startup" where containers pass health check but crash
+# immediately after. Wait settle window then re-verify all containers.
+# ---------------------------------------------------------------------------
+_log "msg='entering stability window (5s settle + re-check)'"
+sleep 5
+
+for c in "${REQUIRED_CONTAINERS[@]}"; do
+    STABLE_STATUS=$(docker inspect --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}running{{end}}' "$c" 2>/dev/null || echo "inspect-failed")
+    if [ "$STABLE_STATUS" != "healthy" ] && [ "$STABLE_STATUS" != "running" ]; then
+        _log "level=ERROR msg='container became unhealthy during stability window' container=$c status=$STABLE_STATUS"
+        MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+    fi
+done
+
+# ---------------------------------------------------------------------------
+# PROMETHEUS SCRAPING VALIDATION — Ensure Prometheus is actually working
+# A healthy Prometheus container is useless if it's not scraping targets.
+# Query the Prometheus API to verify targets are UP.
+# ---------------------------------------------------------------------------
+_log "msg='validating prometheus scraping targets'"
+PROM_TARGETS=$(curl -s --max-time 10 "http://localhost:9090/api/v1/targets" 2>/dev/null || echo "")
+# Here-strings instead of `echo | grep -q`: under pipefail an early grep exit can SIGPIPE echo and fake a failure.
+if [ -z "$PROM_TARGETS" ]; then
+    _log "level=WARN msg='prometheus API query failed — cannot validate scraping (proceeding with caution)'"
+elif ! grep -q '"health":"up"' <<<"$PROM_TARGETS"; then
+    _log "level=ERROR msg='prometheus has no healthy scrape targets' curl_response=${PROM_TARGETS:0:200}"
+    MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+else
+    # Count targets whose health is reported as "up"
+    ACTIVE_TARGETS=$(grep -o '"health":"up"' <<<"$PROM_TARGETS" | wc -l)
+    _log "msg='prometheus scraping targets' active_count=$ACTIVE_TARGETS"
+fi
+
+# ---------------------------------------------------------------------------
+# FINAL ENFORCEMENT
+# ---------------------------------------------------------------------------
+if [ "$MONITORING_ERRORS" -gt 0 ]; then
+    DEPLOY_STATE="FAILED"  # set before logging so the state= field reports the failure
+    _log "level=ERROR msg='monitoring validation failed' errors=$MONITORING_ERRORS state=$DEPLOY_STATE"
+    _log "level=ERROR msg='container state at failure:'"
+    docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Ports}}' >&2 2>/dev/null || true  # stdout→stderr first, then silence docker's own errors
+    exit 1
+fi
+
+_log "msg='monitoring-sync complete' state=$DEPLOY_STATE containers=healthy required=3"
+exit 0
diff --git a/scripts/vps-setup.sh b/scripts/vps-setup.sh
index 9fd7ea7..679e481 100644
--- a/scripts/vps-setup.sh
+++ b/scripts/vps-setup.sh
@@ -278,6 +278,15 @@ else
     log "Docker network '$NETWORK' created."
 fi
 
+# Create runtime state directory for blue-green slot tracking.
+# /var/run is tmpfs (cleared on reboot); _ft_ensure_slot_dir recreates it on
+# each deploy, but creating it here avoids a first-boot race condition.
+log "Phase 9b: Creating runtime state directories..."
+install -d -m 750 -o "$DEPLOY_USER" -g "$DEPLOY_USER" /var/run/api || warn "Could not create /var/run/api (best-effort; continuing)"
+install -d -m 755 -o "$DEPLOY_USER" -g "$DEPLOY_USER" /var/log/api || warn "Could not create /var/log/api (best-effort; continuing)"
+[ -d /var/run/api ] && [ -d /var/log/api ] \
+    && log "Runtime state directories ready (/var/run/api, /var/log/api)."
+
 # ============================================================================
 # PHASE 10: Nginx Installation & Configuration
 # ============================================================================
@@ -353,13 +362,26 @@ else
 fi
 
 MONITORING_ENV_FILE="$REPO_DIR/infra/.env.monitoring"
+MONITORING_ENV_EXAMPLE="$REPO_DIR/infra/.env.monitoring.example"
 
 if [ -f "$MONITORING_ENV_FILE" ]; then
     chmod 600 "$MONITORING_ENV_FILE"
     chown "$DEPLOY_USER:$DEPLOY_USER" "$MONITORING_ENV_FILE"
     warn "Monitoring env file detected. Verify its values: $MONITORING_ENV_FILE"
+elif [ -f "$MONITORING_ENV_EXAMPLE" ]; then
+    # Self-heal: create from example so subsequent deploy scripts do not fail.
+    # The operator MUST fill in real values before monitoring is functional.
+    cp "$MONITORING_ENV_EXAMPLE" "$MONITORING_ENV_FILE"
+    chmod 600 "$MONITORING_ENV_FILE"
+    chown "$DEPLOY_USER:$DEPLOY_USER" "$MONITORING_ENV_FILE"
+    warn "infra/.env.monitoring created from example — ACTION REQUIRED:"
+    warn "  Edit $MONITORING_ENV_FILE and set:"
+    warn "    GRAFANA_ADMIN_PASSWORD   — strong password (min 12 chars)"
+    warn "    METRICS_SCRAPE_TOKEN     — must match METRICS_SCRAPE_TOKEN in .env"
+    warn "    ALERTMANAGER_SLACK_WEBHOOK — Slack incoming webhook URL"
+    warn "    API_HOSTNAME             — bare hostname (e.g. api.getfieldtrack.app)"
 else
-    err "Missing $MONITORING_ENV_FILE. Ensure infra/.env.monitoring exists in the repository."
+    err "infra/.env.monitoring and infra/.env.monitoring.example both missing. Cannot continue."
 fi
 
 # ============================================================================

From 06c839496d6f973343f069f069171d4796ab048f Mon Sep 17 00:00:00 2001
From: rajashish147 <rajashish147@gmail.com>
Date: Fri, 3 Apr 2026 15:25:37 +0530
Subject: [PATCH 2/2] fix(monitoring): update health check URLs to use Docker
 service names

---
 scripts/monitoring-sync.sh | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/scripts/monitoring-sync.sh b/scripts/monitoring-sync.sh
index 26c5b62..dea17d6 100644
--- a/scripts/monitoring-sync.sh
+++ b/scripts/monitoring-sync.sh
@@ -154,12 +154,13 @@ _log "msg='docker compose up -d complete'"
 # STEP 5 — VALIDATE: wait for required containers to become healthy
 #
 # Required containers (must be healthy for deploy to succeed):
-#   prometheus   — metrics collection (health: http://localhost:9090/-/healthy)
-#   alertmanager — alert routing      (health: http://localhost:9093/-/healthy)
-#   grafana      — dashboards         (health: http://localhost:3001/api/health)
+#   prometheus   — metrics collection (health: http://prometheus:9090/-/healthy)
+#   alertmanager — alert routing      (health: http://alertmanager:9093/-/healthy)
+#   grafana      — dashboards         (health: http://grafana:3000/api/health)
 #
-# Strategy: poll docker inspect for Health.Status.
+# Strategy: poll docker inspect for Health.Status, then probe each endpoint by Docker service name.
 # Times out at 60 s per container (20 attempts × 3 s).
+# Note: service names (not localhost) are used because the containers are reachable only on the Docker network.
 # ---------------------------------------------------------------------------
 
 _wait_container_healthy() {
@@ -265,14 +266,14 @@ MONITORING_ERRORS=0
 
 # ── Prometheus ──────────────────────────────────────────────────────────────
 if _wait_container_healthy "prometheus" 60 3; then
-    _check_endpoint "prometheus" "http://localhost:9090/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+    _check_endpoint "prometheus" "http://prometheus:9090/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
 else
     MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
 fi
 
 # ── Alertmanager ─────────────────────────────────────────────────────────────
 if _wait_container_healthy "alertmanager" 60 3; then
-    _check_endpoint "alertmanager" "http://localhost:9093/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+    _check_endpoint "alertmanager" "http://alertmanager:9093/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
 else
     MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
 fi
@@ -281,7 +282,7 @@ fi
 # Grafana may take longer to start; allow 60s timeout.
 if _wait_container_healthy "grafana" 60 3; then
     # Grafana health endpoint returns 200 with JSON when ready.
-    _check_endpoint "grafana" "http://localhost:3001/api/health" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+    _check_endpoint "grafana" "http://grafana:3000/api/health" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
 else
     MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
 fi
@@ -308,7 +309,7 @@ done
 # Query the Prometheus API to verify targets are UP.
 # ---------------------------------------------------------------------------
 _log "msg='validating prometheus scraping targets'"
-PROM_TARGETS=$(curl -s "http://localhost:9090/api/v1/targets" 2>/dev/null || echo "")
+PROM_TARGETS=$(curl -s --max-time 10 "http://prometheus:9090/api/v1/targets" 2>/dev/null || echo "")  # bounded so a stalled query cannot hang the deploy
 
 if [ -z "$PROM_TARGETS" ]; then
     _log "level=WARN msg='prometheus API query failed — cannot validate scraping (proceeding with caution)'"