diff --git a/.github/workflows/codeql-deep.yml b/.github/workflows/codeql-deep.yml index 364cc2b..a62f586 100644 --- a/.github/workflows/codeql-deep.yml +++ b/.github/workflows/codeql-deep.yml @@ -29,7 +29,10 @@ permissions: security-events: write jobs: - analyze-deep: + # Job name MUST stay "codeql-deep" — deploy.yml polls for this exact status + # check, and branch protection on master references it as: + # "CodeQL — Deep Scan (post-merge) / codeql-deep" + codeql-deep: name: Deep Analyze (CodeQL) runs-on: ubuntu-latest timeout-minutes: 40 @@ -68,9 +71,6 @@ jobs: uses: github/codeql-action/analyze@v4 with: category: "codeql-deep" - # Upload unconditionally — results land in the Security tab regardless - # of whether any alerts are found. - upload: always - name: Write deep-scan summary if: always() diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 888623f..bb21dab 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -18,8 +18,10 @@ permissions: security-events: write jobs: - analyze: - name: Analyze (CodeQL) + # Job name MUST stay "codeql-lite" — branch protection references this exact + # status check: "CodeQL — PR Scan (lightweight) / codeql-lite" + codeql-lite: + name: CodeQL Lite (PR) runs-on: ubuntu-latest timeout-minutes: 15 @@ -59,4 +61,4 @@ jobs: - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v4 with: - category: "codeql-pr" \ No newline at end of file + category: "codeql-lite" \ No newline at end of file diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 670aec5..a71b69c 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -3,7 +3,8 @@ # Production Deployment Pipeline # # Design principles: -# 1. Triggers on every push to master (no paths filter — ensures sync-beta always runs) +# 1. Triggered ONLY after CodeQL deep scan completes successfully — no polling, no race. 
+# Uses workflow_run event: deploy is event-driven, not concurrent with security scan. # 2. Runs ALL validation from scratch — no trust built on PR results alone # 3. Trivy scan runs BEFORE Docker push — vulnerable images never reach the registry # 4. target: production + build-args mirror pr.yml exactly (bit-for-bit parity) @@ -12,18 +13,33 @@ # 7. timeout-minutes on every job — hung processes never block CI indefinitely # 8. npm ci retried up to 3x — registry flakiness never kills a valid deploy # -# Parallel stages: -# validate ─┐ -# test-api ├─► build-scan-push ─► deploy ─► api-health-gate ─► sync-infra ─► sync-monitoring ─► health-and-smoke -# ┘ │ -# rollback ◄────────────┘ (on failure) +# Pipeline order: +# codeql-gate +# ├─► validate ─┐ +# └─► test-api ├─► build-scan-push ─► vps-readiness-check ─► deploy +# ┘ │ +# api-health-gate ◄────────┘ +# │ +# sync-infra ─► sync-monitoring ─► health-and-smoke +# │ +# rollback ◄──────────────────────────────┘ (on failure) name: Deploy to Production on: - push: + # Triggered ONLY when the CodeQL deep scan workflow completes on master. + # This replaces the previous push trigger + polling approach: + # - No race conditions (workflow_run fires AFTER codeql-deep finishes) + # - No API polling loops or timing-dependent checks + # - Deployment is blocked at the event level if CodeQL did not succeed + workflow_run: + workflows: ["CodeQL — Deep Scan (post-merge)"] + types: + - completed branches: - master + # Manual dispatch retained for emergency/hotfix deploys. + # The codeql-gate job enforces the conclusion check only for workflow_run. workflow_dispatch: # Never cancel an in-progress deployment — let it finish or fail cleanly. @@ -36,6 +52,56 @@ permissions: contents: read jobs: + # --------------------------------------------------------------------------- + # JOB: codeql-gate + # + # First job in every deploy run. Two responsibilities: + # + # 1. 
SECURITY GATE (workflow_run only): + # Reads github.event.workflow_run.conclusion and fails hard if CodeQL + # did not pass. This makes the event-driven guarantee explicit and + # visible in the pipeline UI. + # + # 2. SHA RESOLUTION: + # On workflow_run, github.sha = HEAD of default branch at event time, + # NOT the commit that triggered CodeQL. We must deploy exactly the SHA + # that was scanned. Exports deploy_sha = github.event.workflow_run.head_sha + # so all downstream jobs checkout and tag the correct commit. + # On workflow_dispatch, deploy_sha = github.sha (HEAD of triggered branch). + # + # All subsequent jobs that do git checkout use ref: needs.codeql-gate.outputs.deploy_sha. + # --------------------------------------------------------------------------- + codeql-gate: + name: CodeQL Security Gate + runs-on: ubuntu-latest + timeout-minutes: 5 + outputs: + deploy_sha: ${{ steps.sha.outputs.deploy_sha }} + steps: + - name: Resolve deploy SHA + id: sha + run: | + if [ "${{ github.event_name }}" = "workflow_run" ]; then + echo "deploy_sha=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT" + else + echo "deploy_sha=${{ github.sha }}" >> "$GITHUB_OUTPUT" + fi + + - name: Verify CodeQL deep scan passed + if: github.event_name == 'workflow_run' + run: | + CONCLUSION="${{ github.event.workflow_run.conclusion }}" + SHA="${{ github.event.workflow_run.head_sha }}" + echo "CodeQL deep scan conclusion : $CONCLUSION" + echo "Scanned commit SHA : $SHA" + if [ "$CONCLUSION" != "success" ]; then + echo "::error::CodeQL deep scan did not pass (conclusion=$CONCLUSION)." + echo " Deployment is blocked. 
Review findings before retrying:" + echo " https://github.com/${{ github.repository }}/security/code-scanning" + exit 1 + fi + echo "✓ CodeQL gate passed — safe to deploy SHA $SHA" + # --------------------------------------------------------------------------- # JOB: validate # @@ -45,6 +111,7 @@ jobs: validate: name: Validate (typecheck + audit) runs-on: ubuntu-latest + needs: [codeql-gate] timeout-minutes: 10 steps: - name: Confirm deployment trigger @@ -58,27 +125,10 @@ jobs: - name: Checkout uses: actions/checkout@v5 - - - name: Setup Node.js 24 - uses: actions/setup-node@v5 with: - node-version: '24' - cache: npm - cache-dependency-path: package-lock.json - - - name: Install dependencies (with retry) - run: | - echo "::group::npm ci" - for attempt in 1 2 3; do - npm ci && break - [ $attempt -eq 3 ] && { echo "::error::npm ci failed after 3 attempts"; exit 1; } - echo "Attempt $attempt failed — retrying in 15s..." - sleep 15 - done - echo "::endgroup::" + ref: ${{ needs.codeql-gate.outputs.deploy_sha }} - - name: Dependency vulnerability scan - run: npm audit --omit=dev --audit-level=high + - name: Setup Node.js 24 - name: TypeScript check run: npm run typecheck @@ -102,6 +152,7 @@ jobs: test-api: name: API Tests (unit + integration) runs-on: ubuntu-latest + needs: [codeql-gate] timeout-minutes: 15 env: SUPABASE_URL: ${{ secrets.SUPABASE_URL_TEST }} @@ -110,6 +161,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v5 + with: + ref: ${{ needs.codeql-gate.outputs.deploy_sha }} - name: Setup Node.js 24 uses: actions/setup-node@v5 @@ -152,22 +205,29 @@ jobs: build-scan-push: name: Build, Scan & Push Docker Image runs-on: ubuntu-latest - needs: [validate, test-api] + needs: [codeql-gate, validate, test-api] timeout-minutes: 25 permissions: contents: read packages: write security-events: write outputs: - sha_short: ${{ steps.meta.outputs.sha_short }} - digest: ${{ steps.digest.outputs.digest }} + sha_short: ${{ steps.meta.outputs.sha_short }} + digest: ${{ 
steps.digest.outputs.digest }} +      deploy_sha: ${{ steps.meta.outputs.deploy_sha }} steps: - name: Checkout uses: actions/checkout@v5 + with: + ref: ${{ needs.codeql-gate.outputs.deploy_sha }} - name: Extract commit SHA id: meta - run: echo "sha_short=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" + env: + DEPLOY_SHA: ${{ needs.codeql-gate.outputs.deploy_sha }} + run: | + echo "sha_short=${DEPLOY_SHA::7}" >> "$GITHUB_OUTPUT" + echo "deploy_sha=$DEPLOY_SHA" >> "$GITHUB_OUTPUT" - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -489,17 +549,59 @@ echo "| Registry | ghcr.io/${{ github.repository_owner }}/api |" } >> "$GITHUB_STEP_SUMMARY" + # --------------------------------------------------------------------------- + # JOB: vps-readiness-check + # + # Validates the VPS is in a deployable state BEFORE running the deploy. + # Runs AFTER build-scan-push and BEFORE deploy (deploy needs both jobs). + # Both must succeed before deploy is allowed to proceed. + # + # Delegates to scripts/vps-readiness-check.sh which checks: + # - Docker daemon running + # - api_network exists (auto-created if missing) + # - Ports 80/443 free from non-nginx processes + # - No API containers with host port bindings + # - Required .env file present + # - Runtime directories present (auto-created if missing) + # - Sufficient disk space (auto-prunes if borderline) + # --------------------------------------------------------------------------- + vps-readiness-check: + name: VPS Readiness Gate + runs-on: ubuntu-latest + needs: [build-scan-push] + timeout-minutes: 10 + steps: + - name: Run VPS readiness check via SSH + uses: appleboy/ssh-action@v1.0.3 + with: + host: ${{ secrets.DO_HOST }} + username: ${{ secrets.DO_USER }} + key: ${{ secrets.DO_SSH_KEY }} + script: | + set -euo pipefail + export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" + [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT — run vps-setup.sh first"; exit 1; } + cd "$DEPLOY_ROOT" + # Pull latest 
scripts without full deploy + git fetch origin master --depth=1 + git checkout origin/master -- scripts/vps-readiness-check.sh 2>/dev/null || true + chmod +x scripts/vps-readiness-check.sh + ./scripts/vps-readiness-check.sh + # --------------------------------------------------------------------------- # JOB: deploy # # Blue-Green deployment to VPS via SSH. # The deploy-bluegreen.sh script manages slot switching and container health. + # + # DEPENDENCY GATES (both must pass): + # - build-scan-push (image ready) and vps-readiness-check (VPS can accept the deployment) # --------------------------------------------------------------------------- deploy: name: Deploy (Blue-Green SSH) runs-on: ubuntu-latest - needs: [build-scan-push] - timeout-minutes: 15 + needs: [build-scan-push, vps-readiness-check] + timeout-minutes: 20 steps: - name: Validate required deployment secrets env: @@ -548,8 +650,10 @@ ls -la "$HOME/api" [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" + # Pin repo to the exact SHA that was built and scanned by CodeQL. + # Prevents stale scripts from running if concurrent commits landed. git fetch origin - git reset --hard origin/master + git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }} chmod +x scripts/*.sh echo "=== Pre-deploy environment validation ===" ./scripts/validate-env.sh --check-monitoring @@ -573,6 +677,10 @@ ls -la "$DEPLOY_ROOT" [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" + # Enforce repo is at the exact SHA being deployed (issue 7 — prevents + # stale deploy scripts if another commit landed during this pipeline run). 
+ git fetch origin + git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }} chmod +x scripts/*.sh # Environment already validated in previous step ./scripts/deploy-bluegreen.sh "${{ needs.build-scan-push.outputs.sha_short }}" @@ -587,14 +695,24 @@ jobs: key: ${{ secrets.DO_SSH_KEY }} script: | ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "unknown") + ACTIVE_CONTAINER="api-${ACTIVE_SLOT}" DEPLOY_STATUS="UNKNOWN" - - # Check if health endpoint is responding (good sign of successful deploy) - if timeout 5 curl -sf http://127.0.0.1:3000/health >/dev/null 2>&1; then - DEPLOY_STATUS="SUCCESS" + + # Health check via docker exec — NO host port binding required. + # api containers live only on api_network; localhost:3000 here means + # the container's own loopback (executed via docker exec). + if docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1; then + if docker exec "$ACTIVE_CONTAINER" \ + curl -sf --max-time 5 "http://localhost:3000/health" >/dev/null 2>&1; then + DEPLOY_STATUS="SUCCESS" + else + DEPLOY_STATUS="UNHEALTHY" + fi + else + DEPLOY_STATUS="CONTAINER_MISSING" fi - - echo "DEPLOY_STATE=$DEPLOY_STATUS| SLOT=$ACTIVE_SLOT | SHA=${{ github.sha }}" + + echo "DEPLOY_STATE=$DEPLOY_STATUS | SLOT=$ACTIVE_SLOT | CONTAINER=$ACTIVE_CONTAINER | SHA=${{ github.sha }}" # --------------------------------------------------------------------------- # JOB: api-health-gate (Step E+) @@ -923,11 +1041,12 @@ jobs: rollback: name: Rollback Deployment (auto) runs-on: ubuntu-latest - needs: [deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke] + needs: [vps-readiness-check, deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke] timeout-minutes: 10 if: | always() && ( + needs.vps-readiness-check.result == 'failure' || needs.deploy.result == 'failure' || needs.api-health-gate.result == 'failure' || needs.sync-infra.result == 'failure' || @@ -938,6 +1057,7 @@ jobs: - name: Log rollback trigger run: | echo 
"ROLLBACK_TRIGGERED=TRUE | FAILED_JOBS:" + [ "${{ needs.vps-readiness-check.result }}" = "failure" ] && echo " - vps-readiness-check" [ "${{ needs.deploy.result }}" = "failure" ] && echo " - deploy" [ "${{ needs.api-health-gate.result }}" = "failure" ] && echo " - api-health-gate" [ "${{ needs.sync-infra.result }}" = "failure" ] && echo " - sync-infra" diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 1f82df4..89ebacb 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -5,8 +5,10 @@ on: branches: - master +# Cancel stale runs for the same PR when new commits are pushed. +# Uses workflow+ref so different PRs get independent concurrency groups. concurrency: - group: pr-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true permissions: @@ -155,9 +157,13 @@ jobs: - name: Container bootstrap validation if: needs.detect-changes.outputs.api == 'true' run: | + # NO host port bindings — container runs on an isolated Docker bridge + # network. All checks use docker exec to reach the container directly, + # matching the production pattern (api_network / Docker DNS). + docker network create ci_api_net docker run -d \ --name api-ci-test \ - -p 127.0.0.1:3001:3000 \ + --network ci_api_net \ -e CONFIG_VERSION \ -e APP_ENV \ -e NODE_ENV \ @@ -188,7 +194,8 @@ jobs: STATUS="000" for i in $(seq 1 12); do - STATUS=$(curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:3001/health || echo "000") + STATUS=$(docker exec api-ci-test \ + curl -s -o /dev/null -w "%{http_code}" http://localhost:3000/health 2>/dev/null || echo "000") if [ "$STATUS" = "200" ]; then break; fi echo "Health check attempt $i: HTTP $STATUS — waiting..." 
sleep 2 @@ -197,21 +204,29 @@ jobs: if [ "$STATUS" != "200" ]; then echo "❌ /health returned HTTP $STATUS after 24 s (expected 200)" docker logs api-ci-test --tail 50 + docker rm -f api-ci-test || true + docker network rm ci_api_net || true exit 1 fi + echo "✓ /health returned 200" + # Smoke tests: admin endpoints must reject unauthenticated requests with 401 for ENDPOINT in /admin/audit-log /admin/webhook-dlq; do - ECODE=$(curl -s -o /dev/null -w "%{http_code}" "http://127.0.0.1:3001${ENDPOINT}" || echo "000") + ECODE=$(docker exec api-ci-test \ + curl -s -o /dev/null -w "%{http_code}" "http://localhost:3000${ENDPOINT}" 2>/dev/null || echo "000") if [ "$ECODE" != "401" ]; then echo "❌ ${ENDPOINT} expected 401 (unauthenticated), got ${ECODE}" docker logs api-ci-test --tail 50 + docker rm -f api-ci-test || true + docker network rm ci_api_net || true exit 1 fi echo "✓ ${ENDPOINT} → 401 (auth guard verified)" done docker rm -f api-ci-test + docker network rm ci_api_net docker rmi fieldtrack-api:ci-validation infra-ci: @@ -260,4 +275,54 @@ jobs: docker run --rm \ -v /tmp/nginx.conf:/etc/nginx/conf.d/default.conf:ro \ -v /tmp/ssl:/etc/ssl/api:ro \ - nginx:1.27-alpine nginx -t \ No newline at end of file + nginx:1.27-alpine nginx -t + + # --------------------------------------------------------------------------- + # JOB: codeql-lite + # + # Lightweight CodeQL security scan — runs in PARALLEL with api-ci and infra-ci. + # Uses security-extended queries (OWASP Top-10 class) for fast PR feedback. + # This job is REQUIRED in branch protection; PRs cannot merge until it passes. + # + # Job name "codeql-lite" is the required status check identifier. 
+ # Branch protection setting: "PR Validation / codeql-lite" + # --------------------------------------------------------------------------- + codeql-lite: + name: CodeQL Lite (Security Scan) + runs-on: ubuntu-latest + timeout-minutes: 15 + permissions: + actions: read + contents: read + security-events: write + strategy: + fail-fast: false + matrix: + language: ["javascript"] + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Setup Node.js (match production) + uses: actions/setup-node@v5 + with: + node-version: 24 + cache: npm + cache-dependency-path: package-lock.json + + - name: Install dependencies + run: npm ci + + - name: Build API (enables data-flow tracing) + run: npm run build || true + + - name: Initialize CodeQL + uses: github/codeql-action/init@v4 + with: + languages: ${{ matrix.language }} + queries: security-extended + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v4 + with: + category: "codeql-lite" \ No newline at end of file diff --git a/docs/OBSERVABILITY_ARCHITECTURE.md b/docs/OBSERVABILITY_ARCHITECTURE.md index 8404eef..2edb05a 100644 --- a/docs/OBSERVABILITY_ARCHITECTURE.md +++ b/docs/OBSERVABILITY_ARCHITECTURE.md @@ -286,10 +286,11 @@ Nginx references LetsEncrypt certificates at `/etc/letsencrypt/live/ infra/nginx/live/api.conf + # nginx runs in Docker — reload via docker exec (no host nginx service): + docker exec nginx nginx -t && docker exec nginx nginx -s reload ``` 5. Enable auto-renewal (Certbot installs a systemd timer automatically on Ubuntu): diff --git a/infra/docker-compose.monitoring.yml b/infra/docker-compose.monitoring.yml index 133fef6..fdbb69d 100644 --- a/infra/docker-compose.monitoring.yml +++ b/infra/docker-compose.monitoring.yml @@ -224,9 +224,15 @@ services: networks: - api_network + # nginx can start as soon as the grafana *container* exists. 
+ # Waiting for service_healthy would create a blocking chain: + # nginx → grafana → prometheus → alertmanager + # which delays the ingress layer on fresh deployments by minutes. + # nginx uses deferred Docker DNS ($api_backend variable + resolver 127.0.0.11) + # so it starts cleanly before any backend container is ready. depends_on: grafana: - condition: service_healthy + condition: service_started deploy: resources: diff --git a/scripts/deploy-bluegreen.sh b/scripts/deploy-bluegreen.sh index ba28df1..2e6a7c4 100644 --- a/scripts/deploy-bluegreen.sh +++ b/scripts/deploy-bluegreen.sh @@ -426,13 +426,57 @@ chmod 600 "$DEPLOY_ROOT/infra/.env.monitoring" 2>/dev/null || true _ft_log "msg='env contract validated'" +# Ensure api_network exists (idempotent). All containers MUST be on this network. +docker network create --driver bridge "$NETWORK" 2>/dev/null \ + && _ft_log "msg='api_network created'" \ + || _ft_log "msg='api_network already exists'" + # NGINX CONTAINER GUARD -- nginx MUST run as a Docker container on api_network. # With container-name upstreams (server api-blue:3000), Docker's embedded DNS # (127.0.0.11) is required for name resolution. This only works from WITHIN # Docker containers on the same network -- not from a host systemd nginx service. +# +# BOOTSTRAP MODE: If nginx is missing, start it via docker compose --no-deps so +# the monitoring dependency chain (nginx→grafana→prometheus→alertmanager) does +# NOT block a first-deploy. nginx starts immediately; monitoring catches up. if ! docker inspect nginx >/dev/null 2>&1; then - _ft_log "level=ERROR msg='nginx container not found -- nginx must run as Docker container on api_network. 
Run: docker compose --env-file infra/.env.monitoring -f infra/docker-compose.monitoring.yml up -d nginx'" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_container_missing" + _ft_log "msg='nginx container missing — bootstrapping via docker compose --no-deps'" + mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" + # Write a bootstrap config pointing at api-blue (default first-deploy slot) + # so nginx can start without waiting for an API container. + if [ ! -f "$NGINX_CONF" ]; then + sed \ + -e "s|__ACTIVE_CONTAINER__|api-blue|g" \ + -e "s|__API_HOSTNAME__|${API_HOSTNAME}|g" \ + "$NGINX_TEMPLATE" > "$NGINX_CONF" + _ft_log "msg='bootstrap nginx config written' target=api-blue path=$NGINX_CONF" + fi + # Kill any ghost docker-proxy holding host ports before starting nginx + pkill docker-proxy 2>/dev/null || true + cd "$DEPLOY_ROOT/infra" + if ! docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml \ + up -d --no-deps nginx 2>&1 | tee -a "$DEPLOY_LOG_FILE" >&2; then + _ft_log "level=ERROR msg='docker compose up --no-deps nginx failed'" + cd "$DEPLOY_ROOT" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_bootstrap_compose_failed" + fi + cd "$DEPLOY_ROOT" + # Wait up to 30 s for the nginx container to become available + _NGINX_STARTED=false + for _ni in $(seq 1 10); do + if docker inspect nginx >/dev/null 2>&1; then + _ft_log "msg='nginx bootstrap complete' attempt=$_ni" + _NGINX_STARTED=true + break + fi + _ft_log "msg='waiting for nginx container' attempt=$_ni/10" + sleep 3 + done + if [ "$_NGINX_STARTED" != "true" ]; then + _ft_log "level=ERROR msg='nginx container failed to start after bootstrap'" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_bootstrap_timeout" + fi + unset _NGINX_STARTED _ni fi _NGINX_NETWORK=$(docker inspect nginx --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") if ! 
echo "$_NGINX_NETWORK" | grep -q "$NETWORK"; then @@ -479,6 +523,87 @@ _ft_state "PULL_IMAGE" "msg='pulling container image' sha=$IMAGE_SHA" timeout 300 docker pull "$IMAGE" _ft_log "msg='image pulled' image=$IMAGE" +# --------------------------------------------------------------------------- +# BOOTSTRAP GUARD -- no API containers exist (first deploy or full restart) +# +# When no api-blue or api-green containers are present, the normal slot +# recovery path works but is implicit. This guard makes first-deploy +# explicit: start api-blue directly, wait for readiness, write nginx config, +# write slot file, and exit cleanly with BOOTSTRAP_SUCCESS. +# +# WHY THIS IS NECESSARY: +# - nginx starts (via the guard above) with bootstrap config pointing at api-blue +# - Without this guard, nginx is serving 502 until the normal START_INACTIVE +# path eventually starts api-blue. This can be 30-60s of errors. +# - Explicit bootstrap gives a deterministic, logged, traceable first-deploy. +# +# SKIPPED when any api container already exists (normal redeploy path). +# --------------------------------------------------------------------------- +if ! 
docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then + _ft_state "BOOTSTRAP" "msg='no api containers found — first deploy, starting api-blue directly'" + + # Remove stale container if left in a stopped state somehow + docker rm -f api-blue 2>/dev/null || true + + timeout 60 docker run -d \ + --name api-blue \ + --network "$NETWORK" \ + --restart unless-stopped \ + --label "api.sha=$IMAGE_SHA" \ + --label "api.slot=blue" \ + --label "api.deploy_id=$DEPLOY_ID" \ + --env-file "$ENV_FILE" \ + "$IMAGE" + + _ft_log "msg='bootstrap: api-blue started' image=$IMAGE" + + # Wait for /ready — same polling logic as [4/7] HEALTH_CHECK_INTERNAL. + # NOTE: probe via docker exec — this script runs on the VPS host, where + # api-blue is NOT resolvable (Docker DNS is container-side only) and the + # container binds no host ports. + _BOOT_OK=false + for _bi in $(seq 1 20); do + if timeout 4 docker exec api-blue curl -sf "http://localhost:${APP_PORT}/ready" >/dev/null 2>&1; then + _ft_log "msg='bootstrap: api-blue ready' attempt=$_bi" + _BOOT_OK=true + break + fi + _ft_log "msg='bootstrap: waiting for api-blue readiness' attempt=$_bi/20" + sleep 3 + done + + if [ "$_BOOT_OK" != "true" ]; then + _ft_log "level=ERROR msg='bootstrap: api-blue did not become ready after 60s'" + docker logs api-blue --tail 50 >&2 || true + docker stop --time 10 api-blue 2>/dev/null || true + docker rm api-blue || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=bootstrap_api_ready_timeout" + fi + unset _bi _BOOT_OK + + # Write nginx config pointing at api-blue (same sed logic as SWITCH_NGINX) + mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" + NGINX_BOOT_TMP="$(mktemp /tmp/api-nginx-boot.XXXXXX.conf)" + sed \ + -e "s|__ACTIVE_CONTAINER__|api-blue|g" \ + -e "s|__API_HOSTNAME__|${API_HOSTNAME}|g" \ + "$NGINX_TEMPLATE" > "$NGINX_BOOT_TMP" + cp "$NGINX_BOOT_TMP" "$NGINX_CONF" + rm -f "$NGINX_BOOT_TMP" + + if docker exec nginx nginx -t 2>&1; then + docker exec nginx nginx -s reload + _ft_log "msg='bootstrap: nginx reloaded to api-blue'" + else + _ft_log "level=ERROR msg='bootstrap: nginx config test failed — leaving existing config'" + fi + + # Persist slot state + _ft_write_slot "blue" + + # 
Snapshot last-known-good + printf 'blue\n%s\n%s\n' "$IMAGE_SHA" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" > "$LAST_GOOD_FILE" + + _ft_exit 0 "DEPLOY_SUCCESS" "reason=bootstrap_success slot=blue image=$IMAGE" +fi + # --------------------------------------------------------------------------- # [2/7] RESOLVE ACTIVE SLOT (with recovery) # --------------------------------------------------------------------------- @@ -501,14 +626,6 @@ fi _ft_log "msg='slot resolved' active=$ACTIVE active_name=$ACTIVE_NAME inactive=$INACTIVE inactive_name=$INACTIVE_NAME" -# --------------------------------------------------------------------------- -# INITIAL DEPLOYMENT DETECTION -- no containers exist yet -# --------------------------------------------------------------------------- -if ! docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then - _ft_log "msg='initial deployment detected — no existing containers'" - INITIAL_DEPLOY=true -fi - # --------------------------------------------------------------------------- # ACTIVE CONTAINER EXISTENCE GUARD # Protect against race: active slot file says "blue" but container doesn't exist. @@ -658,9 +775,11 @@ _ft_state "SWITCH_NGINX" "msg='switching nginx upstream' container=$INACTIVE_NAM # switching nginx (complements the jitter already in the health check loop). sleep 2 -# Backup goes to /etc/nginx/ (NOT sites-enabled/) so nginx does not parse it -# during validation and trigger a duplicate-upstream error. -NGINX_BACKUP="/etc/nginx/api.conf.bak.$(date +%s)" +# Backup stored in NGINX_BACKUP_DIR (under the repo) — consistent with the +# pruning logic below. Avoids creating files in /etc/nginx/ (host-side) +# which is not guaranteed to exist when nginx runs only inside Docker. 
+mkdir -p "$NGINX_BACKUP_DIR" +NGINX_BACKUP="$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" NGINX_TMP="$(mktemp /tmp/api-nginx.XXXXXX.conf)" # PRE-RELOAD GATE: confirm container is still ready before pointing nginx at it diff --git a/scripts/monitoring-sync.sh b/scripts/monitoring-sync.sh index dea17d6..4273f0a 100644 --- a/scripts/monitoring-sync.sh +++ b/scripts/monitoring-sync.sh @@ -223,17 +223,22 @@ _wait_container_healthy() { } _check_endpoint() { + # Execute the health check INSIDE the container via docker exec. + # Monitoring containers live only on api_network and are NOT reachable via + # host-side DNS — their names (prometheus, alertmanager, grafana) only + # resolve from other containers on the same Docker network. + # Prefer wget (present in prom/* alpine images); fall back to curl (grafana). local name="$1" local url="$2" - local status - status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url" 2>/dev/null || echo "000") - - if [ "$status" = "200" ]; then - _log "msg='endpoint healthy' container=$name url=$url status=200" + if docker exec "$name" wget --spider -q "$url" >/dev/null 2>&1; then + _log "msg='endpoint healthy' container=$name url=$url" + return 0 + elif docker exec "$name" curl -sf --max-time 5 "$url" >/dev/null 2>&1; then + _log "msg='endpoint healthy (curl)' container=$name url=$url" return 0 else - _log "level=ERROR msg='endpoint unhealthy' container=$name url=$url status=$status" + _log "level=ERROR msg='endpoint unhealthy' container=$name url=$url" return 1 fi } @@ -309,7 +314,9 @@ done # Query the Prometheus API to verify targets are UP. # --------------------------------------------------------------------------- _log "msg='validating prometheus scraping targets'" -PROM_TARGETS=$(curl -s "http://prometheus:9090/api/v1/targets" 2>/dev/null || echo "") +# Use docker exec to query the Prometheus API from inside the container. +# The prometheus container name is only resolvable within api_network, not from the host. 
+PROM_TARGETS=$(docker exec prometheus wget -qO- "http://localhost:9090/api/v1/targets" 2>/dev/null || echo "") if [ -z "$PROM_TARGETS" ]; then _log "level=WARN msg='prometheus API query failed — cannot validate scraping (proceeding with caution)'" diff --git a/scripts/vps-readiness-check.sh b/scripts/vps-readiness-check.sh new file mode 100644 index 0000000..cbf3750 --- /dev/null +++ b/scripts/vps-readiness-check.sh @@ -0,0 +1,266 @@ +#!/bin/bash +# ============================================================================ +# FieldTrack API — VPS Readiness Check +# ============================================================================ +# +# Validates VPS state before a blue-green deployment is allowed to proceed. +# Invoked by the vps-readiness-check job in deploy.yml via SSH. +# +# SAFE AUTO-FIXES (non-destructive): +# - Creates api_network if missing +# - Creates missing deploy-time directories +# - Auto-prunes docker images if disk is low +# +# HARD FAILURES (exit 1): +# - Docker daemon not running +# - Ports 80 or 443 occupied by ANY non-docker-proxy, non-nginx process +# - Any container has host port bindings (violates production architecture) +# - Required containers not attached to api_network +# - Required .env file missing +# - DEPLOY_ROOT does not exist +# +# USAGE: +# Called automatically by deploy.yml. 
+# Can be run manually: bash scripts/vps-readiness-check.sh
+#
+# EXIT CODES:
+#   0 — VPS is ready (all checks passed, auto-fixes applied as needed)
+#   1 — VPS is NOT ready (hard failure, deployment must not proceed)
+#
+# ============================================================================
+
+set -euo pipefail
+
+DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
+NETWORK="api_network"
+RUNTIME_DIR="/var/run/api"
+LOG_DIR="/var/log/api"
+
+# ── Colour helpers ─────────────────────────────────────────────────────────────
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m'
+ok()   { echo -e "${GREEN}[✓]${NC} $1"; }
+warn() { echo -e "${YELLOW}[!]${NC} $1"; }
+fail() { echo -e "${RED}[✗]${NC} $1"; exit 1; }
+
+FAILURES=0
+record_failure() { echo -e "${RED}[FAIL]${NC} $1"; FAILURES=$((FAILURES + 1)); }
+
+echo ""
+echo "============================================="
+echo "  VPS Readiness Check"
+echo "  $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
+echo "============================================="
+echo ""
+
+# ── CHECK 1: DEPLOY_ROOT exists ────────────────────────────────────────────────
+echo "--- CHECK 1: Deploy root directory ---"
+if [ ! -d "$DEPLOY_ROOT" ]; then
+  fail "DEPLOY_ROOT not found: $DEPLOY_ROOT — VPS may not be provisioned. Run vps-setup.sh first."
+fi
+ok "DEPLOY_ROOT exists: $DEPLOY_ROOT"
+
+# ── CHECK 2: Docker daemon running ─────────────────────────────────────────────
+echo ""
+echo "--- CHECK 2: Docker daemon ---"
+if ! docker info >/dev/null 2>&1; then
+  record_failure "Docker daemon is not running."
+  echo "  Attempting to start Docker..."
+  if sudo systemctl start docker 2>/dev/null && sleep 3 && docker info >/dev/null 2>&1; then
+    ok "Docker started successfully."
+  else
+    fail "Docker daemon could not be started. VPS is not ready."
+  fi
+else
+  ok "Docker daemon is running."
+fi
+
+# ── CHECK 3: api_network exists (auto-fix: create if missing) ──────────────────
+echo ""
+echo "--- CHECK 3: Docker network '$NETWORK' ---"
+if ! docker network ls --format '{{.Name}}' | grep -Eq "^${NETWORK}$"; then
+  warn "Network '$NETWORK' not found — creating it."
+  docker network create --driver bridge "$NETWORK"
+  ok "Network '$NETWORK' created."
+else
+  ok "Network '$NETWORK' exists."
+fi
+
+# ── CHECK 4: Ports 80 and 443 — no non-docker processes ──────────────────────
+#
+# Design: we do NOT auto-kill unknown processes. If port 80 or 443 is held by
+# a non-docker process (e.g., system nginx, apache, lighttpd), that is a VPS
+# configuration error that requires operator action. Silently killing unknown
+# processes risks breaking the system in unpredictable ways.
+#
+# Allowed occupants (hard-coded safe list):
+#   - docker-proxy (managed by Docker / our nginx container)
+#   - nginx (running as Docker container — docker exec nginx)
+#
+# Everything else → hard fail with diagnostics.
+echo ""
+echo "--- CHECK 4: Port 80/443 — no non-docker processes ---"
+_check_port() {
+  local port="$1"
+
+  # Check if anything is listening on the port at all
+  if ! ss -tlnp "sport = :${port}" 2>/dev/null | grep -q 'LISTEN'; then
+    ok "Port $port is free."
+    return 0
+  fi
+
+  # Check for non-docker-proxy, non-nginx processes via lsof
+  # lsof -i :PORT lists ALL processes holding the port.
+  # NOTE: lsof truncates COMMAND to 9 chars by default — "docker-proxy" prints as "docker-pr".
+  NON_DOCKER=$(sudo lsof -i ":${port}" -sTCP:LISTEN -P -n 2>/dev/null \
+    | awk 'NR>1 {print $1, $2}' \
+    | grep -vE '^(docker-pr|nginx)' || true)
+
+  if [ -n "$NON_DOCKER" ]; then
+    record_failure "Port $port is occupied by a non-docker process."
+    echo "  Offending process(es):"
+    sudo lsof -i ":${port}" -sTCP:LISTEN -P -n 2>/dev/null | awk 'NR>1' | sed 's/^/    /'
+    echo "  This is a VPS configuration error. Stop the conflicting service before deploying."
+    echo "  Example: sudo systemctl stop nginx OR sudo systemctl stop apache2"
+    return 1
+  fi
+
+  ok "Port $port is held by docker-proxy/nginx (expected)."
+  return 0
+}
+
+_check_port 80 || true   # '|| true': under set -e a failed check must not abort the remaining checks
+_check_port 443 || true  # the failure is already counted via record_failure inside _check_port
+
+# ── CHECK 5: No host port bindings on ANY container ────────────────────────────
+#
+# Production architecture invariant: NO container may bind host ports.
+# All inter-service communication uses Docker DNS on api_network.
+# A host port binding on any container indicates a misconfigured container
+# that could expose services unintentionally or break Docker DNS routing.
+echo ""
+echo "--- CHECK 5: Global host port binding invariant ---"
+BOUND=$(docker ps --format '{{.Names}} {{.Ports}}' 2>/dev/null \
+  | grep -E '(0\.0\.0\.0:|127\.0\.0\.1:)[0-9]+->' || true)
+
+if [ -n "$BOUND" ]; then
+  record_failure "Host port bindings detected — violates production architecture:"
+  echo "$BOUND" | sed 's/^/    /'
+  echo "  Production pattern: all containers run --network api_network without -p."
+  echo "  Remove and recreate the offending container(s) without port bindings."
+else
+  ok "No host port bindings on any running container."
+fi
+
+# ── CHECK 6: Required env files ────────────────────────────────────────────────
+echo ""
+echo "--- CHECK 6: Required environment files ---"
+cd "$DEPLOY_ROOT"
+
+REQUIRED_ENV_FILES=(
+  ".env"
+)
+
+for f in "${REQUIRED_ENV_FILES[@]}"; do
+  if [ ! -f "$DEPLOY_ROOT/$f" ]; then
+    record_failure "Required env file missing: $DEPLOY_ROOT/$f"
+    echo "  This file must be created on the VPS before deployment."
+    echo "  See docs/env-contract.md for required variables."
+  else
+    ok "Env file present: $f"
+  fi
+done
+
+# .env.monitoring is optional (monitoring-sync.sh self-heals from example)
+if [ ! -f "$DEPLOY_ROOT/.env.monitoring" ]; then
+  warn ".env.monitoring not found — monitoring-sync.sh will create it from example during deploy."
+fi
+
+# ── CHECK 7: Runtime state directories ─────────────────────────────────────────
+echo ""
+echo "--- CHECK 7: Runtime directories ---"
+
+for dir in "$RUNTIME_DIR" "$LOG_DIR"; do
+  if [ ! -d "$dir" ]; then
+    warn "Runtime directory missing: $dir — creating it."
+    install -d -m 750 "$dir" 2>/dev/null || sudo install -d -m 750 "$dir"
+    ok "Created: $dir"
+  else
+    ok "Directory exists: $dir"
+  fi
+done
+
+# ── CHECK 8: Nginx live config directory ───────────────────────────────────────
+echo ""
+echo "--- CHECK 8: Nginx live config directory ---"
+NGINX_LIVE_DIR="$DEPLOY_ROOT/infra/nginx/live"
+NGINX_BACKUP_DIR="$DEPLOY_ROOT/infra/nginx/backup"
+
+for dir in "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR"; do
+  if [ ! -d "$dir" ]; then
+    warn "Nginx directory missing: $dir — creating it."
+    mkdir -p "$dir"
+    ok "Created: $dir"
+  else
+    ok "Directory exists: $dir"
+  fi
+done
+
+# ── CHECK 9: Network attachment for expected containers ────────────────────────
+#
+# If nginx, prometheus, grafana, or alertmanager are running, they MUST be
+# attached to api_network. If they're not, Docker DNS resolution will fail
+# and api-blue/api-green will be unreachable by name.
+echo ""
+echo "--- CHECK 9: Network attachment enforcement ---"
+NETWORK_REQUIRED=(nginx prometheus grafana alertmanager)
+for c in "${NETWORK_REQUIRED[@]}"; do
+  if docker inspect "$c" >/dev/null 2>&1; then
+    if ! docker inspect "$c" --format '{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' \
+        2>/dev/null | grep -q 'api_network'; then
+      record_failure "Container '$c' is running but NOT attached to api_network."
+      echo "  Docker DNS (container name resolution) requires api_network attachment."
+      echo "  Fix: docker network connect api_network $c"
+    else
+      ok "$c is attached to api_network."
+    fi
+  else
+    ok "$c not running — skipping network check."
+  fi
+done
+
+# ── CHECK 10: Disk space (warn if < 2GB free) ──────────────────────────────────
+echo ""
+echo "--- CHECK 10: Disk space ---"
+FREE_KB=$(df -k / | awk 'NR==2 {print $4}')
+FREE_GB=$(awk "BEGIN {printf \"%.1f\", $FREE_KB/1024/1024}")
+if [ "$FREE_KB" -lt 2097152 ]; then
+  warn "Low disk space: ${FREE_GB}GB free (< 2GB). Pruning unused Docker images."
+  docker image prune -f --filter "until=48h" >/dev/null 2>&1 || true
+  FREE_KB_AFTER=$(df -k / | awk 'NR==2 {print $4}')
+  FREE_GB_AFTER=$(awk "BEGIN {printf \"%.1f\", $FREE_KB_AFTER/1024/1024}")
+  ok "After prune: ${FREE_GB_AFTER}GB free."
+  if [ "$FREE_KB_AFTER" -lt 1048576 ]; then
+    record_failure "Critically low disk space: ${FREE_GB_AFTER}GB free after prune. Cannot deploy safely."
+  fi
+else
+  ok "Disk space OK: ${FREE_GB}GB free."
+fi
+
+# ── FINAL RESULT ───────────────────────────────────────────────────────────────
+echo ""
+echo "============================================="
+if [ "$FAILURES" -eq 0 ]; then
+  echo -e "${GREEN}  VPS READY — all checks passed${NC}"
+  echo "============================================="
+  echo ""
+  exit 0
+else
+  echo -e "${RED}  VPS NOT READY — $FAILURES check(s) failed${NC}"
+  echo "  Deployment must not proceed."
+  echo "============================================="
+  echo ""
+  exit 1
+fi
diff --git a/scripts/vps-setup.sh b/scripts/vps-setup.sh
index 091121f..1df411a 100644
--- a/scripts/vps-setup.sh
+++ b/scripts/vps-setup.sh
@@ -393,41 +393,99 @@ fi
 # ============================================================================
 log "Phase 14: Starting monitoring stack..."
 
-# Stop system nginx — Docker nginx in the monitoring stack takes over ports 80/443.
-# System nginx is no longer needed after cert acquisition; Docker nginx handles
-# ACME challenge renewal via /var/www/certbot mount.
+# Stop system nginx — Docker nginx takes over ports 80/443 from this point.
+# System nginx is no longer needed after cert acquisition; the Docker nginx
+# container handles ACME challenge renewal via the /var/www/certbot mount.
 log "Phase 14a: Stopping system nginx (Docker nginx takes over)..."
 systemctl stop nginx || true
 systemctl disable nginx || true
 log "System nginx stopped and disabled."
 
+# Kill any docker-proxy ghost processes that may be holding host ports 80/443
+# from a previous failed start. pkill is a safe no-op if no process matches.
+pkill docker-proxy 2>/dev/null || true
+
+# Ensure api_network exists before starting compose (idempotent).
+# The compose file declares it as external; Docker will NOT create it automatically.
+if ! docker network ls --format '{{.Name}}' | grep -Eq "^${NETWORK}$"; then
+  docker network create --driver bridge "$NETWORK"
+  log "Docker network '$NETWORK' created before compose."
+else
+  log "Docker network '$NETWORK' already exists."
+fi
+
+# Ensure nginx live config dir and initial config exist before starting nginx,
+# so the container can mount the directory even before the first deploy runs.
+mkdir -p "$NGINX_LIVE_DIR"
+if [ ! -f "$NGINX_SITE_LINK" ]; then
+  sed \
+    -e "s|__ACTIVE_CONTAINER__|api-blue|g" \
+    -e "s|__API_HOSTNAME__|$DOMAIN|g" \
+    "$REPO_DIR/infra/nginx/api.conf" > "$NGINX_SITE_LINK"
+  log "Bootstrap nginx config written (pointing to api-blue) at $NGINX_SITE_LINK"
+fi
+
+# Start nginx FIRST using --no-deps to avoid being blocked by the
+# grafana → prometheus → alertmanager health-check dependency chain.
+# nginx uses deferred Docker DNS resolution so it starts cleanly without
+# needing any backend container to be up.
+log "Phase 14b: Starting Docker nginx (without dependency wait)..."
 cd "$REPO_DIR/infra"
+sudo -u "$DEPLOY_USER" docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml \
+  up -d --no-deps nginx
+log "Docker nginx container started."
+
+# Now start the rest of the monitoring stack (prometheus, alertmanager, grafana, etc.).
+log "Phase 14c: Starting full monitoring stack..."
 sudo -u "$DEPLOY_USER" docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml up -d
+cd "$REPO_DIR"
 
-log "Monitoring stack started (Prometheus, Grafana, Node Exporter, Nginx)"
+log "Monitoring stack started (Prometheus, Alertmanager, Grafana, Loki, Promtail, Node Exporter, Nginx)"
 
 # ============================================================================
-# PHASE 15: First Deployment
+# PHASE 15: First Deployment (Bootstrap)
 # ============================================================================
-log "Phase 15: Pulling and starting initial backend container..."
+log "Phase 15: Starting bootstrap API container..."
+#
+# IMPORTANT: This phase uses :latest for the initial bootstrap ONLY.
+# :latest is the only available tag before any CI deploy has run.
+# After this script completes, every subsequent deploy uses a SHA-pinned
+# image (ghcr.io/fieldtrack-tech/api:<7-char-sha>) via deploy-bluegreen.sh.
+# Immutability is enforced from the first CI push onwards.
+#
+# NO HOST PORT BINDINGS — api-blue connects solely via api_network.
+# nginx routes to it via Docker DNS: server api-blue:3000.
 
-# Pull the latest image
 sudo -u "$DEPLOY_USER" docker pull ghcr.io/fieldtrack-tech/api:latest
 
-# Start the blue container as initial deployment
 if [ -f "$ENV_FILE" ] && grep -q "SUPABASE_URL=your-" "$ENV_FILE"; then
   warn "Skipping container start — .env still has placeholder values."
-  warn "After editing .env, run:"
-  warn "    cd $REPO_DIR && ./scripts/deploy-bluegreen.sh latest"
+  warn "After editing .env, push to master and let CI deploy, or run:"
+  warn "    cd $REPO_DIR && ./scripts/deploy-bluegreen.sh <sha>"
 else
+  # Remove a stale api-blue if it exists from a previous aborted attempt
+  if docker ps -a --format '{{.Names}}' | grep -Eq '^api-blue$'; then
+    docker stop --time 5 api-blue 2>/dev/null || true
+    docker rm api-blue 2>/dev/null || true
+    log "Removed stale api-blue container."
+  fi
+
+  # Start api-blue on api_network — NO -p / no host port binding.
   sudo -u "$DEPLOY_USER" docker run -d \
     --name api-blue \
     --network "$NETWORK" \
     --restart unless-stopped \
+    --label "api.slot=blue" \
+    --label "api.sha=latest-bootstrap" \
     --env-file "$ENV_FILE" \
     ghcr.io/fieldtrack-tech/api:latest
-  log "Backend container (api-blue) started (network: $NETWORK)."
+  log "Bootstrap container api-blue started (network: $NETWORK, no host ports)."
+
+  # Write the active-slot file so deploy-bluegreen.sh recovery finds it.
+  install -d -m 750 -o "$DEPLOY_USER" -g "$DEPLOY_USER" /var/run/api 2>/dev/null || true
+  sudo -u "$DEPLOY_USER" sh -c 'echo blue > /var/run/api/active-slot'  # owned by deploy user so later unprivileged deploys can rewrite it
+  log "Active slot file written: blue"
 fi
 
 # ============================================================================