diff --git a/.github/workflows/codeql-deep.yml b/.github/workflows/codeql-deep.yml index 364cc2b..a62f586 100644 --- a/.github/workflows/codeql-deep.yml +++ b/.github/workflows/codeql-deep.yml @@ -29,7 +29,10 @@ permissions: security-events: write jobs: - analyze-deep: + # Job name MUST stay "codeql-deep" — deploy.yml polls for this exact status + # check, and branch protection on master references it as: + # "CodeQL — Deep Scan (post-merge) / codeql-deep" + codeql-deep: name: Deep Analyze (CodeQL) runs-on: ubuntu-latest timeout-minutes: 40 @@ -68,9 +71,6 @@ jobs: uses: github/codeql-action/analyze@v4 with: category: "codeql-deep" - # Upload unconditionally — results land in the Security tab regardless - # of whether any alerts are found. - upload: always - name: Write deep-scan summary if: always() diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 888623f..bb21dab 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -18,8 +18,10 @@ permissions: security-events: write jobs: - analyze: - name: Analyze (CodeQL) + # Job name MUST stay "codeql-lite" — branch protection references this exact + # status check: "CodeQL — PR Scan (lightweight) / codeql-lite" + codeql-lite: + name: CodeQL Lite (PR) runs-on: ubuntu-latest timeout-minutes: 15 @@ -59,4 +61,4 @@ jobs: - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v4 with: - category: "codeql-pr" \ No newline at end of file + category: "codeql-lite" \ No newline at end of file diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 670aec5..a71b69c 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -3,7 +3,8 @@ # Production Deployment Pipeline # # Design principles: -# 1. Triggers on every push to master (no paths filter — ensures sync-beta always runs) +# 1. Triggered ONLY after CodeQL deep scan completes successfully — no polling, no race. 
+# Uses workflow_run event: deploy is event-driven, not concurrent with security scan. # 2. Runs ALL validation from scratch — no trust built on PR results alone # 3. Trivy scan runs BEFORE Docker push — vulnerable images never reach the registry # 4. target: production + build-args mirror pr.yml exactly (bit-for-bit parity) @@ -12,18 +13,33 @@ # 7. timeout-minutes on every job — hung processes never block CI indefinitely # 8. npm ci retried up to 3x — registry flakiness never kills a valid deploy # -# Parallel stages: -# validate ─┐ -# test-api ├─► build-scan-push ─► deploy ─► api-health-gate ─► sync-infra ─► sync-monitoring ─► health-and-smoke -# ┘ │ -# rollback ◄────────────┘ (on failure) +# Pipeline order: +# codeql-gate +# ├─► validate ─┐ +# └─► test-api ├─► build-scan-push ─► vps-readiness-check ─► deploy +# ┘ │ +# api-health-gate ◄────────┘ +# │ +# sync-infra ─► sync-monitoring ─► health-and-smoke +# │ +# rollback ◄──────────────────────────────┘ (on failure) name: Deploy to Production on: - push: + # Triggered ONLY when the CodeQL deep scan workflow completes on master. + # This replaces the previous push trigger + polling approach: + # - No race conditions (workflow_run fires AFTER codeql-deep finishes) + # - No API polling loops or timing-dependent checks + # - Deployment is blocked at the event level if CodeQL did not succeed + workflow_run: + workflows: ["CodeQL — Deep Scan (post-merge)"] + types: + - completed branches: - master + # Manual dispatch retained for emergency/hotfix deploys. + # The codeql-gate job enforces the conclusion check only for workflow_run. workflow_dispatch: # Never cancel an in-progress deployment — let it finish or fail cleanly. @@ -36,6 +52,56 @@ permissions: contents: read jobs: + # --------------------------------------------------------------------------- + # JOB: codeql-gate + # + # First job in every deploy run. Two responsibilities: + # + # 1. 
SECURITY GATE (workflow_run only): + # Reads github.event.workflow_run.conclusion and fails hard if CodeQL + # did not pass. This makes the event-driven guarantee explicit and + # visible in the pipeline UI. + # + # 2. SHA RESOLUTION: + # On workflow_run, github.sha = HEAD of default branch at event time, + # NOT the commit that triggered CodeQL. We must deploy exactly the SHA + # that was scanned. Exports deploy_sha = github.event.workflow_run.head_sha + # so all downstream jobs checkout and tag the correct commit. + # On workflow_dispatch, deploy_sha = github.sha (HEAD of triggered branch). + # + # All subsequent jobs that do git checkout use ref: needs.codeql-gate.outputs.deploy_sha. + # --------------------------------------------------------------------------- + codeql-gate: + name: CodeQL Security Gate + runs-on: ubuntu-latest + timeout-minutes: 5 + outputs: + deploy_sha: ${{ steps.sha.outputs.deploy_sha }} + steps: + - name: Resolve deploy SHA + id: sha + run: | + if [ "${{ github.event_name }}" = "workflow_run" ]; then + echo "deploy_sha=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT" + else + echo "deploy_sha=${{ github.sha }}" >> "$GITHUB_OUTPUT" + fi + + - name: Verify CodeQL deep scan passed + if: github.event_name == 'workflow_run' + run: | + CONCLUSION="${{ github.event.workflow_run.conclusion }}" + SHA="${{ github.event.workflow_run.head_sha }}" + echo "CodeQL deep scan conclusion : $CONCLUSION" + echo "Scanned commit SHA : $SHA" + if [ "$CONCLUSION" != "success" ]; then + echo "::error::CodeQL deep scan did not pass (conclusion=$CONCLUSION)." + echo " Deployment is blocked. 
Review findings before retrying:" + echo " https://github.com/${{ github.repository }}/security/code-scanning" + exit 1 + fi + echo "✓ CodeQL gate passed — safe to deploy SHA $SHA" + # --------------------------------------------------------------------------- # JOB: validate # @@ -45,6 +111,7 @@ jobs: validate: name: Validate (typecheck + audit) runs-on: ubuntu-latest + needs: [codeql-gate] timeout-minutes: 10 steps: - name: Confirm deployment trigger @@ -58,27 +125,10 @@ jobs: - name: Checkout uses: actions/checkout@v5 - - - name: Setup Node.js 24 - uses: actions/setup-node@v5 with: - node-version: '24' - cache: npm - cache-dependency-path: package-lock.json - - - name: Install dependencies (with retry) - run: | - echo "::group::npm ci" - for attempt in 1 2 3; do - npm ci && break - [ $attempt -eq 3 ] && { echo "::error::npm ci failed after 3 attempts"; exit 1; } - echo "Attempt $attempt failed — retrying in 15s..." - sleep 15 - done - echo "::endgroup::" + ref: ${{ needs.codeql-gate.outputs.deploy_sha }} - - name: Dependency vulnerability scan - run: npm audit --omit=dev --audit-level=high + - name: Setup Node.js 24 - name: TypeScript check run: npm run typecheck @@ -102,6 +152,7 @@ jobs: test-api: name: API Tests (unit + integration) runs-on: ubuntu-latest + needs: [codeql-gate] timeout-minutes: 15 env: SUPABASE_URL: ${{ secrets.SUPABASE_URL_TEST }} @@ -110,6 +161,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v5 + with: + ref: ${{ needs.codeql-gate.outputs.deploy_sha }} - name: Setup Node.js 24 uses: actions/setup-node@v5 @@ -152,22 +205,29 @@ jobs: build-scan-push: name: Build, Scan & Push Docker Image runs-on: ubuntu-latest - needs: [validate, test-api] + needs: [codeql-gate, validate, test-api] timeout-minutes: 25 permissions: contents: read packages: write security-events: write outputs: - sha_short: ${{ steps.meta.outputs.sha_short }} - digest: ${{ steps.digest.outputs.digest }} + sha_short: ${{ steps.meta.outputs.sha_short }} + digest: ${{ 
steps.digest.outputs.digest }} +      deploy_sha: ${{ steps.meta.outputs.deploy_sha }} steps: - name: Checkout uses: actions/checkout@v5 + with: + ref: ${{ needs.codeql-gate.outputs.deploy_sha }} - name: Extract commit SHA id: meta - run: echo "sha_short=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" + env: + DEPLOY_SHA: ${{ needs.codeql-gate.outputs.deploy_sha }} + run: | + echo "sha_short=${DEPLOY_SHA::7}" >> "$GITHUB_OUTPUT" + echo "deploy_sha=$DEPLOY_SHA" >> "$GITHUB_OUTPUT" - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -489,17 +549,59 @@ echo "| Registry | ghcr.io/${{ github.repository_owner }}/api |" } >> "$GITHUB_STEP_SUMMARY" + # --------------------------------------------------------------------------- + # JOB: vps-readiness-check + # + # Validates the VPS is in a deployable state BEFORE running the deploy. + # Runs AFTER build-scan-push and BEFORE deploy (deploy needs both jobs). + # Both must succeed before deploy is allowed to proceed. + # + # Delegates to scripts/vps-readiness-check.sh which checks: + # - Docker daemon running + # - api_network exists (auto-created if missing) + # - Ports 80/443 free from non-nginx processes + # - No API containers with host port bindings + # - Required .env file present + # - Runtime directories present (auto-created if missing) + # - Sufficient disk space (auto-prunes if borderline) + # --------------------------------------------------------------------------- + vps-readiness-check: + name: VPS Readiness Gate + runs-on: ubuntu-latest + needs: [build-scan-push] + timeout-minutes: 10 + steps: + - name: Run VPS readiness check via SSH + uses: appleboy/ssh-action@v1.0.3 + with: + host: ${{ secrets.DO_HOST }} + username: ${{ secrets.DO_USER }} + key: ${{ secrets.DO_SSH_KEY }} + script: | + set -euo pipefail + export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" + [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT — run vps-setup.sh first"; exit 1; } + cd "$DEPLOY_ROOT" + # Pull latest 
scripts without full deploy + git fetch origin master --depth=1 + git checkout origin/master -- scripts/vps-readiness-check.sh 2>/dev/null || true + chmod +x scripts/vps-readiness-check.sh + ./scripts/vps-readiness-check.sh + # --------------------------------------------------------------------------- # JOB: deploy # # Blue-Green deployment to VPS via SSH. # The deploy-bluegreen.sh script manages slot switching and container health. + # + # DEPENDENCY GATES (both must pass): + # - build-scan-push (image ready) and vps-readiness-check (VPS can accept the deployment) # --------------------------------------------------------------------------- deploy: name: Deploy (Blue-Green SSH) runs-on: ubuntu-latest - needs: [build-scan-push] - timeout-minutes: 15 + needs: [build-scan-push, vps-readiness-check] + timeout-minutes: 20 steps: - name: Validate required deployment secrets env: @@ -548,8 +650,10 @@ ls -la "$HOME/api" [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" + # Pin repo to the exact SHA that was built and scanned by CodeQL. + # Prevents stale scripts from running if concurrent commits landed. git fetch origin - git reset --hard origin/master + git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }} chmod +x scripts/*.sh echo "=== Pre-deploy environment validation ===" ./scripts/validate-env.sh --check-monitoring @@ -573,6 +677,10 @@ ls -la "$DEPLOY_ROOT" [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" + # Enforce repo is at the exact SHA being deployed (issue 7 — prevents + # stale deploy scripts if another commit landed during this pipeline run). 
+ git fetch origin + git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }} chmod +x scripts/*.sh # Environment already validated in previous step ./scripts/deploy-bluegreen.sh "${{ needs.build-scan-push.outputs.sha_short }}" @@ -587,14 +695,24 @@ jobs: key: ${{ secrets.DO_SSH_KEY }} script: | ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "unknown") + ACTIVE_CONTAINER="api-${ACTIVE_SLOT}" DEPLOY_STATUS="UNKNOWN" - - # Check if health endpoint is responding (good sign of successful deploy) - if timeout 5 curl -sf http://127.0.0.1:3000/health >/dev/null 2>&1; then - DEPLOY_STATUS="SUCCESS" + + # Health check via docker exec — NO host port binding required. + # api containers live only on api_network; localhost:3000 here means + # the container's own loopback (executed via docker exec). + if docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1; then + if docker exec "$ACTIVE_CONTAINER" \ + curl -sf --max-time 5 "http://localhost:3000/health" >/dev/null 2>&1; then + DEPLOY_STATUS="SUCCESS" + else + DEPLOY_STATUS="UNHEALTHY" + fi + else + DEPLOY_STATUS="CONTAINER_MISSING" fi - - echo "DEPLOY_STATE=$DEPLOY_STATUS| SLOT=$ACTIVE_SLOT | SHA=${{ github.sha }}" + + echo "DEPLOY_STATE=$DEPLOY_STATUS | SLOT=$ACTIVE_SLOT | CONTAINER=$ACTIVE_CONTAINER | SHA=${{ github.sha }}" # --------------------------------------------------------------------------- # JOB: api-health-gate (Step E+) @@ -923,11 +1041,12 @@ jobs: rollback: name: Rollback Deployment (auto) runs-on: ubuntu-latest - needs: [deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke] + needs: [vps-readiness-check, deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke] timeout-minutes: 10 if: | always() && ( + needs.vps-readiness-check.result == 'failure' || needs.deploy.result == 'failure' || needs.api-health-gate.result == 'failure' || needs.sync-infra.result == 'failure' || @@ -938,6 +1057,7 @@ jobs: - name: Log rollback trigger run: | echo 
"ROLLBACK_TRIGGERED=TRUE | FAILED_JOBS:" + [ "${{ needs.vps-readiness-check.result }}" = "failure" ] && echo " - vps-readiness-check" [ "${{ needs.deploy.result }}" = "failure" ] && echo " - deploy" [ "${{ needs.api-health-gate.result }}" = "failure" ] && echo " - api-health-gate" [ "${{ needs.sync-infra.result }}" = "failure" ] && echo " - sync-infra" diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 1f82df4..89ebacb 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -5,8 +5,10 @@ on: branches: - master +# Cancel stale runs for the same PR when new commits are pushed. +# Uses workflow+ref so different PRs get independent concurrency groups. concurrency: - group: pr-${{ github.event.pull_request.number }} + group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true permissions: @@ -155,9 +157,13 @@ jobs: - name: Container bootstrap validation if: needs.detect-changes.outputs.api == 'true' run: | + # NO host port bindings — container runs on an isolated Docker bridge + # network. All checks use docker exec to reach the container directly, + # matching the production pattern (api_network / Docker DNS). + docker network create ci_api_net docker run -d \ --name api-ci-test \ - -p 127.0.0.1:3001:3000 \ + --network ci_api_net \ -e CONFIG_VERSION \ -e APP_ENV \ -e NODE_ENV \ @@ -188,7 +194,8 @@ jobs: STATUS="000" for i in $(seq 1 12); do - STATUS=$(curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:3001/health || echo "000") + STATUS=$(docker exec api-ci-test \ + curl -s -o /dev/null -w "%{http_code}" http://localhost:3000/health 2>/dev/null || echo "000") if [ "$STATUS" = "200" ]; then break; fi echo "Health check attempt $i: HTTP $STATUS — waiting..." 
sleep 2 @@ -197,21 +204,29 @@ jobs: if [ "$STATUS" != "200" ]; then echo "❌ /health returned HTTP $STATUS after 24 s (expected 200)" docker logs api-ci-test --tail 50 + docker rm -f api-ci-test || true + docker network rm ci_api_net || true exit 1 fi + echo "✓ /health returned 200" + # Smoke tests: admin endpoints must reject unauthenticated requests with 401 for ENDPOINT in /admin/audit-log /admin/webhook-dlq; do - ECODE=$(curl -s -o /dev/null -w "%{http_code}" "http://127.0.0.1:3001${ENDPOINT}" || echo "000") + ECODE=$(docker exec api-ci-test \ + curl -s -o /dev/null -w "%{http_code}" "http://localhost:3000${ENDPOINT}" 2>/dev/null || echo "000") if [ "$ECODE" != "401" ]; then echo "❌ ${ENDPOINT} expected 401 (unauthenticated), got ${ECODE}" docker logs api-ci-test --tail 50 + docker rm -f api-ci-test || true + docker network rm ci_api_net || true exit 1 fi echo "✓ ${ENDPOINT} → 401 (auth guard verified)" done docker rm -f api-ci-test + docker network rm ci_api_net docker rmi fieldtrack-api:ci-validation infra-ci: @@ -260,4 +275,54 @@ jobs: docker run --rm \ -v /tmp/nginx.conf:/etc/nginx/conf.d/default.conf:ro \ -v /tmp/ssl:/etc/ssl/api:ro \ - nginx:1.27-alpine nginx -t \ No newline at end of file + nginx:1.27-alpine nginx -t + + # --------------------------------------------------------------------------- + # JOB: codeql-lite + # + # Lightweight CodeQL security scan — runs in PARALLEL with api-ci and infra-ci. + # Uses security-extended queries (OWASP Top-10 class) for fast PR feedback. + # This job is REQUIRED in branch protection; PRs cannot merge until it passes. + # + # Job name "codeql-lite" is the required status check identifier. 
+ # Branch protection setting: "PR Validation / codeql-lite" + # --------------------------------------------------------------------------- + codeql-lite: + name: CodeQL Lite (Security Scan) + runs-on: ubuntu-latest + timeout-minutes: 15 + permissions: + actions: read + contents: read + security-events: write + strategy: + fail-fast: false + matrix: + language: ["javascript"] + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Setup Node.js (match production) + uses: actions/setup-node@v5 + with: + node-version: 24 + cache: npm + cache-dependency-path: package-lock.json + + - name: Install dependencies + run: npm ci + + - name: Build API (enables data-flow tracing) + run: npm run build || true + + - name: Initialize CodeQL + uses: github/codeql-action/init@v4 + with: + languages: ${{ matrix.language }} + queries: security-extended + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v4 + with: + category: "codeql-lite" \ No newline at end of file diff --git a/docs/OBSERVABILITY_ARCHITECTURE.md b/docs/OBSERVABILITY_ARCHITECTURE.md index 8404eef..2edb05a 100644 --- a/docs/OBSERVABILITY_ARCHITECTURE.md +++ b/docs/OBSERVABILITY_ARCHITECTURE.md @@ -286,10 +286,11 @@ Nginx references LetsEncrypt certificates at `/etc/letsencrypt/live/ infra/nginx/live/api.conf + # nginx runs in Docker — reload via docker exec (no host nginx service): + docker exec nginx nginx -t && docker exec nginx nginx -s reload ``` 5. Enable auto-renewal (Certbot installs a systemd timer automatically on Ubuntu): diff --git a/infra/docker-compose.monitoring.yml b/infra/docker-compose.monitoring.yml index 133fef6..fdbb69d 100644 --- a/infra/docker-compose.monitoring.yml +++ b/infra/docker-compose.monitoring.yml @@ -224,9 +224,15 @@ services: networks: - api_network + # nginx can start as soon as the grafana *container* exists. 
+ # Waiting for service_healthy would create a blocking chain: + # nginx → grafana → prometheus → alertmanager + # which delays the ingress layer on fresh deployments by minutes. + # nginx uses deferred Docker DNS ($api_backend variable + resolver 127.0.0.11) + # so it starts cleanly before any backend container is ready. depends_on: grafana: - condition: service_healthy + condition: service_started deploy: resources: diff --git a/scripts/deploy-bluegreen.sh b/scripts/deploy-bluegreen.sh index ba28df1..2e6a7c4 100644 --- a/scripts/deploy-bluegreen.sh +++ b/scripts/deploy-bluegreen.sh @@ -426,13 +426,57 @@ chmod 600 "$DEPLOY_ROOT/infra/.env.monitoring" 2>/dev/null || true _ft_log "msg='env contract validated'" +# Ensure api_network exists (idempotent). All containers MUST be on this network. +docker network create --driver bridge "$NETWORK" 2>/dev/null \ + && _ft_log "msg='api_network created'" \ + || _ft_log "msg='api_network already exists'" + # NGINX CONTAINER GUARD -- nginx MUST run as a Docker container on api_network. # With container-name upstreams (server api-blue:3000), Docker's embedded DNS # (127.0.0.11) is required for name resolution. This only works from WITHIN # Docker containers on the same network -- not from a host systemd nginx service. +# +# BOOTSTRAP MODE: If nginx is missing, start it via docker compose --no-deps so +# the monitoring dependency chain (nginx→grafana→prometheus→alertmanager) does +# NOT block a first-deploy. nginx starts immediately; monitoring catches up. if ! docker inspect nginx >/dev/null 2>&1; then - _ft_log "level=ERROR msg='nginx container not found -- nginx must run as Docker container on api_network. 
Run: docker compose --env-file infra/.env.monitoring -f infra/docker-compose.monitoring.yml up -d nginx'" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_container_missing" + _ft_log "msg='nginx container missing — bootstrapping via docker compose --no-deps'" + mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" + # Write a bootstrap config pointing at api-blue (default first-deploy slot) + # so nginx can start without waiting for an API container. + if [ ! -f "$NGINX_CONF" ]; then + sed \ + -e "s|__ACTIVE_CONTAINER__|api-blue|g" \ + -e "s|__API_HOSTNAME__|${API_HOSTNAME}|g" \ + "$NGINX_TEMPLATE" > "$NGINX_CONF" + _ft_log "msg='bootstrap nginx config written' target=api-blue path=$NGINX_CONF" + fi + # Kill any ghost docker-proxy holding host ports before starting nginx + pkill docker-proxy 2>/dev/null || true + cd "$DEPLOY_ROOT/infra" + if ! docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml \ + up -d --no-deps nginx 2>&1 | tee -a "$DEPLOY_LOG_FILE" >&2; then + _ft_log "level=ERROR msg='docker compose up --no-deps nginx failed'" + cd "$DEPLOY_ROOT" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_bootstrap_compose_failed" + fi + cd "$DEPLOY_ROOT" + # Wait up to 30 s for the nginx container to become available + _NGINX_STARTED=false + for _ni in $(seq 1 10); do + if docker inspect nginx >/dev/null 2>&1; then + _ft_log "msg='nginx bootstrap complete' attempt=$_ni" + _NGINX_STARTED=true + break + fi + _ft_log "msg='waiting for nginx container' attempt=$_ni/10" + sleep 3 + done + if [ "$_NGINX_STARTED" != "true" ]; then + _ft_log "level=ERROR msg='nginx container failed to start after bootstrap'" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_bootstrap_timeout" + fi + unset _NGINX_STARTED _ni fi _NGINX_NETWORK=$(docker inspect nginx --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") if ! 
echo "$_NGINX_NETWORK" | grep -q "$NETWORK"; then @@ -479,6 +523,87 @@ _ft_state "PULL_IMAGE" "msg='pulling container image' sha=$IMAGE_SHA" timeout 300 docker pull "$IMAGE" _ft_log "msg='image pulled' image=$IMAGE" +# --------------------------------------------------------------------------- +# BOOTSTRAP GUARD -- no API containers exist (first deploy or full restart) +# +# When no api-blue or api-green containers are present, the normal slot +# recovery path works but is implicit. This guard makes first-deploy +# explicit: start api-blue directly, wait for readiness, write nginx config, +# write slot file, and exit cleanly with BOOTSTRAP_SUCCESS. +# +# WHY THIS IS NECESSARY: +# - nginx starts (via the guard above) with bootstrap config pointing at api-blue +# - Without this guard, nginx is serving 502 until the normal START_INACTIVE +# path eventually starts api-blue. This can be 30-60s of errors. +# - Explicit bootstrap gives a deterministic, logged, traceable first-deploy. +# +# SKIPPED when any api container already exists (normal redeploy path). +# --------------------------------------------------------------------------- +if ! 
docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then + _ft_state "BOOTSTRAP" "msg='no api containers found — first deploy, starting api-blue directly'" + + # Remove stale container if left in a stopped state somehow + docker rm -f api-blue 2>/dev/null || true + + timeout 60 docker run -d \ + --name api-blue \ + --network "$NETWORK" \ + --restart unless-stopped \ + --label "api.sha=$IMAGE_SHA" \ + --label "api.slot=blue" \ + --label "api.deploy_id=$DEPLOY_ID" \ + --env-file "$ENV_FILE" \ + "$IMAGE" + + _ft_log "msg='bootstrap: api-blue started' image=$IMAGE" + + # Wait for /ready — same polling logic as [4/7] HEALTH_CHECK_INTERNAL. + # NOTE: probe via docker exec — this script runs on the VPS host, where + # api-blue is NOT resolvable (Docker DNS is container-side only) and the + # container binds no host ports. + _BOOT_OK=false + for _bi in $(seq 1 20); do + if timeout 4 docker exec api-blue curl -sf "http://localhost:${APP_PORT}/ready" >/dev/null 2>&1; then + _ft_log "msg='bootstrap: api-blue ready' attempt=$_bi" + _BOOT_OK=true + break + fi + _ft_log "msg='bootstrap: waiting for api-blue readiness' attempt=$_bi/20" + sleep 3 + done + + if [ "$_BOOT_OK" != "true" ]; then + _ft_log "level=ERROR msg='bootstrap: api-blue did not become ready after 60s'" + docker logs api-blue --tail 50 >&2 || true + docker stop --time 10 api-blue 2>/dev/null || true + docker rm api-blue || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=bootstrap_api_ready_timeout" + fi + unset _bi _BOOT_OK + + # Write nginx config pointing at api-blue (same sed logic as SWITCH_NGINX) + mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" + NGINX_BOOT_TMP="$(mktemp /tmp/api-nginx-boot.XXXXXX.conf)" + sed \ + -e "s|__ACTIVE_CONTAINER__|api-blue|g" \ + -e "s|__API_HOSTNAME__|${API_HOSTNAME}|g" \ + "$NGINX_TEMPLATE" > "$NGINX_BOOT_TMP" + cp "$NGINX_BOOT_TMP" "$NGINX_CONF" + rm -f "$NGINX_BOOT_TMP" + + if docker exec nginx nginx -t 2>&1; then + docker exec nginx nginx -s reload + _ft_log "msg='bootstrap: nginx reloaded to api-blue'" + else + _ft_log "level=ERROR msg='bootstrap: nginx config test failed — leaving existing config'" + fi + + # Persist slot state + _ft_write_slot "blue" + + # 
Snapshot last-known-good + printf 'blue\n%s\n%s\n' "$IMAGE_SHA" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" > "$LAST_GOOD_FILE" + + _ft_exit 0 "DEPLOY_SUCCESS" "reason=bootstrap_success slot=blue image=$IMAGE" +fi + # --------------------------------------------------------------------------- # [2/7] RESOLVE ACTIVE SLOT (with recovery) # --------------------------------------------------------------------------- @@ -501,14 +626,6 @@ fi _ft_log "msg='slot resolved' active=$ACTIVE active_name=$ACTIVE_NAME inactive=$INACTIVE inactive_name=$INACTIVE_NAME" -# --------------------------------------------------------------------------- -# INITIAL DEPLOYMENT DETECTION -- no containers exist yet -# --------------------------------------------------------------------------- -if ! docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then - _ft_log "msg='initial deployment detected — no existing containers'" - INITIAL_DEPLOY=true -fi - # --------------------------------------------------------------------------- # ACTIVE CONTAINER EXISTENCE GUARD # Protect against race: active slot file says "blue" but container doesn't exist. @@ -658,9 +775,11 @@ _ft_state "SWITCH_NGINX" "msg='switching nginx upstream' container=$INACTIVE_NAM # switching nginx (complements the jitter already in the health check loop). sleep 2 -# Backup goes to /etc/nginx/ (NOT sites-enabled/) so nginx does not parse it -# during validation and trigger a duplicate-upstream error. -NGINX_BACKUP="/etc/nginx/api.conf.bak.$(date +%s)" +# Backup stored in NGINX_BACKUP_DIR (under the repo) — consistent with the +# pruning logic below. Avoids creating files in /etc/nginx/ (host-side) +# which is not guaranteed to exist when nginx runs only inside Docker. 
+mkdir -p "$NGINX_BACKUP_DIR" +NGINX_BACKUP="$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" NGINX_TMP="$(mktemp /tmp/api-nginx.XXXXXX.conf)" # PRE-RELOAD GATE: confirm container is still ready before pointing nginx at it diff --git a/scripts/monitoring-sync.sh b/scripts/monitoring-sync.sh index dea17d6..4273f0a 100644 --- a/scripts/monitoring-sync.sh +++ b/scripts/monitoring-sync.sh @@ -223,17 +223,22 @@ _wait_container_healthy() { } _check_endpoint() { + # Execute the health check INSIDE the container via docker exec. + # Monitoring containers live only on api_network and are NOT reachable via + # host-side DNS — their names (prometheus, alertmanager, grafana) only + # resolve from other containers on the same Docker network. + # Prefer wget (present in prom/* alpine images); fall back to curl (grafana). local name="$1" local url="$2" - local status - status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url" 2>/dev/null || echo "000") - - if [ "$status" = "200" ]; then - _log "msg='endpoint healthy' container=$name url=$url status=200" + if docker exec "$name" wget --spider -q "$url" >/dev/null 2>&1; then + _log "msg='endpoint healthy' container=$name url=$url" + return 0 + elif docker exec "$name" curl -sf --max-time 5 "$url" >/dev/null 2>&1; then + _log "msg='endpoint healthy (curl)' container=$name url=$url" return 0 else - _log "level=ERROR msg='endpoint unhealthy' container=$name url=$url status=$status" + _log "level=ERROR msg='endpoint unhealthy' container=$name url=$url" return 1 fi } @@ -309,7 +314,9 @@ done # Query the Prometheus API to verify targets are UP. # --------------------------------------------------------------------------- _log "msg='validating prometheus scraping targets'" -PROM_TARGETS=$(curl -s "http://prometheus:9090/api/v1/targets" 2>/dev/null || echo "") +# Use docker exec to query the Prometheus API from inside the container. +# The prometheus container name is only resolvable within api_network, not from the host. 
+PROM_TARGETS=$(docker exec prometheus wget -qO- "http://localhost:9090/api/v1/targets" 2>/dev/null || echo "") if [ -z "$PROM_TARGETS" ]; then _log "level=WARN msg='prometheus API query failed — cannot validate scraping (proceeding with caution)'" diff --git a/scripts/vps-readiness-check.sh b/scripts/vps-readiness-check.sh new file mode 100644 index 0000000..cbf3750 --- /dev/null +++ b/scripts/vps-readiness-check.sh @@ -0,0 +1,266 @@ +#!/bin/bash +# ============================================================================ +# FieldTrack API — VPS Readiness Check +# ============================================================================ +# +# Validates VPS state before a blue-green deployment is allowed to proceed. +# Invoked by the vps-readiness-check job in deploy.yml via SSH. +# +# SAFE AUTO-FIXES (non-destructive): +# - Creates api_network if missing +# - Creates missing deploy-time directories +# - Auto-prunes docker images if disk is low +# +# HARD FAILURES (exit 1): +# - Docker daemon not running +# - Ports 80 or 443 occupied by ANY non-docker-proxy, non-nginx process +# - Any container has host port bindings (violates production architecture) +# - Required containers not attached to api_network +# - Required .env file missing +# - DEPLOY_ROOT does not exist +# +# USAGE: +# Called automatically by deploy.yml. 
+# Can be run manually: bash scripts/vps-readiness-check.sh
+#
+# EXIT CODES:
+#   0 — VPS is ready (all checks passed, auto-fixes applied as needed)
+#   1 — VPS is NOT ready (hard failure, deployment must not proceed)
+#
+# ============================================================================
+
+set -euo pipefail
+
+DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
+NETWORK="api_network"
+RUNTIME_DIR="/var/run/api"
+LOG_DIR="/var/log/api"
+
+# ── Colour helpers ─────────────────────────────────────────────────────────────
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m'
+ok()   { echo -e "${GREEN}[✓]${NC} $1"; }
+warn() { echo -e "${YELLOW}[!]${NC} $1"; }
+fail() { echo -e "${RED}[✗]${NC} $1"; exit 1; }
+
+FAILURES=0
+record_failure() { echo -e "${RED}[FAIL]${NC} $1"; FAILURES=$((FAILURES + 1)); }
+
+echo ""
+echo "============================================="
+echo "  VPS Readiness Check"
+echo "  $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
+echo "============================================="
+echo ""
+
+# ── CHECK 1: DEPLOY_ROOT exists ────────────────────────────────────────────────
+echo "--- CHECK 1: Deploy root directory ---"
+if [ ! -d "$DEPLOY_ROOT" ]; then
+  fail "DEPLOY_ROOT not found: $DEPLOY_ROOT — VPS may not be provisioned. Run vps-setup.sh first."
+fi
+ok "DEPLOY_ROOT exists: $DEPLOY_ROOT"
+
+# ── CHECK 2: Docker daemon running ─────────────────────────────────────────────
+echo ""
+echo "--- CHECK 2: Docker daemon ---"
+if ! docker info >/dev/null 2>&1; then
+  record_failure "Docker daemon is not running."
+  echo "  Attempting to start Docker..."
+  if sudo systemctl start docker 2>/dev/null && sleep 3 && docker info >/dev/null 2>&1; then
+    ok "Docker started successfully."
+  else
+    fail "Docker daemon could not be started. VPS is not ready."
+  fi
+else
+  ok "Docker daemon is running."
+fi
+
+# ── CHECK 3: api_network exists (auto-fix: create if missing) ──────────────────
+echo ""
+echo "--- CHECK 3: Docker network '$NETWORK' ---"
+if ! docker network ls --format '{{.Name}}' | grep -Eq "^${NETWORK}$"; then
+  warn "Network '$NETWORK' not found — creating it."
+  docker network create --driver bridge "$NETWORK"
+  ok "Network '$NETWORK' created."
+else
+  ok "Network '$NETWORK' exists."
+fi
+
+# ── CHECK 4: Ports 80 and 443 — no non-docker processes ──────────────────────
+#
+# Design: we do NOT auto-kill unknown processes. If port 80 or 443 is held by
+# a non-docker process (e.g., system nginx, apache, lighttpd), that is a VPS
+# configuration error that requires operator action. Silently killing unknown
+# processes risks breaking the system in unpredictable ways.
+#
+# Allowed occupants (hard-coded safe list):
+#   - docker-proxy (managed by Docker / our nginx container)
+#   - nginx (running as Docker container — docker exec nginx)
+#
+# Everything else → hard fail with diagnostics.
+echo ""
+echo "--- CHECK 4: Port 80/443 — no non-docker processes ---"
+_check_port() {
+  local port="$1"
+
+  # Check if anything is listening on the port at all
+  if ! ss -tlnp "sport = :${port}" 2>/dev/null | grep -q 'LISTEN'; then
+    ok "Port $port is free."
+    return 0
+  fi
+
+  # Check for non-docker-proxy, non-nginx processes via lsof
+  # lsof -i :PORT lists ALL processes holding the port.
+  # NOTE: lsof truncates COMMAND to 9 chars by default — "docker-proxy" prints as "docker-pr".
+  NON_DOCKER=$(sudo lsof -i ":${port}" -sTCP:LISTEN -P -n 2>/dev/null \
+    | awk 'NR>1 {print $1, $2}' \
+    | grep -vE '^(docker-pr|nginx)' || true)
+
+  if [ -n "$NON_DOCKER" ]; then
+    record_failure "Port $port is occupied by a non-docker process."
+    echo "  Offending process(es):"
+    sudo lsof -i ":${port}" -sTCP:LISTEN -P -n 2>/dev/null | awk 'NR>1' | sed 's/^/    /'
+    echo "  This is a VPS configuration error. Stop the conflicting service before deploying."
+    echo "  Example: sudo systemctl stop nginx OR sudo systemctl stop apache2"
+    return 1
+  fi
+
+  ok "Port $port is held by docker-proxy/nginx (expected)."
+  return 0
+}
+
+_check_port 80 || true   # '|| true': under set -e a failed check must not abort the remaining checks
+_check_port 443 || true  # the failure is already counted via record_failure inside _check_port
+
+# ── CHECK 5: No host port bindings on ANY container ────────────────────────────
+#
+# Production architecture invariant: NO container may bind host ports.
+# All inter-service communication uses Docker DNS on api_network.
+# A host port binding on any container indicates a misconfigured container
+# that could expose services unintentionally or break Docker DNS routing.
+echo ""
+echo "--- CHECK 5: Global host port binding invariant ---"
+BOUND=$(docker ps --format '{{.Names}} {{.Ports}}' 2>/dev/null \
+  | grep -E '(0\.0\.0\.0:|127\.0\.0\.1:)[0-9]+->' || true)
+
+if [ -n "$BOUND" ]; then
+  record_failure "Host port bindings detected — violates production architecture:"
+  echo "$BOUND" | sed 's/^/    /'
+  echo "  Production pattern: all containers run --network api_network without -p."
+  echo "  Remove and recreate the offending container(s) without port bindings."
+else
+  ok "No host port bindings on any running container."
+fi
+
+# ── CHECK 6: Required env files ────────────────────────────────────────────────
+echo ""
+echo "--- CHECK 6: Required environment files ---"
+cd "$DEPLOY_ROOT"
+
+REQUIRED_ENV_FILES=(
+  ".env"
+)
+
+for f in "${REQUIRED_ENV_FILES[@]}"; do
+  if [ ! -f "$DEPLOY_ROOT/$f" ]; then
+    record_failure "Required env file missing: $DEPLOY_ROOT/$f"
+    echo "  This file must be created on the VPS before deployment."
+    echo "  See docs/env-contract.md for required variables."
+  else
+    ok "Env file present: $f"
+  fi
+done
+
+# .env.monitoring is optional (monitoring-sync.sh self-heals from example)
+if [ ! -f "$DEPLOY_ROOT/.env.monitoring" ]; then
+  warn ".env.monitoring not found — monitoring-sync.sh will create it from example during deploy."
+fi
+
+# ── CHECK 7: Runtime state directories ─────────────────────────────────────────
+echo ""
+echo "--- CHECK 7: Runtime directories ---"
+
+for dir in "$RUNTIME_DIR" "$LOG_DIR"; do
+  if [ ! -d "$dir" ]; then
+    warn "Runtime directory missing: $dir — creating it."
+    install -d -m 750 "$dir" 2>/dev/null || sudo install -d -m 750 "$dir"
+    ok "Created: $dir"
+  else
+    ok "Directory exists: $dir"
+  fi
+done
+
+# ── CHECK 8: Nginx live config directory ───────────────────────────────────────
+echo ""
+echo "--- CHECK 8: Nginx live config directory ---"
+NGINX_LIVE_DIR="$DEPLOY_ROOT/infra/nginx/live"
+NGINX_BACKUP_DIR="$DEPLOY_ROOT/infra/nginx/backup"
+
+for dir in "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR"; do
+  if [ ! -d "$dir" ]; then
+    warn "Nginx directory missing: $dir — creating it."
+    mkdir -p "$dir"
+    ok "Created: $dir"
+  else
+    ok "Directory exists: $dir"
+  fi
+done
+
+# ── CHECK 9: Network attachment for expected containers ────────────────────────
+#
+# If nginx, prometheus, grafana, or alertmanager are running, they MUST be
+# attached to api_network. If they're not, Docker DNS resolution will fail
+# and api-blue/api-green will be unreachable by name.
+echo ""
+echo "--- CHECK 9: Network attachment enforcement ---"
+NETWORK_REQUIRED=(nginx prometheus grafana alertmanager)
+for c in "${NETWORK_REQUIRED[@]}"; do
+  if docker inspect "$c" >/dev/null 2>&1; then
+    if ! docker inspect "$c" --format '{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' \
+        2>/dev/null | grep -q 'api_network'; then
+      record_failure "Container '$c' is running but NOT attached to api_network."
+      echo "  Docker DNS (container name resolution) requires api_network attachment."
+      echo "  Fix: docker network connect api_network $c"
+    else
+      ok "$c is attached to api_network."
+    fi
+  else
+    ok "$c not running — skipping network check."
+  fi
+done
+
+# ── CHECK 10: Disk space (warn if < 2GB free) ──────────────────────────────────
+echo ""
+echo "--- CHECK 10: Disk space ---"
+FREE_KB=$(df -k / | awk 'NR==2 {print $4}')
+FREE_GB=$(awk "BEGIN {printf \"%.1f\", $FREE_KB/1024/1024}")
+if [ "$FREE_KB" -lt 2097152 ]; then
+  warn "Low disk space: ${FREE_GB}GB free (< 2GB). Pruning unused Docker images."
+  docker image prune -f --filter "until=48h" >/dev/null 2>&1 || true
+  FREE_KB_AFTER=$(df -k / | awk 'NR==2 {print $4}')
+  FREE_GB_AFTER=$(awk "BEGIN {printf \"%.1f\", $FREE_KB_AFTER/1024/1024}")
+  ok "After prune: ${FREE_GB_AFTER}GB free."
+  if [ "$FREE_KB_AFTER" -lt 1048576 ]; then
+    record_failure "Critically low disk space: ${FREE_GB_AFTER}GB free after prune. Cannot deploy safely."
+  fi
+else
+  ok "Disk space OK: ${FREE_GB}GB free."
+fi
+
+# ── FINAL RESULT ───────────────────────────────────────────────────────────────
+echo ""
+echo "============================================="
+if [ "$FAILURES" -eq 0 ]; then
+  echo -e "${GREEN}  VPS READY — all checks passed${NC}"
+  echo "============================================="
+  echo ""
+  exit 0
+else
+  echo -e "${RED}  VPS NOT READY — $FAILURES check(s) failed${NC}"
+  echo "  Deployment must not proceed."
+  echo "============================================="
+  echo ""
+  exit 1
+fi
diff --git a/scripts/vps-setup.sh b/scripts/vps-setup.sh
index 091121f..1df411a 100644
--- a/scripts/vps-setup.sh
+++ b/scripts/vps-setup.sh
@@ -393,41 +393,99 @@ fi
 # ============================================================================
 log "Phase 14: Starting monitoring stack..."
 
-# Stop system nginx — Docker nginx in the monitoring stack takes over ports 80/443.
-# System nginx is no longer needed after cert acquisition; Docker nginx handles
-# ACME challenge renewal via /var/www/certbot mount.
+# Stop system nginx — Docker nginx takes over ports 80/443 from this point.
+# System nginx is no longer needed after cert acquisition; the Docker nginx
+# container handles ACME challenge renewal via the /var/www/certbot mount.
 log "Phase 14a: Stopping system nginx (Docker nginx takes over)..."
 systemctl stop nginx || true
 systemctl disable nginx || true
 log "System nginx stopped and disabled."
 
+# Kill any docker-proxy ghost processes that may be holding host ports 80/443
+# from a previous failed start. pkill is a safe no-op if no process matches.
+pkill docker-proxy 2>/dev/null || true
+
+# Ensure api_network exists before starting compose (idempotent).
+# The compose file declares it as external; Docker will NOT create it automatically.
+if ! docker network ls --format '{{.Name}}' | grep -Eq "^${NETWORK}$"; then
+  docker network create --driver bridge "$NETWORK"
+  log "Docker network '$NETWORK' created before compose."
+else
+  log "Docker network '$NETWORK' already exists."
+fi
+
+# Ensure nginx live config dir and initial config exist before starting nginx,
+# so the container can mount the directory even before the first deploy runs.
+mkdir -p "$NGINX_LIVE_DIR"
+if [ ! -f "$NGINX_SITE_LINK" ]; then
+  sed \
+    -e "s|__ACTIVE_CONTAINER__|api-blue|g" \
+    -e "s|__API_HOSTNAME__|$DOMAIN|g" \
+    "$REPO_DIR/infra/nginx/api.conf" > "$NGINX_SITE_LINK"
+  log "Bootstrap nginx config written (pointing to api-blue) at $NGINX_SITE_LINK"
+fi
+
+# Start nginx FIRST using --no-deps to avoid being blocked by the
+# grafana → prometheus → alertmanager health-check dependency chain.
+# nginx uses deferred Docker DNS resolution so it starts cleanly without
+# needing any backend container to be up.
+log "Phase 14b: Starting Docker nginx (without dependency wait)..."
 cd "$REPO_DIR/infra"
+sudo -u "$DEPLOY_USER" docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml \
+  up -d --no-deps nginx
+log "Docker nginx container started."
+
+# Now start the rest of the monitoring stack (prometheus, alertmanager, grafana, etc.).
+log "Phase 14c: Starting full monitoring stack..."
 sudo -u "$DEPLOY_USER" docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml up -d
+cd "$REPO_DIR"
 
-log "Monitoring stack started (Prometheus, Grafana, Node Exporter, Nginx)"
+log "Monitoring stack started (Prometheus, Alertmanager, Grafana, Loki, Promtail, Node Exporter, Nginx)"
 
 # ============================================================================
-# PHASE 15: First Deployment
+# PHASE 15: First Deployment (Bootstrap)
 # ============================================================================
-log "Phase 15: Pulling and starting initial backend container..."
+log "Phase 15: Starting bootstrap API container..."
+#
+# IMPORTANT: This phase uses :latest for the initial bootstrap ONLY.
+# :latest is the only available tag before any CI deploy has run.
+# After this script completes, every subsequent deploy uses a SHA-pinned
+# image (ghcr.io/fieldtrack-tech/api:<7-char-sha>) via deploy-bluegreen.sh.
+# Immutability is enforced from the first CI push onwards.
+#
+# NO HOST PORT BINDINGS — api-blue connects solely via api_network.
+# nginx routes to it via Docker DNS: server api-blue:3000.
 
-# Pull the latest image
 sudo -u "$DEPLOY_USER" docker pull ghcr.io/fieldtrack-tech/api:latest
 
-# Start the blue container as initial deployment
 if [ -f "$ENV_FILE" ] && grep -q "SUPABASE_URL=your-" "$ENV_FILE"; then
   warn "Skipping container start — .env still has placeholder values."
-  warn "After editing .env, run:"
-  warn "    cd $REPO_DIR && ./scripts/deploy-bluegreen.sh latest"
+  warn "After editing .env, push to master and let CI deploy, or run:"
+  warn "    cd $REPO_DIR && ./scripts/deploy-bluegreen.sh <sha>"
 else
+  # Remove a stale api-blue if it exists from a previous aborted attempt
+  if docker ps -a --format '{{.Names}}' | grep -Eq '^api-blue$'; then
+    docker stop --time 5 api-blue 2>/dev/null || true
+    docker rm api-blue 2>/dev/null || true
+    log "Removed stale api-blue container."
+  fi
+
+  # Start api-blue on api_network — NO -p / no host port binding.
   sudo -u "$DEPLOY_USER" docker run -d \
     --name api-blue \
     --network "$NETWORK" \
     --restart unless-stopped \
+    --label "api.slot=blue" \
+    --label "api.sha=latest-bootstrap" \
     --env-file "$ENV_FILE" \
     ghcr.io/fieldtrack-tech/api:latest
-  log "Backend container (api-blue) started (network: $NETWORK)."
+  log "Bootstrap container api-blue started (network: $NETWORK, no host ports)."
+
+  # Write the active-slot file so deploy-bluegreen.sh recovery finds it.
+  install -d -m 750 -o "$DEPLOY_USER" -g "$DEPLOY_USER" /var/run/api 2>/dev/null || true
+  sudo -u "$DEPLOY_USER" sh -c 'echo blue > /var/run/api/active-slot'  # owned by deploy user so later unprivileged deploys can rewrite it
+  log "Active slot file written: blue"
 fi
 
 # ============================================================================