From e9905ca6995dc7d63d4588a8ce0a9d0303a65b76 Mon Sep 17 00:00:00 2001 From: rajashish147 Date: Sat, 4 Apr 2026 03:24:35 +0530 Subject: [PATCH 1/8] Refactor code structure for improved readability and maintainability --- .github/pull_request_template.md | 2 +- .github/workflows/deploy.yml | 325 ++-- .github/workflows/pr.yml | 54 +- .gitignore | 11 - CHANGELOG.md | 20 +- CONTRIBUTING.md | 1 - README.md | 223 ++- docs/ARCHITECTURE.md | 51 +- docs/DEPLOYMENT.md | 143 +- docs/OBSERVABILITY_ARCHITECTURE.md | 309 +--- docs/ROLLBACK_QUICKREF.md | 43 +- docs/ROLLBACK_SYSTEM.md | 38 +- docs/SLO.md | 8 +- docs/env-contract.md | 28 +- docs/infra-contract.md | 20 + docs/walkthrough.md | 116 +- infra/.env.monitoring.example | 61 - infra/alertmanager/alertmanager.yml | 65 - infra/blackbox/blackbox.yml | 21 - infra/docker-compose.monitoring.yml | 264 --- infra/docker-compose.nginx.yml | 44 + infra/docker-compose.redis.yml | 38 + infra/grafana/dashboards/fieldtrack.json | 680 -------- .../provisioning/dashboards/dashboard.yml | 15 - .../provisioning/datasources/prometheus.yml | 13 - infra/loki/loki-config.yaml | 47 - infra/nginx/api.conf | 274 --- infra/prometheus/alerts.yml | 559 ------ infra/prometheus/prometheus.yml | 101 -- infra/promtail/promtail.yml | 62 - infra/scripts/render-alertmanager.sh | 133 -- infra/scripts/verify-alertmanager.sh | 192 -- infra/tempo/tempo.yml | 43 - package.json | 3 +- scripts/analytics-backfill.ts | 242 --- scripts/deploy-bluegreen.sh | 1539 ----------------- scripts/deploy.sh | 1217 +++++++++++++ scripts/load-env.sh | 97 -- scripts/load-testing/README.md | 127 -- scripts/load-testing/dashboard-load-test.js | 124 -- scripts/load-testing/expenses-load-test.js | 134 -- scripts/load-testing/map-load-test.js | 92 - scripts/load-testing/queue-impact-test.js | 146 -- scripts/monitoring-sync.sh | 344 ---- scripts/rollback.sh | 114 -- scripts/smoke-test.sh | 445 ----- scripts/validate-env.sh | 289 ---- scripts/verify-stabilization.sh | 372 +--- 
scripts/vps-readiness-check.sh | 35 +- scripts/vps-setup.sh | 528 ------ src/config/env.ts | 7 +- src/routes/events.routes.ts | 2 +- src/routes/health.ts | 20 +- src/server.ts | 54 +- src/tracing.ts | 9 +- tests/setup/env-setup.ts | 3 + vitest.config.ts | 4 + 57 files changed, 1826 insertions(+), 8125 deletions(-) create mode 100644 docs/infra-contract.md delete mode 100644 infra/.env.monitoring.example delete mode 100644 infra/alertmanager/alertmanager.yml delete mode 100644 infra/blackbox/blackbox.yml delete mode 100644 infra/docker-compose.monitoring.yml create mode 100644 infra/docker-compose.nginx.yml create mode 100644 infra/docker-compose.redis.yml delete mode 100644 infra/grafana/dashboards/fieldtrack.json delete mode 100644 infra/grafana/provisioning/dashboards/dashboard.yml delete mode 100644 infra/grafana/provisioning/datasources/prometheus.yml delete mode 100644 infra/loki/loki-config.yaml delete mode 100644 infra/nginx/api.conf delete mode 100644 infra/prometheus/alerts.yml delete mode 100644 infra/prometheus/prometheus.yml delete mode 100644 infra/promtail/promtail.yml delete mode 100644 infra/scripts/render-alertmanager.sh delete mode 100644 infra/scripts/verify-alertmanager.sh delete mode 100644 infra/tempo/tempo.yml delete mode 100644 scripts/analytics-backfill.ts delete mode 100644 scripts/deploy-bluegreen.sh create mode 100644 scripts/deploy.sh delete mode 100644 scripts/load-env.sh delete mode 100644 scripts/load-testing/README.md delete mode 100644 scripts/load-testing/dashboard-load-test.js delete mode 100644 scripts/load-testing/expenses-load-test.js delete mode 100644 scripts/load-testing/map-load-test.js delete mode 100644 scripts/load-testing/queue-impact-test.js delete mode 100644 scripts/monitoring-sync.sh delete mode 100644 scripts/rollback.sh delete mode 100644 scripts/smoke-test.sh delete mode 100644 scripts/validate-env.sh delete mode 100644 scripts/vps-setup.sh diff --git a/.github/pull_request_template.md 
b/.github/pull_request_template.md index 93109a4..d599dfe 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -52,7 +52,7 @@ Closes # ## Final Checklist - [ ] PR title follows conventional commit format (`type(scope): description`) -- [ ] Branch name follows convention (`feat/*`, `fix/*`, `infra/*`, etc.) +- [ ] Branch name follows convention (`feat/*`, `fix/*`, `docs/*`, etc.) - [ ] No debug logs, commented-out code, or `TODO` / `FIXME` left in diff - [ ] No secrets or credentials committed - [ ] Relevant documentation updated (if applicable) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 66a7dd5..66c54ea 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -20,9 +20,9 @@ # ┘ │ # api-health-gate ◄────────┘ # │ -# sync-infra ─► sync-monitoring ─► health-and-smoke -# │ -# rollback ◄──────────────────────────────┘ (on failure) +# health-and-smoke +# │ +# rollback ◄───────────────── (on failure) name: Deploy to Production @@ -211,6 +211,69 @@ jobs: - name: Run all tests run: npm test + # --------------------------------------------------------------------------- + # JOB: infra-leakage-guard + # + # Pre-deploy safety gate: ensures the API repo has not re-introduced + # references to infra concerns (monitoring stack, /ready in deploy path). + # Runs in parallel with validate and test-api. + # + # Guards: + # 1. No alertmanager/docker-compose.monitoring client code in src/ or tests/ + # 2. No docker-compose.monitoring references in deploy.sh or deploy.yml executable steps + # 3. 
No /ready usage in scripts/deploy.sh (health gate must use /health only) + # --------------------------------------------------------------------------- + infra-leakage-guard: + name: Infra Leakage Guard + runs-on: ubuntu-latest + needs: [codeql-gate] + timeout-minutes: 5 + steps: + - name: Checkout + uses: actions/checkout@v5 + with: + ref: ${{ needs.codeql-gate.outputs.deploy_sha }} + + - name: Block monitoring infra client references in API source + run: | + # The API legitimately uses prom-client (prometheus.ts plugin) and emits + # OTLP traces. What must NOT appear is external infra client code — + # i.e., direct references to alertmanager, loki push clients, or + # docker-compose.monitoring in the application source. + # Exclude comment-only lines (-h suppresses filenames for grep -Ev). + LEAKS=$(grep -rhE "(alertmanager|docker-compose\.monitoring)" src/ tests/ 2>/dev/null \ + | grep -Ev '^\s*(//|#|\*|/\*)') + if [ -n "$LEAKS" ]; then + echo "::error::Infra client references found in src/ or tests/" + echo "$LEAKS" + exit 1 + fi + echo "✓ No alertmanager/monitoring-compose references in src/ or tests/" + + - name: Block docker-compose.monitoring references in deploy path + run: | + # deploy.sh is the only script in the deploy path. + if grep -E "docker-compose\.monitoring" scripts/deploy.sh 2>/dev/null | grep -Ev '^\s*#'; then + echo "::error::deploy.sh references docker-compose.monitoring — deploy must be monitoring-independent" + exit 1 + fi + # Verify deploy.yml does not execute monitoring compose commands. + # Guard comments are allowed; executable command lines are not. 
+ if grep -E "docker-compose\.monitoring|docker compose.*monitoring" .github/workflows/deploy.yml \ + | grep -Ev '(infra-leakage-guard|Block docker|No docker)'; then + echo "::error::deploy.yml workflow references docker-compose.monitoring outside guard comments" + exit 1 + fi + echo "✓ No docker-compose.monitoring references in deploy path" + + - name: Block /ready in deploy path (deploy.sh) + run: | + if grep -E "(/ready)" scripts/deploy.sh | grep -Ev '^\s*#'; then + echo "::error::deploy.sh references /ready — deploy gate must use /health only" + exit 1 + fi + echo "✓ deploy.sh does not reference /ready" + # --------------------------------------------------------------------------- # JOB: build-scan-push # @@ -231,7 +294,7 @@ jobs: build-scan-push: name: Build, Scan & Push Docker Image runs-on: ubuntu-latest - needs: [codeql-gate, validate, test-api] + needs: [codeql-gate, validate, test-api, infra-leakage-guard] timeout-minutes: 25 permissions: contents: read @@ -607,7 +670,7 @@ jobs: script: | set -euo pipefail export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT — run vps-setup.sh first"; exit 1; } + [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" # Pull latest scripts without full deploy git fetch origin master --depth=1 @@ -619,7 +682,7 @@ jobs: # JOB: deploy # # Blue-Green deployment to VPS via SSH. - # The deploy-bluegreen.sh script manages slot switching and container health. + # deploy.sh manages slot switching and container health. 
# # DEPENDENCY GATES (both must pass): # - vps-readiness-check: ensures VPS can accept the deployment @@ -661,27 +724,6 @@ jobs: } >> "$GITHUB_STEP_SUMMARY" echo "[DEPLOY] Deployment initiated — SHA=${{ github.sha }} EVENT=${{ github.event_name }} ACTOR=${{ github.actor }}" - - name: Validate environment contract before deploy - uses: appleboy/ssh-action@v1.0.3 - with: - host: ${{ secrets.DO_HOST }} - username: ${{ secrets.DO_USER }} - key: ${{ secrets.DO_SSH_KEY }} - script: | - set -euo pipefail - export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } - cd "$DEPLOY_ROOT" - # Pin repo to the exact SHA that was built and scanned by CodeQL. - # Prevents stale scripts from running if concurrent commits landed. - git fetch origin - git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }} - chmod +x scripts/*.sh - echo "::group::Environment validation" - ./scripts/validate-env.sh --check-monitoring - echo "::endgroup::" - echo "[DEPLOY] Environment contract validated" - - name: Blue-Green deploy via SSH uses: appleboy/ssh-action@v1.0.3 with: @@ -699,8 +741,7 @@ jobs: git fetch origin git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }} chmod +x scripts/*.sh - # Environment already validated in previous step - ./scripts/deploy-bluegreen.sh "${{ needs.build-scan-push.outputs.sha_short }}" + ./scripts/deploy.sh "${{ needs.build-scan-push.outputs.sha_short }}" echo "[DEPLOY] Deploy completed in $(($(date +%s) - T0))s" - name: Log deployment state (slot + SHA for debugging) @@ -734,9 +775,9 @@ jobs: # --------------------------------------------------------------------------- # JOB: api-health-gate (Step E+) # - # Early API health validation — runs AFTER deploy but BEFORE infra sync. - # Ensures the API container is truly healthy before we sync monitoring/nginx. - # If the API is not healthy at this point, STOP before touching infra. 
+ # Validates the API container is healthy after deploy. + # Ensures /health returns 200 before proceeding to smoke tests. + # If the API is not healthy at this point, rollback is triggered. # --------------------------------------------------------------------------- api-health-gate: name: API Health Gate @@ -744,7 +785,7 @@ jobs: needs: [deploy] timeout-minutes: 5 steps: - - name: Verify API container is healthy before infra sync + - name: Verify API container is healthy after deploy uses: appleboy/ssh-action@v1.0.3 with: host: ${{ secrets.DO_HOST }} @@ -755,7 +796,6 @@ jobs: export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" - source scripts/load-env.sh ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "blue") ACTIVE_CONTAINER="api-$ACTIVE_SLOT" @@ -769,181 +809,53 @@ jobs: for i in $(seq 1 15); do STATUS=$(docker run --rm --network api_network "$FT_CURL_IMG" \ -s -o /dev/null -w "%{http_code}" \ - "http://$ACTIVE_CONTAINER:3000/ready" 2>/dev/null || echo "000") + "http://$ACTIVE_CONTAINER:3000/health" 2>/dev/null || echo "000") if [ "$STATUS" = "200" ]; then - echo "[DEPLOY] API ready (slot=$ACTIVE_SLOT attempt=$i)" + echo "[DEPLOY] API healthy (slot=$ACTIVE_SLOT attempt=$i)" exit 0 fi sleep 2 done - echo "::error::API /ready did not return 200 after 30s" + echo "::error::API /health did not return 200 after 30s" docker logs "$ACTIVE_CONTAINER" --tail 30 >&2 2>/dev/null || true exit 1 - # --------------------------------------------------------------------------- - # JOB: sync-infra - # - # Syncs Nginx config (with slot-aware port substitution). - # Monitoring restarts are handled exclusively by deploy-bluegreen.sh. 
- # --------------------------------------------------------------------------- - sync-infra: - name: Sync Infrastructure (nginx) - runs-on: ubuntu-latest - needs: [api-health-gate] - timeout-minutes: 10 - steps: - - name: Sync infrastructure configs via SSH - uses: appleboy/ssh-action@v1.0.3 - with: - host: ${{ secrets.DO_HOST }} - username: ${{ secrets.DO_USER }} - key: ${{ secrets.DO_SSH_KEY }} - script: | - set -euo pipefail - T0=$(date +%s) - export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } - cd "$DEPLOY_ROOT" - INFRA_DIR="$DEPLOY_ROOT/infra" - NGINX_LIVE="$DEPLOY_ROOT/infra/nginx/live/api.conf" - NGINX_BACKUP_DIR="$DEPLOY_ROOT/infra/nginx/backup" - ACTIVE_SLOT_FILE="/var/run/api/active-slot" - - ACTIVE_SLOT=$(cat "$ACTIVE_SLOT_FILE" 2>/dev/null || echo "blue") - ACTIVE_CONTAINER="api-$ACTIVE_SLOT" - - # Load env from .env — exports DEPLOY_ROOT, API_HOSTNAME, and all - # app variables. DEPLOY_ROOT is already exported above; load-env.sh uses it. 
- source "$DEPLOY_ROOT/scripts/load-env.sh" - - # Ensure live/backup dirs exist - mkdir -p "$(dirname "$NGINX_LIVE")" "$NGINX_BACKUP_DIR" - - echo "::group::Nginx sync (slot=$ACTIVE_SLOT)" - cp "$NGINX_LIVE" "$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" 2>/dev/null || true - NGINX_TMP=$(mktemp /tmp/fieldtrack-nginx.XXXXXX.conf) - sed \ - -e "s|__ACTIVE_CONTAINER__|$ACTIVE_CONTAINER|g" \ - -e "s|__API_HOSTNAME__|$API_HOSTNAME|g" \ - "$INFRA_DIR/nginx/api.conf" > "$NGINX_TMP" - cp "$NGINX_TMP" "$NGINX_LIVE" - rm -f "$NGINX_TMP" - - NGINX_TEST_OUT=$(docker exec nginx nginx -t 2>&1) || { - echo "::error::Nginx config test failed — restoring backup" - printf '%s\n' "$NGINX_TEST_OUT" >&2 - LATEST_BAK=$(ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | head -1 || true) - [ -n "$LATEST_BAK" ] && cp "$LATEST_BAK" "$NGINX_LIVE" - exit 1 - } - docker exec nginx nginx -s reload >/dev/null 2>&1 - echo "[DEPLOY] Nginx reloaded → upstream=$ACTIVE_CONTAINER" - echo "::endgroup::" - - # ROUTING VALIDATION — in-network (source of truth) - sleep 2 - ROUTE_STATUS=$(docker run --rm --network api_network \ - curlimages/curl:8.7.1 -sk -o /dev/null -w "%{http_code}" \ - --max-time 10 https://nginx/health 2>/dev/null || echo "000") - - if [ "$ROUTE_STATUS" = "200" ]; then - echo "[DEPLOY] Nginx routing verified (HTTP $ROUTE_STATUS)" - else - echo "::error::Nginx routing check failed (HTTP $ROUTE_STATUS) — restoring backup" - LATEST_BAK=$(ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | head -1 || true) - [ -n "$LATEST_BAK" ] && cp "$LATEST_BAK" "$NGINX_LIVE" - docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1 || true - exit 1 - fi - - # HTTPS advisory check (non-blocking) - HTTPS_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \ - --resolve "$API_HOSTNAME:443:127.0.0.1" \ - -H "Host: $API_HOSTNAME" \ - "https://127.0.0.1/health" --insecure 2>/dev/null || echo "000") - [ "$HTTPS_STATUS" != "200" ] && \ - echo "[DEPLOY] HTTPS advisory 
status=$HTTPS_STATUS (host→Docker TCP routing; in-network check is authoritative)" - - echo "[DEPLOY] Infra sync completed in $(($(date +%s) - T0))s" - - # --------------------------------------------------------------------------- - # JOB: sync-monitoring (Step F) - # - # Idempotent monitoring stack sync — runs after every deploy. - # Delegates to scripts/monitoring-sync.sh which: - # - Self-heals missing .env.monitoring from example - # - Creates api_network if absent - # - Renders alertmanager.rendered.yml - # - Runs docker compose up -d - # - Validates prometheus / alertmanager / grafana health - # Monitoring is REQUIRED — deploy fails if any required container is unhealthy. - # --------------------------------------------------------------------------- - sync-monitoring: - name: Sync Monitoring Stack - runs-on: ubuntu-latest - needs: [sync-infra] - timeout-minutes: 15 - steps: - - name: Sync and validate monitoring stack via SSH - uses: appleboy/ssh-action@v1.0.3 - with: - host: ${{ secrets.DO_HOST }} - username: ${{ secrets.DO_USER }} - key: ${{ secrets.DO_SSH_KEY }} - script: | - set -euo pipefail - export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } - cd "$DEPLOY_ROOT" - chmod +x scripts/monitoring-sync.sh - ./scripts/monitoring-sync.sh - - - name: Monitoring sync summary - if: always() - run: | - { - echo "## Monitoring Sync" - echo "| Container | Required |" - echo "|---|---|" - echo "| prometheus | ✅ |" - echo "| alertmanager | ✅ |" - echo "| grafana | ✅ |" - } >> "$GITHUB_STEP_SUMMARY" - - - name: Deployment artifact traceability - if: always() - run: | - { - echo "## Deployment Artifacts" - echo "| Field | Value |" - echo "|---|---|" - echo "| Deployment SHA | \`${{ github.sha }}\` |" - echo "| Image Tag | \`fieldtrack-api:${{ needs.get-metadata.outputs.sha_short || github.sha }}\` |" - echo "| Workflow Run | [\#${{ github.run_number }}](${{ github.server_url }}/${{ 
github.repository }}/actions/runs/${{ github.run_id }}) |" - echo "| Triggered By | \`${{ github.event_name }}\` |" - echo "| Commit Message | \`${{ github.event.head_commit.message }}\` |" - } >> "$GITHUB_STEP_SUMMARY" - - # Also output to logs for audit trail - echo "DEPLOYMENT_COMPLETE: SHA=${{ github.sha }} IMAGE=ghcr.io/${{ github.repository_owner }}/api:${{ github.sha }} RUN=${{ github.run_id }}" - # --------------------------------------------------------------------------- # JOB: health-and-smoke # - # Step 1: Poll /health and /ready until they return 200 (up to 60 s each). - # Step 2: Run the full smoke test suite (login + core API flows). + # Post-deploy health verification and CI coupling guard. # Failure here triggers the rollback job automatically. # --------------------------------------------------------------------------- health-and-smoke: name: Health Checks & Smoke Tests runs-on: ubuntu-latest - needs: [sync-infra, sync-monitoring] + needs: [api-health-gate] timeout-minutes: 15 steps: - name: Checkout uses: actions/checkout@v5 + - name: CI guard — deploy.sh must not reference /ready or monitoring stack + run: | + set -euo pipefail + echo "Checking deploy.sh for forbidden references..." + # Exclude comment lines (starting with optional whitespace then #) + if grep -E "(/ready)" scripts/deploy.sh | grep -Ev '^\s*#'; then + echo "::error::deploy.sh references /ready — deploy gate must only use /health" + exit 1 + fi + if grep -E "(prometheus|grafana|alertmanager|loki)" scripts/deploy.sh | grep -Ev '^\s*#'; then + echo "::error::deploy.sh references monitoring stack — deploy must be monitoring-independent" + exit 1 + fi + echo "Validating no local infra coupling..." + if grep -R "infra/" . 
| grep -v "docs/infra-contract.md"; then + echo "::error::Local infra coupling detected" + exit 1 + fi + echo "✓ CI guards passed: no /ready or monitoring references in deploy.sh" + - name: Wait for /health endpoint (via VPS) uses: appleboy/ssh-action@v1.0.3 with: @@ -955,7 +867,8 @@ jobs: export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" - source scripts/load-env.sh + API_BASE_URL=$(grep -E '^API_BASE_URL=' .env | head -1 | cut -d'=' -f2-) + API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1) for i in $(seq 1 30); do # Phase 1: in-network (source of truth) if docker run --rm --network api_network \ @@ -988,7 +901,8 @@ jobs: export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" - source scripts/load-env.sh + API_BASE_URL=$(grep -E '^API_BASE_URL=' .env | head -1 | cut -d'=' -f2-) + API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1) for i in $(seq 1 10); do # Phase 1: in-network (source of truth) if docker run --rm --network api_network \ @@ -1010,45 +924,24 @@ jobs: echo "::error::Final health check failed after 10 attempts" exit 1 - - name: Run smoke tests - env: - API_BASE_URL: ${{ secrets.API_BASE_URL }} - FT_EMP_EMAIL: ${{ secrets.FT_EMP_EMAIL }} - FT_EMP_PASSWORD: ${{ secrets.FT_EMP_PASSWORD }} - FT_ADMIN_EMAIL: ${{ secrets.FT_ADMIN_EMAIL }} - FT_ADMIN_PASSWORD: ${{ secrets.FT_ADMIN_PASSWORD }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} - SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY }} - run: | - chmod +x scripts/smoke-test.sh - ./scripts/smoke-test.sh - - - name: Upload smoke test report - if: always() - uses: actions/upload-artifact@v4 - with: - name: smoke-test-report-${{ github.sha }} - path: smoke-report.json - retention-days: 30 - - name: Deployment summary run: | echo "[DEPLOY] Production 
deployment complete" echo " Commit : ${{ github.sha }}" echo " Health : OK" - echo " Smoke : passed" + echo " Post-deploy checks : passed" # --------------------------------------------------------------------------- # JOB: rollback # - # Triggered automatically when deploy, sync-infra, OR health-and-smoke fails. + # Triggered automatically when deploy or health-and-smoke fails. # Restores the previously healthy Blue-Green slot via the rollback script. # 'if: always()' ensures this job can evaluate even if upstream jobs failed. # --------------------------------------------------------------------------- rollback: name: Rollback Deployment (auto) runs-on: ubuntu-latest - needs: [vps-readiness-check, deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke] + needs: [vps-readiness-check, deploy, api-health-gate, health-and-smoke] timeout-minutes: 10 if: | always() && @@ -1056,8 +949,6 @@ jobs: needs.vps-readiness-check.result == 'failure' || needs.deploy.result == 'failure' || needs.api-health-gate.result == 'failure' || - needs.sync-infra.result == 'failure' || - needs.sync-monitoring.result == 'failure' || needs.health-and-smoke.result == 'failure' ) steps: @@ -1067,8 +958,6 @@ jobs: [ "${{ needs.vps-readiness-check.result }}" = "failure" ] && echo " [ERROR] failed job: vps-readiness-check" || true [ "${{ needs.deploy.result }}" = "failure" ] && echo " [ERROR] failed job: deploy" || true [ "${{ needs.api-health-gate.result }}" = "failure" ] && echo " [ERROR] failed job: api-health-gate" || true - [ "${{ needs.sync-infra.result }}" = "failure" ] && echo " [ERROR] failed job: sync-infra" || true - [ "${{ needs.sync-monitoring.result }}" = "failure" ] && echo " [ERROR] failed job: sync-monitoring" || true [ "${{ needs.health-and-smoke.result }}" = "failure" ] && echo " [ERROR] failed job: health-and-smoke" || true - name: Rollback on VPS @@ -1083,7 +972,7 @@ jobs: [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } 
cd "$DEPLOY_ROOT" chmod +x scripts/*.sh - ./scripts/rollback.sh --auto + ./scripts/deploy.sh --rollback --auto ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "unknown") echo "[DEPLOY] Rollback complete — slot=$ACTIVE_SLOT sha=${{ github.sha }}" diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 89ebacb..cf568a1 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -22,7 +22,6 @@ jobs: timeout-minutes: 5 outputs: api: ${{ steps.filter.outputs.api }} - infra: ${{ steps.filter.outputs.infra }} steps: - uses: actions/checkout@v5 @@ -37,9 +36,6 @@ jobs: - 'package-lock.json' - 'tsconfig.json' - 'vitest.config.ts' - infra: - - 'infra/**' - - '.github/workflows/**' api-ci: name: API CI @@ -229,58 +225,10 @@ jobs: docker network rm ci_api_net docker rmi fieldtrack-api:ci-validation - infra-ci: - name: Infra CI - runs-on: ubuntu-latest - needs: detect-changes - timeout-minutes: 10 - if: always() - steps: - - name: Abort if change detection failed - if: needs.detect-changes.result != 'success' - run: | - echo "❌ Change detection did not succeed (result: ${{ needs.detect-changes.result }}) — cannot safely skip checks" - exit 1 - - - name: Skip if no infra changes - if: needs.detect-changes.outputs.infra != 'true' - run: | - echo "No infra changes — skipping all infra validation" - echo "✓ Infra CI (skipped)" - exit 0 - - - uses: actions/checkout@v5 - if: needs.detect-changes.outputs.infra == 'true' - - - name: Validate nginx config - if: needs.detect-changes.outputs.infra == 'true' - run: | - sed \ - -e 's/__ACTIVE_CONTAINER__/api-blue/g' \ - -e 's/__API_HOSTNAME__/api.test.local/g' \ - infra/nginx/api.conf > /tmp/nginx.conf - - if grep -q '__[A-Z_]*__' /tmp/nginx.conf; then - echo "❌ Unreplaced placeholders" - exit 1 - fi - - mkdir -p /tmp/ssl - openssl req -x509 -nodes -days 1 \ - -newkey rsa:2048 \ - -keyout /tmp/ssl/origin.key \ - -out /tmp/ssl/origin.crt \ - -subj "/CN=localhost" - - docker run --rm \ - -v 
/tmp/nginx.conf:/etc/nginx/conf.d/default.conf:ro \ - -v /tmp/ssl:/etc/ssl/api:ro \ - nginx:1.27-alpine nginx -t - # --------------------------------------------------------------------------- # JOB: codeql-lite # - # Lightweight CodeQL security scan — runs in PARALLEL with api-ci and infra-ci. + # Lightweight CodeQL security scan — runs in PARALLEL with api-ci. # Uses security-extended queries (OWASP Top-10 class) for fast PR feedback. # This job is REQUIRED in branch protection; PRs cannot merge until it passes. # diff --git a/.gitignore b/.gitignore index 789b129..11fb191 100644 --- a/.gitignore +++ b/.gitignore @@ -2,16 +2,6 @@ # .gitignore for FieldTrack API # ============================================ -# ---------------- -# Infrastructure -# ---------------- -# Monitoring data -infra/tempo/data/ -infra/prometheus/data/ -infra/grafana/data/ -# Rendered Alertmanager config (contains real webhook URL — VPS only) -infra/alertmanager/alertmanager.rendered.yml - # Deployment history (VPS-side file, never committed) .deploy_history .last_deploy @@ -44,7 +34,6 @@ packages/*/node_modules/ .env.test.local .env.production.local !.env.example -!.env.monitoring.example # ---------------- # Build Output diff --git a/CHANGELOG.md b/CHANGELOG.md index f5399bd..309b088 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,9 +62,9 @@ All significant changes to FieldTrack 2.0 are documented here by development pha - Every image is tagged with both `latest` and a 7-character SHA ### Rollback System (commits `35db851`, `23e7720`) -- Added `backend/scripts/rollback.sh` — reads `.deploy_history`, validates ≥ 2 deployments, displays history table with current/target markers, prompts for confirmation, redeploys previous image using `deploy-bluegreen.sh` -- Updated `backend/scripts/deploy-bluegreen.sh` to prepend the deployed SHA to `.deploy_history` (rolling window of 5) after every successful deploy -- Added `backend/.gitignore` entry for `.deploy_history` +- Added rollback mode to 
`scripts/deploy.sh` — reads `.deploy_history`, validates ≥ 2 deployments, displays history table with current/target markers, prompts for confirmation, and redeploys the previous image +- Updated `scripts/deploy.sh` to prepend the deployed SHA to `.deploy_history` (rolling window of 5) after every successful deploy +- Added `.gitignore` entry for `.deploy_history` - Added `docs/ROLLBACK_SYSTEM.md` and `docs/ROLLBACK_QUICKREF.md` --- @@ -137,20 +137,14 @@ All significant changes to FieldTrack 2.0 are documented here by development pha - Added `otelMixin` in `src/config/logger.ts` — injects `trace_id`, `span_id`, `trace_flags` into every Pino log line - Added OTel span enrichment in `app.ts` `onRequest` hook — sets `http.route`, `http.client_ip`, `request.id`, `enduser.id` on every request - Upgraded Prometheus histogram to `observeWithExemplar()` with `traceId` on every observation -- Updated `infra/docker-compose.monitoring.yml` — Tempo ports 4317/4318; Prometheus `--enable-feature=exemplar-storage` -- Updated `infra/prometheus/prometheus.yml` — OpenMetrics scrape format for exemplar ingestion +- Updated standalone infra repository monitoring config — Tempo ports 4317/4318; Prometheus `--enable-feature=exemplar-storage` +- Updated standalone infra repository Prometheus config — OpenMetrics scrape format for exemplar ingestion --- ## [Phase 13] — Production Infrastructure: VPS, Nginx & Monitoring Stack — 2026 -- Added `backend/scripts/vps-setup.sh` — idempotent VPS provisioning (Docker, Nginx, systemd, certbot, ufw) -- Added `infra/nginx/api.conf` — TLS termination, HTTP→HTTPS redirect, proxy headers, WebSocket upgrade, gzip -- Added `infra/docker-compose.monitoring.yml` — Prometheus, Grafana, Loki, Promtail, Tempo on `api_network` -- Added `infra/grafana/dashboards/fieldtrack.json` — pre-built dashboard (HTTP rate, latency, queue depth, heap, Redis) -- Added `infra/grafana/provisioning/` — auto-provisioned dashboard and Prometheus datasource -- Added 
`infra/prometheus/alerts.yml` — alert rules for API latency, queue depth, Redis connectivity, host metrics -- Added `infra/promtail/promtail.yml` — Docker log discovery and shipping to Loki +- Added VPS setup and infra assets for production infrastructure (later extracted into standalone infra repository) --- @@ -167,7 +161,7 @@ All significant changes to FieldTrack 2.0 are documented here by development pha ## [Phase 11] — CI/CD Deployment Hardening — 2025 - Added initial GitHub Actions workflow for automated deployment -- Added `backend/scripts/deploy-bluegreen.sh` — blue-green zero-downtime deployment using Docker port-swap and Nginx upstream switch +- Added blue-green zero-downtime deployment script (later unified into `scripts/deploy.sh`) - Health-check validation before traffic switch - Old container removed only after successful switchover diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1c1a8b1..a654b37 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,7 +23,6 @@ cp .env.example .env |---------|---------|---------| | New feature | `feature/` | `feature/expense-attachments` | | Bug fix | `fix/` | `fix/session-double-close` | -| Infrastructure | `infra/` | `infra/add-redis-tls` | | Documentation | `docs/` | `docs/update-api-reference` | | Tests | `test/` | `test/analytics-edge-cases` | | Chores / deps | `chore/` | `chore/bump-fastify-5` | diff --git a/README.md b/README.md index 6a731f6..d0d8650 100644 --- a/README.md +++ b/README.md @@ -11,25 +11,25 @@ ## Overview -FieldTrack 2.0 is a production-ready REST API backend for managing field workforce operations. It provides secure, multi-tenant APIs for tracking employee attendance, real-time GPS location, expense workflows, and aggregate analytics — all with a full observability stack, automated CI/CD, and zero-downtime blue-green deployments. +FieldTrack 2.0 is a production-ready REST API for managing field workforce operations. 
It provides secure, multi-tenant APIs for tracking employee attendance, real-time GPS location, expense workflows, and aggregate analytics. + +**Boundaries:** This repository is the API only. Infrastructure (nginx, monitoring stack, VPS provisioning) lives in the infra repository. --- ## Features -- **Multi-tenant isolation** — every data query is scoped to the authenticated organization; cross-tenant access is architecturally impossible -- **Attendance sessions** — check-in / check-out lifecycle with state machine enforcement (`EmployeeAlreadyCheckedIn`, `SessionAlreadyClosed`) +- **Multi-tenant isolation** — every query is scoped to the authenticated organization; cross-tenant access is architecturally impossible +- **Attendance sessions** — check-in / check-out lifecycle with state machine enforcement - **Real-time GPS ingestion** — single and batch endpoints (up to 100 points), idempotent upsert, per-user rate limiting - **Async distance calculation** — BullMQ background worker computes Haversine distance after check-out; never blocks the HTTP response -- **Expense workflow** — PENDING → APPROVED / REJECTED lifecycle, ADMIN review endpoints, re-review guard -- **Admin analytics** — org-wide summaries, per-user breakdowns, configurable leaderboard (distance / duration / sessions) -- **Redis-backed rate limiting** — per-JWT-sub limits on write endpoints survive corporate NAT and horizontal scaling -- **Security plugins** — Helmet, CORS, Redis rate limiter, brute-force detection with Prometheus counters -- **Distributed tracing** — OpenTelemetry → Tempo; trace IDs injected into every Pino log line -- **One-click metric-to-trace** — Prometheus exemplars link latency spikes directly to Tempo traces in Grafana -- **Blue-green zero-downtime deployments** — Nginx upstream swap, health-check gate, 5-SHA rollback history -- **Automated rollback** — `rollback.sh` restores the previous version in under 10 seconds -- **Full test suite** — 124 tests (8 files) with Vitest; 
unit + integration coverage; CI blocks deploy on failure +- **Expense workflow** — PENDING → APPROVED / REJECTED lifecycle, with re-review guard +- **Admin analytics** — org-wide summaries, per-user breakdowns, configurable leaderboard +- **Redis-backed rate limiting** — per-JWT-sub limits survive corporate NAT and horizontal scaling +- **Security** — Helmet, CORS, Redis rate limiter, brute-force detection +- **Distributed tracing** — OpenTelemetry → OTLP; trace IDs injected into every Pino log line +- **Blue-green zero-downtime deployments** — nginx upstream swap, health-check gate, 5-SHA rollback history +- **Full test suite** — Vitest unit + integration coverage; CI blocks deploy on failure --- @@ -37,139 +37,185 @@ FieldTrack 2.0 is a production-ready REST API backend for managing field workfor | Layer | Technology | |-------|------------| -| **Runtime** | Node.js 24 (Alpine) | +| **Runtime** | Node.js 24 (Debian slim / distroless) | | **Language** | TypeScript 5.9 (strict, ESM) | | **Framework** | Fastify 5 | | **Database** | PostgreSQL via [Supabase](https://supabase.com) | | **Auth** | JWT (`@fastify/jwt`) — Supabase-issued tokens | | **Job Queue** | [BullMQ](https://docs.bullmq.io/) + Redis | | **Validation** | [Zod 4](https://zod.dev/) | -| **Observability** | Prometheus · Grafana · Loki · Tempo · Promtail · OpenTelemetry | +| **Tracing** | OpenTelemetry (OTLP export) | | **Security** | `@fastify/helmet` · `@fastify/cors` · `@fastify/rate-limit` · `@fastify/compress` | | **Testing** | [Vitest](https://vitest.dev/) | | **CI/CD** | GitHub Actions → GHCR → Blue-Green VPS Deploy | --- -## Architecture +## Local Development -### System Overview +**Prerequisites:** Node.js ≥ 24, npm, a running Redis instance, a Supabase project -``` -┌─────────────────────────────────────────────────────────────────┐ -│ CLIENT LAYER │ -│ Mobile App → Web Dashboard → Desktop Client │ -└────────────────────────────┬────────────────────────────────────┘ - │ HTTPS / REST API - ▼ 
-┌─────────────────────────────────────────────────────────────────┐ -│ APPLICATION LAYER │ -│ │ -│ Nginx (TLS · Blue-Green Routing) │ -│ │ │ -│ ▼ │ -│ Fastify 5 API Server │ -│ ├─ Auth Middleware (JWT) │ -│ ├─ Security (Helmet · CORS · Rate Limit) │ -│ ├─ Validation (Zod) │ -│ └─ Business Logic │ -└────────────────────────────┬────────────────────────────────────┘ - │ - ┌────────────┼────────────┐ - │ │ │ - ▼ ▼ ▼ -┌──────────────────┐ ┌──────────────┐ ┌──────────────────┐ -│ Supabase │ │ Redis │ │ BullMQ Worker │ -│ PostgreSQL │ │ Job Queue │ │ (Distance Calc) │ -│ (Multi-tenant) │ │ │ │ │ -└──────────────────┘ └──────────────┘ └──────────────────┘ - -┌─────────────────────────────────────────────────────────────────┐ -│ OBSERVABILITY LAYER │ -│ │ -│ Prometheus → Grafana ← Loki ← Tempo │ -│ (Metrics) (Dashboards) (Logs) (Traces) │ -└─────────────────────────────────────────────────────────────────┘ +```bash +# Install dependencies +npm install + +# Configure environment +cp .env.example .env +# Edit .env — fill in SUPABASE_URL, keys, REDIS_URL, and CORS_ORIGIN + +# Start in development mode (hot reload) +npm run dev ``` -**📊 For detailed architecture diagrams, data flows, and deployment topology see [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)** +The API will start on `http://localhost:3000`. --- -## Quick Start +## Environment Variables -**Prerequisites:** Node.js ≥ 24, npm, Redis, a Supabase project +All variables are validated at startup by `src/config/env.ts` (Zod schema, fail-fast). -```bash -# 1. Install dependencies -npm install +### URLs -# 2. 
Configure environment -cp .env.example .env -# Edit .env — fill in Supabase URL, keys, Redis URL, and ALLOWED_ORIGINS +| Variable | Required | Purpose | +|----------|:---:|---------| +| `API_BASE_URL` | ✅ | Canonical public URL of this API (`https://…`, no trailing slash) | +| `APP_BASE_URL` | ✅ | Root URL of the application — used in email footers and redirects | +| `FRONTEND_BASE_URL` | ✅ prod | URL of the web frontend — used to build email links | -# 3. Run in development mode -npm run dev +### Runtime -# 4. Run the test suite -npm run test -``` +| Variable | Required | Default | Purpose | +|----------|:---:|---------|---------| +| `CONFIG_VERSION` | ✅ | `"1"` | Schema version guard — must be `"1"` | +| `APP_ENV` | ✅ | `development` | Application environment — drives all app-level logic | +| `PORT` | ✅ | `3000` | Container listen port | + +### Auth & Data + +| Variable | Required | Purpose | +|----------|:---:|---------| +| `SUPABASE_URL` | ✅ | Supabase project URL | +| `SUPABASE_ANON_KEY` | ✅ | Supabase public/anon key | +| `SUPABASE_SERVICE_ROLE_KEY` | ✅ | Service role key — bypasses RLS, never expose to clients | +| `SUPABASE_JWT_SECRET` | ✅ | JWT signing secret (≥ 32 chars, HS256) | +| `REDIS_URL` | ✅ | Redis connection URL (`redis://` or `rediss://`) | + +### Security + +| Variable | Required in Prod | Default | Purpose | +|----------|:---:|---------|---------| +| `CORS_ORIGIN` | ✅ | `""` | Comma-separated allowed CORS origins. Empty activates localhost fallback in dev | +| `METRICS_SCRAPE_TOKEN` | ✅ | — | Token required to scrape `/metrics`. Unset = open in dev/test | +| `TEMPO_ENDPOINT` | — | `http://tempo:4318` | OTLP HTTP endpoint for trace export | + +> **Observability variables (`METRICS_SCRAPE_TOKEN`, `TEMPO_ENDPOINT`) are optional for standalone operation.** The API starts and handles requests without them. `METRICS_SCRAPE_TOKEN` gates the `/metrics` endpoint (unset = endpoint is open, safe in dev/test). 
`TEMPO_ENDPOINT` controls where traces are exported; if the Tempo collector is unreachable, traces are silently dropped with no impact to request handling. The monitoring stack that scrapes these endpoints is managed in the infra repository.
+
+---
+
+## Scripts
+
+| Command | Purpose |
+|---------|---------|
+| `npm run dev` | Start development server with hot reload |
+| `npm run typecheck` | TypeScript type check (no emit) |
+| `npm test` | Run full test suite (Vitest) |
+| `npm run build` | Compile TypeScript to `dist/` |
+| `npm start` | Start compiled production server |
+| `./scripts/deploy.sh <sha>` | Blue-green deploy a specific image SHA |
+| `./scripts/deploy.sh --rollback` | Interactive rollback to previous SHA |
+| `./scripts/deploy.sh --rollback --auto` | Non-interactive rollback (CI) |
+
+---
+
+## Health Endpoints
+
+| Endpoint | Purpose | Deploy Gate |
+|----------|---------|-------------|
+| `GET /health` | Liveness check — returns `{"status":"ok"}` once the server bootstraps | **YES** — used by deploy.sh and CI |
+| `GET /ready` | Dependency check — verifies Redis and Supabase connectivity | NO — informational only, not a deploy gate |
+
+`/health` returns 200 after server bootstrap regardless of dependency status. `/ready` failing does not block a deployment; a degraded-but-running API is preferred over a stuck deploy.

---

-## Deployment
+## Deployment Overview
+
+> **First-deployment requirement:** The API container joins `api_network`. On a fresh VPS, **nginx** (reverse-proxy) and **Redis** must already be running and attached to that network via the infra repository before the first `deploy.sh` run. Subsequent deploys are fully self-contained.
+
+## Infra Requirement
+
+This API requires an external infra repository.
+
+Expected on server:
+- nginx (connected to `api_network`)
+- Redis (`redis:6379`)
+
+Default path:
+- `INFRA_ROOT=/opt/infra`

-FieldTrack 2.0 deploys automatically via GitHub Actions on every push to `master`.
+Deployments run automatically via GitHub Actions on every push to `master` (after CodeQL scan passes). ``` -Push to master - → test job (npm ci · tsc · vitest) — blocks on failure - → build-and-deploy job (Docker Buildx with GHA cache → GHCR → VPS SSH) +CodeQL deep scan (master) + → validate (typecheck + audit) ──┐ + → test-api ─────────────────────┼──► build-scan-push ──► vps-readiness-check ──► deploy + ┘ │ + api-health-gate ◄────────────────┘ + │ + health-and-smoke ──► rollback (on failure) ``` -### Manual deploy / rollback +**Blue-green strategy:** The VPS always runs two containers (`api-blue`, `api-green`). On each deploy, the inactive slot is updated and nginx is reloaded to point at it. The previous slot is stopped only after the health gate passes. +**nginx is managed by the infra repository.** The API container joins `api_network`; nginx is expected to already be running and configured. + +**Manual deploy:** ```bash -# On the VPS -./scripts/deploy-bluegreen.sh # Deploy a specific image -./scripts/rollback.sh # Restore previous version (~10 s) +./scripts/deploy.sh ``` -See [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md) for full setup instructions including VPS provisioning, Nginx config, and CI/CD secret configuration. +**Rollback:** +```bash +./scripts/deploy.sh --rollback +``` + +See [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md) for full deployment details. 
--- ## Project Structure -> **Note:** The web frontend is maintained in a separate repository: [fieldtrack-tech/web](https://github.com/fieldtrack-tech/web) - ``` api/ ├── src/ # Application source │ ├── modules/ # Domain modules (attendance · locations · expenses · analytics) -│ ├── plugins/ # Fastify plugins (JWT · Prometheus · security stack) +│ ├── plugins/ # Fastify plugins (JWT · metrics · security) │ ├── workers/ # BullMQ distance calculation worker │ ├── middleware/ # Auth + role guard -│ └── utils/ # Shared utilities (errors · response · tenant · metrics) +│ └── utils/ # Shared utilities (errors · response · tenant) ├── tests/ # Vitest unit and integration tests -├── scripts/ # Blue-green deploy + rollback scripts -├── infra/ # Monitoring stack (Prometheus · Grafana · Loki · Tempo) +├── scripts/ # Deploy, rollback, and utility scripts ├── docs/ # Project documentation └── .github/workflows/ # GitHub Actions CI/CD ``` +> The web frontend is in a separate repository: [fieldtrack-tech/web](https://github.com/fieldtrack-tech/web) +> Infrastructure (nginx, monitoring, VPS setup) is in a separate infra repository. 
+ --- ## Documentation | Document | Description | |----------|-------------| -| [Architecture](docs/ARCHITECTURE.md) | System design, component diagrams, data flows, deployment topology, security layers | +| [Architecture](docs/ARCHITECTURE.md) | System design, component diagrams, data flows | | [API Reference](docs/API_REFERENCE.md) | All endpoints, auth requirements, request/response schemas, error codes | | [Deployment Guide](docs/DEPLOYMENT.md) | VPS provisioning, CI/CD setup, blue-green deploy, troubleshooting | | [Rollback System](docs/ROLLBACK_SYSTEM.md) | Rollback architecture, deployment history, safety features | -| [Rollback Quick Reference](docs/ROLLBACK_QUICKREF.md) | Fast operator reference card for deployments | -| [Walkthrough](docs/walkthrough.md) | Phase-by-phase build history and deep-dives | +| [Rollback Quick Reference](docs/ROLLBACK_QUICKREF.md) | Fast operator reference card | +| [Environment Contract](docs/env-contract.md) | All environment variables, naming rules | +| [Infra Contract](docs/infra-contract.md) | External infra responsibilities and path contract (`INFRA_ROOT`) | | [Changelog](CHANGELOG.md) | Full history of every phase | | [Contributing](CONTRIBUTING.md) | Contribution workflow, branching, code conventions | | [Security Policy](SECURITY.md) | How to report vulnerabilities | @@ -184,7 +230,6 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for setup instructions, branch naming con ``` feature/ # new functionality fix/ # bug fixes -infra/ # infrastructure changes docs/ # documentation test/ # test additions chore/ # maintenance / deps @@ -194,12 +239,6 @@ chore/ # maintenance / deps ``` type(scope): short imperative description ``` -Allowed types: `feat` `fix` `refactor` `ci` `infra` `docs` `test` `chore` +Allowed types: `feat` `fix` `refactor` `ci` `docs` `test` `chore` All PRs require review from CODEOWNERS and must pass CI before merge. 
- ---- - -## License - -[MIT](LICENSE) © 2026 FieldTrack diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 61fb68e..d425872 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -99,33 +99,10 @@ │ │ └───────────────────────────────────────────────────────────────────────────┘ -┌─────────────────────────────────────────────────────────────────────────┐ -│ OBSERVABILITY LAYER │ -├─────────────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │ Prometheus │───▶│ Grafana │◀───│ Loki │ │ -│ │ (Metrics) │ │ (Dashboard) │ │ (Logs) │ │ -│ └──────────────┘ └──────────────┘ └──────────────┘ │ -│ ▲ ▲ │ -│ │ │ │ -│ │ ┌──────────────┐ │ │ -│ └────────────│ Tempo │────────────┘ │ -│ │ (Traces) │ │ -│ └──────────────┘ │ -│ ▲ │ -│ │ │ -│ │ OpenTelemetry │ -│ │ │ -└─────────────────────────────┼────────────────────────────────────────────┘ - │ - │ - ┌─────────┴─────────┐ - │ │ - │ Fastify API │ - │ (Instrumented) │ - │ │ - └───────────────────┘ +``` + +> Monitoring stack (Prometheus, Grafana, Loki, Tempo) is managed by the **infra repository**. +> The API exposes `/metrics` and OTLP traces, which the infra repo consumes. 
``` ## Component Details @@ -158,12 +135,9 @@ - Configurable concurrency (`WORKER_CONCURRENCY` env var) - Job retention limits: 1 000 completed, 5 000 failed (prevents Redis memory growth) -### Observability Layer -- **Prometheus**: Metrics collection and alerting -- **Grafana**: Visualization dashboards -- **Loki**: Log aggregation and querying -- **Tempo**: Distributed tracing -- **OpenTelemetry**: Unified instrumentation +### Observability +- The API emits metrics (Prometheus format on `/metrics`), structured logs (Pino/JSON), and traces (OpenTelemetry OTLP) +- Collection, dashboards, and alerting are handled by the **infra repository** ## Data Flow @@ -337,8 +311,8 @@ Fastify API │ Layer 4: Monitoring & Response │ │ ┌──────────────────────────────────────────────────────────────┐ │ │ │ • Abuse detection logging │ │ -│ │ • Prometheus alerting │ │ -│ │ • Distributed tracing │ │ + │ • Alerting (handled by infra repository) │ │ + │ • Distributed tracing (OpenTelemetry OTLP) │ │ │ │ • Error tracking │ │ │ └──────────────────────────────────────────────────────────────┘ │ │ │ @@ -367,10 +341,9 @@ Fastify API - **Compression**: @fastify/compress ### Observability -- **Metrics**: Prometheus + prom-client -- **Logs**: Pino + Loki -- **Traces**: OpenTelemetry 2.x + Tempo -- **Dashboards**: Grafana +- **Metrics**: prom-client (exposed on `/metrics`, scraped by infra repo) +- **Logs**: Pino (structured JSON, collected by infra repo) +- **Traces**: OpenTelemetry 2.x (exported via OTLP to `TEMPO_ENDPOINT`) ### DevOps - **Containerization**: Docker (node:24-alpine) diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index a21d390..76249b2 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -2,6 +2,8 @@ This document covers deploying FieldTrack API to a Linux VPS using the included blue-green deployment system. +> **Scope:** This document covers the API only. Nginx configuration, TLS, and the monitoring stack are managed by the **infra repository**. 
+ --- ## Prerequisites @@ -9,84 +11,48 @@ This document covers deploying FieldTrack API to a Linux VPS using the included - A Linux VPS (Ubuntu 22.04 recommended) accessible via SSH - A GitHub Container Registry (GHCR) account with push access to the repository - GitHub Actions secrets configured (see [CI/CD Setup](#cicd-setup)) -- Docker and Docker Compose installed on the VPS (handled by `vps-setup.sh`) +- Docker installed on the VPS +- Nginx already running and configured via the **infra repository** --- -## Initial VPS Provisioning - -The `vps-setup.sh` script handles the full first-time setup of a fresh VPS: - -```bash -# Copy the script to the VPS and run as root -scp scripts/vps-setup.sh root@your-server:/tmp/ -ssh root@your-server 'bash /tmp/vps-setup.sh' -``` - -This script: +## API Deployment -1. Installs Docker, Docker Compose, Nginx, and system dependencies -2. Creates a dedicated `deploy` OS user with limited permissions -3. Clones the repository and initialises the directory structure -4. Obtains a TLS certificate via Let's Encrypt (`certbot`) -5. Configures Nginx as a reverse proxy (TLS termination + blue-green upstream switching) -6. Sets up a `systemd` service for auto-restart on boot -7. Configures log rotation and minimal `ufw` firewall rules -8. Starts the monitoring stack (Prometheus, Grafana, Loki, Tempo) +1. SSH into VPS +2. Ensure nginx is running (managed via infra repository) +3. Copy `.env.example` to `.env` and fill in all values +4. Deploy: `./scripts/deploy.sh ` +5. 
Confirm health: `curl https://<domain>/health`

-Before running, update the variables at the top of the script:

+## Rollback

```bash
-DOMAIN="yourdomain.com" # Your server's domain
-DEPLOY_USER="fieldtrack" # OS user to run the service
-GH_USER="your-github-username" # GitHub username (for GHCR)
-REPO_URL="https://github.com/your-username/api.git"
+./scripts/deploy.sh --rollback # interactive
+./scripts/deploy.sh --rollback --auto # non-interactive (CI)
```

----

+## Monitoring

-## API Deployment
-1. SSH into VPS
-2. Run `scripts/vps-setup.sh` from workspace root
-3. Set `.env` and `.env.monitoring` in workspace root
-4. Start monitoring stack: `docker-compose -f infra/docker-compose.monitoring.yml up -d`
-5. Deploy API: `scripts/deploy-bluegreen.sh`
-6. Confirm readiness: `curl https:///ready`
-7. Confirm Prometheus target status is UP

+The observability stack (Prometheus, Grafana, Loki, Tempo) is **handled by the infra repository**. The API exposes:
+- `GET /metrics` — Prometheus-format metrics (protected by `METRICS_SCRAPE_TOKEN`)
+- Traces exported via OTLP to `TEMPO_ENDPOINT`

-## Rollback
-1. API: `scripts/rollback.sh`

+---

-## Monitoring
-1. Set `.env.monitoring` in workspace root
-2. Start stack: `docker-compose -f infra/docker-compose.monitoring.yml up -d`
-3. Grafana: `http://:3000`
-4. Prometheus: `http://:9090`
-5. Loki: `http://:3100`
-6. Tempo: `http://:3200`
-
-## Nginx
-1. Config: `infra/nginx/api.conf`
-2. Canonical path: `/etc/nginx/conf.d/api.conf`
-3. TLS bootstrap: two-stage via Certbot
+## Blue-Green Deployment

-## Troubleshooting
-1. Logs: `infra/promtail/promtail.yml`
-2. Alerts: `infra/prometheus/alerts.yml`
-3. Config: `infra/prometheus/prometheus.yml`
-4. Grafana dashboards: `infra/grafana/dashboards/`
-5. Nginx config: `infra/nginx/api.conf`

The deployment uses a blue-green strategy for zero-downtime releases.

### How It Works

-The VPS always runs **two containers** (`api-blue` on port 3001, `api-green` on port 3002).
Nginx routes all traffic to whichever is currently active. +The VPS keeps **two named slots** (`api-blue`, `api-green`). Only the active slot receives traffic through nginx over `api_network`. +The API containers do **not** bind host ports. On each deploy: 1. The new image is pulled from GHCR 2. The **inactive** container is replaced with the new image -3. Readiness checks poll `GET /ready` until the new container is ready (up to 60 s) +3. The new container is health-checked via `GET /health` 4. Nginx upstream is switched to the new container (`nginx -s reload`) 5. The previously active container is stopped and removed 6. The deployed SHA is prepended to `.deploy_history` (keeps last 5) @@ -95,13 +61,10 @@ On each deploy: ```bash # SSH into the VPS -cd /home/ashish/api +cd $HOME/api # Deploy a specific image SHA (e.g. from CI output) -./scripts/deploy-bluegreen.sh a4f91c2 - -# Deploy the latest tag -./scripts/deploy-bluegreen.sh latest +./scripts/deploy.sh a4f91c2 ``` --- @@ -111,66 +74,38 @@ cd /home/ashish/api To instantly revert to the previous deployment: ```bash -cd /home/ashish/api -./scripts/rollback.sh +cd $HOME/api +./scripts/deploy.sh --rollback ``` The script: 1. Reads `.deploy_history` (requires at least 2 recorded deployments) 2. Displays the full history with current/target markers 3. Prompts for confirmation before proceeding -4. Calls `deploy-bluegreen.sh ` — no rebuild, image already in GHCR +4. Redeploys the previous SHA — no rebuild, image already in GHCR **Typical rollback time: under 10 seconds.** -To deploy any specific historical SHA: - -```bash -./scripts/deploy-bluegreen.sh 7b3e9f1 -``` - For full rollback system documentation, see [ROLLBACK_SYSTEM.md](ROLLBACK_SYSTEM.md). 
--- -## Monitoring Stack - -The observability stack runs alongside the application on the same VPS: - -```bash -cd infra -docker compose -f docker-compose.monitoring.yml up -d -``` - -| Service | Default Port | Access | -|---------|-------------|--------| -| Grafana | 3001 (internal) | Via Nginx proxy or direct | -| Prometheus | 9090 (internal) | Internal only | -| Loki | 3100 (internal) | Internal only | -| Tempo | 3200 / 4318 | Internal only | - -The pre-built Grafana dashboard (`infra/grafana/dashboards/fieldtrack.json`) is auto-provisioned and covers HTTP metrics, queue depth, latency, and Redis health. - ---- - ## Environment Variables Copy `.env.example` to `.env` on the VPS and fill in all values before the first deploy. -See [README.md](../README.md) for the full variable reference. +See [README.md](../README.md) and [env-contract.md](env-contract.md) for the full variable reference. --- -## Health Check +## Health Endpoints -The application exposes a public health endpoint: +| Endpoint | Purpose | Deploy gate | +|----------|---------|-------------| +| `GET /health` | Liveness — returns `{"status":"ok"}` after bootstrap | **YES** | +| `GET /ready` | Dependency check (Redis + Supabase) | NO — informational only | -```bash -curl https://yourdomain.com/health -# {"status":"ok","timestamp":"2026-03-10T12:00:00.000Z"} -``` - -The deployment script now uses `/ready` to validate dependency readiness before switching Nginx traffic. +The deploy script uses `/health` exclusively. `/ready` failing does not block a deployment. --- @@ -185,7 +120,7 @@ docker logs api-green # or api-blue **Rollback fails: "insufficient deployment history"** Only one deployment has been recorded. 
Deploy manually with a known-good SHA:
```bash
-./scripts/deploy-bluegreen.sh <sha>
+./scripts/deploy.sh <sha>
```

**Container image not found in GHCR**
docker pull ghcr.io/fieldtrack-tech/api:<sha>
```

**Nginx fails to reload**
-Check the Nginx config syntax:
-```bash
-nginx -t
-```
+Nginx is managed by the infra repository. Check its configuration and reload there.
+
+**API starts but /ready fails**
+Acceptable — Redis or Supabase may be temporarily unavailable. The deploy is still considered successful if `/health` returns 200.
diff --git a/docs/OBSERVABILITY_ARCHITECTURE.md b/docs/OBSERVABILITY_ARCHITECTURE.md
index 2edb05a..9a23f1f 100644
--- a/docs/OBSERVABILITY_ARCHITECTURE.md
+++ b/docs/OBSERVABILITY_ARCHITECTURE.md
@@ -1,299 +1,24 @@
-# FieldTrack API — Observability Architecture
+# FieldTrack — Observability Architecture

-This document describes the monitoring, logging, and metrics systems in FieldTrack API and how they fit together in production.
+> **Handled by infra repository.**
+>
+> The monitoring stack (Prometheus, Grafana, Loki, Tempo, Promtail, Alertmanager) is
+> configured and operated out of the infra repository, not this one.
---

+## What this API exposes

-## Stack Topology

| Endpoint | Purpose |
|----------|---------|
| GET /metrics | Prometheus-format metrics (protected by `METRICS_SCRAPE_TOKEN`) |
| OTLP traces | Exported to `TEMPO_ENDPOINT` (default: `http://tempo:4318`) |
| Structured logs | JSON via Pino, written to stdout — collected by infra's Promtail |

-```
- ┌─────────────────────────────────────────────────┐
- │ VPS (single host) │
- │ │
- Browser / Client │ Nginx (public) │
- │ │ ├─ / → api-blue:3000 │
- │ HTTPS │ │ or api-green:3000 │
- └─────────────────►│ └─ /monitor/ → 127.0.0.1:3333 (Grafana) │
- │ │
- │ ┌──────────────────────────────────────────┐ │
- │ │ api_network (Docker) │ │
- │ │ │ │
- │ │ api-blue:3000 ──────────────────┐ │ │
- │ │ api-green:3000 ── /metrics ──────┼──┼──►│ Prometheus
- │ │ │ │ │ 127.0.0.1:9090
- │ │ node-exporter:9100 ─── /metrics ───┘ │ │
- │ │ │ │
- │ │ Promtail ──── push ──► Loki:3100 │ │
- │ │ │ │ │ │
- │ │ │ reads │ │ │
- │ │ /var/log/* ▼ │ │
- │ │ /var/lib/docker/ Grafana │ │
- │ │ containers/ :3000 → │ │
- │ │ 127.0.0.1:3333 │ │
- │ └──────────────────────────────────────────┘ │
- └─────────────────────────────────────────────────┘
-```

+## Environment variables (API side)

----

| Variable | Purpose |
|----------|---------|
| `METRICS_SCRAPE_TOKEN` | Token that Prometheus must send when scraping `/metrics` |
| `TEMPO_ENDPOINT` | OTLP HTTP endpoint for trace export |

-## Metrics Flow

+See [env-contract.md](env-contract.md) for full details.

-### Scrape chain
-
-```
-Prometheus (every 15 s)
- ├─ GET api-blue:3000/metrics [x-metrics-token: <token>]
- ├─ GET api-green:3000/metrics [x-metrics-token: <token>] ← inactive = DOWN (expected)
- ├─ GET node-exporter:9100/metrics [no auth — host-internal only]
- └─ GET localhost:9090/metrics [self-monitoring]
-```
-
-### Endpoint
-
-The Fastify API exposes `/metrics` in [OpenMetrics](https://openmetrics.io/) format via the `@fastify/metrics` plugin.
The endpoint is **not** reachable through Nginx (blocked by `location /metrics { return 403; }`). - -### Authentication - -Prometheus sends a custom header on every scrape: - -``` -x-metrics-token: -``` - -The API validates this header in its metrics middleware. Requests without a matching token receive `403 Forbidden`. - -`METRICS_SCRAPE_TOKEN` is injected into the Prometheus container via the `METRICS_SCRAPE_TOKEN` environment variable, which Prometheus expands when loading `prometheus.yml` -(`headers: { x-metrics-token: ${METRICS_SCRAPE_TOKEN} }`). - -### Prometheus config file - -[infra/prometheus/prometheus.yml](../infra/prometheus/prometheus.yml) - -### Retention - -- Time-based: **30 days** -- Size-based: **5 GB** - Prometheus evicts oldest data first when the size limit is reached. - ---- - -## Logs Flow - -### Collection chain - -``` -Container stdout/stderr - │ - ▼ -Docker JSON log files - /var/lib/docker/containers//*-json.log - │ - ▼ (Promtail reads, parses, labels) - │ - ▼ -Loki:3100/loki/api/v1/push - │ - ▼ -Grafana (Loki datasource) → Explore / Dashboard panels -``` - -### Promtail config file - -[infra/promtail/promtail.yml](../infra/promtail/promtail.yml) - -### Log sources - -| Source | Path | Labels added | -|--------|------|--------------| -| Docker containers | `/var/lib/docker/containers/*/*-json.log` | `job=docker`, `container_id`, `level`, `trace_id` | -| Host syslog | `/var/log/*.log` | `job=syslog` | - -### Log parsing pipeline (Docker) - -Promtail applies a multi-stage pipeline to container logs: - -1. **`docker: {}`** — unwraps Docker's JSON envelope (`log`, `stream`, `time`) -2. **regex** — extracts `container_id` from the file path -3. **json** — extracts `level`, `msg`, `trace_id`, `span_id` from Pino structured logs -4. 
**labels** — promotes `level` and `trace_id` as Loki stream labels - -### Positions persistence - -Promtail records log offsets in: - -``` -/data/positions.yaml (inside promtail_data Docker volume → fieldtrack_promtail_data) -``` - -This file survives container restarts so Promtail never re-ingests already-processed logs. - -### Loki retention - -Loki is configured via [infra/loki/loki-config.yaml](../infra/loki/loki-config.yaml). - -| Setting | Value | Location | -|---------|-------|----------| -| `limits_config.retention_period` | `30d` | `loki-config.yaml` | -| `compactor.retention_enabled` | `true` | `loki-config.yaml` | -| Compaction interval | every 10 minutes | `loki-config.yaml` | -| Deletion delay | 2 hours | `loki-config.yaml` | - -The compactor process runs inside the single-binary Loki container. It scans the index every 10 minutes, marks chunks older than 30 days for deletion, and removes them 2 hours later. The `loki_data` Docker volume (stored in `/loki/chunks`, `/loki/rules`, `/loki/compactor`) must have enough disk space for at most 30 days of logs. - ---- - -## Grafana - -| Property | Value | -|----------|-------| -| Bound to | `127.0.0.1:3333` | -| Public URL | `https:///monitor/` | -| Served via | Nginx `location /monitor/` → `proxy_pass http://127.0.0.1:3333` | -| Auth | Admin credentials from `GRAFANA_ADMIN_PASSWORD` secret | -| Sign-up | Disabled (`GF_USERS_ALLOW_SIGN_UP=false`) | - -### Datasources (provisioned) - -Configured under [infra/grafana/provisioning/datasources/](../infra/grafana/provisioning/datasources/). - -| Name | Type | URL | -|------|------|-----| -| Prometheus | prometheus | `http://prometheus:9090` | -| Loki | loki | `http://loki:3100` | - -### Dashboards (provisioned) - -Pre-built dashboards are stored in [infra/grafana/dashboards/](../infra/grafana/dashboards/) and automatically loaded at startup. - ---- - -## Container Services - -All services run inside the `api_network` Docker bridge network. 
- -| Container | Image | Bound port | Role | -|-----------|-------|------------|------| -| `prometheus` | `prom/prometheus:v2.52.0` | `127.0.0.1:9090` | Metrics scraper & TSDB | -| `grafana` | `grafana/grafana:10.4.2` | `127.0.0.1:3333` | Dashboards | -| `loki` | `grafana/loki:2.9.6` | internal `:3100` | Log aggregation | -| `promtail` | `grafana/promtail:2.9.6` | — | Log shipper | -| `node-exporter` | `prom/node-exporter:v1.8.1` | internal `:9100` | Host metrics | - -All images are **pinned** to exact versions to ensure deterministic restarts. - -### Resource limits - -Each monitoring container has a Docker-managed memory ceiling enforced via `deploy.resources.limits`: - -| Container | Memory limit | -|-----------|--------------| -| `loki` | 1 GB | -| `prometheus` | 1 GB | -| `grafana` | 512 MB | -| `promtail` | 128 MB | -| `node-exporter` | *(no limit — minimal footprint)* | - ---- - -## Persistent Volumes - -| Docker Volume | Named Volume | Contents | -|---------------|-------------|----------| -| `prometheus_data` | `fieldtrack_prometheus_data` | Prometheus TSDB | -| `grafana_data` | `fieldtrack_grafana_data` | Grafana DB, plugins | -| `loki_data` | `fieldtrack_loki_data` | Loki chunks & index | -| `promtail_data` | `fieldtrack_promtail_data` | Log offset positions file | - ---- - -## Monitoring Stack Restart Policy - -The deploy script ([scripts/deploy-bluegreen.sh](../scripts/deploy-bluegreen.sh)) and the CI sync-infra job only restart the monitoring stack when monitoring configuration has actually changed. - -Change detection uses a SHA-256 hash over all files matching: - -``` -infra/**/*.{yml,yaml,conf,toml,json} -``` - -with the `infra/nginx/` subtree excluded (nginx is rendered on every deploy and does not require a monitoring restart). - -The last-known hash is stored at `~/.fieldtrack-monitoring-hash`. If the new hash matches, the monitoring stack is left running untouched. 
- ---- - -## Security Notes - -| Control | Detail | -|---------|--------| -| `/metrics` blocked at Nginx | `location /metrics { return 403; }` — scraping is only possible from inside `api_network` | -| Prometheus token auth | `x-metrics-token` header required; value stored in `METRICS_SCRAPE_TOKEN` env var | -| Grafana not publicly listed | Accessible only at `/monitor/`; no signup | -| Monitoring ports loopback-bound | Prometheus `:9090` and Grafana `:3333` bound to `127.0.0.1`; not accessible externally | -| Image versions pinned | No `latest` tags — prevents silent breaking changes on container restart | -| Container log limits | All monitoring containers use `json-file` driver with `max-size: 10m` / `max-file: 3` | - ---- - -## Alerting (Deployed) - -The [infra/prometheus/alerts.yml](../infra/prometheus/alerts.yml) file defines alerting rules. Prometheus loads it via: - -```yaml -rule_files: - - alerts.yml -``` - -Alertmanager is now deployed in [infra/docker-compose.monitoring.yml](../infra/docker-compose.monitoring.yml) and configured in [infra/prometheus/prometheus.yml](../infra/prometheus/prometheus.yml): - -```yaml -alerting: - alertmanagers: - - static_configs: - - targets: - - alertmanager:9093 -``` - -Alertmanager is configured at [infra/alertmanager/alertmanager.yml](../infra/alertmanager/alertmanager.yml), and Slack webhook is loaded from `infra/.env.monitoring` (ALERTMANAGER_SLACK_WEBHOOK). - -Alerting now uses Slack only. Set this in `infra/.env.monitoring` with a valid Slack incoming webhook endpoint: - -- `ALERTMANAGER_SLACK_WEBHOOK` - -Then redeploy the monitoring stack. - - ---- - -## Certbot Bootstrap (Fresh VPS) - -Nginx references LetsEncrypt certificates at `/etc/letsencrypt/live//`. On a fresh VPS these do not exist yet, so a full SSL config causes Nginx to refuse to start. - -**Safe bootstrap sequence:** - -1. Deploy a temporary HTTP-only Nginx config that only serves `/.well-known/acme-challenge/` and your `server_name`. 
Comment out the `listen 443` server block and all `ssl_*` directives. - -2. Start Nginx with the HTTP-only config: - ```bash - sudo nginx -t && sudo systemctl start nginx - ``` - -3. Obtain the certificate: - ```bash - sudo certbot certonly --webroot -w /var/www/certbot -d $API_HOSTNAME - ``` - -4. Render and install the full SSL config from the template: - ```bash - sed \ - -e "s|__ACTIVE_CONTAINER__|api-blue|g" \ - -e "s|__API_HOSTNAME__|$API_HOSTNAME|g" \ - infra/nginx/api.conf > infra/nginx/live/api.conf - # nginx runs in Docker — reload via docker exec (no host nginx service): - docker exec nginx nginx -t && docker exec nginx nginx -s reload - ``` - -5. Enable auto-renewal (Certbot installs a systemd timer automatically on Ubuntu): - ```bash - sudo systemctl status certbot.timer - ``` diff --git a/docs/ROLLBACK_QUICKREF.md b/docs/ROLLBACK_QUICKREF.md index d130b7b..db05a05 100644 --- a/docs/ROLLBACK_QUICKREF.md +++ b/docs/ROLLBACK_QUICKREF.md @@ -5,18 +5,23 @@ ### Deploy Latest Version ```bash cd "$HOME/api" -./scripts/deploy-bluegreen.sh +./scripts/deploy.sh ``` ### Rollback to Previous Version ```bash cd "$HOME/api" -./scripts/rollback.sh +./scripts/deploy.sh --rollback +``` + +### Rollback (non-interactive, for CI) +```bash +./scripts/deploy.sh --rollback --auto ``` ### Deploy Specific Version ```bash -./scripts/deploy-bluegreen.sh 7b3e9f1 +./scripts/deploy.sh 7b3e9f1 ``` ## How It Works @@ -61,7 +66,8 @@ cd "$HOME/api" ``` ┌──────────────────┐ -│ ./rollback.sh │ +│ ./deploy.sh │ +│ --rollback │ └──────┬───────────┘ │ ▼ @@ -97,8 +103,7 @@ cd "$HOME/api" ``` /api/ ├── scripts/ -│ ├── deploy-bluegreen.sh -│ └── rollback.sh +│ └── deploy.sh # Deploy and rollback └── .deploy_history (last 5 SHAs) ``` @@ -106,19 +111,15 @@ cd "$HOME/api" ```bash # Deploy new version -$ ./scripts/deploy-bluegreen.sh b8c4d2e -[1/7] Pulling image... -[2/7] Detecting active container... -[3/7] Starting inactive container... -[4/7] Waiting for health check... 
-[5/7] Switching nginx upstream... -[6/7] Reloading nginx... -[7/7] Cleaning old container... -Deployment successful. -Deployment history updated: b8c4d2e +$ ./scripts/deploy.sh b8c4d2e +[DEPLOY] state=PULL_IMAGE ... +[DEPLOY] state=START_INACTIVE ... +[DEPLOY] state=HEALTH_CHECK_INTERNAL ... +[DEPLOY] state=SWITCH_NGINX ... +[DEPLOY] state=SUCCESS duration_sec=18 # Issue discovered - rollback -$ ./scripts/rollback.sh +$ ./scripts/deploy.sh --rollback Current deployment : b8c4d2e Previous deployment: a4f91c2 @@ -131,11 +132,7 @@ Deployment history: Current production will be replaced with: a4f91c2 Continue with rollback? (yes/no): yes - -Starting rollback to image: a4f91c2 -[1/7] Pulling image... -... -Rollback completed successfully. +[DEPLOY] state=SUCCESS duration_sec=9 msg=DEPLOY_SUCCESS Production is now running: a4f91c2 ``` @@ -143,7 +140,7 @@ Production is now running: a4f91c2 | Issue | Solution | |-------|----------| -| Script not executable | `chmod +x scripts/rollback.sh` | +| Script not executable | `chmod +x scripts/deploy.sh` | | No deployment history | Deploy at least once before rollback | | Insufficient history | Need at least 2 deployments to rollback | | Image not found | Verify SHA exists in GHCR | diff --git a/docs/ROLLBACK_SYSTEM.md b/docs/ROLLBACK_SYSTEM.md index 0aad560..56f9a8d 100644 --- a/docs/ROLLBACK_SYSTEM.md +++ b/docs/ROLLBACK_SYSTEM.md @@ -8,9 +8,8 @@ The rollback system provides instant production recovery by redeploying previous ### Components -1. **deploy-bluegreen.sh** - Blue-green deployment script with deployment tracking -2. **rollback.sh** - Automated rollback to previous deployment -3. **.deploy_history** - Deployment history file storing the last 5 deployed image SHAs +1. **deploy.sh** - Unified blue-green deployment and rollback script +2. 
**.deploy_history** - Deployment history file storing the last 5 deployed image SHAs ### How It Works @@ -23,8 +22,8 @@ The rollback system provides instant production recovery by redeploying previous 2. Deploy script pulls image and performs blue-green deployment 3. After successful deployment → prepends "a4f91c2" to .deploy_history 4. History maintains last 5 deployments -5. If deployment fails → rollback.sh reads line 2 from .deploy_history -6. Rollback redeploys previous image using deploy-bluegreen.sh +5. If deployment fails → `deploy.sh --rollback --auto` is triggered by CI +6. Rollback redeploys previous image using `deploy.sh ` ``` ### Deployment Tracking @@ -63,7 +62,7 @@ Deploy the latest image from CI: ```bash cd "$HOME/api" -./scripts/deploy-bluegreen.sh a4f91c2 +./scripts/deploy.sh a4f91c2 ``` ### Rollback to Previous Version @@ -72,7 +71,7 @@ Instantly restore the last working deployment: ```bash cd "$HOME/api" -./scripts/rollback.sh +./scripts/deploy.sh --rollback ``` **Interactive output with history:** @@ -99,10 +98,7 @@ Manually deploy any historical image: ```bash # Deploy a specific commit SHA -./scripts/deploy-bluegreen.sh 7b3e9f1 - -# Deploy a specific tag -./scripts/deploy-bluegreen.sh v1.2.3 +./scripts/deploy.sh 7b3e9f1 ``` ## Safety Features @@ -147,7 +143,7 @@ sudo systemctl reload nginx # Reload only if valid ```bash # Deploy new version -./scripts/deploy-bluegreen.sh b8c4d2e +./scripts/deploy.sh b8c4d2e # Health check fails → deployment aborted # Production still running previous version @@ -158,7 +154,7 @@ sudo systemctl reload nginx # Reload only if valid ```bash # Deploy succeeds but issue discovered later -./scripts/rollback.sh +./scripts/deploy.sh --rollback # Confirms rollback # Redeploys previous image in <10 seconds @@ -169,11 +165,11 @@ sudo systemctl reload nginx # Reload only if valid ```bash # Need to deploy a specific older version -./scripts/deploy-bluegreen.sh 7b3e9f1 +./scripts/deploy.sh 7b3e9f1 # Pulls specific image 
from GHCR # Performs blue-green deployment -# Updates .last_deploy to 7b3e9f1 +# Prepends SHA to .deploy_history (rolling last 5) ``` ## Integration with CI/CD @@ -185,7 +181,7 @@ sudo systemctl reload nginx # Reload only if valid run: | ssh ${{ secrets.VPS_USER }}@${{ secrets.VPS_HOST }} \ "cd \"$HOME/api\" && \ - ./scripts/deploy-bluegreen.sh ${{ env.SHA_SHORT }}" + ./scripts/deploy.sh ${{ env.SHA_SHORT }}" ``` ### Deployment History @@ -206,10 +202,9 @@ The history maintains the last 5 deployments in chronological order (newest firs ``` $HOME/api/ ├── scripts/ -│ ├── deploy-bluegreen.sh # Blue-green deployment -│ └── rollback.sh # Rollback automation -├── .deploy_history # Last 5 deployment SHAs -└── .env # Environment configuration +│ └── deploy.sh # Unified deploy + rollback +├── .deploy_history # Last 5 deployment SHAs +└── .env # Environment configuration ``` ## Troubleshooting @@ -218,7 +213,7 @@ $HOME/api/ ```bash # Make script executable -chmod +x scripts/rollback.sh +chmod +x scripts/deploy.sh ``` ### No Deployment History @@ -278,4 +273,3 @@ Potential improvements (not currently implemented): - [Blue-Green Deployment](./DEPLOYMENT.md) - [CI/CD Pipeline](.github/workflows/deploy.yml) -- [VPS Setup](../scripts/vps-setup.sh) diff --git a/docs/SLO.md b/docs/SLO.md index b3daab9..88683bc 100644 --- a/docs/SLO.md +++ b/docs/SLO.md @@ -1,6 +1,6 @@ # FieldTrack Service Level Objectives (SLOs) -This document defines the service-level objectives for FieldTrack production services. Each SLO has a corresponding error budget and alert rules in `infra/prometheus/alerts.yml`. +This document defines the service-level objectives for FieldTrack production services. Alert rules are implemented in the standalone infra repository. 
--- @@ -19,7 +19,7 @@ This document defines the service-level objectives for FieldTrack production ser | | | |---|---| -| **SLI** | `up{job=~"fieldtrack-api.*"}` | +| **SLI** | HTTP availability measured from `up` metric on API containers | | **Target** | 99.9% monthly availability | | **Error budget** | 43.8 minutes / month | | **Window** | 30-day rolling | @@ -33,7 +33,7 @@ Sub-1h monthly downtime budget is appropriate for a B2B scheduling SaaS. Breach | | | |---|---| -| **SLI** | `histogram_quantile(0.95, ...)` over `http_request_duration_seconds_bucket` | +| **SLI** | p95 HTTP request duration (measured via `http_request_duration_seconds_bucket`) | | **Target p95** | < 500 ms | | **Target p99** | < 2 000 ms | | **Error budget** | 5% of requests may exceed the p95 threshold | @@ -48,7 +48,7 @@ Sub-1h monthly downtime budget is appropriate for a B2B scheduling SaaS. Breach | | | |---|---| -| **SLI** | `rate(http_requests_total{status_code=~"5.."}[5m]) / rate(http_requests_total[5m])` | +| **SLI** | Ratio of 5xx responses to total HTTP requests | | **Target** | < 1% 5xx error rate | | **Error budget** | 1% of requests may fail with 5xx | | **Window** | 5-minute rolling | diff --git a/docs/env-contract.md b/docs/env-contract.md index 922bc1d..7e1d63d 100644 --- a/docs/env-contract.md +++ b/docs/env-contract.md @@ -35,8 +35,8 @@ | `*_BASE_URL` | Full URL — scheme + host, **no trailing slash** | `https://api.getfieldtrack.app` | | `*_HOSTNAME` | Bare domain — **no scheme, no path** | `api.getfieldtrack.app` | -**`API_HOSTNAME` is always DERIVED from `API_BASE_URL` at deploy-time by `load-env.sh`.** -It must **never** be set in `.env` — set it only in `infra/.env.monitoring`. +**`API_HOSTNAME` is always DERIVED from `API_BASE_URL` at deploy-time by the deployment workflow/script.** +It must **never** be set in `.env` — set it only in the infra repository's `.env.monitoring`. --- @@ -48,7 +48,7 @@ Validated by `src/config/env.ts` (Zod schema, fail-fast). 
| Variable | Required in Prod | Type | Purpose | |----------|:---:|------|---------| -| `API_BASE_URL` | ✅ | `https://…` URL | **The canonical public URL of this API.** Used in OpenAPI server definitions and any server-generated links referencing the API itself. Also used by all deploy scripts and CI smoke tests. | +| `API_BASE_URL` | ✅ | `https://…` URL | **The canonical public URL of this API.** Used in OpenAPI server definitions and any server-generated links referencing the API itself. Also used by deploy scripts and CI health checks. | | `APP_BASE_URL` | ✅ | `https://…` URL | Canonical root URL for the whole application. Used in email footers, OpenGraph canonical tags, and generic redirects that don't need to distinguish API vs frontend. | | `FRONTEND_BASE_URL` | ✅ | `https://…` URL | Public URL of the web frontend (maintained in a separate repository: `fieldtrack-tech/web`). Used to build password-reset and invitation email links. | @@ -124,28 +124,22 @@ Validated by `src/config/env.ts` (Zod schema, fail-fast). ## CI / Scripts — GitHub Actions + Shell Scripts -Variables consumed by `smoke-test.sh`, deploy scripts, and workflows. +Variables consumed by deploy scripts and workflows. Stored as **GitHub repository secrets**. 
| Secret Name | Purpose | Used By | |------------|---------|---------| -| `API_BASE_URL` | Full public URL of the API for health probes and smoke tests | `deploy.yml`, `smoke-test.sh` | +| `API_BASE_URL` | Full public URL of the API for health probes | `deploy.yml`, `deploy.sh` | | `CORS_ORIGIN` | Allowed CORS origins for the deployed container | `deploy.yml` (pre-flight validation) | | `DO_HOST` | DigitalOcean VPS IP / hostname | SSH deploy steps | | `DO_USER` | SSH username on VPS | SSH deploy steps | | `DO_SSH_KEY` | SSH private key (PEM) | SSH deploy steps | -| `FT_EMP_EMAIL` | Employee test account email | `smoke-test.sh` | -| `FT_EMP_PASSWORD` | Employee test account password | `smoke-test.sh` | -| `FT_ADMIN_EMAIL` | Admin test account email | `smoke-test.sh` | -| `FT_ADMIN_PASSWORD` | Admin test account password | `smoke-test.sh` | -| `SUPABASE_URL` | Supabase project URL (for smoke test auth) | `smoke-test.sh` | -| `SUPABASE_ANON_KEY` | Supabase anon key (for smoke test auth) | `smoke-test.sh` | > **Renamed:** `FT_API_BASE_URL` → `API_BASE_URL`. Update the GitHub repo secret accordingly. --- -## Infra — `infra/.env.monitoring` +## Infra (standalone infra repository) — `.env.monitoring` Used by Docker Compose for Prometheus, Grafana, Nginx, Blackbox Exporter. @@ -206,7 +200,7 @@ FRONTEND_BASE_URL=https://app.getfieldtrack.app CORS_ORIGIN=https://app.getfieldtrack.app METRICS_SCRAPE_TOKEN= -# Infra (infra/.env.monitoring on VPS) +# Infra (.env.monitoring in infra repo on VPS) API_HOSTNAME=api.getfieldtrack.app METRICS_SCRAPE_TOKEN= GRAFANA_ADMIN_PASSWORD= @@ -217,12 +211,6 @@ CORS_ORIGIN=https://app.getfieldtrack.app DO_HOST= DO_USER=ashish DO_SSH_KEY= -FT_EMP_EMAIL= -FT_EMP_PASSWORD= -FT_ADMIN_EMAIL= -FT_ADMIN_PASSWORD= -SUPABASE_URL=https://your-project.supabase.co -SUPABASE_ANON_KEY=eyJ... 
``` --- @@ -248,7 +236,7 @@ The following variables were **renamed** as part of the env contract cleanup (Ma | Old Name | New Name | Where | |----------|----------|-------| -| `FT_API_BASE_URL` | `API_BASE_URL` | GitHub secrets, `smoke-test.sh`, `deploy.yml` | +| `FT_API_BASE_URL` | `API_BASE_URL` | GitHub secrets, `deploy.yml` | **Action required:** 1. Rename the GitHub repository secret `FT_API_BASE_URL` → `API_BASE_URL` diff --git a/docs/infra-contract.md b/docs/infra-contract.md new file mode 100644 index 0000000..63425bd --- /dev/null +++ b/docs/infra-contract.md @@ -0,0 +1,20 @@ +# Infra Contract + +This API repository expects an external infra repository to provide runtime infrastructure. + +Required external services: +- nginx container attached to `api_network` +- Redis reachable at `redis:6379` + +Required external paths under `INFRA_ROOT`: +- `$INFRA_ROOT/nginx/live` +- `$INFRA_ROOT/nginx/backup` +- `$INFRA_ROOT/nginx/api.conf` + +Default on server: +- `INFRA_ROOT=/opt/infra` + +Deployment assumptions: +- API deploy script (`scripts/deploy.sh`) never starts infra services +- API deploy script only renders and reloads nginx config via paths under `INFRA_ROOT` +- API and infra share the Docker bridge network `api_network` diff --git a/docs/walkthrough.md b/docs/walkthrough.md index 2f07725..c80d357 100644 --- a/docs/walkthrough.md +++ b/docs/walkthrough.md @@ -1784,101 +1784,18 @@ Result: **0 errors** across all 3 utility files, 3 repositories, 2 services, 5 c --- -## Phase 13 — Production Infrastructure: VPS, Nginx & Monitoring Stack +## Phase 13 — Production Infrastructure (Moved) -### Overview - -Phase 13 moved FieldTrack 2.0 from a locally-runnable service to a fully operational production deployment. It introduced the VPS setup automation, Nginx reverse proxy, and a complete self-hosted observability stack (Prometheus + Grafana + Loki + Tempo). 
- ---- - -### 13.1 — VPS Setup Script - -**File:** `scripts/vps-setup.sh` - -A single idempotent script provisions a fresh Ubuntu VPS from zero to production-ready: - -- Installs Docker, Docker Compose, Nginx, and dependencies -- Creates the `fieldtrack` OS user with limited permissions -- Clones the repository and creates the directory structure -- Configures the `systemd` service for auto-restart -- Issues and renews TLS certificates via Let's Encrypt (`certbot`) -- Sets up log rotation and minimal firewall rules (`ufw`) -- Starts the monitoring stack alongside the application - ---- +Infrastructure ownership was extracted from this API repository. -### 13.2 — Nginx Reverse Proxy +The following are now managed in the standalone infra repository: +- VPS bootstrap and host setup +- nginx reverse proxy configuration +- Redis runtime service +- monitoring stack (Prometheus, Grafana, Loki, Alertmanager, Promtail) -**File:** `infra/nginx/api.conf` - -- Terminates TLS (HTTPS → HTTP to backend containers) -- Upstream block points to the active blue/green container port -- HTTP → HTTPS redirect on port 80 -- Proxy headers: `X-Real-IP`, `X-Forwarded-For`, `X-Forwarded-Proto` -- WebSocket upgrade support (`Upgrade`, `Connection` headers) -- Gzip compression for JSON responses -- Security headers: `X-Frame-Options`, `X-Content-Type-Options`, `HSTS` - ---- - -### 13.3 — Monitoring Stack - -**File:** `infra/docker-compose.monitoring.yml` - -Five services on the `api_network` Docker network: - -| Service | Port | Role | -|---------|------|------| -| `prometheus` | 9090 | Scrapes `/metrics` every 15 s; stores time-series | -| `grafana` | 3001 | Dashboards, alerting, data-source wiring | -| `loki` | 3100 | Log aggregation backend | -| `promtail` | — | Reads Docker container logs; ships to Loki | -| `tempo` | 3200 / 4317 / 4318 | Distributed trace storage; OTLP ingest | - ---- +This API repository now focuses on application code and deployment orchestration only. 
-### 13.4 — Grafana Dashboard - -**File:** `infra/grafana/dashboards/fieldtrack.json` - -A provisioned Grafana dashboard covering: - -- HTTP request rate and error rate by route -- p50/p95/p99 latency per endpoint -- Node.js heap usage and event-loop lag -- BullMQ queue depth and recalculation throughput -- Active session count -- Redis memory usage - -Dashboard is automatically loaded on container start via `infra/grafana/provisioning/`. - ---- - -### Files Created - -| File | Purpose | -|------|----------| -| `scripts/vps-setup.sh` | Full VPS provisioning from scratch | -| `infra/docker-compose.monitoring.yml` | Prometheus, Grafana, Loki, Promtail, Tempo | -| `infra/grafana/dashboards/fieldtrack.json` | Application dashboard (auto-provisioned) | -| `infra/grafana/provisioning/dashboards/dashboard.yml` | Dashboard provisioning config | -| `infra/grafana/provisioning/datasources/prometheus.yml` | Prometheus datasource provisioning | -| `infra/nginx/api.conf` | Nginx reverse proxy and TLS termination | -| `infra/prometheus/prometheus.yml` | Scrape config targeting backend `/metrics` | - ---- - -### Verification Results - -| Check | Result | -|-------|--------| -| VPS setup script idempotent | Can be re-run safely on existing VPS | -| Nginx serves HTTPS | TLS via Let's Encrypt certbot | -| Grafana auto-provisioned | Dashboard loads on container start | -| Prometheus scrapes backend | `http_requests_total` visible in Grafana | - ---- ## Phase 14 — Distributed Tracing, Log Correlation & Metric Exemplars @@ -1961,7 +1878,7 @@ httpRequestDuration.labels(labels).observeWithExemplar( Exemplars make individual high-latency data points "clickable" in Grafana: clicking a spike in the latency graph jumps directly to the Tempo trace for that exact request. 
-Infrastructure requirements enabled in `docker-compose.monitoring.yml`: +Infrastructure requirements enabled in the standalone infra repository: - Prometheus `--enable-feature=exemplar-storage` flag - Backend scraped with `Content-Type: application/openmetrics-text` (required for exemplar ingestion) @@ -1972,11 +1889,9 @@ Infrastructure requirements enabled in `docker-compose.monitoring.yml`: | File | Action | |------|--------| | `src/tracing.ts` | **NEW** — OpenTelemetry SDK bootstrap; OTLP exporter to Tempo | -| `src/server.ts` | **MODIFIED** — `import "./tracing.js"` as the very first import | +| `src/server.ts` | **MODIFIED** — calls `initTelemetry()` at startup before app bootstrap | | `src/config/logger.ts` | **MODIFIED** — `otelMixin` injects trace/span IDs into every log line | | `src/plugins/prometheus.ts` | **MODIFIED** — exemplar support on duration histogram | -| `infra/docker-compose.monitoring.yml` | **MODIFIED** — Tempo OTLP ports 4317/4318; Prometheus exemplar storage | -| `infra/prometheus/prometheus.yml` | **MODIFIED** — OpenMetrics scrape protocol for backend jobs | | `src/app.ts` | **MODIFIED** — `onRequest` hook enriches active span with route pattern and request ID | --- @@ -2372,7 +2287,7 @@ The pipeline is split into two jobs: ### Multi-Version Rollback System -**Files:** `scripts/deploy-bluegreen.sh`, `scripts/rollback.sh` +**File:** `scripts/deploy.sh` #### Deployment History @@ -2391,19 +2306,19 @@ The history window is capped at the **last 5 deployments**. #### Rollback Procedure ```bash -./scripts/rollback.sh +./scripts/deploy.sh --rollback ``` 1. Reads `.deploy_history` — requires ≥ 2 entries 2. Displays current and target versions with the full history 3. Prompts for interactive confirmation: `Continue with rollback? (yes/no)` -4. Calls `deploy-bluegreen.sh ` to redeploy the previous image +4. Calls `deploy.sh ` to redeploy the previous image 5. 
The previous image is already in GHCR — no rebuild, **< 10 seconds** end-to-end #### Deploy a Specific Historical Version ```bash -./scripts/deploy-bluegreen.sh 7b3e9f1 +./scripts/deploy.sh 7b3e9f1 ``` Any SHA from `.deploy_history` (or any valid GHCR tag) can be targeted directly. @@ -2415,8 +2330,7 @@ Any SHA from `.deploy_history` (or any valid GHCR tag) can be targeted directly. | File | Action | |------|--------| | `.github/workflows/deploy.yml` | **MODIFIED** — Split into `test` + `build-and-deploy` jobs; `npm ci`; `tsc --noEmit`; GHA cache | -| `scripts/deploy-bluegreen.sh` | **MODIFIED** — Appends SHA to `.deploy_history`; maintains 5-entry window | -| `scripts/rollback.sh` | **NEW** — Reads history, confirms, re-deploys previous image | +| `scripts/deploy.sh` | **MODIFIED** — Unified deploy + rollback, appends SHA to `.deploy_history`; maintains 5-entry window | | `.gitignore` | **MODIFIED** — `.deploy_history` excluded | | `docs/ROLLBACK_SYSTEM.md` | **NEW** — Architecture, usage, troubleshooting guide | | `docs/ROLLBACK_QUICKREF.md` | **NEW** — Fast reference card for operators | diff --git a/infra/.env.monitoring.example b/infra/.env.monitoring.example deleted file mode 100644 index 711716d..0000000 --- a/infra/.env.monitoring.example +++ /dev/null @@ -1,61 +0,0 @@ -# ============================================================================= -# FieldTrack 2.0 — Monitoring Stack Environment -# -# Copy to infra/.env.monitoring on the VPS and fill in values. -# Do NOT commit this file with real secrets — keep it on the VPS only. 
-# -# Usage: -# docker compose --env-file infra/.env.monitoring \ -# -f infra/docker-compose.monitoring.yml up -d -# -# Validate before deploy: -# bash scripts/validate-env.sh --check-monitoring -# ============================================================================= - -# ── ENV CONTRACT ────────────────────────────────────────────────────────────── -# -# APP layer → API_BASE_URL lives in .env (full URL) -# INFRA layer → API_HOSTNAME lives here (hostname only) -# -# API_HOSTNAME MUST match the hostname portion of API_BASE_URL: -# .env: API_BASE_URL=https://api.example.com -# this file: API_HOSTNAME=api.example.com -# -# Verify consistency before every deploy: -# bash scripts/validate-env.sh --check-monitoring -# -# API_DOMAIN IS REMOVED — do not add it here or anywhere else. -# ============================================================================= - -# ── Infra layer (hostname only — no scheme, no trailing slash) ──────────────── -# -# Derived from API_BASE_URL in .env. -# Prometheus uses this for the readiness probe target. -# Grafana uses this for GF_SERVER_ROOT_URL. -# -# Example: API_BASE_URL=https://api.getfieldtrack.app -# → API_HOSTNAME=api.getfieldtrack.app -API_HOSTNAME=api.getfieldtrack.app - -# ── Grafana ─────────────────────────────────────────────────────────────────── -# Strong password for the Grafana admin account (min 12 chars). -GRAFANA_ADMIN_PASSWORD=change-me-use-a-strong-password - -# ── Prometheus scrape authentication ────────────────────────────────────────── -# Bearer token for the /metrics endpoint. -# MUST be identical to METRICS_SCRAPE_TOKEN in .env. -# Mismatch → Prometheus receives 401s → all metric alerts go blind. -# -# Generate: openssl rand -hex 32 -METRICS_SCRAPE_TOKEN=change-me-generate-with-openssl-rand-hex-32 - -# ── Alertmanager Slack notification target ──────────────────────────────────── -# Used by infra/scripts/render-alertmanager.sh to render the Alertmanager config -# template before container start. 
Alertmanager does NOT support env vars natively. -# -# Generate from: Slack → Your App → Incoming Webhooks → Add New Webhook -# Must start with: https://hooks.slack.com/ -# -# IMPORTANT: Do NOT add FRONTEND_DOMAIN here — it has been removed from the -# env contract. The render script will exit 1 if it detects that variable. -ALERTMANAGER_SLACK_WEBHOOK=YOUR_SLACK_INCOMING_WEBHOOK_URL diff --git a/infra/alertmanager/alertmanager.yml b/infra/alertmanager/alertmanager.yml deleted file mode 100644 index db267a1..0000000 --- a/infra/alertmanager/alertmanager.yml +++ /dev/null @@ -1,65 +0,0 @@ -# Alertmanager route and receiver configuration for Slack-only alerting. -# -# NOTE: -# This file is a TEMPLATE and MUST be rendered via envsubst before use. -# Alertmanager does NOT support environment variables natively. -# Render this file by running: -# bash infra/scripts/render-alertmanager.sh -# The rendered output is written to: infra/alertmanager/alertmanager.rendered.yml -# docker-compose mounts ONLY the rendered file — never this template directly. -# -# No email, SMTP, or PagerDuty configurations are present. 
- -route: - receiver: ops-slack-warning - group_by: ["alertname", "severity"] - group_wait: 30s - group_interval: 5m - repeat_interval: 4h - routes: - - match: - severity: critical - receiver: ops-slack-critical - - - match: - severity: warning - receiver: ops-slack-warning - -receivers: - # Critical alerts: dedicated Slack channel for immediate response - - name: ops-slack-critical - slack_configs: - - api_url: "${ALERTMANAGER_SLACK_WEBHOOK}" - channel: "#critical-alerts" - send_resolved: true - title: "[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}" - text: | - *Severity:* {{ .CommonLabels.severity }} - *Instance:* {{ .CommonLabels.instance }} - *Summary:* {{ .CommonAnnotations.summary }} - *Description:* {{ .CommonAnnotations.description }} - - # Warning alerts: standard alerts channel - - name: ops-slack-warning - slack_configs: - - api_url: "${ALERTMANAGER_SLACK_WEBHOOK}" - channel: "#alerts" - send_resolved: true - title: "[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}" - text: | - *Severity:* {{ .CommonLabels.severity }} - *Instance:* {{ .CommonLabels.instance }} - *Summary:* {{ .CommonAnnotations.summary }} - *Description:* {{ .CommonAnnotations.description }} - -# Silence rules: suppress expected noise during planned maintenance. -# Add entries here before a deployment rather than disabling alerting entirely. -inhibit_rules: - # If the backend container is down (DeploymentFailure), suppress the - # dependent high-latency and error-rate alerts — they are all downstream - # effects of the same root cause and would produce redundant notifications. 
- - source_matchers: - - alertname="DeploymentFailure" - target_matchers: - - alertname=~"FieldTrackHighErrorRate|FieldTrackHighLatency|FieldTrackAvgLatencyHigh|ReadinessCheckFailing" - equal: ["job"] diff --git a/infra/blackbox/blackbox.yml b/infra/blackbox/blackbox.yml deleted file mode 100644 index 6e114d0..0000000 --- a/infra/blackbox/blackbox.yml +++ /dev/null @@ -1,21 +0,0 @@ -# FieldTrack 2.0 — Blackbox Exporter Configuration -# -# Modules used by Prometheus scrape jobs (see prometheus.yml). -# fieldtrack-readiness: probes HTTPS /ready, also exposes TLS cert expiry metrics. - -modules: - - # HTTP probe used for readiness check and TLS certificate monitoring. - # probe_success == 1 when /ready returns HTTP 200. - # probe_ssl_earliest_cert_expiry exposes the TLS certificate expiry timestamp. - http_2xx: - prober: http - timeout: 10s - http: - valid_http_versions: ["HTTP/1.1", "HTTP/2.0"] - valid_status_codes: [200] - method: GET - tls_config: - insecure_skip_verify: false - preferred_ip_protocol: "ip4" - ip_protocol_fallback: false diff --git a/infra/docker-compose.monitoring.yml b/infra/docker-compose.monitoring.yml deleted file mode 100644 index 4d973e9..0000000 --- a/infra/docker-compose.monitoring.yml +++ /dev/null @@ -1,264 +0,0 @@ -services: - - loki: - image: grafana/loki:2.9.6 - container_name: loki - restart: unless-stopped - expose: - - "3100" - volumes: - - loki_data:/loki - - ./loki/loki-config.yaml:/etc/loki/local-config.yaml:ro - command: -config.file=/etc/loki/local-config.yaml - networks: - - api_network - deploy: - resources: - limits: - memory: 1g - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 15s - - promtail: - image: grafana/promtail:2.9.6 - container_name: promtail - restart: unless-stopped - volumes: - - /var/log:/var/log:ro - - 
/var/lib/docker/containers:/var/lib/docker/containers:ro - - ./promtail/promtail.yml:/etc/promtail/promtail.yml:ro - - promtail_data:/data - command: -config.file=/etc/promtail/promtail.yml - networks: - - api_network - depends_on: - loki: - condition: service_healthy - deploy: - resources: - limits: - memory: 128m - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - alertmanager: - image: prom/alertmanager:v0.27.0 - container_name: alertmanager - restart: unless-stopped - expose: - - "9093" - - volumes: - - ./alertmanager/alertmanager.rendered.yml:/etc/alertmanager/alertmanager.yml:ro - - alertmanager_data:/alertmanager - - command: - - "--config.file=/etc/alertmanager/alertmanager.yml" - - "--storage.path=/alertmanager" - - "--web.listen-address=:9093" - - networks: - - api_network - - deploy: - resources: - limits: - memory: 128m - - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:9093/-/healthy"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 10s - - prometheus: - image: prom/prometheus:v2.52.0 - container_name: prometheus - restart: unless-stopped - expose: - - "9090" - - environment: - - METRICS_SCRAPE_TOKEN=${METRICS_SCRAPE_TOKEN} - - API_HOSTNAME=${API_HOSTNAME} - - volumes: - - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro - - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro - - prometheus_data:/prometheus - - command: - - "--config.file=/etc/prometheus/prometheus.yml" - - "--storage.tsdb.retention.time=30d" - - "--storage.tsdb.retention.size=5GB" - - "--web.enable-lifecycle" - - networks: - - api_network - - depends_on: - alertmanager: - condition: service_healthy - - deploy: - resources: - limits: - memory: 512m - - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"] - interval: 
30s - timeout: 5s - retries: 3 - start_period: 10s - - grafana: - image: grafana/grafana:10.4.2 - container_name: grafana - restart: unless-stopped - expose: - - "3000" - - environment: - - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD} - - GF_SERVER_ROOT_URL=https://${API_HOSTNAME}/grafana - - volumes: - - grafana_data:/var/lib/grafana - - networks: - - api_network - - depends_on: - prometheus: - condition: service_healthy - - deploy: - resources: - limits: - memory: 256m - - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 30s - - node-exporter: - image: prom/node-exporter:v1.8.1 - container_name: node-exporter - restart: unless-stopped - expose: - - "9100" - - command: - - "--path.rootfs=/host" - - volumes: - - /:/host:ro,rslave - - networks: - - api_network - - deploy: - resources: - limits: - memory: 64m - - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - nginx: - image: nginx:1.25-alpine - container_name: nginx - restart: unless-stopped - ports: - - "80:80" - - "443:443" - - volumes: - # Rendered nginx config — written by deploy script on each deploy - - ./nginx/live:/etc/nginx/conf.d:ro - # SSL certificates (managed by certbot on the host) - - /etc/ssl/api:/etc/ssl/api:ro - # ACME challenge webroot for certbot renewal - - /var/www/certbot:/var/www/certbot:ro - # Nginx access logs shared with promtail - - /var/log/nginx:/var/log/nginx - - networks: - - api_network - - # nginx can start as soon as the grafana *container* exists. - # Waiting for service_healthy would create a blocking chain: - # nginx → grafana → prometheus → alertmanager - # which delays the ingress layer on fresh deployments by minutes. - # nginx uses deferred Docker DNS ($api_backend variable + resolver 127.0.0.11) - # so it starts cleanly before any backend container is ready. 
- depends_on: - grafana: - condition: service_started - - deploy: - resources: - limits: - memory: 64m - - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" - - healthcheck: - test: ["CMD", "wget", "--no-check-certificate", "--spider", "-q", "https://localhost/health"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 10s - -networks: - api_network: - external: true - -volumes: - prometheus_data: - alertmanager_data: - grafana_data: - loki_data: - promtail_data: \ No newline at end of file diff --git a/infra/docker-compose.nginx.yml b/infra/docker-compose.nginx.yml new file mode 100644 index 0000000..82e4cba --- /dev/null +++ b/infra/docker-compose.nginx.yml @@ -0,0 +1,44 @@ +services: + + nginx: + image: nginx:1.25-alpine + container_name: nginx + restart: unless-stopped + ports: + - "80:80" + - "443:443" + + volumes: + # Rendered nginx config — written by deploy script on each deploy + - ./nginx/live:/etc/nginx/conf.d:ro + # SSL certificates (managed by certbot on the host) + - /etc/ssl/api:/etc/ssl/api:ro + # ACME challenge webroot for certbot renewal + - /var/www/certbot:/var/www/certbot:ro + # Nginx access logs shared with promtail + - /var/log/nginx:/var/log/nginx + + networks: + - api_network + + deploy: + resources: + limits: + memory: 64m + + logging: + driver: json-file + options: + max-size: "10m" + max-file: "3" + + healthcheck: + test: ["CMD", "wget", "--no-check-certificate", "--spider", "-q", "https://localhost/health"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 10s + +networks: + api_network: + external: true diff --git a/infra/docker-compose.redis.yml b/infra/docker-compose.redis.yml new file mode 100644 index 0000000..7043cc3 --- /dev/null +++ b/infra/docker-compose.redis.yml @@ -0,0 +1,38 @@ +services: + + redis: + image: redis:7-alpine + container_name: redis + restart: unless-stopped + command: redis-server --save 60 1 --loglevel warning + + volumes: + - redis_data:/data + + networks: + - 
api_network + + deploy: + resources: + limits: + memory: 256m + + logging: + driver: json-file + options: + max-size: "10m" + max-file: "3" + + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 10s + +networks: + api_network: + external: true + +volumes: + redis_data: diff --git a/infra/grafana/dashboards/fieldtrack.json b/infra/grafana/dashboards/fieldtrack.json deleted file mode 100644 index 48e11ac..0000000 --- a/infra/grafana/dashboards/fieldtrack.json +++ /dev/null @@ -1,680 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 1, - "id": null, - "links": [], - "panels": [ - { - "title": "HTTP Request Rate (req/s)", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisLabel": "req/s", - "drawStyle": "line", - "fillOpacity": 20, - "lineWidth": 2, - "pointSize": 5, - "showPoints": "auto", - "stacking": { - "mode": "none" - } - }, - "unit": "reqps" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "sum(rate(http_requests_total{job=~\"fieldtrack-api.*\"}[5m])) by (status_code)", - "legendFormat": "{{status_code}}", - "refId": "A" - } - ] - }, - { - "title": "HTTP Request Latency (p50 / p95 / p99)", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - 
"axisCenteredZero": false, - "axisLabel": "seconds", - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 2, - "pointSize": 5, - "showPoints": "auto", - "stacking": { - "mode": "none" - } - }, - "unit": "s" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{job=~\"fieldtrack-api.*\"}[5m])) by (le))", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~\"fieldtrack-api.*\"}[5m])) by (le))", - "legendFormat": "p95", - "refId": "B" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job=~\"fieldtrack-api.*\"}[5m])) by (le))", - "legendFormat": "p99", - "refId": "C" - } - ] - }, - { - "title": "In-Flight Requests", - "type": "stat", - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 8 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "red", - "value": 100 - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "http_requests_in_flight{job=~\"fieldtrack-api.*\"}", - "legendFormat": "In-Flight", - "refId": "A" - } - ] - }, - { - "title": "Total Requests (24h)", - "type": "stat", - "gridPos": { - "h": 4, - "w": 6, - "x": 6, - "y": 8 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "blue", - "value": null - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "sum(increase(http_requests_total{job=~\"fieldtrack-api.*\"}[24h]))", - "legendFormat": "Total", - "refId": "A" - } - ] - }, - { - "title": "Error Rate (5xx)", - "type": "stat", - "gridPos": { - "h": 4, - "w": 6, - "x": 12, - "y": 8 
- }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "percentunit", - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 0.01 - }, - { - "color": "red", - "value": 0.05 - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "sum(rate(http_requests_total{job=~\"fieldtrack-api.*\", status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=~\"fieldtrack-api.*\"}[5m]))", - "legendFormat": "5xx Rate", - "refId": "A" - } - ] - }, - { - "title": "Uptime", - "type": "stat", - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 8 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "s", - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "process_uptime_seconds{job=~\"fieldtrack-api.*\"}", - "legendFormat": "Uptime", - "refId": "A" - } - ] - }, - { - "title": "Request Rate by Route", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 12 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 15, - "lineWidth": 2 - }, - "unit": "reqps" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "sum(rate(http_requests_total{job=~\"fieldtrack-api.*\"}[5m])) by (route)", - "legendFormat": "{{route}}", - "refId": "A" - } - ] - }, - { - "title": "Node.js Heap Memory", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 12 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 20, - "lineWidth": 2 
- }, - "unit": "bytes" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "nodejs_heap_size_used_bytes{job=~\"fieldtrack-api.*\"}", - "legendFormat": "Heap Used", - "refId": "A" - }, - { - "expr": "nodejs_heap_size_total_bytes{job=~\"fieldtrack-api.*\"}", - "legendFormat": "Heap Total", - "refId": "B" - }, - { - "expr": "process_resident_memory_bytes{job=~\"fieldtrack-api.*\"}", - "legendFormat": "RSS", - "refId": "C" - } - ] - }, - { - "title": "CPU Usage (%)", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 20 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 20, - "lineWidth": 2 - }, - "unit": "percentunit", - "max": 1 - }, - "overrides": [] - }, - "targets": [ - { - "expr": "1 - avg(rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\"}[5m]))", - "legendFormat": "CPU Usage", - "refId": "A" - } - ] - }, - { - "title": "System Memory Usage", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 20 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 20, - "lineWidth": 2 - }, - "unit": "bytes" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "node_memory_MemTotal_bytes{job=\"node-exporter\"} - node_memory_MemAvailable_bytes{job=\"node-exporter\"}", - "legendFormat": "Used", - "refId": "A" - }, - { - "expr": "node_memory_MemTotal_bytes{job=\"node-exporter\"}", - "legendFormat": "Total", - "refId": "B" - } - ] - }, - { - "title": "Disk Usage", - "type": "gauge", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "percentunit", - "max": 1, - 
"thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 0.7 - }, - { - "color": "red", - "value": 0.9 - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "1 - (node_filesystem_avail_bytes{job=\"node-exporter\", mountpoint=\"/\", fstype!=\"tmpfs\"} / node_filesystem_size_bytes{job=\"node-exporter\", mountpoint=\"/\", fstype!=\"tmpfs\"})", - "legendFormat": "Disk Used", - "refId": "A" - } - ] - }, - { - "title": "Network I/O", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 15, - "lineWidth": 2 - }, - "unit": "Bps" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "sum(rate(node_network_receive_bytes_total{job=\"node-exporter\", device!~\"lo|veth.*|docker.*|br-.*\"}[5m]))", - "legendFormat": "Receive", - "refId": "A" - }, - { - "expr": "sum(rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!~\"lo|veth.*|docker.*|br-.*\"}[5m]))", - "legendFormat": "Transmit", - "refId": "B" - } - ] - }, - { - "title": "API Error Budget Remaining (30d)", - "type": "stat", - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 36 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "percentunit", - "thresholds": { - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "yellow", - "value": 0.5 - }, - { - "color": "green", - "value": 0.9 - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "fieldtrack:api_error_budget_remaining_30d", - "legendFormat": "Remaining", - "refId": "A" - } - ] - }, - { - "title": "API Error Burn Rate (1h / 6h)", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 36 - }, - "datasource": { - 
"type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 15, - "lineWidth": 2 - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "targets": [ - { - "expr": "fieldtrack:api_error_rate_1h", - "legendFormat": "1h", - "refId": "A" - }, - { - "expr": "fieldtrack:api_error_rate_6h", - "legendFormat": "6h", - "refId": "B" - } - ] - }, - { - "title": "Webhook Permanent Failure Rate (5m)", - "type": "stat", - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 36 - }, - "datasource": { - "type": "prometheus", - "uid": "" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "unit": "percentunit", - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 0.1 - }, - { - "color": "red", - "value": 0.3 - } - ] - } - }, - "overrides": [] - }, - "targets": [ - { - "expr": "fieldtrack:webhook_failure_rate_5m", - "legendFormat": "Failure Rate", - "refId": "A" - } - ] - } - ], - "schemaVersion": 39, - "tags": [ - "fieldtrack", - "backend", - "monitoring" - ], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": {}, - "timezone": "browser", - "title": "FieldTrack 2.0 — Backend & System", - "uid": "fieldtrack-api", - "version": 2 -} \ No newline at end of file diff --git a/infra/grafana/provisioning/dashboards/dashboard.yml b/infra/grafana/provisioning/dashboards/dashboard.yml deleted file mode 100644 index ddd035f..0000000 --- a/infra/grafana/provisioning/dashboards/dashboard.yml +++ /dev/null @@ -1,15 +0,0 @@ -# FieldTrack 2.0 — Grafana Dashboard Provisioning -apiVersion: 1 - -providers: - - name: "FieldTrack Dashboards" - orgId: 1 - folder: "FieldTrack" - type: file - disableDeletion: false - editable: true - updateIntervalSeconds: 30 - allowUiUpdates: true - options: - path: /var/lib/grafana/dashboards - 
foldersFromFilesStructure: false diff --git a/infra/grafana/provisioning/datasources/prometheus.yml b/infra/grafana/provisioning/datasources/prometheus.yml deleted file mode 100644 index ce07def..0000000 --- a/infra/grafana/provisioning/datasources/prometheus.yml +++ /dev/null @@ -1,13 +0,0 @@ -# FieldTrack 2.0 — Grafana Datasource Provisioning -apiVersion: 1 - -datasources: - - name: Prometheus - type: prometheus - access: proxy - url: http://prometheus:9090 - isDefault: true - editable: false - jsonData: - timeInterval: "15s" - httpMethod: POST diff --git a/infra/loki/loki-config.yaml b/infra/loki/loki-config.yaml deleted file mode 100644 index e283b99..0000000 --- a/infra/loki/loki-config.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# FieldTrack 2.0 — Loki Configuration -# -# Single-binary mode (grafana/loki:2.9.6) with filesystem storage. -# Retention is enforced by the compactor (retention_enabled: true). - -auth_enabled: false - -server: - http_listen_port: 3100 - grpc_listen_port: 9095 - log_level: warn - -common: - path_prefix: /loki - storage: - filesystem: - chunks_directory: /loki/chunks - rules_directory: /loki/rules - replication_factor: 1 - ring: - kvstore: - store: inmemory - -schema_config: - configs: - - from: 2024-01-01 - store: tsdb - object_store: filesystem - schema: v13 - index: - prefix: index_ - period: 24h - -limits_config: - # Retain logs for 30 days; compactor enforces deletion - retention_period: 30d - -compactor: - working_directory: /loki/compactor - compaction_interval: 10m - retention_enabled: true - retention_delete_delay: 2h - retention_delete_worker_count: 150 - delete_request_store: filesystem - -analytics: - reporting_enabled: false diff --git a/infra/nginx/api.conf b/infra/nginx/api.conf deleted file mode 100644 index 1128f26..0000000 --- a/infra/nginx/api.conf +++ /dev/null @@ -1,274 +0,0 @@ -# ============================================================================ -# FieldTrack API — Production Nginx Reverse Proxy -# 
============================================================================ - -map $http_upgrade $connection_upgrade { - default upgrade; - '' close; -} - -# NOTE: No upstream block for api_backend. -# upstream blocks resolve server hostnames at config-load time, which fails -# for Docker service names (api-blue / api-green) that may not exist yet. -# Instead, use a variable + proxy_pass to defer resolution to request time via -# the resolver 127.0.0.11 directive defined in the server block below. - -limit_req_zone $binary_remote_addr zone=api_rate:10m rate=60r/s; -limit_req_zone $binary_remote_addr zone=api_health:10m rate=5r/s; - -# Cloudflare IPs -set_real_ip_from 103.21.244.0/22; -set_real_ip_from 103.22.200.0/22; -set_real_ip_from 103.31.4.0/22; -set_real_ip_from 104.16.0.0/13; -set_real_ip_from 104.24.0.0/14; -set_real_ip_from 108.162.192.0/18; -set_real_ip_from 131.0.72.0/22; -set_real_ip_from 141.101.64.0/18; -set_real_ip_from 162.158.0.0/15; -set_real_ip_from 172.64.0.0/13; -set_real_ip_from 173.245.48.0/20; -set_real_ip_from 188.114.96.0/20; -set_real_ip_from 190.93.240.0/20; -set_real_ip_from 197.234.240.0/22; -set_real_ip_from 198.41.128.0/17; - -real_ip_header CF-Connecting-IP; -real_ip_recursive on; - -# --------------------------------------------------------------------------- -# Trusted-source detection via $realip_remote_addr -# -# $realip_remote_addr = the original TCP-connecting IP before the real_ip -# module rewrites $remote_addr to the end-user IP (from CF-Connecting-IP). -# For Cloudflare-proxied requests: $realip_remote_addr = CF edge IP. -# For VPS-local requests: $realip_remote_addr = 127.0.0.1. -# -# Used ONLY for the /monitor/ (Grafana) proxy — an internal dashboard that -# should not be directly reachable from arbitrary IPs. -# -# API routes (/, /admin/events, etc.) 
are intentionally NOT restricted here: -# - Application layer enforces all auth (JWT + RBAC) -# - CI, debugging, and direct-origin access must work without going through Cloudflare -# - Cloudflare still proxies all production user traffic (no change in UX) -# --------------------------------------------------------------------------- -geo $realip_remote_addr $is_trusted_source { - default 0; - 127.0.0.1/32 1; - ::1/128 1; - # Cloudflare IPv4 (https://www.cloudflare.com/ips-v4/) - 103.21.244.0/22 1; - 103.22.200.0/22 1; - 103.31.4.0/22 1; - 104.16.0.0/13 1; - 104.24.0.0/14 1; - 108.162.192.0/18 1; - 131.0.72.0/22 1; - 141.101.64.0/18 1; - 162.158.0.0/15 1; - 172.64.0.0/13 1; - 173.245.48.0/20 1; - 188.114.96.0/20 1; - 190.93.240.0/20 1; - 197.234.240.0/22 1; - 198.41.128.0/17 1; -} - -# HTTP → HTTPS (with ACME challenge passthrough for certbot renewal) -server { - listen 80; - listen [::]:80; - server_name __API_HOSTNAME__; - - # Let certbot serve ACME challenges for certificate renewal - location /.well-known/acme-challenge/ { - root /var/www/certbot; - } - - # Nginx-level liveness probe — answered by nginx directly, no upstream needed. - # Used by: in-network post-switch routing checks, CI health gates, and - # monitoring probes. Returns 200 even when the API container is down so that - # nginx infrastructure health never depends on backend readiness. - # This endpoint intentionally does NOT proxy to the API backend. 
- location = /health { - access_log off; - add_header Content-Type 'application/json; charset=utf-8' always; - return 200 '{"status":"ok"}'; - } - - location / { - return 301 https://$host$request_uri; - } -} - -# HTTPS SERVER -server { - - listen 443 ssl; - listen [::]:443 ssl; - - server_name __API_HOSTNAME__; - - ssl_certificate /etc/ssl/api/origin.crt; - ssl_certificate_key /etc/ssl/api/origin.key; - - ssl_protocols TLSv1.2 TLSv1.3; - ssl_prefer_server_ciphers on; - - server_tokens off; - - # ───────────────────────────────────────────────────────────────────────────── - # Docker DNS Resolution (CRITICAL for service name upstreams) - # - # Enables runtime DNS resolution for Docker service names (e.g., grafana:3000). - # Without this, Nginx fails at config-load with: "host not found in upstream". - # Docker's embedded resolver is at 127.0.0.11:53. - # valid=5s caches DNS lookups for 5 seconds — short enough that after a - # blue-green switch nginx re-resolves the new container within one health - # check cycle. ipv6=off stops AAAA queries that Docker bridge networks do - # not answer, which can add latency or cause spurious resolution failures. - # ───────────────────────────────────────────────────────────────────────────── - resolver 127.0.0.11 valid=5s ipv6=off; - resolver_timeout 5s; - - # Variable-based backend URL — resolved at request time via Docker DNS (127.0.0.11). - # __ACTIVE_CONTAINER__ is substituted with api-blue or api-green by deploy script. - set $api_backend "http://__ACTIVE_CONTAINER__:3000"; - - # safer host validation (still simple) - if ($host !~* ^(__API_HOSTNAME__|localhost|127\.0\.0\.1)$) { - return 444; - } - - # No server-level IP restrictions on API routes. - # All application endpoints are secured by JWT + RBAC in Fastify. - # IP-level access control is limited to /metrics, /internal (hard 403) - # and /monitor/ (Grafana dashboard, Cloudflare + localhost only). 
- - # Headers - add_header X-Frame-Options "SAMEORIGIN" always; - add_header X-Content-Type-Options "nosniff" always; - add_header X-XSS-Protection "1; mode=block" always; - add_header Referrer-Policy "strict-origin-when-cross-origin" always; - add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; - - add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; img-src 'self' data: https:; font-src 'self' data:; connect-src 'self' https://__API_HOSTNAME__; frame-ancestors 'self';" always; - - access_log /var/log/nginx/api_access.log; - error_log /var/log/nginx/api_error.log; - - client_max_body_size 10M; - client_body_timeout 30s; - send_timeout 30s; - - # Upstream timeout defaults for non-streaming traffic. - # SSE keeps a longer read timeout in its location block. - proxy_connect_timeout 5s; - proxy_send_timeout 60s; - proxy_read_timeout 60s; - - gzip on; - gzip_comp_level 5; - gzip_min_length 256; - gzip_proxied any; - gzip_vary on; - gzip_types application/json application/javascript text/css text/plain text/xml application/xml; - - # Block sensitive endpoints - location /metrics { - allow 127.0.0.1; - deny all; - } - location /internal { return 403; } - location ~ ^/internal/ { return 403; } - location /prometheus { return 403; } - - # Health — publicly accessible liveness probe (no dependencies). - # Reachable from CI runners, monitoring probes (Blackbox, uptime services), - # load balancers, and deploy scripts. Rate-limited to prevent abuse. - # Explicit upstream URIs (proxy_pass .../health) guard against URI-rewrite - # regressions — nginx won't silently change the upstream path. 
- location = /health { - limit_req zone=api_health burst=10 nodelay; - proxy_pass $api_backend$request_uri; - proxy_buffering off; - proxy_set_header Host __API_HOSTNAME__; - proxy_set_header X-Forwarded-Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_connect_timeout 5s; - proxy_read_timeout 30s; - } - - location = /ready { - # INTERNAL ONLY — requires dependency checks (Redis, Supabase, BullMQ) - # Expensive operations; only reachable from within VPS or localhost. - allow 127.0.0.1; - allow ::1; - deny all; - limit_req zone=api_health burst=10 nodelay; - proxy_pass $api_backend$request_uri; - proxy_buffering off; - proxy_set_header Host __API_HOSTNAME__; - proxy_set_header X-Forwarded-Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_connect_timeout 5s; - proxy_read_timeout 30s; - } - - # SSE — open to all origins; application enforces JWT auth - location = /admin/events { - limit_req zone=api_rate burst=10 nodelay; - proxy_pass $api_backend$request_uri; - proxy_http_version 1.1; - proxy_set_header Connection ''; - proxy_set_header Host __API_HOSTNAME__; - proxy_set_header X-Forwarded-Host $host; - proxy_set_header Authorization $http_authorization; - proxy_buffering off; - proxy_cache off; - add_header X-Accel-Buffering no; - proxy_read_timeout 3600s; - } - - # MAIN API — open to all origins; application enforces JWT + RBAC - location / { - limit_req zone=api_rate burst=50 nodelay; - proxy_pass $api_backend$request_uri; - proxy_http_version 1.1; - proxy_set_header Host __API_HOSTNAME__; - proxy_set_header X-Forwarded-Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_set_header X-Request-ID $request_id; 
- proxy_set_header Authorization $http_authorization; - proxy_set_header Accept-Encoding ""; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - proxy_connect_timeout 10s; - proxy_send_timeout 30s; - proxy_read_timeout 30s; - proxy_buffering on; - } - - # Grafana (Cloudflare + localhost only, via Docker service DNS) - location /monitor/ { - if ($is_trusted_source = 0) { return 403; } - set $grafana_upstream "http://grafana:3000"; - proxy_pass $grafana_upstream; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - proxy_set_header Host __API_HOSTNAME__; - proxy_set_header X-Forwarded-Host $host; - proxy_buffering off; - } - - location = /monitor { - return 301 $scheme://$host/monitor/; - } -} \ No newline at end of file diff --git a/infra/prometheus/alerts.yml b/infra/prometheus/alerts.yml deleted file mode 100644 index 0d23a9d..0000000 --- a/infra/prometheus/alerts.yml +++ /dev/null @@ -1,559 +0,0 @@ -groups: - -# --------------------------------------------------------- -# RECORDING RULES -# --------------------------------------------------------- - -- name: fieldtrack_recording_rules - rules: - - record: fieldtrack:api_requests_rate_5m - expr: sum(rate(http_requests_total{job=~"fieldtrack-api.*"}[5m])) - - - record: fieldtrack:api_errors_5xx_rate_5m - expr: sum(rate(http_requests_total{job=~"fieldtrack-api.*",status_code=~"5.."}[5m])) - - - record: fieldtrack:api_error_rate_5m - expr: fieldtrack:api_errors_5xx_rate_5m / clamp_min(fieldtrack:api_requests_rate_5m, 1e-9) - - - record: fieldtrack:api_error_rate_1h - expr: | - sum(rate(http_requests_total{job=~"fieldtrack-api.*",status_code=~"5.."}[1h])) - / - clamp_min(sum(rate(http_requests_total{job=~"fieldtrack-api.*"}[1h])), 1e-9) - - - record: fieldtrack:api_error_rate_6h - expr: | - sum(rate(http_requests_total{job=~"fieldtrack-api.*",status_code=~"5.."}[6h])) - / - 
clamp_min(sum(rate(http_requests_total{job=~"fieldtrack-api.*"}[6h])), 1e-9) - - - record: fieldtrack:webhook_failure_rate_5m - expr: | - sum(rate(webhook_failures_total[5m])) - / - clamp_min(sum(rate(webhook_deliveries_total[5m])), 1e-9) - - - record: fieldtrack:api_error_budget_remaining_30d - expr: | - 1 - ( - sum(increase(http_requests_total{job=~"fieldtrack-api.*",status_code=~"5.."}[30d])) - / - clamp_min(sum(increase(http_requests_total{job=~"fieldtrack-api.*"}[30d])), 1) - ) - -# --------------------------------------------------------- -# API HEALTH -# --------------------------------------------------------- - -- name: fieldtrack_api_alerts - rules: - - - alert: FieldTrackHighErrorRate - expr: fieldtrack:api_error_rate_5m > 0.05 - for: 2m - labels: - severity: critical - annotations: - summary: "High API error rate" - description: "5xx errors exceed 5%" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" - runbook: | - Cause: Application throwing unhandled errors or DB/dependency failures. - Actions: - 1. Check container logs: docker logs fieldtrack-api --tail 200 - 2. Check /system-health endpoint from VPS - 3. Review recent deployments: git log --oneline -10 - 4. If DB: check Supabase dashboard for connection pool saturation - 5. If memory: check HostMemoryPressure alert and restart container - 6. Rollback if needed: see docs/ROLLBACK_QUICKREF.md - - - alert: FieldTrackHighLatency - expr: | - histogram_quantile( - 0.95, - sum(rate(http_request_duration_seconds_bucket{job=~"fieldtrack-api.*"}[5m])) by (le) - ) > 1 - for: 5m - labels: - severity: warning - annotations: - summary: "High API latency" - description: "p95 latency above 1 second" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" - runbook: | - Cause: Slow DB queries, queue contention, or upstream dependency latency. - Actions: - 1. Open Grafana latency panel (p95/p99) and identify spike start time - 2. 
Check slow-response logs in Loki (`slow_response` and `very_slow_response`) - 3. Check DB load and connection saturation in Supabase dashboard - 4. Inspect queue backlogs via GET /admin/system-health - 5. Roll back recent deployment if latency regression started post-release - - - alert: FieldTrackAvgLatencyHigh - expr: | - sum(rate(http_request_duration_seconds_sum{job=~"fieldtrack-api.*"}[5m])) - / - sum(rate(http_request_duration_seconds_count{job=~"fieldtrack-api.*"}[5m])) - > 0.5 - for: 5m - labels: - severity: warning - annotations: - summary: "FieldTrack API latency exceeded threshold" - description: "Average response time exceeded 500 ms for 5 minutes" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" - runbook: | - Cause: Sustained performance degradation across many routes. - Actions: - 1. Compare avg latency with p95/p99 to identify broad vs tail issue - 2. Review top routes by request rate and latency in Grafana - 3. Inspect backend logs for DB timeout and retry patterns - 4. Validate Redis and Supabase health via /ready and /system-health - 5. Trigger rollback if regression is tied to latest deploy - -# --------------------------------------------------------- -# WORKER ALERTS -# --------------------------------------------------------- - -- name: fieldtrack_worker_alerts - rules: - - - alert: DistanceWorkerJobFailuresHigh - expr: increase(distance_jobs_total{status="failed"}[5m]) > 3 - for: 1m - labels: - severity: critical - annotations: - summary: "Distance worker jobs failing at high rate" - description: "More than 3 distance recalculation jobs permanently failed in the last 5 minutes. Check Redis connectivity and the distance-engine queue." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" - runbook: | - Cause: Redis connectivity failure, Supabase query errors, or malformed GPS data. - Actions: - 1. Check Redis: redis-cli -u $REDIS_URL ping - 2. 
Check worker logs: docker logs fieldtrack-api | grep "Distance worker" - 3. Inspect failed queue: GET /admin/system-health (worker section) - 4. Replay stuck sessions via queue_retry_intents if needed - 5. Check for GPS point anomalies (MAX_POINTS_PER_SESSION exceeded) - - - alert: AnalyticsQueueBacklogGrowing - expr: analytics_queue_depth > 500 - for: 5m - labels: - severity: warning - annotations: - summary: "Analytics queue backlog high" - description: "Analytics queue depth exceeded 500 for 5 minutes" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" - runbook: | - Cause: Worker throughput below enqueue rate or downstream DB contention. - Actions: - 1. Check analytics worker logs for repeated errors/timeouts - 2. Inspect queue depth in GET /admin/system-health - 3. Validate Redis latency and connection health - 4. Check Supabase CPU/connection pressure - 5. Temporarily scale worker concurrency if safe - - # Phase 22: Fire if more than 5 analytics jobs permanently fail within 5 minutes. - # This indicates a systemic problem (bad DB schema change, Supabase outage, etc.) - # that retries alone cannot recover from — requires operator intervention. - - alert: AnalyticsJobFailuresHigh - expr: increase(analytics_job_failures_total[5m]) > 5 - for: 1m - labels: - severity: critical - annotations: - summary: "Analytics jobs failing at high rate" - description: "More than 5 analytics jobs permanently failed (exhausted all retries) in the last 5 minutes. Check the analytics-failed dead letter queue and worker logs." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" - runbook: | - Cause: Supabase schema change, DB connection exhaustion, or analytics aggregation bug. - Actions: - 1. Check worker logs: docker logs fieldtrack-api | grep "analytics" - 2. Inspect dead letter queue via GET /admin/system-health - 3. 
Verify DB schema: check employee_daily_metrics and org_daily_metrics tables - 4. If transient: failed jobs auto-expire after 72 h; monitor retry_intents_dead metric - 5. If persistent: hotfix deployment required — see docs/ROLLBACK_QUICKREF.md - -# --------------------------------------------------------- -# HOST ALERTS -# --------------------------------------------------------- - -- name: fieldtrack_host_alerts - rules: - - - alert: HostHighCPU - expr: 100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 85 - for: 5m - labels: - severity: warning - annotations: - summary: "High CPU usage" - description: "Host CPU usage above 85% for 5 minutes" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" - runbook: | - Cause: Traffic surge, runaway process, or expensive query loops. - Actions: - 1. Check top CPU consumers on host (`top`/`htop`) - 2. Correlate with request rate and queue depth in Grafana - 3. Inspect container logs for retry storms or hot loops - 4. Scale out backend replicas or reduce noisy traffic source - 5. Roll back if a recent deploy caused the spike - - - alert: HostMemoryPressure - expr: | - (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) - / - node_memory_MemTotal_bytes - > 0.85 - for: 5m - labels: - severity: warning - annotations: - summary: "High memory usage" - description: "Host memory usage above 85% for 5 minutes" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" - runbook: | - Cause: Memory leak, oversized cache, or traffic burst. - Actions: - 1. Inspect container RSS and heap charts in Grafana - 2. Check process logs for OOM warnings and GC pressure - 3. Restart affected container if memory does not recover - 4. If recurring post-deploy, roll back and open incident - 5. 
Confirm host swap/disk not under pressure simultaneously - - - alert: DiskAlmostFull - expr: | - (node_filesystem_size_bytes{mountpoint="/"} - - node_filesystem_free_bytes{mountpoint="/"}) - / - node_filesystem_size_bytes{mountpoint="/"} - > 0.85 - for: 5m - labels: - severity: critical - annotations: - summary: "Disk usage above 85%" - description: "Root filesystem usage exceeded 85% for 5 minutes" - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/DEPLOYMENT.md" - runbook: | - Cause: Log growth, artifact buildup, or runaway temp files. - Actions: - 1. Identify large directories (`du -sh /*` on host) - 2. Rotate/prune Docker images and logs - 3. Verify Loki/Promtail retention settings - 4. Free space before deployment operations - 5. Increase disk capacity if growth trend persists - -# --------------------------------------------------------- -# DEPLOYMENT & INFRASTRUCTURE ALERTS -# --------------------------------------------------------- - -- name: fieldtrack_infrastructure_alerts - rules: - - - alert: RedisDown - expr: up{job="redis"} == 0 - for: 2m - labels: - severity: critical - annotations: - summary: "Redis is unreachable" - description: "Redis has been down for more than 2 minutes. BullMQ workers, rate limiting, and the auth context cache will all degrade until Redis recovers." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ARCHITECTURE.md" - runbook: | - Cause: Redis container crash, OOM kill, or network partition. - Actions: - 1. Check container: docker ps | grep redis; docker logs redis --tail 50 - 2. Restart if crashed: docker restart redis (or docker compose up -d redis) - 3. Verify BullMQ reconnects: check worker logs after Redis recovery - 4. Rate limiting degrades gracefully (requests allowed through) during outage - 5. 
Circuit-breaker state is DB-backed and survives Redis restart - - - alert: DeploymentFailure - expr: up{job=~"fieldtrack-api.*"} == 0 - for: 2m - labels: - severity: critical - annotations: - summary: "Backend container is down" - description: "{{ $labels.job }} has been down for more than 2 minutes. Check deployment logs and container status." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" - runbook: | - Cause: Container OOM, crash loop, failed deployment, or host issue. - Actions: - 1. Check status: docker ps -a | grep fieldtrack - 2. Inspect last 100 lines: docker logs fieldtrack-api --tail 100 - 3. Check exit code: docker inspect fieldtrack-api | jq '.[0].State' - 4. Restart if safe: docker restart fieldtrack-api - 5. Rollback if bad deploy: see docs/ROLLBACK_QUICKREF.md - 6. Check host memory/disk: node_memory and node_filesystem alerts - - - alert: ReadinessCheckFailing - expr: probe_success{job="fieldtrack-readiness"} == 0 - for: 3m - labels: - severity: critical - annotations: - summary: "Readiness check failing" - description: "/ready endpoint has been failing for 3 minutes. Check DB, Redis, and Supabase connectivity." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" - runbook: | - Cause: One or more hard dependencies unhealthy (Redis/Supabase/BullMQ). - Actions: - 1. Hit /ready and /health manually from VPS - 2. Check Redis ping and Supabase connectivity - 3. Inspect container logs for startup/recovery errors - 4. Check worker state in /admin/system-health - 5. 
Roll back if issue began immediately after deployment - -# --------------------------------------------------------- -# TLS CERTIFICATE ALERTS -# --------------------------------------------------------- - -- name: fieldtrack_tls_alerts - rules: - - - alert: TLSCertExpiringSoon - expr: probe_ssl_earliest_cert_expiry{job="fieldtrack-readiness"} - time() < 14 * 24 * 3600 - for: 1h - labels: - severity: warning - annotations: - summary: "TLS certificate expiring within 14 days" - description: "Certificate for {{ $labels.instance }} expires in less than 14 days. Renew via certbot." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/DEPLOYMENT.md" - runbook: | - Cause: Certificate nearing expiry date. - Actions: - 1. Verify expiry date using blackbox panel and `openssl s_client` - 2. Renew certificate (certbot or managed provider) - 3. Reload NGINX and confirm certificate chain - 4. Recheck probe_ssl_earliest_cert_expiry metric - 5. Confirm no stale cert served via CDN edge - - - alert: TLSCertExpired - expr: probe_ssl_earliest_cert_expiry{job="fieldtrack-readiness"} - time() < 0 - for: 5m - labels: - severity: critical - annotations: - summary: "TLS certificate has expired" - description: "Certificate for {{ $labels.instance }} has expired. All HTTPS traffic is failing." - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/DEPLOYMENT.md" - runbook: | - Cause: Certificate renewal failed or cert not reloaded. - Actions: - 1. Renew certificate immediately - 2. Reload NGINX and verify HTTPS handshake - 3. Validate Cloudflare/full-chain configuration - 4. Confirm /health and /ready are reachable over HTTPS - 5. Open incident and track customer impact window - -# --------------------------------------------------------- -# WEBHOOK DELIVERY SLOs (SLO 4 + SLO 5) -# See docs/SLO.md for full SLO definitions and error-budget -# burn-rate strategy. 
-# --------------------------------------------------------- - -- name: fieldtrack_webhook_slo_alerts - rules: - - # --- SLO 4: Webhook delivery permanent failure rate > 10% for 5 m (warning) -- - - alert: WebhookDeliveryFailureRateWarning - expr: fieldtrack:webhook_failure_rate_5m > 0.10 - for: 5m - labels: - severity: warning - annotations: - summary: "Webhook permanent failure rate above 10%" - description: >- - More than 10% of webhook deliveries are permanently failing (all retries - exhausted) over the last 5 minutes. Check receiver endpoints and circuit - breaker status. DLQ jobs can be replayed via POST /admin/webhook-dlq/:id/replay. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/WEBHOOK_SIGNATURES.md" - runbook: | - Cause: Elevated webhook failures for one or more receivers. - Actions: - 1. Check webhook worker logs for dominant error patterns - 2. Inspect DLQ depth and recent failed deliveries - 3. Confirm receiver endpoints are reachable and returning 2xx - 4. Check circuit breaker status in webhooks table - 5. Replay DLQ jobs after root cause is fixed - - # --- SLO 4: Webhook delivery permanent failure rate > 30% for 2 m (critical) - - - alert: WebhookDeliveryFailureRateHigh - expr: fieldtrack:webhook_failure_rate_5m > 0.30 - for: 2m - labels: - severity: critical - annotations: - summary: "Webhook permanent failure rate critically high (>30%)" - description: >- - Over 30% of webhook deliveries are permanently failing. This is a - customer-visible outage for all orgs with active webhooks. Investigate - immediately: check DB connectivity, receiver endpoints, and circuit breaker - state. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/WEBHOOK_SIGNATURES.md" - runbook: | - Cause: Mass endpoint failures, DB outage, or a code bug in the delivery worker. - Actions: - 1. Check worker logs: docker logs fieldtrack-api | grep "webhook.worker" - 2. Inspect DLQ: GET /admin/webhook-dlq (admin token required) - 3. 
Check circuit breaker state: query webhooks table for circuit_open_until IS NOT NULL - 4. Replay DLQ entries after fixing root cause: POST /admin/webhook-dlq/:id/replay - 5. If DB issue: check Supabase dashboard, verify webhook_deliveries writes - 6. If code bug: rollback deployment — see docs/ROLLBACK_QUICKREF.md - - # --- SLO 5: DLQ depth above 100 for 30 min -------------------------------- - - alert: WebhookDlqGrowing - expr: dlq_size{queue="webhook-delivery-dlq"} > 100 - for: 30m - labels: - severity: warning - annotations: - summary: "Webhook DLQ depth above 100 for 30 minutes" - description: >- - The webhook dead-letter queue has had more than 100 unprocessed jobs for - 30 minutes. This indicates sustained delivery failures that exceed the - normal transient-failure pattern. Review DLQ via GET /admin/webhook-dlq - and replay or purge stale entries. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" - runbook: | - Cause: Persistent downstream delivery failures. - Actions: - 1. Review DLQ entries and identify repeated endpoint failures - 2. Confirm webhook receiver health and DNS/TLS validity - 3. Inspect retry/error metrics and circuit breaker audit entries - 4. Purge stale DLQ entries after archival is confirmed - 5. Replay jobs only after receivers are healthy - - # --- Circuit breaker: any webhook circuit opened (leading indicator) ------- - # - # webhook_failures_total counts permanent failures; a sudden spike often - # indicates a circuit breaker tripped. A short `for: 0m` (fires immediately) - # gives the earliest possible signal to investigate the affected endpoint. - - alert: WebhookCircuitBreakerOpened - expr: increase(webhook_failures_total[2m]) > 5 - for: 0m - labels: - severity: warning - annotations: - summary: "Webhook failure spike — possible circuit breaker activation" - description: >- - More than 5 permanent webhook failures occurred in the last 2 minutes. 
- A circuit breaker may have opened, pausing delivery to one or more - endpoints. Check circuit breaker state in webhook_deliveries and the - webhooks.circuit_open_until column. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/WEBHOOK_SIGNATURES.md" - runbook: | - Cause: Rapid repeated delivery failures triggered circuit breaker protection. - Actions: - 1. Query webhooks with circuit_open_until > now() - 2. Validate receiver status codes and timeout behavior - 3. Confirm auto-recovery scanner is running in worker logs - 4. Check whether failures are payload/size related vs network - 5. Re-enable/replay once endpoint stability is restored - - # --- Rate limit burst spike ----------------------------------------------- - - alert: RateLimitBurstSpike - expr: increase(security_rate_limit_hits_total[5m]) > 500 - for: 2m - labels: - severity: warning - annotations: - summary: "Rate limiter blocking unusually high request volume" - description: >- - More than 500 requests were rate-limited in the last 5 minutes. This - may indicate a misconfigured client, a burst from a single org, or the - start of a DoS attempt. Review the rate-limit logs to identify the - offending org / IP. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ARCHITECTURE.md" - runbook: | - Cause: Burst traffic beyond per-user/per-org sliding window limits. - Actions: - 1. Inspect rate-limit logs for top offending keys - 2. Confirm traffic is expected (batch job) vs malicious - 3. Check Redis health to ensure limiter is functioning correctly - 4. Apply temporary edge-level mitigation if attack suspected - 5. 
Tune per-org/per-user thresholds only with incident review - -# --------------------------------------------------------- -# API ERROR BUDGET BURN RATE (SLO 3 multi-window alerting) -# See docs/SLO.md §Error Budget Alert Strategy -# --------------------------------------------------------- - -- name: fieldtrack_slo_error_budget - rules: - - # Fast burn: 1 h window at 14x burn rate (>14% error rate) - # exhausts monthly error budget in ~2 days if sustained. - - alert: FieldTrackSloErrorBudgetBurnFast - expr: fieldtrack:api_error_rate_1h > 0.14 - for: 5m - labels: - severity: critical - annotations: - summary: "API error budget burning fast (14x rate)" - description: >- - The 1-hour error rate exceeds 14% (14x normal budget burn). At this - rate the monthly error budget will be exhausted in under 2 days. - Investigate 5xx errors immediately - check logs, DB connectivity, and - recent deployments. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" - runbook: | - Cause: Sustained high error rate burning error budget at 14x the normal rate. - Actions: - 1. Identify failing routes: check Grafana → FieldTrack API dashboard - 2. Check container logs for exceptions: docker logs fieldtrack-api --tail 500 - 3. Check DB connectivity: /ready endpoint from VPS - 4. If recent deploy: rollback immediately — see docs/ROLLBACK_QUICKREF.md - 5. Open an incident; notify stakeholders if budget < 50% - - # Slow burn: 6 h window at 6x burn rate (>6% error rate) - # exhausts monthly error budget in ~5 days if sustained. - - alert: FieldTrackSloErrorBudgetBurnSlow - expr: fieldtrack:api_error_rate_6h > 0.06 - for: 15m - labels: - severity: warning - annotations: - summary: "API error budget burning (6x rate over 6 h)" - description: >- - The 6-hour error rate exceeds 6% (6x normal budget burn). Open a ticket - and investigate the root cause before the error budget is exhausted. 
- runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" - runbook: | - Cause: Sustained elevated 5xx errors over a long window. - Actions: - 1. Review error budget remaining metric on Grafana dashboard - 2. Identify top failing routes and error classes - 3. Correlate with deployments and infra incidents - 4. Open reliability ticket and assign owner - 5. Plan mitigations before entering critical burn threshold - - # p99 latency SLO breach - 2 s threshold (SLO 2) - - alert: FieldTrackLatencyP99High - expr: | - histogram_quantile( - 0.99, - sum(rate(http_request_duration_seconds_bucket{job=~"fieldtrack-api.*"}[10m])) by (le) - ) > 2 - for: 10m - labels: - severity: warning - annotations: - summary: "API p99 latency above 2 s (SLO 2 breach)" - description: >- - The 99th-percentile API response time has been above 2 seconds for 10 - minutes. This breaches the p99 latency SLO defined in docs/SLO.md. - Check slow queries, worker queue depths, and DB connection pool saturation. - runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" - runbook: | - Cause: Tail-latency degradation affecting a subset of requests. - Actions: - 1. Inspect p99 panel and compare with p95 for tail amplification - 2. Review very_slow_response logs for route-level concentration - 3. Check DB wait events and queue backlog growth - 4. Reduce load or scale services if saturation detected - 5. Roll back if latency regression tracks a release \ No newline at end of file diff --git a/infra/prometheus/prometheus.yml b/infra/prometheus/prometheus.yml deleted file mode 100644 index 4ce26f2..0000000 --- a/infra/prometheus/prometheus.yml +++ /dev/null @@ -1,101 +0,0 @@ -# FieldTrack API \u2014 Prometheus Configuration -# -# All containers share api_network, so we scrape by Docker service name. -# -# Scrape targets: -# 1. api-blue (port 3000 inside container, accessible as api-blue:3000) -# 2. 
api-green (port 3000 inside container, accessible as api-green:3000) -# 3. node-exporter (port 9100, accessible as node-exporter:9100) -# 4. prometheus (self-monitoring) -# -# NOTE: Both blue and green are listed. Only the active container will be -# running at any time. Prometheus will mark the stopped one as DOWN — this -# is expected and harmless. - -global: - scrape_interval: 15s - evaluation_interval: 15s - scrape_timeout: 10s -rule_files: - - alerts.yml - -# Route fired alerts to Alertmanager for delivery (email, etc.). -# Alertmanager runs on the shared Docker network at alertmanager:9093. -alerting: - alertmanagers: - - static_configs: - - targets: - - "alertmanager:9093" - timeout: 10s - -scrape_configs: - # ── Fastify Backend (Blue) ────────────────────────────────────────────────── - - job_name: "api-blue" - scrape_protocols: [OpenMetricsText1.0.0, PrometheusText0.0.4] - metrics_path: /metrics - scrape_interval: 15s - authorization: - credentials: ${METRICS_SCRAPE_TOKEN} - static_configs: - - targets: - - "api-blue:3000" - labels: - app: "fieldtrack" - component: "backend" - service: "fieldtrack-api" - slot: "blue" - - # ── Fastify Backend (Green) ───────────────────────────────────────────────── - - job_name: "api-green" - scrape_protocols: [OpenMetricsText1.0.0, PrometheusText0.0.4] - metrics_path: /metrics - scrape_interval: 15s - authorization: - credentials: ${METRICS_SCRAPE_TOKEN} - static_configs: - - targets: - - "api-green:3000" - labels: - app: "fieldtrack" - component: "backend" - service: "fieldtrack-api" - slot: "green" - - # ── Node Exporter ─────────────────────────────────────────────────────────── - - job_name: "node-exporter" - scrape_interval: 30s - static_configs: - - targets: - - "node-exporter:9100" - labels: - app: "fieldtrack" - component: "host" - - # ── Prometheus self-monitoring ────────────────────────────────────────────── - - job_name: "prometheus" - scrape_interval: 60s - static_configs: - - targets: - - "localhost:9090" - 
- # ── Blackbox: readiness probe + TLS certificate monitoring ────────────────── - # probe_success{job="fieldtrack-readiness"} → used by ReadinessCheckFailing alert. - # probe_ssl_earliest_cert_expiry → used by TLSCertExpiringSoon/TLSCertExpired alerts. - - job_name: "fieldtrack-readiness" - metrics_path: /probe - params: - module: [http_2xx] - scrape_interval: 60s - static_configs: - - targets: - - "https://${API_HOSTNAME}/ready" - labels: - app: "fieldtrack" - component: "tls" - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - target_label: __address__ - replacement: "blackbox:9115" - - source_labels: [__param_target] - target_label: instance diff --git a/infra/promtail/promtail.yml b/infra/promtail/promtail.yml deleted file mode 100644 index 415d550..0000000 --- a/infra/promtail/promtail.yml +++ /dev/null @@ -1,62 +0,0 @@ -server: - http_listen_port: 9080 - grpc_listen_port: 0 - -positions: - # Persisted to the promtail_data volume — survives container restarts. - # Must NOT use /tmp (ephemeral) so log offsets are never lost. 
- filename: /data/positions.yaml - -clients: - - url: http://loki:3100/loki/api/v1/push - -scrape_configs: - - - job_name: docker-containers - - static_configs: - - targets: - - localhost - labels: - job: docker - __path__: /var/lib/docker/containers/*/*-json.log - - pipeline_stages: - - # Docker JSON log parsing - - docker: {} - - # Extract container id from file path - - regex: - expression: '/var/lib/docker/containers/(?P[a-f0-9]+)/.*' - - # Attach container id as label - - labels: - container_id: - - # Extract trace_id if present - - regex: - expression: 'trace_id":"(?P[a-f0-9]+)' - - - labels: - trace_id: - - # Parse JSON logs from Pino - - json: - expressions: - level: level - msg: msg - trace_id: trace_id - span_id: span_id - - - labels: - level: - - - job_name: syslog - - static_configs: - - targets: - - localhost - labels: - job: syslog - __path__: /var/log/*.log \ No newline at end of file diff --git a/infra/scripts/render-alertmanager.sh b/infra/scripts/render-alertmanager.sh deleted file mode 100644 index 692d54f..0000000 --- a/infra/scripts/render-alertmanager.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# infra/scripts/render-alertmanager.sh -# -# Renders infra/alertmanager/alertmanager.yml (template) into -# infra/alertmanager/alertmanager.rendered.yml by substituting -# ${ALERTMANAGER_SLACK_WEBHOOK} from infra/.env.monitoring. -# -# MUST be run before `docker compose up` for the monitoring stack. -# Alertmanager does NOT support environment variables natively — rendering -# the config before container start is the only safe approach. 
-# -# Usage (from any directory): -# bash infra/scripts/render-alertmanager.sh -# -# Exit codes: -# 0 — rendered file written successfully -# 1 — validation or rendering failure -# ============================================================================= -set -euo pipefail - -# --------------------------------------------------------------------------- -# Resolve absolute paths relative to this script's location. -# This makes the script safe to call from any working directory. -# --------------------------------------------------------------------------- -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -INFRA_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)" - -ENV_FILE="${INFRA_DIR}/.env.monitoring" -TEMPLATE_FILE="${INFRA_DIR}/alertmanager/alertmanager.yml" -OUTPUT_FILE="${INFRA_DIR}/alertmanager/alertmanager.rendered.yml" - -log_info() { printf '[render-alertmanager] INFO %s\n' "$*" >&2; } -log_error() { printf '[render-alertmanager] ERROR %s\n' "$*" >&2; } - -# --------------------------------------------------------------------------- -# Pre-flight: ensure required tools exist -# --------------------------------------------------------------------------- -if ! command -v envsubst &>/dev/null; then - log_error "envsubst not found. Install gettext (apt install gettext / yum install gettext)." - exit 1 -fi - -# --------------------------------------------------------------------------- -# Validate env file -# --------------------------------------------------------------------------- -if [ ! -f "${ENV_FILE}" ]; then - log_error "Env file not found: ${ENV_FILE}" - log_error "This file must exist on the VPS and must NOT be committed to the repo." - exit 1 -fi - -# Load env file via `source` under `set -a` so every assignment is exported. -# This correctly handles values containing special characters (e.g. https://). -# DO NOT replace this with `export $(grep ... 
| xargs)` — xargs splits on -# whitespace and breaks URLs, quoted strings, and any value with spaces. -set -a -# shellcheck source=/dev/null -source "${ENV_FILE}" -set +a - -# Warn loudly if stale / removed variables are still present in the env file. -# FRONTEND_DOMAIN was removed from the env contract — its presence here is a -# sign the file is out of date and should be cleaned up on the VPS. -if [ -n "${FRONTEND_DOMAIN:-}" ]; then - log_error "FRONTEND_DOMAIN is set in ${ENV_FILE} but is no longer part of the env contract." - log_error "Remove that line from .env.monitoring on the VPS, then re-run this script." - exit 1 -fi - -# --------------------------------------------------------------------------- -# Validate ALERTMANAGER_SLACK_WEBHOOK -# --------------------------------------------------------------------------- -if [ -z "${ALERTMANAGER_SLACK_WEBHOOK:-}" ]; then - log_error "ALERTMANAGER_SLACK_WEBHOOK is not set or empty in ${ENV_FILE}." - exit 1 -fi - -case "${ALERTMANAGER_SLACK_WEBHOOK}" in - https://hooks.slack.com/*) - : # valid prefix - ;; - *) - log_error "ALERTMANAGER_SLACK_WEBHOOK does not start with 'https://hooks.slack.com/'." - log_error "Value prefix: ***masked*** (redacted to prevent webhook exposure in logs)" - exit 1 - ;; -esac - -# --------------------------------------------------------------------------- -# Validate template file -# --------------------------------------------------------------------------- -if [ ! -f "${TEMPLATE_FILE}" ]; then - log_error "Template file not found: ${TEMPLATE_FILE}" - exit 1 -fi - -if ! grep -qF '${ALERTMANAGER_SLACK_WEBHOOK}' "${TEMPLATE_FILE}"; then - log_error "Template file does not contain '\${ALERTMANAGER_SLACK_WEBHOOK}' placeholder." - log_error "Check that ${TEMPLATE_FILE} is the correct template." 
- exit 1 -fi - -# --------------------------------------------------------------------------- -# Render: substitute ONLY ALERTMANAGER_SLACK_WEBHOOK (avoid clobbering any -# other ${...} placeholders that Alertmanager Go template syntax might use). -# --------------------------------------------------------------------------- -log_info "Rendering ${TEMPLATE_FILE} -> ${OUTPUT_FILE}" - -envsubst '${ALERTMANAGER_SLACK_WEBHOOK}' \ - < "${TEMPLATE_FILE}" \ - > "${OUTPUT_FILE}" - -# --------------------------------------------------------------------------- -# Post-render sanity check: no unsubstituted placeholder must remain -# --------------------------------------------------------------------------- -if grep -qF '${ALERTMANAGER_SLACK_WEBHOOK}' "${OUTPUT_FILE}"; then - log_error "Rendered file still contains the unsubstituted placeholder. Aborting." - rm -f "${OUTPUT_FILE}" - exit 1 -fi - -# Verify the rendered URL looks real (not a placeholder stub) -if grep -qF 'YOUR/WEBHOOK/URL' "${OUTPUT_FILE}"; then - log_error "Rendered file contains placeholder stub URL. Check your .env.monitoring." - rm -f "${OUTPUT_FILE}" - exit 1 -fi - -# Print a redacted preview so operators can confirm the URL was injected. -WEBHOOK_PREVIEW=$(grep 'api_url' "${OUTPUT_FILE}" | head -1 | sed 's|\(https://hooks.slack.com/services/[^/]*/[^/]*/\).*|\1***|') -log_info "Webhook preview (redacted): ${WEBHOOK_PREVIEW}" -log_info "Success. Rendered file: ${OUTPUT_FILE}" diff --git a/infra/scripts/verify-alertmanager.sh b/infra/scripts/verify-alertmanager.sh deleted file mode 100644 index efd472c..0000000 --- a/infra/scripts/verify-alertmanager.sh +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env bash -# infra/scripts/verify-alertmanager.sh -# -# Verifies that the Alertmanager integration is healthy and that alert routing -# works end-to-end. 
-# -# Usage: -# cd /path/to/fieldtrack/infra -# bash scripts/verify-alertmanager.sh -# -# Requirements: -# - Docker Compose monitoring stack must be running -# - curl, jq must be available in PATH -# - ALERTMANAGER_URL defaults to http://localhost:9093 (exposed by docker-compose) -# -# Exit codes: -# 0 — all checks passed -# 1 — one or more checks failed - -set -euo pipefail - -ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://localhost:9093}" -PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:9090}" -PASS=0 -FAIL=0 - -# ── Helper functions ────────────────────────────────────────────────────────── - -log_pass() { echo "[PASS] $*"; PASS=$((PASS + 1)); } -log_fail() { echo "[FAIL] $*"; FAIL=$((FAIL + 1)); } -log_info() { echo "[INFO] $*"; } - -require_cmd() { - if ! command -v "$1" &>/dev/null; then - echo "[ERROR] Required command '$1' not found. Install it and retry." - exit 1 - fi -} - -# ── Pre-flight ──────────────────────────────────────────────────────────────── - -require_cmd curl -require_cmd jq - -# ── Step 1: Alertmanager health check ───────────────────────────────────────── - -log_info "Checking Alertmanager health at ${ALERTMANAGER_URL}/-/healthy" - -HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \ - --max-time 10 \ - "${ALERTMANAGER_URL}/-/healthy" || echo "000") - -if [ "$HTTP_STATUS" = "200" ]; then - log_pass "Alertmanager is healthy (HTTP 200)" -else - log_fail "Alertmanager health check returned HTTP ${HTTP_STATUS} (expected 200)" -fi - -# ── Step 2: Alertmanager ready check ────────────────────────────────────────── - -log_info "Checking Alertmanager ready state at ${ALERTMANAGER_URL}/-/ready" - -READY_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \ - --max-time 10 \ - "${ALERTMANAGER_URL}/-/ready" || echo "000") - -if [ "$READY_STATUS" = "200" ]; then - log_pass "Alertmanager is ready (HTTP 200)" -else - log_fail "Alertmanager ready check returned HTTP ${READY_STATUS} (expected 200)" -fi - -# ── Step 3: Alertmanager API — list current 
alerts ──────────────────────────── - -log_info "Fetching current alerts from Alertmanager API" - -ALERTS_RESPONSE=$(curl -s --max-time 10 \ - "${ALERTMANAGER_URL}/api/v2/alerts" \ - -H "Accept: application/json" || echo "") - -if echo "$ALERTS_RESPONSE" | jq empty 2>/dev/null; then - ALERT_COUNT=$(echo "$ALERTS_RESPONSE" | jq 'length') - log_pass "Alertmanager API responded with valid JSON (${ALERT_COUNT} active alerts)" -else - log_fail "Alertmanager API did not return valid JSON" -fi - -# ── Step 4: Prometheus → Alertmanager connection ────────────────────────────── - -log_info "Checking Prometheus alertmanager targets at ${PROMETHEUS_URL}/api/v1/alertmanagers" - -PROM_AM=$(curl -s --max-time 10 \ - "${PROMETHEUS_URL}/api/v1/alertmanagers" || echo "") - -if echo "$PROM_AM" | jq -e '.data.activeAlertmanagers | length > 0' &>/dev/null; then - ACTIVE=$(echo "$PROM_AM" | jq -r '.data.activeAlertmanagers[0].url // "unknown"') - log_pass "Prometheus is connected to Alertmanager at ${ACTIVE}" -else - log_fail "Prometheus has no active Alertmanager targets — check prometheus.yml alerting block" -fi - -# ── Step 5: Fire a test alert and verify it appears ─────────────────────────── - -log_info "Sending test alert to Alertmanager" - -TEST_ALERT_PAYLOAD=$(cat <<'EOF' -[{ - "labels": { - "alertname": "ApiAlertmanagerVerification", - "severity": "warning", - "job": "fieldtrack-api" - }, - "annotations": { - "summary": "Alertmanager verification test — safe to ignore", - "description": "This alert was fired by verify-alertmanager.sh to confirm end-to-end routing. It will auto-resolve in 5 minutes." 
- }, - "startsAt": "'"$(date -u +"%Y-%m-%dT%H:%M:%SZ")"'", - "endsAt": "'"$(date -u -d "+5 minutes" +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -v+5M +"%Y-%m-%dT%H:%M:%SZ")"'" -}] -EOF -) - -POST_STATUS=$(curl -s -o /tmp/am_post_response.txt -w "%{http_code}" \ - --max-time 10 \ - -X POST "${ALERTMANAGER_URL}/api/v2/alerts" \ - -H "Content-Type: application/json" \ - --data "$TEST_ALERT_PAYLOAD" || echo "000") - -if [ "$POST_STATUS" = "200" ]; then - log_pass "Test alert accepted by Alertmanager (HTTP 200)" -else - log_fail "Alertmanager rejected test alert (HTTP ${POST_STATUS})" - cat /tmp/am_post_response.txt 2>/dev/null || true -fi - -# ── Step 6: Confirm test alert is visible in active alerts ──────────────────── - -log_info "Waiting 2 seconds for alert to be indexed..." -sleep 2 - -ACTIVE_ALERTS=$(curl -s --max-time 10 \ - "${ALERTMANAGER_URL}/api/v2/alerts?filter=alertname%3DApiAlertmanagerVerification" \ - -H "Accept: application/json" || echo "[]") - -if echo "$ACTIVE_ALERTS" | jq -e 'length > 0' &>/dev/null; then - log_pass "Test alert is visible in Alertmanager active alerts list" -else - log_fail "Test alert not found in active alerts — check Alertmanager configuration" -fi - -# ── Step 7: Verify Prometheus rule files load without errors ────────────────── - -log_info "Checking Prometheus rule files are loaded correctly" - -RULES_RESPONSE=$(curl -s --max-time 10 \ - "${PROMETHEUS_URL}/api/v1/rules" || echo "") - -if echo "$RULES_RESPONSE" | jq -e '.data.groups | length > 0' &>/dev/null; then - GROUP_COUNT=$(echo "$RULES_RESPONSE" | jq '.data.groups | length') - log_pass "Prometheus loaded ${GROUP_COUNT} rule group(s) from alerts.yml" -else - log_fail "No rule groups found in Prometheus — check alerts.yml path in prometheus.yml" -fi - -# ── Summary ─────────────────────────────────────────────────────────────────── - -echo "" -echo "─────────────────────────────────────" -echo " Alertmanager Verification Summary" -echo 
"─────────────────────────────────────" -echo " PASS: ${PASS}" -echo " FAIL: ${FAIL}" -echo "─────────────────────────────────────" -echo "" - -if [ "$FAIL" -gt 0 ]; then - echo "One or more checks failed. Review the output above." - echo "" - echo "Common fixes:" - echo " • Not running? Start with:" - echo " docker compose -f infra/docker-compose.monitoring.yml up -d alertmanager prometheus" - echo " • Slack webhook missing? Add to infra/.env.monitoring:" - echo " ALERTMANAGER_SLACK_WEBHOOK" - echo " • Prometheus can't reach Alertmanager? Verify they share api_network." - exit 1 -fi - -echo "All checks passed. Alertmanager is operational." -echo "" -echo "NOTE: The test alert 'ApiAlertmanagerVerification' will auto-resolve in 5 minutes." -echo " You can silence it early via: ${ALERTMANAGER_URL}/#/silences" -exit 0 diff --git a/infra/tempo/tempo.yml b/infra/tempo/tempo.yml deleted file mode 100644 index 8c2b943..0000000 --- a/infra/tempo/tempo.yml +++ /dev/null @@ -1,43 +0,0 @@ -server: - http_listen_port: 3200 - -distributor: - receivers: - otlp: - protocols: - http: - grpc: - -ingester: - trace_idle_period: 10s - max_block_bytes: 1_000_000 - max_block_duration: 5m - -compactor: - compaction: - block_retention: 24h - -storage: - trace: - backend: local - local: - path: /var/tempo/traces - -metrics_generator: - storage: - path: /var/tempo/generator - remote_write: - - url: http://prometheus:9090/api/v1/write - send_exemplars: true - processor: - service_graphs: - wait: 10s - max_items: 10000 - span_metrics: - -overrides: - defaults: - metrics_generator: - processors: - - service-graphs - - span-metrics \ No newline at end of file diff --git a/package.json b/package.json index 1c87358..7fb4f99 100644 --- a/package.json +++ b/package.json @@ -13,8 +13,7 @@ "lint": "eslint src/modules --ext .ts", "start": "node dist/server.js", "test": "vitest run", - "test:watch": "vitest", - "analytics:backfill": "tsx scripts/analytics-backfill.ts" + "test:watch": "vitest" }, 
"keywords": [], "author": "", diff --git a/scripts/analytics-backfill.ts b/scripts/analytics-backfill.ts deleted file mode 100644 index 84260c8..0000000 --- a/scripts/analytics-backfill.ts +++ /dev/null @@ -1,242 +0,0 @@ -/** - * analytics-backfill.ts — Phase 21 backfill script. - * - * Scans historical attendance_sessions and populates employee_daily_metrics - * and org_daily_metrics for any dates that have missing or incomplete rows. - * - * Usage: - * npm run analytics:backfill - * - * The script is additive and idempotent: running it multiple times produces - * the same result. Existing rows are updated via UPSERT (SET, not increment), - * so it is safe to re-run after data corrections. - * - * Processing: - * - Fetches all completed sessions (checkout_at IS NOT NULL AND - * total_distance_km IS NOT NULL) in batches of BATCH_SIZE. - * - Groups by (organization_id, employee_id, date). - * - UPSERTs employee_daily_metrics for each group. - * - UPSERTs org_daily_metrics by aggregating the just-written employee rows. - * - * Skips sessions where total_distance_km is NULL (distance worker not yet run). - */ - -import dotenv from "dotenv"; -dotenv.config(); - -import { createClient } from "@supabase/supabase-js"; - -// ─── Configuration ──────────────────────────────────────────────────────────── - -const SUPABASE_URL = process.env["SUPABASE_URL"]; -const SUPABASE_SERVICE_ROLE_KEY = process.env["SUPABASE_SERVICE_ROLE_KEY"]; - -if (!SUPABASE_URL || !SUPABASE_SERVICE_ROLE_KEY) { - console.error( - "ERROR: SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY must be set in environment", - ); - process.exit(1); -} - -const supabase = createClient(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY); - -/** Number of sessions fetched per paginated round-trip. */ -const BATCH_SIZE = 500; - -/** Pause between batches to avoid overwhelming the DB connection pool. 
*/ -const BATCH_DELAY_MS = 100; - -// ─── Types ──────────────────────────────────────────────────────────────────── - -interface SessionRow { - id: string; - employee_id: string; - organization_id: string; - checkin_at: string; - total_distance_km: number; - total_duration_seconds: number; -} - -interface DailyKey { - orgId: string; - empId: string; - date: string; -} - -interface DailyAggregate { - sessions: number; - distance_km: number; - duration_seconds: number; -} - -// ─── Helpers ────────────────────────────────────────────────────────────────── - -function sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)); -} - -/** - * Group session rows by (organization_id, employee_id, date) and accumulate - * totals. Returns a map keyed by `orgId|empId|date`. - */ -function groupByEmployeeDay( - sessions: SessionRow[], -): Map { - const map = new Map(); - - for (const s of sessions) { - const date = s.checkin_at.substring(0, 10); - const mapKey = `${s.organization_id}|${s.employee_id}|${date}`; - const existing = map.get(mapKey); - if (existing) { - existing.agg.sessions++; - existing.agg.distance_km += s.total_distance_km ?? 0; - existing.agg.duration_seconds += s.total_duration_seconds ?? 0; - } else { - map.set(mapKey, { - key: { orgId: s.organization_id, empId: s.employee_id, date }, - agg: { - sessions: 1, - distance_km: s.total_distance_km ?? 0, - duration_seconds: s.total_duration_seconds ?? 
0, - }, - }); - } - } - - return map; -} - -// ─── Backfill Logic ─────────────────────────────────────────────────────────── - -async function backfill(): Promise { - console.log("=== FieldTrack Analytics Backfill ==="); - console.log(`Batch size: ${BATCH_SIZE}`); - console.log("Fetching completed, distance-computed sessions...\n"); - - let page = 0; - let totalSessions = 0; - let totalEmployeeDays = 0; - let totalErrors = 0; - let hasMore = true; - - while (hasMore) { - const from = page * BATCH_SIZE; - const to = from + BATCH_SIZE - 1; - - const { data, error } = await supabase - .from("attendance_sessions") - .select( - "id, employee_id, organization_id, checkin_at, total_distance_km, total_duration_seconds", - ) - .not("checkout_at", "is", null) - .not("total_distance_km", "is", null) - .order("checkin_at", { ascending: true }) - .range(from, to); - - if (error) { - console.error(`Batch ${page + 1}: fetch error — ${error.message}`); - totalErrors++; - break; - } - - const batch = (data ?? 
[]) as SessionRow[]; - if (batch.length === 0) { - break; - } - - console.log( - `Batch ${page + 1}: processing ${batch.length} sessions (offset ${from})...`, - ); - - // ── Group sessions by (org, employee, date) ─────────────────────────────── - - const employeeDayMap = groupByEmployeeDay(batch); - totalSessions += batch.length; - totalEmployeeDays += employeeDayMap.size; - - // ── UPSERT employee_daily_metrics ───────────────────────────────────────── - - const empUpsertRows = [...employeeDayMap.values()].map(({ key, agg }) => ({ - organization_id: key.orgId, - employee_id: key.empId, - date: key.date, - sessions: agg.sessions, - distance_km: Math.round(agg.distance_km * 1000) / 1000, - duration_seconds: agg.duration_seconds, - })); - - const { error: empErr } = await supabase - .from("employee_daily_metrics") - .upsert(empUpsertRows, { onConflict: "employee_id,date" }); - - if (empErr) { - console.error(` employee_daily_metrics upsert failed: ${empErr.message}`); - totalErrors++; - } else { - console.log(` employee_daily_metrics: upserted ${empUpsertRows.length} rows`); - } - - // ── Compute org-level aggregates from the employee rows we just wrote ───── - - // Group the same batch by (org, date) - const orgDayMap = new Map(); - for (const { key, agg } of employeeDayMap.values()) { - const mapKey = `${key.orgId}|${key.date}`; - const existing = orgDayMap.get(mapKey); - if (existing) { - existing.agg.sessions += agg.sessions; - existing.agg.distance_km += agg.distance_km; - existing.agg.duration_seconds += agg.duration_seconds; - } else { - orgDayMap.set(mapKey, { - orgId: key.orgId, - date: key.date, - agg: { ...agg }, - }); - } - } - - const orgUpsertRows = [...orgDayMap.values()].map(({ orgId, date, agg }) => ({ - organization_id: orgId, - date, - total_sessions: agg.sessions, - total_distance_km: Math.round(agg.distance_km * 1000) / 1000, - total_duration_seconds: agg.duration_seconds, - })); - - const { error: orgErr } = await supabase - 
.from("org_daily_metrics") - .upsert(orgUpsertRows, { onConflict: "organization_id,date" }); - - if (orgErr) { - console.error(` org_daily_metrics upsert failed: ${orgErr.message}`); - totalErrors++; - } else { - console.log(` org_daily_metrics: upserted ${orgUpsertRows.length} rows`); - } - - if (batch.length < BATCH_SIZE) { - // Last page — no more rows - hasMore = false; - } else { - page++; - await sleep(BATCH_DELAY_MS); - } - } - - console.log("\n=== Backfill Complete ==="); - console.log(`Sessions processed : ${totalSessions}`); - console.log(`Employee-day rows : ${totalEmployeeDays}`); - console.log(`Errors : ${totalErrors}`); - - if (totalErrors > 0) { - console.error("Backfill completed with errors — check output above."); - process.exit(1); - } -} - -backfill().catch((err: unknown) => { - console.error("Backfill failed:", err instanceof Error ? err.message : String(err)); - process.exit(1); -}); diff --git a/scripts/deploy-bluegreen.sh b/scripts/deploy-bluegreen.sh deleted file mode 100644 index 8771110..0000000 --- a/scripts/deploy-bluegreen.sh +++ /dev/null @@ -1,1539 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# deploy-bluegreen.sh — API Blue-Green Deployment -# -# State machine: -# INIT -# -> PRE_FLIGHT (preflight.sh + env validation) -# -> PULL_IMAGE (with timeout guard) -# -> RESOLVE_SLOT (recovery-aware slot detection) -# -> IDEMPOTENCY (skip if same SHA already running) -# -> START_INACTIVE (with timeout + image immutability check) -# -> HEALTH_CHECK_INTERNAL (connectivity pre-check + readiness loop) -# -> SWITCH_NGINX (nginx -t gate + atomic slot write) -# -> HEALTH_CHECK_PUBLIC (DNS/TLS/CDN end-to-end) -# -> STABILITY_CHECK (post-switch re-verify after settle window) -# -> CLEANUP (graceful shutdown of old container) -# -> SUCCESS (truth check + last-known-good snapshot) -# -# Deployment classification states emitted via _ft_state: -# DEPLOY_SUCCESS -- zero-downtime deploy 
completed -# DEPLOY_FAILED_SAFE -- deploy failed, old container still healthy -# DEPLOY_FAILED_ROLLBACK -- deploy failed AND rollback was triggered -# DEPLOY_FAILED_FATAL -- deploy AND rollback both failed (manual needed) -# -# On failure: -# -> if active container still running -> DEPLOY_FAILED_SAFE exit 1 -# -> if active container gone -> rollback triggered -# -> rollback succeeded -> DEPLOY_FAILED_ROLLBACK exit 1 -# -> rollback failed -> DEPLOY_FAILED_FATAL exit 2 -# -# Slot state file: /var/run/api/active-slot -# /var/run is a tmpfs (cleared on reboot). The _ft_resolve_slot() recovery -# function handles a missing file by inspecting running containers and the -# live nginx config, then re-writing the file. No manual step needed after -# a reboot or unexpected /run eviction. -# -# Exit codes: -# 0 DEPLOY_SUCCESS -- zero-downtime deploy succeeded -# 1 DEPLOY_FAILED_SAFE -- deploy failed, old container still serving -# or DEPLOY_FAILED_ROLLBACK -- deploy failed, rollback succeeded -# 2 DEPLOY_FAILED_FATAL -- deploy AND rollback both failed (rare) -# 3 DEPLOY_FAILED_FATAL -- fatal guard (active container missing, race condition) -# -# Observability features: -# DEPLOY_ID -- unique deploy identifier for log correlation (YYYYMMDD_HHMMSS_PID) -# deploy_id label -- container labeled with deploy ID for instant traceability -# api.sha -- container labeled with image SHA for quick version lookup -# api.slot -- container labeled with slot name (blue/green) -# duration_sec -- all exits logged with deploy duration for performance tracking -# PREFLIGHT_STRICT -- optional strict mode: enforces preflight checks, fails if missing -# -# ============================================================================= -set -euo pipefail -# Enable explicit debugging when DEBUG=true, otherwise suppress xtrace -if [ "${DEBUG:-false}" = "true" ]; then - set -x -fi -trap '_ft_trap_err "$LINENO"' ERR - -# --------------------------------------------------------------------------- -# 
STRUCTURED LOGGING [DEPLOY] ts= state= -# ALL logging writes to stderr (>&2) so that functions returning values via -# stdout are never contaminated. stdout = data only; stderr = logs. -# { set +x; } 2>/dev/null suppresses xtrace noise inside helpers. -# --------------------------------------------------------------------------- -_FT_STATE="INIT" -DEPLOY_LOG_FILE="${DEPLOY_LOG_FILE:-/var/log/api/deploy.log}" - -# Ensure log directory exists with fallback to home directory -LOG_DIR="$(dirname "$DEPLOY_LOG_FILE")" -if ! mkdir -p "$LOG_DIR" 2>/dev/null; then - LOG_DIR="$HOME/api/logs" - DEPLOY_LOG_FILE="$LOG_DIR/deploy.log" - mkdir -p "$LOG_DIR" -fi - -_ft_log() { - { set +x; } 2>/dev/null - local log_entry - log_entry=$(printf '[DEPLOY] deploy_id=%s ts=%s state=%s %s' "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*") - printf '%s\n' "$log_entry" | tee -a "$DEPLOY_LOG_FILE" >&2 - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -_ft_state() { - { set +x; } 2>/dev/null - _FT_STATE="$1"; shift - printf '[DEPLOY] deploy_id=%s ts=%s state=%s %s\n' "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*" >&2 - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -_ft_trap_err() { - { set +x; } 2>/dev/null - printf '[ERROR] deploy_id=%s ts=%s state=%s msg="unexpected failure at line %s"\n' \ - "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$1" >&2 - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -# --------------------------------------------------------------------------- -# ERROR HELPER -- [ERROR]-prefixed log for failure paths -# --------------------------------------------------------------------------- -_ft_error() { - { set +x; } 2>/dev/null - local log_entry - log_entry=$(printf '[ERROR] deploy_id=%s ts=%s state=%s %s' "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*") - printf '%s\n' "$log_entry" | tee -a "$DEPLOY_LOG_FILE" >&2 - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -# 
--------------------------------------------------------------------------- -# PHASE TIMING HELPER -- wrap phases to measure wall-clock duration -# Usage: -# _ft_phase_start "PHASE_NAME" -# ... phase work ... -# _ft_phase_end "PHASE_NAME" -# --------------------------------------------------------------------------- -_ft_phase_start() { - eval "_${1}_START=\$(date +%s)" -} - -_ft_phase_end() { - local phase="$1" - local start_var="_${phase}_START" - local start_ts=${!start_var:-0} - if [ "$start_ts" -gt 0 ]; then - local duration=$(($(date +%s) - start_ts)) - _ft_log "msg='phase_complete' phase=$phase duration_sec=$duration" - fi -} - -# --------------------------------------------------------------------------- -# GITHUB ACTIONS SUMMARY -- writes deployment summary to Actions UI -# Called at end of deploy (success or failure) -# --------------------------------------------------------------------------- -_ft_github_summary() { - local status="$1" - local container="${2:-unknown}" - local image="${3:-unknown}" - local reason="${4:-}" - - if [ -z "$GITHUB_STEP_SUMMARY" ]; then - return 0 # Not running in GitHub Actions - fi - - { - echo "### 🚀 Deployment Summary" - echo "" - echo "| Field | Value |" - echo "|-------|-------|" - echo "| Status | **$status** |" - echo "| Deploy ID | \`$DEPLOY_ID\` |" - echo "| Duration | $(($(date +%s) - START_TS))s |" - echo "| Active Container | \`$container\` |" - echo "| Image SHA | \`${image:0:12}...\` |" - if [ -n "$reason" ]; then - echo "| Reason | $reason |" - fi - echo "| Timestamp | $(date -u +'%Y-%m-%d %H:%M:%S UTC') |" - } >> "$GITHUB_STEP_SUMMARY" -} - -# --------------------------------------------------------------------------- -# FINAL SYSTEM STATE SNAPSHOT -- records ground truth on success -# --------------------------------------------------------------------------- -_ft_final_state() { - local active_container="$1" - local image_sha="$2" - local nginx_upstream - nginx_upstream=$(grep -oE 
'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo 'unknown') - _ft_log "msg='final_state' deploy_id=$DEPLOY_ID active=$active_container sha=${image_sha:0:12} nginx_upstream=$nginx_upstream" -} - -# --------------------------------------------------------------------------- -# DOCKER HEALTH GATE -# Waits for the container's HEALTHCHECK to reach "healthy" before allowing -# nginx to switch. If the container has no HEALTHCHECK defined, this returns -# immediately (status="none") to avoid blocking on unconfigured containers. -# --------------------------------------------------------------------------- -_ft_wait_docker_health() { - local name="$1" - local i=1 - local STATUS - while [ "$i" -le 30 ]; do - STATUS=$(docker inspect --format='{{.State.Health.Status}}' "$name" 2>/dev/null || echo "none") - if [ "$STATUS" = "healthy" ]; then - _ft_log "msg='docker health check passed' container=$name" - return 0 - fi - if [ "$STATUS" = "unhealthy" ]; then - _ft_error "msg='docker health check failed' container=$name status=unhealthy" - return 1 - fi - # "none" means the image has no HEALTHCHECK — skip gate (return 0 immediately) - if [ "$STATUS" = "none" ]; then - _ft_log "msg='docker health gate skipped (no HEALTHCHECK defined)' container=$name" - return 0 - fi - [ $(( i % 5 )) -eq 0 ] && _ft_log "msg='waiting for docker health' attempt=$i/30 status=$STATUS container=$name" - sleep 2 - i=$(( i + 1 )) - done - _ft_error "msg='docker health timeout' container=$name last_status=$STATUS" - return 1 -} - -# --------------------------------------------------------------------------- -# SYSTEM SNAPSHOT -- emitted on any unrecoverable failure -# --------------------------------------------------------------------------- -_ft_snapshot() { - { set +x; } 2>/dev/null - printf '[DEPLOY] -- SYSTEM SNAPSHOT ----------------------------------------\n' >&2 - printf '[DEPLOY] slot_file = %s\n' "$(cat 
"${ACTIVE_SLOT_FILE:-/var/run/api/active-slot}" 2>/dev/null || echo 'MISSING')" >&2 - printf '[DEPLOY] nginx_upstream = %s\n' "$(grep -oE 'http://(api-blue|api-green):3000' "${NGINX_CONF:-$HOME/api/infra/nginx/live/api.conf}" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo 'unreadable')" >&2 - printf '[DEPLOY] containers =\n' >&2 - docker ps --format '[DEPLOY] {{.Names}} -> {{.Status}} ({{.Ports}})' 1>&2 2>/dev/null \ - || printf '[DEPLOY] (docker ps unavailable)\n' >&2 - printf '[DEPLOY] -----------------------------------------------------------\n' >&2 - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -# --------------------------------------------------------------------------- -# DEPLOYMENT CLASSIFICATION -- single-source exit helper -# All terminal exit paths MUST go through _ft_exit to avoid state drift. -# -# _ft_exit [key=value ...] -# code 0 -> DEPLOY_SUCCESS -# code 1 -> DEPLOY_FAILED_SAFE | DEPLOY_FAILED_ROLLBACK -# code 2 -> DEPLOY_FAILED_FATAL -# -# DEPLOY_SUCCESS zero-downtime deploy completed -# DEPLOY_FAILED_SAFE deploy failed, old container still serving -# DEPLOY_FAILED_ROLLBACK deploy failed, rollback triggered (system restored) -# DEPLOY_FAILED_FATAL deploy AND rollback both failed (manual needed) -# --------------------------------------------------------------------------- -_ft_exit() { - local code="$1"; shift - local duration=$(( $(date +%s) - START_TS )) - _ft_state "$@" "duration_sec=$duration" - exit "$code" -} - -# Kept for compatibility; delegates to _ft_exit for a final classify+exit in one line. 
-_ft_classify() { - local outcome="$1"; shift - _ft_state "$outcome" "outcome=$outcome $*" -} - -# --------------------------------------------------------------------------- -# DEPLOYMENT TIMING & IDENTIFIERS -# --------------------------------------------------------------------------- -START_TS=$(date +%s) -DEPLOY_ID=$(date +%Y%m%d_%H%M%S)_$$ -PREFLIGHT_STRICT="${PREFLIGHT_STRICT:-false}" - -_ft_log "msg='deploy started' deploy_id=$DEPLOY_ID pid=$$ start_ts=$START_TS" -if [ "$PREFLIGHT_STRICT" = "true" ]; then - _ft_log "msg='PREFLIGHT_STRICT=true -- will enforce preflight checks'" -fi - -# --------------------------------------------------------------------------- -# CONSTANTS -# --------------------------------------------------------------------------- -# Immutable SHA tags ONLY — 'latest' is forbidden in production. -# Reject empty and 'latest' before any Docker operation so failures are -# loud and attributed to the caller rather than appearing as pull errors. -IMAGE_SHA="${1:-}" -if [ -z "$IMAGE_SHA" ] || [ "$IMAGE_SHA" = "latest" ]; then - printf '[DEPLOY] ts=%s state=INIT level=ERROR msg="image SHA required -- latest tag is forbidden in production" sha=%s\n' \ - "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "${IMAGE_SHA:-}" >&2 - exit 2 -fi -IMAGE="ghcr.io/fieldtrack-tech/api:$IMAGE_SHA" - -BLUE_NAME="api-blue" -GREEN_NAME="api-green" -APP_PORT=3000 -NETWORK="api_network" -# Pinned curl container for in-network health probes. -# Running on api_network exercises Docker DNS + bridge routing — the same -# path that nginx uses — catching connectivity issues that docker exec -# localhost bypasses (docker exec goes direct to the container loopback). -_FT_CURL_IMG="curlimages/curl:8.7.1" -# In-network curl helper with local fallback. -# -# PRIMARY CURL HELPERS — use docker run on api_network (reliable DNS + routing) -# -# Primary: short-lived curlimages/curl container on api_network. -# Exercises Docker DNS + bridge routing (same path nginx uses). 
-# Works with distroless containers (no curl binary available). -# -# Usage: _ft_net_curl -# The first argument is the container name — not used (kept for signature compat). -# Remaining arguments are passed verbatim to curl. -_ft_net_curl() { - local _target_container="$1"; shift - # Primary: in-network (Docker DNS + bridge routing) - docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" >/dev/null 2>&1 -} -# Variant that captures the response body or HTTP status code instead of -# just testing. Used where we need the response text for status checks. -# Usage: _ft_net_curl_out -_ft_net_curl_out() { - local _target_container="$1"; shift - local _out - _out=$(docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" 2>/dev/null) || _out="" - printf '%s' "$_out" -} - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" -[ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } -REPO_DIR="$DEPLOY_ROOT" - -# Slot state directory and file. -# /var/run/api/ is chosen over /tmp (world-writable, cleaned by tmpwatch) -# and $HOME (variable path, not auditable as runtime state). -# /var/run IS a tmpfs -- the _ft_resolve_slot() recovery handles missing files. -SLOT_DIR="/var/run/api" -ACTIVE_SLOT_FILE="$SLOT_DIR/active-slot" - -NGINX_CONF="$REPO_DIR/infra/nginx/live/api.conf" -NGINX_LIVE_DIR="$REPO_DIR/infra/nginx/live" -NGINX_BACKUP_DIR="$REPO_DIR/infra/nginx/backup" -NGINX_TEMPLATE="$REPO_DIR/infra/nginx/api.conf" -MAX_HISTORY=5 -MAX_HEALTH_ATTEMPTS=40 -HEALTH_INTERVAL=3 -LOCK_FILE="$SLOT_DIR/deploy.lock" -SNAP_DIR="$SLOT_DIR" -LAST_GOOD_FILE="$SNAP_DIR/last-good" - -_ft_ensure_log_dir() { - local log_dir - log_dir=$(dirname "$DEPLOY_LOG_FILE") - if [ ! 
-d "$log_dir" ]; then - mkdir -p "$log_dir" 2>/dev/null || sudo mkdir -p "$log_dir" || true - [ -d "$log_dir" ] && chmod 755 "$log_dir" 2>/dev/null || true - fi -} - -# --------------------------------------------------------------------------- -# DEPLOYMENT LOCK -- prevent concurrent deploys -# --------------------------------------------------------------------------- -_ft_acquire_lock() { - _ft_ensure_slot_dir - _ft_ensure_log_dir - _ft_log "msg='acquiring deployment lock' pid=$$ file=$LOCK_FILE" - exec 200>"$LOCK_FILE" - if ! flock -n 200; then - _ft_log "level=ERROR msg='another deployment already in progress -- aborting' pid=$$" - exit 1 - fi - _ft_log "msg='deployment lock acquired' pid=$$ file=$LOCK_FILE" - # Ensure lock is released on exit - trap '_ft_release_lock' EXIT -} - -_ft_release_lock() { - { set +x; } 2>/dev/null - printf '[DEPLOY] ts=%s state=%s msg="releasing deployment lock" pid=%s\n' \ - "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$$" >&2 - # Close FD 200 unconditionally; closing the FD releases the flock. - exec 200>&- 2>/dev/null || true - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -} - -# --------------------------------------------------------------------------- -# EXTERNAL ENDPOINT CHECK WITH RETRY + BACKOFF -# Smooths transient CDN/TLS edge jitter while maintaining strict semantics -# -# NOTE: Uses localhost (127.0.0.1) with Host header instead of external hostname. -# Rationale: nginx is protected by Cloudflare IP allowlist. Requests from the -# VPS itself (not through Cloudflare) would be blocked with 403. Using localhost -# + Host header allows the deploy script to: -# - Validate full nginx routing stack (localhost → nginx → backend) -# - Bypass Cloudflare IP restriction safely -# - Use --insecure to accept self-signed/origin certs (nginx rewrite) -# Security: unchanged. Cloudflare still protects production access; only -# localhost requests (VPS-internal) bypass the IP filter. 
-# --------------------------------------------------------------------------- -_ft_check_external_ready() { - # -f: fail on 4xx/5xx so HTML error pages never match the grep - docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" -sfk --max-time 5 "https://nginx/health" 2>/dev/null \ - | grep -q '"status":"ok"' -} - -# --------------------------------------------------------------------------- -# RETRY CURL -- wraps curl -sf with retries + 1s backoff -# _ft_retry_curl [max_attempts=10] [extra curl flags...] -# Returns 0 on first 2xx success, 1 after all attempts exhausted. -# --------------------------------------------------------------------------- -_ft_retry_curl() { - { set +x; } 2>/dev/null - local url="$1" - local max="${2:-10}" - shift 2 || shift $# - local i=0 - while [ "$i" -lt "$max" ]; do - i=$((i + 1)) - if curl -sf --max-time 5 "$@" "$url" >/dev/null 2>&1; then - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi - return 0 - fi - sleep 1 - done - if [ "${DEBUG:-false}" = "true" ]; then set -x; fi - return 1 -} - -# --------------------------------------------------------------------------- -# SILENT EXECUTION WRAPPER -# All inherently noisy commands (docker pull, docker compose, etc.) go through -# run(). Output is suppressed unless DEBUG=true. -# On failure: surfaces the command name and captured output to stderr so -# failures are never silently swallowed. -# --------------------------------------------------------------------------- -run() { - if [ "${DEBUG:-false}" = "true" ]; then - "$@" - else - local _run_out - if ! _run_out=$("$@" 2>&1); then - printf '[ERROR] Command failed: %s\n' "$*" >&2 - printf '%s\n' "$_run_out" >&2 - return 1 - fi - fi -} - -# Like run() but always forwards stderr so error messages are never swallowed. 
-run_show_err() { - if [ "${DEBUG:-false}" = "true" ]; then - "$@" - else - "$@" >/dev/null - fi -} - -# --------------------------------------------------------------------------- -# SLOT DIRECTORY AND FILE MANAGEMENT -# --------------------------------------------------------------------------- -_ft_ensure_slot_dir() { - if [ ! -d "$SLOT_DIR" ]; then - _ft_log "msg='slot dir missing, creating' path=$SLOT_DIR" - sudo mkdir -p "$SLOT_DIR" - # Owned by the deploy user so subsequent writes do not need sudo. - sudo chown "$(id -un):$(id -gn)" "$SLOT_DIR" - sudo chmod 750 "$SLOT_DIR" - fi -} - -# Single authoritative validator. Returns 0 for "blue"|"green", 1 otherwise. -# Logs to stderr on failure so every call site gets a structured error for free. -_ft_validate_slot() { - case "$1" in - blue|green) return 0 ;; - *) _ft_log "level=ERROR msg='invalid slot value' slot='${1:0:80}'" - return 1 ;; - esac -} - -_ft_write_slot() { - local slot="$1" - _ft_validate_slot "$slot" || return 1 - _ft_ensure_slot_dir - local slot_tmp - slot_tmp=$(mktemp "${SLOT_DIR}/active-slot.XXXXXX") - printf '%s\n' "$slot" > "$slot_tmp" - mv "$slot_tmp" "$ACTIVE_SLOT_FILE" - _ft_log "msg='slot file updated (atomic)' slot=$slot path=$ACTIVE_SLOT_FILE" -} - -# _ft_resolve_slot -- returns the active slot name, recovering from a missing -# or corrupt slot file by inspecting running containers and the live nginx config. -# -# Recovery precedence: -# 1. slot file value (happy path) -# 2. only blue running -> blue -# 3. only green running -> green -# 4. both running -> nginx upstream port as tiebreaker -# 5. neither running -> green (first deploy; inactive = blue) -_ft_resolve_slot() { - _ft_ensure_slot_dir - - # Happy path -- slot file exists and is valid. - if [ -f "$ACTIVE_SLOT_FILE" ]; then - local current_slot - current_slot=$(tr -d '[:space:]' < "$ACTIVE_SLOT_FILE") - # Guard: detect log contamination in the file (pre-fix corruption defense). 
- # A valid slot is ONLY the literal string "blue" or "green". - if [[ "$current_slot" == *DEPLOY* ]] || [[ "$current_slot" == *\[* ]]; then - _ft_log "level=WARN msg='slot file contains log contamination -- treating as corrupt, recovering' value=${current_slot:0:80}" - elif _ft_validate_slot "$current_slot"; then - _ft_log "msg='slot file read' slot=$current_slot" - echo "$current_slot" - return 0 - else - # _ft_validate_slot already logged the invalid value; fall through to recovery. - _ft_log "level=WARN msg='slot file invalid, falling through to container recovery'" - fi - else - _ft_log "level=WARN msg='slot file missing, recovering from container state' path=$ACTIVE_SLOT_FILE" - fi - - # Try to recover from last-known-good snapshot first - if [ -f "$LAST_GOOD_FILE" ]; then - local last_good_state - last_good_state=$(head -1 "$LAST_GOOD_FILE" 2>/dev/null | tr -d '[:space:]') - if _ft_validate_slot "$last_good_state" 2>/dev/null; then - _ft_log "msg='recovered slot from last-known-good snapshot' slot=$last_good_state file=$LAST_GOOD_FILE" - echo "$last_good_state" - return 0 - fi - fi - - # Recovery -- infer from running containers, then nginx config. - local blue_running=false green_running=false recovered_slot="" - docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${BLUE_NAME}$" && blue_running=true || true - docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${GREEN_NAME}$" && green_running=true || true - - if [ "$blue_running" = "true" ] && [ "$green_running" = "false" ]; then - recovered_slot="blue" - _ft_log "msg='recovery: only blue running' slot=blue" - elif [ "$green_running" = "true" ] && [ "$blue_running" = "false" ]; then - recovered_slot="green" - _ft_log "msg='recovery: only green running' slot=green" - elif [ "$blue_running" = "true" ] && [ "$green_running" = "true" ]; then - # Both running -- read nginx upstream container as authoritative tiebreaker. 
- local nginx_upstream - nginx_upstream=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") - if [ "$nginx_upstream" = "api-blue" ]; then recovered_slot="blue" - elif [ "$nginx_upstream" = "api-green" ]; then recovered_slot="green" - else - recovered_slot="blue" - _ft_log "level=WARN msg='both containers running and nginx upstream ambiguous, defaulting to blue' nginx_upstream=${nginx_upstream}" - fi - _ft_log "msg='recovery: both containers running, nginx tiebreaker' nginx_upstream=${nginx_upstream} slot=${recovered_slot}" - else - # Neither running -- first deploy. - recovered_slot="green" - _ft_log "msg='recovery: no containers running, assuming first deploy' slot=green" - fi - - # Validate before writing -- recovered_slot must be blue or green. - # (_ft_validate_slot logs the error; we just fail the subshell.) - _ft_validate_slot "$recovered_slot" || return 1 - - # Persist the recovered value (atomic write). - local slot_tmp - slot_tmp=$(mktemp "${SLOT_DIR}/active-slot.XXXXXX") - printf '%s\n' "$recovered_slot" > "$slot_tmp" - mv "$slot_tmp" "$ACTIVE_SLOT_FILE" - _ft_log "msg='slot file recreated (atomic)' slot=$recovered_slot" - echo "$recovered_slot" -} - -# --------------------------------------------------------------------------- -# ACQUIRE DEPLOYMENT LOCK -# --------------------------------------------------------------------------- -_ft_acquire_lock - -# --------------------------------------------------------------------------- -# PRE-FLIGHT: load environment + validate contract -# --------------------------------------------------------------------------- -_ft_state "PRE_FLIGHT" "msg='loading and validating environment'" - -# Log last-known-good state for faster triage -_LAST_GOOD=$(cat "$LAST_GOOD_FILE" 2>/dev/null || echo "none") -_ft_log "msg='startup recovery info' last_good=$_LAST_GOOD" - -# Disable xtrace while sourcing .env to prevent secrets in logs. 
-set +x -source "$SCRIPT_DIR/load-env.sh" -if [ "${DEBUG:-false}" = "true" ]; then set -x; fi - -# DEPLOY_ROOT is now exported by load-env.sh. -DEPLOY_HISTORY="$DEPLOY_ROOT/.deploy_history" - -_ft_log "msg='environment loaded' api_hostname=$API_HOSTNAME" - -set +x -"$SCRIPT_DIR/validate-env.sh" --check-monitoring -if [ "${DEBUG:-false}" = "true" ]; then set -x; fi -# Harden monitoring env file permissions on every deploy (defense-in-depth). -chmod 600 "$DEPLOY_ROOT/infra/.env.monitoring" 2>/dev/null || true - -_ft_log "msg='env contract validated'" - -# Ensure api_network exists (idempotent). All containers MUST be on this network. -docker network create --driver bridge "$NETWORK" 2>/dev/null \ - && _ft_log "msg='api_network created'" \ - || _ft_log "msg='api_network already exists'" - -# GLOBAL PORT-LEAK GUARD -- api-blue/api-green MUST NOT bind host ports. -# All API traffic flows: Cloudflare → nginx (binds 80/443) → api_network. -# nginx is exempt; api containers with host ports bypass the nginx layer -# and would expose the API without TLS or rate-limiting. -_API_PORT_LEAKS=$(docker ps --format '{{.Names}} {{.Ports}}' 2>/dev/null \ - | grep -E '^api-(blue|green)' \ - | grep -E '(0\.0\.0\.0:|127\.0\.0\.1:)[0-9]+->') || true -if [ -n "${_API_PORT_LEAKS:-}" ]; then - _ft_log "level=ERROR msg='API container has host port bindings — forbidden. Remove and recreate without -p.' leaks=${_API_PORT_LEAKS}" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=api_port_leak_detected" -fi -unset _API_PORT_LEAKS -_ft_log "msg='port-leak guard passed — no API containers with host port bindings'" - -# NGINX CONTAINER GUARD -- nginx MUST run as a Docker container on api_network. -# With container-name upstreams (server api-blue:3000), Docker's embedded DNS -# (127.0.0.11) is required for name resolution. This only works from WITHIN -# Docker containers on the same network -- not from a host systemd nginx service. 
-# -# BOOTSTRAP MODE: If nginx is missing, start it via docker compose --no-deps so -# the monitoring dependency chain (nginx→grafana→prometheus→alertmanager) does -# NOT block a first-deploy. nginx starts immediately; monitoring catches up. -if ! docker inspect nginx >/dev/null 2>&1; then - _ft_log "msg='nginx container missing — bootstrapping via docker compose --no-deps'" - mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" - # Write a bootstrap config pointing at api-blue (default first-deploy slot) - # so nginx can start without waiting for an API container. - if [ ! -f "$NGINX_CONF" ]; then - # Permission check: ensure deploy user can write to nginx live dir - if [ ! -w "$(dirname "$NGINX_CONF")" ]; then - sudo chown -R "$(id -un):$(id -gn)" "$(dirname "$NGINX_CONF")" - fi - _NGINX_GUARD_TMP="$(mktemp /tmp/api-nginx-guard.XXXXXX.conf)" - sed \ - -e "s|__ACTIVE_CONTAINER__|api-blue|g" \ - -e "s|__API_HOSTNAME__|${API_HOSTNAME}|g" \ - "$NGINX_TEMPLATE" > "$_NGINX_GUARD_TMP" - mv "$_NGINX_GUARD_TMP" "$NGINX_CONF" - _ft_log "msg='bootstrap nginx config written (atomic)' target=api-blue path=$NGINX_CONF" - fi - # Kill any ghost docker-proxy holdind host ports before starting nginx - pkill docker-proxy 2>/dev/null || true - cd "$DEPLOY_ROOT/infra" - _COMPOSE_OUT=$(docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml \ - up -d --no-deps nginx 2>&1) || { - printf '%s\n' "$_COMPOSE_OUT" >&2 - _ft_log "level=ERROR msg='docker compose up --no-deps nginx failed'" - cd "$DEPLOY_ROOT" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_bootstrap_compose_failed" - } - unset _COMPOSE_OUT - cd "$DEPLOY_ROOT" - # Wait up to 30 s for the nginx container to become available - _NGINX_STARTED=false - for _ni in $(seq 1 10); do - if docker inspect nginx >/dev/null 2>&1; then - _ft_log "msg='nginx bootstrap complete' attempt=$_ni" - _NGINX_STARTED=true - break - fi - sleep 3 - done - if [ "$_NGINX_STARTED" != "true" ]; then - _ft_log "level=ERROR msg='nginx container 
failed to start after bootstrap'" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_bootstrap_timeout" - fi - unset _NGINX_STARTED _ni -fi -_NGINX_NETWORK=$(docker inspect nginx --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") -if ! echo "$_NGINX_NETWORK" | grep -q "$NETWORK"; then - _ft_log "level=ERROR msg='nginx container not on api_network -- container DNS will fail' networks=${_NGINX_NETWORK}" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_not_on_api_network networks=${_NGINX_NETWORK}" -fi -unset _NGINX_NETWORK -_ft_log "msg='nginx container guard passed' container=nginx network=$NETWORK" - -# Ensure nginx live and backup directories exist (deploy user owns them) -mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" - -# --------------------------------------------------------------------------- -# PREFLIGHT CHECK (policy=warn: missing preflight logs a warning, does not abort) -# --------------------------------------------------------------------------- -if [ "$PREFLIGHT_STRICT" = "true" ]; then - [ -x "$SCRIPT_DIR/preflight.sh" ] || _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=preflight_missing_strict_mode path=$SCRIPT_DIR/preflight.sh" - _ft_state "PREFLIGHT" "msg='running preflight checks (STRICT mode)'" - if ! "$SCRIPT_DIR/preflight.sh" 2>&1; then - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=preflight_failed_strict_mode" - fi - _ft_log "msg='preflight checks passed (strict mode)'" -elif [ -x "$SCRIPT_DIR/preflight.sh" ]; then - _ft_state "PREFLIGHT" "msg='running preflight checks'" - if ! 
"$SCRIPT_DIR/preflight.sh" 2>&1; then - _ft_log "level=ERROR msg='preflight checks failed -- aborting deploy'" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=preflight_failed" - fi - _ft_log "msg='preflight checks passed'" -else - _ft_log "level=WARN msg='preflight.sh not found or not executable -- continuing (policy=warn)' path=$SCRIPT_DIR/preflight.sh" -fi - -# --------------------------------------------------------------------------- -# DEPLOY METADATA -- structured log emitted once per deploy for observability -# --------------------------------------------------------------------------- -_ft_log "msg='deploy metadata' sha=$IMAGE_SHA image=$IMAGE script_dir=$SCRIPT_DIR repo_dir=$REPO_DIR app_env=${APP_ENV:-unset}" - -# --------------------------------------------------------------------------- -# [1/7] PULL IMAGE -# --------------------------------------------------------------------------- -_ft_state "PULL_IMAGE" "msg='pulling container image' sha=$IMAGE_SHA" -_ft_phase_start "PULL_IMAGE" - -# Explicit pull with hard error. -# Without this guard a missing image would cause docker run to attempt a -# background pull inside a 60-s timeout, racing the readiness loop. -if ! run timeout 120 docker pull "$IMAGE"; then - _ft_log "level=ERROR msg='image pull failed' image=$IMAGE" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=image_pull_failed image=$IMAGE" -fi -_ft_log "msg='image pulled' image=$IMAGE" -_ft_phase_end "PULL_IMAGE" - -# --------------------------------------------------------------------------- -# BOOTSTRAP GUARD -- no API containers exist (first deploy or full restart) -# -# When no api-blue or api-green containers are present, the normal slot -# recovery path works but is implicit. This guard makes first-deploy -# explicit: start api-blue directly, wait for readiness, write nginx config, -# write slot file, and exit cleanly with BOOTSTRAP_SUCCESS. 
-# -# WHY THIS IS NECESSARY: -# - nginx starts (via the guard above) with bootstrap config pointing at api-blue -# - Without this guard, nginx is serving 502 until the normal START_INACTIVE -# path eventually starts api-blue. This can be 30-60s of errors. -# - Explicit bootstrap gives a deterministic, logged, traceable first-deploy. -# -# SKIPPED when any api container already exists (normal redeploy path). -# --------------------------------------------------------------------------- -if ! docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then - _ft_state "BOOTSTRAP" "msg='no api containers found — first deploy, starting api-blue directly'" - - # Remove stale container if left in a stopped state somehow - docker rm -f api-blue 2>/dev/null || true - - _CID=$(timeout 60 docker run -d \ - --name api-blue \ - --network "$NETWORK" \ - --restart unless-stopped \ - --label "api.sha=$IMAGE_SHA" \ - --label "api.slot=blue" \ - --label "api.deploy_id=$DEPLOY_ID" \ - --env-file "$ENV_FILE" \ - "$IMAGE" 2>&1) || { - printf '%s\n' "$_CID" >&2 - _ft_error "msg='bootstrap: container start failed' name=api-blue" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=bootstrap_container_start_failed" - } - unset _CID - - _ft_log "msg='bootstrap: api-blue started' image=$IMAGE" - - # Grace window: give the process time to bind and initialise workers. - # /ready can lag the HTTP server bind by ~1–3 s while workers start. - sleep 2 - - # Bootstrap readiness: use docker run (works with distroless containers). 
- _BOOT_OK=false - for _bi in $(seq 1 20); do - if docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" -sf --max-time 4 "http://api-blue:${APP_PORT}/ready" >/dev/null 2>&1; then - _ft_log "msg='bootstrap: api-blue ready' attempt=$_bi" - _BOOT_OK=true - break - fi - [ $((_bi % 10)) -eq 0 ] && _ft_log "msg='bootstrap: still waiting for api-blue readiness' attempt=$_bi/20" - sleep 2 - done - - if [ "$_BOOT_OK" != "true" ]; then - _ft_log "level=ERROR msg='bootstrap: api-blue did not become ready after 60s — container PRESERVED for debugging'" - # DO NOT remove the container on bootstrap failure: - # - Preserves logs and state for post-mortem: docker logs api-blue - # - Removing here loses all debugging visibility - # - Operator can inspect and restart manually - docker logs api-blue --tail 50 >&2 || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=bootstrap_api_ready_timeout" - fi - unset _bi _BOOT_OK - - # Write nginx config pointing at api-blue (same sed logic as SWITCH_NGINX) - mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" - NGINX_BOOT_TMP="$(mktemp /tmp/api-nginx-boot.XXXXXX.conf)" - sed \ - -e "s|__ACTIVE_CONTAINER__|api-blue|g" \ - -e "s|__API_HOSTNAME__|${API_HOSTNAME}|g" \ - "$NGINX_TEMPLATE" > "$NGINX_BOOT_TMP" - cp "$NGINX_BOOT_TMP" "$NGINX_CONF" - rm -f "$NGINX_BOOT_TMP" - - # Nginx network attachment guard — must be on api_network before reload. - _NGINX_BOOT_NET=$(docker inspect nginx \ - --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") - if ! echo "$_NGINX_BOOT_NET" | grep -q "$NETWORK"; then - _ft_log "level=ERROR msg='bootstrap: nginx not attached to api_network' networks=${_NGINX_BOOT_NET}" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_network_mismatch_bootstrap" - fi - unset _NGINX_BOOT_NET - - # Fail-fast: any nginx test/reload failure is a hard error at bootstrap. 
- _NGINX_TEST_OUT=$(docker exec nginx nginx -t 2>&1) || { - printf '%s\n' "$_NGINX_TEST_OUT" >&2 - _ft_log "level=ERROR msg='bootstrap: nginx config test failed'" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_config_test_failed_bootstrap" - } - unset _NGINX_TEST_OUT - docker exec nginx nginx -s reload >/dev/null 2>&1 \ - || { _ft_log "level=ERROR msg='bootstrap: nginx reload failed'"; _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_reload_failed_bootstrap"; } - _ft_log "msg='bootstrap: nginx reloaded to api-blue'" - - # Persist slot state (atomic write already in _ft_write_slot) - _ft_write_slot "blue" - - # Snapshot last-known-good - _SNAP_BOOT_TMP=$(mktemp "${SNAP_DIR}/last-good.XXXXXX") - printf 'slot=blue container=api-blue ts=%s\n' "$(date -Iseconds)" > "$_SNAP_BOOT_TMP" - mv "$_SNAP_BOOT_TMP" "$LAST_GOOD_FILE" - unset _SNAP_BOOT_TMP - - _ft_exit 0 "BOOTSTRAP_SUCCESS" "slot=blue image=$IMAGE" -fi - -# --------------------------------------------------------------------------- -# [2/7] RESOLVE ACTIVE SLOT (with recovery) -# --------------------------------------------------------------------------- -_ft_state "RESOLVE_SLOT" "msg='determining active slot'" - -ACTIVE=$(_ft_resolve_slot) || { - _ft_log "level=ERROR msg='_ft_resolve_slot failed or exited non-zero -- cannot continue safely'" - exit 1 -} -ACTIVE=$(printf '%s' "$ACTIVE" | tr -d '[:space:]') -_ft_validate_slot "$ACTIVE" || exit 1 - -# SLOT REPAIR — heal slot file drift from reality. -# If the slot file says "green" but api-green is gone (OOM/manual removal), -# flip the effective slot to whatever container IS actually running. -# This prevents a deploy from treating a missing container as the "active" one. -if [ "$ACTIVE" = "green" ] && ! docker inspect api-green >/dev/null 2>&1; then - _ft_log "msg='slot repair: green missing — switching effective slot to blue' original_slot=green" - ACTIVE="blue" - _ft_write_slot "blue" -elif [ "$ACTIVE" = "blue" ] && ! 
docker inspect api-blue >/dev/null 2>&1; then - # Both containers may be missing on a clean restart; this is ok — the - # BOOTSTRAP GUARD above will catch it. Here we only switch when the - # opposite slot is actually running. - if docker inspect api-green >/dev/null 2>&1; then - _ft_log "msg='slot repair: blue missing but green running — switching effective slot to green' original_slot=blue" - ACTIVE="green" - _ft_write_slot "green" - else - _ft_log "level=WARN msg='slot repair: neither container running — first deploy or crash; slot kept as blue'" - fi -fi -_ft_validate_slot "$ACTIVE" || exit 1 - -if [ "$ACTIVE" = "blue" ]; then - ACTIVE_NAME=$BLUE_NAME - INACTIVE="green"; INACTIVE_NAME=$GREEN_NAME -else - ACTIVE_NAME=$GREEN_NAME - INACTIVE="blue"; INACTIVE_NAME=$BLUE_NAME -fi - -_ft_log "msg='slot resolved' active=$ACTIVE active_name=$ACTIVE_NAME inactive=$INACTIVE inactive_name=$INACTIVE_NAME" - -# --------------------------------------------------------------------------- -# ACTIVE CONTAINER EXISTENCE GUARD -# Protect against race: active slot file says "blue" but container doesn't exist. -# This catches crash/OOM scenarios before any deploy logic runs. -# --------------------------------------------------------------------------- -if docker ps -a --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then - if ! 
docker inspect "$ACTIVE_NAME" >/dev/null 2>&1; then - _ft_log "level=ERROR msg='active container listed by docker ps but inspect failed -- possible race' container=$ACTIVE_NAME" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=active_container_inspect_race container=$ACTIVE_NAME" - fi - _ft_log "msg='active container existence guard passed' container=$ACTIVE_NAME" -else - _ft_log "level=WARN msg='active container not running (first deploy or crash recovery)' container=$ACTIVE_NAME" -fi - -# --------------------------------------------------------------------------- -# IDEMPOTENCY GUARD -- skip deploy if this exact SHA is already the active container -# --------------------------------------------------------------------------- -_ft_state "IDEMPOTENCY" "msg='checking if target SHA already deployed' sha=$IMAGE_SHA" - -_RUNNING_IMAGE=$(docker inspect --format '{{.Config.Image}}' "$ACTIVE_NAME" 2>/dev/null || echo "") -if [ "$_RUNNING_IMAGE" = "$IMAGE" ]; then - # In-network health check: exercises Docker DNS + bridge routing. 
- _IDEMPOTENT_HEALTH=$(_ft_net_curl_out "$ACTIVE_NAME" \ - -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/ready") - if echo "$_IDEMPOTENT_HEALTH" | grep -q '"status":"ready"' 2>/dev/null; then - _ft_log "msg='target SHA already running and healthy -- nothing to do' container=$ACTIVE_NAME image=$IMAGE" - _ft_final_state "$ACTIVE_NAME" "$IMAGE_SHA" - _ft_github_summary "✅ IDEMPOTENT (no change)" "$ACTIVE_NAME" "$IMAGE_SHA" "SHA already deployed" - _ft_exit 0 "DEPLOY_SUCCESS" "reason=idempotent_noop sha=$IMAGE_SHA container=$ACTIVE_NAME" - else - _ft_log "msg='idempotent SHA match but active container not healthy -- proceeding with deploy' container=$ACTIVE_NAME" - fi - unset _IDEMPOTENT_HEALTH - else - _ft_log "msg='SHA differs from running image -- proceeding' running=${_RUNNING_IMAGE:-none} target=$IMAGE" - fi - unset _RUNNING_IMAGE - -# --------------------------------------------------------------------------- -# [3/7] START INACTIVE CONTAINER -# --------------------------------------------------------------------------- -_ft_state "START_INACTIVE" "msg='starting inactive container' name=$INACTIVE_NAME" - -if docker ps -a --format '{{.Names}}' | grep -Eq "^${INACTIVE_NAME}$"; then - _ft_log "msg='renaming stale container for audit trail' name=$INACTIVE_NAME" - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - # Rename instead of hard-rm so a post-mortem can inspect the old container - # state. The -old- suffix lets the zombie purge below collect it. 
- _STALE_TS=$(date +%s) - docker rename "$INACTIVE_NAME" "${INACTIVE_NAME}-old-${_STALE_TS}" 2>/dev/null \ - || docker rm "$INACTIVE_NAME" -fi - -_CID=$(timeout 60 docker run -d \ - --name "$INACTIVE_NAME" \ - --network "$NETWORK" \ - --restart unless-stopped \ - --label "api.sha=$IMAGE_SHA" \ - --label "api.slot=$INACTIVE" \ - --label "api.deploy_id=$DEPLOY_ID" \ - --env-file "$ENV_FILE" \ - "$IMAGE" 2>&1) || { - printf '%s\n' "$_CID" >&2 - _ft_error "msg='container start failed' name=$INACTIVE_NAME" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=container_start_failed name=$INACTIVE_NAME" -} -unset _CID - -_ft_log "msg='container started' name=$INACTIVE_NAME" - -# IMAGE IMMUTABILITY CHECK -- confirm running container image matches target SHA. -_ACTUAL_IMAGE=$(docker inspect --format '{{.Config.Image}}' "$INACTIVE_NAME" 2>/dev/null || echo "") -if [ "$_ACTUAL_IMAGE" != "$IMAGE" ]; then - _ft_log "level=ERROR msg='image immutability check failed: running image does not match target' expected=$IMAGE actual=${_ACTUAL_IMAGE:-unknown}" - docker logs "$INACTIVE_NAME" --tail 50 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=image_immutability_check_failed expected=$IMAGE actual=${_ACTUAL_IMAGE:-unknown}" -fi -_ft_log "msg='image immutability check passed' image=$_ACTUAL_IMAGE" -unset _ACTUAL_IMAGE -_ft_log "msg='phase_complete' state=START_INACTIVE status=success container=$INACTIVE_NAME" -# [4/7] INTERNAL HEALTH CHECK -# Uses /ready to validate Redis, Supabase, and BullMQ before traffic switch. 
-# --------------------------------------------------------------------------- -_ft_state "HEALTH_CHECK_INTERNAL" "msg='waiting for container readiness'" - -sleep 5 -HEALTH_ENDPOINT="/ready" - -# CONNECTIVITY PRE-CHECK (in-network) -# Probe /health via a short-lived curl container on api_network to verify: -# - Docker DNS resolution of $INACTIVE_NAME -# - Bridge routing to the container -# - HTTP server is bound and responding -# This exercises the same network path nginx uses, catching issues that -# docker exec localhost would silently skip. -_CONN_ATTEMPTS=0 -_CONN_OK=false -while [ "$_CONN_ATTEMPTS" -lt 5 ]; do - _CONN_ATTEMPTS=$((_CONN_ATTEMPTS + 1)) - if _ft_net_curl "$INACTIVE_NAME" \ - -sf --max-time 3 "http://$INACTIVE_NAME:$APP_PORT/health"; then - _CONN_OK=true - break - fi - sleep 2 -done -if [ "$_CONN_OK" = "false" ]; then - _ft_log "level=ERROR msg='container not reachable after connectivity pre-check' container=$INACTIVE_NAME" - docker logs "$INACTIVE_NAME" --tail 100 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_log "msg='active container still serving -- deploy failed non-destructively' container=$ACTIVE_NAME" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=container_not_reachable container=$INACTIVE_NAME" -fi -unset _CONN_ATTEMPTS _CONN_OK -_ft_log "msg='connectivity pre-check passed' container=$INACTIVE_NAME" - -ATTEMPT=0 -until true; do - ATTEMPT=$((ATTEMPT + 1)) - STATUS=$(_ft_net_curl_out "$INACTIVE_NAME" \ - --max-time 4 -s -o /dev/null -w "%{http_code}" \ - "http://$INACTIVE_NAME:$APP_PORT${HEALTH_ENDPOINT}" || echo "000") - - if [ "$STATUS" = "200" ]; then - _ft_log "msg='internal health check passed' endpoint=$HEALTH_ENDPOINT attempts=$ATTEMPT" - break - fi - - if ! 
docker ps --format '{{.Names}}' | grep -q "^${INACTIVE_NAME}$"; then - _ft_log "level=ERROR msg='container exited unexpectedly' name=$INACTIVE_NAME" - docker logs "$INACTIVE_NAME" --tail 100 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_log "msg='active container still serving -- deploy failed non-destructively' container=$ACTIVE_NAME" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=new_container_crashed" - fi - - if [ "$ATTEMPT" -ge "$MAX_HEALTH_ATTEMPTS" ]; then - _ft_log "level=ERROR msg='internal health check timed out' attempts=$ATTEMPT status=$STATUS endpoint=http://$INACTIVE_NAME:$APP_PORT${HEALTH_ENDPOINT}" - docker logs "$INACTIVE_NAME" --tail 100 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_log "msg='active container still serving -- deploy failed non-destructively' container=$ACTIVE_NAME" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=new_container_health_timeout attempts=$ATTEMPT" - fi - - # Only log progress every 10 attempts to avoid spamming; failure threshold logs always appear above - [ $((ATTEMPT % 10)) -eq 0 ] && _ft_log "msg='still waiting for readiness' attempt=$ATTEMPT/$MAX_HEALTH_ATTEMPTS status=$STATUS" - # Add up to 2s of jitter to prevent synchronized retries under contention. - sleep $((HEALTH_INTERVAL + RANDOM % 3)) -done - -_ft_log "msg='phase_complete' phase=HEALTH_CHECK_INTERNAL status=success container=$INACTIVE_NAME" -_ft_phase_end "HEALTH_CHECK_INTERNAL" - -# --------------------------------------------------------------------------- -# DOCKER HEALTH GATE -# Ensures the container's HEALTHCHECK has settled to "healthy" before -# switching nginx. Prevents routing to a container that is "starting". -# --------------------------------------------------------------------------- -if ! 
_ft_wait_docker_health "$INACTIVE_NAME"; then - docker logs "$INACTIVE_NAME" --tail 50 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=docker_health_failed container=$INACTIVE_NAME" -fi - -# STABILIZATION DELAY -- brief pause after docker health gate to let -# any in-flight connection setup settle (TLS session init, worker warm-up). -_ft_log "msg='stabilization delay' container=$INACTIVE_NAME" -sleep 3 - -# PRE-SWITCH CONNECTIVITY CHECK -# Direct in-network probe of the new container BEFORE touching nginx. -# Validates Docker DNS resolution + bridge routing work for the new container -# one final time with a clean, fresh curl invocation. -if ! docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \ - -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready" >/dev/null 2>&1; then - _ft_error "msg='pre-switch connectivity check failed' container=$INACTIVE_NAME" - docker logs "$INACTIVE_NAME" --tail 50 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=pre_switch_connectivity_failed container=$INACTIVE_NAME" -fi -_ft_log "msg='pre-switch connectivity check passed' container=$INACTIVE_NAME" -# --------------------------------------------------------------------------- -_ft_state "SWITCH_NGINX" "msg='switching nginx upstream' container=$INACTIVE_NAME" - -# Deterministic stabilization window: give the new container a moment before -# switching nginx (complements the jitter already in the health check loop). -sleep 2 - -# Backup stored in NGINX_BACKUP_DIR (under the repo) — consistent with the -# pruning logic below. Avoids creating files in /etc/nginx/ (host-side) -# which is not guaranteed to exist when nginx runs only inside Docker. 
-mkdir -p "$NGINX_BACKUP_DIR" -NGINX_BACKUP="$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" -NGINX_TMP="$(mktemp /tmp/api-nginx.XXXXXX.conf)" - -# PRE-RELOAD GATE (in-network with fallback): confirm container is still ready -# before pointing nginx at it. -if ! _ft_net_curl "$INACTIVE_NAME" \ - -sf --max-time 4 "http://$INACTIVE_NAME:$APP_PORT/ready"; then - _ft_log "level=ERROR msg='pre-reload gate failed: container not ready' container=$INACTIVE_NAME" - docker logs "$INACTIVE_NAME" --tail 50 >&2 || true - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=pre_reload_gate_failed container=$INACTIVE_NAME" -fi -_ft_log "msg='pre-reload gate passed' container=$INACTIVE_NAME" - -sed \ - -e "s|__ACTIVE_CONTAINER__|$INACTIVE_NAME|g" \ - -e "s|__API_HOSTNAME__|$API_HOSTNAME|g" \ - "$NGINX_TEMPLATE" > "$NGINX_TMP" - -cp "$NGINX_CONF" "$NGINX_BACKUP" -cp "$NGINX_TMP" "$NGINX_CONF" -rm -f "$NGINX_TMP" -# Prune old backups (keep last 5) to avoid unbounded growth -ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | tail -n +6 | xargs rm -f 2>/dev/null || true - -# Nginx network attachment guard: verify nginx is on api_network before every -# reload. If nginx was accidentally disconnected, Docker DNS resolution of -# api-blue/api-green will silently fail inside nginx. -_NGINX_RELOAD_NET=$(docker inspect nginx \ - --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") -if ! 
echo "$_NGINX_RELOAD_NET" | grep -q "$NETWORK"; then - _ft_log "level=ERROR msg='nginx not attached to api_network at reload time' networks=${_NGINX_RELOAD_NET}" - cp "$NGINX_BACKUP" "$NGINX_CONF" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_network_mismatch" -fi -unset _NGINX_RELOAD_NET - -_NGINX_TEST_OUT=$(docker exec nginx nginx -t 2>&1) || { - printf '%s\n' "$_NGINX_TEST_OUT" >&2 - _ft_log "level=ERROR msg='nginx config test failed -- restoring backup'" - cp "$NGINX_BACKUP" "$NGINX_CONF" - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_config_test_failed" -} -unset _NGINX_TEST_OUT -docker exec nginx nginx -s reload >/dev/null 2>&1 \ - || { cp "$NGINX_BACKUP" "$NGINX_CONF"; _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_reload_failed"; } -_ft_log "msg='nginx reloaded' upstream=$INACTIVE_NAME:$APP_PORT" - -# Upstream sanity check -- confirm nginx config actually points at the new container. -# Catches template substitution failures before traffic is affected. -# Upstream sanity: live config must contain http://INACTIVE_NAME:3000 (set $api_backend format) -_RELOAD_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") -if [ "$_RELOAD_CONTAINER" != "$INACTIVE_NAME" ]; then - _ft_log "level=ERROR msg='nginx upstream sanity check failed after reload' expected=$INACTIVE_NAME actual=${_RELOAD_CONTAINER:-unreadable}" - cp "$NGINX_BACKUP" "$NGINX_CONF" - docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1 || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_upstream_mismatch expected=$INACTIVE_NAME actual=${_RELOAD_CONTAINER:-unreadable}" -fi -unset _RELOAD_CONTAINER -_ft_log "msg='nginx upstream sanity check passed' container=$INACTIVE_NAME" -_ft_log "msg='phase_complete' phase=SWITCH_NGINX status=success container=$INACTIVE_NAME" -_ft_phase_end "SWITCH_NGINX" - -# Write the slot file AFTER nginx reload so it always reflects what nginx -# is 
currently serving. If the public health check then fails and we roll -# back, we restore nginx AND overwrite this file back to $ACTIVE. -_ft_write_slot "$INACTIVE" - -# Observability hook — log the traffic switch for monitoring/tracking -_ft_log "msg='TRAFFIC_SWITCH' active=$INACTIVE_NAME sha=$IMAGE_SHA deploy_id=$DEPLOY_ID" - -# Nginx warm-up delay — prevents race condition where reload completes before -# upstream connections are fully established and TLS sessions negotiated. -# Longer than typical TLS handshake + connection setup. -sleep $((RANDOM % 3 + 5)) - -# POST-SWITCH ROUTING VERIFICATION (in-network) -# Run a short-lived curl container on api_network to probe nginx/health. -# This exercises: Docker DNS resolution of 'nginx', bridge routing nginx→container, -# nginx upstream substitution, and proxy-pass to $INACTIVE_NAME:$APP_PORT. -# Same network path that real client traffic takes after the slot switch. -_ft_log "msg='post-switch nginx routing verification (in-network)'" -_POST_SWITCH_OK=false -for _ps in 1 2 3 4 5; do - if docker run --rm --network api_network curlimages/curl:8.7.1 \ - -sfk --max-time 5 "https://nginx/health" >/dev/null 2>&1; then - _POST_SWITCH_OK=true - break - fi - sleep $((RANDOM % 2 + 2)) -done -if [ "$_POST_SWITCH_OK" != "true" ]; then - _ft_error "msg='post-switch routing verification failed — nginx cannot reach new container'" - _ft_error "msg='ROLLBACK triggered → restoring $ACTIVE_NAME (post-switch restore)'" - _ft_snapshot - cp "$NGINX_BACKUP" "$NGINX_CONF" - if docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1; then - _ft_log "msg='nginx restored (post-switch routing failure)'" - else - _ft_log "level=ERROR msg='nginx restore failed during post-switch rollback'" - fi - _ft_write_slot "$ACTIVE" - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_routing_failed 
container=$INACTIVE_NAME" -fi -unset _POST_SWITCH_OK _ps -_ft_log "msg='post-switch routing verification passed'" - -# POST-SWITCH UPSTREAM VERIFICATION -# Directly probe the new container via its in-network address after nginx -# has confirmed routing. Ensures the upstream backend itself is still -# responding — nginx routing healthy does NOT imply backend healthy. -if ! docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \ - -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready" >/dev/null 2>&1; then - _ft_error "msg='post-switch upstream verification failed' container=$INACTIVE_NAME" - _ft_snapshot - cp "$NGINX_BACKUP" "$NGINX_CONF" - if docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1; then - _ft_log "msg='nginx restored (post-switch upstream failure)'" - else - _ft_log "level=ERROR msg='nginx restore failed during upstream verification rollback'" - fi - _ft_write_slot "$ACTIVE" - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_upstream_failed container=$INACTIVE_NAME" -fi -_ft_log "msg='post-switch upstream verification passed' container=$INACTIVE_NAME" - -# --------------------------------------------------------------------------- -# [6/7] PUBLIC HEALTH CHECK (end-to-end nginx routing) -# Validates: -# 1. HTTP 200 -- nginx routing, TLS, Host header matching -# 2. Body "status":"ready" -- backend /ready endpoint, external services -# 3. Container alignment -- live nginx config points at $INACTIVE_NAME -# -# NOTE: Uses localhost (127.0.0.1) + Host header to validate nginx routing -# while avoiding Cloudflare IP allowlist block (see _ft_check_external_ready). -# --------------------------------------------------------------------------- -_ft_state "HEALTH_CHECK_PUBLIC" "msg='validating nginx routing + backend health (localhost)' host=$API_HOSTNAME" - -# Give nginx a moment to apply the reloaded config cleanly. 
-sleep 3 - -_PUB_PASSED=false -_PUB_STATUS="000" - -# Public health check — single source of truth via docker network -# HTTPS with -k because nginx redirects HTTP to HTTPS -# -f: fail on 4xx/5xx so HTML error pages never match the grep -if docker run --rm --network api_network curlimages/curl:8.7.1 \ - -sfk --max-time 10 "https://nginx/health" 2>/dev/null | grep -q '"status":"ok"'; then - _PUB_PASSED=true - _PUB_STATUS="200" - _ft_log "msg='public health check passed' container=$INACTIVE_NAME" -else - _PUB_PASSED=false - _PUB_STATUS="000" - _ft_log "msg='public health check failed' container=$INACTIVE_NAME" -fi - -# Container alignment check -- live nginx config MUST contain http://INACTIVE_NAME:3000. -_NGINX_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") -if [ -n "$_NGINX_CONTAINER" ] && [ "$_NGINX_CONTAINER" != "$INACTIVE_NAME" ]; then - _ft_log "level=ERROR msg='nginx container mismatch -- slot switch did not take effect' expected=$INACTIVE_NAME actual=$_NGINX_CONTAINER" - _PUB_PASSED=false -fi - -if [ "$_PUB_PASSED" != "true" ]; then - _ft_state "ROLLBACK" "reason='public health check failed' status=$_PUB_STATUS" - _ft_snapshot - - _ft_log "msg='restoring previous nginx config'" - cp "$NGINX_BACKUP" "$NGINX_CONF" - if docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1; then - _ft_log "msg='nginx restored to previous config'" - else - _ft_log "level=ERROR msg='nginx restore failed -- check manually'" - fi - - # Restore slot file to the slot that was active before this deploy attempt. 
- _ft_write_slot "$ACTIVE" - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - - unset _PUB_PASSED _attempt _PUB_STATUS _PUB_BODY _NGINX_CONTAINER - - if docker ps --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then - _ACTIVE_HEALTH=$(_ft_net_curl_out "$ACTIVE_NAME" \ - -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/ready") - if echo "$_ACTIVE_HEALTH" | grep -q '"status":"ready"' 2>/dev/null; then - _ft_log "msg='deploy failed but active container healthy -- skipping rollback' container=$ACTIVE_NAME" - unset _ACTIVE_HEALTH - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=public_health_check_failed active_container_healthy=true" - fi - unset _ACTIVE_HEALTH - _ft_log "msg='active container running but NOT healthy -- treating as degraded, rollback needed' container=$ACTIVE_NAME" - fi - - _ft_log "msg='system degraded -- triggering rollback' container=$ACTIVE_NAME" - if [ "${API_ROLLBACK_IN_PROGRESS:-0}" != "1" ]; then - _ft_log "msg='triggering image rollback to previous stable SHA'" - _ft_error "msg='ROLLBACK triggered → restoring $ACTIVE_NAME'" - export API_ROLLBACK_IN_PROGRESS=1 - _ft_release_lock - if ! 
"$SCRIPT_DIR/rollback.sh" --auto; then - _ft_snapshot - _ft_exit 2 "DEPLOY_FAILED_FATAL" "reason=deploy_and_rollback_both_failed" - fi - _ft_exit 1 "DEPLOY_FAILED_ROLLBACK" "reason=public_health_check_failed msg='rollback succeeded, system restored'" - else - _ft_log "msg='nested rollback guard reached -- stopping to prevent infinite loop'" - _ft_exit 1 "DEPLOY_FAILED_FATAL" "reason=nested_rollback_guard" - fi -fi - -unset _PUB_PASSED _PUB_STATUS _NGINX_CONTAINER -_ft_log "msg='public health check passed' container=$INACTIVE_NAME" - -# --------------------------------------------------------------------------- -# [6.5/7] STABILITY_CHECK -- re-verify external endpoint after a settle window -# Catches flapping services that pass the initial check then regress rapidly -# --------------------------------------------------------------------------- -_ft_state "STABILITY_CHECK" "msg='post-switch stability check' settle_seconds=5" -_ft_phase_start "STABILITY_CHECK" - -sleep 5 -_STABLE=false -if _ft_check_external_ready; then - _STABLE=true - _ft_log "msg='stability check passed' url=https://$API_HOSTNAME/ready" - _ft_log "msg='phase_complete' phase=STABILITY_CHECK status=success" - _ft_phase_end "STABILITY_CHECK" -fi - -if [ "$_STABLE" = "false" ]; then - _ft_log "level=ERROR msg='stability check failed -- service regressed after initial pass'" - _ft_snapshot - - # Restore nginx + slot - _ft_log "msg='restoring previous nginx config (stability failure)'" - cp "$NGINX_BACKUP" "$NGINX_CONF" - if docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1; then - _ft_log "msg='nginx restored (stability failure)'" - else - _ft_log "level=ERROR msg='nginx restore failed during stability rollback -- check manually'" - fi - _ft_write_slot "$ACTIVE" - docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true - docker rm "$INACTIVE_NAME" || true - - if docker ps --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then - 
_ACTIVE_HEALTH=$(_ft_net_curl_out "$ACTIVE_NAME" \ - -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/ready") - if echo "$_ACTIVE_HEALTH" | grep -q '"status":"ready"' 2>/dev/null; then - _ft_log "msg='active container healthy after stability failure -- skipping rollback' container=$ACTIVE_NAME" - unset _ACTIVE_HEALTH - _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=public_health_check_failed active_container_healthy=true" - fi - unset _ACTIVE_HEALTH - _ft_log "msg='active container running but NOT healthy after stability failure -- rollback needed'" - fi - - _ft_log "msg='triggering rollback after stability failure'" - if [ "${API_ROLLBACK_IN_PROGRESS:-0}" != "1" ]; then - _ft_error "msg='ROLLBACK triggered → restoring $ACTIVE_NAME'" - export API_ROLLBACK_IN_PROGRESS=1 - _ft_release_lock - if ! "$SCRIPT_DIR/rollback.sh" --auto; then - _ft_snapshot - _ft_exit 2 "DEPLOY_FAILED_FATAL" "reason=stability_check_and_rollback_both_failed" - fi - _ft_exit 1 "DEPLOY_FAILED_ROLLBACK" "reason=stability_check_failed msg='rollback succeeded'" - else - _ft_exit 1 "DEPLOY_FAILED_FATAL" "reason=stability_nested_rollback_guard" - fi -fi -unset _STABLE - -# --------------------------------------------------------------------------- -# [7/7] CLEANUP + SUCCESS -# --------------------------------------------------------------------------- -_ft_state "CLEANUP" "msg='validating active container exists before cleanup' name=$ACTIVE_NAME" - -# ACTIVE CONTAINER GUARD -- handle missing container gracefully (e.g., first deploy or crash) -if ! docker ps --format '{{.Names}}' | grep -q "^$ACTIVE_NAME$"; then - _ft_log "msg='active container missing — treating as first deploy, skipping cleanup' name=$ACTIVE_NAME" - SKIP_CLEANUP=true -else - _ft_log "msg='active container guard passed' name=$ACTIVE_NAME" -fi - -# Graceful shutdown: allow in-flight requests to drain before forcing removal. 
-if [ "${SKIP_CLEANUP:-false}" != "true" ]; then - docker stop --time 10 "$ACTIVE_NAME" 2>/dev/null || true - # Rename instead of hard-rm: keeps the previous-active container available - # for 60 s of post-mortem inspection. The -old- suffix is used by - # the zombie purge block below. - _CLEANUP_TS=$(date +%s) - docker rename "$ACTIVE_NAME" "${ACTIVE_NAME}-old-${_CLEANUP_TS}" 2>/dev/null \ - || docker rm "$ACTIVE_NAME" || true - _ft_log "msg='previous container renamed (graceful)' name=$ACTIVE_NAME rename=${ACTIVE_NAME}-old-${_CLEANUP_TS}" -else - _ft_log "msg='cleanup skipped (first deploy scenario or container already removed)'" -fi - -_ft_state "SUCCESS" "msg='deployment complete' container=$INACTIVE_NAME sha=$IMAGE_SHA slot=$INACTIVE" - -# --------------------------------------------------------------------------- -# FINAL TRUTH CHECK -- verify state matches deployment intent -# Compares internal (localhost) vs external (DNS/Cloudflare) endpoint health -# to catch routing, TLS, and proxy anomalies -# --------------------------------------------------------------------------- -_FT_TRUTH_CHECK_PASSED=true - -# (1) Verify slot file is correctly written -if [ -f "$ACTIVE_SLOT_FILE" ]; then - _SLOT_VALUE=$(cat "$ACTIVE_SLOT_FILE" | tr -d '[:space:]') - if [ "$_SLOT_VALUE" != "$INACTIVE" ]; then - _ft_log "level=ERROR msg='truth check failed: slot file mismatch' expected=$INACTIVE actual=$_SLOT_VALUE" - _FT_TRUTH_CHECK_PASSED=false - else - _ft_log "msg='truth check: slot file correct' slot=$_SLOT_VALUE" - fi -else - _ft_log "level=ERROR msg='truth check failed: slot file missing'" - _FT_TRUTH_CHECK_PASSED=false -fi - -# (2) Verify nginx upstream container matches target (set $api_backend format) -_NGINX_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") -if [ -n "$_NGINX_CONTAINER" ]; then - if [ "$_NGINX_CONTAINER" != "$INACTIVE_NAME" ]; then - _ft_log "level=ERROR msg='truth 
check failed: nginx container mismatch' expected=$INACTIVE_NAME actual=$_NGINX_CONTAINER" - _FT_TRUTH_CHECK_PASSED=false - else - _ft_log "msg='truth check: nginx upstream correct' container=$_NGINX_CONTAINER" - fi -else - _ft_log "level=WARN msg='truth check: could not read nginx upstream'" -fi - -# (3) Compare internal vs external endpoint health -# Internal: direct container endpoint (http://$INACTIVE_NAME:$APP_PORT/ready) -# External: production DNS/Cloudflare (https://$API_HOSTNAME/ready) -# Mismatch indicates routing, TLS, or proxy issues -if command -v curl >/dev/null 2>&1; then - sleep 2 - - # Check internal endpoint via in-network curl with fallback. - _INT_READY=$(_ft_net_curl_out "$INACTIVE_NAME" \ - -s --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready") - _INT_READY_OK=false - if echo "$_INT_READY" | grep -q '"status":"ready"' 2>/dev/null; then - _INT_READY_OK=true - _ft_log "msg='truth check: internal endpoint ready' url=http://$INACTIVE_NAME:$APP_PORT/ready" - else - _ft_log "level=WARN msg='truth check: internal endpoint not ready' url=http://$INACTIVE_NAME:$APP_PORT/ready response=${_INT_READY:0:100}" - fi - - # Check external endpoint via docker network (deterministic, no host routing issues) - # Uses retry + backoff to smooth transient edge jitter - _EXT_READY_OK=false - _EXT_LATENCY_MS=0 - _slo_start=0 - _slo_end=0 - _slo_attempt=0 - for _slo_attempt in 1 2 3; do - _slo_start=$(date +%s%3N) - if docker run --rm --network api_network curlimages/curl:8.7.1 -sk --max-time 3 "https://nginx/health" 2>/dev/null | grep -q '"status":"ok"'; then - _slo_end=$(date +%s%3N) - _EXT_LATENCY_MS=$((_slo_end - _slo_start)) - _EXT_READY_OK=true - break - fi - if [ $_slo_attempt -lt 3 ]; then sleep $((RANDOM % 3 + 5)); fi - done - - if [ "$_EXT_READY_OK" = "true" ]; then - _ft_log "msg='truth check: external endpoint ready (retry succeeded)' url=https://$API_HOSTNAME/ready latency_ms=$_EXT_LATENCY_MS" - # SLO warning: latency threshold (500ms) - if [ 
"$_EXT_LATENCY_MS" -gt 500 ]; then - _ft_log "level=WARN msg='SLO warning: high latency detected on external endpoint' latency_ms=$_EXT_LATENCY_MS threshold_ms=500 url=https://$API_HOSTNAME/ready" - fi - else - _ft_log "level=ERROR msg='truth check: external endpoint not ready after 3 retries' url=https://$API_HOSTNAME/ready" - fi - - # Consistency check: if internal is ready but external is not, something is wrong - # (DNS/Cloudflare/TLS/nginx proxy layer) - if [ "$_INT_READY_OK" = "true" ] && [ "$_EXT_READY_OK" = "false" ]; then - _ft_log "level=ERROR msg='truth check FAILED: internal ready but external not reachable -- nginx/proxy/DNS/TLS issue' int_ok=$_INT_READY_OK ext_ok=$_EXT_READY_OK" - _FT_TRUTH_CHECK_PASSED=false - fi - - # Also fail if both are down (service actually not ready) - if [ "$_INT_READY_OK" = "false" ] || [ "$_EXT_READY_OK" = "false" ]; then - if [ "$_FT_TRUTH_CHECK_PASSED" = "true" ]; then - _ft_log "level=ERROR msg='truth check FAILED: endpoint(s) not returning ready status' int_ok=$_INT_READY_OK ext_ok=$_EXT_READY_OK" - _FT_TRUTH_CHECK_PASSED=false - fi - fi -else - _ft_log "level=WARN msg='truth check: curl not available, skipping endpoint checks'" -fi - -if [ "$_FT_TRUTH_CHECK_PASSED" != "true" ]; then - _ft_state "FAILURE" "reason='post_deployment_truth_check_failed'" - _ft_snapshot - exit 2 -fi - -# Persist last-known-good snapshot for fast recovery triage (atomic write) -_ft_log "msg='recording last-known-good state' slot=$INACTIVE container=$INACTIVE_NAME" -_SNAP_TMP=$(mktemp "${SNAP_DIR}/last-good.XXXXXX") -printf 'slot=%s container=%s ts=%s\n' "$INACTIVE" "$INACTIVE_NAME" "$(date -Iseconds)" > "$_SNAP_TMP" -mv "$_SNAP_TMP" "$LAST_GOOD_FILE" -_ft_log "msg='last-known-good snapshot recorded (atomic)' file=$LAST_GOOD_FILE" - -# Record deployment history (atomic write: temp file then mv). 
-DEPLOY_HISTORY_TMP="${DEPLOY_HISTORY}.tmp.$$" -if [ -f "$DEPLOY_HISTORY" ]; then - (echo "$IMAGE_SHA"; head -n $((MAX_HISTORY - 1)) "$DEPLOY_HISTORY") > "$DEPLOY_HISTORY_TMP" -else - echo "$IMAGE_SHA" > "$DEPLOY_HISTORY_TMP" -fi -mv "$DEPLOY_HISTORY_TMP" "$DEPLOY_HISTORY" -_ft_log "msg='deploy history updated' sha=$IMAGE_SHA" - -# Alertmanager config rendering: always render before monitoring stack operations. -# Alertmanager does NOT support env vars natively; the rendered file must exist -# before docker compose up. This is idempotent and safe to run on every deploy. -bash "$REPO_DIR/infra/scripts/render-alertmanager.sh" -_ft_log "msg='alertmanager config rendered' file=$REPO_DIR/infra/alertmanager/alertmanager.rendered.yml" - -# Monitoring stack: restart only when infra configs have actually changed. -# Hashes cover all infra config files EXCEPT the nginx template (re-rendered on -# every deploy) to avoid spurious monitoring restarts. -MONITORING_HASH=$(find "$REPO_DIR/infra" -readable \ - -not -path "$REPO_DIR/infra/nginx/*" \ - \( -name '*.yml' -o -name '*.yaml' -o -name '*.conf' -o -name '*.toml' -o -name '*.json' \) \ - | sort | xargs -r sha256sum 2>/dev/null | sha256sum | cut -d' ' -f1 || echo "changed") -MONITORING_HASH_FILE="$HOME/.api-monitoring-hash" - -if [ -f "$MONITORING_HASH_FILE" ] && [ "$(cat "$MONITORING_HASH_FILE")" = "$MONITORING_HASH" ]; then - _ft_log "msg='monitoring config unchanged -- skipping restart'" -else - _ft_log "msg='monitoring config changed -- restarting monitoring stack'" - cd "$REPO_DIR/infra" - run docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml pull --quiet - run docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml up -d --remove-orphans - cd "$REPO_DIR" - echo "$MONITORING_HASH" > "$MONITORING_HASH_FILE" - _ft_log "msg='monitoring stack restarted'" -fi - -# --------------------------------------------------------------------------- -# ZOMBIE PURGE: remove any 
api-(blue|green)-old- containers that have -# accumulated from previous deploys. Runs unconditionally so the Docker engine -# does not fill up with stopped containers across multiple deployments. -# --------------------------------------------------------------------------- -_ft_log "msg='running zombie purge'" -docker ps -a --format '{{.Names}}' \ - | grep -E '^api-(blue|green)-old-[0-9]+$' \ - | xargs -r docker rm -f 2>/dev/null || true - -# Final state snapshot and GitHub Actions summary -_ft_final_state "$INACTIVE_NAME" "$IMAGE_SHA" -_ft_github_summary "✅ SUCCESS" "$INACTIVE_NAME" "$IMAGE_SHA" - -_ft_exit 0 "DEPLOY_SUCCESS" "sha=$IMAGE_SHA container=$INACTIVE_NAME slot=$INACTIVE" diff --git a/scripts/deploy.sh b/scripts/deploy.sh new file mode 100644 index 0000000..90da016 --- /dev/null +++ b/scripts/deploy.sh @@ -0,0 +1,1217 @@ +#!/usr/bin/env bash +# ============================================================================= +# deploy.sh — FieldTrack API Deploy + Rollback (unified) +# +# Usage: +# deploy.sh # deploy a specific image SHA +# deploy.sh --rollback # interactive rollback to previous SHA +# deploy.sh --rollback --auto # non-interactive rollback (CI) +# +# State machine: +# INIT -> PRE_FLIGHT -> PULL_IMAGE -> RESOLVE_SLOT -> IDEMPOTENCY +# -> START_INACTIVE -> HEALTH_CHECK_INTERNAL -> SWITCH_NGINX +# -> HEALTH_CHECK_PUBLIC -> STABILITY_CHECK -> CLEANUP -> SUCCESS +# +# Deploy outcomes (via _ft_exit): +# DEPLOY_SUCCESS -- zero-downtime deploy completed +# BOOTSTRAP_SUCCESS -- first-ever deploy completed +# DEPLOY_FAILED_SAFE -- deploy failed, old container still serving +# DEPLOY_FAILED_ROLLBACK -- deploy failed, rollback succeeded (system restored) +# DEPLOY_FAILED_FATAL -- deploy AND rollback both failed (manual needed) +# +# Exit codes: +# 0 DEPLOY_SUCCESS / BOOTSTRAP_SUCCESS +# 1 DEPLOY_FAILED_SAFE / DEPLOY_FAILED_ROLLBACK +# 2 DEPLOY_FAILED_FATAL +# +# Invariants: +# - Success DEPENDS ONLY ON: container start + /health=200 + nginx routing +# - 
NEVER depends on: Redis, Supabase, BullMQ, monitoring stack +# - No /ready usage anywhere in this script +# - All nginx reloads flow through switch_nginx() — exactly once per deploy +# ============================================================================= +set -euo pipefail +if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +trap '_ft_trap_err "$LINENO"' ERR + +# --------------------------------------------------------------------------- +# ARGUMENT PARSING +# MODE is set before helper functions are loaded so _ft_log can reference it. +# --------------------------------------------------------------------------- +MODE="deploy" +AUTO_MODE=false +IMAGE_SHA="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --rollback) MODE="rollback"; shift ;; + --auto) AUTO_MODE=true; shift ;; + -*) + printf '[ERROR] Unknown option: %s\n' "$1" >&2 + printf 'Usage: deploy.sh | deploy.sh --rollback [--auto]\n' >&2 + exit 2 + ;; + *) IMAGE_SHA="$1"; shift ;; + esac +done + +# --------------------------------------------------------------------------- +# DEPLOY ID + TIMING (set here so all functions and log lines share them) +# --------------------------------------------------------------------------- +START_TS=$(date +%s) +DEPLOY_ID=$(date +%Y%m%d_%H%M%S)_$$ +PREFLIGHT_STRICT="${PREFLIGHT_STRICT:-false}" + +# --------------------------------------------------------------------------- +# STRUCTURED LOGGING +# ALL logging writes to stderr so stdout is data-only (subshell returns safe). +# --------------------------------------------------------------------------- +_FT_STATE="INIT" +DEPLOY_LOG_FILE="${DEPLOY_LOG_FILE:-/var/log/api/deploy.log}" +LOG_DIR="$(dirname "$DEPLOY_LOG_FILE")" +if ! 
mkdir -p "$LOG_DIR" 2>/dev/null; then + LOG_DIR="$HOME/api/logs" + DEPLOY_LOG_FILE="$LOG_DIR/deploy.log" + mkdir -p "$LOG_DIR" +fi + +_ft_log() { + { set +x; } 2>/dev/null + local entry + entry=$(printf '[DEPLOY] deploy_id=%s ts=%s state=%s %s' \ + "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*") + printf '%s\n' "$entry" | tee -a "$DEPLOY_LOG_FILE" >&2 + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +_ft_state() { + { set +x; } 2>/dev/null + _FT_STATE="$1"; shift + printf '[DEPLOY] deploy_id=%s ts=%s state=%s %s\n' \ + "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*" >&2 + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +_ft_error() { + { set +x; } 2>/dev/null + local entry + entry=$(printf '[ERROR] deploy_id=%s ts=%s state=%s %s' \ + "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$*") + printf '%s\n' "$entry" | tee -a "$DEPLOY_LOG_FILE" >&2 + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +_ft_trap_err() { + { set +x; } 2>/dev/null + printf '[ERROR] deploy_id=%s ts=%s state=%s msg="unexpected failure at line %s"\n' \ + "$DEPLOY_ID" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$1" >&2 + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +_ft_exit() { + local code="$1"; shift + local duration=$(( $(date +%s) - START_TS )) + _ft_state "$@" "duration_sec=$duration" + exit "$code" +} + +# --------------------------------------------------------------------------- +# PHASE TIMING +# --------------------------------------------------------------------------- +_ft_phase_start() { eval "_${1}_START=\$(date +%s)"; } +_ft_phase_end() { + local phase="$1" + local start_var="_${phase}_START" + local start_ts=${!start_var:-0} + if [ "$start_ts" -gt 0 ]; then + _ft_log "msg='phase_complete' phase=$phase duration_sec=$(( $(date +%s) - start_ts ))" + fi +} + +# --------------------------------------------------------------------------- +# SYSTEM SNAPSHOT (emitted on unrecoverable failure) +# 
--------------------------------------------------------------------------- +_ft_snapshot() { + { set +x; } 2>/dev/null + printf '[DEPLOY] -- SYSTEM SNAPSHOT ----------------------------------------\n' >&2 + printf '[DEPLOY] slot_file = %s\n' \ + "$(cat "${ACTIVE_SLOT_FILE:-/var/run/api/active-slot}" 2>/dev/null || echo 'MISSING')" >&2 + printf '[DEPLOY] backup_file = %s\n' \ + "$(cat "${SLOT_BACKUP_FILE:-/var/lib/api/active-slot.backup}" 2>/dev/null || echo 'MISSING')" >&2 + printf '[DEPLOY] nginx_upstream = %s\n' \ + "$(grep -oE 'http://(api-blue|api-green):3000' \ + "${NGINX_CONF:-/opt/infra/nginx/live/api.conf}" 2>/dev/null \ + | grep -oE 'api-blue|api-green' | head -1 || echo 'unreadable')" >&2 + printf '[DEPLOY] containers =\n' >&2 + docker ps --format '[DEPLOY] {{.Names}} -> {{.Status}} ({{.Ports}})' 1>&2 2>/dev/null \ + || printf '[DEPLOY] (docker ps unavailable)\n' >&2 + printf '[DEPLOY] -----------------------------------------------------------\n' >&2 + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +# --------------------------------------------------------------------------- +# GITHUB ACTIONS SUMMARY +# --------------------------------------------------------------------------- +_ft_github_summary() { + local status="$1" container="${2:-unknown}" image="${3:-unknown}" reason="${4:-}" + [ -z "$GITHUB_STEP_SUMMARY" ] && return 0 + { + echo "### 🚀 Deployment Summary" + echo "| Field | Value |" + echo "|-------|-------|" + echo "| Status | **$status** |" + echo "| Deploy ID | \`$DEPLOY_ID\` |" + echo "| Duration | $(($(date +%s) - START_TS))s |" + echo "| Active Container | \`$container\` |" + echo "| Image SHA | \`${image:0:12}...\` |" + [ -n "$reason" ] && echo "| Reason | $reason |" + echo "| Timestamp | $(date -u +'%Y-%m-%d %H:%M:%S UTC') |" + } >> "$GITHUB_STEP_SUMMARY" +} + +_ft_final_state() { + local active_container="$1" image_sha="$2" nginx_upstream + nginx_upstream=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null 
\ + | grep -oE 'api-blue|api-green' | head -1 || echo 'unknown') + _ft_log "msg='final_state' deploy_id=$DEPLOY_ID active=$active_container sha=${image_sha:0:12} nginx_upstream=$nginx_upstream" +} + +# --------------------------------------------------------------------------- +# DOCKER HEALTH GATE +# --------------------------------------------------------------------------- +_ft_wait_docker_health() { + local name="$1" i=1 STATUS + while [ "$i" -le 30 ]; do + STATUS=$(docker inspect --format='{{.State.Health.Status}}' "$name" 2>/dev/null || echo "none") + case "$STATUS" in + healthy) _ft_log "msg='docker health check passed' container=$name"; return 0 ;; + unhealthy) _ft_error "msg='docker health check failed' container=$name status=unhealthy"; return 1 ;; + none) _ft_log "msg='docker health gate skipped (no HEALTHCHECK)' container=$name"; return 0 ;; + esac + [ $(( i % 5 )) -eq 0 ] && _ft_log "msg='waiting for docker health' attempt=$i/30 status=$STATUS container=$name" + sleep 2; i=$(( i + 1 )) + done + _ft_error "msg='docker health timeout' container=$name last_status=$STATUS" + return 1 +} + +# --------------------------------------------------------------------------- +# IN-NETWORK CURL HELPERS (via curlimages/curl on api_network) +# --------------------------------------------------------------------------- +_ft_net_curl() { + local _c="$1"; shift + docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" >/dev/null 2>&1 +} + +_ft_net_curl_out() { + local _c="$1"; shift + local _out + _out=$(docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" 2>/dev/null) || _out="" + printf '%s' "$_out" +} + +_ft_check_external_ready() { + docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \ + -sfk --max-time 5 "https://nginx/health" 2>/dev/null | grep -q '"status":"ok"' +} + +# --------------------------------------------------------------------------- +# ENV LOADER (inlined) +# Avoids coupling deploy.sh to auxiliary scripts. 
+# --------------------------------------------------------------------------- +_ft_load_env() { + ENV_FILE="$DEPLOY_ROOT/.env" + if [ ! -f "$ENV_FILE" ]; then + _ft_error "msg='required .env not found' path=$ENV_FILE" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=missing_env_file" + fi + + set +x + set -o allexport + # shellcheck source=/dev/null + source "$ENV_FILE" + set +o allexport + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi + + if [ -z "${API_BASE_URL:-}" ]; then + _ft_error "msg='API_BASE_URL missing in .env'" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=missing_api_base_url" + fi + if [ -z "${CORS_ORIGIN:-}" ]; then + _ft_error "msg='CORS_ORIGIN missing in .env'" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=missing_cors_origin" + fi + + API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1) + if [ -z "$API_HOSTNAME" ] || printf '%s' "$API_HOSTNAME" | grep -qE '[[:space:]/@?#]'; then + _ft_error "msg='invalid API_HOSTNAME derived from API_BASE_URL' api_base_url=$API_BASE_URL derived=$API_HOSTNAME" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=invalid_api_hostname" + fi + export ENV_FILE API_HOSTNAME +} + +# --------------------------------------------------------------------------- +# SILENT EXECUTION WRAPPERS +# --------------------------------------------------------------------------- +run() { + if [ "${DEBUG:-false}" = "true" ]; then + "$@" + else + local _out + if ! _out=$("$@" 2>&1); then + printf '[ERROR] Command failed: %s\n' "$*" >&2 + printf '%s\n' "$_out" >&2 + return 1 + fi + fi +} + +# --------------------------------------------------------------------------- +# SLOT DIRECTORY AND FILE MANAGEMENT +# --------------------------------------------------------------------------- +_ft_ensure_slot_dir() { + if [ ! 
-d "$SLOT_DIR" ]; then + _ft_log "msg='slot dir missing, creating' path=$SLOT_DIR" + sudo mkdir -p "$SLOT_DIR" + sudo chown "$(id -un):$(id -gn)" "$SLOT_DIR" + sudo chmod 750 "$SLOT_DIR" + fi +} + +_ft_ensure_slot_backup_dir() { + local backup_dir + backup_dir="$(dirname "$SLOT_BACKUP_FILE")" + if [ ! -d "$backup_dir" ]; then + sudo mkdir -p "$backup_dir" 2>/dev/null || mkdir -p "$backup_dir" || true + sudo chown "$(id -un):$(id -gn)" "$backup_dir" 2>/dev/null || true + fi +} + +_ft_validate_slot() { + case "$1" in + blue|green) return 0 ;; + *) _ft_log "level=ERROR msg='invalid slot value' slot='${1:0:80}'"; return 1 ;; + esac +} + +_ft_write_slot() { + local slot="$1" + _ft_validate_slot "$slot" || return 1 + _ft_ensure_slot_dir + local tmp + tmp=$(mktemp "${SLOT_DIR}/active-slot.XXXXXX") + printf '%s\n' "$slot" > "$tmp" + mv "$tmp" "$ACTIVE_SLOT_FILE" + _ft_log "msg='slot file updated (atomic)' slot=$slot" + # Mirror to persistent backup (survives reboots — /var/run is tmpfs) + _ft_ensure_slot_backup_dir + local btmp + btmp=$(mktemp "$(dirname "$SLOT_BACKUP_FILE")/slot-backup.XXXXXX") + printf '%s\n' "$slot" > "$btmp" + mv "$btmp" "$SLOT_BACKUP_FILE" + _ft_log "msg='slot backup updated' slot=$slot path=$SLOT_BACKUP_FILE" +} + +# --------------------------------------------------------------------------- +# DEPLOYMENT LOCK +# --------------------------------------------------------------------------- +_ft_acquire_lock() { + _ft_ensure_slot_dir + _ft_log "msg='acquiring deployment lock' pid=$$ file=$LOCK_FILE" + exec 200>"$LOCK_FILE" + if ! 
flock -n 200; then + _ft_log "level=ERROR msg='another deployment already in progress -- aborting' pid=$$" + exit 1 + fi + _ft_log "msg='deployment lock acquired' pid=$$ file=$LOCK_FILE" + trap '_ft_release_lock' EXIT +} + +_ft_release_lock() { + { set +x; } 2>/dev/null + printf '[DEPLOY] ts=%s state=%s msg="releasing deployment lock" pid=%s\n' \ + "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$_FT_STATE" "$$" >&2 + exec 200>&- 2>/dev/null || true + if [ "${DEBUG:-false}" = "true" ]; then set -x; fi +} + +# =========================================================================== +# PHASE FUNCTIONS +# =========================================================================== + +# --------------------------------------------------------------------------- +# preflight — load env, validate contract, port-leak guard +# --------------------------------------------------------------------------- +preflight() { + _ft_state "PRE_FLIGHT" "msg='loading and validating environment'" + + local last_good + last_good=$(cat "$LAST_GOOD_FILE" 2>/dev/null || echo "none") + _ft_log "msg='startup recovery info' last_good=$last_good" + + _ft_load_env + + DEPLOY_HISTORY="$DEPLOY_ROOT/.deploy_history" + _ft_log "msg='environment loaded' api_hostname=$API_HOSTNAME" + + # GLOBAL PORT-LEAK GUARD — api containers MUST NOT bind host ports + local leaks + leaks=$(docker ps --format '{{.Names}} {{.Ports}}' 2>/dev/null \ + | grep -E '^api-(blue|green)' \ + | grep -E '(0\.0\.0\.0:|127\.0\.0\.1:)[0-9]+->') || true + if [ -n "${leaks:-}" ]; then + _ft_log "level=ERROR msg='API container has host port bindings — forbidden' leaks=${leaks}" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=api_port_leak_detected" + fi + _ft_log "msg='port-leak guard passed'" +} + +# --------------------------------------------------------------------------- +# ensure_network — create api_network if absent (idempotent) +# --------------------------------------------------------------------------- +ensure_network() { + docker network 
create --driver bridge "$NETWORK" 2>/dev/null \ + && _ft_log "msg='api_network created'" \ + || _ft_log "msg='api_network already exists'" + mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" +} + +# --------------------------------------------------------------------------- +# ensure_nginx — nginx MUST exist and be on api_network; hard fail otherwise +# --------------------------------------------------------------------------- +ensure_nginx() { + if [ ! -d "$INFRA_ROOT/nginx/live" ]; then + _ft_error "msg='infra not initialized at expected path' infra_root=$INFRA_ROOT required=$INFRA_ROOT/nginx/live" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=infra_not_initialized" + fi + if [ ! -d "$INFRA_ROOT/nginx/backup" ]; then + _ft_error "msg='infra not initialized at expected path' infra_root=$INFRA_ROOT required=$INFRA_ROOT/nginx/backup" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=infra_not_initialized" + fi + if [ ! -f "$INFRA_ROOT/nginx/api.conf" ]; then + _ft_error "msg='infra template missing' path=$INFRA_ROOT/nginx/api.conf" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=infra_template_missing" + fi + + if ! docker inspect nginx >/dev/null 2>&1; then + _ft_error "msg='nginx container not found — nginx is managed by the infra repo' hint='docker compose -f docker-compose.nginx.yml up -d'" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_missing" + fi + local net + net=$(docker inspect nginx \ + --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") + if ! 
echo "$net" | grep -q "$NETWORK"; then + _ft_error "msg='nginx not on api_network' networks=${net}" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_not_on_api_network" + fi + _ft_log "msg='nginx guard passed' network=$NETWORK" +} + +# --------------------------------------------------------------------------- +# pull_image — explicit pull; fails fast so docker run never races a pull +# --------------------------------------------------------------------------- +pull_image() { + _ft_state "PULL_IMAGE" "msg='pulling container image' sha=$IMAGE_SHA" + _ft_phase_start "PULL_IMAGE" + if ! run timeout 120 docker pull "$IMAGE"; then + _ft_log "level=ERROR msg='image pull failed' image=$IMAGE" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=image_pull_failed image=$IMAGE" + fi + _ft_log "msg='image pulled' image=$IMAGE" + _ft_phase_end "PULL_IMAGE" +} + +# --------------------------------------------------------------------------- +# resolve_slot — determine ACTIVE/INACTIVE slots with full recovery +# +# Reads slot from (in precedence order): +# 1. /var/run/api/active-slot (primary, tmpfs) +# 2. /var/lib/api/active-slot.backup (persistent, survives reboots) +# 3. nginx config upstream (tiebreaker when both containers run) +# 4. running containers (recovery when slot files missing) +# 5. default "green" / inactive "blue" (first deploy) +# +# Sets globals: ACTIVE, ACTIVE_NAME, INACTIVE, INACTIVE_NAME +# --------------------------------------------------------------------------- +resolve_slot() { + _ft_state "RESOLVE_SLOT" "msg='determining active slot'" + _ft_ensure_slot_dir + + local recovered_slot="" + + # 1. 
Primary slot file + if [ -f "$ACTIVE_SLOT_FILE" ]; then + local val + val=$(tr -d '[:space:]' < "$ACTIVE_SLOT_FILE") + if [[ "$val" == *DEPLOY* ]] || [[ "$val" == *\[* ]]; then + _ft_log "level=WARN msg='slot file contaminated — treating as corrupt' value=${val:0:80}" + elif _ft_validate_slot "$val" 2>/dev/null; then + _ft_log "msg='slot file read' slot=$val" + recovered_slot="$val" + fi + fi + + # 2. Persistent backup slot file (survives /var/run tmpfs wipe on reboot) + if [ -z "$recovered_slot" ] && [ -f "$SLOT_BACKUP_FILE" ]; then + local bval + bval=$(tr -d '[:space:]' < "$SLOT_BACKUP_FILE") + if _ft_validate_slot "$bval" 2>/dev/null; then + _ft_log "msg='recovered slot from backup file' slot=$bval file=$SLOT_BACKUP_FILE" + recovered_slot="$bval" + fi + fi + + # 3. Last-known-good snapshot + if [ -z "$recovered_slot" ] && [ -f "$LAST_GOOD_FILE" ]; then + local lgval + lgval=$(awk -F= '/^slot=/{print $2}' "$LAST_GOOD_FILE" 2>/dev/null | tr -d '[:space:]') + if _ft_validate_slot "$lgval" 2>/dev/null; then + _ft_log "msg='recovered slot from last-good snapshot' slot=$lgval" + recovered_slot="$lgval" + fi + fi + + # 4+5. 
Container state + nginx tiebreaker + if [ -z "$recovered_slot" ]; then + local blue_running=false green_running=false + docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${BLUE_NAME}$" && blue_running=true || true + docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${GREEN_NAME}$" && green_running=true || true + + if [ "$blue_running" = "true" ] && [ "$green_running" = "false" ]; then + recovered_slot="blue"; _ft_log "msg='recovery: only blue running'" + elif [ "$green_running" = "true" ] && [ "$blue_running" = "false" ]; then + recovered_slot="green"; _ft_log "msg='recovery: only green running'" + elif [ "$blue_running" = "true" ] && [ "$green_running" = "true" ]; then + local upstream + upstream=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null \ + | grep -oE 'api-blue|api-green' | head -1 || echo "") + recovered_slot="${upstream#api-}" + [ -z "$recovered_slot" ] && recovered_slot="blue" + _ft_log "msg='recovery: both running, nginx tiebreaker' nginx_upstream=${upstream:-none} slot=$recovered_slot" + else + recovered_slot="green" + _ft_log "msg='recovery: no containers running — first deploy, starting with blue' slot=green" + fi + fi + + _ft_validate_slot "$recovered_slot" || exit 1 + + # Persist recovered value (atomic) + local tmp + tmp=$(mktemp "${SLOT_DIR}/active-slot.XXXXXX") + printf '%s\n' "$recovered_slot" > "$tmp" + mv "$tmp" "$ACTIVE_SLOT_FILE" + + ACTIVE="$recovered_slot" + if [ "$ACTIVE" = "blue" ]; then + ACTIVE_NAME=$BLUE_NAME; INACTIVE="green"; INACTIVE_NAME=$GREEN_NAME + else + ACTIVE_NAME=$GREEN_NAME; INACTIVE="blue"; INACTIVE_NAME=$BLUE_NAME + fi + + _ft_log "msg='slot resolved' active=$ACTIVE active_name=$ACTIVE_NAME inactive=$INACTIVE inactive_name=$INACTIVE_NAME" + + # SLOT REPAIR — heal slot/container drift + if [ "$ACTIVE" = "green" ] && ! 
docker inspect api-green >/dev/null 2>&1; then + if docker inspect api-blue >/dev/null 2>&1; then + _ft_log "msg='slot repair: green missing but blue running → switching to blue'" + ACTIVE="blue"; ACTIVE_NAME=$BLUE_NAME; INACTIVE="green"; INACTIVE_NAME=$GREEN_NAME + _ft_write_slot "blue" + fi + elif [ "$ACTIVE" = "blue" ] && ! docker inspect api-blue >/dev/null 2>&1; then + if docker inspect api-green >/dev/null 2>&1; then + _ft_log "msg='slot repair: blue missing but green running → switching to green'" + ACTIVE="green"; ACTIVE_NAME=$GREEN_NAME; INACTIVE="blue"; INACTIVE_NAME=$BLUE_NAME + _ft_write_slot "green" + fi + fi + _ft_validate_slot "$ACTIVE" || exit 1 +} + +# --------------------------------------------------------------------------- +# idempotency_check — skip deploy if target SHA already running + healthy +# --------------------------------------------------------------------------- +idempotency_check() { + _ft_state "IDEMPOTENCY" "msg='checking if target SHA already deployed' sha=$IMAGE_SHA" + local running_image + running_image=$(docker inspect --format '{{.Config.Image}}' "$ACTIVE_NAME" 2>/dev/null || echo "") + if [ "$running_image" = "$IMAGE" ]; then + local health + health=$(_ft_net_curl_out "$ACTIVE_NAME" \ + -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/health") + if echo "$health" | grep -q '"status":"ok"' 2>/dev/null; then + _ft_log "msg='target SHA already running and healthy — nothing to do' container=$ACTIVE_NAME" + _ft_final_state "$ACTIVE_NAME" "$IMAGE_SHA" + _ft_github_summary "✅ IDEMPOTENT (no change)" "$ACTIVE_NAME" "$IMAGE_SHA" "SHA already deployed" + _ft_exit 0 "DEPLOY_SUCCESS" "reason=idempotent_noop sha=$IMAGE_SHA" + fi + _ft_log "msg='SHA matches but container not healthy — proceeding' container=$ACTIVE_NAME" + else + _ft_log "msg='SHA differs — proceeding' running=${running_image:-none} target=$IMAGE" + fi +} + +# --------------------------------------------------------------------------- +# start_inactive — start new container 
on api_network (no host ports) +# --------------------------------------------------------------------------- +start_inactive() { + _ft_state "START_INACTIVE" "msg='starting inactive container' name=$INACTIVE_NAME" + + # Rename any stale container for audit trail (graceful rename→purge later) + if docker ps -a --format '{{.Names}}' | grep -Eq "^${INACTIVE_NAME}$"; then + _ft_log "msg='renaming stale container' name=$INACTIVE_NAME" + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + local ts + ts=$(date +%s) + docker rename "$INACTIVE_NAME" "${INACTIVE_NAME}-old-${ts}" 2>/dev/null \ + || docker rm "$INACTIVE_NAME" + fi + + local cid + cid=$(timeout 60 docker run -d \ + --name "$INACTIVE_NAME" \ + --network "$NETWORK" \ + --restart unless-stopped \ + --label "api.sha=$IMAGE_SHA" \ + --label "api.slot=$INACTIVE" \ + --label "api.deploy_id=$DEPLOY_ID" \ + --env-file "$ENV_FILE" \ + "$IMAGE" 2>&1) || { + printf '%s\n' "$cid" >&2 + _ft_error "msg='container start failed' name=$INACTIVE_NAME" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=container_start_failed" + } + _ft_log "msg='container started' name=$INACTIVE_NAME" + + # Image immutability check + local actual + actual=$(docker inspect --format '{{.Config.Image}}' "$INACTIVE_NAME" 2>/dev/null || echo "") + if [ "$actual" != "$IMAGE" ]; then + _ft_log "level=ERROR msg='image immutability check failed' expected=$IMAGE actual=${actual:-unknown}" + docker logs "$INACTIVE_NAME" --tail 50 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=image_immutability_check_failed" + fi + _ft_log "msg='image immutability check passed'" +} + +# --------------------------------------------------------------------------- +# health_check_internal — wait for /health=200 via in-network curl +# NO /ready usage. NO Redis/Supabase dependency. 
+# --------------------------------------------------------------------------- +health_check_internal() { + _ft_state "HEALTH_CHECK_INTERNAL" "msg='waiting for container readiness'" + _ft_phase_start "HEALTH_CHECK_INTERNAL" + sleep 5 + + # Connectivity pre-check (5 short probes before main loop) + local conn_ok=false conn_attempts=0 + while [ "$conn_attempts" -lt 5 ]; do + conn_attempts=$(( conn_attempts + 1 )) + if _ft_net_curl "$INACTIVE_NAME" \ + -sf --max-time 3 "http://$INACTIVE_NAME:$APP_PORT/health"; then + conn_ok=true; break + fi + sleep 2 + done + + if [ "$conn_ok" = "false" ]; then + _ft_log "level=ERROR msg='container not reachable after connectivity pre-check' container=$INACTIVE_NAME" + docker logs "$INACTIVE_NAME" --tail 100 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=container_not_reachable" + fi + _ft_log "msg='connectivity pre-check passed' container=$INACTIVE_NAME" + + # Main readiness loop — waits for HTTP 200 on /health + local attempt=0 + until true; do + attempt=$(( attempt + 1 )) + local status + status=$(_ft_net_curl_out "$INACTIVE_NAME" \ + --max-time 4 -s -o /dev/null -w "%{http_code}" \ + "http://$INACTIVE_NAME:$APP_PORT/health" || echo "000") + + if [ "$status" = "200" ]; then + _ft_log "msg='health check passed' endpoint=/health attempts=$attempt" + break + fi + + if ! 
docker ps --format '{{.Names}}' | grep -q "^${INACTIVE_NAME}$"; then + _ft_log "level=ERROR msg='container exited unexpectedly' name=$INACTIVE_NAME" + docker logs "$INACTIVE_NAME" --tail 100 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=new_container_crashed" + fi + + if [ "$attempt" -ge "$MAX_HEALTH_ATTEMPTS" ]; then + _ft_log "level=ERROR msg='health check timed out' attempts=$attempt status=$status" + docker logs "$INACTIVE_NAME" --tail 100 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=health_timeout attempts=$attempt" + fi + + [ $(( attempt % 10 )) -eq 0 ] && _ft_log "msg='still waiting' attempt=$attempt/$MAX_HEALTH_ATTEMPTS status=$status" + sleep $(( HEALTH_INTERVAL + RANDOM % 3 )) + done + + _ft_phase_end "HEALTH_CHECK_INTERNAL" + + # Docker HEALTHCHECK gate (must be healthy, not just starting) + if ! _ft_wait_docker_health "$INACTIVE_NAME"; then + docker logs "$INACTIVE_NAME" --tail 50 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=docker_health_failed" + fi + + sleep 3 # brief stabilization after healthcheck gate + + # Pre-switch final connectivity check (fresh curl invocation, same net path as nginx) + if ! 
docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \ + -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/health" >/dev/null 2>&1; then + _ft_error "msg='pre-switch connectivity check failed' container=$INACTIVE_NAME" + docker logs "$INACTIVE_NAME" --tail 50 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=pre_switch_connectivity_failed" + fi + _ft_log "msg='pre-switch connectivity check passed' container=$INACTIVE_NAME" +} + +# --------------------------------------------------------------------------- +# switch_nginx — render config, test, reload ONCE; write slot file after reload +# --------------------------------------------------------------------------- +switch_nginx() { + _ft_state "SWITCH_NGINX" "msg='switching nginx upstream' container=$INACTIVE_NAME" + sleep 2 # brief stabilization window before touching nginx + + mkdir -p "$NGINX_BACKUP_DIR" + local backup tmp + backup="$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" + tmp="$(mktemp /tmp/api-nginx.XXXXXX.conf)" + + # Pre-reload gate — one final health probe before writing nginx config + if ! 
_ft_net_curl "$INACTIVE_NAME" \ + -sf --max-time 4 "http://$INACTIVE_NAME:$APP_PORT/health"; then + _ft_log "level=ERROR msg='pre-reload gate failed' container=$INACTIVE_NAME" + docker logs "$INACTIVE_NAME" --tail 50 >&2 || true + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=pre_reload_gate_failed" + fi + + sed \ + -e "s|__ACTIVE_CONTAINER__|$INACTIVE_NAME|g" \ + -e "s|__API_HOSTNAME__|$API_HOSTNAME|g" \ + "$NGINX_TEMPLATE" > "$tmp" + + cp "$NGINX_CONF" "$backup" + cp "$tmp" "$NGINX_CONF" + rm -f "$tmp" + ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | tail -n +6 | xargs rm -f 2>/dev/null || true + + # nginx network guard before every reload + local net + net=$(docker inspect nginx \ + --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") + if ! echo "$net" | grep -q "$NETWORK"; then + _ft_log "level=ERROR msg='nginx not on api_network at reload time' networks=${net}" + cp "$backup" "$NGINX_CONF" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_network_mismatch" + fi + + local test_out + test_out=$(docker exec nginx nginx -t 2>&1) || { + printf '%s\n' "$test_out" >&2 + _ft_log "level=ERROR msg='nginx config test failed — restoring backup'" + cp "$backup" "$NGINX_CONF" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_config_test_failed" + } + + # === SINGLE nginx reload per deploy === + docker exec nginx nginx -s reload >/dev/null 2>&1 \ + || { cp "$backup" "$NGINX_CONF"; _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_reload_failed"; } + _ft_log "msg='nginx reloaded (once)' upstream=$INACTIVE_NAME:$APP_PORT" + + # Upstream sanity: live config must match INACTIVE_NAME + local actual_upstream + actual_upstream=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null \ + | grep -oE 'api-blue|api-green' | head -1 || echo "") + if [ "$actual_upstream" != "$INACTIVE_NAME" ]; then + _ft_log "level=ERROR msg='nginx upstream 
sanity failed' expected=$INACTIVE_NAME actual=${actual_upstream:-unreadable}" + cp "$backup" "$NGINX_CONF" + docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1 || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_upstream_mismatch" + fi + _ft_log "msg='nginx upstream sanity passed' container=$INACTIVE_NAME" + + # Write slot AFTER nginx reload — slot always reflects what nginx serves + _ft_write_slot "$INACTIVE" + _ft_log "msg='TRAFFIC_SWITCH' active=$INACTIVE_NAME sha=$IMAGE_SHA deploy_id=$DEPLOY_ID" + _ft_phase_end "SWITCH_NGINX" + + # Store backup path in global for rollback use in verify_routing / stability + NGINX_BACKUP="$backup" +} + +# --------------------------------------------------------------------------- +# verify_routing — validate nginx→backend end-to-end via api_network +# Rolls back (with rollback logic inline) on failure. +# --------------------------------------------------------------------------- +verify_routing() { + _ft_state "HEALTH_CHECK_PUBLIC" "msg='validating nginx routing + backend health'" + sleep $(( RANDOM % 3 + 5 )) # nginx warm-up + + # Post-switch routing verification (5 retries) + local ps_ok=false + for _ps in 1 2 3 4 5; do + if docker run --rm --network api_network "$_FT_CURL_IMG" \ + -sfk --max-time 5 "https://nginx/health" >/dev/null 2>&1; then + ps_ok=true; break + fi + sleep $(( RANDOM % 2 + 2 )) + done + if [ "$ps_ok" != "true" ]; then + _ft_error "msg='post-switch routing verification failed'" + _ft_snapshot + _restore_nginx_and_slot "$ACTIVE" + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_routing_failed" + fi + _ft_log "msg='post-switch routing verification passed'" + + # Post-switch upstream verification (direct container probe) + if ! 
docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \ + -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/health" >/dev/null 2>&1; then + _ft_error "msg='post-switch upstream verification failed' container=$INACTIVE_NAME" + _ft_snapshot + _restore_nginx_and_slot "$ACTIVE" + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_upstream_failed" + fi + _ft_log "msg='post-switch upstream verified' container=$INACTIVE_NAME" + + # Public health check via nginx + local pub_passed=false + if docker run --rm --network api_network "$_FT_CURL_IMG" \ + -sfk --max-time 10 "https://nginx/health" 2>/dev/null | grep -q '"status":"ok"'; then + pub_passed=true + _ft_log "msg='public health check passed' container=$INACTIVE_NAME" + else + _ft_log "msg='public health check failed' container=$INACTIVE_NAME" + fi + + # Container alignment check + local nginx_container + nginx_container=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null \ + | grep -oE 'api-blue|api-green' | head -1 || echo "") + if [ -n "$nginx_container" ] && [ "$nginx_container" != "$INACTIVE_NAME" ]; then + _ft_log "level=ERROR msg='nginx container mismatch' expected=$INACTIVE_NAME actual=$nginx_container" + pub_passed=false + fi + + if [ "$pub_passed" != "true" ]; then + _ft_state "ROLLBACK" "reason='public health check failed'" + _ft_snapshot + _restore_nginx_and_slot "$ACTIVE" + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + + # If ACTIVE_NAME still healthy, no need for image rollback + if docker ps --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then + local ah + ah=$(_ft_net_curl_out "$ACTIVE_NAME" \ + -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/health") + if echo "$ah" | grep -q '"status":"ok"' 2>/dev/null; then + _ft_log "msg='active container still healthy — no image rollback needed' container=$ACTIVE_NAME" + _ft_exit 1 
"DEPLOY_FAILED_SAFE" "reason=public_health_check_failed active_healthy=true" + fi + fi + + _ft_log "msg='system degraded — triggering image rollback'" + _trigger_internal_rollback "public_health_check_failed" + fi + + # Stability check (post-switch settle verification) + _ft_state "STABILITY_CHECK" "msg='post-switch stability check'" + _ft_phase_start "STABILITY_CHECK" + sleep 5 + + if _ft_check_external_ready; then + _ft_log "msg='stability check passed' url=https://$API_HOSTNAME/health" + _ft_phase_end "STABILITY_CHECK" + else + _ft_log "level=ERROR msg='stability check failed — service regressed after initial pass'" + _ft_snapshot + _restore_nginx_and_slot "$ACTIVE" + docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true + docker rm "$INACTIVE_NAME" || true + + if docker ps --format '{{.Names}}' | grep -q "^${ACTIVE_NAME}$"; then + local ah + ah=$(_ft_net_curl_out "$ACTIVE_NAME" \ + -s --max-time 3 "http://$ACTIVE_NAME:$APP_PORT/health") + if echo "$ah" | grep -q '"status":"ok"' 2>/dev/null; then + _ft_log "msg='active container healthy after stability failure' container=$ACTIVE_NAME" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=stability_check_failed active_healthy=true" + fi + fi + _trigger_internal_rollback "stability_check_failed" + fi +} + +# Restore nginx to backup config and write the previous slot. +# Called from verify_routing on route/stability failure. +_restore_nginx_and_slot() { + local prev_slot="$1" + _ft_log "msg='restoring previous nginx config' slot=$prev_slot" + cp "$NGINX_BACKUP" "$NGINX_CONF" + if docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1; then + _ft_log "msg='nginx restored'" + else + _ft_log "level=ERROR msg='nginx restore failed — check manually'" + fi + _ft_write_slot "$prev_slot" +} + +# Release lock and exec deploy.sh --rollback --auto as a subprocess. +# This is the internal failure path — separate from the user-facing rollback(). 
+_trigger_internal_rollback() { + local reason="$1" + if [ "${API_ROLLBACK_IN_PROGRESS:-0}" != "1" ]; then + _ft_error "msg='ROLLBACK triggered' reason=$reason" + export API_ROLLBACK_IN_PROGRESS=1 + _ft_release_lock + if ! "$SCRIPT_DIR/deploy.sh" --rollback --auto; then + _ft_snapshot + _ft_exit 2 "DEPLOY_FAILED_FATAL" "reason=${reason}_and_rollback_failed" + fi + _ft_exit 1 "DEPLOY_FAILED_ROLLBACK" "reason=$reason msg='rollback succeeded'" + else + _ft_log "msg='nested rollback guard reached — stopping'" + _ft_exit 1 "DEPLOY_FAILED_FATAL" "reason=nested_rollback_guard" + fi +} + +# --------------------------------------------------------------------------- +# cleanup_old — gracefully stop and rename the previously-active container +# --------------------------------------------------------------------------- +cleanup_old() { + _ft_state "CLEANUP" "msg='stopping previous container' name=$ACTIVE_NAME" + + if ! docker ps --format '{{.Names}}' | grep -q "^$ACTIVE_NAME$"; then + _ft_log "msg='previous container already gone — skipping cleanup' name=$ACTIVE_NAME" + return 0 + fi + + docker stop --time 10 "$ACTIVE_NAME" 2>/dev/null || true + local ts + ts=$(date +%s) + docker rename "$ACTIVE_NAME" "${ACTIVE_NAME}-old-${ts}" 2>/dev/null \ + || docker rm "$ACTIVE_NAME" || true + _ft_log "msg='previous container stopped + renamed' name=$ACTIVE_NAME rename=${ACTIVE_NAME}-old-${ts}" +} + +# --------------------------------------------------------------------------- +# success — truth check, last-known-good snapshot, deploy history +# --------------------------------------------------------------------------- +success() { + _ft_state "SUCCESS" "msg='deployment complete' container=$INACTIVE_NAME sha=$IMAGE_SHA slot=$INACTIVE" + + # Truth check + local truth_ok=true + + # 1. 
Slot file + if [ -f "$ACTIVE_SLOT_FILE" ]; then + local sv + sv=$(tr -d '[:space:]' < "$ACTIVE_SLOT_FILE") + if [ "$sv" != "$INACTIVE" ]; then + _ft_log "level=ERROR msg='truth check: slot mismatch' expected=$INACTIVE actual=$sv" + truth_ok=false + else + _ft_log "msg='truth check: slot correct' slot=$sv" + fi + else + _ft_log "level=ERROR msg='truth check: slot file missing'" + truth_ok=false + fi + + # 2. nginx upstream + local nginx_up + nginx_up=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null \ + | grep -oE 'api-blue|api-green' | head -1 || echo "") + if [ -n "$nginx_up" ] && [ "$nginx_up" != "$INACTIVE_NAME" ]; then + _ft_log "level=ERROR msg='truth check: nginx upstream mismatch' expected=$INACTIVE_NAME actual=$nginx_up" + truth_ok=false + else + _ft_log "msg='truth check: nginx upstream correct' container=${nginx_up:-unknown}" + fi + + # 3. Internal + external endpoint health + sleep 2 + local int_ok=false ext_ok=false + + local int_resp + int_resp=$(_ft_net_curl_out "$INACTIVE_NAME" \ + -s --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/health") + echo "$int_resp" | grep -q '"status":"ok"' 2>/dev/null && int_ok=true + _ft_log "msg='truth check: internal endpoint' ok=$int_ok url=http://$INACTIVE_NAME:$APP_PORT/health" + + local ext_latency_ms=0 + for _sa in 1 2 3; do + local t0 t1 + t0=$(date +%s%3N) + if docker run --rm --network api_network "$_FT_CURL_IMG" \ + -sk --max-time 3 "https://nginx/health" 2>/dev/null | grep -q '"status":"ok"'; then + t1=$(date +%s%3N) + ext_latency_ms=$(( t1 - t0 )) + ext_ok=true; break + fi + [ "$_sa" -lt 3 ] && sleep $(( RANDOM % 3 + 5 )) + done + + _ft_log "msg='truth check: external endpoint' ok=$ext_ok latency_ms=$ext_latency_ms url=https://$API_HOSTNAME/health" + if [ "$ext_latency_ms" -gt 500 ]; then + _ft_log "level=WARN msg='SLO warning: high latency' latency_ms=$ext_latency_ms threshold_ms=500" + fi + + if [ "$int_ok" = "true" ] && [ "$ext_ok" = "false" ]; then + _ft_log "level=ERROR msg='truth 
check FAILED: internal ok but external unreachable (nginx/proxy/DNS/TLS issue)'" + truth_ok=false + fi + if [ "$int_ok" = "false" ] || [ "$ext_ok" = "false" ]; then + [ "$truth_ok" = "true" ] && _ft_log "level=ERROR msg='truth check FAILED: endpoint(s) not healthy' int=$int_ok ext=$ext_ok" + truth_ok=false + fi + + if [ "$truth_ok" != "true" ]; then + _ft_state "FAILURE" "reason='post_deployment_truth_check_failed'" + _ft_snapshot + exit 2 + fi + + # Last-known-good snapshot (atomic) + _ft_log "msg='recording last-known-good' slot=$INACTIVE container=$INACTIVE_NAME" + local snap_tmp + snap_tmp=$(mktemp "${SNAP_DIR}/last-good.XXXXXX") + printf 'slot=%s container=%s ts=%s\n' "$INACTIVE" "$INACTIVE_NAME" "$(date -Iseconds)" > "$snap_tmp" + mv "$snap_tmp" "$LAST_GOOD_FILE" + + # Deploy history (rolling, atomic) + local hist_tmp="${DEPLOY_HISTORY}.tmp.$$" + if [ -f "$DEPLOY_HISTORY" ]; then + (echo "$IMAGE_SHA"; head -n $(( MAX_HISTORY - 1 )) "$DEPLOY_HISTORY") > "$hist_tmp" + else + echo "$IMAGE_SHA" > "$hist_tmp" + fi + mv "$hist_tmp" "$DEPLOY_HISTORY" + _ft_log "msg='deploy history updated' sha=$IMAGE_SHA" + + # Zombie purge + _ft_log "msg='running zombie purge'" + docker ps -a --format '{{.Names}}' \ + | grep -E '^api-(blue|green)-old-[0-9]+$' \ + | xargs -r docker rm -f 2>/dev/null || true + + _ft_final_state "$INACTIVE_NAME" "$IMAGE_SHA" + _ft_github_summary "✅ SUCCESS" "$INACTIVE_NAME" "$IMAGE_SHA" +} + +# --------------------------------------------------------------------------- +# main — full blue-green deploy flow +# --------------------------------------------------------------------------- +main() { + _ft_acquire_lock + + # Validate SHA in deploy mode (not needed for rollback — resolved before calling main) + if [ -z "$IMAGE_SHA" ] || [ "$IMAGE_SHA" = "latest" ]; then + printf '[DEPLOY] ts=%s state=INIT level=ERROR msg="image SHA required"\n' \ + "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >&2 + exit 2 + fi + + IMAGE="ghcr.io/fieldtrack-tech/api:$IMAGE_SHA" + 
_ft_log "msg='deploy started' mode=$MODE sha=$IMAGE_SHA deploy_id=$DEPLOY_ID pid=$$ start_ts=$START_TS" + + preflight + ensure_network + ensure_nginx + pull_image + + # BOOTSTRAP: first deploy when no api containers exist + if ! docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then + _ft_state "BOOTSTRAP" "msg='no api containers — first deploy'" + # Initialize globals required by downstream functions + ACTIVE="green"; ACTIVE_NAME=$GREEN_NAME; INACTIVE="blue"; INACTIVE_NAME=$BLUE_NAME + DEPLOY_HISTORY="${DEPLOY_HISTORY:-$DEPLOY_ROOT/.deploy_history}" + NGINX_BACKUP="$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" + + docker rm -f api-blue 2>/dev/null || true + start_inactive + health_check_internal + # Write nginx config directly for first deploy (no backup to restore) + mkdir -p "$NGINX_LIVE_DIR" "$NGINX_BACKUP_DIR" + local boot_tmp; boot_tmp="$(mktemp /tmp/api-nginx-boot.XXXXXX.conf)" + sed -e "s|__ACTIVE_CONTAINER__|$INACTIVE_NAME|g" \ + -e "s|__API_HOSTNAME__|${API_HOSTNAME}|g" \ + "$NGINX_TEMPLATE" > "$boot_tmp" + cp "$boot_tmp" "$NGINX_CONF" + rm -f "$boot_tmp" + local net_check + net_check=$(docker inspect nginx \ + --format='{{range $k,$v := .NetworkSettings.Networks}}{{$k}} {{end}}' 2>/dev/null || echo "") + if ! 
echo "$net_check" | grep -q "$NETWORK"; then + _ft_log "level=ERROR msg='bootstrap: nginx not on api_network' networks=${net_check}" + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_network_mismatch_bootstrap" + fi + local nt_out + nt_out=$(docker exec nginx nginx -t 2>&1) || { + printf '%s\n' "$nt_out" >&2 + _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_config_test_failed_bootstrap" + } + docker exec nginx nginx -s reload >/dev/null 2>&1 \ + || _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=nginx_reload_failed_bootstrap" + _ft_log "msg='bootstrap: nginx reloaded'" + _ft_write_slot "blue" + local snap_tmp; snap_tmp=$(mktemp "${SNAP_DIR}/last-good.XXXXXX") + printf 'slot=blue container=api-blue ts=%s\n' "$(date -Iseconds)" > "$snap_tmp" + mv "$snap_tmp" "$LAST_GOOD_FILE" + # Deploy history + DEPLOY_HISTORY="${DEPLOY_HISTORY:-$DEPLOY_ROOT/.deploy_history}" + local hist_tmp="${DEPLOY_HISTORY}.tmp.$$" + echo "$IMAGE_SHA" > "$hist_tmp" + mv "$hist_tmp" "$DEPLOY_HISTORY" + _ft_exit 0 "BOOTSTRAP_SUCCESS" "slot=blue image=$IMAGE" + fi + + # Normal deploy path + resolve_slot + idempotency_check + start_inactive + health_check_internal + switch_nginx + verify_routing + cleanup_old + success + + _ft_exit 0 "DEPLOY_SUCCESS" "sha=$IMAGE_SHA container=$INACTIVE_NAME slot=$INACTIVE" +} + +# --------------------------------------------------------------------------- +# rollback — restore previous SHA from deploy history +# --------------------------------------------------------------------------- +rollback() { + _ft_log "msg='rollback initiated' mode=${MODE} auto=$AUTO_MODE" + + if [ ! -f "$DEPLOY_HISTORY" ] || [ ! 
-s "$DEPLOY_HISTORY" ]; then + printf '[ERROR] No deployment history found: %s\n' "$DEPLOY_HISTORY" >&2 + exit 1 + fi + + mapfile -t HISTORY < "$DEPLOY_HISTORY" + if [ "${#HISTORY[@]}" -lt 2 ]; then + printf '[ERROR] Need at least two deployments to rollback (history has %d entries)\n' \ + "${#HISTORY[@]}" >&2 + exit 1 + fi + + local current_sha="${HISTORY[0]}" + local previous_sha="${HISTORY[1]}" + + printf '=========================================\n' + printf 'FieldTrack Rollback\n' + printf '=========================================\n' + printf 'Current deployment : %s\n' "$current_sha" + printf 'Rollback target : %s\n' "$previous_sha" + printf '\n' + + printf 'Validating rollback image exists...\n' + if ! docker manifest inspect "ghcr.io/fieldtrack-tech/api:$previous_sha" >/dev/null 2>&1; then + printf '[ERROR] Rollback image not found in registry: ghcr.io/fieldtrack-tech/api:%s\n' "$previous_sha" >&2 + exit 1 + fi + printf '✓ Rollback image verified.\n\n' + + if [ "$AUTO_MODE" = "false" ]; then + printf '⚠️ WARNING: This will replace the current deployment.\n' + read -r -p "Continue with rollback? (yes/no): " REPLY + if [[ ! 
$REPLY =~ ^[Yy][Ee][Ss]$ ]]; then + printf 'Rollback cancelled.\n' + exit 0 + fi + else + printf 'Auto rollback mode (CI).\n' + fi + + printf '\nStarting rollback to: %s\n\n' "$previous_sha" + export API_ROLLBACK_IN_PROGRESS=1 + IMAGE_SHA="$previous_sha" + main + + printf '\n=========================================\n' + printf 'Rollback completed: %s\n' "$previous_sha" + printf '=========================================\n' +} + +# =========================================================================== +# CONSTANTS (loaded after function definitions but before execution) +# =========================================================================== +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" +[ -d "$DEPLOY_ROOT" ] || { printf '[ERROR] DEPLOY_ROOT not found: %s\n' "$DEPLOY_ROOT" >&2; exit 1; } +REPO_DIR="$DEPLOY_ROOT" +INFRA_ROOT="${INFRA_ROOT:-/opt/infra}" + +BLUE_NAME="api-blue" +GREEN_NAME="api-green" +APP_PORT=3000 +NETWORK="api_network" +_FT_CURL_IMG="curlimages/curl:8.7.1" + +SLOT_DIR="/var/run/api" +ACTIVE_SLOT_FILE="$SLOT_DIR/active-slot" +SLOT_BACKUP_FILE="/var/lib/api/active-slot.backup" # persistent, survives reboots + +NGINX_CONF="$INFRA_ROOT/nginx/live/api.conf" +NGINX_LIVE_DIR="$INFRA_ROOT/nginx/live" +NGINX_BACKUP_DIR="$INFRA_ROOT/nginx/backup" +NGINX_TEMPLATE="$INFRA_ROOT/nginx/api.conf" +NGINX_BACKUP="" # set inside switch_nginx() + +MAX_HISTORY=5 +MAX_HEALTH_ATTEMPTS=40 +HEALTH_INTERVAL=3 + +LOCK_FILE="$SLOT_DIR/deploy.lock" +SNAP_DIR="$SLOT_DIR" +LAST_GOOD_FILE="$SNAP_DIR/last-good" + +# DEPLOY_HISTORY is set inside preflight() after _ft_load_env() +DEPLOY_HISTORY="" + +# ACTIVE/INACTIVE are set inside resolve_slot() +ACTIVE="" ACTIVE_NAME="" INACTIVE="" INACTIVE_NAME="" + +# IMAGE is set inside main() +IMAGE="" + +# =========================================================================== +# ENTRY POINT +# =========================================================================== 
+_ft_log "msg='deploy.sh invoked' mode=$MODE auto=$AUTO_MODE sha=${IMAGE_SHA:-} pid=$$" + +if [ "$MODE" = "rollback" ]; then + # For rollback we need env loaded early to find DEPLOY_HISTORY + _ft_load_env + DEPLOY_HISTORY="$DEPLOY_ROOT/.deploy_history" + rollback +else + main +fi diff --git a/scripts/load-env.sh b/scripts/load-env.sh deleted file mode 100644 index a00b5af..0000000 --- a/scripts/load-env.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env bash -# --------------------------------------------------------------------------- -# load-env.sh — Centralised environment loader for FieldTrack deploy scripts -# -# Source this file at the start of every deploy/rollback script: -# source "$(dirname "${BASH_SOURCE[0]}")/load-env.sh" -# -# After sourcing, the following are exported into the caller's environment: -# DEPLOY_ROOT — absolute path to the repository root on the VPS -# ENV_FILE — absolute path to .env -# API_HOSTNAME — bare hostname derived from API_BASE_URL (no scheme/path) -# -# All KEY=VALUE pairs from .env are also exported into the caller's -# process, so downstream scripts can reference any app env var directly. -# --------------------------------------------------------------------------- -set -euo pipefail - -# Disable trace to prevent secrets from leaking into logs -set +x 2>/dev/null || true - -# Derive repo root from this script's own location so the loader works -# regardless of the current working directory when it is sourced. -_LES_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -_LES_REPO="$(cd "$_LES_DIR/.." && pwd)" - -# ── DEPLOY_ROOT ───────────────────────────────────────────────────────────── -# Prefer an already-exported value (e.g. set explicitly by the CI SSH step); -# default to the canonical VPS deployment path under the current user's home. 
-export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - -[ -d "$DEPLOY_ROOT" ] || { - echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT" - echo " Expected repository root at: $HOME/api" - echo " If your repo is elsewhere, export DEPLOY_ROOT before running scripts." - if [ -d "$_LES_REPO" ]; then - echo " Detected script-relative repo candidate: $_LES_REPO" - fi - exit 1 -} - -# ── ENV_FILE ───────────────────────────────────────────────────────────────── -export ENV_FILE="$DEPLOY_ROOT/.env" - -if [ ! -f "$ENV_FILE" ]; then - echo "❌ Required .env file not found: $ENV_FILE" - echo " Create it from .env.example and populate all required values." - exit 1 -fi -echo "✓ .env file exists: $ENV_FILE" - -# ── Load all variables from .env ───────────────────────────────────────────── -# allexport is enabled so every KEY=VALUE assignment is automatically exported; -# disabled immediately after to avoid exporting any later shell variables. -set -o allexport -# shellcheck source=/dev/null -source "$ENV_FILE" -set +o allexport - -# ── Validate required variables ────────────────────────────────────────────── -_LES_MISSING="" -for _LES_VAR in API_BASE_URL CORS_ORIGIN; do - eval "_LES_VAL=\"\${${_LES_VAR}:-}\"" - if [ -z "$_LES_VAL" ]; then - _LES_MISSING="${_LES_MISSING} - ${_LES_VAR}\n" - fi -done - -if [ -n "$_LES_MISSING" ]; then - echo "❌ Missing required variables in $ENV_FILE:" - printf "%b" "$_LES_MISSING" - exit 1 -fi - -echo "✓ API_BASE_URL is set" -echo "✓ CORS_ORIGIN is set" - -# ── Derive API_HOSTNAME from API_BASE_URL ──────────────────────────────────── -# Use bash-safe parsing (no Node.js dependency for VPS compatibility) -# Strip protocol (http:// or https://) and take first path segment -API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1) - -# Validate: result must be a non-empty bare hostname (or host:port). 
-# Reject if it contains whitespace, path separators, credential markers (@), -# or query/fragment characters — any of these indicate a malformed API_BASE_URL. -if [ -z "$API_HOSTNAME" ] || printf '%s' "$API_HOSTNAME" | grep -qE '[[:space:]/@?#]'; then - echo "❌ Invalid API_HOSTNAME derived from API_BASE_URL='$API_BASE_URL'" - echo " Expected a bare hostname or host:port — e.g.: api.example.com" - echo " Got: '$API_HOSTNAME'" - echo " Check that API_BASE_URL has no embedded credentials, spaces, or bare paths." - exit 1 -fi - -export API_HOSTNAME -echo "✓ API_HOSTNAME: $API_HOSTNAME" - -# Clean up internal variables so they do not leak into the caller's scope. -unset _LES_DIR _LES_REPO _LES_VAR _LES_VAL _LES_MISSING diff --git a/scripts/load-testing/README.md b/scripts/load-testing/README.md deleted file mode 100644 index e212f9a..0000000 --- a/scripts/load-testing/README.md +++ /dev/null @@ -1,127 +0,0 @@ -# FieldTrack Phase 24 — Load Testing - -Load tests are written for [k6](https://k6.io/) — a modern open-source load testing tool. 
- -## Prerequisites - -Install k6: https://k6.io/docs/getting-started/installation/ - -```bash -# macOS -brew install k6 - -# Windows (winget) -winget install k6 - -# Linux (Debian/Ubuntu) -sudo gpg -k && sudo gpg --no-default-keyring --keyring /usr/share/keyrings/k6-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys C5AD17C747E3415A3642D57D77C6C491D6AC1D69 -echo "deb [signed-by=/usr/share/keyrings/k6-archive-keyring.gpg] https://dl.k6.io/deb stable main" | sudo tee /etc/apt/sources.list.d/k6.list -sudo apt-get update && sudo apt-get install k6 -``` - -## Environment Variables - -| Variable | Description | -|------------------|------------------------------------| -| `BASE_URL` | API base URL (default: prod) | -| `ADMIN_TOKEN` | Valid admin JWT | -| `EMPLOYEE_TOKEN` | Valid employee JWT | - -## Scripts - -### `dashboard-load-test.js` -Simulates **50 concurrent admins** polling `/admin/dashboard` and `/admin/sessions`. - -**Targets:** dashboard p95 < 1000 ms · sessions p95 < 800 ms · error rate < 1% - -> **Phase 24 note:** The dashboard now uses a single indexed `org_dashboard_snapshot` PK lookup. -> The tighter p95 < 100 ms target from Phase 22 has been replaced with a realistic 1000 ms budget -> that accounts for cold-cache misses and network latency. - -```bash -k6 run dashboard-load-test.js \ - -e BASE_URL=https://api.fieldtrack.meowsician.tech \ - -e ADMIN_TOKEN= -``` - ---- - -### `map-load-test.js` -Simulates **20 concurrent monitoring clients** polling `/admin/monitoring/map` every 30 seconds. - -**Target:** p95 < 200 ms · error rate < 1% - -```bash -k6 run map-load-test.js \ - -e BASE_URL=https://api.fieldtrack.meowsician.tech \ - -e ADMIN_TOKEN= -``` - ---- - -### `expenses-load-test.js` -Simulates **100 concurrent employees** submitting expense claims and listing their expenses. - -**Targets:** POST p95 < 300 ms · GET p95 < 200 ms · error rate < 1% - -> **Warning:** writes real data — use a staging environment or clean up afterward. 
- -```bash -k6 run expenses-load-test.js \ - -e BASE_URL=https://api.fieldtrack.meowsician.tech \ - -e EMPLOYEE_TOKEN= -``` - ---- - -### `queue-impact-test.js` -Simulates a **burst of 30 concurrent checkouts** to stress the BullMQ worker queues, then monitors `/admin/queues` for 2 minutes to verify the backlog drains. - -**Targets:** checkout p95 < 400 ms · analytics queue depth < 500 · DLQ < 10 - -```bash -k6 run queue-impact-test.js \ - -e BASE_URL=https://api.fieldtrack.meowsician.tech \ - -e EMPLOYEE_TOKEN= \ - -e ADMIN_TOKEN= -``` - -## API Response Structure - -All scripts parse JSON bodies. The API always returns an envelope: - -| Endpoint | Shape | -|---|---| -| `GET /admin/dashboard` | `{ success: true, data: { activeEmployeeCount, recentEmployeeCount, ... } }` | -| `GET /admin/sessions` | `{ success: true, data: SessionDTO[], pagination: { page, limit, total } }` | -| `GET /admin/monitoring/map` | `{ success: true, data: EmployeeMapMarker[] }` | -| `POST /expenses` | `{ success: true, data: { id, amount, description, ... } }` | -| `GET /expenses/my` | `{ success: true, data: Expense[], pagination: { page, limit, total } }` | -| `GET /admin/queues` | `{ success: true, queues: { analytics: { waiting, active, completed, failed, dlq }, distance: { ... } } }` | - -> **Note:** `pagination` appears at the response root alongside `data`, not nested inside `data`. -> The `/admin/queues` endpoint uses a `queues` key instead of `data`. - -## Metrics and Error Rate - -All scripts maintain two categories of checks: - -- **Correctness checks** (feed `error_rate`): HTTP status code + `success === true` + required body fields. - A request only increments `error_rate` when the API returns the wrong status or a malformed body. -- **Latency checks** (observability only): Response time assertions inside a separate `check()` call - that does **not** feed `error_rate`. Slow-but-correct responses do not inflate the error counter. 
- -This means `error_rate < 0.01` measures real API failures, not congestion. - -## Running All Tests Sequentially - -```bash -BASE_URL=https://api.fieldtrack.meowsician.tech -ADMIN_TOKEN= -EMPLOYEE_TOKEN= - -for script in dashboard-load-test.js map-load-test.js expenses-load-test.js queue-impact-test.js; do - echo "=== Running $script ===" - k6 run "$script" -e BASE_URL="$BASE_URL" -e ADMIN_TOKEN="$ADMIN_TOKEN" -e EMPLOYEE_TOKEN="$EMPLOYEE_TOKEN" -done -``` diff --git a/scripts/load-testing/dashboard-load-test.js b/scripts/load-testing/dashboard-load-test.js deleted file mode 100644 index 4fa8934..0000000 --- a/scripts/load-testing/dashboard-load-test.js +++ /dev/null @@ -1,124 +0,0 @@ -/** - * FieldTrack Phase 23 — Dashboard Load Test - * - * Simulates 50 concurrent admin users polling the dashboard and sessions - * endpoints over a 2-minute steady state period. - * - * Run: - * k6 run dashboard-load-test.js \ - * -e BASE_URL=https://api.getfieldtrack.app \ - * -e ADMIN_TOKEN= - * - * Performance targets: - * p95 latency < 1000 ms (/admin/dashboard) - * p95 latency < 800 ms (/admin/sessions) - * error rate < 1 % - * - * NOTE on rate limiting: - * All 50 VUs share a single ADMIN_TOKEN, so they appear as ONE user to the - * per-token rate limiter (1200 req/min). 50 VUs × ~12 req/min ≈ 600 req/min - * — comfortably within budget. In production, 50 real admins would each hold - * their own token and each get the full 1200 req/min quota. 
- */ - -import http from "k6/http"; -import { check, sleep } from "k6"; -import { Trend, Rate, Counter } from "k6/metrics"; - -// ─── Custom metrics ───────────────────────────────────────────────────────── - -const dashboardDuration = new Trend("dashboard_duration_ms", true); -const sessionsDuration = new Trend("sessions_duration_ms", true); -const errorRate = new Rate("error_rate"); -const requestsTotal = new Counter("requests_total"); - -// ─── Test options ──────────────────────────────────────────────────────────── - -export const options = { - scenarios: { - dashboard_polling: { - executor: "constant-vus", - vus: 50, - duration: "2m", - }, - }, - thresholds: { - // Performance targets updated in Phase 24 (O(1) snapshot query) - dashboard_duration_ms: ["p(95)<1000"], - sessions_duration_ms: ["p(95)<800"], - error_rate: ["rate<0.01"], - http_req_failed: ["rate<0.01"], - }, -}; - -// ─── Helpers ───────────────────────────────────────────────────────────────── - -const BASE_URL = __ENV.BASE_URL || "https://api.getfieldtrack.app"; -const ADMIN_TOKEN = __ENV.ADMIN_TOKEN || ""; - -function authHeaders() { - return { - Authorization: `Bearer ${ADMIN_TOKEN}`, - "Content-Type": "application/json", - "Accept-Encoding": "gzip, br", - }; -} - -// ─── Default scenario ───────────────────────────────────────────────────────── - -export default function () { - const headers = authHeaders(); - - // ── /admin/dashboard ───────────────────────────────────────────────────── - const dashRes = http.get(`${BASE_URL}/admin/dashboard`, { headers, tags: { name: "admin_dashboard" } }); - requestsTotal.add(1); - dashboardDuration.add(dashRes.timings.duration); - - // Correctness check — only logical failures increment error_rate - const dashOk = check(dashRes, { - "dashboard status 200": (r) => r.status === 200, - "dashboard response is success": (r) => { - try { return JSON.parse(r.body).success === true; } catch { return false; } - }, - "dashboard has activeEmployeeCount": (r) => { - 
try { - const body = JSON.parse(r.body); - return typeof body.data?.activeEmployeeCount === "number"; - } catch { - return false; - } - }, - }); - // Latency check — observability only, does not affect error_rate - check(dashRes, { "dashboard response time < 500ms": (r) => r.timings.duration < 500 }); - errorRate.add(!dashOk); - - sleep(0.5); - - // ── /admin/sessions ────────────────────────────────────────────────────── - const sessRes = http.get(`${BASE_URL}/admin/sessions?limit=50`, { headers, tags: { name: "admin_sessions" } }); - requestsTotal.add(1); - sessionsDuration.add(sessRes.timings.duration); - - // Correctness check — only logical failures increment error_rate - const sessOk = check(sessRes, { - "sessions status 200": (r) => r.status === 200, - "sessions response is success": (r) => { - try { return JSON.parse(r.body).success === true; } catch { return false; } - }, - "sessions has pagination": (r) => { - try { - const body = JSON.parse(r.body); - return typeof body.pagination?.total === "number"; - } catch { - return false; - } - }, - }); - // Latency check — observability only, does not affect error_rate - check(sessRes, { "sessions response time < 500ms": (r) => r.timings.duration < 500 }); - errorRate.add(!sessOk); - - // Simulate realistic admin polling cadence — 5 s between full refreshes - sleep(5); -} diff --git a/scripts/load-testing/expenses-load-test.js b/scripts/load-testing/expenses-load-test.js deleted file mode 100644 index c839411..0000000 --- a/scripts/load-testing/expenses-load-test.js +++ /dev/null @@ -1,134 +0,0 @@ -/** - * FieldTrack Phase 23 — Expense Workflow Load Test - * - * Simulates 100 concurrent employees submitting expense claims and then - * retrieving their expense list. Validates that the API remains responsive - * under realistic bulk-submission conditions (e.g. end-of-month expense flush). 
- * - * Run: - * k6 run expenses-load-test.js \ - * -e BASE_URL=https://api.getfieldtrack.app \ - * -e EMPLOYEE_TOKEN= - * - * NOTE: This test writes real data. Run against a staging environment or clean - * up submitted expenses afterwards via the Supabase dashboard / admin API. - * - * Performance targets: - * POST /expenses p95 < 300 ms - * GET /expenses/my p95 < 200 ms - * error rate < 1 % - */ - -import http from "k6/http"; -import { check, sleep } from "k6"; -import { Trend, Rate, Counter } from "k6/metrics"; - -// ─── Custom metrics ───────────────────────────────────────────────────────── - -const submitDuration = new Trend("expense_submit_duration_ms", true); -const listDuration = new Trend("expense_list_duration_ms", true); -const errorRate = new Rate("error_rate"); -const requestsTotal = new Counter("requests_total"); - -// ─── Test options ──────────────────────────────────────────────────────────── - -export const options = { - scenarios: { - expense_submission: { - executor: "constant-vus", - vus: 100, - duration: "2m", - }, - }, - thresholds: { - expense_submit_duration_ms: ["p(95)<300"], - expense_list_duration_ms: ["p(95)<200"], - error_rate: ["rate<0.01"], - http_req_failed: ["rate<0.01"], - }, -}; - -// ─── Helpers ───────────────────────────────────────────────────────────────── - -const BASE_URL = __ENV.BASE_URL || "https://api.getfieldtrack.app"; -// Each VU can use the same employee token in a load test (shared org context) -const EMPLOYEE_TOKEN = __ENV.EMPLOYEE_TOKEN || ""; - -function authHeaders() { - return { - Authorization: `Bearer ${EMPLOYEE_TOKEN}`, - "Content-Type": "application/json", - "Accept-Encoding": "gzip, br", - }; -} - -// ─── Default scenario ───────────────────────────────────────────────────────── - -export default function () { - const headers = authHeaders(); - const vu = __VU; - const iter = __ITER; - - // ── POST /expenses — submit a new expense claim ─────────────────────────── - const payload = JSON.stringify({ - 
amount: Math.round((10 + Math.random() * 490) * 100) / 100, - description: `Load test expense — VU ${vu} iteration ${iter}`, - }); - - const submitRes = http.post(`${BASE_URL}/expenses`, payload, { - headers, - tags: { name: "expense_submit" }, - }); - - requestsTotal.add(1); - submitDuration.add(submitRes.timings.duration); - - // Correctness check — only logical failures increment error_rate - const submitOk = check(submitRes, { - "expense submit 201": (r) => r.status === 201, - "expense response is success": (r) => { - try { return JSON.parse(r.body).success === true; } catch { return false; } - }, - "expense has id": (r) => { - try { - const body = JSON.parse(r.body); - return typeof body.data?.id === "string"; - } catch { - return false; - } - }, - }); - // Latency check — observability only, does not affect error_rate - check(submitRes, { "expense submit < 1s": (r) => r.timings.duration < 1000 }); - errorRate.add(!submitOk); - - sleep(1); - - // ── GET /expenses/my — list own expenses ────────────────────────────────── - const listRes = http.get(`${BASE_URL}/expenses/my?limit=20`, { - headers, - tags: { name: "expense_list" }, - }); - - requestsTotal.add(1); - listDuration.add(listRes.timings.duration); - - const listOk = check(listRes, { - "expense list 200": (r) => r.status === 200, - "expense list response is success": (r) => { - try { return JSON.parse(r.body).success === true; } catch { return false; } - }, - "expense list has pagination": (r) => { - try { - const body = JSON.parse(r.body); - return typeof body.pagination?.total === "number"; - } catch { - return false; - } - }, - }); - errorRate.add(!listOk); - - // Simulate realistic inter-request think time - sleep(2 + Math.random() * 3); -} diff --git a/scripts/load-testing/map-load-test.js b/scripts/load-testing/map-load-test.js deleted file mode 100644 index 79a3584..0000000 --- a/scripts/load-testing/map-load-test.js +++ /dev/null @@ -1,92 +0,0 @@ -/** - * FieldTrack Phase 23 — Monitoring Map Load 
Test - * - * Simulates 20 concurrent monitoring clients that poll the live map endpoint - * every 30 seconds, mirroring the production frontend SSE/polling cadence. - * - * Run: - * k6 run map-load-test.js \ - * -e BASE_URL=https://api.getfieldtrack.app \ - * -e ADMIN_TOKEN= - * - * Performance target: - * p95 latency < 200 ms - * error rate < 1 % - */ - -import http from "k6/http"; -import { check, sleep } from "k6"; -import { Trend, Rate, Counter } from "k6/metrics"; - -// ─── Custom metrics ───────────────────────────────────────────────────────── - -const mapDuration = new Trend("map_duration_ms", true); -const errorRate = new Rate("error_rate"); -const requestsTotal = new Counter("requests_total"); - -// ─── Test options ──────────────────────────────────────────────────────────── - -export const options = { - scenarios: { - live_map_polling: { - executor: "constant-vus", - vus: 20, - duration: "3m", - }, - }, - thresholds: { - map_duration_ms: ["p(95)<200"], - error_rate: ["rate<0.01"], - http_req_failed: ["rate<0.01"], - }, -}; - -// ─── Helpers ───────────────────────────────────────────────────────────────── - -const BASE_URL = __ENV.BASE_URL || "https://api.getfieldtrack.app"; -const ADMIN_TOKEN = __ENV.ADMIN_TOKEN || ""; - -function authHeaders() { - return { - Authorization: `Bearer ${ADMIN_TOKEN}`, - "Accept-Encoding": "gzip, br", - }; -} - -// ─── Default scenario ───────────────────────────────────────────────────────── - -export default function () { - const headers = authHeaders(); - - const res = http.get(`${BASE_URL}/admin/monitoring/map`, { - headers, - tags: { name: "monitoring_map" }, - }); - - requestsTotal.add(1); - mapDuration.add(res.timings.duration); - - // Correctness check — only logical failures increment error_rate - const ok = check(res, { - "map status 200": (r) => r.status === 200, - "map response is success": (r) => { - try { return JSON.parse(r.body).success === true; } catch { return false; } - }, - "map has markers array": (r) 
=> { - try { - const body = JSON.parse(r.body); - return Array.isArray(body.data); - } catch { - return false; - } - }, - "map content-encoding compressed": (r) => - r.headers["Content-Encoding"] !== undefined || r.body.length > 0, - }); - // Latency check — observability only, does not affect error_rate - check(res, { "map response time < 500ms": (r) => r.timings.duration < 500 }); - errorRate.add(!ok); - - // Simulate 30-second polling interval (realistic monitoring cadence) - sleep(30); -} diff --git a/scripts/load-testing/queue-impact-test.js b/scripts/load-testing/queue-impact-test.js deleted file mode 100644 index 36e304c..0000000 --- a/scripts/load-testing/queue-impact-test.js +++ /dev/null @@ -1,146 +0,0 @@ -/** - * FieldTrack Phase 23 — Queue Impact Load Test - * - * Simulates a burst of session checkouts to stress the distance and analytics - * worker queues. After the burst, the script polls /admin/queues to watch the - * backlog drain and verify the queues recover within the target SLA. - * - * Run: - * k6 run queue-impact-test.js \ - * -e BASE_URL=https://api.getfieldtrack.app \ - * -e EMPLOYEE_TOKEN= \ - * -e ADMIN_TOKEN= - * - * NOTE: This test checks out real sessions. Pre-create checked-in sessions - * in a staging environment or use the smoke-test helper to seed data first. 
- * - * Metrics monitored: - * analytics_queue_depth — Prometheus gauge via /admin/queues - * checkout latency — POST /attendance/check-out p95 - * queue drain time — how quickly depth returns to 0 - */ - -import http from "k6/http"; -import { check, sleep } from "k6"; -import { Trend, Rate, Counter, Gauge } from "k6/metrics"; - -// ─── Custom metrics ───────────────────────────────────────────────────────── - -const checkoutDuration = new Trend("checkout_duration_ms", true); -const queueDepth = new Gauge("analytics_queue_depth_observed"); -const errorRate = new Rate("error_rate"); -const requestsTotal = new Counter("requests_total"); - -// ─── Test options ──────────────────────────────────────────────────────────── - -export const options = { - scenarios: { - // Phase 1: burst checkout load (simulates end-of-day mass checkout) - checkout_burst: { - executor: "constant-vus", - vus: 30, - duration: "30s", - tags: { phase: "burst" }, - }, - // Phase 2: queue drain monitoring — starts after the burst ends - queue_drain_monitor: { - executor: "constant-vus", - vus: 1, - startTime: "35s", - duration: "2m", - tags: { phase: "monitor" }, - }, - }, - thresholds: { - // Checkout must stay fast even under queue pressure - checkout_duration_ms: ["p(95)<400"], - error_rate: ["rate<0.05"], - http_req_failed: ["rate<0.05"], - }, -}; - -// ─── Helpers ───────────────────────────────────────────────────────────────── - -const BASE_URL = __ENV.BASE_URL || "https://api.getfieldtrack.app"; -const EMPLOYEE_TOKEN = __ENV.EMPLOYEE_TOKEN || ""; -const ADMIN_TOKEN = __ENV.ADMIN_TOKEN || ""; - -function empHeaders() { - return { - Authorization: `Bearer ${EMPLOYEE_TOKEN}`, - "Content-Type": "application/json", - }; -} - -function adminHeaders() { - return { - Authorization: `Bearer ${ADMIN_TOKEN}`, - "Content-Type": "application/json", - }; -} - -// ─── Checkout burst scenario ────────────────────────────────────────────────── - -export function checkoutBurst() { - // POST check-out 
triggers distance + analytics job enqueue - const res = http.post( - `${BASE_URL}/attendance/check-out`, - "{}", - { headers: empHeaders(), tags: { name: "checkout" } }, - ); - - requestsTotal.add(1); - checkoutDuration.add(res.timings.duration); - - // Correctness check — only logical failures increment error_rate - const ok = check(res, { - // 200 = checked out successfully; 409 = no open session (idempotent) - "checkout accepted": (r) => r.status === 200 || r.status === 409, - }); - // Latency check — observability only, does not affect error_rate - check(res, { "checkout < 1s": (r) => r.timings.duration < 1000 }); - errorRate.add(!ok); - - sleep(1); -} - -// ─── Queue drain monitor scenario ───────────────────────────────────────────── - -export function queueDrainMonitor() { - const res = http.get(`${BASE_URL}/admin/queues`, { - headers: adminHeaders(), - tags: { name: "queue_stats" }, - }); - - requestsTotal.add(1); - - if (res.status === 200) { - try { - const body = JSON.parse(res.body); - const analyticsWaiting = body.queues?.analytics?.waiting ?? -1; - const distanceWaiting = body.queues?.distance?.waiting ?? -1; - - queueDepth.add(analyticsWaiting + distanceWaiting); - - check(res, { - "queue depth within SLA (<500)": () => - analyticsWaiting + distanceWaiting < 500, - "no DLQ overflow (<10)": () => - (body.queues?.analytics?.dlq?.waiting ?? 0) < 10, - }); - } catch { /* parse error — log as failure */ } - } - - // Poll every 10 seconds - sleep(10); -} - -// ─── Default function — routes to correct scenario function ─────────────────── -// k6 uses exec tags to map VUs to named functions when using "scenarios" config. -// The default export is only called when no `exec` is specified on a scenario. -// Since we have two named scenarios above, we point each one at its function. - -export default function () { - // Fallback: if run without scenarios config, execute the checkout burst. 
- checkoutBurst(); -} diff --git a/scripts/monitoring-sync.sh b/scripts/monitoring-sync.sh deleted file mode 100644 index 4273f0a..0000000 --- a/scripts/monitoring-sync.sh +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# monitoring-sync.sh — Self-Healing Monitoring Stack Sync -# -# Called by the CI sync-monitoring job after every production deploy. -# -# Responsibilities: -# 1. SELF-HEAL — create missing .env.monitoring from example if absent -# 2. BOOTSTRAP — detect placeholder values and warn (cold-start mode) -# 3. ENSURE NETWORK — create api_network if it does not exist -# 4. SYNC — idempotent `docker compose up -d` (starts if down, no-ops if healthy) -# 5. VALIDATE — confirm prometheus / grafana / alertmanager are running + healthy -# 6. ENFORCE — exit 1 if any required container is not healthy after timeout -# -# Self-healing rules (safe defaults): -# - .env.monitoring missing → copy from infra/.env.monitoring.example + warn -# - .env.monitoring has placeholders (change-me) → skip health wait, warn operator -# - api_network missing → create it -# - alertmanager rendered config missing → render it -# -# Timeouts: -# - Per-container health check: 60 seconds max (20 attempts × 3 s) -# - Polling interval: 3 seconds -# - Total wait tracked to prevent cascading timeouts -# -# Exit codes: -# 0 All required monitoring containers are healthy -# 1 One or more required containers failed to become healthy (deploy must fail) -# -# Required env (exported by load-env.sh / present in DEPLOY_ROOT): -# DEPLOY_ROOT — absolute path to the repository root on the VPS -# ============================================================================= -set -euo pipefail -trap '_ft_mon_trap "$LINENO"' ERR - -# ───────────────────────────────────────────────────────────────────────── -# STATE CLASSIFICATION -# ───────────────────────────────────────────────────────────────────────── 
-DEPLOY_STATE="SUCCESS" -trap '[ $? -ne 0 ] && DEPLOY_STATE="FAILED" || true' EXIT - -# --------------------------------------------------------------------------- -# LOGGING -# --------------------------------------------------------------------------- -_FT_MON_LOG_FILE="${DEPLOY_LOG_FILE:-/var/log/api/deploy.log}" -_LOG_DIR="$(dirname "$_FT_MON_LOG_FILE")" -if ! mkdir -p "$_LOG_DIR" 2>/dev/null; then - _LOG_DIR="$HOME/api/logs" - _FT_MON_LOG_FILE="$_LOG_DIR/deploy.log" - mkdir -p "$_LOG_DIR" -fi - -_log() { - printf '[MON-SYNC] ts=%s %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$*" \ - | tee -a "$_FT_MON_LOG_FILE" >&2 -} - -_ft_mon_trap() { - printf '[MON-SYNC] ts=%s level=ERROR msg="unexpected failure at line %s"\n' \ - "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$1" >&2 -} - -# --------------------------------------------------------------------------- -# RESOLVE PATHS -# --------------------------------------------------------------------------- -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" - -if [ ! -d "$DEPLOY_ROOT" ]; then - _log "level=ERROR msg='DEPLOY_ROOT not found' path=$DEPLOY_ROOT" - exit 1 -fi - -INFRA_DIR="$DEPLOY_ROOT/infra" -MON_ENV="$INFRA_DIR/.env.monitoring" -MON_ENV_EXAMPLE="$INFRA_DIR/.env.monitoring.example" -MON_COMPOSE="$INFRA_DIR/docker-compose.monitoring.yml" -ALERTMANAGER_RENDERED="$INFRA_DIR/alertmanager/alertmanager.rendered.yml" -RENDER_SCRIPT="$INFRA_DIR/scripts/render-alertmanager.sh" - -_log "msg='monitoring-sync started' deploy_root=$DEPLOY_ROOT state=$DEPLOY_STATE" - -# --------------------------------------------------------------------------- -# STEP 1 — SELF-HEAL: .env.monitoring -# Create from example if missing instead of failing hard. -# The user MUST still fill in real values after first-time creation. -# --------------------------------------------------------------------------- -BOOTSTRAP_MODE=false -if [ ! 
-f "$MON_ENV" ]; then - if [ -f "$MON_ENV_EXAMPLE" ]; then - cp "$MON_ENV_EXAMPLE" "$MON_ENV" - chmod 600 "$MON_ENV" - BOOTSTRAP_MODE=true - _log "level=WARN msg='monitoring env file missing — created from example' path=$MON_ENV" - _log "level=WARN msg='ACTION REQUIRED: edit $MON_ENV with real GRAFANA_ADMIN_PASSWORD, METRICS_SCRAPE_TOKEN, ALERTMANAGER_SLACK_WEBHOOK'" - else - _log "level=ERROR msg='monitoring env file and example both missing' path=$MON_ENV" - DEPLOY_STATE="FAILED" - exit 1 - fi -else - chmod 600 "$MON_ENV" - _log "msg='monitoring env file exists' path=$MON_ENV" -fi - -# ───────────────────────────────────────────────────────────────────────── -# STEP 1B — BOOTSTRAP MODE: Detect placeholders -# If .env.monitoring contains default 'change-me' values, we're in cold-start. -# Skip health polling to avoid timeout on misconfigured system. -# ───────────────────────────────────────────────────────────────────────── -if grep -q "change-me" "$MON_ENV" 2>/dev/null; then - BOOTSTRAP_MODE=true - _log "level=WARN msg='bootstrap mode detected: .env.monitoring contains placeholder values' action='skipping health check'" - _log "level=WARN msg='OPERATOR ACTION: edit infra/.env.monitoring and set real values, then re-run deploy'" -fi - -# --------------------------------------------------------------------------- -# STEP 2 — SELF-HEAL: Docker network api_network -# --------------------------------------------------------------------------- -if ! docker network ls --format '{{.Name}}' | grep -Eq '^api_network$'; then - _log "msg='api_network missing — creating' driver=bridge" - docker network create --driver bridge api_network - _log "msg='api_network created'" -else - _log "msg='api_network exists'" -fi - -# --------------------------------------------------------------------------- -# STEP 3 — SELF-HEAL: Render alertmanager config -# render-alertmanager.sh is idempotent; always safe to run. 
-# --------------------------------------------------------------------------- -if [ -x "$RENDER_SCRIPT" ]; then - _log "msg='rendering alertmanager config'" - bash "$RENDER_SCRIPT" - _log "msg='alertmanager config rendered' file=$ALERTMANAGER_RENDERED" -elif [ ! -f "$ALERTMANAGER_RENDERED" ]; then - _log "level=ERROR msg='render-alertmanager.sh not found AND rendered config missing' script=$RENDER_SCRIPT" - exit 1 -else - _log "level=WARN msg='render-alertmanager.sh not found but rendered config exists — continuing' script=$RENDER_SCRIPT" -fi - -# --------------------------------------------------------------------------- -# STEP 4 — SYNC: docker compose up -d (idempotent) -# Creates containers that are missing; leaves healthy containers untouched. -# --------------------------------------------------------------------------- -_log "msg='starting monitoring stack (idempotent)'" -cd "$INFRA_DIR" -docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml up -d --remove-orphans -cd "$DEPLOY_ROOT" -_log "msg='docker compose up -d complete'" - -# --------------------------------------------------------------------------- -# STEP 5 — VALIDATE: wait for required containers to become healthy -# -# Required containers (must be healthy for deploy to succeed): -# prometheus — metrics collection (health: http://prometheus:9090/-/healthy) -# alertmanager — alert routing (health: http://alertmanager:9093/-/healthy) -# grafana — dashboards (health: http://grafana:3000/api/health) -# -# Strategy: poll docker inspect for Health.Status via Docker service DNS. -# Times out at 60 s per container (20 attempts × 3 s). -# Note: Using service names (not localhost) because containers are in Docker network only. 
-# --------------------------------------------------------------------------- - -_wait_container_healthy() { - local name="$1" - local max_wait_sec="${2:-60}" - local interval="${3:-3}" - - _log "msg='waiting for container health' container=$name max_wait_sec=$max_wait_sec interval=$interval" - - local waited=0 - while [ $waited -lt $max_wait_sec ]; do - # Explicit container name enforcement: use docker inspect directly. - # Avoids fragile grep patterns; fails fast if container name is wrong. - if ! docker inspect "$name" >/dev/null 2>&1; then - _log "level=WARN msg='container does not exist or inspect failed' container=$name waited_sec=$waited" - sleep "$interval" - waited=$((waited + interval)) - continue - fi - - # Container exists — check health status - local health_status - health_status=$(docker inspect --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}no-healthcheck{{end}}' "$name" 2>/dev/null || echo "inspect-failed") - - case "$health_status" in - healthy) - _log "msg='container healthy' container=$name waited_sec=$waited" - return 0 - ;; - no-healthcheck) - # Container has no Docker healthcheck — verify it is at least running. 
- local running - running=$(docker inspect --format='{{.State.Running}}' "$name" 2>/dev/null || echo "false") - if [ "$running" = "true" ]; then - _log "msg='container running (no healthcheck configured)' container=$name" - return 0 - fi - ;; - starting) - _log "msg='container starting' container=$name waited_sec=$waited/$max_wait_sec" - ;; - unhealthy) - _log "level=WARN msg='container unhealthy' container=$name waited_sec=$waited/$max_wait_sec" - ;; - inspect-failed) - _log "level=WARN msg='docker inspect failed' container=$name waited_sec=$waited" - ;; - *) - _log "level=WARN msg='unknown health status' container=$name status=$health_status waited_sec=$waited" - ;; - esac - - sleep "$interval" - waited=$((waited + interval)) - done - - _log "level=ERROR msg='container did not become healthy within timeout' container=$name max_wait_sec=$max_wait_sec" - docker logs "$name" --tail 30 >&2 2>/dev/null || true - return 1 -} - -_check_endpoint() { - # Execute the health check INSIDE the container via docker exec. - # Monitoring containers live only on api_network and are NOT reachable via - # host-side DNS — their names (prometheus, alertmanager, grafana) only - # resolve from other containers on the same Docker network. - # Prefer wget (present in prom/* alpine images); fall back to curl (grafana). 
- local name="$1" - local url="$2" - - if docker exec "$name" wget --spider -q "$url" >/dev/null 2>&1; then - _log "msg='endpoint healthy' container=$name url=$url" - return 0 - elif docker exec "$name" curl -sf --max-time 5 "$url" >/dev/null 2>&1; then - _log "msg='endpoint healthy (curl)' container=$name url=$url" - return 0 - else - _log "level=ERROR msg='endpoint unhealthy' container=$name url=$url" - return 1 - fi -} - -# ───────────────────────────────────────────────────────────────────────── -# SKIP HEALTH CHECKS IN BOOTSTRAP MODE -# ───────────────────────────────────────────────────────────────────────── -if [ "$BOOTSTRAP_MODE" = "true" ]; then - DEPLOY_STATE="BOOTSTRAP" - _log "level=WARN msg='bootstrap mode detected — skipping container health checks' state=$DEPLOY_STATE" - _log "level=WARN msg='ACTION: configure infra/.env.monitoring with real values and re-run deploy to enable monitoring'" - exit 0 -fi - -# ───────────────────────────────────────────────────────────────────────── -# ENFORCE: Container name validation + health checks -# ───────────────────────────────────────────────────────────────────────── -# Exact container name enforcement: fail fast if any required container is missing -REQUIRED_CONTAINERS=("prometheus" "alertmanager" "grafana") -for c in "${REQUIRED_CONTAINERS[@]}"; do - if ! 
docker inspect "$c" >/dev/null 2>&1; then - _log "level=ERROR msg='required container missing' container=$c" - DEPLOY_STATE="FAILED" - docker ps --format 'table {{.Names}}\t{{.Status}}' 2>/dev/null >&2 || true - exit 1 - fi -done - -MONITORING_ERRORS=0 - -# ── Prometheus ────────────────────────────────────────────────────────────── -if _wait_container_healthy "prometheus" 60 3; then - _check_endpoint "prometheus" "http://prometheus:9090/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -else - MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -fi - -# ── Alertmanager ───────────────────────────────────────────────────────────── -if _wait_container_healthy "alertmanager" 60 3; then - _check_endpoint "alertmanager" "http://alertmanager:9093/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -else - MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -fi - -# ── Grafana ────────────────────────────────────────────────────────────────── -# Grafana may take longer to start; allow 60s timeout. -if _wait_container_healthy "grafana" 60 3; then - # Grafana health endpoint returns 200 with JSON when ready. - _check_endpoint "grafana" "http://grafana:3000/api/health" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -else - MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -fi - -# --------------------------------------------------------------------------- -# STABILITY WINDOW — Verify containers remain healthy after initial pass -# This catches "flaky startup" where containers pass health check but crash -# immediately after. Wait settle window then re-verify all containers. 
-# --------------------------------------------------------------------------- -_log "msg='entering stability window (5s settle + re-check)'" -sleep 5 - -for c in "${REQUIRED_CONTAINERS[@]}"; do - STABLE_STATUS=$(docker inspect --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}running{{end}}' "$c" 2>/dev/null || echo "inspect-failed") - if [ "$STABLE_STATUS" != "healthy" ] && [ "$STABLE_STATUS" != "running" ]; then - _log "level=ERROR msg='container became unhealthy during stability window' container=$c status=$STABLE_STATUS" - MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) - fi -done - -# --------------------------------------------------------------------------- -# PROMETHEUS SCRAPING VALIDATION — Ensure Prometheus is actually working -# A healthy Prometheus container is useless if it's not scraping targets. -# Query the Prometheus API to verify targets are UP. -# --------------------------------------------------------------------------- -_log "msg='validating prometheus scraping targets'" -# Use docker exec to query the Prometheus API from inside the container. -# The prometheus container name is only resolvable within api_network, not from the host. -PROM_TARGETS=$(docker exec prometheus wget -qO- "http://localhost:9090/api/v1/targets" 2>/dev/null || echo "") - -if [ -z "$PROM_TARGETS" ]; then - _log "level=WARN msg='prometheus API query failed — cannot validate scraping (proceeding with caution)'" -elif ! 
echo "$PROM_TARGETS" | grep -q '"health":"up"' 2>/dev/null; then - _log "level=ERROR msg='prometheus has no healthy scrape targets' curl_response=${PROM_TARGETS:0:200}" - MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) -else - # Count active targets - ACTIVE_TARGETS=$(echo "$PROM_TARGETS" | grep -o '"health":"up"' | wc -l) - _log "msg='prometheus scraping targets' active_count=$ACTIVE_TARGETS" -fi - -# --------------------------------------------------------------------------- -# FINAL ENFORCEMENT -# --------------------------------------------------------------------------- -if [ "$MONITORING_ERRORS" -gt 0 ]; then - _log "level=ERROR msg='monitoring validation failed' errors=$MONITORING_ERRORS state=$DEPLOY_STATE" - _log "level=ERROR msg='container state at failure:'" - docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Ports}}' 2>/dev/null >&2 || true - DEPLOY_STATE="FAILED" - exit 1 -fi - -_log "msg='monitoring-sync complete' state=$DEPLOY_STATE containers=healthy required=3" -exit 0 diff --git a/scripts/rollback.sh b/scripts/rollback.sh deleted file mode 100644 index 2f2ef11..0000000 --- a/scripts/rollback.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x -trap '[[ "${BASH_COMMAND}" != _ft_log* ]] && printf "[DEPLOY] ts=%s state=ROLLBACK level=ERROR msg=\"rollback script failed at line %s\"\n" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$LINENO"' ERR - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Load and validate environment. -# Sets: DEPLOY_ROOT, ENV_FILE, API_HOSTNAME. -# Exports all variables from .env into this process. -# Disable trace to prevent secrets from leaking into logs. 
-set +x -source "$SCRIPT_DIR/load-env.sh" -set -x - -DEPLOY_HISTORY="$DEPLOY_ROOT/.deploy_history" - -AUTO_MODE=false - -if [[ "${1:-}" == "--auto" ]]; then - AUTO_MODE=true -fi - -echo "=========================================" -echo "FieldTrack Rollback System" -echo "=========================================" - -# Check if deployment history exists and validate checksum -if [ ! -f "$DEPLOY_HISTORY" ]; then - echo "ERROR: No deployment history found." - echo "File not found: $DEPLOY_HISTORY" - exit 1 -fi - -# Validate deployment history file integrity -if [ ! -s "$DEPLOY_HISTORY" ]; then - echo "ERROR: Deployment history file is empty or corrupted." - exit 1 -fi - -mapfile -t HISTORY < "$DEPLOY_HISTORY" - -if [ ${#HISTORY[@]} -lt 2 ]; then - echo "ERROR: Need at least two deployments to rollback." - exit 1 -fi - -CURRENT_SHA="${HISTORY[0]}" -PREVIOUS_SHA="${HISTORY[1]}" - -echo "Current deployment : $CURRENT_SHA" -echo "Rollback target : $PREVIOUS_SHA" -echo "" - -# Validate that the rollback image exists in the registry -echo "Validating rollback image exists..." -if ! docker manifest inspect "ghcr.io/fieldtrack-tech/api:$PREVIOUS_SHA" >/dev/null 2>&1; then - echo "ERROR: Rollback image not found in registry." - echo "Image: ghcr.io/fieldtrack-tech/api:$PREVIOUS_SHA" - echo "Cannot proceed with rollback to non-existent image." - exit 1 -fi -echo "✓ Rollback image verified in registry." -echo "" - -if [ "$AUTO_MODE" = false ]; then - echo "⚠️ WARNING: This will replace the current deployment." - read -p "Continue with rollback? (yes/no): " -r - - if [[ ! $REPLY =~ ^[Yy][Ee][Ss]$ ]]; then - echo "Rollback cancelled." - exit 0 - fi -else - echo "Auto rollback mode enabled (CI)." -fi - -echo "" -echo "Starting rollback to: $PREVIOUS_SHA" -echo "" - -# Set guard to prevent infinite rollback loops -export API_ROLLBACK_IN_PROGRESS=1 - -# Attempt rollback deploy -if ! 
"$SCRIPT_DIR/deploy-bluegreen.sh" "$PREVIOUS_SHA"; then - echo "" - echo "=========================================" - echo "❌ CRITICAL: ROLLBACK FAILED" - echo "=========================================" - echo "Both deployment and rollback have failed." - echo "" - echo "SYSTEM STATE SNAPSHOT:" - echo " Active containers:" - docker ps --format ' {{.Names}} → {{.Status}} ({{.Ports}})' 2>/dev/null || echo " (docker ps failed)" - echo " Active slot file: $(cat "/var/run/api/active-slot" 2>/dev/null || echo 'MISSING')" - echo " Nginx config test: $(docker exec nginx nginx -t 2>&1)" - echo "" - echo "Target SHA: $PREVIOUS_SHA" - echo "" - echo "Action required:" - echo " 1. Check container status: docker ps -a" - echo " 2. Check nginx config: docker exec nginx nginx -t" - echo " 3. Review logs: docker logs api-blue api-green" - echo " 4. Manually restore last known good state" - echo "=========================================" - exit 2 -fi - -echo "" -echo "=========================================" -echo "Rollback completed successfully" -echo "Production now running: $PREVIOUS_SHA" -echo "=========================================" diff --git a/scripts/smoke-test.sh b/scripts/smoke-test.sh deleted file mode 100644 index 977cc0c..0000000 --- a/scripts/smoke-test.sh +++ /dev/null @@ -1,445 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -BASE_URL="${API_BASE_URL:-https://api.getfieldtrack.app}" -API="${BASE_URL}" - -EMP_EMAIL="${FT_EMP_EMAIL:-}" -EMP_PASSWORD="${FT_EMP_PASSWORD:-}" -ADMIN_EMAIL="${FT_ADMIN_EMAIL:-}" -ADMIN_PASSWORD="${FT_ADMIN_PASSWORD:-}" - -SUPABASE_URL="${SUPABASE_URL:-}" -SUPABASE_ANON="${SUPABASE_ANON_KEY:-}" - -PASS=0 -FAIL=0 -TMP_HEADERS=$(mktemp) -TMP_BODY=$(mktemp) - -# ---------------------------------------------------------------- -# Decode the payload section of a JWT (base64url → JSON string). 
-# Usage: decode_jwt_payload -# ---------------------------------------------------------------- -decode_jwt_payload() { - local token=$1 - local payload - payload=$(echo "$token" | cut -d'.' -f2) - # Restore standard base64 alphabet and add required padding - local mod=$(( ${#payload} % 4 )) - case $mod in - 2) payload="${payload}==" ;; - 3) payload="${payload}=" ;; - esac - echo "$payload" | tr '_-' '/+' | base64 -d 2>/dev/null -} - -# ---------------------------------------------------------------- -# Assert that a JWT contains the required hook-injected claims. -# Exits with code 1 if any required claim is missing. -# Usage: assert_hook_claims