From 1aeddcee74dd28c1d1c1d154bcd0c7d11758f3e6 Mon Sep 17 00:00:00 2001 From: rajashish147 Date: Fri, 3 Apr 2026 15:09:07 +0530 Subject: [PATCH 1/2] feat(ci): enhance CodeQL workflows and add deep scan for post-merge analysis feat(deploy): implement API health gate and improve monitoring stack sync fix(vps-setup): create runtime state directories for blue-green deployment --- .github/workflows/codeql-deep.yml | 86 ++++++++ .github/workflows/codeql.yml | 30 +-- .github/workflows/deploy.yml | 157 +++++++++++++- scripts/monitoring-sync.sh | 336 ++++++++++++++++++++++++++++++ scripts/vps-setup.sh | 24 ++- 5 files changed, 613 insertions(+), 20 deletions(-) create mode 100644 .github/workflows/codeql-deep.yml create mode 100644 scripts/monitoring-sync.sh diff --git a/.github/workflows/codeql-deep.yml b/.github/workflows/codeql-deep.yml new file mode 100644 index 0000000..364cc2b --- /dev/null +++ b/.github/workflows/codeql-deep.yml @@ -0,0 +1,86 @@ +name: "CodeQL — Deep Scan (post-merge)" +# Runs after every merge to master AND on a weekly schedule. +# Uses the full security-and-quality query suite — significantly more thorough +# than the PR lightweight scan. +# +# DOES NOT block the Deploy pipeline. Both workflows trigger independently on +# a master push; deploy.yml never depends on this workflow. Results are +# uploaded to the GitHub Security tab for async review. +# +# If critical issues are found, the security team should open a tracking issue +# and gate the next deployment manually. This workflow itself never fails the +# deploy unless an operator explicitly adds it as a required check. + +on: + push: + branches: ["master"] + schedule: + # Every Monday at 03:15 UTC — offset from midnight to avoid GHA congestion. + - cron: "15 3 * * 1" + +# Do not cancel in-progress deep scans — let them complete for full coverage. 
+concurrency: + group: codeql-deep-${{ github.ref }} + cancel-in-progress: false + +permissions: + actions: read + contents: read + security-events: write + +jobs: + analyze-deep: + name: Deep Analyze (CodeQL) + runs-on: ubuntu-latest + timeout-minutes: 40 + + strategy: + fail-fast: false + matrix: + language: ["javascript"] + + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Setup Node.js (match production) + uses: actions/setup-node@v5 + with: + node-version: 24 + cache: npm + cache-dependency-path: package-lock.json + + - name: Install dependencies + run: npm ci + + - name: Build API + run: npm run build || true + + - name: Initialize CodeQL + uses: github/codeql-action/init@v4 + with: + languages: ${{ matrix.language }} + # Full suite: security + quality + style rules. + # Catches OWASP Top-10 plus code-quality issues that may hide security risks. + queries: security-and-quality + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v4 + with: + category: "codeql-deep" + # Upload unconditionally — results land in the Security tab regardless + # of whether any alerts are found. + upload: always + + - name: Write deep-scan summary + if: always() + run: | + { + echo "## CodeQL Deep Scan" + echo "| Field | Value |" + echo "|---|---|" + echo "| Commit | \`${{ github.sha }}\` |" + echo "| Ref | \`${{ github.ref }}\` |" + echo "| Query suite | \`security-and-quality\` |" + echo "| Results | [Security tab](/${{ github.repository }}/security/code-scanning) |" + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 36ebca9..888623f 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -1,15 +1,15 @@ -name: "CodeQL Security Scan" +name: "CodeQL — PR Scan (lightweight)" +# Runs on every PR to master. Fast feedback: security-extended queries only. 
+# The deep security-and-quality scan runs separately in codeql-deep.yml after +# a merge lands on master and does NOT block this pipeline. on: - push: - branches: ["master"] pull_request: branches: ["master"] - schedule: - - cron: "0 3 * * 1" +# Cancel in-flight scans for the same PR when new commits are pushed. concurrency: - group: codeql-${{ github.ref }} + group: codeql-pr-${{ github.event.pull_request.number }} cancel-in-progress: true permissions: @@ -21,7 +21,7 @@ jobs: analyze: name: Analyze (CodeQL) runs-on: ubuntu-latest - timeout-minutes: 25 + timeout-minutes: 15 strategy: fail-fast: false @@ -32,7 +32,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v5 - # ✅ Match production runtime - name: Setup Node.js (match production) uses: actions/setup-node@v5 with: @@ -40,21 +39,24 @@ jobs: cache: npm cache-dependency-path: package-lock.json - # ✅ Install ALL dependencies - name: Install dependencies run: npm ci - # ✅ Build API (critical for CodeQL flow analysis) + # Build so CodeQL can trace data flows through compiled output. - name: Build API run: npm run build || true - # ✅ Initialize CodeQL AFTER dependencies + # Initialize AFTER install + build so the database includes all sources. - name: Initialize CodeQL uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} - queries: security-and-quality + # security-extended: broader than the default security set but + # significantly faster than security-and-quality (no style/quality rules). + # Catches OWASP Top-10 class issues without slowing PR feedback. 
+          queries: security-extended
 
-      # ✅ Analyze
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v4
\ No newline at end of file
+        uses: github/codeql-action/analyze@v4
+        with:
+          category: "codeql-pr"
\ No newline at end of file
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 07458d9..627e35b 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -14,8 +14,8 @@
 #
 # Parallel stages:
 #   validate  ─┐
-#   test-api   ├─► build-scan-push ─► deploy ─► sync-infra ─► health-and-smoke
-#             ┘                         │
+#   test-api   ├─► build-scan-push ─► deploy ─► api-health-gate ─► sync-infra ─► sync-monitoring ─► health-and-smoke
+#             ┘                         │
 #                           rollback ◄────────────┘  (on failure)
 
 name: Deploy to Production
@@ -517,6 +517,21 @@ jobs:
           fi
           echo "✓ CORS_ORIGIN is set"
 
+      - name: Log deployment metadata and trigger info
+        run: |  # the commit message is read from the event payload file, never expanded into the script text
+          {
+            echo "## Deployment Initiated"
+            echo "| Field | Value |"
+            echo "|---|---|"
+            echo "| Commit SHA | \`${{ github.sha }}\` |"
+            echo "| Trigger event | ${{ github.event_name }} |"
+            echo "| Triggered by | ${{ github.actor }} |"
+            echo "| Branch | ${{ github.ref_name }} |"
+            echo "| Workflow run | [${{ github.run_id }}](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) |"
+            echo "| Commit message | \`$(jq -r '.head_commit.message // ""' "$GITHUB_EVENT_PATH")\` |"
+          } >> "$GITHUB_STEP_SUMMARY"
+          echo "📋 Deployment initiated — SHA=${{ github.sha }} EVENT=${{ github.event_name }} ACTOR=${{ github.actor }} RUN=${{ github.run_id }}"
+
       - name: Validate environment contract before deploy
         uses: appleboy/ssh-action@v1.0.3
         with:
@@ -581,6 +596,53 @@ jobs:
 
             echo "DEPLOY_STATE=$DEPLOY_STATUS| SLOT=$ACTIVE_SLOT | SHA=${{ github.sha }}"
 
+  # ---------------------------------------------------------------------------
+  # JOB: api-health-gate (Step E+)
+  #
+  # Early API health validation — runs AFTER deploy but BEFORE infra sync.
+  # Ensures the API container is truly healthy before we sync monitoring/nginx.
+ # If the API is not healthy at this point, STOP before touching infra. + # --------------------------------------------------------------------------- + api-health-gate: + name: API Health Gate + runs-on: ubuntu-latest + needs: [deploy] + timeout-minutes: 5 + steps: + - name: Verify API container is healthy before infra sync + uses: appleboy/ssh-action@v1.0.3 + with: + host: ${{ secrets.DO_HOST }} + username: ${{ secrets.DO_USER }} + key: ${{ secrets.DO_SSH_KEY }} + script: | + set -euo pipefail + export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" + [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } + cd "$DEPLOY_ROOT" + source scripts/load-env.sh + + # Determine active slot (blue/green) + ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "blue") + if [ "$ACTIVE_SLOT" = "green" ]; then BACKEND_PORT=3002; else BACKEND_PORT=3001; fi + + echo "=== API Health Gate (slot: $ACTIVE_SLOT, port: $BACKEND_PORT) ===" + + # Poll /ready endpoint (internal readiness probe) + for i in $(seq 1 15); do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://127.0.0.1:$BACKEND_PORT/ready" 2>/dev/null || echo "000") + if [ "$STATUS" = "200" ]; then + echo "✓ API ready on port $BACKEND_PORT (attempt $i)" + exit 0 + fi + echo " Attempt $i: HTTP $STATUS — waiting..." + sleep 2 + done + + echo "❌ API /ready did not return 200 after 30s — monitoring sync would fail anyway" + docker logs "api-$ACTIVE_SLOT" --tail 30 2>/dev/null || true + exit 1 + # --------------------------------------------------------------------------- # JOB: sync-infra # @@ -590,7 +652,7 @@ jobs: sync-infra: name: Sync Infrastructure (nginx) runs-on: ubuntu-latest - needs: [deploy] + needs: [api-health-gate] timeout-minutes: 10 steps: - name: Sync infrastructure configs via SSH @@ -641,8 +703,89 @@ jobs: sudo systemctl reload nginx echo "✓ Nginx reloaded." 
+            # ROUTING VALIDATION — Test actual traffic through Nginx
+            # Config syntax is valid (nginx -t) but routing may still be broken.
+            # Hit /health by hostname; --resolve pins $API_HOSTNAME:443 to 127.0.0.1 so SNI and Host match the vhost.
+            echo "=== Testing Nginx routing (localhost + Host header) ==="
+            sleep 2  # Give Nginx a moment to fully apply reload
+
+            ROUTE_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
+              --resolve "$API_HOSTNAME:443:127.0.0.1" \
+              --max-time 10 --insecure \
+              "https://$API_HOSTNAME/health" 2>/dev/null || echo "000")
+
+            if [ "$ROUTE_STATUS" = "200" ]; then
+              echo "✓ Nginx routing verified (HTTP $ROUTE_STATUS)"
+            else
+              echo "❌ Nginx routing broken (HTTP $ROUTE_STATUS expected 200) — restoring backup..."
+              sudo cp /tmp/api.conf.bak "$NGINX_LIVE"
+              sudo nginx -t 2>&1 && sudo systemctl reload nginx || true
+              exit 1
+            fi
+
             echo "✓ Infra sync completed in $(($(date +%s) - T0))s"
 
+  # ---------------------------------------------------------------------------
+  # JOB: sync-monitoring (Step F)
+  #
+  # Idempotent monitoring stack sync — runs after every deploy.
+  # Delegates to scripts/monitoring-sync.sh which:
+  #   - Self-heals missing .env.monitoring from example
+  #   - Creates api_network if absent
+  #   - Renders alertmanager.rendered.yml
+  #   - Runs docker compose up -d
+  #   - Validates prometheus / alertmanager / grafana health
+  # Monitoring is REQUIRED — deploy fails if any required container is unhealthy.
+  # ---------------------------------------------------------------------------
+  sync-monitoring:
+    name: Sync Monitoring Stack
+    runs-on: ubuntu-latest
+    needs: [sync-infra]
+    timeout-minutes: 15
+    steps:
+      - name: Sync and validate monitoring stack via SSH
+        uses: appleboy/ssh-action@v1.0.3
+        with:
+          host: ${{ secrets.DO_HOST }}
+          username: ${{ secrets.DO_USER }}
+          key: ${{ secrets.DO_SSH_KEY }}
+          script: |
+            set -euo pipefail
+            export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
+            [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
+            cd "$DEPLOY_ROOT"
+            chmod +x scripts/monitoring-sync.sh
+            ./scripts/monitoring-sync.sh
+
+      - name: Monitoring sync summary
+        if: always()
+        run: |
+          {
+            echo "## Monitoring Sync"
+            echo "| Container | Required |"
+            echo "|---|---|"
+            echo "| prometheus | ✅ |"
+            echo "| alertmanager | ✅ |"
+            echo "| grafana | ✅ |"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Deployment artifact traceability
+        if: always()
+        run: |  # the commit message is read from the event payload file, never expanded into the script text
+          {
+            echo "## Deployment Artifacts"
+            echo "| Field | Value |"
+            echo "|---|---|"
+            echo "| Deployment SHA | \`${{ github.sha }}\` |"
+            echo "| Image Tag | \`fieldtrack-api:${{ github.sha }}\` |"  # get-metadata is not in this job's needs, so only the full SHA is available here
+            echo "| Workflow Run | [\#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) |"
+            echo "| Triggered By | \`${{ github.event_name }}\` |"
+            echo "| Commit Message | \`$(jq -r '.head_commit.message // ""' "$GITHUB_EVENT_PATH")\` |"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+          # Also output to logs for audit trail
+          echo "DEPLOYMENT_COMPLETE: SHA=${{ github.sha }} IMAGE=ghcr.io/${{ github.repository_owner }}/api:${{ github.sha }} RUN=${{ github.run_id }}"
+
   # ---------------------------------------------------------------------------
   # JOB: health-and-smoke
   #
@@ -653,7 +796,7 @@ jobs:
   health-and-smoke:
     name: Health Checks & Smoke Tests
     runs-on: ubuntu-latest
-    needs: [sync-infra]
+    needs: [sync-infra, sync-monitoring]
     timeout-minutes: 15
steps: - name: Checkout @@ -767,13 +910,15 @@ jobs: rollback: name: Rollback Deployment (auto) runs-on: ubuntu-latest - needs: [deploy, sync-infra, health-and-smoke] + needs: [deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke] timeout-minutes: 10 if: | always() && ( needs.deploy.result == 'failure' || + needs.api-health-gate.result == 'failure' || needs.sync-infra.result == 'failure' || + needs.sync-monitoring.result == 'failure' || needs.health-and-smoke.result == 'failure' ) steps: @@ -781,7 +926,9 @@ jobs: run: | echo "ROLLBACK_TRIGGERED=TRUE | FAILED_JOBS:" [ "${{ needs.deploy.result }}" = "failure" ] && echo " - deploy" + [ "${{ needs.api-health-gate.result }}" = "failure" ] && echo " - api-health-gate" [ "${{ needs.sync-infra.result }}" = "failure" ] && echo " - sync-infra" + [ "${{ needs.sync-monitoring.result }}" = "failure" ] && echo " - sync-monitoring" [ "${{ needs.health-and-smoke.result }}" = "failure" ] && echo " - health-and-smoke" echo "SHA=${{ github.sha }}" diff --git a/scripts/monitoring-sync.sh b/scripts/monitoring-sync.sh new file mode 100644 index 0000000..26c5b62 --- /dev/null +++ b/scripts/monitoring-sync.sh @@ -0,0 +1,336 @@ +#!/usr/bin/env bash +# ============================================================================= +# monitoring-sync.sh — Self-Healing Monitoring Stack Sync +# +# Called by the CI sync-monitoring job after every production deploy. +# +# Responsibilities: +# 1. SELF-HEAL — create missing .env.monitoring from example if absent +# 2. BOOTSTRAP — detect placeholder values and warn (cold-start mode) +# 3. ENSURE NETWORK — create api_network if it does not exist +# 4. SYNC — idempotent `docker compose up -d` (starts if down, no-ops if healthy) +# 5. VALIDATE — confirm prometheus / grafana / alertmanager are running + healthy +# 6. 
ENFORCE — exit 1 if any required container is not healthy after timeout +# +# Self-healing rules (safe defaults): +# - .env.monitoring missing → copy from infra/.env.monitoring.example + warn +# - .env.monitoring has placeholders (change-me) → skip health wait, warn operator +# - api_network missing → create it +# - alertmanager rendered config missing → render it +# +# Timeouts: +# - Per-container health check: 60 seconds max (20 attempts × 3 s) +# - Polling interval: 3 seconds +# - Total wait tracked to prevent cascading timeouts +# +# Exit codes: +# 0 All required monitoring containers are healthy +# 1 One or more required containers failed to become healthy (deploy must fail) +# +# Required env (exported by load-env.sh / present in DEPLOY_ROOT): +# DEPLOY_ROOT — absolute path to the repository root on the VPS +# ============================================================================= +set -euo pipefail +trap '_ft_mon_trap "$LINENO"' ERR + +# ───────────────────────────────────────────────────────────────────────── +# STATE CLASSIFICATION +# ───────────────────────────────────────────────────────────────────────── +DEPLOY_STATE="SUCCESS" +trap '[ $? -ne 0 ] && DEPLOY_STATE="FAILED" || true' EXIT + +# --------------------------------------------------------------------------- +# LOGGING +# --------------------------------------------------------------------------- +_FT_MON_LOG_FILE="${DEPLOY_LOG_FILE:-/var/log/api/deploy.log}" +_LOG_DIR="$(dirname "$_FT_MON_LOG_FILE")" +if ! 
mkdir -p "$_LOG_DIR" 2>/dev/null; then + _LOG_DIR="$HOME/api/logs" + _FT_MON_LOG_FILE="$_LOG_DIR/deploy.log" + mkdir -p "$_LOG_DIR" +fi + +_log() { + printf '[MON-SYNC] ts=%s %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$*" \ + | tee -a "$_FT_MON_LOG_FILE" >&2 +} + +_ft_mon_trap() { + printf '[MON-SYNC] ts=%s level=ERROR msg="unexpected failure at line %s"\n' \ + "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$1" >&2 +} + +# --------------------------------------------------------------------------- +# RESOLVE PATHS +# --------------------------------------------------------------------------- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" + +if [ ! -d "$DEPLOY_ROOT" ]; then + _log "level=ERROR msg='DEPLOY_ROOT not found' path=$DEPLOY_ROOT" + exit 1 +fi + +INFRA_DIR="$DEPLOY_ROOT/infra" +MON_ENV="$INFRA_DIR/.env.monitoring" +MON_ENV_EXAMPLE="$INFRA_DIR/.env.monitoring.example" +MON_COMPOSE="$INFRA_DIR/docker-compose.monitoring.yml" +ALERTMANAGER_RENDERED="$INFRA_DIR/alertmanager/alertmanager.rendered.yml" +RENDER_SCRIPT="$INFRA_DIR/scripts/render-alertmanager.sh" + +_log "msg='monitoring-sync started' deploy_root=$DEPLOY_ROOT state=$DEPLOY_STATE" + +# --------------------------------------------------------------------------- +# STEP 1 — SELF-HEAL: .env.monitoring +# Create from example if missing instead of failing hard. +# The user MUST still fill in real values after first-time creation. +# --------------------------------------------------------------------------- +BOOTSTRAP_MODE=false +if [ ! 
-f "$MON_ENV" ]; then + if [ -f "$MON_ENV_EXAMPLE" ]; then + cp "$MON_ENV_EXAMPLE" "$MON_ENV" + chmod 600 "$MON_ENV" + BOOTSTRAP_MODE=true + _log "level=WARN msg='monitoring env file missing — created from example' path=$MON_ENV" + _log "level=WARN msg='ACTION REQUIRED: edit $MON_ENV with real GRAFANA_ADMIN_PASSWORD, METRICS_SCRAPE_TOKEN, ALERTMANAGER_SLACK_WEBHOOK'" + else + _log "level=ERROR msg='monitoring env file and example both missing' path=$MON_ENV" + DEPLOY_STATE="FAILED" + exit 1 + fi +else + chmod 600 "$MON_ENV" + _log "msg='monitoring env file exists' path=$MON_ENV" +fi + +# ───────────────────────────────────────────────────────────────────────── +# STEP 1B — BOOTSTRAP MODE: Detect placeholders +# If .env.monitoring contains default 'change-me' values, we're in cold-start. +# Skip health polling to avoid timeout on misconfigured system. +# ───────────────────────────────────────────────────────────────────────── +if grep -q "change-me" "$MON_ENV" 2>/dev/null; then + BOOTSTRAP_MODE=true + _log "level=WARN msg='bootstrap mode detected: .env.monitoring contains placeholder values' action='skipping health check'" + _log "level=WARN msg='OPERATOR ACTION: edit infra/.env.monitoring and set real values, then re-run deploy'" +fi + +# --------------------------------------------------------------------------- +# STEP 2 — SELF-HEAL: Docker network api_network +# --------------------------------------------------------------------------- +if ! docker network ls --format '{{.Name}}' | grep -Eq '^api_network$'; then + _log "msg='api_network missing — creating' driver=bridge" + docker network create --driver bridge api_network + _log "msg='api_network created'" +else + _log "msg='api_network exists'" +fi + +# --------------------------------------------------------------------------- +# STEP 3 — SELF-HEAL: Render alertmanager config +# render-alertmanager.sh is idempotent; always safe to run. 
+# --------------------------------------------------------------------------- +if [ -x "$RENDER_SCRIPT" ]; then + _log "msg='rendering alertmanager config'" + bash "$RENDER_SCRIPT" + _log "msg='alertmanager config rendered' file=$ALERTMANAGER_RENDERED" +elif [ ! -f "$ALERTMANAGER_RENDERED" ]; then + _log "level=ERROR msg='render-alertmanager.sh not found AND rendered config missing' script=$RENDER_SCRIPT" + exit 1 +else + _log "level=WARN msg='render-alertmanager.sh not found but rendered config exists — continuing' script=$RENDER_SCRIPT" +fi + +# --------------------------------------------------------------------------- +# STEP 4 — SYNC: docker compose up -d (idempotent) +# Creates containers that are missing; leaves healthy containers untouched. +# --------------------------------------------------------------------------- +_log "msg='starting monitoring stack (idempotent)'" +cd "$INFRA_DIR" +docker compose --env-file .env.monitoring -f docker-compose.monitoring.yml up -d --remove-orphans +cd "$DEPLOY_ROOT" +_log "msg='docker compose up -d complete'" + +# --------------------------------------------------------------------------- +# STEP 5 — VALIDATE: wait for required containers to become healthy +# +# Required containers (must be healthy for deploy to succeed): +# prometheus — metrics collection (health: http://localhost:9090/-/healthy) +# alertmanager — alert routing (health: http://localhost:9093/-/healthy) +# grafana — dashboards (health: http://localhost:3001/api/health) +# +# Strategy: poll docker inspect for Health.Status. +# Times out at 60 s per container (20 attempts × 3 s). 
+# ---------------------------------------------------------------------------
+
+_wait_container_healthy() {
+  local name="$1"                # container name exactly as docker inspect knows it
+  local max_wait_sec="${2:-60}"  # total polling budget in seconds
+  local interval="${3:-3}"       # seconds to sleep between polls
+
+  _log "msg='waiting for container health' container=$name max_wait_sec=$max_wait_sec interval=$interval"
+
+  local waited=0
+  while [ $waited -lt $max_wait_sec ]; do
+    # Look the container up by exact name with docker inspect (no grep over docker ps).
+    # A container that cannot be inspected yet is retried until the timeout expires.
+    if ! docker inspect "$name" >/dev/null 2>&1; then
+      _log "level=WARN msg='container does not exist or inspect failed' container=$name waited_sec=$waited"
+      sleep "$interval"
+      waited=$((waited + interval))
+      continue
+    fi
+
+    # Container exists — check health status
+    local health_status
+    health_status=$(docker inspect --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}no-healthcheck{{end}}' "$name" 2>/dev/null || echo "inspect-failed")
+
+    case "$health_status" in
+      healthy)
+        _log "msg='container healthy' container=$name waited_sec=$waited"
+        return 0
+        ;;
+      no-healthcheck)
+        # Container has no Docker healthcheck — verify it is at least running.
+        local running
+        running=$(docker inspect --format='{{.State.Running}}' "$name" 2>/dev/null || echo "false")
+        if [ "$running" = "true" ]; then
+          _log "msg='container running (no healthcheck configured)' container=$name"
+          return 0
+        fi
+        ;;
+      starting)
+        _log "msg='container starting' container=$name waited_sec=$waited/$max_wait_sec"
+        ;;
+      unhealthy)
+        _log "level=WARN msg='container unhealthy' container=$name waited_sec=$waited/$max_wait_sec"
+        ;;
+      inspect-failed)
+        _log "level=WARN msg='docker inspect failed' container=$name waited_sec=$waited"
+        ;;
+      *)
+        _log "level=WARN msg='unknown health status' container=$name status=$health_status waited_sec=$waited"
+        ;;
+    esac
+
+    sleep "$interval"
+    waited=$((waited + interval))
+  done
+
+  _log "level=ERROR msg='container did not become healthy within timeout' container=$name max_wait_sec=$max_wait_sec"
+  docker logs --tail 30 "$name" >&2 || true  # send both container streams to stderr; a later 2>/dev/null would drop the stderr stream
+  return 1
+}
+
+_check_endpoint() {
+  local name="$1"  # label used in log lines only
+  local url="$2"   # URL that must answer HTTP 200 within 5 s
+
+  local status
+  status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url" 2>/dev/null) || status="000"  # curl already prints 000 on failure; assign, do not append
+
+  if [ "$status" = "200" ]; then
+    _log "msg='endpoint healthy' container=$name url=$url status=200"
+    return 0
+  else
+    _log "level=ERROR msg='endpoint unhealthy' container=$name url=$url status=$status"
+    return 1
+  fi
+}
+
+# ─────────────────────────────────────────────────────────────────────────
+# SKIP HEALTH CHECKS IN BOOTSTRAP MODE
+# ─────────────────────────────────────────────────────────────────────────
+if [ "$BOOTSTRAP_MODE" = "true" ]; then
+  DEPLOY_STATE="BOOTSTRAP"
+  _log "level=WARN msg='bootstrap mode detected — skipping container health checks' state=$DEPLOY_STATE"
+  _log "level=WARN msg='ACTION: configure infra/.env.monitoring with real values and re-run deploy to enable monitoring'"
+  exit 0
+fi
+
+# ─────────────────────────────────────────────────────────────────────────
+# ENFORCE: Container name validation + health checks
+# ─────────────────────────────────────────────────────────────────────────
+# Exact container name enforcement: fail fast if any required container is missing
+REQUIRED_CONTAINERS=("prometheus" "alertmanager" "grafana")
+for c in "${REQUIRED_CONTAINERS[@]}"; do
+  if ! docker inspect "$c" >/dev/null 2>&1; then
+    _log "level=ERROR msg='required container missing' container=$c"
+    DEPLOY_STATE="FAILED"
+    docker ps --format 'table {{.Names}}\t{{.Status}}' >&2 || true  # table goes to stderr; "2>/dev/null >&2" would send it to /dev/null
+    exit 1
+  fi
+done
+
+MONITORING_ERRORS=0
+
+# ── Prometheus ──────────────────────────────────────────────────────────────
+if _wait_container_healthy "prometheus" 60 3; then
+  _check_endpoint "prometheus" "http://localhost:9090/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+else
+  MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+fi
+
+# ── Alertmanager ─────────────────────────────────────────────────────────────
+if _wait_container_healthy "alertmanager" 60 3; then
+  _check_endpoint "alertmanager" "http://localhost:9093/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+else
+  MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+fi
+
+# ── Grafana ──────────────────────────────────────────────────────────────────
+# Grafana may take longer to start; allow 60s timeout.
+if _wait_container_healthy "grafana" 60 3; then
+  # Grafana health endpoint returns 200 with JSON when ready.
+  _check_endpoint "grafana" "http://localhost:3001/api/health" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+else
+  MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+fi
+
+# ---------------------------------------------------------------------------
+# STABILITY WINDOW — Verify containers remain healthy after initial pass
+# This catches "flaky startup" where containers pass health check but crash
+# immediately after. Wait settle window then re-verify all containers.
+# ---------------------------------------------------------------------------
+_log "msg='entering stability window (5s settle + re-check)'"
+sleep 5  # settle time before every required container is inspected again
+
+for c in "${REQUIRED_CONTAINERS[@]}"; do
+  STABLE_STATUS=$(docker inspect --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}' "$c" 2>/dev/null || echo "inspect-failed")  # no healthcheck: report the real container state, not a constant
+  if [ "$STABLE_STATUS" != "healthy" ] && [ "$STABLE_STATUS" != "running" ]; then
+    _log "level=ERROR msg='container became unhealthy during stability window' container=$c status=$STABLE_STATUS"
+    MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+  fi
+done
+
+# ---------------------------------------------------------------------------
+# PROMETHEUS SCRAPING VALIDATION — Ensure Prometheus is actually working
+# A healthy Prometheus container is useless if it's not scraping targets.
+# Query the Prometheus API to verify targets are UP.
+# ---------------------------------------------------------------------------
+_log "msg='validating prometheus scraping targets'"
+PROM_TARGETS=$(curl -s "http://localhost:9090/api/v1/targets" 2>/dev/null || echo "")
+
+if [ -z "$PROM_TARGETS" ]; then
+  _log "level=WARN msg='prometheus API query failed — cannot validate scraping (proceeding with caution)'"
+elif ! echo "$PROM_TARGETS" | grep -q '"health":"up"' 2>/dev/null; then
+  _log "level=ERROR msg='prometheus has no healthy scrape targets' curl_response=${PROM_TARGETS:0:200}"
+  MONITORING_ERRORS=$((MONITORING_ERRORS + 1))
+else
+  # Count targets whose health field is "up" in the API response
+  ACTIVE_TARGETS=$(echo "$PROM_TARGETS" | grep -o '"health":"up"' | wc -l)
+  _log "msg='prometheus scraping targets' active_count=$ACTIVE_TARGETS"
+fi
+
+# ---------------------------------------------------------------------------
+# FINAL ENFORCEMENT
+# ---------------------------------------------------------------------------
+if [ "$MONITORING_ERRORS" -gt 0 ]; then
+  _log "level=ERROR msg='monitoring validation failed' errors=$MONITORING_ERRORS state=$DEPLOY_STATE"
+  _log "level=ERROR msg='container state at failure:'"
+  docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Ports}}' >&2 || true  # table goes to stderr; "2>/dev/null >&2" would send it to /dev/null
+  DEPLOY_STATE="FAILED"
+  exit 1
+fi
+
+_log "msg='monitoring-sync complete' state=$DEPLOY_STATE containers=healthy required=3"
+exit 0
diff --git a/scripts/vps-setup.sh b/scripts/vps-setup.sh
index 9fd7ea7..679e481 100644
--- a/scripts/vps-setup.sh
+++ b/scripts/vps-setup.sh
@@ -278,6 +278,15 @@ else
     log "Docker network '$NETWORK' created."
 fi
 
+# Create runtime state directory for blue-green slot tracking.
+# /var/run is tmpfs (cleared on reboot); _ft_ensure_slot_dir recreates it on
+# each deploy, but creating it here avoids a first-boot race condition.
+log "Phase 9b: Creating runtime state directories..."
+install -d -m 750 -o "$DEPLOY_USER" -g "$DEPLOY_USER" /var/run/api 2>/dev/null || true
+install -d -m 755 /var/log/api 2>/dev/null || true
+chown "$DEPLOY_USER:$DEPLOY_USER" /var/log/api 2>/dev/null || true
+log "Runtime state directories ready (/var/run/api, /var/log/api)."
+ # ============================================================================ # PHASE 10: Nginx Installation & Configuration # ============================================================================ @@ -353,13 +362,26 @@ else fi MONITORING_ENV_FILE="$REPO_DIR/infra/.env.monitoring" +MONITORING_ENV_EXAMPLE="$REPO_DIR/infra/.env.monitoring.example" if [ -f "$MONITORING_ENV_FILE" ]; then chmod 600 "$MONITORING_ENV_FILE" chown "$DEPLOY_USER:$DEPLOY_USER" "$MONITORING_ENV_FILE" warn "Monitoring env file detected. Verify its values: $MONITORING_ENV_FILE" +elif [ -f "$MONITORING_ENV_EXAMPLE" ]; then + # Self-heal: create from example so subsequent deploy scripts do not fail. + # The operator MUST fill in real values before monitoring is functional. + cp "$MONITORING_ENV_EXAMPLE" "$MONITORING_ENV_FILE" + chmod 600 "$MONITORING_ENV_FILE" + chown "$DEPLOY_USER:$DEPLOY_USER" "$MONITORING_ENV_FILE" + warn "infra/.env.monitoring created from example — ACTION REQUIRED:" + warn " Edit $MONITORING_ENV_FILE and set:" + warn " GRAFANA_ADMIN_PASSWORD — strong password (min 12 chars)" + warn " METRICS_SCRAPE_TOKEN — must match METRICS_SCRAPE_TOKEN in .env" + warn " ALERTMANAGER_SLACK_WEBHOOK — Slack incoming webhook URL" + warn " API_HOSTNAME — bare hostname (e.g. api.getfieldtrack.app)" else - err "Missing $MONITORING_ENV_FILE. Ensure infra/.env.monitoring exists in the repository." + err "infra/.env.monitoring and infra/.env.monitoring.example both missing. Cannot continue." 
 fi
 
 # ============================================================================
From 06c839496d6f973343f069f069171d4796ab048f Mon Sep 17 00:00:00 2001
From: rajashish147
Date: Fri, 3 Apr 2026 15:25:37 +0530
Subject: [PATCH 2/2] fix(monitoring): update health check URLs to use Docker service names

---
 scripts/monitoring-sync.sh | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/scripts/monitoring-sync.sh b/scripts/monitoring-sync.sh
index 26c5b62..dea17d6 100644
--- a/scripts/monitoring-sync.sh
+++ b/scripts/monitoring-sync.sh
@@ -154,12 +154,13 @@ _log "msg='docker compose up -d complete'"
 # STEP 5 — VALIDATE: wait for required containers to become healthy
 #
 # Required containers (must be healthy for deploy to succeed):
-#   prometheus   — metrics collection (health: http://localhost:9090/-/healthy)
-#   alertmanager — alert routing      (health: http://localhost:9093/-/healthy)
-#   grafana      — dashboards         (health: http://localhost:3001/api/health)
+#   prometheus   — metrics collection (health: http://prometheus:9090/-/healthy)
+#   alertmanager — alert routing      (health: http://alertmanager:9093/-/healthy)
+#   grafana      — dashboards         (health: http://grafana:3000/api/health)
 #
-# Strategy: poll docker inspect for Health.Status.
+# Strategy: poll docker inspect for Health.Status, then curl each HTTP health endpoint by service name.
 # Times out at 60 s per container (20 attempts × 3 s).
+# Note: Using service names (not localhost) because containers are in Docker network only. NOTE(review): confirm the shell running this script can resolve these names.
# --------------------------------------------------------------------------- _wait_container_healthy() { @@ -265,14 +266,14 @@ MONITORING_ERRORS=0 # ── Prometheus ────────────────────────────────────────────────────────────── if _wait_container_healthy "prometheus" 60 3; then - _check_endpoint "prometheus" "http://localhost:9090/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) + _check_endpoint "prometheus" "http://prometheus:9090/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) else MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) fi # ── Alertmanager ───────────────────────────────────────────────────────────── if _wait_container_healthy "alertmanager" 60 3; then - _check_endpoint "alertmanager" "http://localhost:9093/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) + _check_endpoint "alertmanager" "http://alertmanager:9093/-/healthy" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) else MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) fi @@ -281,7 +282,7 @@ fi # Grafana may take longer to start; allow 60s timeout. if _wait_container_healthy "grafana" 60 3; then # Grafana health endpoint returns 200 with JSON when ready. - _check_endpoint "grafana" "http://localhost:3001/api/health" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) + _check_endpoint "grafana" "http://grafana:3000/api/health" || MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) else MONITORING_ERRORS=$((MONITORING_ERRORS + 1)) fi @@ -308,7 +309,7 @@ done # Query the Prometheus API to verify targets are UP. # --------------------------------------------------------------------------- _log "msg='validating prometheus scraping targets'" -PROM_TARGETS=$(curl -s "http://localhost:9090/api/v1/targets" 2>/dev/null || echo "") +PROM_TARGETS=$(curl -s "http://prometheus:9090/api/v1/targets" 2>/dev/null || echo "") if [ -z "$PROM_TARGETS" ]; then _log "level=WARN msg='prometheus API query failed — cannot validate scraping (proceeding with caution)'"