diff --git a/.gitignore b/.gitignore index 9915558..789b129 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ infra/tempo/data/ infra/prometheus/data/ infra/grafana/data/ +# Rendered Alertmanager config (contains real webhook URL — VPS only) +infra/alertmanager/alertmanager.rendered.yml # Deployment history (VPS-side file, never committed) .deploy_history diff --git a/infra/.env.monitoring.example b/infra/.env.monitoring.example index 6d5cf34..711716d 100644 --- a/infra/.env.monitoring.example +++ b/infra/.env.monitoring.example @@ -49,7 +49,13 @@ GRAFANA_ADMIN_PASSWORD=change-me-use-a-strong-password # Generate: openssl rand -hex 32 METRICS_SCRAPE_TOKEN=change-me-generate-with-openssl-rand-hex-32 -# ── Alertmanager Slack notification target -# Used by infra/alertmanager/alertmanager.yml. +# ── Alertmanager Slack notification target ──────────────────────────────────── +# Used by infra/scripts/render-alertmanager.sh to render the Alertmanager config +# template before container start. Alertmanager does NOT expand env vars in its config file. +# # Generate from: Slack → Your App → Incoming Webhooks → Add New Webhook +# Must start with: https://hooks.slack.com/ (the render script rejects any other prefix) +# +# IMPORTANT: Do NOT add FRONTEND_DOMAIN here — it has been removed from the +# env contract. The render script will exit 1 if it detects that variable. ALERTMANAGER_SLACK_WEBHOOK=YOUR_SLACK_INCOMING_WEBHOOK_URL diff --git a/infra/alertmanager/alertmanager.yml b/infra/alertmanager/alertmanager.yml index 98b5c9f..db267a1 100644 --- a/infra/alertmanager/alertmanager.yml +++ b/infra/alertmanager/alertmanager.yml @@ -1,8 +1,17 @@ # Alertmanager route and receiver configuration for Slack-only alerting. -# No email or SMTP configurations are present. +# +# NOTE: +# This file is a TEMPLATE and MUST be rendered via envsubst before use. +# Alertmanager does NOT expand environment variables in its config file. 
+# Render this file by running: +# bash infra/scripts/render-alertmanager.sh +# The rendered output is written to: infra/alertmanager/alertmanager.rendered.yml +# docker-compose mounts ONLY the rendered file — never this template directly. +# +# No email, SMTP, or PagerDuty configurations are present. route: - receiver: ops-slack + receiver: ops-slack-warning group_by: ["alertname", "severity"] group_wait: 30s group_interval: 5m @@ -17,6 +26,7 @@ route: receiver: ops-slack-warning receivers: + # Critical alerts: receiver for immediate response (same webhook variable as ops-slack-warning) - name: ops-slack-critical slack_configs: - api_url: "${ALERTMANAGER_SLACK_WEBHOOK}" @@ -29,6 +39,7 @@ receivers: *Summary:* {{ .CommonAnnotations.summary }} *Description:* {{ .CommonAnnotations.description }} + # Warning alerts: standard alerts receiver; also the root route's default receiver - name: ops-slack-warning slack_configs: - api_url: "${ALERTMANAGER_SLACK_WEBHOOK}" diff --git a/infra/docker-compose.monitoring.yml b/infra/docker-compose.monitoring.yml index c025bd5..8f80053 100644 --- a/infra/docker-compose.monitoring.yml +++ b/infra/docker-compose.monitoring.yml @@ -1,17 +1,9 @@ -# FieldTrack API — Monitoring Stack -# -# All services use the shared api_network. -# No public ports exposed — access via Nginx reverse proxy only. 
-# -# Usage: docker compose -f infra/docker-compose.monitoring.yml --env-file infra/.env.monitoring up -d - services: loki: image: grafana/loki:2.9.6 container_name: loki restart: unless-stopped - # Internal only — Promtail pushes to loki:3100 on the shared network expose: - "3100" volumes: @@ -30,7 +22,7 @@ services: max-size: "10m" max-file: "3" healthcheck: - test: [ "CMD", "wget", "--spider", "-q", "http://localhost:3100/ready" ] + test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"] interval: 30s timeout: 5s retries: 3 @@ -48,6 +40,9 @@ services: command: -config.file=/etc/promtail/promtail.yml networks: - api_network + depends_on: + loki: + condition: service_healthy deploy: resources: limits: @@ -57,39 +52,39 @@ services: options: max-size: "10m" max-file: "3" - depends_on: - loki: - condition: service_healthy alertmanager: image: prom/alertmanager:v0.27.0 container_name: alertmanager restart: unless-stopped - # Loopback-only — accessed by Prometheus on the shared network expose: - "9093" - environment: - - ALERTMANAGER_SLACK_WEBHOOK=${ALERTMANAGER_SLACK_WEBHOOK} + volumes: - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro + - ./alertmanager/alertmanager.rendered.yml:/etc/alertmanager/alertmanager.yml:ro - alertmanager_data:/alertmanager + command: - "--config.file=/etc/alertmanager/alertmanager.yml" - "--storage.path=/alertmanager" - "--web.listen-address=:9093" + networks: - api_network + deploy: resources: limits: memory: 128m + logging: driver: json-file options: max-size: "10m" max-file: "3" + healthcheck: - test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9093/-/healthy" ] + test: ["CMD", "wget", "--spider", "-q", "http://localhost:9093/-/healthy"] interval: 30s timeout: 5s retries: 3 @@ -99,40 +94,44 @@ services: image: prom/prometheus:v2.52.0 container_name: prometheus restart: unless-stopped - # Loopback-only — not reachable from the public internet ports: - "127.0.0.1:9090:9090" + environment: - # 
Passed through to prometheus.yml for scrape authentication and probe targets - METRICS_SCRAPE_TOKEN=${METRICS_SCRAPE_TOKEN} - API_HOSTNAME=${API_HOSTNAME} + volumes: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro - prometheus_data:/prometheus + command: - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.retention.time=30d" - "--storage.tsdb.retention.size=5GB" - "--web.enable-lifecycle" - - "--enable-feature=exemplar-storage" - - "--web.enable-remote-write-receiver" + networks: - api_network + depends_on: alertmanager: condition: service_healthy + deploy: resources: limits: - memory: 1g + memory: 512m + logging: driver: json-file options: max-size: "10m" max-file: "3" + healthcheck: - test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy" ] + test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"] interval: 30s timeout: 5s retries: 3 @@ -142,110 +141,74 @@ services: image: grafana/grafana:10.4.2 container_name: grafana restart: unless-stopped - # Bind to loopback only — Nginx proxies /monitor to this port ports: - - "127.0.0.1:3333:3000" + - "127.0.0.1:3001:3000" + environment: - - GF_SECURITY_ADMIN_USER=admin - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD} - - GF_USERS_ALLOW_SIGN_UP=false - - GF_SERVER_ROOT_URL=https://${API_HOSTNAME}/monitor - - GF_SERVER_SERVE_FROM_SUB_PATH=true + - GF_SERVER_ROOT_URL=https://${API_HOSTNAME}/grafana + volumes: - grafana_data:/var/lib/grafana - - ./grafana/provisioning:/etc/grafana/provisioning:ro - - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: - api_network - deploy: - resources: - limits: - memory: 512m - logging: - driver: json-file - options: - max-size: "10m" - max-file: "3" + depends_on: prometheus: condition: service_healthy - healthcheck: - test: [ "CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health" ] - interval: 30s - timeout: 5s - retries: 3 - 
start_period: 15s - blackbox: - image: prom/blackbox-exporter:v0.25.0 - container_name: blackbox - restart: unless-stopped - expose: - - "9115" - volumes: - - ./blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro - command: - - "--config.file=/etc/blackbox/blackbox.yml" - networks: - - api_network deploy: resources: limits: - memory: 64m + memory: 256m + logging: driver: json-file options: max-size: "10m" max-file: "3" - healthcheck: - test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9115/-/healthy" ] - interval: 30s - timeout: 5s - retries: 3 - start_period: 10s node-exporter: image: prom/node-exporter:v1.8.1 container_name: node-exporter restart: unless-stopped - # No public port — internal only via api_network expose: - "9100" - volumes: - - /proc:/host/proc:ro - - /sys:/host/sys:ro - - /:/rootfs:ro + command: - - "--path.procfs=/host/proc" - - "--path.sysfs=/host/sys" - - "--path.rootfs=/rootfs" - - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)" + - "--path.rootfs=/host" + + volumes: + - /:/host:ro,rslave + networks: - api_network + + deploy: + resources: + limits: + memory: 64m + logging: driver: json-file options: max-size: "10m" max-file: "3" - healthcheck: - test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9100/metrics" ] - interval: 30s - timeout: 5s - retries: 3 - -volumes: - prometheus_data: - name: api_prometheus_data - loki_data: - name: api_loki_data - promtail_data: - name: api_promtail_data - grafana_data: - name: api_grafana_data - alertmanager_data: - name: api_alertmanager_data networks: api_network: - name: api_network external: true + +volumes: + # Explicit names keep the existing api_* volumes attached; without them Compose creates project-prefixed volumes and the stored data is orphaned. + prometheus_data: + name: api_prometheus_data + alertmanager_data: + name: api_alertmanager_data + grafana_data: + name: api_grafana_data + loki_data: + name: api_loki_data + promtail_data: + name: api_promtail_data diff --git a/infra/scripts/render-alertmanager.sh b/infra/scripts/render-alertmanager.sh new file mode 100644 index 0000000..952ac29 --- /dev/null +++ b/infra/scripts/render-alertmanager.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# 
============================================================================= +# infra/scripts/render-alertmanager.sh +# +# Renders infra/alertmanager/alertmanager.yml (template) into +# infra/alertmanager/alertmanager.rendered.yml by substituting +# ${ALERTMANAGER_SLACK_WEBHOOK} from infra/.env.monitoring. +# +# MUST be run before `docker compose up` for the monitoring stack. +# Alertmanager does NOT expand environment variables in its config file, so +# this script renders the config before container start (NOTE(review): slack_configs also accepts api_url_file — confirm whether that could replace rendering). +# +# Usage (from any directory): +# bash infra/scripts/render-alertmanager.sh +# +# Exit codes: +# 0 — rendered file written successfully +# 1 — validation failure (under `set -e` a failing tool such as envsubst exits with its own non-zero status) +# ============================================================================= +set -euo pipefail + +# --------------------------------------------------------------------------- +# Resolve absolute paths relative to this script's location. +# This makes the script safe to call from any working directory. +# --------------------------------------------------------------------------- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INFRA_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)" + +ENV_FILE="${INFRA_DIR}/.env.monitoring" +TEMPLATE_FILE="${INFRA_DIR}/alertmanager/alertmanager.yml" +OUTPUT_FILE="${INFRA_DIR}/alertmanager/alertmanager.rendered.yml" + +log_info() { printf '[render-alertmanager] INFO %s\n' "$*" >&2; } +log_error() { printf '[render-alertmanager] ERROR %s\n' "$*" >&2; } + +# --------------------------------------------------------------------------- +# Pre-flight: ensure required tools exist +# --------------------------------------------------------------------------- +if ! command -v envsubst &>/dev/null; then + log_error "envsubst not found. Install gettext (apt install gettext / yum install gettext)." 
+ exit 1 +fi + +# --------------------------------------------------------------------------- +# Validate env file +# --------------------------------------------------------------------------- +if [ ! -f "${ENV_FILE}" ]; then + log_error "Env file not found: ${ENV_FILE}" + log_error "This file must exist on the VPS and must NOT be committed to the repo." + exit 1 +fi + +# Load env file via `source` under `set -a` so every assignment is exported. +# The file is executed as shell: plain URLs (e.g. https://...) load as-is, but values with spaces or shell metacharacters must be quoted in the file. +# DO NOT replace this with `export $(grep ... | xargs)` — xargs splits on +# whitespace and breaks URLs, quoted strings, and any value with spaces. +set -a +# shellcheck source=/dev/null +source "${ENV_FILE}" +set +a + +# Fail fast (exit 1) if stale / removed variables are still present in the env file. +# FRONTEND_DOMAIN was removed from the env contract — its presence here is a +# sign the file is out of date and should be cleaned up on the VPS. The check reads the shell environment after sourcing, so a value inherited from the caller trips it too. +if [ -n "${FRONTEND_DOMAIN:-}" ]; then + log_error "FRONTEND_DOMAIN is set in ${ENV_FILE} but is no longer part of the env contract." + log_error "Remove that line from .env.monitoring on the VPS, then re-run this script." + exit 1 +fi + +# --------------------------------------------------------------------------- +# Validate ALERTMANAGER_SLACK_WEBHOOK +# --------------------------------------------------------------------------- +if [ -z "${ALERTMANAGER_SLACK_WEBHOOK:-}" ]; then + log_error "ALERTMANAGER_SLACK_WEBHOOK is not set or empty in ${ENV_FILE}." + exit 1 +fi + +case "${ALERTMANAGER_SLACK_WEBHOOK}" in + https://hooks.slack.com/*) + : # valid prefix + ;; + *) + log_error "ALERTMANAGER_SLACK_WEBHOOK does not start with 'https://hooks.slack.com/'." + log_error "Value prefix: $(printf '%s' "${ALERTMANAGER_SLACK_WEBHOOK}" | cut -c1-30)..." 
+ exit 1 + ;; +esac + +# --------------------------------------------------------------------------- +# Validate template file +# --------------------------------------------------------------------------- +if [ ! -f "${TEMPLATE_FILE}" ]; then + log_error "Template file not found: ${TEMPLATE_FILE}" + exit 1 +fi + +if ! grep -qF '${ALERTMANAGER_SLACK_WEBHOOK}' "${TEMPLATE_FILE}"; then + log_error "Template file does not contain '\${ALERTMANAGER_SLACK_WEBHOOK}' placeholder." + log_error "Check that ${TEMPLATE_FILE} is the correct template." + exit 1 +fi + +# --------------------------------------------------------------------------- +# Render: substitute ONLY ALERTMANAGER_SLACK_WEBHOOK. Without the explicit list, +# envsubst would also expand every other $NAME / ${NAME}, e.g. $variables in Go template actions. +# --------------------------------------------------------------------------- +log_info "Rendering ${TEMPLATE_FILE} -> ${OUTPUT_FILE}" + +envsubst '${ALERTMANAGER_SLACK_WEBHOOK}' \ + < "${TEMPLATE_FILE}" \ + > "${OUTPUT_FILE}" + +# --------------------------------------------------------------------------- +# Post-render sanity check: no unsubstituted placeholder must remain +# --------------------------------------------------------------------------- +if grep -qF '${ALERTMANAGER_SLACK_WEBHOOK}' "${OUTPUT_FILE}"; then + log_error "Rendered file still contains the unsubstituted placeholder. Aborting." + rm -f "${OUTPUT_FILE}" + exit 1 +fi + +# Reject the literal stub 'YOUR/WEBHOOK/URL'; a stub URL carrying the valid hooks.slack.com prefix would otherwise pass validation +if grep -qF 'YOUR/WEBHOOK/URL' "${OUTPUT_FILE}"; then + log_error "Rendered file contains placeholder stub URL. Check your .env.monitoring." + rm -f "${OUTPUT_FILE}" + exit 1 +fi + +# Print a redacted preview (first api_url line; everything after the second path segment following /services/ is masked) so operators can confirm injection. +WEBHOOK_PREVIEW=$(grep 'api_url' "${OUTPUT_FILE}" | head -1 | sed 's|\(https://hooks.slack.com/services/[^/]*/[^/]*/\).*|\1***|') +log_info "Webhook preview (redacted): ${WEBHOOK_PREVIEW}" +log_info "Success. 
Rendered file: ${OUTPUT_FILE}" diff --git a/scripts/deploy-bluegreen.sh b/scripts/deploy-bluegreen.sh index 7a35b51..a9278ce 100644 --- a/scripts/deploy-bluegreen.sh +++ b/scripts/deploy-bluegreen.sh @@ -459,6 +459,14 @@ fi _ft_log "msg='slot resolved' active=$ACTIVE active_port=$ACTIVE_PORT inactive=$INACTIVE inactive_port=$INACTIVE_PORT" +# --------------------------------------------------------------------------- +# INITIAL DEPLOYMENT DETECTION -- no api-blue / api-green container exists (running or stopped) +# --------------------------------------------------------------------------- +if ! docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then + _ft_log "msg='initial deployment detected — no existing containers'" + INITIAL_DEPLOY=true +fi + # --------------------------------------------------------------------------- # IDEMPOTENCY GUARD -- skip deploy if this exact SHA is already the active container # --------------------------------------------------------------------------- @@ -790,18 +798,22 @@ unset _STABLE # --------------------------------------------------------------------------- _ft_state "CLEANUP" "msg='validating active container exists before cleanup' name=$ACTIVE_NAME" -# ACTIVE CONTAINER GUARD -- prevent edge-case race corruption +# ACTIVE CONTAINER GUARD -- skip cleanup when the active container is not running (e.g., first deploy or crash) if ! 
docker ps --format '{{.Names}}' | grep -q "^$ACTIVE_NAME$"; then - _ft_log "level=ERROR msg='active container missing before cleanup -- cannot safely proceed (possible race condition or crash)' name=$ACTIVE_NAME" - _ft_snapshot - _ft_exit 3 "DEPLOY_FAILED_FATAL" "reason=active_container_missing_before_cleanup" + _ft_log "msg='active container missing — treating as first deploy, skipping cleanup' name=$ACTIVE_NAME" + SKIP_CLEANUP=true +else + _ft_log "msg='active container guard passed' name=$ACTIVE_NAME" fi -_ft_log "msg='active container guard passed' name=$ACTIVE_NAME" # Graceful shutdown: allow in-flight requests to drain before forcing removal. -docker stop --time 10 "$ACTIVE_NAME" 2>/dev/null || true -docker rm "$ACTIVE_NAME" || true -_ft_log "msg='previous container removed (graceful)' name=$ACTIVE_NAME" +if [ "${SKIP_CLEANUP:-false}" != "true" ]; then + docker stop --time 10 "$ACTIVE_NAME" 2>/dev/null || true + docker rm "$ACTIVE_NAME" || true + _ft_log "msg='previous container removed (graceful)' name=$ACTIVE_NAME" +else + _ft_log "msg='cleanup skipped (first deploy scenario or container already removed)'" +fi _ft_state "SUCCESS" "msg='deployment complete' container=$INACTIVE_NAME sha=$IMAGE_SHA slot=$INACTIVE port=$INACTIVE_PORT" @@ -925,6 +937,12 @@ fi mv "$DEPLOY_HISTORY_TMP" "$DEPLOY_HISTORY" _ft_log "msg='deploy history updated' sha=$IMAGE_SHA" +# Alertmanager config rendering: always render before monitoring stack operations. +# Alertmanager does NOT expand env vars in its config file; the rendered file must exist +# before docker compose up. Re-running overwrites the rendered file; the render script exits non-zero when validation fails (NOTE(review): confirm how this script reacts to that failure). +bash "$REPO_DIR/infra/scripts/render-alertmanager.sh" +_ft_log "msg='alertmanager config rendered' file=$REPO_DIR/infra/alertmanager/alertmanager.rendered.yml" + # Monitoring stack: restart only when infra configs have actually changed. # Hashes cover all infra config files EXCEPT the nginx template (re-rendered on # every deploy) to avoid spurious monitoring restarts.