Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
infra/tempo/data/
infra/prometheus/data/
infra/grafana/data/
# Rendered Alertmanager config (contains real webhook URL — VPS only)
infra/alertmanager/alertmanager.rendered.yml

# Deployment history (VPS-side file, never committed)
.deploy_history
Expand Down
10 changes: 8 additions & 2 deletions infra/.env.monitoring.example
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,13 @@ GRAFANA_ADMIN_PASSWORD=change-me-use-a-strong-password
# Generate: openssl rand -hex 32
METRICS_SCRAPE_TOKEN=change-me-generate-with-openssl-rand-hex-32

# ── Alertmanager Slack notification target
# Used by infra/alertmanager/alertmanager.yml.
# ── Alertmanager Slack notification target ────────────────────────────────────
# Used by infra/scripts/render-alertmanager.sh to render the Alertmanager config
# template before container start. Alertmanager does NOT support env vars natively.
#
# Generate from: Slack → Your App → Incoming Webhooks → Add New Webhook
# Must start with: https://hooks.slack.com/
#
# IMPORTANT: Do NOT add FRONTEND_DOMAIN here — it has been removed from the
# env contract. The render script will exit 1 if it detects that variable.
ALERTMANAGER_SLACK_WEBHOOK=YOUR_SLACK_INCOMING_WEBHOOK_URL
15 changes: 13 additions & 2 deletions infra/alertmanager/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
# Alertmanager route and receiver configuration for Slack-only alerting.
# No email or SMTP configurations are present.
#
# NOTE:
# This file is a TEMPLATE and MUST be rendered via envsubst before use.
# Alertmanager does NOT support environment variables natively.
# Render this file by running:
# bash infra/scripts/render-alertmanager.sh
# The rendered output is written to: infra/alertmanager/alertmanager.rendered.yml
# docker-compose mounts ONLY the rendered file — never this template directly.
#
# No email, SMTP, or PagerDuty configurations are present.

route:
receiver: ops-slack
receiver: ops-slack-warning
group_by: ["alertname", "severity"]
group_wait: 30s
group_interval: 5m
Expand All @@ -17,6 +26,7 @@ route:
receiver: ops-slack-warning

receivers:
# Critical alerts: dedicated Slack channel for immediate response
- name: ops-slack-critical
slack_configs:
- api_url: "${ALERTMANAGER_SLACK_WEBHOOK}"
Expand All @@ -29,6 +39,7 @@ receivers:
*Summary:* {{ .CommonAnnotations.summary }}
*Description:* {{ .CommonAnnotations.description }}

# Warning alerts: standard alerts channel
- name: ops-slack-warning
slack_configs:
- api_url: "${ALERTMANAGER_SLACK_WEBHOOK}"
Expand Down
141 changes: 49 additions & 92 deletions infra/docker-compose.monitoring.yml
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
# FieldTrack API — Monitoring Stack
#
# All services use the shared api_network.
# No public ports exposed — access via Nginx reverse proxy only.
#
# Usage: docker compose -f infra/docker-compose.monitoring.yml --env-file infra/.env.monitoring up -d

services:

loki:
image: grafana/loki:2.9.6
container_name: loki
restart: unless-stopped
# Internal only — Promtail pushes to loki:3100 on the shared network
expose:
- "3100"
volumes:
Expand All @@ -30,7 +22,7 @@ services:
max-size: "10m"
max-file: "3"
healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:3100/ready" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
interval: 30s
timeout: 5s
retries: 3
Expand All @@ -48,6 +40,9 @@ services:
command: -config.file=/etc/promtail/promtail.yml
networks:
- api_network
depends_on:
loki:
condition: service_healthy
deploy:
resources:
limits:
Expand All @@ -57,39 +52,39 @@ services:
options:
max-size: "10m"
max-file: "3"
depends_on:
loki:
condition: service_healthy

alertmanager:
image: prom/alertmanager:v0.27.0
container_name: alertmanager
restart: unless-stopped
# Loopback-only — accessed by Prometheus on the shared network
expose:
- "9093"
environment:
- ALERTMANAGER_SLACK_WEBHOOK=${ALERTMANAGER_SLACK_WEBHOOK}

volumes:
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
- ./alertmanager/alertmanager.rendered.yml:/etc/alertmanager/alertmanager.yml:ro
- alertmanager_data:/alertmanager

command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--storage.path=/alertmanager"
- "--web.listen-address=:9093"

networks:
- api_network

deploy:
resources:
limits:
memory: 128m

logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"

healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9093/-/healthy" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
Expand All @@ -99,40 +94,44 @@ services:
image: prom/prometheus:v2.52.0
container_name: prometheus
restart: unless-stopped
# Loopback-only — not reachable from the public internet
ports:
- "127.0.0.1:9090:9090"

environment:
# Passed through to prometheus.yml for scrape authentication and probe targets
- METRICS_SCRAPE_TOKEN=${METRICS_SCRAPE_TOKEN}
- API_HOSTNAME=${API_HOSTNAME}

volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
- prometheus_data:/prometheus

command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.retention.time=30d"
- "--storage.tsdb.retention.size=5GB"
- "--web.enable-lifecycle"
- "--enable-feature=exemplar-storage"
- "--web.enable-remote-write-receiver"

networks:
- api_network

depends_on:
alertmanager:
condition: service_healthy

deploy:
resources:
limits:
memory: 1g
memory: 512m

logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"

healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
Expand All @@ -142,110 +141,68 @@ services:
image: grafana/grafana:10.4.2
container_name: grafana
restart: unless-stopped
# Bind to loopback only — Nginx proxies /monitor to this port
ports:
- "127.0.0.1:3333:3000"
- "127.0.0.1:3001:3000"

environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
- GF_USERS_ALLOW_SIGN_UP=false
- GF_SERVER_ROOT_URL=https://${API_HOSTNAME}/monitor
- GF_SERVER_SERVE_FROM_SUB_PATH=true
- GF_SERVER_ROOT_URL=https://${API_HOSTNAME}/grafana

volumes:
- grafana_data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning:ro
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro

networks:
- api_network
deploy:
resources:
limits:
memory: 512m
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"

depends_on:
prometheus:
condition: service_healthy
healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health" ]
interval: 30s
timeout: 5s
retries: 3
start_period: 15s

blackbox:
image: prom/blackbox-exporter:v0.25.0
container_name: blackbox
restart: unless-stopped
expose:
- "9115"
volumes:
- ./blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro
command:
- "--config.file=/etc/blackbox/blackbox.yml"
networks:
- api_network
deploy:
resources:
limits:
memory: 64m
memory: 256m

logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9115/-/healthy" ]
interval: 30s
timeout: 5s
retries: 3
start_period: 10s

node-exporter:
image: prom/node-exporter:v1.8.1
container_name: node-exporter
restart: unless-stopped
# No public port — internal only via api_network
expose:
- "9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro

command:
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- "--path.rootfs=/rootfs"
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
- "--path.rootfs=/host"

volumes:
- /:/host:ro,rslave

networks:
- api_network

deploy:
resources:
limits:
memory: 64m

logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9100/metrics" ]
interval: 30s
timeout: 5s
retries: 3

volumes:
prometheus_data:
name: api_prometheus_data
loki_data:
name: api_loki_data
promtail_data:
name: api_promtail_data
grafana_data:
name: api_grafana_data
alertmanager_data:
name: api_alertmanager_data

networks:
api_network:
name: api_network
external: true

volumes:
prometheus_data:
alertmanager_data:
grafana_data:
loki_data:
promtail_data:
Loading
Loading