Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
infra/tempo/data/
infra/prometheus/data/
infra/grafana/data/
# Rendered Alertmanager config (contains real webhook URL — VPS only)
infra/alertmanager/alertmanager.rendered.yml

# Deployment history (VPS-side file, never committed)
.deploy_history
Expand Down
10 changes: 8 additions & 2 deletions infra/.env.monitoring.example
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,13 @@ GRAFANA_ADMIN_PASSWORD=change-me-use-a-strong-password
# Generate: openssl rand -hex 32
METRICS_SCRAPE_TOKEN=change-me-generate-with-openssl-rand-hex-32

# ── Alertmanager Slack notification target
# Used by infra/alertmanager/alertmanager.yml.
# ── Alertmanager Slack notification target ────────────────────────────────────
# Used by infra/scripts/render-alertmanager.sh to render the Alertmanager config
# template before container start. Alertmanager does NOT support env vars natively.
#
# Generate from: Slack → Your App → Incoming Webhooks → Add New Webhook
# Must start with: https://hooks.slack.com/
#
# IMPORTANT: Do NOT add FRONTEND_DOMAIN here — it has been removed from the
# env contract. The render script will exit 1 if it detects that variable.
ALERTMANAGER_SLACK_WEBHOOK=YOUR_SLACK_INCOMING_WEBHOOK_URL
15 changes: 13 additions & 2 deletions infra/alertmanager/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
# Alertmanager route and receiver configuration for Slack-only alerting.
# No email or SMTP configurations are present.
#
# NOTE:
# This file is a TEMPLATE and MUST be rendered via envsubst before use.
# Alertmanager does NOT support environment variables natively.
# Render this file by running:
# bash infra/scripts/render-alertmanager.sh
# The rendered output is written to: infra/alertmanager/alertmanager.rendered.yml
# docker-compose mounts ONLY the rendered file — never this template directly.
#
# No email, SMTP, or PagerDuty configurations are present.

route:
receiver: ops-slack
receiver: ops-slack-warning
group_by: ["alertname", "severity"]
group_wait: 30s
group_interval: 5m
Expand All @@ -17,6 +26,7 @@ route:
receiver: ops-slack-warning

receivers:
# Critical alerts: dedicated Slack channel for immediate response
- name: ops-slack-critical
slack_configs:
- api_url: "${ALERTMANAGER_SLACK_WEBHOOK}"
Expand All @@ -29,6 +39,7 @@ receivers:
*Summary:* {{ .CommonAnnotations.summary }}
*Description:* {{ .CommonAnnotations.description }}

# Warning alerts: standard alerts channel
- name: ops-slack-warning
slack_configs:
- api_url: "${ALERTMANAGER_SLACK_WEBHOOK}"
Expand Down
141 changes: 49 additions & 92 deletions infra/docker-compose.monitoring.yml
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
# FieldTrack API — Monitoring Stack
#
# All services use the shared api_network.
# No public ports exposed — access via Nginx reverse proxy only.
#
# Usage: docker compose -f infra/docker-compose.monitoring.yml --env-file infra/.env.monitoring up -d

services:

loki:
image: grafana/loki:2.9.6
container_name: loki
restart: unless-stopped
# Internal only — Promtail pushes to loki:3100 on the shared network
expose:
- "3100"
volumes:
Expand All @@ -30,7 +22,7 @@ services:
max-size: "10m"
max-file: "3"
healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:3100/ready" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
interval: 30s
timeout: 5s
retries: 3
Expand All @@ -48,6 +40,9 @@ services:
command: -config.file=/etc/promtail/promtail.yml
networks:
- api_network
depends_on:
loki:
condition: service_healthy
deploy:
resources:
limits:
Expand All @@ -57,39 +52,39 @@ services:
options:
max-size: "10m"
max-file: "3"
depends_on:
loki:
condition: service_healthy

alertmanager:
image: prom/alertmanager:v0.27.0
container_name: alertmanager
restart: unless-stopped
# Loopback-only — accessed by Prometheus on the shared network
expose:
- "9093"
environment:
- ALERTMANAGER_SLACK_WEBHOOK=${ALERTMANAGER_SLACK_WEBHOOK}

volumes:
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
- ./alertmanager/alertmanager.rendered.yml:/etc/alertmanager/alertmanager.yml:ro
- alertmanager_data:/alertmanager

command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--storage.path=/alertmanager"
- "--web.listen-address=:9093"

networks:
- api_network

deploy:
resources:
limits:
memory: 128m

logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"

healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9093/-/healthy" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
Expand All @@ -99,40 +94,44 @@ services:
image: prom/prometheus:v2.52.0
container_name: prometheus
restart: unless-stopped
# Loopback-only — not reachable from the public internet
ports:
- "127.0.0.1:9090:9090"

environment:
# Passed through to prometheus.yml for scrape authentication and probe targets
- METRICS_SCRAPE_TOKEN=${METRICS_SCRAPE_TOKEN}
- API_HOSTNAME=${API_HOSTNAME}

volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
- prometheus_data:/prometheus

command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.retention.time=30d"
- "--storage.tsdb.retention.size=5GB"
- "--web.enable-lifecycle"
- "--enable-feature=exemplar-storage"
- "--web.enable-remote-write-receiver"

networks:
- api_network

depends_on:
alertmanager:
condition: service_healthy

deploy:
resources:
limits:
memory: 1g
memory: 512m

logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"

healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
Expand All @@ -142,110 +141,68 @@ services:
image: grafana/grafana:10.4.2
container_name: grafana
restart: unless-stopped
# Bind to loopback only — Nginx proxies /monitor to this port
ports:
- "127.0.0.1:3333:3000"
- "127.0.0.1:3001:3000"

environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
- GF_USERS_ALLOW_SIGN_UP=false
- GF_SERVER_ROOT_URL=https://${API_HOSTNAME}/monitor
- GF_SERVER_SERVE_FROM_SUB_PATH=true
- GF_SERVER_ROOT_URL=https://${API_HOSTNAME}/grafana

volumes:
- grafana_data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning:ro
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro

networks:
- api_network
deploy:
resources:
limits:
memory: 512m
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"

depends_on:
prometheus:
condition: service_healthy
healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health" ]
interval: 30s
timeout: 5s
retries: 3
start_period: 15s

blackbox:
image: prom/blackbox-exporter:v0.25.0
container_name: blackbox
restart: unless-stopped
expose:
- "9115"
volumes:
- ./blackbox/blackbox.yml:/etc/blackbox/blackbox.yml:ro
command:
- "--config.file=/etc/blackbox/blackbox.yml"
networks:
- api_network
deploy:
resources:
limits:
memory: 64m
memory: 256m

logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9115/-/healthy" ]
interval: 30s
timeout: 5s
retries: 3
start_period: 10s

node-exporter:
image: prom/node-exporter:v1.8.1
container_name: node-exporter
restart: unless-stopped
# No public port — internal only via api_network
expose:
- "9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro

command:
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- "--path.rootfs=/rootfs"
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
- "--path.rootfs=/host"

volumes:
- /:/host:ro,rslave

networks:
- api_network

deploy:
resources:
limits:
memory: 64m

logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
healthcheck:
test: [ "CMD", "wget", "--spider", "-q", "http://localhost:9100/metrics" ]
interval: 30s
timeout: 5s
retries: 3

volumes:
prometheus_data:
name: api_prometheus_data
loki_data:
name: api_loki_data
promtail_data:
name: api_promtail_data
grafana_data:
name: api_grafana_data
alertmanager_data:
name: api_alertmanager_data

networks:
api_network:
name: api_network
external: true

volumes:
prometheus_data:
alertmanager_data:
grafana_data:
loki_data:
promtail_data:
Loading
Loading