From 85486d91452aa6bca6d4de3fac04ae330c1bdfbf Mon Sep 17 00:00:00 2001 From: mulatta <67085791+mulatta@users.noreply.github.com> Date: Fri, 3 Jul 2026 15:20:00 +0900 Subject: [PATCH] gatus: persist status and tighten probes Gatus kept all endpoint state in memory, so every deploy reset uptime history and external endpoints disappeared until their next push. External checks also had no heartbeat, so stopped push timers could leave stale green state indefinitely. Persist results in SQLite, require external endpoint heartbeats, and let a single pushed endpoint aggregate multiple HTTP probes. Use that for services where both the app health endpoint and the browser-facing route need to pass. Collapse dashboard groups and set the default UI sort to group to make the status page easier to scan. --- docs/admin/monitoring.md | 8 ++- docs/guide/upterm.md | 2 +- modules/authentik/default.nix | 2 +- modules/buildbot/database.nix | 12 +--- modules/docling/default.nix | 7 ++- modules/gatus/check.nix | 81 ++++++++++++++++++++++----- modules/gatus/default.nix | 19 +++++-- modules/headscale/default.nix | 2 +- modules/monitoring/grafana.nix | 8 ++- modules/multievolve/default.nix | 10 +++- modules/nextcloud/default.nix | 3 +- modules/ntfy.nix | 2 +- modules/postgresql/default.nix | 2 +- modules/uptermd/default.nix | 9 +-- modules/vaultwarden/default.nix | 11 ---- modules/vaultwarden/reverse-proxy.nix | 13 ++++- 16 files changed, 130 insertions(+), 61 deletions(-) diff --git a/docs/admin/monitoring.md b/docs/admin/monitoring.md index f54a585..f9ce97c 100644 --- a/docs/admin/monitoring.md +++ b/docs/admin/monitoring.md @@ -57,6 +57,10 @@ flowchart LR ### Gatus (eta) -- Pull 방식: 외부 접근 가능한 서비스 (Authentik, Headscale, Upterm Web/Relay 등) -- Push 방식: 내부 서비스가 주기적으로 상태 보고 +- Pull 방식: eta에서 직접 접근 가능한 서비스 (Authentik, Headscale, Upterm 등) +- Push 방식: 내부 서비스가 로컬/사용자 경로를 확인한 뒤 상태 보고 +- 저장소: SQLite (`/var/lib/gatus/gatus.sqlite`)로 재시작 후 uptime 유지 +- External endpoint heartbeat: 15분 동안 push가 없으면 실패 처리 +- 그룹: `apps`, `ci`, `monitoring`, `platform` +- 기본 정렬: group 기준 - 알림: ntfy (`ntfy.sjanglab.org`, 토픽: `gatus`) diff --git a/docs/guide/upterm.md b/docs/guide/upterm.md index e5416aa..3124360 100644 --- a/docs/guide/upterm.md +++ b/docs/guide/upterm.md @@ -30,6 +30,6 @@ ssh :@upterm.sjanglab.org -p 2323 | 웹 안내 페이지 | `https://upterm.sjanglab.org` | | Relay endpoint | `ssh://upterm.sjanglab.org:2323` | | 인증 | 세션 호스트가 지정한 GitHub 사용자 allow-list | -| 모니터링 | Gatus `Upterm Web`, `Upterm Relay` | +| 모니터링 | Gatus `Upterm` | 세션은 임시 공유 용도입니다. 장기 접속이나 서버 작업은 [SSH 접속](../dev/ssh-access.md)을 사용하세요. diff --git a/modules/authentik/default.nix b/modules/authentik/default.nix index 5595bbc..10c375c 100644 --- a/modules/authentik/default.nix +++ b/modules/authentik/default.nix @@ -9,7 +9,7 @@ { name = "Authentik"; url = "https://auth.sjanglab.org"; - group = "auth"; + group = "platform"; } ]; diff --git a/modules/buildbot/database.nix b/modules/buildbot/database.nix index 2bea7a8..927b002 100644 --- a/modules/buildbot/database.nix +++ b/modules/buildbot/database.nix @@ -1,20 +1,10 @@ -# Nixbot PostgreSQL database health/backup helpers (deployed on psi) +# Nixbot database health/backup helpers (deployed on psi) { lib, pkgs, ... }: { - imports = [ ../gatus/check.nix ]; - - gatusCheck.push = [ - { - name = "Nixbot PostgreSQL"; - group = "ci"; - systemdService = "postgresql.service"; - } - ]; - # services.nixbot provisions the nixbot database and peer-authenticated user. services.postgresql = { enable = true; diff --git a/modules/docling/default.nix b/modules/docling/default.nix index 211b2a6..7c1c6ad 100644 --- a/modules/docling/default.nix +++ b/modules/docling/default.nix @@ -13,8 +13,11 @@ in gatusCheck.push = [ { name = "Docling"; - group = "ai"; - url = "http://127.0.0.1:${toString doclingPort}/health"; + group = "apps"; + checks = [ + { url = "http://127.0.0.1:${toString doclingPort}/health"; } + { url = "https://${domain}/health"; } + ]; } ]; diff --git a/modules/gatus/check.nix b/modules/gatus/check.nix index 574583f..481a96d 100644 --- a/modules/gatus/check.nix +++ b/modules/gatus/check.nix @@ -28,8 +28,25 @@ let in "${sanitize ep.group}_${sanitize ep.name}"; + mkUrlCheck = check: '' + check_url ${lib.escapeShellArg check.url} ${lib.escapeShellArg (toString check.expectedStatus)} + ''; + mkPushScript = ep: + let + checks = + if ep.checks != null then + ep.checks + else if ep.url != null then + [ + { + inherit (ep) url expectedStatus; + } + ] + else + [ ]; + in pkgs.writeShellScript "gatus-push-${mkKey ep}" ( if ep.systemdService != null then '' @@ -52,14 +69,24 @@ let else '' set -euo pipefail - status=$(${pkgs.curl}/bin/curl -sf --max-time 30 -o /dev/null -w "%{http_code}" "${ep.url}" 2>/dev/null) || true - if [ "$status" = "${toString ep.expectedStatus}" ]; then - success=true - error="" - else - success=false - error="expected ${toString ep.expectedStatus}, got $status" - fi + success=true + error="" + + check_url() { + local url=$1 + local expected=$2 + local status + status=$(${pkgs.curl}/bin/curl -sf --max-time 30 -o /dev/null -w "%{http_code}" "$url" 2>/dev/null) || true + if [ "$status" != "$expected" ]; then + success=false + if [ -n "$error" ]; then + error="$error; " + fi + error="''${error}$url expected $expected, got $status" + fi + } + + ${lib.concatMapStringsSep "" mkUrlCheck checks} ${pkgs.curl}/bin/curl -sf --max-time 10 \ -X POST \ -G \ @@ -89,6 +116,16 @@ let }; }; + urlCheckSubmodule = lib.types.submodule { + options = { + url = lib.mkOption { type = lib.types.str; }; + expectedStatus = lib.mkOption { + type = lib.types.int; + default = 200; + }; + }; + }; + pushSubmodule = lib.types.submodule { options = { name = lib.mkOption { type = lib.types.str; }; @@ -101,6 +138,11 @@ let type = lib.types.int; default = 200; }; + checks = lib.mkOption { + type = lib.types.nullOr (lib.types.listOf urlCheckSubmodule); + default = null; + description = "HTTP checks that must all pass before pushing success"; + }; systemdService = lib.mkOption { type = lib.types.nullOr lib.types.str; default = null; @@ -129,11 +171,24 @@ in # Push: systemd timers on the declaring host config = lib.mkIf (cfg.push != [ ]) { - # Validate: each push entry must have exactly one of url or systemdService - assertions = map (ep: { - assertion = (ep.url != null) != (ep.systemdService != null); - message = "gatusCheck.push '${ep.name}': exactly one of 'url' or 'systemdService' must be set"; - }) cfg.push; + # Validate: each push entry must have exactly one check source. + assertions = lib.concatMap (ep: [ + { + assertion = + builtins.length ( + lib.filter (x: x) [ + (ep.url != null) + (ep.checks != null) + (ep.systemdService != null) + ] + ) == 1; + message = "gatusCheck.push '${ep.name}': exactly one of 'url', 'checks', or 'systemdService' must be set"; + } + { + assertion = ep.checks == null || ep.checks != [ ]; + message = "gatusCheck.push '${ep.name}': checks must not be empty"; + } + ]) cfg.push; sops.secrets.gatus-push-token = { sopsFile = ./secrets.yaml; diff --git a/modules/gatus/default.nix b/modules/gatus/default.nix index 5591a2e..f0fda24 100644 --- a/modules/gatus/default.nix +++ b/modules/gatus/default.nix @@ -18,6 +18,16 @@ in }; metrics = true; + storage = { + type = "sqlite"; + path = "/var/lib/gatus/gatus.sqlite"; + caching = true; + maximum-number-of-results = 720; + maximum-number-of-events = 200; + }; + + ui.default-sort-by = "group"; + alerting.ntfy = { topic = "gatus"; url = "https://ntfy.sjanglab.org"; @@ -49,23 +59,24 @@ in mkExtEndpoint = name: group: { inherit name group; token = "\${GATUS_EXTERNAL_TOKEN}"; + heartbeat.interval = "15m"; alerts = [ { type = "ntfy"; } ]; }; in [ # psi (mkExtEndpoint "Nixbot" "ci") - (mkExtEndpoint "Nixbot PostgreSQL" "ci") - (mkExtEndpoint "Docling" "ai") - (mkExtEndpoint "MULTI-evolve" "ai") + (mkExtEndpoint "Docling" "apps") + (mkExtEndpoint "MULTI-evolve" "apps") # tau (mkExtEndpoint "Nextcloud" "apps") (mkExtEndpoint "n8n" "apps") + (mkExtEndpoint "Vaultwarden" "apps") # rho (mkExtEndpoint "Grafana" "monitoring") (mkExtEndpoint "Prometheus" "monitoring") (mkExtEndpoint "Loki" "monitoring") - (mkExtEndpoint "PostgreSQL" "db") + (mkExtEndpoint "PostgreSQL" "platform") ]; }; }; diff --git a/modules/headscale/default.nix b/modules/headscale/default.nix index 9c5464c..4fc066f 100644 --- a/modules/headscale/default.nix +++ b/modules/headscale/default.nix @@ -10,7 +10,7 @@ { name = "Headscale"; url = "https://hs.sjanglab.org/health"; - group = "auth"; + group = "platform"; } ]; diff --git a/modules/monitoring/grafana.nix b/modules/monitoring/grafana.nix index a70f16c..46263fa 100644 --- a/modules/monitoring/grafana.nix +++ b/modules/monitoring/grafana.nix @@ -16,7 +16,13 @@ in { name = "Grafana"; group = "monitoring"; - url = "http://${wgAdminAddr}:3000/api/health"; + checks = [ + { url = "http://${wgAdminAddr}:3000/api/health"; } + { + url = "https://logging.sjanglab.org/"; + expectedStatus = 302; + } + ]; } ]; diff --git a/modules/multievolve/default.nix b/modules/multievolve/default.nix index 7b434aa..71577d8 100644 --- a/modules/multievolve/default.nix +++ b/modules/multievolve/default.nix @@ -22,8 +22,14 @@ in gatusCheck.push = [ { name = "MULTI-evolve"; - group = "ai"; - url = "http://127.0.0.1:${toString port}/_stcore/health"; + group = "apps"; + checks = [ + { url = "http://127.0.0.1:${toString port}/_stcore/health"; } + { + url = "https://${domain}/"; + expectedStatus = 302; + } + ]; } ]; diff --git a/modules/nextcloud/default.nix b/modules/nextcloud/default.nix index e182083..1646c31 100644 --- a/modules/nextcloud/default.nix +++ b/modules/nextcloud/default.nix @@ -21,8 +21,7 @@ in { name = "Nextcloud"; group = "apps"; - url = "http://127.0.0.1:80/status.php"; - expectedStatus = 301; + url = "https://cloud.sjanglab.org/status.php"; } ]; diff --git a/modules/ntfy.nix b/modules/ntfy.nix index 397a8d8..9e0ca45 100644 --- a/modules/ntfy.nix +++ b/modules/ntfy.nix @@ -14,7 +14,7 @@ { name = "ntfy"; url = "https://ntfy.sjanglab.org/v1/health"; - group = "infra"; + group = "platform"; } ]; diff --git a/modules/postgresql/default.nix b/modules/postgresql/default.nix index f908d52..3f9c2b3 100644 --- a/modules/postgresql/default.nix +++ b/modules/postgresql/default.nix @@ -13,7 +13,7 @@ in gatusCheck.push = [ { name = "PostgreSQL"; - group = "db"; + group = "platform"; systemdService = "postgresql.service"; } ]; diff --git a/modules/uptermd/default.nix b/modules/uptermd/default.nix index cc15215..cd82151 100644 --- a/modules/uptermd/default.nix +++ b/modules/uptermd/default.nix @@ -35,14 +35,9 @@ in gatusCheck.pull = [ { - name = "Upterm Web"; - url = "https://${domain}"; - group = "dev"; - } - { - name = "Upterm Relay"; + name = "Upterm"; url = "tcp://${domain}:${toString port}"; - group = "dev"; + group = "platform"; conditions = [ "[CONNECTED] == true" ]; } ]; diff --git a/modules/vaultwarden/default.nix b/modules/vaultwarden/default.nix index 8213b6e..ede560c 100644 --- a/modules/vaultwarden/default.nix +++ b/modules/vaultwarden/default.nix @@ -1,16 +1,5 @@ { config, ... }: { - imports = [ ../gatus/check.nix ]; - - gatusCheck.pull = [ - { - name = "Vaultwarden"; - url = "http://127.0.0.1:8000/alive"; - group = "apps"; - conditions = [ "[STATUS] == 200" ]; - } - ]; - services.vaultwarden = { enable = true; environmentFile = config.sops.templates.vaultwarden-env.path; diff --git a/modules/vaultwarden/reverse-proxy.nix b/modules/vaultwarden/reverse-proxy.nix index d62b965..4dd5a0e 100644 --- a/modules/vaultwarden/reverse-proxy.nix +++ b/modules/vaultwarden/reverse-proxy.nix @@ -5,7 +5,18 @@ let certDir = "/var/lib/acme/${domain}"; in { - imports = [ ../acme/sync.nix ]; + imports = [ + ../acme/sync.nix + ../gatus/check.nix + ]; + + gatusCheck.push = [ + { + name = "Vaultwarden"; + group = "apps"; + url = "https://${domain}/alive"; + } + ]; acmeSyncer.mkReceiver = [ {