diff --git a/docs/admin/monitoring.md b/docs/admin/monitoring.md index f54a585..f9ce97c 100644 --- a/docs/admin/monitoring.md +++ b/docs/admin/monitoring.md @@ -57,6 +57,10 @@ flowchart LR ### Gatus (eta) -- Pull 방식: 외부 접근 가능한 서비스 (Authentik, Headscale, Upterm Web/Relay 등) -- Push 방식: 내부 서비스가 주기적으로 상태 보고 +- Pull 방식: eta에서 직접 접근 가능한 서비스 (Authentik, Headscale, Upterm 등) +- Push 방식: 내부 서비스가 로컬/사용자 경로를 확인한 뒤 상태 보고 +- 저장소: SQLite (`/var/lib/gatus/gatus.sqlite`)로 재시작 후 uptime 유지 +- External endpoint heartbeat: 15분 동안 push가 없으면 실패 처리 +- 그룹: `apps`, `ci`, `monitoring`, `platform` +- 기본 정렬: group 기준 - 알림: ntfy (`ntfy.sjanglab.org`, 토픽: `gatus`) diff --git a/docs/guide/upterm.md b/docs/guide/upterm.md index e5416aa..3124360 100644 --- a/docs/guide/upterm.md +++ b/docs/guide/upterm.md @@ -30,6 +30,6 @@ ssh :@upterm.sjanglab.org -p 2323 | 웹 안내 페이지 | `https://upterm.sjanglab.org` | | Relay endpoint | `ssh://upterm.sjanglab.org:2323` | | 인증 | 세션 호스트가 지정한 GitHub 사용자 allow-list | -| 모니터링 | Gatus `Upterm Web`, `Upterm Relay` | +| 모니터링 | Gatus `Upterm` | 세션은 임시 공유 용도입니다. 장기 접속이나 서버 작업은 [SSH 접속](../dev/ssh-access.md)을 사용하세요. diff --git a/modules/authentik/default.nix b/modules/authentik/default.nix index 5595bbc..10c375c 100644 --- a/modules/authentik/default.nix +++ b/modules/authentik/default.nix @@ -9,7 +9,7 @@ { name = "Authentik"; url = "https://auth.sjanglab.org"; - group = "auth"; + group = "platform"; } ]; diff --git a/modules/buildbot/database.nix b/modules/buildbot/database.nix index 2bea7a8..927b002 100644 --- a/modules/buildbot/database.nix +++ b/modules/buildbot/database.nix @@ -1,20 +1,10 @@ -# Nixbot PostgreSQL database health/backup helpers (deployed on psi) +# Nixbot database health/backup helpers (deployed on psi) { lib, pkgs, ... }: { - imports = [ ../gatus/check.nix ]; - - gatusCheck.push = [ - { - name = "Nixbot PostgreSQL"; - group = "ci"; - systemdService = "postgresql.service"; - } - ]; - # services.nixbot provisions the nixbot database and peer-authenticated user. services.postgresql = { enable = true; diff --git a/modules/docling/default.nix b/modules/docling/default.nix index 211b2a6..7c1c6ad 100644 --- a/modules/docling/default.nix +++ b/modules/docling/default.nix @@ -13,8 +13,11 @@ in gatusCheck.push = [ { name = "Docling"; - group = "ai"; - url = "http://127.0.0.1:${toString doclingPort}/health"; + group = "apps"; + checks = [ + { url = "http://127.0.0.1:${toString doclingPort}/health"; } + { url = "https://${domain}/health"; } + ]; } ]; diff --git a/modules/gatus/check.nix b/modules/gatus/check.nix index 574583f..481a96d 100644 --- a/modules/gatus/check.nix +++ b/modules/gatus/check.nix @@ -28,8 +28,25 @@ let in "${sanitize ep.group}_${sanitize ep.name}"; + mkUrlCheck = check: '' + check_url ${lib.escapeShellArg check.url} ${lib.escapeShellArg (toString check.expectedStatus)} + ''; + mkPushScript = ep: + let + checks = + if ep.checks != null then + ep.checks + else if ep.url != null then + [ + { + inherit (ep) url expectedStatus; + } + ] + else + [ ]; + in pkgs.writeShellScript "gatus-push-${mkKey ep}" ( if ep.systemdService != null then '' @@ -52,14 +69,24 @@ let else '' set -euo pipefail - status=$(${pkgs.curl}/bin/curl -sf --max-time 30 -o /dev/null -w "%{http_code}" "${ep.url}" 2>/dev/null) || true - if [ "$status" = "${toString ep.expectedStatus}" ]; then - success=true - error="" - else - success=false - error="expected ${toString ep.expectedStatus}, got $status" - fi + success=true + error="" + + check_url() { + local url=$1 + local expected=$2 + local status + status=$(${pkgs.curl}/bin/curl -sf --max-time 30 -o /dev/null -w "%{http_code}" "$url" 2>/dev/null) || true + if [ "$status" != "$expected" ]; then + success=false + if [ -n "$error" ]; then + error="$error; " + fi + error="''${error}$url expected $expected, got $status" + fi + } + + ${lib.concatMapStringsSep "" mkUrlCheck checks} ${pkgs.curl}/bin/curl -sf --max-time 10 \ -X POST \ -G \ @@ -89,6 +116,16 @@ let }; }; + urlCheckSubmodule = lib.types.submodule { + options = { + url = lib.mkOption { type = lib.types.str; }; + expectedStatus = lib.mkOption { + type = lib.types.int; + default = 200; + }; + }; + }; + pushSubmodule = lib.types.submodule { options = { name = lib.mkOption { type = lib.types.str; }; @@ -101,6 +138,11 @@ let type = lib.types.int; default = 200; }; + checks = lib.mkOption { + type = lib.types.nullOr (lib.types.listOf urlCheckSubmodule); + default = null; + description = "HTTP checks that must all pass before pushing success"; + }; systemdService = lib.mkOption { type = lib.types.nullOr lib.types.str; default = null; @@ -129,11 +171,24 @@ in # Push: systemd timers on the declaring host config = lib.mkIf (cfg.push != [ ]) { - # Validate: each push entry must have exactly one of url or systemdService - assertions = map (ep: { - assertion = (ep.url != null) != (ep.systemdService != null); - message = "gatusCheck.push '${ep.name}': exactly one of 'url' or 'systemdService' must be set"; - }) cfg.push; + # Validate: each push entry must have exactly one check source. + assertions = lib.concatMap (ep: [ + { + assertion = + builtins.length ( + lib.filter (x: x) [ + (ep.url != null) + (ep.checks != null) + (ep.systemdService != null) + ] + ) == 1; + message = "gatusCheck.push '${ep.name}': exactly one of 'url', 'checks', or 'systemdService' must be set"; + } + { + assertion = ep.checks == null || ep.checks != [ ]; + message = "gatusCheck.push '${ep.name}': checks must not be empty"; + } + ]) cfg.push; sops.secrets.gatus-push-token = { sopsFile = ./secrets.yaml; diff --git a/modules/gatus/default.nix b/modules/gatus/default.nix index 5591a2e..f0fda24 100644 --- a/modules/gatus/default.nix +++ b/modules/gatus/default.nix @@ -18,6 +18,16 @@ in }; metrics = true; + storage = { + type = "sqlite"; + path = "/var/lib/gatus/gatus.sqlite"; + caching = true; + maximum-number-of-results = 720; + maximum-number-of-events = 200; + }; + + ui.default-sort-by = "group"; + alerting.ntfy = { topic = "gatus"; url = "https://ntfy.sjanglab.org"; @@ -49,23 +59,24 @@ in mkExtEndpoint = name: group: { inherit name group; token = "\${GATUS_EXTERNAL_TOKEN}"; + heartbeat.interval = "15m"; alerts = [ { type = "ntfy"; } ]; }; in [ # psi (mkExtEndpoint "Nixbot" "ci") - (mkExtEndpoint "Nixbot PostgreSQL" "ci") - (mkExtEndpoint "Docling" "ai") - (mkExtEndpoint "MULTI-evolve" "ai") + (mkExtEndpoint "Docling" "apps") + (mkExtEndpoint "MULTI-evolve" "apps") # tau (mkExtEndpoint "Nextcloud" "apps") (mkExtEndpoint "n8n" "apps") + (mkExtEndpoint "Vaultwarden" "apps") # rho (mkExtEndpoint "Grafana" "monitoring") (mkExtEndpoint "Prometheus" "monitoring") (mkExtEndpoint "Loki" "monitoring") - (mkExtEndpoint "PostgreSQL" "db") + (mkExtEndpoint "PostgreSQL" "platform") ]; }; }; diff --git a/modules/headscale/default.nix b/modules/headscale/default.nix index 9c5464c..4fc066f 100644 --- a/modules/headscale/default.nix +++ b/modules/headscale/default.nix @@ -10,7 +10,7 @@ { name = "Headscale"; url = "https://hs.sjanglab.org/health"; - group = "auth"; + group = "platform"; } ]; diff --git a/modules/monitoring/grafana.nix b/modules/monitoring/grafana.nix index a70f16c..46263fa 100644 --- a/modules/monitoring/grafana.nix +++ b/modules/monitoring/grafana.nix @@ -16,7 +16,13 @@ in { name = "Grafana"; group = "monitoring"; - url = "http://${wgAdminAddr}:3000/api/health"; + checks = [ + { url = "http://${wgAdminAddr}:3000/api/health"; } + { + url = "https://logging.sjanglab.org/"; + expectedStatus = 302; + } + ]; } ]; diff --git a/modules/multievolve/default.nix b/modules/multievolve/default.nix index 7b434aa..71577d8 100644 --- a/modules/multievolve/default.nix +++ b/modules/multievolve/default.nix @@ -22,8 +22,14 @@ in gatusCheck.push = [ { name = "MULTI-evolve"; - group = "ai"; - url = "http://127.0.0.1:${toString port}/_stcore/health"; + group = "apps"; + checks = [ + { url = "http://127.0.0.1:${toString port}/_stcore/health"; } + { + url = "https://${domain}/"; + expectedStatus = 302; + } + ]; } ]; diff --git a/modules/nextcloud/default.nix b/modules/nextcloud/default.nix index e182083..1646c31 100644 --- a/modules/nextcloud/default.nix +++ b/modules/nextcloud/default.nix @@ -21,8 +21,7 @@ in { name = "Nextcloud"; group = "apps"; - url = "http://127.0.0.1:80/status.php"; - expectedStatus = 301; + url = "https://cloud.sjanglab.org/status.php"; } ]; diff --git a/modules/ntfy.nix b/modules/ntfy.nix index 397a8d8..9e0ca45 100644 --- a/modules/ntfy.nix +++ b/modules/ntfy.nix @@ -14,7 +14,7 @@ { name = "ntfy"; url = "https://ntfy.sjanglab.org/v1/health"; - group = "infra"; + group = "platform"; } ]; diff --git a/modules/postgresql/default.nix b/modules/postgresql/default.nix index f908d52..3f9c2b3 100644 --- a/modules/postgresql/default.nix +++ b/modules/postgresql/default.nix @@ -13,7 +13,7 @@ in gatusCheck.push = [ { name = "PostgreSQL"; - group = "db"; + group = "platform"; systemdService = "postgresql.service"; } ]; diff --git a/modules/uptermd/default.nix b/modules/uptermd/default.nix index cc15215..cd82151 100644 --- a/modules/uptermd/default.nix +++ b/modules/uptermd/default.nix @@ -35,14 +35,9 @@ in gatusCheck.pull = [ { - name = "Upterm Web"; - url = "https://${domain}"; - group = "dev"; - } - { - name = "Upterm Relay"; + name = "Upterm"; url = "tcp://${domain}:${toString port}"; - group = "dev"; + group = "platform"; conditions = [ "[CONNECTED] == true" ]; } ]; diff --git a/modules/vaultwarden/default.nix b/modules/vaultwarden/default.nix index 8213b6e..ede560c 100644 --- a/modules/vaultwarden/default.nix +++ b/modules/vaultwarden/default.nix @@ -1,16 +1,5 @@ { config, ... }: { - imports = [ ../gatus/check.nix ]; - - gatusCheck.pull = [ - { - name = "Vaultwarden"; - url = "http://127.0.0.1:8000/alive"; - group = "apps"; - conditions = [ "[STATUS] == 200" ]; - } - ]; - services.vaultwarden = { enable = true; environmentFile = config.sops.templates.vaultwarden-env.path; diff --git a/modules/vaultwarden/reverse-proxy.nix b/modules/vaultwarden/reverse-proxy.nix index d62b965..4dd5a0e 100644 --- a/modules/vaultwarden/reverse-proxy.nix +++ b/modules/vaultwarden/reverse-proxy.nix @@ -5,7 +5,18 @@ let certDir = "/var/lib/acme/${domain}"; in { - imports = [ ../acme/sync.nix ]; + imports = [ + ../acme/sync.nix + ../gatus/check.nix + ]; + + gatusCheck.push = [ + { + name = "Vaultwarden"; + group = "apps"; + url = "https://${domain}/alive"; + } + ]; acmeSyncer.mkReceiver = [ {