Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions docs/admin/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ flowchart LR

### Gatus (eta)

- Pull 방식: 외부 접근 가능한 서비스 (Authentik, Headscale, Upterm Web/Relay 등)
- Push 방식: 내부 서비스가 주기적으로 상태 보고
- Pull 방식: eta에서 직접 접근 가능한 서비스 (Authentik, Headscale, Upterm 등)
- Push 방식: 내부 서비스가 로컬/사용자 경로를 확인한 뒤 상태 보고
- 저장소: SQLite (`/var/lib/gatus/gatus.sqlite`)로 재시작 후 uptime 유지
- External endpoint heartbeat: 15분 동안 push가 없으면 실패 처리
- 그룹: `apps`, `ci`, `monitoring`, `platform`
- 기본 정렬: group 기준
- 알림: ntfy (`ntfy.sjanglab.org`, 토픽: `gatus`)
2 changes: 1 addition & 1 deletion docs/guide/upterm.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,6 @@ ssh <session-id>:<token>@upterm.sjanglab.org -p 2323
| 웹 안내 페이지 | `https://upterm.sjanglab.org` |
| Relay endpoint | `ssh://upterm.sjanglab.org:2323` |
| 인증 | 세션 호스트가 지정한 GitHub 사용자 allow-list |
| 모니터링 | Gatus `Upterm Web`, `Upterm Relay` |
| 모니터링 | Gatus `Upterm` |

세션은 임시 공유 용도입니다. 장기 접속이나 서버 작업은 [SSH 접속](../dev/ssh-access.md)을 사용하세요.
2 changes: 1 addition & 1 deletion modules/authentik/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
{
name = "Authentik";
url = "https://auth.sjanglab.org";
group = "auth";
group = "platform";
}
];

Expand Down
12 changes: 1 addition & 11 deletions modules/buildbot/database.nix
Original file line number Diff line number Diff line change
@@ -1,20 +1,10 @@
# Nixbot PostgreSQL database health/backup helpers (deployed on psi)
# Nixbot database health/backup helpers (deployed on psi)
{
lib,
pkgs,
...
}:
{
imports = [ ../gatus/check.nix ];

gatusCheck.push = [
{
name = "Nixbot PostgreSQL";
group = "ci";
systemdService = "postgresql.service";
}
];

# services.nixbot provisions the nixbot database and peer-authenticated user.
services.postgresql = {
enable = true;
Expand Down
7 changes: 5 additions & 2 deletions modules/docling/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@ in
gatusCheck.push = [
{
name = "Docling";
group = "ai";
url = "http://127.0.0.1:${toString doclingPort}/health";
group = "apps";
checks = [
{ url = "http://127.0.0.1:${toString doclingPort}/health"; }
{ url = "https://${domain}/health"; }
];
}
];

Expand Down
81 changes: 68 additions & 13 deletions modules/gatus/check.nix
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,25 @@ let
in
"${sanitize ep.group}_${sanitize ep.name}";

# Render one line of the push script: a `check_url` invocation for a single
# HTTP check. Both arguments are shell-quoted so that URLs containing
# characters such as `&` or `?` reach the shell function unmodified.
mkUrlCheck =
  check:
  let
    quotedUrl = lib.escapeShellArg check.url;
    quotedStatus = lib.escapeShellArg (toString check.expectedStatus);
  in
  "check_url ${quotedUrl} ${quotedStatus}\n";

mkPushScript =
ep:
let
checks =
if ep.checks != null then
ep.checks
else if ep.url != null then
[
{
inherit (ep) url expectedStatus;
}
]
else
[ ];
in
pkgs.writeShellScript "gatus-push-${mkKey ep}" (
if ep.systemdService != null then
''
Expand All @@ -52,14 +69,24 @@ let
else
''
set -euo pipefail
status=$(${pkgs.curl}/bin/curl -sf --max-time 30 -o /dev/null -w "%{http_code}" "${ep.url}" 2>/dev/null) || true
if [ "$status" = "${toString ep.expectedStatus}" ]; then
success=true
error=""
else
success=false
error="expected ${toString ep.expectedStatus}, got $status"
fi
success=true
error=""

check_url() {
local url=$1
local expected=$2
local status
status=$(${pkgs.curl}/bin/curl -sf --max-time 30 -o /dev/null -w "%{http_code}" "$url" 2>/dev/null) || true
if [ "$status" != "$expected" ]; then
success=false
if [ -n "$error" ]; then
error="$error; "
fi
error="''${error}$url expected $expected, got $status"
fi
}

${lib.concatMapStringsSep "" mkUrlCheck checks}
${pkgs.curl}/bin/curl -sf --max-time 10 \
-X POST \
-G \
Expand Down Expand Up @@ -89,6 +116,16 @@ let
};
};

# Option type for one entry of a push endpoint's `checks` list: the URL to
# request and the HTTP status code that counts as success.
urlCheckSubmodule =
  let
    inherit (lib) mkOption types;
  in
  types.submodule {
    # URL to request.
    options.url = mkOption { type = types.str; };
    # Status code the response must carry; 200 unless overridden.
    options.expectedStatus = mkOption {
      type = types.int;
      default = 200;
    };
  };

pushSubmodule = lib.types.submodule {
options = {
name = lib.mkOption { type = lib.types.str; };
Expand All @@ -101,6 +138,11 @@ let
type = lib.types.int;
default = 200;
};
checks = lib.mkOption {
type = lib.types.nullOr (lib.types.listOf urlCheckSubmodule);
default = null;
description = "HTTP checks that must all pass before pushing success";
};
systemdService = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
Expand Down Expand Up @@ -129,11 +171,24 @@ in

# Push: systemd timers on the declaring host
config = lib.mkIf (cfg.push != [ ]) {
# Validate: each push entry must have exactly one of url or systemdService
assertions = map (ep: {
assertion = (ep.url != null) != (ep.systemdService != null);
message = "gatusCheck.push '${ep.name}': exactly one of 'url' or 'systemdService' must be set";
}) cfg.push;
# Validate every push entry:
#   1. exactly one check source ('url', 'checks' or 'systemdService') is set;
#   2. a 'checks' list, when given, contains at least one entry.
assertions = lib.concatMap (
  ep:
  let
    # One boolean per possible check source; true when that source is set.
    sourcesSet = [
      (ep.url != null)
      (ep.checks != null)
      (ep.systemdService != null)
    ];
  in
  [
    {
      assertion = lib.count lib.id sourcesSet == 1;
      message = "gatusCheck.push '${ep.name}': exactly one of 'url', 'checks', or 'systemdService' must be set";
    }
    {
      # Only an explicitly provided empty list is rejected; null is fine.
      assertion = !(ep.checks != null && ep.checks == [ ]);
      message = "gatusCheck.push '${ep.name}': checks must not be empty";
    }
  ]
) cfg.push;

sops.secrets.gatus-push-token = {
sopsFile = ./secrets.yaml;
Expand Down
19 changes: 15 additions & 4 deletions modules/gatus/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@ in
};
metrics = true;

# Persist results in an on-disk SQLite database so that uptime history
# survives a restart of the service.
storage = {
type = "sqlite";
path = "/var/lib/gatus/gatus.sqlite";
caching = true;
# Retention caps for stored results/events.
# NOTE(review): presumably applied per endpoint — confirm against the
# Gatus storage documentation.
maximum-number-of-results = 720;
maximum-number-of-events = 200;
};

# Dashboard lists endpoints sorted by group by default.
ui.default-sort-by = "group";

alerting.ntfy = {
topic = "gatus";
url = "https://ntfy.sjanglab.org";
Expand Down Expand Up @@ -49,23 +59,24 @@ in
# Build one external (push-based) endpoint entry for the given name and group.
# The token is left as a literal ${GATUS_EXTERNAL_TOKEN} placeholder in the
# generated config rather than being interpolated by Nix. The endpoint is
# treated as failed when no push arrives within the 15-minute heartbeat
# interval, and alerts are sent through ntfy.
mkExtEndpoint = name: group: {
  alerts = [ { type = "ntfy"; } ];
  heartbeat = {
    interval = "15m";
  };
  token = "\${GATUS_EXTERNAL_TOKEN}";
  inherit group name;
};
in
[
# psi
(mkExtEndpoint "Nixbot" "ci")
(mkExtEndpoint "Nixbot PostgreSQL" "ci")
(mkExtEndpoint "Docling" "ai")
(mkExtEndpoint "MULTI-evolve" "ai")
(mkExtEndpoint "Docling" "apps")
(mkExtEndpoint "MULTI-evolve" "apps")
# tau
(mkExtEndpoint "Nextcloud" "apps")
(mkExtEndpoint "n8n" "apps")
(mkExtEndpoint "Vaultwarden" "apps")
# rho
(mkExtEndpoint "Grafana" "monitoring")
(mkExtEndpoint "Prometheus" "monitoring")
(mkExtEndpoint "Loki" "monitoring")
(mkExtEndpoint "PostgreSQL" "db")
(mkExtEndpoint "PostgreSQL" "platform")
];
};
};
Expand Down
2 changes: 1 addition & 1 deletion modules/headscale/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
{
name = "Headscale";
url = "https://hs.sjanglab.org/health";
group = "auth";
group = "platform";
}
];

Expand Down
8 changes: 7 additions & 1 deletion modules/monitoring/grafana.nix
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@ in
{
name = "Grafana";
group = "monitoring";
url = "http://${wgAdminAddr}:3000/api/health";
checks = [
{ url = "http://${wgAdminAddr}:3000/api/health"; }
{
url = "https://logging.sjanglab.org/";
expectedStatus = 302;
}
];
}
];

Expand Down
10 changes: 8 additions & 2 deletions modules/multievolve/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ in
gatusCheck.push = [
{
name = "MULTI-evolve";
group = "ai";
url = "http://127.0.0.1:${toString port}/_stcore/health";
group = "apps";
checks = [
{ url = "http://127.0.0.1:${toString port}/_stcore/health"; }
{
url = "https://${domain}/";
expectedStatus = 302;
}
];
}
];

Expand Down
3 changes: 1 addition & 2 deletions modules/nextcloud/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ in
{
name = "Nextcloud";
group = "apps";
url = "http://127.0.0.1:80/status.php";
expectedStatus = 301;
url = "https://cloud.sjanglab.org/status.php";
}
];

Expand Down
2 changes: 1 addition & 1 deletion modules/ntfy.nix
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
{
name = "ntfy";
url = "https://ntfy.sjanglab.org/v1/health";
group = "infra";
group = "platform";
}
];

Expand Down
2 changes: 1 addition & 1 deletion modules/postgresql/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ in
gatusCheck.push = [
{
name = "PostgreSQL";
group = "db";
group = "platform";
systemdService = "postgresql.service";
}
];
Expand Down
9 changes: 2 additions & 7 deletions modules/uptermd/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,9 @@ in

gatusCheck.pull = [
{
name = "Upterm Web";
url = "https://${domain}";
group = "dev";
}
{
name = "Upterm Relay";
name = "Upterm";
url = "tcp://${domain}:${toString port}";
group = "dev";
group = "platform";
conditions = [ "[CONNECTED] == true" ];
}
];
Expand Down
11 changes: 0 additions & 11 deletions modules/vaultwarden/default.nix
Original file line number Diff line number Diff line change
@@ -1,16 +1,5 @@
{ config, ... }:
{
imports = [ ../gatus/check.nix ];

gatusCheck.pull = [
{
name = "Vaultwarden";
url = "http://127.0.0.1:8000/alive";
group = "apps";
conditions = [ "[STATUS] == 200" ];
}
];

services.vaultwarden = {
enable = true;
environmentFile = config.sops.templates.vaultwarden-env.path;
Expand Down
13 changes: 12 additions & 1 deletion modules/vaultwarden/reverse-proxy.nix
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,18 @@ let
certDir = "/var/lib/acme/${domain}";
in
{
imports = [ ../acme/sync.nix ];
imports = [
../acme/sync.nix
../gatus/check.nix
];

gatusCheck.push = [
{
name = "Vaultwarden";
group = "apps";
url = "https://${domain}/alive";
}
];

acmeSyncer.mkReceiver = [
{
Expand Down
Loading