Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/admin/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,18 @@ flowchart LR
| 수집 대상 | 전송처 | 주기 |
|---------|--------|------|
| sshd 로그 | Loki (rho:3100) | 실시간 |
| SSH bastion forward 매핑 | Loki (rho:3100) | 실시간 |
| auditd 로그 | Loki (rho:3100) | 실시간 |
| 호스트 메트릭 | Prometheus (rho:9090) | 60초 |

eta는 SSH 인증 로그와 같은 PID의 outbound socket을 관찰해 `ssh_bastion` 로그를 생성하고 Loki로 직접 전송합니다. ProxyJump 때문에 대상 호스트가 eta의 내부 IP만 보더라도 실제 접속원 IP, bastion 사용자, 대상 호스트를 함께 조회할 수 있습니다.

```logql
{log_type="ssh_bastion", event="bastion_forward"}
```

대상 호스트의 SSH 로그와 대조할 때는 `target_host`, `bastion_user`, 시간대, `bastion_local_port`를 함께 확인합니다. 대상 호스트 sshd 로그에 기록된 `source_port` 값은 eta가 기록한 `bastion_local_port` 값과 동일하므로, 이 포트 번호로 두 로그를 연결할 수 있습니다.

### Prometheus (rho)

- 리텐션: 30일
Expand Down
1 change: 1 addition & 0 deletions modules/gatus/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ in
(mkExtEndpoint "n8n" "apps")
(mkExtEndpoint "Vaultwarden" "apps")
# rho
(mkExtEndpoint "Gatus" "monitoring")
(mkExtEndpoint "Grafana" "monitoring")
(mkExtEndpoint "Prometheus" "monitoring")
(mkExtEndpoint "Loki" "monitoring")
Expand Down
13 changes: 12 additions & 1 deletion modules/gatus/reverse-proxy.nix
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,18 @@ let
certDir = "/var/lib/acme/${domain}";
in
{
imports = [ ../acme/sync.nix ];
imports = [
./check.nix
../acme/sync.nix
];

gatusCheck.push = [
{
name = "Gatus";
group = "monitoring";
url = "https://${domain}/";
}
];

acmeSyncer.mkReceiver = [
{
Expand Down
265 changes: 265 additions & 0 deletions modules/monitoring/grafana-dashboards.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
{
pkgs,
lib,
...
}:
let
dashboard =
  let
    # Builds a Grafana gridPos attribute set from x, y, width and height.
    place = x: y: w: h: {
      inherit
        x
        y
        w
        h
        ;
    };

    # Every panel below issues exactly one query, always with refId "A".
    promTarget = expr: legendFormat: [
      {
        refId = "A";
        inherit expr legendFormat;
      }
    ];
    lokiTarget = expr: [
      {
        refId = "A";
        inherit expr;
      }
    ];

    # Value mapping shared by the stat panels: 0 renders as DOWN, 1 as UP.
    upDownMappings = [
      {
        type = "value";
        options = {
          "0" = {
            text = "DOWN";
            color = "red";
          };
          "1" = {
            text = "UP";
            color = "green";
          };
        };
      }
    ];

    # Top-row time series panel: a 0-100 percent value, one series per host.
    hostPercentPanel =
      {
        id,
        title,
        x,
        expr,
      }:
      {
        inherit id title;
        type = "timeseries";
        datasource = "Prometheus";
        gridPos = place x 0 8 8;
        fieldConfig.defaults = {
          unit = "percent";
          min = 0;
          max = 100;
        };
        targets = promTarget expr "{{host}}";
      };

    # Full-width Loki logs panel; only position, height, label visibility
    # and the LogQL selector differ between the instances.
    logsPanel =
      {
        id,
        title,
        y,
        h,
        showLabels,
        expr,
      }:
      {
        inherit id title;
        type = "logs";
        datasource = "Loki";
        gridPos = place 0 y 24 h;
        options = {
          inherit showLabels;
          showTime = true;
          wrapLogMessage = true;
          enableLogDetails = true;
          sortOrder = "Descending";
        };
        targets = lokiTarget expr;
      };
  in
  {
    uid = "sjanglab-infra";
    title = "SjangLab Infrastructure";
    tags = [
      "infra"
      "nixos"
    ];
    timezone = "browser";
    schemaVersion = 41;
    version = 1;
    refresh = "30s";
    time = {
      from = "now-6h";
      to = "now";
    };
    templating.list = [ ];
    annotations.list = [ ];
    panels = [
      # Row 1: per-host resource usage.
      (hostPercentPanel {
        id = 1;
        title = "Host memory available";
        x = 0;
        expr = "100 * host_memory_available_bytes / host_memory_total_bytes";
      })
      (hostPercentPanel {
        id = 2;
        title = "CPU busy";
        x = 8;
        expr = ''(1 - avg by (host) (rate(host_cpu_seconds_total{mode="idle"}[5m]))) * 100'';
      })
      (hostPercentPanel {
        id = 3;
        title = "Root filesystem free";
        x = 16;
        expr = ''100 * host_filesystem_free_bytes{mountpoint="/"} / host_filesystem_total_bytes{mountpoint="/"}'';
      })

      # Row 2: UP/DOWN status tiles.
      {
        id = 4;
        title = "Gatus endpoint success";
        type = "stat";
        datasource = "Prometheus";
        gridPos = place 0 8 12 8;
        fieldConfig.defaults = {
          unit = "bool_on_off";
          mappings = upDownMappings;
          thresholds = {
            mode = "absolute";
            steps = [
              {
                color = "red";
                value = null;
              }
              {
                color = "green";
                value = 1;
              }
            ];
          };
        };
        options = {
          reduceOptions = {
            calcs = [ "lastNotNull" ];
            fields = "";
            values = false;
          };
          orientation = "auto";
          textMode = "auto";
        };
        targets = promTarget "gatus_results_endpoint_success" "{{group}}/{{name}}";
      }
      {
        id = 5;
        title = "Prometheus target health";
        type = "stat";
        datasource = "Prometheus";
        gridPos = place 12 8 12 8;
        fieldConfig.defaults = {
          unit = "bool_on_off";
          mappings = upDownMappings;
        };
        targets = promTarget "up" "{{job}} {{instance}}";
      }

      # Rows 3 and 4: SSH-related log streams from Loki.
      (logsPanel {
        id = 6;
        title = "Recent SSH events";
        y = 16;
        h = 10;
        showLabels = false;
        expr = ''{log_type="ssh"}'';
      })
      (logsPanel {
        id = 7;
        title = "SSH bastion forwards";
        y = 26;
        h = 8;
        showLabels = true;
        expr = ''{log_type="ssh_bastion", event="bastion_forward"}'';
      })
    ];
  };

# Store directory containing the dashboard as sjanglab-infra.json.
# The attribute set is serialised with builtins.toJSON and then passed
# through `jq .` before being written to $out.
dashboardsDir =
  let
    jq = lib.getExe pkgs.jq;
    rawJson = pkgs.writeText "sjanglab-infra-dashboard.json" (builtins.toJSON dashboard);
  in
  pkgs.runCommand "grafana-dashboards" { } ''
    mkdir -p $out
    ${jq} . ${rawJson} > $out/sjanglab-infra.json
  '';
in
{
  # Registers one file-based dashboard provider that reads from the
  # directory built in dashboardsDir.
  services.grafana.provision.dashboards = {
    settings = {
      apiVersion = 1;
      providers = [
        {
          name = "infra";
          type = "file";
          options = {
            path = dashboardsDir;
          };
          updateIntervalSeconds = 30;
          disableDeletion = true;
          allowUiUpdates = false;
        }
      ];
    };
  };
}
7 changes: 5 additions & 2 deletions modules/monitoring/grafana.nix
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@ let
prometheusUrl = "http://${wgAdminAddr}:9090";
in
{
imports = [ ../gatus/check.nix ];
imports = [
./grafana-dashboards.nix
../gatus/check.nix
];

gatusCheck.push = [
{
Expand Down Expand Up @@ -54,7 +57,7 @@ in
# Grafana only listens on wg-admin, so only WG-authenticated hosts can reach it.
"auth.anonymous" = {
enabled = true;
org_name = "Public";
org_name = "Main Org.";
org_role = "Viewer";
};
};
Expand Down
Loading
Loading