From 945fedf9518d951259411a535750ad3998c7947b Mon Sep 17 00:00:00 2001 From: mulatta <67085791+mulatta@users.noreply.github.com> Date: Fri, 3 Jul 2026 16:27:22 +0900 Subject: [PATCH 1/8] monitoring: fix alert rules and local log sinks Prometheus alerts referenced Vector metric names that no longer exist, so disk, memory, and CPU alerts could not fire. Remote-write hosts also do not appear as scrape targets, which made the existing NodeDown rule ineffective for eta, psi, and tau. Use the current host metric names, add per-host freshness alerts for remote-write data, and alert on failing Gatus endpoints. Point local Vector Loki sinks at the address Loki actually listens on, exclude unreadable systemd credential mounts from filesystem metrics, and add a Gatus self-check so the status page path is visible in Gatus. --- modules/gatus/default.nix | 1 + modules/gatus/reverse-proxy.nix | 13 +++- modules/monitoring/prometheus/rules.nix | 64 ++++++++++++------- modules/monitoring/vector/default.nix | 9 +++ .../monitoring/vector/monitor-services.nix | 2 +- modules/monitoring/vector/monitor-systems.nix | 4 +- 6 files changed, 67 insertions(+), 26 deletions(-) diff --git a/modules/gatus/default.nix b/modules/gatus/default.nix index f0fda24..539906c 100644 --- a/modules/gatus/default.nix +++ b/modules/gatus/default.nix @@ -73,6 +73,7 @@ in (mkExtEndpoint "n8n" "apps") (mkExtEndpoint "Vaultwarden" "apps") # rho + (mkExtEndpoint "Gatus" "monitoring") (mkExtEndpoint "Grafana" "monitoring") (mkExtEndpoint "Prometheus" "monitoring") (mkExtEndpoint "Loki" "monitoring") diff --git a/modules/gatus/reverse-proxy.nix b/modules/gatus/reverse-proxy.nix index 122e5bd..0056750 100644 --- a/modules/gatus/reverse-proxy.nix +++ b/modules/gatus/reverse-proxy.nix @@ -5,7 +5,18 @@ let certDir = "/var/lib/acme/${domain}"; in { - imports = [ ../acme/sync.nix ]; + imports = [ + ./check.nix + ../acme/sync.nix + ]; + + gatusCheck.push = [ + { + name = "Gatus"; + group = "monitoring"; + url = 
"https://${domain}/"; + } + ]; acmeSyncer.mkReceiver = [ { diff --git a/modules/monitoring/prometheus/rules.nix b/modules/monitoring/prometheus/rules.nix index 41e1b21..7f11162 100644 --- a/modules/monitoring/prometheus/rules.nix +++ b/modules/monitoring/prometheus/rules.nix @@ -1,4 +1,24 @@ # monitoring/prometheus/rules.nix +{ config, lib, ... }: +let + monitoredHosts = lib.attrNames ( + lib.filterAttrs (_: host: host.wg-admin != null) config.networking.sbee.hosts + ); + + hostFreshnessRules = map (host: { + alert = "HostMetricsMissing"; + expr = ''absent_over_time(host_memory_total_bytes{host="${host}"}[10m])''; + for = "2m"; + labels = { + severity = "critical"; + inherit host; + }; + annotations = { + summary = "Host metrics missing"; + description = "${host}: no host metrics received for 10 minutes"; + }; + }) monitoredHosts; +in { services.prometheus.rules = [ (builtins.toJSON { @@ -6,31 +26,20 @@ { name = "system_alerts"; interval = "60s"; - rules = [ - { - alert = "SSHBruteForce"; - expr = ''sum by (host) (count_over_time({log_type="ssh", event="login_failed"}[5m])) > 10''; - for = "2m"; - labels.severity = "warning"; - annotations = { - summary = "SSH brute force attempt"; - description = "{{ $labels.host }}: {{ $value }} failed SSH attempts in 5min"; - }; - } - + rules = hostFreshnessRules ++ [ { alert = "DiskSpaceLow"; expr = '' ( - vector_host_filesystem_free_bytes{filesystem="/"} / - vector_host_filesystem_total_bytes + host_filesystem_free_bytes{mountpoint="/"} / + host_filesystem_total_bytes{mountpoint="/"} ) * 100 < 10 ''; for = "5m"; labels.severity = "warning"; annotations = { summary = "Low disk space"; - description = "{{ $labels.host }}: {{ $value | humanize }}% free"; + description = "{{ $labels.host }}: {{ $value | humanize }}% free on /"; }; } @@ -38,8 +47,8 @@ alert = "MemoryLow"; expr = '' ( - vector_host_memory_available_bytes / - vector_host_memory_total_bytes + host_memory_available_bytes / + host_memory_total_bytes ) * 100 < 10 ''; for 
= "5m"; @@ -55,7 +64,7 @@ expr = '' ( 1 - avg by (host) ( - rate(vector_host_cpu_seconds_total{mode="idle"}[5m]) + rate(host_cpu_seconds_total{mode="idle"}[5m]) ) ) * 100 > 90 ''; @@ -68,13 +77,24 @@ } { - alert = "NodeDown"; - expr = ''up{job="vector"} == 0''; + alert = "PrometheusTargetDown"; + expr = "up == 0"; for = "2m"; labels.severity = "critical"; annotations = { - summary = "Node is down"; - description = "{{ $labels.host }} is not responding"; + summary = "Prometheus target down"; + description = "{{ $labels.job }} target {{ $labels.instance }} is down"; + }; + } + + { + alert = "GatusEndpointDown"; + expr = "gatus_results_endpoint_success == 0"; + for = "5m"; + labels.severity = "warning"; + annotations = { + summary = "Gatus endpoint down"; + description = "{{ $labels.group }}/{{ $labels.name }} is failing"; }; } ]; diff --git a/modules/monitoring/vector/default.nix b/modules/monitoring/vector/default.nix index 7d67377..acf6fe5 100644 --- a/modules/monitoring/vector/default.nix +++ b/modules/monitoring/vector/default.nix @@ -55,6 +55,15 @@ in "memory" "network" ]; + filesystem.mountpoints.excludes = [ + "/etc/group" + "/etc/hostname" + "/etc/hosts" + "/etc/passwd" + "/etc/resolv.conf" + "/etc/shadow" + "/run/credentials/*" + ]; }; network_stats = { diff --git a/modules/monitoring/vector/monitor-services.nix b/modules/monitoring/vector/monitor-services.nix index 0877f6e..6feba27 100644 --- a/modules/monitoring/vector/monitor-services.nix +++ b/modules/monitoring/vector/monitor-services.nix @@ -65,7 +65,7 @@ in nextflow_logs = { type = "loki"; inputs = [ "parse_nextflow" ]; - endpoint = "http://127.0.0.1:3100"; + endpoint = "http://${wgAdminAddr}:3100"; encoding.codec = "json"; labels = { log_type = "nextflow"; diff --git a/modules/monitoring/vector/monitor-systems.nix b/modules/monitoring/vector/monitor-systems.nix index efec76a..43223ef 100644 --- a/modules/monitoring/vector/monitor-systems.nix +++ b/modules/monitoring/vector/monitor-systems.nix @@ 
-28,7 +28,7 @@ in ssh_logs_local = { type = "loki"; inputs = [ "filter_ssh" ]; - endpoint = "http://127.0.0.1:3100"; + endpoint = "http://${wgAdminAddr}:3100"; encoding.codec = "json"; labels = { host = "{{ host }}"; @@ -42,7 +42,7 @@ in audit_logs_local = { type = "loki"; inputs = [ "filter_audit" ]; - endpoint = "http://127.0.0.1:3100"; + endpoint = "http://${wgAdminAddr}:3100"; encoding.codec = "json"; labels = { host = "{{ host }}"; From a7eb2eebe02e7335aa80b460f0ebba583d4ec004 Mon Sep 17 00:00:00 2001 From: mulatta <67085791+mulatta@users.noreply.github.com> Date: Sat, 4 Jul 2026 01:20:55 +0900 Subject: [PATCH 2/8] nvidia: avoid 32-bit graphics drivers Keep psi system builds focused on CUDA services instead of pulling i686 graphics compatibility outputs needed only for 32-bit desktop applications. --- modules/nvidia.nix | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/nvidia.nix b/modules/nvidia.nix index 7271a7b..5b3b606 100644 --- a/modules/nvidia.nix +++ b/modules/nvidia.nix @@ -26,6 +26,7 @@ virtualisation.docker.enable = true; hardware.nvidia-container-toolkit.enable = true; - # allow only x86_64-linux - hardware.graphics.enable32Bit = true; + # psi runs CUDA services, not 32-bit desktop apps. Keeping this off avoids + # pulling i686 graphics drivers into every system build. + hardware.graphics.enable32Bit = false; } From b9ea2be7778aa2b9170a8d49e423592abd97c734 Mon Sep 17 00:00:00 2001 From: mulatta <67085791+mulatta@users.noreply.github.com> Date: Sat, 4 Jul 2026 00:54:28 +0900 Subject: [PATCH 3/8] monitoring: ignore Docker namespace mounts Avoid noisy host metric collection errors from Docker network namespace bind mounts that Vector cannot stat safely. 
--- modules/monitoring/vector/default.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/monitoring/vector/default.nix b/modules/monitoring/vector/default.nix index acf6fe5..f92efbd 100644 --- a/modules/monitoring/vector/default.nix +++ b/modules/monitoring/vector/default.nix @@ -63,6 +63,7 @@ in "/etc/resolv.conf" "/etc/shadow" "/run/credentials/*" + "/run/docker/netns/*" ]; }; From 4538e3bb7f12c00bdd2a4ce54b0716ef79f3215a Mon Sep 17 00:00:00 2001 From: mulatta <67085791+mulatta@users.noreply.github.com> Date: Sat, 4 Jul 2026 01:02:26 +0900 Subject: [PATCH 4/8] monitoring: ignore Docker overlay mounts Avoid noisy host metric collection errors from Docker overlay mountpoints that Vector cannot stat safely. --- modules/monitoring/vector/default.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/monitoring/vector/default.nix b/modules/monitoring/vector/default.nix index f92efbd..6bbae5e 100644 --- a/modules/monitoring/vector/default.nix +++ b/modules/monitoring/vector/default.nix @@ -64,6 +64,7 @@ in "/etc/shadow" "/run/credentials/*" "/run/docker/netns/*" + "/var/lib/docker/overlay2/*/merged" ]; }; From 3fbac0b9127fb3290ff1e79d5bd26fa0a805b77b Mon Sep 17 00:00:00 2001 From: mulatta <67085791+mulatta@users.noreply.github.com> Date: Sat, 4 Jul 2026 01:19:22 +0900 Subject: [PATCH 5/8] monitoring: provision an infrastructure dashboard Make Grafana useful immediately after deployment instead of requiring ad-hoc Explore queries for basic host, endpoint, and SSH visibility. --- modules/monitoring/grafana-dashboards.nix | 258 ++++++++++++++++++++++ modules/monitoring/grafana.nix | 7 +- 2 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 modules/monitoring/grafana-dashboards.nix diff --git a/modules/monitoring/grafana-dashboards.nix b/modules/monitoring/grafana-dashboards.nix new file mode 100644 index 0000000..3714900 --- /dev/null +++ b/modules/monitoring/grafana-dashboards.nix @@ -0,0 +1,258 @@ +{ + pkgs, + lib, + ... 
+}: +let + dashboard = { + uid = "sjanglab-infra"; + title = "SjangLab Infrastructure"; + tags = [ + "infra" + "nixos" + ]; + timezone = "browser"; + schemaVersion = 41; + version = 1; + refresh = "30s"; + time = { + from = "now-6h"; + to = "now"; + }; + templating.list = [ ]; + annotations.list = [ ]; + panels = [ + { + id = 1; + title = "Host memory available"; + type = "timeseries"; + datasource = { + type = "prometheus"; + uid = "prometheus"; + }; + gridPos = { + h = 8; + w = 8; + x = 0; + y = 0; + }; + fieldConfig.defaults = { + unit = "percent"; + min = 0; + max = 100; + }; + targets = [ + { + refId = "A"; + expr = "100 * host_memory_available_bytes / host_memory_total_bytes"; + legendFormat = "{{host}}"; + } + ]; + } + { + id = 2; + title = "CPU busy"; + type = "timeseries"; + datasource = { + type = "prometheus"; + uid = "prometheus"; + }; + gridPos = { + h = 8; + w = 8; + x = 8; + y = 0; + }; + fieldConfig.defaults = { + unit = "percent"; + min = 0; + max = 100; + }; + targets = [ + { + refId = "A"; + expr = ''(1 - avg by (host) (rate(host_cpu_seconds_total{mode="idle"}[5m]))) * 100''; + legendFormat = "{{host}}"; + } + ]; + } + { + id = 3; + title = "Root filesystem free"; + type = "timeseries"; + datasource = { + type = "prometheus"; + uid = "prometheus"; + }; + gridPos = { + h = 8; + w = 8; + x = 16; + y = 0; + }; + fieldConfig.defaults = { + unit = "percent"; + min = 0; + max = 100; + }; + targets = [ + { + refId = "A"; + expr = ''100 * host_filesystem_free_bytes{mountpoint="/"} / host_filesystem_total_bytes{mountpoint="/"}''; + legendFormat = "{{host}}"; + } + ]; + } + { + id = 4; + title = "Gatus endpoint success"; + type = "stat"; + datasource = { + type = "prometheus"; + uid = "prometheus"; + }; + gridPos = { + h = 8; + w = 12; + x = 0; + y = 8; + }; + fieldConfig.defaults = { + unit = "bool_on_off"; + mappings = [ + { + type = "value"; + options = { + "0" = { + text = "DOWN"; + color = "red"; + }; + "1" = { + text = "UP"; + color = "green"; + }; + 
}; + } + ]; + thresholds = { + mode = "absolute"; + steps = [ + { + color = "red"; + value = null; + } + { + color = "green"; + value = 1; + } + ]; + }; + }; + options = { + reduceOptions = { + calcs = [ "lastNotNull" ]; + fields = ""; + values = false; + }; + orientation = "auto"; + textMode = "auto"; + }; + targets = [ + { + refId = "A"; + expr = "gatus_results_endpoint_success"; + legendFormat = "{{group}}/{{name}}"; + } + ]; + } + { + id = 5; + title = "Prometheus target health"; + type = "stat"; + datasource = { + type = "prometheus"; + uid = "prometheus"; + }; + gridPos = { + h = 8; + w = 12; + x = 12; + y = 8; + }; + fieldConfig.defaults = { + unit = "bool_on_off"; + mappings = [ + { + type = "value"; + options = { + "0" = { + text = "DOWN"; + color = "red"; + }; + "1" = { + text = "UP"; + color = "green"; + }; + }; + } + ]; + }; + targets = [ + { + refId = "A"; + expr = "up"; + legendFormat = "{{job}} {{instance}}"; + } + ]; + } + { + id = 6; + title = "Recent SSH events"; + type = "logs"; + datasource = { + type = "loki"; + uid = "loki"; + }; + gridPos = { + h = 10; + w = 24; + x = 0; + y = 16; + }; + options = { + showTime = true; + showLabels = false; + wrapLogMessage = true; + enableLogDetails = true; + sortOrder = "Descending"; + }; + targets = [ + { + refId = "A"; + expr = ''{log_type="ssh"}''; + } + ]; + } + ]; + }; + + dashboardsDir = pkgs.runCommand "grafana-dashboards" { } '' + mkdir -p $out + ${lib.getExe pkgs.jq} . 
${pkgs.writeText "sjanglab-infra-dashboard.json" (builtins.toJSON dashboard)} > $out/sjanglab-infra.json + ''; +in +{ + services.grafana.provision.dashboards.settings = { + apiVersion = 1; + providers = [ + { + name = "infra"; + type = "file"; + disableDeletion = true; + updateIntervalSeconds = 30; + allowUiUpdates = false; + options.path = dashboardsDir; + } + ]; + }; +} diff --git a/modules/monitoring/grafana.nix b/modules/monitoring/grafana.nix index 46263fa..2f788bc 100644 --- a/modules/monitoring/grafana.nix +++ b/modules/monitoring/grafana.nix @@ -10,7 +10,10 @@ let prometheusUrl = "http://${wgAdminAddr}:9090"; in { - imports = [ ../gatus/check.nix ]; + imports = [ + ./grafana-dashboards.nix + ../gatus/check.nix + ]; gatusCheck.push = [ { @@ -65,6 +68,7 @@ in datasources.settings.datasources = [ { name = "Prometheus"; + uid = "prometheus"; type = "prometheus"; url = prometheusUrl; isDefault = true; @@ -72,6 +76,7 @@ in } { name = "Loki"; + uid = "loki"; type = "loki"; url = lokiUrl; editable = false; From 62275922ad09ace70a8df8b763e84a1b24ea47c4 Mon Sep 17 00:00:00 2001 From: mulatta <67085791+mulatta@users.noreply.github.com> Date: Sat, 4 Jul 2026 01:26:19 +0900 Subject: [PATCH 6/8] monitoring: keep Grafana datasource identity stable Avoid breaking startup on existing Grafana databases by keeping datasource provisioning compatible with the already-created Prometheus and Loki entries. 
--- modules/monitoring/grafana-dashboards.nix | 30 +++++------------------ modules/monitoring/grafana.nix | 2 -- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/modules/monitoring/grafana-dashboards.nix b/modules/monitoring/grafana-dashboards.nix index 3714900..7b90484 100644 --- a/modules/monitoring/grafana-dashboards.nix +++ b/modules/monitoring/grafana-dashboards.nix @@ -26,10 +26,7 @@ let id = 1; title = "Host memory available"; type = "timeseries"; - datasource = { - type = "prometheus"; - uid = "prometheus"; - }; + datasource = "Prometheus"; gridPos = { h = 8; w = 8; @@ -53,10 +50,7 @@ let id = 2; title = "CPU busy"; type = "timeseries"; - datasource = { - type = "prometheus"; - uid = "prometheus"; - }; + datasource = "Prometheus"; gridPos = { h = 8; w = 8; @@ -80,10 +74,7 @@ let id = 3; title = "Root filesystem free"; type = "timeseries"; - datasource = { - type = "prometheus"; - uid = "prometheus"; - }; + datasource = "Prometheus"; gridPos = { h = 8; w = 8; @@ -107,10 +98,7 @@ let id = 4; title = "Gatus endpoint success"; type = "stat"; - datasource = { - type = "prometheus"; - uid = "prometheus"; - }; + datasource = "Prometheus"; gridPos = { h = 8; w = 12; @@ -169,10 +157,7 @@ let id = 5; title = "Prometheus target health"; type = "stat"; - datasource = { - type = "prometheus"; - uid = "prometheus"; - }; + datasource = "Prometheus"; gridPos = { h = 8; w = 12; @@ -209,10 +194,7 @@ let id = 6; title = "Recent SSH events"; type = "logs"; - datasource = { - type = "loki"; - uid = "loki"; - }; + datasource = "Loki"; gridPos = { h = 10; w = 24; diff --git a/modules/monitoring/grafana.nix b/modules/monitoring/grafana.nix index 2f788bc..8a5a6af 100644 --- a/modules/monitoring/grafana.nix +++ b/modules/monitoring/grafana.nix @@ -68,7 +68,6 @@ in datasources.settings.datasources = [ { name = "Prometheus"; - uid = "prometheus"; type = "prometheus"; url = prometheusUrl; isDefault = true; @@ -76,7 +75,6 @@ in } { name = "Loki"; - uid = "loki"; type = 
"loki"; url = lokiUrl; editable = false; From ddd52a16ceb5e149d277eef87167263c4f2636b1 Mon Sep 17 00:00:00 2001 From: mulatta <67085791+mulatta@users.noreply.github.com> Date: Sat, 4 Jul 2026 01:34:32 +0900 Subject: [PATCH 7/8] monitoring: use Grafana default organization for anonymous access Avoid anonymous API and UI failures on fresh deployments where the custom Public organization does not exist. --- modules/monitoring/grafana.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/monitoring/grafana.nix b/modules/monitoring/grafana.nix index 8a5a6af..298e441 100644 --- a/modules/monitoring/grafana.nix +++ b/modules/monitoring/grafana.nix @@ -57,7 +57,7 @@ in # Grafana only listens on wg-admin, so only WG-authenticated hosts can reach it. "auth.anonymous" = { enabled = true; - org_name = "Public"; + org_name = "Main Org."; org_role = "Viewer"; }; }; From ece4de15477678e367ed00949664e3b6ca8820e7 Mon Sep 17 00:00:00 2001 From: mulatta <67085791+mulatta@users.noreply.github.com> Date: Sat, 4 Jul 2026 01:52:25 +0900 Subject: [PATCH 8/8] monitoring: correlate SSH jumps through the bastion ProxyJump hides the real client address from internal hosts because they only see eta as the source. Emit a bastion-side audit event that ties the authenticated client IP to each forwarded target so Loki can answer who reached which host. 
--- docs/admin/monitoring.md | 9 + hosts/psi.nix | 105 ++++- modules/monitoring/grafana-dashboards.nix | 240 ---------- modules/monitoring/grafana/dashboards.nix | 435 ++++++++++++++++++ .../{grafana.nix => grafana/default.nix} | 10 +- modules/monitoring/vector/monitor-systems.nix | 2 +- modules/sshd/default.nix | 345 ++++++++++++++ 7 files changed, 889 insertions(+), 257 deletions(-) delete mode 100644 modules/monitoring/grafana-dashboards.nix create mode 100644 modules/monitoring/grafana/dashboards.nix rename modules/monitoring/{grafana.nix => grafana/default.nix} (92%) diff --git a/docs/admin/monitoring.md b/docs/admin/monitoring.md index f9ce97c..3805bd7 100644 --- a/docs/admin/monitoring.md +++ b/docs/admin/monitoring.md @@ -41,9 +41,18 @@ flowchart LR | 수집 대상 | 전송처 | 주기 | |---------|--------|------| | sshd 로그 | Loki (rho:3100) | 실시간 | +| SSH bastion forward 매핑 | Loki (rho:3100) | 실시간 | | auditd 로그 | Loki (rho:3100) | 실시간 | | 호스트 메트릭 | Prometheus (rho:9090) | 60초 | +eta는 SSH 인증 로그와 같은 PID의 outbound socket을 관찰해 `ssh_bastion` 로그를 생성하고 Loki로 직접 전송합니다. ProxyJump 때문에 대상 호스트가 eta의 내부 IP만 보더라도 실제 접속원 IP, bastion 사용자, 대상 호스트를 함께 조회할 수 있습니다. + +```logql +{log_type="ssh_bastion", event="bastion_forward"} +``` + +대상 호스트의 SSH 로그와 맞출 때는 `target_host`, `bastion_user`, 시간대, `bastion_local_port`를 함께 봅니다. 대상 호스트 sshd 로그의 `source_port`가 eta에서 기록한 `bastion_local_port`입니다. + ### Prometheus (rho) - 리텐션: 30일 diff --git a/hosts/psi.nix b/hosts/psi.nix index 9aa20bd..737c2db 100644 --- a/hosts/psi.nix +++ b/hosts/psi.nix @@ -1,8 +1,77 @@ { + config, lib, pkgs, ... 
}: +let + dbSyncDatabases = { + blast-nr.enable = true; + blast-nt.enable = true; + blast-swissprot.enable = true; + uniref90.enable = true; + uniref100.enable = true; + pdb.enable = true; + pdb-mmcif.enable = true; + rnacentral.enable = true; + pfam.enable = true; + rfam.enable = true; + # alphafold.enable = true; # Very large, enable when needed + }; + + monitoredSystemdUnits = [ + "borgbackup-job-psi.service" + ] + ++ map (name: "db-sync-${name}.service") (builtins.attrNames dbSyncDatabases); + + systemdStatusScript = pkgs.writeShellScript "psi-systemd-status" '' + exec ${pkgs.python3}/bin/python3 - <<'PY' + import json + import subprocess + + units = ${builtins.toJSON monitoredSystemdUnits} + properties = [ + "Description", + "LoadState", + "ActiveState", + "SubState", + "Result", + "ExecMainStatus", + "NRestarts", + ] + + for unit in units: + command = ["${pkgs.systemd}/bin/systemctl", "show", unit] + for prop in properties: + command.extend(["--property", prop]) + result = subprocess.run(command, text=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, check=False) + fields = {} + for line in result.stdout.splitlines(): + if "=" not in line: + continue + key, value = line.split("=", 1) + fields[key] = value + + event = { + "host": "psi", + "log_type": "systemd_status", + "unit": unit, + "description": fields.get("Description", ""), + "load_state": fields.get("LoadState", ""), + "active_state": fields.get("ActiveState", ""), + "sub_state": fields.get("SubState", ""), + "result": fields.get("Result", ""), + "last_exit_status": fields.get("ExecMainStatus", ""), + "restart_count": fields.get("NRestarts", ""), + } + event["message"] = ( + f"{unit}: {event['active_state']}/{event['sub_state']} " + f"result={event['result']} exit={event['last_exit_status']}" + ) + print(json.dumps(event, sort_keys=True)) + PY + ''; +in { imports = [ ../modules/disko/xfs-root.nix @@ -67,18 +136,30 @@ root = "/data/databases"; # Enable databases needed for research - databases = 
{ - blast-nr.enable = true; - blast-nt.enable = true; - blast-swissprot.enable = true; - uniref90.enable = true; - uniref100.enable = true; - pdb.enable = true; - pdb-mmcif.enable = true; - rnacentral.enable = true; - pfam.enable = true; - rfam.enable = true; - # alphafold.enable = true; # Very large, enable when needed + databases = dbSyncDatabases; + }; + + services.vector.settings = { + sources.psi_systemd_status_source = { + type = "exec"; + command = [ (toString systemdStatusScript) ]; + mode = "scheduled"; + scheduled.exec_interval_secs = 60; + decoding.codec = "json"; + }; + + sinks.psi_systemd_status_loki = { + type = "loki"; + inputs = [ "psi_systemd_status_source" ]; + endpoint = "http://${config.networking.sbee.hosts.rho.wg-admin}:3100"; + encoding.codec = "json"; + labels = { + host = "{{ host }}"; + log_type = "{{ log_type }}"; + unit = "{{ unit }}"; + active_state = "{{ active_state }}"; + }; + batch.timeout_secs = 10; }; }; diff --git a/modules/monitoring/grafana-dashboards.nix b/modules/monitoring/grafana-dashboards.nix deleted file mode 100644 index 7b90484..0000000 --- a/modules/monitoring/grafana-dashboards.nix +++ /dev/null @@ -1,240 +0,0 @@ -{ - pkgs, - lib, - ... 
-}: -let - dashboard = { - uid = "sjanglab-infra"; - title = "SjangLab Infrastructure"; - tags = [ - "infra" - "nixos" - ]; - timezone = "browser"; - schemaVersion = 41; - version = 1; - refresh = "30s"; - time = { - from = "now-6h"; - to = "now"; - }; - templating.list = [ ]; - annotations.list = [ ]; - panels = [ - { - id = 1; - title = "Host memory available"; - type = "timeseries"; - datasource = "Prometheus"; - gridPos = { - h = 8; - w = 8; - x = 0; - y = 0; - }; - fieldConfig.defaults = { - unit = "percent"; - min = 0; - max = 100; - }; - targets = [ - { - refId = "A"; - expr = "100 * host_memory_available_bytes / host_memory_total_bytes"; - legendFormat = "{{host}}"; - } - ]; - } - { - id = 2; - title = "CPU busy"; - type = "timeseries"; - datasource = "Prometheus"; - gridPos = { - h = 8; - w = 8; - x = 8; - y = 0; - }; - fieldConfig.defaults = { - unit = "percent"; - min = 0; - max = 100; - }; - targets = [ - { - refId = "A"; - expr = ''(1 - avg by (host) (rate(host_cpu_seconds_total{mode="idle"}[5m]))) * 100''; - legendFormat = "{{host}}"; - } - ]; - } - { - id = 3; - title = "Root filesystem free"; - type = "timeseries"; - datasource = "Prometheus"; - gridPos = { - h = 8; - w = 8; - x = 16; - y = 0; - }; - fieldConfig.defaults = { - unit = "percent"; - min = 0; - max = 100; - }; - targets = [ - { - refId = "A"; - expr = ''100 * host_filesystem_free_bytes{mountpoint="/"} / host_filesystem_total_bytes{mountpoint="/"}''; - legendFormat = "{{host}}"; - } - ]; - } - { - id = 4; - title = "Gatus endpoint success"; - type = "stat"; - datasource = "Prometheus"; - gridPos = { - h = 8; - w = 12; - x = 0; - y = 8; - }; - fieldConfig.defaults = { - unit = "bool_on_off"; - mappings = [ - { - type = "value"; - options = { - "0" = { - text = "DOWN"; - color = "red"; - }; - "1" = { - text = "UP"; - color = "green"; - }; - }; - } - ]; - thresholds = { - mode = "absolute"; - steps = [ - { - color = "red"; - value = null; - } - { - color = "green"; - value = 1; - } - ]; - 
}; - }; - options = { - reduceOptions = { - calcs = [ "lastNotNull" ]; - fields = ""; - values = false; - }; - orientation = "auto"; - textMode = "auto"; - }; - targets = [ - { - refId = "A"; - expr = "gatus_results_endpoint_success"; - legendFormat = "{{group}}/{{name}}"; - } - ]; - } - { - id = 5; - title = "Prometheus target health"; - type = "stat"; - datasource = "Prometheus"; - gridPos = { - h = 8; - w = 12; - x = 12; - y = 8; - }; - fieldConfig.defaults = { - unit = "bool_on_off"; - mappings = [ - { - type = "value"; - options = { - "0" = { - text = "DOWN"; - color = "red"; - }; - "1" = { - text = "UP"; - color = "green"; - }; - }; - } - ]; - }; - targets = [ - { - refId = "A"; - expr = "up"; - legendFormat = "{{job}} {{instance}}"; - } - ]; - } - { - id = 6; - title = "Recent SSH events"; - type = "logs"; - datasource = "Loki"; - gridPos = { - h = 10; - w = 24; - x = 0; - y = 16; - }; - options = { - showTime = true; - showLabels = false; - wrapLogMessage = true; - enableLogDetails = true; - sortOrder = "Descending"; - }; - targets = [ - { - refId = "A"; - expr = ''{log_type="ssh"}''; - } - ]; - } - ]; - }; - - dashboardsDir = pkgs.runCommand "grafana-dashboards" { } '' - mkdir -p $out - ${lib.getExe pkgs.jq} . ${pkgs.writeText "sjanglab-infra-dashboard.json" (builtins.toJSON dashboard)} > $out/sjanglab-infra.json - ''; -in -{ - services.grafana.provision.dashboards.settings = { - apiVersion = 1; - providers = [ - { - name = "infra"; - type = "file"; - disableDeletion = true; - updateIntervalSeconds = 30; - allowUiUpdates = false; - options.path = dashboardsDir; - } - ]; - }; -} diff --git a/modules/monitoring/grafana/dashboards.nix b/modules/monitoring/grafana/dashboards.nix new file mode 100644 index 0000000..08c21f1 --- /dev/null +++ b/modules/monitoring/grafana/dashboards.nix @@ -0,0 +1,435 @@ +{ + pkgs, + lib, + ... 
+}: +let + prometheusDatasource = { + type = "prometheus"; + uid = "PBFA97CFB590B2093"; + }; + + lokiDatasource = { + type = "loki"; + uid = "P8E80F9AEF21F6940"; + }; + + dashboard = { + uid = "sjanglab-infra"; + title = "SjangLab Infrastructure"; + tags = [ + "infra" + "nixos" + ]; + timezone = "browser"; + schemaVersion = 41; + version = 1; + refresh = "30s"; + time = { + from = "now-6h"; + to = "now"; + }; + templating.list = [ ]; + annotations.list = [ ]; + panels = [ + { + id = 1; + title = "Host memory available"; + type = "timeseries"; + datasource = prometheusDatasource; + gridPos = { + h = 8; + w = 8; + x = 0; + y = 0; + }; + fieldConfig.defaults = { + unit = "percent"; + min = 0; + max = 100; + }; + targets = [ + { + refId = "A"; + datasource = prometheusDatasource; + expr = "100 * host_memory_available_bytes / host_memory_total_bytes"; + legendFormat = "{{host}}"; + } + ]; + } + { + id = 2; + title = "CPU busy"; + type = "timeseries"; + datasource = prometheusDatasource; + gridPos = { + h = 8; + w = 8; + x = 8; + y = 0; + }; + fieldConfig.defaults = { + unit = "percent"; + min = 0; + max = 100; + }; + targets = [ + { + refId = "A"; + datasource = prometheusDatasource; + expr = ''(1 - avg by (host) (rate(host_cpu_seconds_total{mode="idle"}[5m]))) * 100''; + legendFormat = "{{host}}"; + } + ]; + } + { + id = 3; + title = "Root filesystem free"; + type = "timeseries"; + datasource = prometheusDatasource; + gridPos = { + h = 8; + w = 8; + x = 16; + y = 0; + }; + fieldConfig.defaults = { + unit = "percent"; + min = 0; + max = 100; + }; + targets = [ + { + refId = "A"; + datasource = prometheusDatasource; + expr = ''100 * host_filesystem_free_bytes{mountpoint="/"} / host_filesystem_total_bytes{mountpoint="/"}''; + legendFormat = "{{host}}"; + } + ]; + } + { + id = 4; + title = "Gatus endpoints"; + type = "table"; + datasource = prometheusDatasource; + gridPos = { + h = 8; + w = 12; + x = 0; + y = 8; + }; + options = { + showHeader = true; + cellHeight = "sm"; 
+ footer.show = false; + sortBy = [ + { + displayName = "group"; + desc = false; + } + ]; + }; + transformations = [ + { id = "labelsToFields"; } + { + id = "organize"; + options = { + excludeByName = { + Time = true; + __name__ = true; + key = true; + type = true; + }; + indexByName = { + group = 0; + name = 1; + Value = 2; + gatus_results_endpoint_success = 2; + }; + renameByName = { + Value = "status"; + gatus_results_endpoint_success = "status"; + }; + }; + } + ]; + fieldConfig.defaults = { + mappings = [ + { + type = "value"; + options = { + "0" = { + text = "DOWN"; + color = "red"; + }; + "1" = { + text = "UP"; + color = "green"; + }; + }; + } + ]; + }; + targets = [ + { + refId = "A"; + datasource = prometheusDatasource; + expr = "last_over_time(gatus_results_endpoint_success[5m])"; + instant = true; + format = "table"; + } + ]; + } + { + id = 5; + title = "Prometheus targets"; + type = "table"; + datasource = prometheusDatasource; + gridPos = { + h = 8; + w = 12; + x = 12; + y = 8; + }; + options = { + showHeader = true; + cellHeight = "sm"; + footer.show = false; + }; + transformations = [ + { id = "labelsToFields"; } + { + id = "organize"; + options = { + excludeByName = { + Time = true; + __name__ = true; + }; + indexByName = { + job = 0; + instance = 1; + Value = 2; + up = 2; + }; + renameByName = { + Value = "status"; + up = "status"; + }; + }; + } + ]; + fieldConfig.defaults = { + mappings = [ + { + type = "value"; + options = { + "0" = { + text = "DOWN"; + color = "red"; + }; + "1" = { + text = "UP"; + color = "green"; + }; + }; + } + ]; + }; + targets = [ + { + refId = "A"; + datasource = prometheusDatasource; + expr = "up"; + instant = true; + format = "table"; + } + ]; + } + { + id = 6; + title = "Recent SSH events"; + type = "logs"; + datasource = lokiDatasource; + gridPos = { + h = 10; + w = 24; + x = 0; + y = 16; + }; + options = { + showTime = true; + showLabels = false; + wrapLogMessage = true; + enableLogDetails = true; + sortOrder = 
"Descending"; + }; + targets = [ + { + refId = "A"; + datasource = lokiDatasource; + expr = ''{log_type="ssh"} | json | line_format "{{.message}}"''; + } + ]; + } + { + id = 7; + title = "SSH bastion forwards"; + type = "table"; + datasource = lokiDatasource; + gridPos = { + h = 8; + w = 24; + x = 0; + y = 26; + }; + options = { + showHeader = true; + cellHeight = "sm"; + footer.show = false; + sortBy = [ + { + displayName = "Time"; + desc = true; + } + ]; + }; + transformations = [ + { + id = "extractFields"; + options = { + source = "Line"; + format = "json"; + replace = true; + keepTime = true; + }; + } + { + id = "organize"; + options = { + excludeByName = { + auth_method = true; + bastion_child_pid = true; + bastion_local_ip = true; + bastion_pid = true; + event = true; + host = true; + key_fingerprint = true; + key_type = true; + log_type = true; + target_port = true; + }; + indexByName = { + Time = 0; + source_ip = 1; + source_port = 2; + bastion_user = 3; + target_host = 4; + target_ip = 5; + bastion_local_port = 6; + message = 7; + }; + renameByName = { + source_ip = "source IP"; + source_port = "source port"; + bastion_user = "user"; + target_host = "target"; + target_ip = "target IP"; + bastion_local_port = "eta port"; + message = "summary"; + }; + }; + } + ]; + targets = [ + { + refId = "A"; + datasource = lokiDatasource; + expr = ''{log_type="ssh_bastion", event="bastion_forward"}''; + } + ]; + } + { + id = 8; + title = "psi systemd status"; + type = "table"; + datasource = lokiDatasource; + gridPos = { + h = 8; + w = 24; + x = 0; + y = 34; + }; + options = { + showHeader = true; + cellHeight = "sm"; + footer.show = false; + }; + transformations = [ + { + id = "extractFields"; + options = { + source = "Line"; + format = "json"; + replace = true; + keepTime = true; + }; + } + { + id = "organize"; + options = { + excludeByName = { + host = true; + log_type = true; + message = true; + }; + indexByName = { + Time = 0; + unit = 1; + active_state = 2; + 
sub_state = 3; + result = 4; + last_exit_status = 5; + description = 6; + }; + renameByName = { + active_state = "active"; + sub_state = "sub"; + last_exit_status = "exit"; + }; + }; + } + ]; + targets = [ + { + refId = "A"; + datasource = lokiDatasource; + expr = ''{host="psi", log_type="systemd_status"}''; + } + ]; + } + ]; + }; + + dashboardsDir = pkgs.runCommand "grafana-dashboards" { } '' + mkdir -p $out + ${lib.getExe pkgs.jq} . ${pkgs.writeText "sjanglab-infra-dashboard.json" (builtins.toJSON dashboard)} > $out/sjanglab-infra.json + ''; +in +{ + services.grafana.settings.dashboards.default_home_dashboard_path = + "${dashboardsDir}/sjanglab-infra.json"; + + services.grafana.provision.dashboards.settings = { + apiVersion = 1; + providers = [ + { + name = "infra"; + type = "file"; + disableDeletion = true; + updateIntervalSeconds = 30; + allowUiUpdates = false; + options.path = dashboardsDir; + } + ]; + }; +} diff --git a/modules/monitoring/grafana.nix b/modules/monitoring/grafana/default.nix similarity index 92% rename from modules/monitoring/grafana.nix rename to modules/monitoring/grafana/default.nix index 298e441..b47d253 100644 --- a/modules/monitoring/grafana.nix +++ b/modules/monitoring/grafana/default.nix @@ -11,8 +11,8 @@ let in { imports = [ - ./grafana-dashboards.nix - ../gatus/check.nix + ./dashboards.nix + ../../gatus/check.nix ]; gatusCheck.push = [ @@ -68,6 +68,7 @@ in datasources.settings.datasources = [ { name = "Prometheus"; + uid = "PBFA97CFB590B2093"; type = "prometheus"; url = prometheusUrl; isDefault = true; @@ -75,6 +76,7 @@ in } { name = "Loki"; + uid = "P8E80F9AEF21F6940"; type = "loki"; url = lokiUrl; editable = false; @@ -84,13 +86,13 @@ in }; sops.secrets.grafana-admin-password = { - sopsFile = ./secrets.yaml; + sopsFile = ../secrets.yaml; owner = "grafana"; group = "grafana"; }; sops.secrets.grafana-secret-key = { - sopsFile = ./secrets.yaml; + sopsFile = ../secrets.yaml; owner = "grafana"; group = "grafana"; }; diff --git 
a/modules/monitoring/vector/monitor-systems.nix b/modules/monitoring/vector/monitor-systems.nix
index 43223ef..959c064 100644
--- a/modules/monitoring/vector/monitor-systems.nix
+++ b/modules/monitoring/vector/monitor-systems.nix
@@ -10,7 +10,7 @@ in
   imports = [
     ./default.nix
     ../loki.nix
-    ../grafana.nix
+    ../grafana
     ../prometheus
     ../../gatus/check.nix
   ];
diff --git a/modules/sshd/default.nix b/modules/sshd/default.nix
index 688b8ee..444865d 100644
--- a/modules/sshd/default.nix
+++ b/modules/sshd/default.nix
@@ -1,6 +1,7 @@
 {
   lib,
   config,
+  pkgs,
   ...
 }:
 let
@@ -35,6 +36,363 @@ let
     timeWindow = 60;
     maxAttempts = 5;
   };
+
+  bastionTargets = lib.mapAttrs (_name: host: host.wg-admin) (
+    lib.filterAttrs (_name: host: host ? wg-admin && host.wg-admin != null) config.networking.sbee.hosts
+  );
+
+  bastionTargetsFile = pkgs.writeText "ssh-bastion-targets.json" (builtins.toJSON bastionTargets);
+  lokiUrl = "http://${config.networking.sbee.hosts.rho.wg-admin}:3100";
+
+  # pam_exec session hook: on open it records which login owns the sshd-session
+  # process, on close it removes that record again.
+  pamBastionSessionScript = pkgs.writeShellScript "ssh-bastion-pam-session" ''
+    exec ${pkgs.python3}/bin/python3 - <<'PY'
+    import json
+    import os
+    import time
+
+    def parent(pid: int) -> int:
+        """Return the parent PID read from /proc, or 0 when unavailable."""
+        try:
+            with open(f"/proc/{pid}/status", "r", encoding="utf-8") as f:
+                for line in f:
+                    if line.startswith("PPid:"):
+                        return int(line.split()[1])
+        except OSError:
+            return 0
+        return 0
+
+    def comm(pid: int) -> str:
+        """Return the process name from /proc/PID/comm, or an empty string."""
+        try:
+            with open(f"/proc/{pid}/comm", "r", encoding="utf-8") as f:
+                return f.read().strip()
+        except OSError:
+            return ""
+
+    # Walk up from the pam_exec child to the nearest sshd-session ancestor.
+    pid = os.getppid()
+    sshd_pid = 0
+    while pid > 1:
+        if comm(pid) == "sshd-session":
+            sshd_pid = pid
+            break
+        pid = parent(pid)
+
+    if sshd_pid == 0:
+        raise SystemExit(0)
+
+    session_dir = "/run/ssh-bastion-audit/sessions"
+    path = os.path.join(session_dir, f"{sshd_pid}.json")
+
+    # pam_exec also runs session hooks for close_session. Remove the record then
+    # instead of refreshing it, so a recycled PID cannot inherit this login.
+    if os.environ.get("PAM_TYPE") == "close_session":
+        try:
+            os.unlink(path)
+        except OSError:
+            pass
+        raise SystemExit(0)
+
+    os.makedirs(session_dir, mode=0o700, exist_ok=True)
+    event = {
+        "seen_at": time.time(),
+        "bastion_pid": str(sshd_pid),
+        "bastion_user": os.environ.get("PAM_USER", ""),
+        "source_ip": os.environ.get("PAM_RHOST", ""),
+        "source_port": "",
+        "auth_method": "pam_session",
+        "key_type": "",
+        "key_fingerprint": "",
+    }
+    # Write to a temp file and rename so the daemon never reads a partial record.
+    tmp = f"{path}.tmp"
+    with open(tmp, "w", encoding="utf-8") as f:
+        json.dump(event, f, sort_keys=True)
+    os.replace(tmp, path)
+    PY
+  '';
+
+  # Audit daemon: once per second it matches sockets held by sshd-session
+  # processes against /proc/net/tcp and reports connections to port 10022 on
+  # managed wg-admin hosts, both on stdout and to Loki.
+  bastionAuditScript = pkgs.writeShellScript "ssh-bastion-audit" ''
+    exec ${pkgs.python3}/bin/python3 ${pkgs.writeText "ssh-bastion-audit.py" ''
+      import json
+      import os
+      import re
+      import socket
+      import sys
+      import time
+      import urllib.request
+
+      # argv[1]: JSON file mapping host name to wg-admin IP; argv[2]: Loki base URL.
+      with open(sys.argv[1], "r", encoding="utf-8") as f:
+          target_map = json.load(f)
+      loki_url = sys.argv[2].rstrip("/")
+
+      target_by_ip = {ip: host for host, ip in target_map.items()}
+      session_dir = "/run/ssh-bastion-audit/sessions"
+      auth_by_pid = {}  # sshd-session PID -> record written by the PAM hook
+      emitted = {}  # dedup key -> time the forward was first reported
+      max_age = 24 * 60 * 60
+      tcp_established = "01"  # "st" column value for ESTABLISHED in /proc/net/tcp
+
+      def cleanup(now: float) -> None:
+          """Drop login records and dedup keys older than max_age seconds."""
+          for pid in [pid for pid, info in auth_by_pid.items() if now - info["seen_at"] > max_age]:
+              del auth_by_pid[pid]
+          for key in [key for key, seen_at in emitted.items() if now - seen_at > max_age]:
+              del emitted[key]
+
+      def emit(event: dict[str, object]) -> None:
+          """Print the event as one JSON line and push it to Loki (best effort)."""
+          line = json.dumps(event, sort_keys=True)
+          print(line, flush=True)
+
+          labels = {
+              "host": "eta",
+              "log_type": "ssh_bastion",
+              "event": "bastion_forward",
+              "target_host": str(event.get("target_host", "unknown")),
+              "bastion_user": str(event.get("bastion_user", "unknown")),
+          }
+          payload = {
+              "streams": [
+                  {
+                      "stream": labels,
+                      "values": [[str(time.time_ns()), line]],
+                  }
+              ]
+          }
+          request = urllib.request.Request(
+              f"{loki_url}/loki/api/v1/push",
+              data=json.dumps(payload).encode("utf-8"),
+              headers={"Content-Type": "application/json"},
+              method="POST",
+          )
+          try:
+              # Close the response explicitly; this process runs indefinitely.
+              with urllib.request.urlopen(request, timeout=5) as response:
+                  response.read()
+          except Exception as error:
+              print(f"failed to push ssh bastion audit event to Loki: {error}", file=sys.stderr, flush=True)
+
+      def decode_ipv4(hex_addr: str) -> str:
+          """Format a hex IPv4 address from /proc/net/tcp after reversing its bytes."""
+          return socket.inet_ntoa(bytes.fromhex(hex_addr)[::-1])
+
+      def tcp_table() -> dict[str, dict[str, object]]:
+          """Parse /proc/net/tcp (IPv4 only) into a map keyed by socket inode."""
+          sockets = {}
+          try:
+              with open("/proc/net/tcp", "r", encoding="utf-8") as f:
+                  lines = f.readlines()[1:]
+          except OSError:
+              return sockets
+
+          for line in lines:
+              fields = line.split()
+              if len(fields) < 10:
+                  continue
+              local_addr, local_port_hex = fields[1].split(":")
+              remote_addr, remote_port_hex = fields[2].split(":")
+              sockets[fields[9]] = {
+                  "local_ip": decode_ipv4(local_addr),
+                  "local_port": str(int(local_port_hex, 16)),
+                  "remote_ip": decode_ipv4(remote_addr),
+                  "remote_port": str(int(remote_port_hex, 16)),
+                  "state": fields[3],
+              }
+          return sockets
+
+      def process_fields(pid: str) -> dict[str, str]:
+          """Return the raw Name and PPid values from /proc/PID/status."""
+          try:
+              with open(f"/proc/{pid}/status", "r", encoding="utf-8") as f:
+                  return dict(line.split(":", 1) for line in f if line.startswith(("Name:", "PPid:")))
+          except OSError:
+              return {}
+
+      def process_args(pid: str) -> str:
+          """Return the space-joined command line of pid, or an empty string."""
+          try:
+              with open(f"/proc/{pid}/cmdline", "rb") as f:
+                  return f.read().replace(b"\0", b" ").decode("utf-8", "replace").strip()
+          except OSError:
+              return ""
+
+      def sshd_session_parents() -> dict[str, str]:
+          """Map each sshd-session PID to its parent PID using a single /proc scan."""
+          parents = {}
+          for name in os.listdir("/proc"):
+              if not name.isdigit():
+                  continue
+              fields = process_fields(name)
+              if fields.get("Name", "").strip() == "sshd-session":
+                  parents[name] = fields.get("PPid", "").strip()
+          return parents
+
+      def child_pids(root_pid: str, parents: dict[str, str]) -> list[str]:
+          """Return the sshd-session PIDs whose parent is root_pid."""
+          return [pid for pid, ppid in parents.items() if ppid == root_pid]
+
+      def parse_user_from_args(args: str) -> str:
+          """Extract the user name that follows the sshd-session process title prefix."""
+          match = re.search(r"sshd-session: ([^ @\[]+)", args)
+          if match:
+              return match.group(1)
+          return ""
+
+      def infer_auth(pid: str, table: dict[str, dict[str, object]], parents: dict[str, str]) -> dict[str, object]:
+          """Guess user and client address for a session that has no PAM record."""
+          candidates = [pid]
+          parent = parents.get(pid, "")
+          if parent:
+              candidates.append(parent)
+          candidates.extend(child_pids(pid, parents))
+
+          user = ""
+          source_ip = ""
+          source_port = ""
+          for candidate in candidates:
+              if not user:
+                  user = parse_user_from_args(process_args(candidate))
+              for inode in pid_socket_inodes(candidate):
+                  conn = table.get(inode)
+                  if conn is None:
+                      continue
+                  # Require an established connection: a listening socket on port
+                  # 10022 (e.g. the parent's) carries no client address.
+                  if conn["state"] == tcp_established and str(conn["local_port"]) == "10022":
+                      source_ip = str(conn["remote_ip"])
+                      source_port = str(conn["remote_port"])
+                      break
+              if source_ip:
+                  break
+
+          return {
+              "seen_at": time.time(),
+              "bastion_pid": pid,
+              "bastion_user": user,
+              "source_ip": source_ip,
+              "source_port": source_port,
+              "auth_method": "proc_socket",
+              "key_type": "",
+              "key_fingerprint": "",
+          }
+
+      def load_sessions(now: float) -> None:
+          """Load the PAM hook's session records into auth_by_pid, deleting expired ones."""
+          try:
+              names = os.listdir(session_dir)
+          except OSError:
+              return
+          for name in names:
+              if not name.endswith(".json"):
+                  continue
+              path = os.path.join(session_dir, name)
+              try:
+                  with open(path, "r", encoding="utf-8") as f:
+                      session = json.load(f)
+                  pid = str(session.get("bastion_pid", ""))
+                  seen_at = float(session.get("seen_at", 0))
+              except (OSError, ValueError, TypeError, AttributeError):
+                  # Unreadable or malformed record: skip it instead of crashing.
+                  continue
+              if not pid or now - seen_at > max_age:
+                  try:
+                      os.unlink(path)
+                  except OSError:
+                      pass
+                  continue
+              auth_by_pid[pid] = session
+
+      def forget_dead_sessions(parents: dict[str, str]) -> None:
+          """Drop records whose PID is no longer an sshd-session process."""
+          # Without this a recycled PID would be attributed to the previous login.
+          for pid in [pid for pid in auth_by_pid if pid not in parents]:
+              del auth_by_pid[pid]
+              try:
+                  os.unlink(os.path.join(session_dir, f"{pid}.json"))
+              except OSError:
+                  pass
+
+      def pid_socket_inodes(pid: str) -> set[str]:
+          """Return the inode numbers of all sockets open in process pid."""
+          inodes = set()
+          fd_dir = f"/proc/{pid}/fd"
+          try:
+              fds = os.listdir(fd_dir)
+          except OSError:
+              return inodes
+          for fd in fds:
+              try:
+                  target = os.readlink(os.path.join(fd_dir, fd))
+              except OSError:
+                  continue
+              if target.startswith("socket:[") and target.endswith("]"):
+                  inodes.add(target[len("socket:[") : -1])
+          return inodes
+
+      while True:
+          now = time.time()
+          load_sessions(now)
+          cleanup(now)
+          table = tcp_table()
+          # Snapshot the process tree once per tick, after the records were read.
+          parents = sshd_session_parents()
+          forget_dead_sessions(parents)
+
+          # Only sshd-session processes whose parent is not itself an sshd-session
+          # are roots; their children are scanned with them, so each forward is
+          # reported once and always under the same bastion_pid.
+          roots = sorted((pid for pid, ppid in parents.items() if ppid not in parents), key=int)
+
+          for bastion_pid in roots:
+              auth = auth_by_pid.get(bastion_pid) or infer_auth(bastion_pid, table, parents)
+              candidate_pids = [bastion_pid] + child_pids(bastion_pid, parents)
+              for pid in candidate_pids:
+                  for inode in pid_socket_inodes(pid):
+                      conn = table.get(inode)
+                      if conn is None:
+                          continue
+                      target_ip = str(conn["remote_ip"])
+                      target_port = str(conn["remote_port"])
+                      target_host = target_by_ip.get(target_ip)
+
+                      # Only record active SSH jumps into managed wg-admin hosts.
+                      if target_port != "10022" or target_host is None:
+                          continue
+
+                      key = (pid, str(conn["local_port"]), target_ip, target_port)
+                      if key in emitted:
+                          continue
+                      emitted[key] = now
+
+                      event = {
+                          "event": "bastion_forward",
+                          "log_type": "ssh_bastion",
+                          "host": "eta",
+                          "bastion_pid": bastion_pid,
+                          "bastion_child_pid": "" if pid == bastion_pid else pid,
+                          "target_host": target_host,
+                          "target_ip": target_ip,
+                          "target_port": target_port,
+                          "bastion_local_ip": conn["local_ip"],
+                          "bastion_local_port": conn["local_port"],
+                      }
+                      event.update({k: v for k, v in auth.items() if k != "seen_at"})
+                      event["message"] = (
+                          f"SSH bastion forward: {event.get('source_ip', 'unknown')} -> "
+                          f"{target_host} ({target_ip}:{target_port}) as {event.get('bastion_user', 'unknown')} "
+                          f"via eta:{conn['local_port']}"
+                      )
+                      emit(event)
+          time.sleep(1)
+    ''} ${bastionTargetsFile} ${lokiUrl}
+  '';
 in
 {
   # ========== SSH server ==========
@@ -94,6 +452,53 @@ in
     '';
   };

+  # Run the audit hook right after pam_unix in the session stack; "optional"
+  # means a failing hook does not by itself fail the session.
+  security.pam.services.sshd.rules.session.ssh-bastion-audit = lib.mkIf isBastion {
+    order = config.security.pam.services.sshd.rules.session.unix.order + 10;
+    control = "optional";
+    modulePath = "pam_exec.so";
+    args = [
+      "seteuid"
+      (toString pamBastionSessionScript)
+    ];
+  };
+
+  # Root-only runtime directories shared by the PAM hook and the audit daemon.
+  systemd.tmpfiles.rules = lib.mkIf isBastion [
+    "d /run/ssh-bastion-audit 0700 root root - -"
+    "d /run/ssh-bastion-audit/sessions 0700 root root - -"
+  ];
+
+  systemd.services.ssh-bastion-audit = lib.mkIf isBastion {
+    description = "Emit correlated SSH bastion forwarding audit events";
+    wantedBy = [ "multi-user.target" ];
+    after = [
+      "sshd.service"
+      "systemd-journald.service"
+    ];
+    serviceConfig = {
+      ExecStart = bastionAuditScript;
+      Restart = "always";
+      RestartSec = "5s";
+      User = "root";
+      DynamicUser = false;
+      PrivateTmp = true;
+      ProtectSystem = "strict";
+      # ProtectSystem=strict mounts /run read-only; the daemon must be able to
+      # delete stale session records there. "-" ignores the path if it is missing.
+      ReadWritePaths = [ "-/run/ssh-bastion-audit" ];
+      ProtectHome = true;
+      NoNewPrivileges = true;
+      RestrictAddressFamilies = [
+        "AF_UNIX"
+        "AF_INET"
+      ];
+      SystemCallFilter = [ "@system-service" ];
+      MemoryMax = "256M";
+    };
+  };
+
   # ========== SSH CA ==========
   warnings = lib.optional (
     !builtins.pathExists cert && config.networking.hostName != "nixos"