diff --git a/cmd/executor/main.go b/cmd/executor/main.go index 571168e..f9b8e83 100644 --- a/cmd/executor/main.go +++ b/cmd/executor/main.go @@ -528,6 +528,12 @@ func (executorMetricsObserver) OnCircuitBreakerTrip(reason string) { // stateToInt maps system states to a numeric gauge value. -1 surfaces an // anomaly on dashboards if a new state is added without updating this mapping. +// +// SYNC SOURCE — keep in lock-step with: +// - cmd/executor/metrics.go:systemStateGauge (Help text) +// - internal/risk/state.go State* constants +// - deploy/docker/prometheus/alerts.yml AetherHalted rule +// - deploy/docker/grafana/dashboards/risk.json func stateToInt(s risk.SystemState) int { switch s { case risk.StateRunning: diff --git a/cmd/executor/metrics.go b/cmd/executor/metrics.go index 6355232..382a036 100644 --- a/cmd/executor/metrics.go +++ b/cmd/executor/metrics.go @@ -71,9 +71,15 @@ var ( Help: "Per-builder submission round-trip latency in ms", Buckets: []float64{10, 25, 50, 100, 250, 500, 1000, 2000, 5000}, }, []string{"builder"}) + // SYNC SOURCE for the system_state integer encoding. Any change here + // must also update: + // - cmd/executor/main.go stateToInt() + // - internal/risk/state.go State* string constants + // - deploy/docker/prometheus/alerts.yml AetherHalted (`== 3`) + // - deploy/docker/grafana/dashboards/risk.json systemStateGauge = prometheus.NewGauge(prometheus.GaugeOpts{ Name: "aether_system_state", - Help: "Current system state (0=Running, 1=Degraded, 2=Paused, 3=Halted)", + Help: "Current system state (0=Running, 1=Degraded, 2=Paused, 3=Halted). 
See cmd/executor/main.go:stateToInt for the canonical mapping.", }) circuitBreakerTripsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "aether_circuit_breaker_trips_total", @@ -83,6 +89,15 @@ var ( Name: "aether_executor_shadow_bundles_total", Help: "Bundles built+logged but not submitted (AETHER_SHADOW=1)", }) + // Counts every big.Int → float64 down-cast inside addBigIntCounter that + // loses precision. Cumulative profit / gas spent counters cross 2^53 wei + // (≈0.009 ETH) very early in the bot's lifetime, so loss is expected and the log + // line was being emitted on every bundle. Operators can dashboard this + // counter instead. + metricsPrecisionLoss = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "aether_metrics_precision_loss_total", + Help: "Number of big.Int → float64 down-casts in addBigIntCounter that lost precision (expected once cumulative wei counters cross 2^53).", + }) ) func init() { @@ -101,6 +116,7 @@ func init() { systemStateGauge, circuitBreakerTripsTotal, shadowBundles, + metricsPrecisionLoss, ) } @@ -187,7 +203,12 @@ func addBigIntCounter(counter prometheus.Counter, value *big.Int) { } f, accuracy := new(big.Float).SetInt(value).Float64() if accuracy != big.Exact { - log.Printf("Metrics precision loss: %s truncated to %.0f", value.String(), f) + // Cumulative wei counters cross 2^53 wei (≈0.009 ETH) very early in the + // bot's lifetime, so this branch is expected on a healthy long-running + // bot. Surface it as a counter (dashboardable, alertable, sampleable) + // instead of a per-bundle log line that drowns the rest of the + // executor output. 
+ metricsPrecisionLoss.Inc() } if f == 0 { return diff --git a/cmd/executor/metrics_test.go b/cmd/executor/metrics_test.go index 0cce7b9..c7b54b7 100644 --- a/cmd/executor/metrics_test.go +++ b/cmd/executor/metrics_test.go @@ -222,8 +222,16 @@ func TestEndToEndLatency(t *testing.T) { } func TestRecordBuilderResult_ScrapeLabels(t *testing.T) { - recordBuilderResult("flashbots", true, 42*time.Millisecond) - recordBuilderResult("titan", false, 123*time.Millisecond) + // Use a unique prefix so the global Prometheus registry does not see this + // test's series leak into any aggregate query (e.g. `sum(rate(...))`) + // that another test might assert on. Real builder names ("flashbots", + // "titan") are reserved for production and should not appear in tests. + const ( + nameAlpha = "scrape_alpha" + nameBeta = "scrape_beta" + ) + recordBuilderResult(nameAlpha, true, 42*time.Millisecond) + recordBuilderResult(nameBeta, false, 123*time.Millisecond) server := httptest.NewServer(promhttp.Handler()) defer server.Close() @@ -241,10 +249,10 @@ func TestRecordBuilderResult_ScrapeLabels(t *testing.T) { payload := string(body) required := []string{ - `aether_executor_builder_submissions_total{builder="flashbots",result="success"}`, - `aether_executor_builder_submissions_total{builder="titan",result="failure"}`, - `aether_executor_builder_latency_ms_count{builder="flashbots"}`, - `aether_executor_builder_latency_ms_count{builder="titan"}`, + `aether_executor_builder_submissions_total{builder="` + nameAlpha + `",result="success"}`, + `aether_executor_builder_submissions_total{builder="` + nameBeta + `",result="failure"}`, + `aether_executor_builder_latency_ms_count{builder="` + nameAlpha + `"}`, + `aether_executor_builder_latency_ms_count{builder="` + nameBeta + `"}`, } for _, want := range required { if !strings.Contains(payload, want) { diff --git a/cmd/executor/submitter.go b/cmd/executor/submitter.go index c62908a..1c89efa 100644 --- a/cmd/executor/submitter.go +++ 
b/cmd/executor/submitter.go @@ -80,15 +80,22 @@ func NewSubmitter(builders []BuilderConfig, searcherKey string) (*Submitter, err } metrics := make(map[string]*BuilderMetrics, len(builders)) - names := make([]string, 0, len(builders)) + enabledNames := make([]string, 0, len(builders)) for _, b := range builders { metrics[b.Name] = &BuilderMetrics{} - names = append(names, b.Name) + // Only pre-register Prometheus series for enabled builders. Disabled + // builders never produce traffic, and registering zero-total series + // for them only confuses operators looking at the AetherBuilderDown + // alert (the alert correctly stays silent, but the missing series + // removes the visual ambiguity entirely). + if b.Enabled { + enabledNames = append(enabledNames, b.Name) + } } // Ensure both {result="success"} and {result="failure"} series exist - // for every configured builder from t=0 so the AetherBuilderDown alert + // for every enabled builder from t=0 so the AetherBuilderDown alert // can reason about builders that have not yet produced either outcome. - PreRegisterBuilderLabels(names) + PreRegisterBuilderLabels(enabledNames) transport := &http.Transport{ MaxIdleConnsPerHost: len(builders), diff --git a/deploy/docker/alertmanager.yml b/deploy/docker/alertmanager.yml index cb14c73..e863b8f 100644 --- a/deploy/docker/alertmanager.yml +++ b/deploy/docker/alertmanager.yml @@ -23,6 +23,14 @@ receivers: {{ end }} inhibit_rules: - - source_matchers: [severity="critical"] + # Suppress all non-critical alerts while AetherHalted is firing. Once the + # bot is halted, every other warning/info is downstream noise — operators + # only need the halt page until manual reset clears it. + # + # The `equal: []` clause means "no label-equality required" — any firing + # AetherHalted suppresses every warning/info system-wide regardless of + # alertname. The previous `equal: [alertname]` form was a no-op because + # each alert has a unique alertname. 
+ - source_matchers: [alertname="AetherHalted", severity="critical"] target_matchers: [severity=~"warning|info"] - equal: [alertname] + equal: [] diff --git a/deploy/docker/docker-compose.yml b/deploy/docker/docker-compose.yml index cb252bc..56239f3 100644 --- a/deploy/docker/docker-compose.yml +++ b/deploy/docker/docker-compose.yml @@ -53,7 +53,9 @@ services: - aether-net prometheus: - image: prom/prometheus:latest + # Pin to a known-good release to avoid silent breaking upgrades on + # `docker compose pull`. Bump in a dedicated PR after smoke-testing. + image: prom/prometheus:v2.54.1 container_name: aether-prometheus ports: - "9091:9090" @@ -66,7 +68,7 @@ services: - aether-net alertmanager: - image: prom/alertmanager:latest + image: prom/alertmanager:v0.27.0 container_name: aether-alertmanager ports: - "9093:9093" @@ -77,11 +79,22 @@ services: - alertmanager-data:/alertmanager # Substitute the webhook URL from the environment at startup so the # secret never lands in the committed config file. + # + # Note: $$VAR escapes Compose-time interpolation so the literal + # `$SLACK_WEBHOOK_URL` is what reaches the container shell. Without the + # escape, Compose expands the value at parse time and the resolved URL + # ends up baked into the container ARGV, visible via `docker inspect`. + # The sed delimiter is `#` so URLs containing `|` survive. Caveat: `&` and `\` are still special in a sed replacement, and a literal `#` in the URL would now terminate the expression, so such values need escaping. entrypoint: - /bin/sh - -c - | - sed "s|__SLACK_WEBHOOK_URL__|${SLACK_WEBHOOK_URL}|g" \ + if [ -z "$$SLACK_WEBHOOK_URL" ]; then + echo "FATAL: SLACK_WEBHOOK_URL is unset; refusing to start alertmanager." >&2 + echo " Set SLACK_WEBHOOK_URL in .env or unset the alertmanager service." 
>&2 + exit 1 + fi + sed "s#__SLACK_WEBHOOK_URL__#$$SLACK_WEBHOOK_URL#g" \ /etc/alertmanager/alertmanager.yml.tpl > /tmp/alertmanager.yml exec /bin/alertmanager \ --config.file=/tmp/alertmanager.yml \ @@ -91,7 +104,7 @@ services: - aether-net grafana: - image: grafana/grafana:latest + image: grafana/grafana:10.4.7 container_name: aether-grafana ports: - "3000:3000" diff --git a/deploy/docker/prometheus.yml b/deploy/docker/prometheus.yml index 70f7826..a94c998 100644 --- a/deploy/docker/prometheus.yml +++ b/deploy/docker/prometheus.yml @@ -17,3 +17,8 @@ scrape_configs: - job_name: "aether-rust" static_configs: - targets: ["aether-rust:9092"] + # Self-scrape of Alertmanager so AlertmanagerDown can fire when the + # alerting path is itself broken (e.g. config validation crashloop). + - job_name: "alertmanager" + static_configs: + - targets: ["alertmanager:9093"] diff --git a/deploy/docker/prometheus/alerts.yml b/deploy/docker/prometheus/alerts.yml index 04ea81e..7295eb6 100644 --- a/deploy/docker/prometheus/alerts.yml +++ b/deploy/docker/prometheus/alerts.yml @@ -3,6 +3,10 @@ groups: interval: 30s rules: + # NOTE: `== 3` is the integer encoding for the Halted state defined in + # cmd/executor/main.go:stateToInt. Keep in sync with that mapping and + # internal/risk/state.go's State* constants — see the SYNC SOURCE + # comments at both sites. - alert: AetherHalted expr: aether_system_state == 3 for: 1m @@ -34,7 +38,14 @@ groups: description: "p99 end-to-end latency = {{ printf \"%.1f\" $value }}ms over last 5m (target <100ms)." - alert: AetherNoOpportunities - expr: rate(aether_arbs_published_total[10m]) * 60 < 5 + # Suppress during the first 30m after process start so a fresh boot or + # restart does not page operators while the warm-up window is still + # building up the publish-rate window. 
+ expr: | + (rate(aether_arbs_published_total[10m]) * 60 < 5) + unless on() ( + (time() - min(process_start_time_seconds{job="aether-rust"})) < 1800 + ) for: 10m labels: severity: warning @@ -61,6 +72,10 @@ groups: description: "Base fee = {{ printf \"%.1f\" $value }} gwei. Executor preflight will reject arbs until this drops." - alert: AetherBuilderDown + # Enabled-but-idle builders are pre-registered with zero-total {success} and {failure} + # series, so the second leg below filters them out by requiring + # observed traffic. Disabled builders are not pre-registered at all (see NewSubmitter in cmd/executor/submitter.go), so they have no series and + # raise no alert by design — that's expected, not a regression. expr: | sum by (builder) (rate(aether_executor_builder_submissions_total{result="success"}[2m])) == 0 and on (builder) @@ -70,4 +85,17 @@ groups: severity: critical annotations: summary: "Builder {{ $labels.builder }} has no successful submissions" - description: "Builder {{ $labels.builder }} received submissions but zero succeeded over the last 2m. Check builder endpoint health and auth." + description: "Builder {{ $labels.builder }} received submissions but zero succeeded over the last 2m. Check builder endpoint health and auth. Note: builders configured with Enabled=false are intentionally silent here." + + # Self-monitor of the alerting path. If Alertmanager crashloops (bad + # config, SLACK_WEBHOOK_URL missing, etc.) the rest of the alerts above + # silently never reach Slack; this rule makes the gap visible on the Prometheus + # /alerts page and in the ALERTS series. It cannot itself reach Slack while Alertmanager is down, so it must be watched from the Prometheus side. + - alert: AlertmanagerDown + expr: up{job="alertmanager"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Alertmanager scrape target is down" + description: "Prometheus has been unable to scrape alertmanager:9093 for 2m. Slack delivery is offline. Check the alertmanager container logs and SLACK_WEBHOOK_URL config." 
diff --git a/internal/risk/state.go b/internal/risk/state.go index 27b137b..f42e194 100644 --- a/internal/risk/state.go +++ b/internal/risk/state.go @@ -7,6 +7,15 @@ import ( ) // SystemState represents the current system operating state. +// +// SYNC SOURCE — the integer encoding of these states is shared across: +// - cmd/executor/main.go stateToInt() +// - cmd/executor/metrics.go systemStateGauge Help text +// - deploy/docker/prometheus/alerts.yml AetherHalted (`== 3`) +// - deploy/docker/grafana/dashboards/risk.json +// +// Adding a variant here without updating those sites makes the gauge report +// -1 (anomaly value) and breaks the AetherHalted alert. type SystemState string const (