Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cmd/executor/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,12 @@ func (executorMetricsObserver) OnCircuitBreakerTrip(reason string) {

// stateToInt maps system states to a numeric gauge value. -1 surfaces an
// anomaly on dashboards if a new state is added without updating this mapping.
//
// SYNC SOURCE — keep in lock-step with:
// - cmd/executor/metrics.go:systemStateGauge (Help text)
// - internal/risk/state.go State* constants
// - deploy/docker/prometheus/alerts.yml AetherHalted rule
// - deploy/docker/grafana/dashboards/risk.json
func stateToInt(s risk.SystemState) int {
switch s {
case risk.StateRunning:
Expand Down
25 changes: 23 additions & 2 deletions cmd/executor/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,15 @@ var (
Help: "Per-builder submission round-trip latency in ms",
Buckets: []float64{10, 25, 50, 100, 250, 500, 1000, 2000, 5000},
}, []string{"builder"})
// SYNC SOURCE for the system_state integer encoding. Any change here
// must also update:
// - cmd/executor/main.go stateToInt()
// - internal/risk/state.go State* string constants
// - deploy/docker/prometheus/alerts.yml AetherHalted (`== 3`)
// - deploy/docker/grafana/dashboards/risk.json
systemStateGauge = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "aether_system_state",
Help: "Current system state (0=Running, 1=Degraded, 2=Paused, 3=Halted)",
Help: "Current system state (0=Running, 1=Degraded, 2=Paused, 3=Halted). See cmd/executor/main.go:stateToInt for the canonical mapping.",
})
circuitBreakerTripsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "aether_circuit_breaker_trips_total",
Expand All @@ -83,6 +89,15 @@ var (
Name: "aether_executor_shadow_bundles_total",
Help: "Bundles built+logged but not submitted (AETHER_SHADOW=1)",
})
// Counts every big.Int → float64 down-cast inside addBigIntCounter that
// loses precision. float64 represents integers exactly only up to 2^53,
// and 2^53 wei is roughly 0.009 ETH, so cumulative profit / gas spent
// values exceed that limit almost immediately. Loss is therefore expected,
// and the log line was being emitted on every bundle. Operators can
// dashboard this counter instead.
metricsPrecisionLoss = prometheus.NewCounter(prometheus.CounterOpts{
Name: "aether_metrics_precision_loss_total",
Help: "Number of big.Int → float64 down-casts in addBigIntCounter that lost precision (expected once cumulative wei counters cross 2^53).",
})
)

func init() {
Expand All @@ -101,6 +116,7 @@ func init() {
systemStateGauge,
circuitBreakerTripsTotal,
shadowBundles,
metricsPrecisionLoss,
)
}

Expand Down Expand Up @@ -187,7 +203,12 @@ func addBigIntCounter(counter prometheus.Counter, value *big.Int) {
}
f, accuracy := new(big.Float).SetInt(value).Float64()
if accuracy != big.Exact {
log.Printf("Metrics precision loss: %s truncated to %.0f", value.String(), f)
// Cumulative wei values exceed 2^53 (roughly 0.009 ETH, the limit of
// exactly representable float64 integers) very quickly, so this branch
// is expected on a healthy long-running bot. Surface it as a counter
// (dashboardable, alertable, sampleable) instead of a per-bundle log
// line that drowns out the rest of the executor output.
metricsPrecisionLoss.Inc()
}
if f == 0 {
return
Expand Down
20 changes: 14 additions & 6 deletions cmd/executor/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,8 +222,16 @@ func TestEndToEndLatency(t *testing.T) {
}

func TestRecordBuilderResult_ScrapeLabels(t *testing.T) {
recordBuilderResult("flashbots", true, 42*time.Millisecond)
recordBuilderResult("titan", false, 123*time.Millisecond)
// Use uniquely prefixed, test-only builder names so the series this test
// writes into the global Prometheus registry cannot collide with, or
// inflate, series that another test asserts on (e.g. an aggregate such as
// `sum(rate(...))` over all builders). Real builder names ("flashbots",
// "titan") are reserved for production and should not appear in tests.
const (
nameAlpha = "scrape_alpha"
nameBeta = "scrape_beta"
)
recordBuilderResult(nameAlpha, true, 42*time.Millisecond)
recordBuilderResult(nameBeta, false, 123*time.Millisecond)

server := httptest.NewServer(promhttp.Handler())
defer server.Close()
Expand All @@ -241,10 +249,10 @@ func TestRecordBuilderResult_ScrapeLabels(t *testing.T) {
payload := string(body)

required := []string{
`aether_executor_builder_submissions_total{builder="flashbots",result="success"}`,
`aether_executor_builder_submissions_total{builder="titan",result="failure"}`,
`aether_executor_builder_latency_ms_count{builder="flashbots"}`,
`aether_executor_builder_latency_ms_count{builder="titan"}`,
`aether_executor_builder_submissions_total{builder="` + nameAlpha + `",result="success"}`,
`aether_executor_builder_submissions_total{builder="` + nameBeta + `",result="failure"}`,
`aether_executor_builder_latency_ms_count{builder="` + nameAlpha + `"}`,
`aether_executor_builder_latency_ms_count{builder="` + nameBeta + `"}`,
}
for _, want := range required {
if !strings.Contains(payload, want) {
Expand Down
15 changes: 11 additions & 4 deletions cmd/executor/submitter.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,22 @@ func NewSubmitter(builders []BuilderConfig, searcherKey string) (*Submitter, err
}

metrics := make(map[string]*BuilderMetrics, len(builders))
names := make([]string, 0, len(builders))
enabledNames := make([]string, 0, len(builders))
for _, b := range builders {
metrics[b.Name] = &BuilderMetrics{}
names = append(names, b.Name)
// Only pre-register Prometheus series for enabled builders. Disabled
// builders never produce traffic, and zero-total series for them only
// confuse operators looking at the AetherBuilderDown alert. The alert
// correctly stays silent for them either way; omitting the series simply
// removes the visual ambiguity entirely.
if b.Enabled {
enabledNames = append(enabledNames, b.Name)
}
}
// Ensure both {result="success"} and {result="failure"} series exist
// for every configured builder from t=0 so the AetherBuilderDown alert
// for every enabled builder from t=0 so the AetherBuilderDown alert
// can reason about builders that have not yet produced either outcome.
PreRegisterBuilderLabels(names)
PreRegisterBuilderLabels(enabledNames)

transport := &http.Transport{
MaxIdleConnsPerHost: len(builders),
Expand Down
12 changes: 10 additions & 2 deletions deploy/docker/alertmanager.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@ receivers:
{{ end }}

inhibit_rules:
- source_matchers: [severity="critical"]
# Suppress all non-critical alerts while AetherHalted is firing. Once the
# bot is halted, every other warning/info is downstream noise — operators
# only need the halt page until manual reset clears it.
#
# The `equal: []` clause means "no label-equality required" — any firing
# AetherHalted suppresses every warning/info system-wide regardless of
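# alertname. The previous `equal: [alertname]` form was effectively a no-op:
# it required the source and target alerts to share the same alertname,
# which does not happen when every rule has its own alertname.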
- source_matchers: [alertname="AetherHalted", severity="critical"]
target_matchers: [severity=~"warning|info"]
equal: [alertname]
equal: []
21 changes: 17 additions & 4 deletions deploy/docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ services:
- aether-net

prometheus:
image: prom/prometheus:latest
# Pin to an exact known-good release to avoid silent breaking upgrades on
# `docker compose pull`. Bump in a dedicated PR after smoke-testing.
image: prom/prometheus:v2.54.1
container_name: aether-prometheus
ports:
- "9091:9090"
Expand All @@ -66,7 +68,7 @@ services:
- aether-net

alertmanager:
image: prom/alertmanager:latest
image: prom/alertmanager:v0.27.0
container_name: aether-alertmanager
ports:
- "9093:9093"
Expand All @@ -77,11 +79,22 @@ services:
- alertmanager-data:/alertmanager
# Substitute the webhook URL from the environment at startup so the
# secret never lands in the committed config file.
#
# Note: $$VAR escapes Compose-time interpolation so the literal
# `$SLACK_WEBHOOK_URL` is what reaches the container shell. Without the
# escape, Compose expands the value at parse time and the resolved URL
# ends up baked into the container ARGV, visible via `docker inspect`.
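# The sed delimiter is `#` so URLs containing `|` survive. Caveat: `&` and
# `\` are still special inside a sed replacement, and a literal `#` in the
# URL would terminate the expression early, so the URL must not contain
# any of those characters unescaped.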
entrypoint:
- /bin/sh
- -c
- |
sed "s|__SLACK_WEBHOOK_URL__|${SLACK_WEBHOOK_URL}|g" \
if [ -z "$$SLACK_WEBHOOK_URL" ]; then
echo "FATAL: SLACK_WEBHOOK_URL is unset; refusing to start alertmanager." >&2
echo " Set SLACK_WEBHOOK_URL in .env or unset the alertmanager service." >&2
exit 1
fi
sed "s#__SLACK_WEBHOOK_URL__#$$SLACK_WEBHOOK_URL#g" \
/etc/alertmanager/alertmanager.yml.tpl > /tmp/alertmanager.yml
exec /bin/alertmanager \
--config.file=/tmp/alertmanager.yml \
Expand All @@ -91,7 +104,7 @@ services:
- aether-net

grafana:
image: grafana/grafana:latest
image: grafana/grafana:10.4.7
container_name: aether-grafana
ports:
- "3000:3000"
Expand Down
5 changes: 5 additions & 0 deletions deploy/docker/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,8 @@ scrape_configs:
- job_name: "aether-rust"
static_configs:
- targets: ["aether-rust:9092"]
# Scrape Alertmanager's own metrics endpoint so AlertmanagerDown can fire
# when the alerting path is itself broken (e.g. config validation crashloop).
- job_name: "alertmanager"
static_configs:
- targets: ["alertmanager:9093"]
32 changes: 30 additions & 2 deletions deploy/docker/prometheus/alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ groups:
interval: 30s
rules:

# NOTE: `== 3` is the integer encoding for the Halted state defined in
# cmd/executor/main.go:stateToInt. Keep in sync with that mapping and
# internal/risk/state.go's State* constants — see the SYNC SOURCE
# comments at both sites.
- alert: AetherHalted
expr: aether_system_state == 3
for: 1m
Expand Down Expand Up @@ -34,7 +38,14 @@ groups:
description: "p99 end-to-end latency = {{ printf \"%.1f\" $value }}ms over last 5m (target <100ms)."

- alert: AetherNoOpportunities
expr: rate(aether_arbs_published_total[10m]) * 60 < 5
# Suppress during the first 30m (1800s) after the aether-rust process
# starts so a fresh boot or restart does not page operators while the
# 10m publish-rate window is still filling.
expr: |
(rate(aether_arbs_published_total[10m]) * 60 < 5)
unless on() (
(time() - min(process_start_time_seconds{job="aether-rust"})) < 1800
)
for: 10m
labels:
severity: warning
Expand All @@ -61,6 +72,10 @@ groups:
description: "Base fee = {{ printf \"%.1f\" $value }} gwei. Executor preflight will reject arbs until this drops."

- alert: AetherBuilderDown
# Disabled builders are not pre-registered (see NewSubmitter in
# cmd/executor/submitter.go), so they normally expose no series here. The
# second leg below additionally requires observed traffic, so an idle
# builder never fires either. Operators staring at an "idle" or disabled
# builder see no alert by design — that's expected, not a regression.
expr: |
sum by (builder) (rate(aether_executor_builder_submissions_total{result="success"}[2m])) == 0
and on (builder)
Expand All @@ -70,4 +85,17 @@ groups:
severity: critical
annotations:
summary: "Builder {{ $labels.builder }} has no successful submissions"
description: "Builder {{ $labels.builder }} received submissions but zero succeeded over the last 2m. Check builder endpoint health and auth."
description: "Builder {{ $labels.builder }} received submissions but zero succeeded over the last 2m. Check builder endpoint health and auth. Note: builders configured with Enabled=false are intentionally silent here."

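# Self-monitor of the alerting path. If Alertmanager crashloops (bad
# config, SLACK_WEBHOOK_URL missing, etc.) the rest of the alerts above
# silently never reach Slack. This rule is evaluated by Prometheus itself,
# so the gap surfaces on the Prometheus alerts page.
# NOTE(review): Prometheus still hands notifications to Alertmanager, so
# this alert cannot reach Slack while Alertmanager is down — confirm a
# separate notification path exists if paging on it is required.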
- alert: AlertmanagerDown
expr: up{job="alertmanager"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Alertmanager scrape target is down"
description: "Prometheus has been unable to scrape alertmanager:9093 for 2m. Slack delivery is offline. Check the alertmanager container logs and SLACK_WEBHOOK_URL config."
9 changes: 9 additions & 0 deletions internal/risk/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ import (
)

// SystemState represents the current system operating state.
//
// SYNC SOURCE — the integer encoding of these states is shared across:
// - cmd/executor/main.go stateToInt()
// - cmd/executor/metrics.go systemStateGauge Help text
// - deploy/docker/prometheus/alerts.yml AetherHalted (`== 3`)
// - deploy/docker/grafana/dashboards/risk.json
//
// Adding a variant here without updating those sites makes the gauge report
// -1 (anomaly value) for the new state, and any change that shifts the
// existing encodings breaks the AetherHalted alert.
type SystemState string

const (
Expand Down
Loading