From 267e759c5a2e9b3ed43a9c19008f7acdb8a1ada6 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Wed, 24 Jun 2026 10:38:41 -0500 Subject: [PATCH] feat(telemetry): alert on gateway controller reconcile-error ratio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two new PrometheusRule alerts and a matching runbook for the case where the gateway controller is stuck retrying rejected API server writes. Prod context: since v0.23.4 shipped PR #217 (cert-listener withholding), controller_runtime_reconcile_total{controller="gateway",result="error"} has been ~1963 vs result="success" ~395 — an 83% failure rate with no alert firing. The regression: withholding all listeners on a gateway produces a downstream Gateway with zero listeners, which the Gateway-API CRD rejects as "Required value". The controller hot-loops silently. Alerts added to config/telemetry/alerts/gateways.yaml: - GatewayControllerReconcileErrorRatioHigh (warning, >20% for 15m) - GatewayControllerReconcileErrorRatioCritical (critical, >50% for 10m) Expression uses sum without(result) to aggregate across result label dimensions so the ratio is computed correctly and the alert carries only the controller label (not result="error"). Also adds promtool unit tests (test/prometheus-rules/gateways/) and a runbook (docs/runbooks/gateway-controller-health.md) covering meaning, impact, diagnosis, and remediation for both tiers. Closes #212 (alerting gap identified in that issue). 
--- config/telemetry/alerts/gateways.yaml | 45 +++++++++ docs/runbooks/gateway-controller-health.md | 97 +++++++++++++++++++ .../gateways/controller-health-rules.yaml | 41 ++++++++ .../gateways/controller-health-tests.yaml | 97 +++++++++++++++++++ 4 files changed, 280 insertions(+) create mode 100644 docs/runbooks/gateway-controller-health.md create mode 100644 test/prometheus-rules/gateways/controller-health-rules.yaml create mode 100644 test/prometheus-rules/gateways/controller-health-tests.yaml diff --git a/config/telemetry/alerts/gateways.yaml b/config/telemetry/alerts/gateways.yaml index beff834..12bb6a9 100644 --- a/config/telemetry/alerts/gateways.yaml +++ b/config/telemetry/alerts/gateways.yaml @@ -50,6 +50,51 @@ spec: summary: "Gateway {{ $labels.resource_name }} has been degraded for over 60 seconds" description: "Gateway {{ $labels.resource_name }} in namespace {{ $labels.resource_namespace }} has been in a degraded state for over 60 seconds without recovering, which exceeds the 60-second SLO threshold." + # Controller reconcile-error alerts detect when the gateway controller is + # failing to write downstream Gateway objects. A high error ratio means updates + # are being rejected by the API server (e.g. invalid objects), the controller + # is hot-looping, and the stale configuration stays programmed at the edge. + - name: nso-gateway-controller-health + interval: 30s + rules: + # Fires when more than 20% of gateway controller reconcile attempts are + # failing over a 10-minute window, sustained for 15 minutes. A common cause + # is a downstream Gateway object failing CRD validation (e.g. all listeners + # withheld leaves spec.listeners empty, which is rejected as Required value). + # While this fires, controller changes are not reaching the edge. 
+ - alert: GatewayControllerReconcileErrorRatioHigh + expr: | + ( + sum without(result) (rate(controller_runtime_reconcile_total{controller="gateway",result="error"}[10m])) + / + sum without(result) (rate(controller_runtime_reconcile_total{controller="gateway"}[10m])) + ) > 0.2 + for: 15m + labels: + severity: warning + annotations: + summary: "Gateway controller reconcile error ratio is {{ $value | humanizePercentage }} (threshold 20%)" + description: "More than 20% of gateway controller reconcile attempts are failing. The controller cannot write downstream Gateway objects — changes are not reaching the edge and the stale configuration stays programmed. Common causes: a downstream Gateway fails CRD validation (e.g. zero listeners after cert withholding empties the list), an API server admission error, or a permissions regression. Check controller logs for 'Reconciler error' entries on the 'gateway' controller." + runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-controller-health.md#gatewaycontrollerreconcileerrorratiohigh" + + # Fires at a critical threshold when more than 50% of reconciles are failing, + # sustained for 10 minutes. At this rate the controller has essentially + # stopped applying changes — treat as an active outage for gateway programming. + - alert: GatewayControllerReconcileErrorRatioCritical + expr: | + ( + sum without(result) (rate(controller_runtime_reconcile_total{controller="gateway",result="error"}[10m])) + / + sum without(result) (rate(controller_runtime_reconcile_total{controller="gateway"}[10m])) + ) > 0.5 + for: 10m + labels: + severity: critical + annotations: + summary: "Gateway controller reconcile error ratio is {{ $value | humanizePercentage }} (threshold 50%) — edge programming is stalled" + description: "More than 50% of gateway controller reconcile attempts are failing. 
The controller has effectively stopped programming downstream gateways — this is an active outage for any gateway changes (new listeners, cert updates, connector changes). Check controller logs for 'Reconciler error' entries. See GatewayControllerReconcileErrorRatioHigh for initial diagnosis." + runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-controller-health.md#gatewaycontrollerreconcileerrorratiocritical" + # TLS certificate health alerts fire on nso_* metrics emitted directly by the # NSO operator and extension server. They are available in the same Prometheus # that loads this rule, alongside the envoy_gateway_* metrics above. diff --git a/docs/runbooks/gateway-controller-health.md b/docs/runbooks/gateway-controller-health.md new file mode 100644 index 0000000..c9b7770 --- /dev/null +++ b/docs/runbooks/gateway-controller-health.md @@ -0,0 +1,97 @@ +# Runbook: Gateway controller reconcile-error alerts + +These alerts fire when the gateway controller's reconcile error rate is +persistently high. A reconcile error means the controller attempted to write or +update a downstream Gateway object and was rejected by the API server. While +errors persist, that gateway's configuration is frozen at its last successfully +written state — listener changes, certificate updates, and connector changes do +not reach the edge. + +The most common trigger is PR #217's cert-withholding feature: when every +listener on a gateway has an unusable TLS certificate, the controller withholds +all of them, producing a downstream Gateway with zero listeners. The Gateway-API +CRD rejects that as a Required value validation error, and the controller enters +a hot-loop retrying the same rejected write. + +Related: issue [#212](https://github.com/datum-cloud/network-services-operator/issues/212) +and PR [#217](https://github.com/datum-cloud/network-services-operator/pull/217). 
+ + ## Shared diagnosis + +The controller exposes reconcile outcomes via the standard controller-runtime +counter `controller_runtime_reconcile_total` with a `result` label +(`error` / `success`). + +Identify which gateways are failing by inspecting controller logs: + +```sh +kubectl -n <controller-namespace> logs -l <controller-pod-label-selector> | grep 'Reconciler error' | grep 'gateway' +``` + +Each error log line names the namespace and name of the object that failed. The +error message explains why the write was rejected. + +Check the downstream Gateway object directly to confirm the current state: + +```sh +kubectl --context <downstream-cluster-context> -n <gateway-namespace> get gateway <gateway-name> -o yaml +``` + +## GatewayControllerReconcileErrorRatioHigh + +**Meaning (warning).** More than 20% of gateway controller reconcile attempts +are returning errors, sustained for 15 minutes. The controller is struggling to +write downstream Gateway objects and some changes are not reaching the edge. + +**Impact.** Affected gateways are not receiving configuration updates. New +listeners, TLS cert rotation, and connector-status changes all depend on +successful reconciles. The edge continues running whatever configuration was +last successfully programmed. + +**Diagnose.** Find the failing gateways in controller logs (see Shared +diagnosis). The most common error messages are: + +- `spec.listeners: Required value` — all listeners were withheld (cert + withholding left none); the Gateway-API CRD requires at least one listener. + If this is the cause, `GatewayListenerCertUnusable` should also be firing for + the same gateway — check whether all listeners on that gateway have unusable + certificates. +- `Forbidden` / `Unauthorized` — permissions regression; check the controller's + RBAC. +- `conflict` / `ResourceVersion` — transient write conflicts; these resolve on + their own and should not sustain a high error rate. + +**Remediate.** Fix the root cause identified in the logs. 
For the +all-listeners-withheld case, restore at least one usable TLS certificate for the +gateway (or remove the broken Certificate references) — the listener returns +automatically once the certificate is valid. + +## GatewayControllerReconcileErrorRatioCritical + +**Meaning (critical).** More than 50% of gateway controller reconcile attempts +are failing, sustained for 10 minutes. The controller has effectively stopped +programming downstream gateways. + +**Impact.** Treat as an active outage for gateway configuration changes. No +listener updates, TLS rotations, or connector state changes are being applied +to any affected gateways. Customers may see stale routing, expired certificates +left in place, or connectors showing incorrect availability — whatever was +programmed before the errors began. + +**Diagnose.** Follow the same steps as +[GatewayControllerReconcileErrorRatioHigh](#gatewaycontrollerreconcileerrorratiohigh). +At this error rate the problem is systematic — check whether the issue affects a +single gateway (one bad object) or many (a broader regression like an RBAC +change or API server outage). + +Check API server health to rule out an upstream problem: + +```sh +kubectl get --raw /healthz +``` + +**Remediate.** Fix the root cause as for the warning tier. If the error is +a validation failure on one gateway, fixing that object's configuration will +unblock the rest. If an API server or permissions change caused a broad failure, +roll it back and verify the controller regains a healthy reconcile ratio before +resolving the alert. 
diff --git a/test/prometheus-rules/gateways/controller-health-rules.yaml b/test/prometheus-rules/gateways/controller-health-rules.yaml new file mode 100644 index 0000000..3a11464 --- /dev/null +++ b/test/prometheus-rules/gateways/controller-health-rules.yaml @@ -0,0 +1,41 @@ +groups: +- name: nso-gateway-controller-health + interval: 30s + rules: + # Fires when more than 20% of gateway controller reconcile attempts are + # failing over a 10-minute window, sustained for 15 minutes. A common cause + # is a downstream Gateway object failing CRD validation (e.g. all listeners + # withheld leaves spec.listeners empty, which is rejected as Required value). + # While this fires, controller changes are not reaching the edge. + - alert: GatewayControllerReconcileErrorRatioHigh + expr: | + ( + sum without(result) (rate(controller_runtime_reconcile_total{controller="gateway",result="error"}[10m])) + / + sum without(result) (rate(controller_runtime_reconcile_total{controller="gateway"}[10m])) + ) > 0.2 + for: 15m + labels: + severity: warning + annotations: + summary: "Gateway controller reconcile error ratio is {{ $value | humanizePercentage }} (threshold 20%)" + description: "More than 20% of gateway controller reconcile attempts are failing. The controller cannot write downstream Gateway objects — changes are not reaching the edge and the stale configuration stays programmed. Common causes: a downstream Gateway fails CRD validation (e.g. zero listeners after cert withholding empties the list), an API server admission error, or a permissions regression. Check controller logs for 'Reconciler error' entries on the 'gateway' controller." + runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-controller-health.md#gatewaycontrollerreconcileerrorratiohigh" + + # Fires at a critical threshold when more than 50% of reconciles are failing, + # sustained for 10 minutes. 
At this rate the controller has essentially + # stopped applying changes — treat as an active outage for gateway programming. + - alert: GatewayControllerReconcileErrorRatioCritical + expr: | + ( + sum without(result) (rate(controller_runtime_reconcile_total{controller="gateway",result="error"}[10m])) + / + sum without(result) (rate(controller_runtime_reconcile_total{controller="gateway"}[10m])) + ) > 0.5 + for: 10m + labels: + severity: critical + annotations: + summary: "Gateway controller reconcile error ratio is {{ $value | humanizePercentage }} (threshold 50%) — edge programming is stalled" + description: "More than 50% of gateway controller reconcile attempts are failing. The controller has effectively stopped programming downstream gateways — this is an active outage for any gateway changes (new listeners, cert updates, connector changes). Check controller logs for 'Reconciler error' entries. See GatewayControllerReconcileErrorRatioHigh for initial diagnosis." + runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-controller-health.md#gatewaycontrollerreconcileerrorratiocritical" diff --git a/test/prometheus-rules/gateways/controller-health-tests.yaml b/test/prometheus-rules/gateways/controller-health-tests.yaml new file mode 100644 index 0000000..8b9eb1f --- /dev/null +++ b/test/prometheus-rules/gateways/controller-health-tests.yaml @@ -0,0 +1,97 @@ +rule_files: + - controller-health-rules.yaml + +evaluation_interval: 1m + +tests: + # GatewayControllerReconcileErrorRatioHigh — >20% error ratio sustained >15 minutes + # Counter increments every minute: 9 errors + 1 success = 90% error rate. + # The for: 15m means the alert fires at eval_time 16m (pending from 1m through 15m, firing at 16m). 
+ - interval: 1m + input_series: + - series: 'controller_runtime_reconcile_total{controller="gateway",result="error"}' + values: '0+9x20' + - series: 'controller_runtime_reconcile_total{controller="gateway",result="success"}' + values: '0+1x20' + alert_rule_test: + - eval_time: 16m + alertname: GatewayControllerReconcileErrorRatioHigh + exp_alerts: + - exp_labels: + severity: warning + controller: gateway + exp_annotations: + summary: "Gateway controller reconcile error ratio is 90% (threshold 20%)" + description: "More than 20% of gateway controller reconcile attempts are failing. The controller cannot write downstream Gateway objects — changes are not reaching the edge and the stale configuration stays programmed. Common causes: a downstream Gateway fails CRD validation (e.g. zero listeners after cert withholding empties the list), an API server admission error, or a permissions regression. Check controller logs for 'Reconciler error' entries on the 'gateway' controller." + runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-controller-health.md#gatewaycontrollerreconcileerrorratiohigh" + + # GatewayControllerReconcileErrorRatioHigh — healthy ratio (10% errors), should NOT alert + # 1 error + 9 success = 10% error rate, below the 20% threshold + - interval: 1m + input_series: + - series: 'controller_runtime_reconcile_total{controller="gateway",result="error"}' + values: '0+1x20' + - series: 'controller_runtime_reconcile_total{controller="gateway",result="success"}' + values: '0+9x20' + alert_rule_test: + - eval_time: 20m + alertname: GatewayControllerReconcileErrorRatioHigh + exp_alerts: [] + + # GatewayControllerReconcileErrorRatioHigh — ratio above threshold but under 15m duration, should NOT alert + # 90% error rate but checked at 14m (for: 15m not yet satisfied) + - interval: 1m + input_series: + - series: 'controller_runtime_reconcile_total{controller="gateway",result="error"}' + values: '0+9x20' + - series: 
'controller_runtime_reconcile_total{controller="gateway",result="success"}' + values: '0+1x20' + alert_rule_test: + - eval_time: 14m + alertname: GatewayControllerReconcileErrorRatioHigh + exp_alerts: [] + + # GatewayControllerReconcileErrorRatioCritical — >50% error ratio sustained >10 minutes + # 8 errors + 2 success = 80% error rate. Fires at eval_time 11m. + - interval: 1m + input_series: + - series: 'controller_runtime_reconcile_total{controller="gateway",result="error"}' + values: '0+8x20' + - series: 'controller_runtime_reconcile_total{controller="gateway",result="success"}' + values: '0+2x20' + alert_rule_test: + - eval_time: 11m + alertname: GatewayControllerReconcileErrorRatioCritical + exp_alerts: + - exp_labels: + severity: critical + controller: gateway + exp_annotations: + summary: "Gateway controller reconcile error ratio is 80% (threshold 50%) — edge programming is stalled" + description: "More than 50% of gateway controller reconcile attempts are failing. The controller has effectively stopped programming downstream gateways — this is an active outage for any gateway changes (new listeners, cert updates, connector changes). Check controller logs for 'Reconciler error' entries. See GatewayControllerReconcileErrorRatioHigh for initial diagnosis." 
+ runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-controller-health.md#gatewaycontrollerreconcileerrorratiocritical" + + # GatewayControllerReconcileErrorRatioCritical — 40% error rate (below 50%), should NOT alert + # 4 errors + 6 success = 40% error rate + - interval: 1m + input_series: + - series: 'controller_runtime_reconcile_total{controller="gateway",result="error"}' + values: '0+4x20' + - series: 'controller_runtime_reconcile_total{controller="gateway",result="success"}' + values: '0+6x20' + alert_rule_test: + - eval_time: 20m + alertname: GatewayControllerReconcileErrorRatioCritical + exp_alerts: [] + + # GatewayControllerReconcileErrorRatioCritical — >50% ratio but under 10m duration, should NOT alert + - interval: 1m + input_series: + - series: 'controller_runtime_reconcile_total{controller="gateway",result="error"}' + values: '0+8x20' + - series: 'controller_runtime_reconcile_total{controller="gateway",result="success"}' + values: '0+2x20' + alert_rule_test: + - eval_time: 9m + alertname: GatewayControllerReconcileErrorRatioCritical + exp_alerts: []