datum-cloud · scotwells · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/.golangci.yml b/.golangci.yml
@@ -8,7 +8,6 @@ linters:
     - dupl
     - errcheck
     - ginkgolinter
-    - goconst
     - gocyclo
     - govet
     - ineffassign
@@ -37,11 +36,6 @@ linters:
           - dupl
           - lll
         path: internal/*
-      # Repeated string literals in tests are usually fixture/table data;
-      # extracting them to constants hurts readability more than it helps.
-      - linters:
-          - goconst
-        path: _test\.go
       # The validation packages are built almost entirely from field.ErrorList
       # accumulators that hold a handful of errors; preallocating them adds noise
       # without meaningful benefit.

diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml
@@ -17,6 +17,13 @@ namePrefix: network-services-operator-
 components:
 - ../resource-metrics
 - ../webhook
+# Prometheus ServiceMonitor for the controller-manager metrics endpoint.
+# The controller serves metrics over HTTPS on :8443 with delegated authn/authz
+# (controller-runtime WithAuthenticationAndAuthorization). The ServiceMonitor
+# uses insecureSkipVerify because the controller auto-generates a self-signed
+# TLS cert — there is no cert-manager-issued cert for the metrics endpoint and
+# no CA bundle to reference. Prometheus still authenticates via the bearer token.
+- ../prometheus
 
 resources:
 - ../crd
@@ -26,10 +33,7 @@ resources:
 # crd/kustomization.yaml
 # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required.
 #- ../certmanager
-# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'.
-#- ../prometheus
-# [METRICS] Expose the controller manager metrics service.
-- metrics_service.yaml
+# metrics_service.yaml is now included by the ../prometheus component above.
 # [NETWORK POLICY] Protect the /metrics endpoint and Webhook Server with NetworkPolicy.
 # Only Pod(s) running a namespace labeled with 'metrics: enabled' will be able to gather the metrics.
 # Only CR(s) which requires webhooks and are applied on namespaces labeled with 'webhooks: enabled' will

diff --git a/config/extension-server/kustomization.yaml b/config/extension-server/kustomization.yaml
@@ -9,6 +9,7 @@ resources:
   - rbac
   - certmanager
   - network-policy
+  - metrics-monitor.yaml
 
 images:
   - name: ghcr.io/datum-cloud/network-services-operator

diff --git a/config/extension-server/metrics-monitor.yaml b/config/extension-server/metrics-monitor.yaml
@@ -0,0 +1,22 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    app.kubernetes.io/name: network-services-operator
+    app.kubernetes.io/component: envoy-gateway-extension-server
+    app.kubernetes.io/managed-by: kustomize
+  name: envoy-gateway-extension-server-metrics
+  namespace: system
+spec:
+  endpoints:
+    # Scrape target for the extension server's Prometheus metrics. The server
+    # exposes /metrics over plain HTTP (no TLS) on the health-addr port (:8080).
+    # Only the gRPC port uses mTLS; the health address intentionally stays
+    # plain HTTP so Kubernetes probes don't need certificates.
+    - path: /metrics
+      port: metrics  # named Service port; NOTE(review): confirm it maps to the health-addr port (:8080)
+      scheme: http
+  selector:  # NOTE(review): must match the labels on the extension server's Service; confirm
+    matchLabels:
+      app.kubernetes.io/name: network-services-operator
+      app.kubernetes.io/component: envoy-gateway-extension-server
diff --git a/config/telemetry/alerts/gateways.yaml b/config/telemetry/alerts/gateways.yaml
@@ -22,6 +22,16 @@ spec:
         summary: "Gateway {{ $labels.resource_name }} is taking longer than 60 seconds to reach Ready status"
         description: "Gateway {{ $labels.resource_name }} in namespace {{ $labels.resource_namespace }} has been in creation state for {{ $value }} seconds without reaching Ready status (Accepted=True AND Programmed=True), which exceeds the 60-second SLO threshold."
 
+    - alert: EnvoyPatchPolicyProgrammingFailed  # fires when a Programmed condition series equals 0
+      expr: |
+        envoy_gateway_envoypatchpolicy_status_condition{type="Programmed"} == 0
+      for: 5m  # the condition must hold continuously for 5 minutes before the alert fires
+      labels:
+        severity: critical
+      annotations:  # NOTE(review): templates assume name/namespace/reason labels (sibling alerts use resource_name/resource_namespace); confirm
+        summary: "EnvoyPatchPolicy {{ $labels.name }} failed to program"
+        description: "EnvoyPatchPolicy {{ $labels.name }} in namespace {{ $labels.namespace }} has been unable to apply its xDS patch for over 5 minutes (reason: {{ $labels.reason }}). Customer traffic on affected gateways may be impacted."
+
     - alert: GatewayDegradedSLOViolation
       expr: |
         (
@@ -39,3 +49,75 @@ spec:
       annotations:
         summary: "Gateway {{ $labels.resource_name }} has been degraded for over 60 seconds"
         description: "Gateway {{ $labels.resource_name }} in namespace {{ $labels.resource_namespace }} has been in a degraded state for over 60 seconds without recovering, which exceeds the 60-second SLO threshold."
+
+  # TLS certificate health alerts fire on nso_* metrics emitted directly by the
+  # NSO operator and extension server. Those metrics are available in the same
+  # Prometheus that loads this rule, alongside the envoy_gateway_* metrics above.
+  # These alerts complement the infrastructure EnvoyListenerUpdateRejected alert
+  # (which fires when Envoy rejects a bad LDS update) by covering the earlier
+  # prevention path: NSO withholds a listener or the extension server drops a
+  # broken filter chain before Envoy has a chance to reject the update.
+  - name: nso-tls-cert-health
+    interval: 30s
+    rules:
+    # Fires once NSO has withheld a Gateway listener for 5 minutes because its
+    # TLS certificate is unusable. The customer's HTTPS hostname stays dark until
+    # the cert recovers. If EnvoyListenerUpdateRejected fires without this alert,
+    # NSO's cert gating has regressed and a bad cert reached Envoy directly.
+    - alert: GatewayListenerCertUnusable
+      expr: |
+        nso_gateway_listener_cert_withheld == 1
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Gateway listener {{ $labels.namespace }}/{{ $labels.name }}/{{ $labels.listener }} has an unusable TLS certificate"
+        description: "NSO has withheld listener {{ $labels.listener }} (hostname {{ $labels.hostname }}) on Gateway {{ $labels.name }} in namespace {{ $labels.namespace }} because its TLS certificate is unusable (reason: {{ $labels.reason }}). The customer cannot serve HTTPS on this hostname. Check the cert-manager Certificate and Secret in the downstream cluster."
+        runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-tls-certificates.md#gatewaylistenercertunusable"
+
+    # Fires when a managed TLS certificate is within 7 days of expiry while it
+    # is still healthy. The `0 <` lower bound drops series whose expiry is already
+    # in the past or unset (0). cert-manager renews automatically, but renewal
+    # fails if the domain's DNS no longer points to Datum. Acting here avoids a
+    # future GatewayListenerCertUnusable alert.
+    - alert: GatewayListenerCertExpiringSoon
+      expr: |
+        0 < (nso_gateway_listener_cert_expiry_time - time()) / 86400 < 7
+      for: 1h
+      labels:
+        severity: warning
+      annotations:
+        summary: "TLS certificate for Gateway listener {{ $labels.namespace }}/{{ $labels.name }}/{{ $labels.listener }} expires in less than 7 days"
+        description: "The cert-manager Certificate for listener {{ $labels.listener }} (hostname {{ $labels.hostname }}, secret {{ $labels.secret }}) on Gateway {{ $labels.name }} in namespace {{ $labels.namespace }} expires within 7 days. cert-manager should renew it automatically, but renewal fails if the domain's DNS no longer points to Datum. Verify the Certificate is Ready=True in the downstream cluster."
+        runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-tls-certificates.md#gatewaylistenercertexpiringsoon"
+
+    # Fires when the extension server has been dropping broken certificates from
+    # the configuration it sends to the edge for 2 minutes. Pruning is expected
+    # briefly between a certificate failing and the controller withholding the
+    # listener. If this fires and GatewayListenerCertUnusable does not, the
+    # controller may have missed the listener.
+    - alert: TLSBackstopPruningChains
+      expr: |
+        nso_extension_tls_pruned_chains_active > 0
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Extension server is dropping {{ $value }} broken certificate(s) to protect the edge listener"
+        description: "The extension server is dropping {{ $value }} broken certificate(s) from the configuration it sends to the edge gateway. Check extension server logs for 'pruned invalid TLS chains' to find the affected hostnames. If GatewayListenerCertUnusable is also firing, both layers of protection are working as expected. If only this alert fires, the controller may have missed the listener."
+        runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-tls-certificates.md#tlsbackstoppruningchains"
+
+    # Fires when the extension server could not protect a listener because every
+    # certificate on it is broken. The extension server never removes a listener
+    # entirely, so the edge will reject the update for that listener, which
+    # EnvoyListenerUpdateRejected (infra) confirms. Reaching this state means the
+    # controller did not withhold the listener before it reached the edge.
+    - alert: TLSBackstopListenerAllCertsBroken
+      expr: |
+        nso_extension_tls_listeners_left_intact_active > 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "{{ $value }} edge listener(s) have every TLS certificate broken and cannot be protected"
+        description: "The extension server left {{ $value }} edge listener(s) untouched because every certificate on them is broken. It never removes a listener entirely, so the edge will reject the configuration update for those listeners (EnvoyListenerUpdateRejected confirms it). This means the controller did not withhold the listener before it reached the edge. Check extension server logs for 'listeners_left_intact' and why the controller did not withhold the listener."
+        runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-tls-certificates.md#tlsbackstoplistenerallcertsbroken"
diff --git a/docs/runbooks/gateway-tls-certificates.md b/docs/runbooks/gateway-tls-certificates.md
@@ -0,0 +1,137 @@
+# Runbook: Gateway TLS certificate alerts
+
+These alerts cover the health of the TLS certificates that gateway listeners use
+to serve HTTPS. Every HTTPS hostname on a gateway shares a single edge listener,
+so an unusable certificate is handled in two layers:
+
+1. The **controller** leaves a listener with an unusable certificate out of the
+   downstream gateway, so one bad certificate only affects its own hostname and
+   every other hostname keeps serving. The affected listener reports the problem
+   to the customer through its status conditions.
+2. The **extension server** is a backstop: if a bad certificate reaches the edge
+   anyway, it drops only the affected part of the listener rather than letting
+   the whole listener fail.
+
+A certificate is "unusable" when it has expired, is not valid yet, is missing,
+its certificate and key do not match, or it has not been issued yet.
+
+Related: issue [#212](https://github.com/datum-cloud/network-services-operator/issues/212).
+The infra-side `EnvoyListenerUpdateRejected` alert fires when the edge actually
+rejects a listener update — the alerts here are designed to fire *before* that
+happens, or to explain it when it does.
+
+## Shared diagnosis
+
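+The `GatewayListener*` alerts carry labels identifying the affected object: `namespace`, `name`
+(the gateway), `listener`, and usually `hostname`. The `TLSBackstop*` alerts report a count, so find the affected hostnames in the extension server logs.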
+
+Find the gateway and the failing listener's status:
+
+```sh
+kubectl -n <namespace> get gateway <name> -o yaml | yq '.status.listeners'
+```
+
+A gated listener reports `Programmed: False` (reason `Invalid`) and
+`ResolvedRefs: False` (reason `InvalidCertificateRef`) with a plain-language
+message naming the hostname.
+
+Inspect the backing certificate on the downstream (edge) cluster. The Certificate
+and its Secret are named `<gateway>-<listener>`:
+
+```sh
+kubectl --context <downstream> -n <downstream-ns> get certificate <gateway>-<listener> -o yaml
+kubectl --context <downstream> -n <downstream-ns> get secret <gateway>-<listener> -o yaml
+```
+
+The most common root cause is a customer pointing their domain away from Datum:
+ACME renewal then fails, the certificate goes `Ready: False`, and it eventually
+expires. That is a customer action, not a platform fault — the listener is
+correctly withheld and recovers on its own once the certificate can be issued.
+
+## GatewayListenerCertUnusable
+
+**Meaning.** The controller is withholding a listener because its certificate is
+unusable. The customer's HTTPS hostname is unavailable until the certificate
+recovers.
+
+**Impact.** Limited to the one hostname. Other hostnames on the gateway are
+unaffected — this is the isolation working as intended.
+
+**Diagnose.** Read the `reason` label and the listener status message (see Shared
+diagnosis). Check the downstream Certificate's `Ready` condition and its
+`status.notAfter`.
+
+**Remediate.** Usually no platform action is needed — confirm whether the
+customer's domain still points to Datum. If it does and issuance is genuinely
+stuck, investigate cert-manager (the issuer, ACME order, and challenge for that
+hostname). The listener returns automatically once the certificate is issued.
+
+## GatewayListenerCertExpiringSoon
+
+**Meaning.** A currently-healthy certificate expires within seven days. This is a
+warning to act before it starts gating the listener.
+
+**Impact.** None yet. It becomes `GatewayListenerCertUnusable` if the certificate
+expires before it is renewed.
+
+**Diagnose.** Check the downstream Certificate's `status.renewalTime` and whether
+recent renewal attempts are failing (cert-manager events / logs for that
+Certificate). Confirm the hostname's DNS still resolves to Datum, since ACME
+renewal depends on it.
+
+**Remediate.** If renewal is failing because DNS moved away, this will become a
+customer-driven gating event — no platform fix. If renewal is failing for a
+platform reason, fix the issuer / ACME path so cert-manager can renew.
+
+## TLSBackstopPruningChains
+
+**Meaning.** The extension server is actively dropping broken certificates from
+the configuration it sends to the edge. This is expected for a short window
+between a certificate failing and the controller withholding the listener.
+
+**Impact.** None on its own — the backstop is protecting the listener. The
+affected hostname is the one whose certificate is broken.
+
+**Diagnose.** Check extension server logs for `pruned invalid TLS chains` to find
+the affected hostnames:
+
+```sh
+kubectl -n <ext-server-ns> logs -l <ext-server-selector> | grep 'pruned invalid TLS chains'
+```
+
+If `GatewayListenerCertUnusable` is also firing for the same hostname, both
+layers are working as expected and no action is needed. If **only** this alert
+fires, the controller did not withhold the listener — see the next alert and
+check why (start with the listener's status conditions and the controller logs).
+
+**Remediate.** Generally none. If it persists without a matching
+`GatewayListenerCertUnusable`, treat it as a controller gap and investigate the
+gateway reconcile for that listener.
+
+## TLSBackstopListenerAllCertsBroken
+
+**Meaning (critical).** Every certificate on an edge listener is broken. The
+backstop never removes a listener entirely, so the edge will reject the
+configuration update for that listener and its config will freeze on its last
+good state.
+
+**Impact.** The listener stops accepting configuration changes. Because the edge
+listener is shared, this can affect every hostname on it — this is the
+fleet-impacting failure the two-layer design exists to prevent, so reaching it
+means the controller-side protection did not catch the listener.
+
+**Diagnose.**
+
+```sh
+kubectl -n <ext-server-ns> logs -l <ext-server-selector> | grep 'listeners_left_intact'
+```
+
+Cross-check the infra `EnvoyListenerUpdateRejected` alert, which confirms the
+edge is rejecting the update. Identify every certificate on the affected listener
+and why each is broken (expired, not yet valid, or mismatched), then determine
+why the controller did not withhold the listener before it reached the edge.
+
+**Remediate.** Restore or remove the broken certificates so the listener has at
+least one usable certificate, which lets the edge accept the update again. Then
+follow up on the controller gap that allowed an all-broken listener to be
+programmed.