diff --git a/.golangci.yml b/.golangci.yml index 871f9782..1678c6cb 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -8,7 +8,6 @@ linters: - dupl - errcheck - ginkgolinter - - goconst - gocyclo - govet - ineffassign @@ -37,11 +36,6 @@ linters: - dupl - lll path: internal/* - # Repeated string literals in tests are usually fixture/table data; - # extracting them to constants hurts readability more than it helps. - - linters: - - goconst - path: _test\.go # The validation packages are built almost entirely from field.ErrorList # accumulators that hold a handful of errors; preallocating them adds noise # without meaningful benefit. diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index d45fe305..86798fde 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -17,6 +17,13 @@ namePrefix: network-services-operator- components: - ../resource-metrics - ../webhook +# Prometheus ServiceMonitor for the controller-manager metrics endpoint. +# The controller serves metrics over HTTPS on :8443 with delegated authn/authz +# (controller-runtime WithAuthenticationAndAuthorization). The ServiceMonitor +# uses insecureSkipVerify because the controller auto-generates a self-signed +# TLS cert — there is no cert-manager-issued cert for the metrics endpoint and +# no CA bundle to reference. Prometheus still authenticates via the bearer token. +- ../prometheus resources: - ../crd @@ -26,10 +33,7 @@ resources: # crd/kustomization.yaml # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. #- ../certmanager -# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. -#- ../prometheus -# [METRICS] Expose the controller manager metrics service. -- metrics_service.yaml +# metrics_service.yaml is now included by the ../prometheus component above. # [NETWORK POLICY] Protect the /metrics endpoint and Webhook Server with NetworkPolicy. # Only Pod(s) running a namespace labeled with 'metrics: enabled' will be able to gather the metrics. # Only CR(s) which requires webhooks and are applied on namespaces labeled with 'webhooks: enabled' will diff --git a/config/extension-server/kustomization.yaml b/config/extension-server/kustomization.yaml index 181e191b..97c028cb 100644 --- a/config/extension-server/kustomization.yaml +++ b/config/extension-server/kustomization.yaml @@ -9,6 +9,7 @@ resources: - rbac - certmanager - network-policy + - metrics-monitor.yaml images: - name: ghcr.io/datum-cloud/network-services-operator diff --git a/config/extension-server/metrics-monitor.yaml b/config/extension-server/metrics-monitor.yaml new file mode 100644 index 00000000..ef6df309 --- /dev/null +++ b/config/extension-server/metrics-monitor.yaml @@ -0,0 +1,22 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: network-services-operator + app.kubernetes.io/component: envoy-gateway-extension-server + app.kubernetes.io/managed-by: kustomize + name: envoy-gateway-extension-server-metrics + namespace: system +spec: + endpoints: + # The extension server serves /metrics on plain HTTP (no TLS) on the + # health-addr port (:8080). Only the gRPC port uses mTLS; the health + # address intentionally stays plain HTTP so Kubernetes probes don't + # need certificates. + - path: /metrics + port: metrics + scheme: http + selector: + matchLabels: + app.kubernetes.io/name: network-services-operator + app.kubernetes.io/component: envoy-gateway-extension-server diff --git a/config/telemetry/alerts/gateways.yaml b/config/telemetry/alerts/gateways.yaml index 44e42887..beff8344 100644 --- a/config/telemetry/alerts/gateways.yaml +++ b/config/telemetry/alerts/gateways.yaml @@ -22,6 +22,16 @@ spec: summary: "Gateway {{ $labels.resource_name }} is taking longer than 60 seconds to reach Ready status" description: "Gateway {{ $labels.resource_name }} in namespace {{ $labels.resource_namespace }} has been in creation state for {{ $value }} seconds without reaching Ready status (Accepted=True AND Programmed=True), which exceeds the 60-second SLO threshold." + - alert: EnvoyPatchPolicyProgrammingFailed + expr: | + envoy_gateway_envoypatchpolicy_status_condition{type="Programmed"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "EnvoyPatchPolicy {{ $labels.name }} failed to program" + description: "EnvoyPatchPolicy {{ $labels.name }} in namespace {{ $labels.namespace }} has been unable to apply its xDS patch for over 5 minutes (reason: {{ $labels.reason }}). Customer traffic on affected gateways may be impacted." + - alert: GatewayDegradedSLOViolation expr: | ( @@ -39,3 +49,75 @@ spec: annotations: summary: "Gateway {{ $labels.resource_name }} has been degraded for over 60 seconds" description: "Gateway {{ $labels.resource_name }} in namespace {{ $labels.resource_namespace }} has been in a degraded state for over 60 seconds without recovering, which exceeds the 60-second SLO threshold." + + # TLS certificate health alerts fire on nso_* metrics emitted directly by the + # NSO operator and extension server. They are available in the same Prometheus + # that loads this rule, alongside the envoy_gateway_* metrics above. + # These complement the infrastructure EnvoyListenerUpdateRejected alert (which + # fires when Envoy rejects a bad LDS update). These alerts cover the earlier + # prevention path: NSO withholds a listener or the extension server drops a + # broken filter chain before Envoy has a chance to reject the update. + - name: nso-tls-cert-health + interval: 30s + rules: + # Fires when NSO has withheld a Gateway listener because its TLS certificate + # is unusable. The customer's HTTPS hostname is dark until the cert recovers. + # If EnvoyListenerUpdateRejected is also firing without this alert, NSO's + # cert gating has regressed and a bad cert reached Envoy directly. + - alert: GatewayListenerCertUnusable + expr: | + nso_gateway_listener_cert_withheld == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Gateway listener {{ $labels.namespace }}/{{ $labels.name }}/{{ $labels.listener }} has an unusable TLS certificate" + description: "NSO has withheld listener {{ $labels.listener }} (hostname {{ $labels.hostname }}) on Gateway {{ $labels.name }} in namespace {{ $labels.namespace }} because its TLS certificate is unusable (reason: {{ $labels.reason }}). The customer cannot serve HTTPS on this hostname. Check the cert-manager Certificate and Secret in the downstream cluster." + runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-tls-certificates.md#gatewaylistenercertunusable" + + # Fires when a managed TLS certificate is within 7 days of expiry while it + # is still healthy. cert-manager renews automatically, but renewal fails if + # the domain's DNS no longer points to Datum. Acting here avoids a future + # GatewayListenerCertUnusable alert. + - alert: GatewayListenerCertExpiringSoon + expr: | + (nso_gateway_listener_cert_expiry_time - time()) / 86400 < 7 + for: 1h + labels: + severity: warning + annotations: + summary: "TLS certificate for Gateway listener {{ $labels.namespace }}/{{ $labels.name }}/{{ $labels.listener }} expires in less than 7 days" + description: "The cert-manager Certificate for listener {{ $labels.listener }} (hostname {{ $labels.hostname }}, secret {{ $labels.secret }}) on Gateway {{ $labels.name }} in namespace {{ $labels.namespace }} expires within 7 days. cert-manager should renew it automatically, but renewal fails if the domain's DNS no longer points to Datum. Verify the Certificate is Ready=True in the downstream cluster." + runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-tls-certificates.md#gatewaylistenercertexpiringsoon" + + # Fires when the extension server is actively dropping broken certificates + # from the configuration it sends to the edge. This is expected briefly + # between a certificate failing and the controller withholding the listener. + # If only this fires and GatewayListenerCertUnusable does not, the controller + # may have missed the listener. + - alert: TLSBackstopPruningChains + expr: | + nso_extension_tls_pruned_chains_active > 0 + for: 2m + labels: + severity: warning + annotations: + summary: "Extension server is dropping {{ $value }} broken certificate(s) to protect the edge listener" + description: "The extension server is dropping {{ $value }} broken certificate(s) from the configuration it sends to the edge gateway. Check extension server logs for 'pruned invalid TLS chains' to find the affected hostnames. If GatewayListenerCertUnusable is also firing, both layers of protection are working as expected. If only this alert fires, the controller may have missed the listener." + runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-tls-certificates.md#tlsbackstoppruningchains" + + # Fires when the extension server could not protect a listener because every + # certificate on it is broken. It never removes a listener entirely, so the + # edge will reject the update for that listener — EnvoyListenerUpdateRejected + # (infra) confirms it. It means the controller did not withhold the listener + # before it reached the edge. + - alert: TLSBackstopListenerAllCertsBroken + expr: | + nso_extension_tls_listeners_left_intact_active > 0 + for: 2m + labels: + severity: critical + annotations: + summary: "{{ $value }} edge listener(s) have every TLS certificate broken and cannot be protected" + description: "The extension server left {{ $value }} edge listener(s) untouched because every certificate on them is broken. It never removes a listener entirely, so the edge will reject the configuration update for those listeners (EnvoyListenerUpdateRejected confirms it). This means the controller did not withhold the listener before it reached the edge. Check extension server logs for 'listeners_left_intact' and why the controller did not withhold the listener." + runbook_url: "https://github.com/datum-cloud/network-services-operator/blob/main/docs/runbooks/gateway-tls-certificates.md#tlsbackstoplistenerallcertsbroken" diff --git a/docs/runbooks/gateway-tls-certificates.md b/docs/runbooks/gateway-tls-certificates.md new file mode 100644 index 00000000..58f96995 --- /dev/null +++ b/docs/runbooks/gateway-tls-certificates.md @@ -0,0 +1,137 @@ +# Runbook: Gateway TLS certificate alerts + +These alerts cover the health of the TLS certificates that gateway listeners use +to serve HTTPS. Every HTTPS hostname on a gateway shares a single edge listener, +so an unusable certificate is handled in two layers: + +1. The **controller** leaves a listener with an unusable certificate out of the + downstream gateway, so one bad certificate only affects its own hostname and + every other hostname keeps serving. The affected listener reports the problem + to the customer through its status conditions. +2. The **extension server** is a backstop: if a bad certificate reaches the edge + anyway, it drops only the affected part of the listener rather than letting + the whole listener fail. + +A certificate is "unusable" when it has expired, is not valid yet, is missing, +its certificate and key do not match, or it has not been issued yet. + +Related: issue [#212](https://github.com/datum-cloud/network-services-operator/issues/212). +The infra-side `EnvoyListenerUpdateRejected` alert fires when the edge actually +rejects a listener update — the alerts here are designed to fire *before* that +happens, or to explain it when it does. + +## Shared diagnosis + +Each alert carries labels identifying the affected object: `namespace`, `name` +(the gateway), `listener`, and usually `hostname`. + +Find the gateway and the failing listener's status: + +```sh +kubectl -n get gateway -o yaml | yq '.status.listeners' +``` + +A gated listener reports `Programmed: False` (reason `Invalid`) and +`ResolvedRefs: False` (reason `InvalidCertificateRef`) with a plain-language +message naming the hostname. + +Inspect the backing certificate on the downstream (edge) cluster. The Certificate +and its Secret are named `-`: + +```sh +kubectl --context -n get certificate - -o yaml +kubectl --context -n get secret - -o yaml +``` + +The most common root cause is a customer pointing their domain away from Datum: +ACME renewal then fails, the certificate goes `Ready: False`, and it eventually +expires. That is a customer action, not a platform fault — the listener is +correctly withheld and recovers on its own once the certificate can be issued. + +## GatewayListenerCertUnusable + +**Meaning.** The controller is withholding a listener because its certificate is +unusable. The customer's HTTPS hostname is unavailable until the certificate +recovers. + +**Impact.** Limited to the one hostname. Other hostnames on the gateway are +unaffected — this is the isolation working as intended. + +**Diagnose.** Read the `reason` label and the listener status message (see Shared +diagnosis). Check the downstream Certificate's `Ready` condition and its +`status.notAfter`. + +**Remediate.** Usually no platform action is needed — confirm whether the +customer's domain still points to Datum. If it does and issuance is genuinely +stuck, investigate cert-manager (the issuer, ACME order, and challenge for that +hostname). The listener returns automatically once the certificate is issued. + +## GatewayListenerCertExpiringSoon + +**Meaning.** A currently-healthy certificate expires within seven days. This is a +warning to act before it starts gating the listener. + +**Impact.** None yet. It becomes `GatewayListenerCertUnusable` if the certificate +expires before it is renewed. + +**Diagnose.** Check the downstream Certificate's `status.renewalTime` and whether +recent renewal attempts are failing (cert-manager events / logs for that +Certificate). Confirm the hostname's DNS still resolves to Datum, since ACME +renewal depends on it. + +**Remediate.** If renewal is failing because DNS moved away, this will become a +customer-driven gating event — no platform fix. If renewal is failing for a +platform reason, fix the issuer / ACME path so cert-manager can renew. + +## TLSBackstopPruningChains + +**Meaning.** The extension server is actively dropping broken certificates from +the configuration it sends to the edge. This is expected for a short window +between a certificate failing and the controller withholding the listener. + +**Impact.** None on its own — the backstop is protecting the listener. The +affected hostname is the one whose certificate is broken. + +**Diagnose.** Check extension server logs for `pruned invalid TLS chains` to find +the affected hostnames: + +```sh +kubectl -n logs -l | grep 'pruned invalid TLS chains' +``` + +If `GatewayListenerCertUnusable` is also firing for the same hostname, both +layers are working as expected and no action is needed. If **only** this alert +fires, the controller did not withhold the listener — see the next alert and +check why (start with the listener's status conditions and the controller logs). + +**Remediate.** Generally none. If it persists without a matching +`GatewayListenerCertUnusable`, treat it as a controller gap and investigate the +gateway reconcile for that listener. + +## TLSBackstopListenerAllCertsBroken + +**Meaning (critical).** Every certificate on an edge listener is broken. The +backstop never removes a listener entirely, so the edge will reject the +configuration update for that listener and its config will freeze on its last +good state. + +**Impact.** The listener stops accepting configuration changes. Because the edge +listener is shared, this can affect every hostname on it — this is the +fleet-impacting failure the two-layer design exists to prevent, so reaching it +means the controller-side protection did not catch the listener. + +**Diagnose.** + +```sh +kubectl -n logs -l | grep 'listeners_left_intact' +``` + +Cross-check the infra `EnvoyListenerUpdateRejected` alert, which confirms the +edge is rejecting the update. Identify every certificate on the affected listener +and why each is broken (expired, not yet valid, or mismatched), then determine +why the controller did not withhold the listener before it reached the edge. + +**Remediate.** Restore or remove the broken certificates so the listener has at +least one usable certificate, which lets the edge accept the update again. Then +follow up on the controller gap that allowed an all-broken listener to be +programmed. diff --git a/internal/controller/gateway_controller.go b/internal/controller/gateway_controller.go index 218f13d2..1743f542 100644 --- a/internal/controller/gateway_controller.go +++ b/internal/controller/gateway_controller.go @@ -4,6 +4,7 @@ package controller import ( "context" + "crypto/tls" "fmt" "slices" "strings" @@ -12,6 +13,7 @@ import ( cmv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" cmmeta "github.com/cert-manager/cert-manager/pkg/apis/meta/v1" envoygatewayv1alpha1 "github.com/envoyproxy/gateway/api/v1alpha1" + "github.com/prometheus/client_golang/prometheus" corev1 "k8s.io/api/core/v1" discoveryv1 "k8s.io/api/discovery/v1" "k8s.io/apimachinery/pkg/api/equality" @@ -270,10 +272,21 @@ func (r *GatewayReconciler) ensureDownstreamGateway( return result, nil } + // Evaluate certificate health once so the listener we propagate and the + // status we report to the user can never disagree. + listenerCertHealth := r.evaluateListenerCertHealth( + ctx, + downstreamClient, + downstreamGateway.Namespace, + upstreamGateway, + claimedHostnames, + ) + desiredDownstreamGateway := r.getDesiredDownstreamGateway( ctx, upstreamGateway, claimedHostnames, + listenerCertHealth, ) if downstreamGateway.CreationTimestamp.IsZero() { @@ -364,8 +377,18 @@ func (r *GatewayReconciler) ensureDownstreamGateway( downstreamStrategy, verifiedHostnames, notClaimedHostnames, + listenerCertHealth, ) + // When a listener is only waiting on a certificate to be issued, check back + // soon so it starts serving promptly once the certificate is ready. + for _, status := range listenerCertHealth { + if !status.healthy && status.pending { + result.RequeueAfter = max(result.RequeueAfter, 1*time.Minute) + break + } + } + addresses := make([]gatewayv1.GatewayStatusAddress, 0, len(targetDomainHostnames)) for _, hostname := range targetDomainHostnames { @@ -383,10 +406,264 @@ func (r *GatewayReconciler) ensureDownstreamGateway( return httpRouteResult.Merge(result), downstreamGateway } +// listenerCertExpiryMargin keeps a certificate that is mid-renewal from briefly +// looking expired and dropping the listener. +const listenerCertExpiryMargin = 1 * time.Minute + +// listenerCertStatus describes whether a listener's TLS certificate is usable. +// A listener with no entry is never gated: its certificate is managed elsewhere +// (for example the shared platform certificate) and a customer's certificate +// failure must never disable it. +type listenerCertStatus struct { + healthy bool + // reason is the listener status reason shown when the certificate is unusable. + reason gatewayv1.ListenerConditionReason + // message is a plain-language explanation shown to the customer. + message string + // pending means the listener is only waiting on a certificate to be issued, + // so the reconcile should check back to let it recover. + pending bool + // notAfter is the certificate's expiry time, populated only when the + // certificate is healthy and its expiry is known. Used to set the expiry + // gauge so operators can alert before a cert expires. + notAfter *metav1.Time + // secretName is the downstream Secret that holds the certificate material. + // Carried here so the expiry gauge can be labelled with the secret name + // without recomputing it outside listenerCertHealth. + secretName string +} + +// clearListenerCertMetrics removes every certificate-health gauge series for a +// gateway. Used both before re-recording each reconcile and on gateway deletion +// so a removed listener never leaves a series stuck at its last value. The gating +// counter is left alone because it is cumulative. +func clearListenerCertMetrics(namespace, name string) { + labels := prometheus.Labels{jsonKeyNamespace: namespace, jsonKeyName: name} + gatewayListenerCertWithheld.DeletePartialMatch(labels) + gatewayListenerCertExpiryTime.DeletePartialMatch(labels) + gatewayListenerCertManaged.DeletePartialMatch(labels) +} + +// evaluateListenerCertHealth reports which listeners have a usable certificate. +// It only considers listeners that own a per-hostname certificate; the same +// selection the rest of the controller uses keeps what we propagate and what we +// report to the customer consistent. A listener with no certificate of its own +// is left out so it is never gated. +func (r *GatewayReconciler) evaluateListenerCertHealth( + ctx context.Context, + downstreamClient client.Client, + downstreamNamespace string, + upstreamGateway *gatewayv1.Gateway, + claimedHostnames []string, +) map[gatewayv1.SectionName]listenerCertStatus { + logger := log.FromContext(ctx) + health := make(map[gatewayv1.SectionName]listenerCertStatus) + + wildcardSuffix := "." + r.Config.Gateway.TargetDomain + hasSharedSecret := r.Config.Gateway.HasDefaultListenerTLSSecret() + now := time.Now() + + // Drop this gateway's previous certificate metrics up front and re-record the + // current state below, so series for listeners that recovered, changed + // hostname, or were removed don't linger at a stale value. + clearListenerCertMetrics(upstreamGateway.Namespace, upstreamGateway.Name) + + for _, l := range upstreamGateway.Spec.Listeners { + // Only listeners that own a per-hostname certificate are gated. + if l.TLS == nil || l.TLS.Options[certificateIssuerTLSOption] == "" || l.Hostname == nil { + continue + } + hostname := string(*l.Hostname) + if !slices.Contains(claimedHostnames, hostname) { + continue + } + // The shared platform certificate is managed by us, not the customer, + // so never gate listeners that use it. + if hasSharedSecret && (strings.HasSuffix(hostname, wildcardSuffix) || hostname == r.Config.Gateway.TargetDomain) { + continue + } + + status := r.listenerCertHealth(ctx, downstreamClient, downstreamNamespace, upstreamGateway.Name, l.Name, hostname, now) + health[l.Name] = status + + // Mark this listener as managed regardless of its health, so the + // SLI ratio (withheld / managed) can be computed fleet-wide. + gatewayListenerCertManaged.WithLabelValues( + upstreamGateway.Namespace, upstreamGateway.Name, string(l.Name), hostname, + ).Set(1) + + if !status.healthy { + logArgs := []any{ + "listener", l.Name, "hostname", hostname, + "reason", status.reason, "message", status.message, + } + // Include the expiry timestamp when available so log queries can + // identify exactly when the cert stopped being valid. + if status.notAfter != nil { + logArgs = append(logArgs, "cert_not_after", status.notAfter.UTC().Format(time.RFC3339)) + } + logger.Info("listener certificate unhealthy", logArgs...) + + gatewayListenerCertWithheld.WithLabelValues( + upstreamGateway.Namespace, upstreamGateway.Name, + string(l.Name), hostname, string(status.reason), + ).Set(1) + gatewayListenerCertGatingTotal.WithLabelValues( + upstreamGateway.Namespace, upstreamGateway.Name, + string(l.Name), hostname, string(status.reason), + ).Inc() + } else if status.notAfter != nil { + // Record the expiry timestamp for healthy certs so operators can + // alert before the next expiry rather than after. + gatewayListenerCertExpiryTime.WithLabelValues( + upstreamGateway.Namespace, upstreamGateway.Name, + string(l.Name), hostname, status.secretName, + ).Set(float64(status.notAfter.Unix())) + } + } + + return health +} + +// listenerCertHealth reports whether a single listener's certificate can be +// used to serve HTTPS right now, covering every way it can be unusable: still +// being issued, not yet valid, expired, or missing. +func (r *GatewayReconciler) listenerCertHealth( + ctx context.Context, + downstreamClient client.Client, + downstreamNamespace string, + gatewayName string, + listenerName gatewayv1.SectionName, + hostname string, + now time.Time, +) listenerCertStatus { + logger := log.FromContext(ctx) + + certName := listenerCertificateName(gatewayName, listenerName) + secretName := listenerCertificateSecretName(gatewayName, listenerName) + + // The certificate must have been issued. + var cert cmv1.Certificate + if err := downstreamClient.Get(ctx, client.ObjectKey{Namespace: downstreamNamespace, Name: certName}, &cert); err != nil { + if apierrors.IsNotFound(err) { + return listenerCertStatus{ + reason: gatewayv1.ListenerReasonInvalidCertificateRef, + message: certIssuanceFailingMessage(hostname), + pending: true, + secretName: secretName, + } + } + // On a read error, hold the listener back but allow it to recover later. + logger.Error(err, "failed to get listener Certificate", "certificate", certName) + return listenerCertStatus{ + reason: gatewayv1.ListenerReasonInvalidCertificateRef, + message: certIssuanceFailingMessage(hostname), + pending: true, + secretName: secretName, + } + } + + if !certIsReady(&cert) { + return listenerCertStatus{ + reason: gatewayv1.ListenerReasonInvalidCertificateRef, + message: certIssuanceFailingMessage(hostname), + pending: true, + secretName: secretName, + } + } + + // The certificate must be within its valid dates. + if cert.Status.NotBefore != nil && cert.Status.NotBefore.After(now) { + return listenerCertStatus{ + reason: gatewayv1.ListenerReasonInvalidCertificateRef, + message: certNotYetValidMessage(hostname), + secretName: secretName, + } + } + if cert.Status.NotAfter != nil && !cert.Status.NotAfter.After(now.Add(listenerCertExpiryMargin)) { + return listenerCertStatus{ + reason: gatewayv1.ListenerReasonInvalidCertificateRef, + message: certExpiredMessage(hostname), + notAfter: cert.Status.NotAfter, + secretName: secretName, + } + } + + // Finally, load the stored certificate and key and confirm they match and + // are still valid. This catches a broken or mismatched certificate that + // would otherwise be served and break HTTPS for the listener. + var secret corev1.Secret + if err := downstreamClient.Get(ctx, client.ObjectKey{Namespace: downstreamNamespace, Name: secretName}, &secret); err != nil { + if apierrors.IsNotFound(err) { + return listenerCertStatus{ + reason: gatewayv1.ListenerReasonInvalidCertificateRef, + message: certMissingMessage(hostname), + pending: true, + secretName: secretName, + } + } + logger.Error(err, "failed to get listener Secret", "secret", secretName) + return listenerCertStatus{ + reason: gatewayv1.ListenerReasonInvalidCertificateRef, + message: certMissingMessage(hostname), + pending: true, + secretName: secretName, + } + } + + keyPair, err := tls.X509KeyPair(secret.Data["tls.crt"], secret.Data["tls.key"]) + if err != nil { + return listenerCertStatus{ + reason: gatewayv1.ListenerReasonInvalidCertificateRef, + message: certMissingMessage(hostname), + secretName: secretName, + } + } + if leaf := keyPair.Leaf; leaf != nil && !leaf.NotAfter.After(now) { + return listenerCertStatus{ + reason: gatewayv1.ListenerReasonInvalidCertificateRef, + message: certExpiredMessage(hostname), + secretName: secretName, + } + } + + return listenerCertStatus{healthy: true, notAfter: cert.Status.NotAfter, secretName: secretName} +} + +// certIsReady reports whether a cert-manager Certificate has Ready=True. +func certIsReady(cert *cmv1.Certificate) bool { + for _, c := range cert.Status.Conditions { + if c.Type == cmv1.CertificateConditionReady { + return c.Status == cmmeta.ConditionTrue + } + } + return false +} + +// These messages are shown to customers, so they stay plain and name the +// hostname affected. + +func certExpiredMessage(hostname string) string { + return fmt.Sprintf("The TLS certificate for %s has expired, so HTTPS for this hostname is paused until a valid certificate is issued. This usually means the domain no longer points to Datum — please check its DNS.", hostname) +} + +func certNotYetValidMessage(hostname string) string { + return fmt.Sprintf("The TLS certificate for %s isn't valid yet. HTTPS will start automatically once its start date is reached.", hostname) +} + +func certIssuanceFailingMessage(hostname string) string { + return fmt.Sprintf("We couldn't issue a TLS certificate for %s, so HTTPS for this hostname is unavailable. Please confirm the domain points to Datum.", hostname) +} + +func certMissingMessage(hostname string) string { + return fmt.Sprintf("The TLS certificate for %s is missing or unreadable, so HTTPS for this hostname is unavailable.", hostname) +} + func (r *GatewayReconciler) getDesiredDownstreamGateway( ctx context.Context, upstreamGateway *gatewayv1.Gateway, claimedHostnames []string, + listenerCertHealth map[gatewayv1.SectionName]listenerCertStatus, ) *gatewayv1.Gateway { logger := log.FromContext(ctx) var downstreamGateway gatewayv1.Gateway @@ -401,6 +678,23 @@ func (r *GatewayReconciler) getDesiredDownstreamGateway( continue } + // Leave out a listener whose certificate isn't usable. A bad certificate + // then only affects its own hostname, and every other hostname on the + // gateway keeps serving. + if status, gated := listenerCertHealth[l.Name]; gated { + if !status.healthy { + logger.Info("skipping downstream gateway listener with unhealthy certificate", + "upstream_listener_index", listenerIndex, "listener", l.Name, + "reason", status.reason, "message", status.message) + continue + } + // The listener was evaluated (it owns a per-hostname cert) and is + // healthy: it is admitted to the downstream gateway. Log this so + // operators can confirm recovery without correlating metric state. + logger.Info("admitting downstream gateway listener with healthy certificate", + "upstream_listener_index", listenerIndex, "listener", l.Name) + } + // Per-listener TLS decision: hostnames covered by the wildcard // (*.targetDomain) reference the pre-provisioned shared secret; // all others reference a per-listener secret populated by a @@ -1145,10 +1439,11 @@ func (r *GatewayReconciler) finalizeGateway( logger := log.FromContext(ctx) logger.Info("finalizing gateway") - // Remove the per-gateway programmed gauge so deleted gateways do not leave - // stale label sets in the metric. This prevents sum(nso_gateway_programmed_total) - // from over-counting the fleet after gateways are removed. + // Remove per-gateway metric series so deleted gateways do not leave stale + // label sets that misrepresent fleet state. gatewayProgrammedTotal.DeleteLabelValues(upstreamGateway.Namespace, upstreamGateway.Name) + // Clear this gateway's cert-health series now that it is gone. + clearListenerCertMetrics(upstreamGateway.Namespace, upstreamGateway.Name) // Clean up DNS records created by this gateway if r.Config.Gateway.EnableDNSIntegration { @@ -1335,6 +1630,7 @@ func (r *GatewayReconciler) ensureDownstreamGatewayHTTPRoutes( downstreamStrategy downstreamclient.ResourceStrategy, verifiedHostnames []string, notClaimedHostnames []string, + listenerCertHealth map[gatewayv1.SectionName]listenerCertStatus, ) (result Result) { logger := log.FromContext(ctx) @@ -1469,9 +1765,22 @@ func (r *GatewayReconciler) ensureDownstreamGatewayHTTPRoutes( ObservedGeneration: upstreamGateway.Generation, } + resolvedRefsCondition := metav1.Condition{ + Type: string(gatewayv1.ListenerConditionResolvedRefs), + Status: metav1.ConditionTrue, + Reason: string(gatewayv1.ListenerReasonResolvedRefs), + Message: "The listener has been resolved by the Datum Gateway", + ObservedGeneration: upstreamGateway.Generation, + } + + // A hostname problem is reported instead of a certificate problem, since + // the hostname has to be sorted out first. + hostnameProblem := false + if listener.Hostname != nil { if !slices.Contains(verifiedHostnames, string(*listener.Hostname)) { + hostnameProblem = true acceptedCondition.Status = metav1.ConditionFalse acceptedCondition.Reason = networkingv1alpha.UnverifiedHostnamesPresent acceptedCondition.Message = fmt.Sprintf("The hostname %q has not been verified. Check status of Domains in the same namespace.", *listener.Hostname) @@ -1480,6 +1789,7 @@ func (r *GatewayReconciler) ensureDownstreamGatewayHTTPRoutes( programmedCondition.Reason = acceptedCondition.Reason programmedCondition.Message = acceptedCondition.Message } else if slices.Contains(notClaimedHostnames, string(*listener.Hostname)) { + hostnameProblem = true acceptedCondition.Status = metav1.ConditionFalse acceptedCondition.Reason = networkingv1alpha.HostnameInUseReason acceptedCondition.Message = fmt.Sprintf("The hostname %q is already attached to a resource.", *listener.Hostname) @@ -1490,18 +1800,24 @@ func (r *GatewayReconciler) ensureDownstreamGatewayHTTPRoutes( } } - apimeta.SetStatusCondition(&status.Conditions, acceptedCondition) + // Tell the customer when a listener's certificate is the reason HTTPS + // isn't running. The listener stays accepted because the configuration + // is fine; only the certificate needs attention. + if !hostnameProblem { + if certStatus, gated := listenerCertHealth[listener.Name]; gated && !certStatus.healthy { + resolvedRefsCondition.Status = metav1.ConditionFalse + resolvedRefsCondition.Reason = string(gatewayv1.ListenerReasonInvalidCertificateRef) + resolvedRefsCondition.Message = certStatus.message - // TODO(jreese) update this based on the downstream gateway's status - apimeta.SetStatusCondition(&status.Conditions, programmedCondition) + programmedCondition.Status = metav1.ConditionFalse + programmedCondition.Reason = string(gatewayv1.ListenerReasonInvalid) + programmedCondition.Message = certStatus.message + } + } - apimeta.SetStatusCondition(&status.Conditions, metav1.Condition{ - Type: string(gatewayv1.ListenerConditionResolvedRefs), - Status: metav1.ConditionTrue, - Reason: string(gatewayv1.ListenerReasonResolvedRefs), - Message: "The listener has been resolved by the Datum Gateway", - ObservedGeneration: upstreamGateway.Generation, - }) + apimeta.SetStatusCondition(&status.Conditions, acceptedCondition) + apimeta.SetStatusCondition(&status.Conditions, programmedCondition) + apimeta.SetStatusCondition(&status.Conditions, resolvedRefsCondition) listenerStatus = append(listenerStatus, status) } diff --git a/internal/controller/gateway_controller_test.go b/internal/controller/gateway_controller_test.go index 50c61522..41669135 100644 --- a/internal/controller/gateway_controller_test.go +++ b/internal/controller/gateway_controller_test.go @@ -2,12 +2,21 @@ package controller import ( "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" "fmt" + "math/big" "slices" "strings" "testing" + "time" cmv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" + cmmeta "github.com/cert-manager/cert-manager/pkg/apis/meta/v1" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" @@ -316,6 +325,10 @@ func TestEnsureDownstreamGatewayWildcardCert(t *testing.T) { }, } + // downstreamNamespaceName is the namespace NSO maps the upstream namespace to + // (see mappednamespace.go). Per-listener Certificates/Secrets live here. + downstreamNamespaceName := fmt.Sprintf("ns-%s", upstreamNamespace.UID) + tests := []struct { name string defaultTLSSecretName string @@ -323,8 +336,11 @@ func TestEnsureDownstreamGatewayWildcardCert(t *testing.T) { upstreamGateway *gatewayv1.Gateway existingUpstreamObjects []client.Object existingDownstreamObjects []client.Object - assert func(t *testing.T, upstreamGateway, downstreamGateway *gatewayv1.Gateway) - assertDownstream func(t *testing.T, downstreamClient client.Client, downstreamGateway *gatewayv1.Gateway) + // listenerCertStates seed downstream Certificate/Secret objects in the + // downstream namespace before reconcile, modelling per-listener cert health. + listenerCertStates []listenerCertState + assert func(t *testing.T, upstreamGateway, downstreamGateway *gatewayv1.Gateway) + assertDownstream func(t *testing.T, downstreamClient client.Client, downstreamGateway *gatewayv1.Gateway) }{ { name: "default https listener uses shared TLS secret", @@ -386,6 +402,9 @@ func TestEnsureDownstreamGatewayWildcardCert(t *testing.T) { }) }), }, + listenerCertStates: []listenerCertState{ + {gatewayName: "test-gw", listenerName: "https-hostname-0", hostname: "custom.example.com", ready: true}, + }, assert: func(t *testing.T, upstreamGateway, downstreamGateway *gatewayv1.Gateway) { httpsListener := gatewayutil.GetListenerByName( downstreamGateway.Spec.Listeners, @@ -515,6 +534,9 @@ func TestEnsureDownstreamGatewayWildcardCert(t *testing.T) { defaultTLSSecretName: "", upstreamGateway: newGateway(config.NetworkServicesOperator{Gateway: baseConfig}, upstreamNamespace.Name, "test-gw", func(g *gatewayv1.Gateway) { }), + listenerCertStates: []listenerCertState{ + {gatewayName: "test-gw", listenerName: gatewayutil.DefaultHTTPSListenerName, hostname: "test-gw.test-suite.com", ready: true}, + }, assert: func(t *testing.T, upstreamGateway, downstreamGateway *gatewayv1.Gateway) { httpsListener := gatewayutil.GetListenerByName( downstreamGateway.Spec.Listeners, @@ -621,6 +643,11 @@ func TestEnsureDownstreamGatewayWildcardCert(t *testing.T) { }, }) + for _, s := range tt.listenerCertStates { + tt.existingDownstreamObjects = append(tt.existingDownstreamObjects, + newDownstreamListenerCertObjects(t, downstreamNamespaceName, s)...) + } + for _, obj := range append(tt.existingUpstreamObjects, tt.existingDownstreamObjects...) { obj.SetUID(uuid.NewUUID()) obj.SetCreationTimestamp(metav1.Now()) @@ -680,6 +707,283 @@ func TestEnsureDownstreamGatewayWildcardCert(t *testing.T) { } } +// TestListenerCertificateHealthGating checks that a listener with an unusable +// certificate is kept out of the downstream gateway and reported to the customer, +// while a healthy listener on the same gateway is unaffected. +func TestListenerCertificateHealthGating(t *testing.T) { + testScheme := runtime.NewScheme() + require.NoError(t, scheme.AddToScheme(testScheme)) + require.NoError(t, gatewayv1.Install(testScheme)) + require.NoError(t, discoveryv1.AddToScheme(testScheme)) + require.NoError(t, networkingv1alpha.AddToScheme(testScheme)) + require.NoError(t, cmv1.AddToScheme(testScheme)) + + upstreamNamespace := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: "test", UID: uuid.NewUUID()}, + } + downstreamNamespaceName := fmt.Sprintf("ns-%s", upstreamNamespace.UID) + + testConfig := config.NetworkServicesOperator{ + Gateway: config.GatewayConfig{ + DownstreamGatewayClassName: "test-suite", + DownstreamHostnameAccountingNamespace: "default", + TargetDomain: "test-suite.com", + IPFamilies: []networkingv1alpha.IPFamily{ + networkingv1alpha.IPv4Protocol, + networkingv1alpha.IPv6Protocol, + }, + }, + } + + now := time.Now() + + // customHTTPSListener builds a per-listener-cert HTTPS listener. + customHTTPSListener := func(name gatewayv1.SectionName, hostname string) gatewayv1.Listener { + return gatewayv1.Listener{ + Name: name, + Protocol: gatewayv1.HTTPSProtocolType, + Port: DefaultHTTPSPort, + Hostname: ptr.To(gatewayv1.Hostname(hostname)), + AllowedRoutes: &gatewayv1.AllowedRoutes{ + Namespaces: &gatewayv1.RouteNamespaces{From: ptr.To(gatewayv1.NamespacesFromSame)}, + }, + TLS: &gatewayv1.ListenerTLSConfig{ + Mode: ptr.To(gatewayv1.TLSModeTerminate), + Options: map[gatewayv1.AnnotationKey]gatewayv1.AnnotationValue{ + gatewayv1.AnnotationKey(certificateIssuerTLSOption): "test-issuer", + }, + }, + } + } + + verifiedDomain := func(name string) client.Object { + return newDomain(upstreamNamespace.Name, name, func(d *networkingv1alpha.Domain) { + d.Spec.DomainName = name + apimeta.SetStatusCondition(&d.Status.Conditions, metav1.Condition{ + Type: networkingv1alpha.DomainConditionVerified, + Status: metav1.ConditionTrue, + }) + }) + } + + type listenerExpect struct { + name gatewayv1.SectionName + present bool // present in downstream Gateway.Spec.Listeners + programmed metav1.ConditionStatus + resolvedRefs metav1.ConditionStatus + resolvedRefsReason string // when False + expectMessageContains string // substring of the user-facing message (when unhealthy) + } + + tests := []struct { + name string + listeners []gatewayv1.Listener + domains []client.Object + certStates []listenerCertState + expect []listenerExpect + }{ + { + name: "ready cert keeps listener present and programmed", + listeners: []gatewayv1.Listener{customHTTPSListener("https-0", "a.example.com")}, + domains: []client.Object{verifiedDomain("a.example.com")}, + certStates: []listenerCertState{ + {gatewayName: "test-gw", listenerName: "https-0", hostname: "a.example.com", ready: true}, + }, + expect: []listenerExpect{ + {name: "https-0", present: true, programmed: metav1.ConditionTrue, resolvedRefs: metav1.ConditionTrue}, + }, + }, + { + name: "expired cert omits listener and reports expired message", + listeners: []gatewayv1.Listener{customHTTPSListener("https-0", "a.example.com")}, + domains: []client.Object{verifiedDomain("a.example.com")}, + certStates: []listenerCertState{ + {gatewayName: "test-gw", listenerName: "https-0", hostname: "a.example.com", ready: true, + notAfter: ptr.To(now.Add(-1 * time.Hour))}, + }, + expect: []listenerExpect{ + {name: "https-0", present: false, programmed: metav1.ConditionFalse, + resolvedRefs: metav1.ConditionFalse, resolvedRefsReason: string(gatewayv1.ListenerReasonInvalidCertificateRef), + expectMessageContains: "has expired"}, + }, + }, + { + name: "not-yet-valid cert omits listener and reports not-yet-valid message", + listeners: []gatewayv1.Listener{customHTTPSListener("https-0", "a.example.com")}, + domains: []client.Object{verifiedDomain("a.example.com")}, + certStates: []listenerCertState{ + {gatewayName: "test-gw", listenerName: "https-0", hostname: "a.example.com", ready: true, + notBefore: ptr.To(now.Add(24 * time.Hour))}, + }, + expect: []listenerExpect{ + {name: "https-0", present: false, programmed: metav1.ConditionFalse, + resolvedRefs: metav1.ConditionFalse, resolvedRefsReason: string(gatewayv1.ListenerReasonInvalidCertificateRef), + expectMessageContains: "isn't valid yet"}, + }, + }, + { + name: "not-ready cert omits listener and reports issuance-failing message", + listeners: []gatewayv1.Listener{customHTTPSListener("https-0", "a.example.com")}, + domains: []client.Object{verifiedDomain("a.example.com")}, + certStates: []listenerCertState{ + {gatewayName: "test-gw", listenerName: "https-0", hostname: "a.example.com", ready: false}, + }, + expect: []listenerExpect{ + {name: "https-0", present: false, programmed: metav1.ConditionFalse, + resolvedRefs: metav1.ConditionFalse, resolvedRefsReason: string(gatewayv1.ListenerReasonInvalidCertificateRef), + expectMessageContains: "couldn't issue"}, + }, + }, + { + name: "missing cert omits listener and reports issuance-failing message", + listeners: []gatewayv1.Listener{customHTTPSListener("https-0", "a.example.com")}, + domains: []client.Object{verifiedDomain("a.example.com")}, + certStates: nil, // no downstream Certificate seeded + expect: []listenerExpect{ + {name: "https-0", present: false, programmed: metav1.ConditionFalse, + resolvedRefs: metav1.ConditionFalse, resolvedRefsReason: string(gatewayv1.ListenerReasonInvalidCertificateRef), + expectMessageContains: "couldn't issue"}, + }, + }, + { + name: "cert-key mismatch omits listener and reports missing/unreadable message", + listeners: []gatewayv1.Listener{customHTTPSListener("https-0", "a.example.com")}, + domains: []client.Object{verifiedDomain("a.example.com")}, + certStates: []listenerCertState{ + {gatewayName: "test-gw", listenerName: "https-0", hostname: "a.example.com", ready: true, mismatchedKey: true}, + }, + expect: []listenerExpect{ + {name: "https-0", present: false, programmed: metav1.ConditionFalse, + resolvedRefs: metav1.ConditionFalse, resolvedRefsReason: string(gatewayv1.ListenerReasonInvalidCertificateRef), + expectMessageContains: "missing or unreadable"}, + }, + }, + { + // Incident regression: one expired cert must NOT take down a sibling + // healthy listener sharing the same :443. + name: "expired sibling does not affect healthy listener", + listeners: []gatewayv1.Listener{ + customHTTPSListener("https-good", "good.example.com"), + customHTTPSListener("https-bad", "bad.example.com"), + }, + domains: []client.Object{verifiedDomain("good.example.com"), verifiedDomain("bad.example.com")}, + certStates: []listenerCertState{ + {gatewayName: "test-gw", listenerName: "https-good", hostname: "good.example.com", ready: true}, + {gatewayName: "test-gw", listenerName: "https-bad", hostname: "bad.example.com", ready: true, + notAfter: ptr.To(now.Add(-1 * time.Hour))}, + }, + expect: []listenerExpect{ + {name: "https-good", present: true, programmed: metav1.ConditionTrue, resolvedRefs: metav1.ConditionTrue}, + {name: "https-bad", present: false, programmed: metav1.ConditionFalse, + resolvedRefs: metav1.ConditionFalse, resolvedRefsReason: string(gatewayv1.ListenerReasonInvalidCertificateRef), + expectMessageContains: "has expired"}, + }, + }, + } + + logger := zap.New(zap.UseFlagOptions(&zap.Options{Development: true})) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + upstreamGateway := newGateway(testConfig, upstreamNamespace.Name, "test-gw", func(g *gatewayv1.Gateway) { + g.Spec.Listeners = append(g.Spec.Listeners, tt.listeners...) + }) + + upstreamObjects := append([]client.Object{}, tt.domains...) + upstreamObjects = append(upstreamObjects, &gatewayv1.GatewayClass{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: gatewayv1.GatewayClassSpec{ControllerName: gatewayv1.GatewayController("test")}, + }) + + var downstreamObjects []client.Object + for _, s := range tt.certStates { + downstreamObjects = append(downstreamObjects, newDownstreamListenerCertObjects(t, downstreamNamespaceName, s)...) + } + + for _, obj := range append(append([]client.Object{}, upstreamObjects...), downstreamObjects...) { + obj.SetUID(uuid.NewUUID()) + obj.SetCreationTimestamp(metav1.Now()) + } + + fakeUpstreamClient := fake.NewClientBuilder(). + WithScheme(testScheme). + WithObjects(upstreamGateway, upstreamNamespace). + WithObjects(upstreamObjects...). + WithStatusSubresource(upstreamGateway). + WithStatusSubresource(upstreamObjects...). + Build() + + fakeDownstreamClient := fake.NewClientBuilder(). + WithScheme(testScheme). + WithObjects(downstreamObjects...). + WithStatusSubresource(&gatewayv1.Gateway{}). + WithStatusSubresource(downstreamObjects...). + Build() + + ctx := log.IntoContext(context.Background(), logger) + mgr := &fakeMockManager{cl: fakeUpstreamClient} + reconciler := &GatewayReconciler{ + mgr: mgr, + Config: testConfig, + DownstreamCluster: &fakeCluster{cl: fakeDownstreamClient}, + } + downstreamStrategy := downstreamclient.NewMappedNamespaceResourceStrategy("test", fakeUpstreamClient, fakeDownstreamClient) + + reconciler.prepareUpstreamGateway(upstreamGateway) + result, downstreamGateway := reconciler.ensureDownstreamGateway( + ctx, "test-suite", fakeUpstreamClient, upstreamGateway, downstreamStrategy, + ) + require.NoError(t, result.Err, "ensureDownstreamGateway returned error") + _, err := result.Complete(ctx) + require.NoError(t, err, "result.Complete returned error") + + updatedUpstream := &gatewayv1.Gateway{} + require.NoError(t, fakeUpstreamClient.Get(ctx, client.ObjectKeyFromObject(upstreamGateway), updatedUpstream)) + + listenerStatusByName := map[gatewayv1.SectionName]gatewayv1.ListenerStatus{} + for _, ls := range updatedUpstream.Status.Listeners { + listenerStatusByName[ls.Name] = ls + } + + for _, exp := range tt.expect { + // The listener is sent to the edge only when its certificate is usable. + ds := gatewayutil.GetListenerByName(downstreamGateway.Spec.Listeners, exp.name) + if exp.present { + assert.NotNil(t, ds, "listener %q should be present in downstream gateway", exp.name) + } else { + assert.Nil(t, ds, "listener %q should be omitted from downstream gateway", exp.name) + } + + // The customer-facing status reflects the certificate problem. + ls, ok := listenerStatusByName[exp.name] + if !assert.True(t, ok, "listener %q missing from upstream status", exp.name) { + continue + } + + programmed := apimeta.FindStatusCondition(ls.Conditions, string(gatewayv1.ListenerConditionProgrammed)) + if assert.NotNil(t, programmed, "programmed condition missing on %q", exp.name) { + assert.Equal(t, exp.programmed, programmed.Status, "programmed status on %q", exp.name) + } + + resolvedRefs := apimeta.FindStatusCondition(ls.Conditions, string(gatewayv1.ListenerConditionResolvedRefs)) + if assert.NotNil(t, resolvedRefs, "resolvedRefs condition missing on %q", exp.name) { + assert.Equal(t, exp.resolvedRefs, resolvedRefs.Status, "resolvedRefs status on %q", exp.name) + if exp.resolvedRefs == metav1.ConditionFalse { + assert.Equal(t, exp.resolvedRefsReason, resolvedRefs.Reason, "resolvedRefs reason on %q", exp.name) + assert.Contains(t, resolvedRefs.Message, exp.expectMessageContains, "resolvedRefs message on %q", exp.name) + } + } + + // Accepted must remain True regardless of cert health. + accepted := apimeta.FindStatusCondition(ls.Conditions, string(gatewayv1.ListenerConditionAccepted)) + if assert.NotNil(t, accepted, "accepted condition missing on %q", exp.name) { + assert.Equal(t, metav1.ConditionTrue, accepted.Status, "accepted should stay True on %q", exp.name) + } + } + }) + } +} + func TestEnsureDownstreamGatewayHTTPRoutes(t *testing.T) { testScheme := runtime.NewScheme() assert.NoError(t, scheme.AddToScheme(testScheme)) @@ -877,6 +1181,7 @@ func TestEnsureDownstreamGatewayHTTPRoutes(t *testing.T) { downstreamStrategy, nil, nil, + nil, ) assert.NoError(t, result.Err, "failed ensuring downstream gateway HTTPRoutes") @@ -1380,7 +1685,7 @@ func TestGetDesiredDownstreamGateway_UnclaimedHostnameSkipped(t *testing.T) { Spec: gatewayv1.GatewaySpec{Listeners: tt.listeners}, } - desired := reconciler.getDesiredDownstreamGateway(ctx, upstream, tt.claimedHostnames) + desired := reconciler.getDesiredDownstreamGateway(ctx, upstream, tt.claimedHostnames, nil) assert.Empty(t, desired.Annotations, "desired gateway should have no cert-manager annotations") assert.Len(t, desired.Spec.Listeners, tt.expectListeners, "downstream listener count") @@ -1388,6 +1693,119 @@ func TestGetDesiredDownstreamGateway_UnclaimedHostnameSkipped(t *testing.T) { } } +// generateTLSKeyPair returns PEM-encoded cert and key bytes for hostname, with +// the supplied validity window. Used to seed downstream Secrets in cert-health +// tests so the X509 self-check exercises real material. +func generateTLSKeyPair(t *testing.T, hostname string, notBefore, notAfter time.Time) (certPEM, keyPEM []byte) { + t.Helper() + + key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + require.NoError(t, err) + + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: hostname}, + DNSNames: []string{hostname}, + NotBefore: notBefore, + NotAfter: notAfter, + KeyUsage: x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + } + + der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &key.PublicKey, key) + require.NoError(t, err) + + certPEM = pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der}) + + keyDER, err := x509.MarshalECPrivateKey(key) + require.NoError(t, err) + keyPEM = pem.EncodeToMemory(&pem.Block{Type: "EC PRIVATE KEY", Bytes: keyDER}) + + return certPEM, keyPEM +} + +// listenerCertState describes a downstream cert-manager Certificate + Secret +// pair to seed for a listener in cert-health tests. +type listenerCertState struct { + gatewayName string + listenerName gatewayv1.SectionName + hostname string + + ready bool // Certificate Ready condition + notBefore *time.Time // Certificate.status.NotBefore (defaults to past) + notAfter *time.Time // Certificate.status.NotAfter (defaults to far future) + + omitSecret bool // do not create the backing Secret + mismatchedKey bool // seed a Secret whose key does not match the cert + secretNotAfter *time.Time // leaf NotAfter for the Secret material (defaults to far future) +} + +// newDownstreamListenerCertObjects builds the downstream Certificate (+ Secret) +// objects for a listener in the namespace, modelling the requested health state. +func newDownstreamListenerCertObjects(t *testing.T, namespace string, s listenerCertState) []client.Object { + t.Helper() + + now := time.Now() + notBefore := now.Add(-1 * time.Hour) + if s.notBefore != nil { + notBefore = *s.notBefore + } + notAfter := now.Add(365 * 24 * time.Hour) + if s.notAfter != nil { + notAfter = *s.notAfter + } + + cert := &cmv1.Certificate{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: listenerCertificateName(s.gatewayName, s.listenerName), + }, + Status: cmv1.CertificateStatus{ + NotBefore: &metav1.Time{Time: notBefore}, + NotAfter: &metav1.Time{Time: notAfter}, + }, + } + readyStatus := cmmeta.ConditionFalse + if s.ready { + readyStatus = cmmeta.ConditionTrue + } + cert.Status.Conditions = []cmv1.CertificateCondition{ + {Type: cmv1.CertificateConditionReady, Status: readyStatus}, + } + + objs := make([]client.Object, 0, 2) + objs = append(objs, cert) + + if s.omitSecret { + return objs + } + + secretNotAfter := now.Add(365 * 24 * time.Hour) + if s.secretNotAfter != nil { + secretNotAfter = *s.secretNotAfter + } + certPEM, keyPEM := generateTLSKeyPair(t, s.hostname, now.Add(-1*time.Hour), secretNotAfter) + if s.mismatchedKey { + // Replace the key with one from an unrelated keypair so X509KeyPair fails. + _, keyPEM = generateTLSKeyPair(t, s.hostname, now.Add(-1*time.Hour), secretNotAfter) + } + + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: listenerCertificateSecretName(s.gatewayName, s.listenerName), + }, + Type: corev1.SecretTypeTLS, + Data: map[string][]byte{ + "tls.crt": certPEM, + "tls.key": keyPEM, + }, + } + objs = append(objs, secret) + + return objs +} + func newHTTPRoute(namespace, name string, opts ...func(*gatewayv1.HTTPRoute)) *gatewayv1.HTTPRoute { route := &gatewayv1.HTTPRoute{ ObjectMeta: metav1.ObjectMeta{ diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go index 94093a5f..78784ca8 100644 --- a/internal/controller/metrics.go +++ b/internal/controller/metrics.go @@ -11,6 +11,14 @@ import ( "github.com/prometheus/client_golang/prometheus/promauto" ) +// Metric label name constants for TLS certificate health metrics. +const ( + metricLabelListener = "listener" + metricLabelHostname = "hostname" + metricLabelSecret = "secret" + metricLabelReason = "reason" +) + var ( // replicatorConflictsTotal counts resource-version conflicts observed by the // gateway-resource-replicator controller. Conflicts arise when the upstream or @@ -53,4 +61,58 @@ var ( }, []string{jsonKeyNamespace, jsonKeyName}, ) + + // gatewayListenerCertWithheld is 1 for each upstream Gateway listener that NSO + // is currently withholding from the downstream because its TLS certificate is + // unusable. The series for a listener is removed when the listener recovers, + // is removed from the Gateway, or the Gateway is deleted. + // + // Use sum(nso_gateway_listener_cert_withheld) to count how many listeners are + // currently dark across the fleet, or filter by namespace/name/listener/hostname + // to find the specific affected object during an incident. + gatewayListenerCertWithheld = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "nso_gateway_listener_cert_withheld", + Help: "1 when a Gateway listener is withheld from the downstream because its TLS certificate is unusable, 0 after it recovers.", + }, + []string{jsonKeyNamespace, jsonKeyName, metricLabelListener, metricLabelHostname, metricLabelReason}, + ) + + // gatewayListenerCertGatingTotal counts every reconcile cycle in which a + // Gateway listener is withheld due to an unusable certificate. A rising rate + // means new cert failures are arriving, not just that existing ones persist. + gatewayListenerCertGatingTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "nso_gateway_listener_cert_gating_total", + Help: "Total reconcile cycles in which a Gateway listener was withheld because its TLS certificate was unusable.", + }, + []string{jsonKeyNamespace, jsonKeyName, metricLabelListener, metricLabelHostname, metricLabelReason}, + ) + + // gatewayListenerCertExpiryTime is the Unix timestamp (seconds) at which a + // managed Gateway listener's TLS certificate expires. Only set for listeners + // with a healthy certificate whose expiry is known. Use this to alert before + // a cert expires and NSO begins gating the listener. + // + // Query time-to-expiry in days: + // (nso_gateway_listener_cert_expiry_time - time()) / 86400 + gatewayListenerCertExpiryTime = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "nso_gateway_listener_cert_expiry_time", + Help: "Unix timestamp when the managed TLS certificate for this Gateway listener expires. Only present when the certificate is healthy.", + }, + []string{jsonKeyNamespace, jsonKeyName, metricLabelListener, metricLabelHostname, metricLabelSecret}, + ) + + // gatewayListenerCertManaged counts the total number of Gateway listeners + // that NSO evaluates for certificate health each reconcile. Together with + // gatewayListenerCertWithheld this gives the fraction of managed listeners + // that are currently serving (the SLI ratio). + gatewayListenerCertManaged = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "nso_gateway_listener_cert_managed", + Help: "1 for each Gateway listener whose TLS certificate is managed and evaluated by NSO, regardless of health.", + }, + []string{jsonKeyNamespace, jsonKeyName, metricLabelListener, metricLabelHostname}, + ) ) diff --git a/internal/extensionserver/metrics/metrics.go b/internal/extensionserver/metrics/metrics.go index 111dca19..82e8717b 100644 --- a/internal/extensionserver/metrics/metrics.go +++ b/internal/extensionserver/metrics/metrics.go @@ -154,6 +154,57 @@ var ( }, ) + // TLSPrunedChainsTotal tracks how often a broken certificate had to be + // dropped from a listener to keep the rest of it serving (issue #212). + TLSPrunedChainsTotal = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "nso_extension_tls_pruned_chains_total", + Help: "Total TLS filter chains dropped for referencing an invalid (mismatched/expired) certificate across all hook invocations.", + }, + ) + + // TLSPrunedSecretsTotal tracks how many broken certificates were dropped. + TLSPrunedSecretsTotal = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "nso_extension_tls_pruned_secrets_total", + Help: "Total invalid TLS secrets dropped from the response across all hook invocations.", + }, + ) + + // TLSListenersLeftIntactTotal tracks listeners that were left untouched + // because every certificate on them was broken; a non-zero value means a + // listener could not be saved here and still needs the controller-side fix. + TLSListenersLeftIntactTotal = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "nso_extension_tls_listeners_left_intact_total", + Help: "Total listeners left intact (not emptied) because all their TLS filter chains were invalid, across all hook invocations.", + }, + ) + + // TLSPrunedChainsActive is the number of TLS filter chains that were dropped + // in the most recent PostTranslateModify response. Set to zero when the hook + // runs cleanly. Use this to know whether the data-plane backstop is currently + // active, as opposed to TLSPrunedChainsTotal which only accumulates. + TLSPrunedChainsActive = promauto.NewGauge( + prometheus.GaugeOpts{ + Name: "nso_extension_tls_pruned_chains_active", + Help: "TLS filter chains dropped in the most recent PostTranslateModify response. Non-zero means the data-plane cert backstop is currently active.", + }, + ) + + // TLSListenersLeftIntactActive is the number of listeners left completely + // untouched in the most recent PostTranslateModify response because every + // certificate on them was broken. Non-zero means the backstop could not save + // the listener (it never empties a listener) — this is an important signal + // that the controller-side gating did not suppress the listener and an Envoy + // LDS NACK may follow. + TLSListenersLeftIntactActive = promauto.NewGauge( + prometheus.GaugeOpts{ + Name: "nso_extension_tls_listeners_left_intact_active", + Help: "Listeners left intact in the most recent PostTranslateModify response because all their TLS chains were broken. Non-zero means the backstop could not protect the listener.", + }, + ) + // CacheSynced is 1 when the informer cache has synced and the extension server // is accepting gRPC connections, 0 during startup or if the cache lost sync. CacheSynced = promauto.NewGauge( diff --git a/internal/extensionserver/mutate/tls.go b/internal/extensionserver/mutate/tls.go new file mode 100644 index 00000000..5a36912e --- /dev/null +++ b/internal/extensionserver/mutate/tls.go @@ -0,0 +1,220 @@ +package mutate + +import ( + "crypto/tls" + "crypto/x509" + "time" + + listenerv3 "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" + tlsv3 "github.com/envoyproxy/go-control-plane/envoy/extensions/transport_sockets/tls/v3" +) + +// PruneInvalidTLSSecrets removes only the parts of a listener that use a broken +// certificate, so one customer's bad certificate can't stop the whole listener +// and take HTTPS down for every other hostname on it (issue #212). +// +// It is written so it can never make things worse: +// - A listener is never left with nothing to serve. If every certificate on a +// listener is broken, the listener is left exactly as it was and counted in +// listenersLeftIntact, because handing back an empty listener would stop the +// listener entirely — the very thing this guards against. +// - Parts of a listener that don't serve a certificate are never touched. +// - Anything it can't read is left alone, and it never reports an error, since +// doing so would stop the update. +// +// now is a parameter so expiry can be tested with fixed times. +// +// It returns the secrets to hand back (the broken ones that were actually +// dropped are removed), how many parts and certificates were dropped, how many +// listeners were left untouched because everything on them was broken, and the +// hostnames that were dropped, for logging. The listeners are edited in place. +func PruneInvalidTLSSecrets( + listeners []*listenerv3.Listener, + secrets []*tlsv3.Secret, + now time.Time, +) (kept []*tlsv3.Secret, prunedChains int, prunedSecrets int, listenersLeftIntact int, droppedNames []string) { + // Find the broken certificates. + badSecrets := make(map[string]struct{}) + for _, sec := range secrets { + name := sec.GetName() + if name == "" { + continue + } + cert := sec.GetTlsCertificate() + if cert == nil { + // Only certificates are checked; leave anything else alone. + continue + } + if !tlsCertificateUsable(cert, now) { + badSecrets[name] = struct{}{} + } + } + if len(badSecrets) == 0 { + // Nothing broken, so change nothing. + return secrets, 0, 0, 0, nil + } + + // Remember which broken certificates were actually dropped, so only those + // are removed from the secrets handed back. + prunedBadNames := make(map[string]struct{}) + for _, l := range listeners { + chains := l.GetFilterChains() + if len(chains) == 0 { + continue + } + + // Work out which parts of the listener use a broken certificate. + dropChain := make([]bool, len(chains)) + anyDrop := false + keptCount := 0 + for i, fc := range chains { + if chainReferencesBadSecret(fc, badSecrets) { + dropChain[i] = true + anyDrop = true + } else { + keptCount++ + } + } + if !anyDrop { + continue + } + + // If everything on the listener is broken, leave it untouched rather + // than hand back a listener with nothing to serve. + if keptCount == 0 { + listenersLeftIntact++ + continue + } + + // Keep the good parts, in their original order. + newChains := make([]*listenerv3.FilterChain, 0, keptCount) + for i, fc := range chains { + if dropChain[i] { + prunedChains++ + droppedNames = append(droppedNames, chainDropLabel(fc, badSecrets)) + for _, n := range chainBadSecretNames(fc, badSecrets) { + prunedBadNames[n] = struct{}{} + } + continue + } + newChains = append(newChains, fc) + } + l.FilterChains = newChains + } + + // Build kept secrets: input minus the bad secrets actually pruned from an + // emitted listener. A bad secret still referenced only by an intact listener + // is retained to preserve referential completeness. + prunedSecrets = len(prunedBadNames) + if prunedSecrets == 0 { + // Bad secrets existed but none were pruned from an emitted listener + // (e.g. only referenced by listeners left intact). Pass secrets through. + return secrets, prunedChains, 0, listenersLeftIntact, droppedNames + } + kept = make([]*tlsv3.Secret, 0, len(secrets)) + for _, sec := range secrets { + if _, dropped := prunedBadNames[sec.GetName()]; dropped { + continue + } + kept = append(kept, sec) + } + return kept, prunedChains, prunedSecrets, listenersLeftIntact, droppedNames +} + +// tlsCertificateUsable reports whether a certificate and its key match and are +// within their valid dates. A certificate we can't read is treated as usable, so +// only certificates we can prove are broken are ever dropped. +func tlsCertificateUsable(cert *tlsv3.TlsCertificate, now time.Time) bool { + certPEM := cert.GetCertificateChain().GetInlineBytes() + keyPEM := cert.GetPrivateKey().GetInlineBytes() + if len(certPEM) == 0 || len(keyPEM) == 0 { + // Nothing we can read here, so don't touch it. + return true + } + pair, err := tls.X509KeyPair(certPEM, keyPEM) + if err != nil { + // The certificate and key don't match, or can't be read. + return false + } + leaf := pair.Leaf + if leaf == nil { + if len(pair.Certificate) == 0 { + return false + } + parsed, perr := x509.ParseCertificate(pair.Certificate[0]) + if perr != nil { + return false + } + leaf = parsed + } + if now.Before(leaf.NotBefore) { + return false // Not valid yet. + } + if !leaf.NotAfter.After(now) { + return false // Expired. + } + return true +} + +// chainReferencesBadSecret reports whether this part of the listener serves one +// of the broken certificates. Parts that serve no certificate are never dropped. +func chainReferencesBadSecret(fc *listenerv3.FilterChain, bad map[string]struct{}) bool { + for _, n := range chainSDSSecretNames(fc) { + if _, ok := bad[n]; ok { + return true + } + } + return false +} + +// chainBadSecretNames returns the broken certificates this part of the listener serves. +func chainBadSecretNames(fc *listenerv3.FilterChain, bad map[string]struct{}) []string { + var out []string + for _, n := range chainSDSSecretNames(fc) { + if _, ok := bad[n]; ok { + out = appendUnique(out, n) + } + } + return out +} + +// chainSDSSecretNames returns the certificates this part of the listener serves. +// Anything it can't read returns nothing, so it is never matched or dropped. +func chainSDSSecretNames(fc *listenerv3.FilterChain) []string { + ts := fc.GetTransportSocket() + if ts == nil { + return nil + } + tc := ts.GetTypedConfig() + if tc == nil { + return nil + } + dtc := &tlsv3.DownstreamTlsContext{} + if err := tc.UnmarshalTo(dtc); err != nil { + // Not something that serves a certificate; leave it alone. + return nil + } + var names []string + for _, sds := range dtc.GetCommonTlsContext().GetTlsCertificateSdsSecretConfigs() { + if n := sds.GetName(); n != "" { + names = appendUnique(names, n) + } + } + return names +} + +// chainDropLabel builds a human-readable label for a dropped chain for logging: +// the chain's SNI server name(s) when present, otherwise the bad SDS secret +// name(s) it referenced. +func chainDropLabel(fc *listenerv3.FilterChain, bad map[string]struct{}) string { + if sn := fc.GetFilterChainMatch().GetServerNames(); len(sn) > 0 { + return sn[0] + } + if names := chainBadSecretNames(fc, bad); len(names) > 0 { + return names[0] + } + if n := fc.GetName(); n != "" { + return n + } + return "" +} diff --git a/internal/extensionserver/mutate/tls_test.go b/internal/extensionserver/mutate/tls_test.go new file mode 100644 index 00000000..db7e43a2 --- /dev/null +++ b/internal/extensionserver/mutate/tls_test.go @@ -0,0 +1,298 @@ +package mutate + +import ( + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "math/big" + "testing" + "time" + + corev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + listenerv3 "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" + tlsv3 "github.com/envoyproxy/go-control-plane/envoy/extensions/transport_sockets/tls/v3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/protobuf/types/known/anypb" +) + +// fixedNow is the reference "now" used across the table; certs are minted +// relative to it so expiry is deterministic. +var fixedNow = time.Date(2026, 6, 23, 12, 0, 0, 0, time.UTC) + +// mintCert returns PEM-encoded (cert, key) for an ECDSA leaf valid in +// [notBefore, notAfter). The returned pair is internally consistent (key matches +// cert) unless mismatchKey replaces the key with an unrelated one. +func mintCert(t *testing.T, cn string, notBefore, notAfter time.Time) (certPEM, keyPEM []byte) { + t.Helper() + key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + require.NoError(t, err) + + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(time.Now().UnixNano()), + Subject: pkix.Name{CommonName: cn}, + NotBefore: notBefore, + NotAfter: notAfter, + DNSNames: []string{cn}, + } + der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &key.PublicKey, key) + require.NoError(t, err) + certPEM = pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der}) + + keyDER, err := x509.MarshalECPrivateKey(key) + require.NoError(t, err) + keyPEM = pem.EncodeToMemory(&pem.Block{Type: "EC PRIVATE KEY", Bytes: keyDER}) + return certPEM, keyPEM +} + +// validPair mints a cert valid around fixedNow. +func validPair(t *testing.T, cn string) (certPEM, keyPEM []byte) { + t.Helper() + return mintCert(t, cn, fixedNow.Add(-24*time.Hour), fixedNow.Add(24*time.Hour)) +} + +// expiredPair mints a cert that expired before fixedNow. +func expiredPair(t *testing.T, cn string) (certPEM, keyPEM []byte) { + t.Helper() + return mintCert(t, cn, fixedNow.Add(-48*time.Hour), fixedNow.Add(-24*time.Hour)) +} + +// tlsSecret builds a *tlsv3.Secret carrying inline cert/key bytes. +func tlsSecret(name string, certPEM, keyPEM []byte) *tlsv3.Secret { + return &tlsv3.Secret{ + Name: name, + Type: &tlsv3.Secret_TlsCertificate{ + TlsCertificate: &tlsv3.TlsCertificate{ + CertificateChain: &corev3.DataSource{ + Specifier: &corev3.DataSource_InlineBytes{InlineBytes: certPEM}, + }, + PrivateKey: &corev3.DataSource{ + Specifier: &corev3.DataSource_InlineBytes{InlineBytes: keyPEM}, + }, + }, + }, + } +} + +// tlsChain builds a part of a listener that serves the named certificate for +// the given hostname. +func tlsChain(t *testing.T, sni, secretName string) *listenerv3.FilterChain { + t.Helper() + dtc := &tlsv3.DownstreamTlsContext{ + CommonTlsContext: &tlsv3.CommonTlsContext{ + TlsCertificateSdsSecretConfigs: []*tlsv3.SdsSecretConfig{ + {Name: secretName}, + }, + }, + } + dtcAny, err := anypb.New(dtc) + require.NoError(t, err) + return &listenerv3.FilterChain{ + Name: "fc-" + sni, + FilterChainMatch: &listenerv3.FilterChainMatch{ServerNames: []string{sni}}, + TransportSocket: &corev3.TransportSocket{ + Name: "envoy.transport_sockets.tls", + ConfigType: &corev3.TransportSocket_TypedConfig{TypedConfig: dtcAny}, + }, + } +} + +// plainChain builds a part of a listener that serves no certificate. +func plainChain(name string) *listenerv3.FilterChain { + return &listenerv3.FilterChain{Name: name} +} + +func listener(chains ...*listenerv3.FilterChain) *listenerv3.Listener { + return &listenerv3.Listener{Name: "https", FilterChains: chains} +} + +func chainNames(l *listenerv3.Listener) []string { + out := make([]string, 0, len(l.GetFilterChains())) + for _, fc := range l.GetFilterChains() { + out = append(out, fc.GetName()) + } + return out +} + +func secretNames(secs []*tlsv3.Secret) []string { + out := make([]string, 0, len(secs)) + for _, s := range secs { + out = append(out, s.GetName()) + } + return out +} + +func TestPruneInvalidTLSSecrets_ExpiredChainDropped(t *testing.T) { + goodCert, goodKey := validPair(t, "good.example.com") + badCert, badKey := expiredPair(t, "bad.example.com") + + secrets := []*tlsv3.Secret{ + tlsSecret("good-secret", goodCert, goodKey), + tlsSecret("bad-secret", badCert, badKey), + } + l := listener(tlsChain(t, "good.example.com", "good-secret"), + tlsChain(t, "bad.example.com", "bad-secret"), + ) + listeners := []*listenerv3.Listener{l} + + kept, prunedChains, prunedSecrets, intact, dropped := + PruneInvalidTLSSecrets(listeners, secrets, fixedNow) + + assert.Equal(t, 1, prunedChains) + assert.Equal(t, 1, prunedSecrets) + assert.Equal(t, 0, intact) + assert.Equal(t, []string{"bad.example.com"}, dropped) + + // Good chain kept, listener still valid (non-empty). + assert.Equal(t, []string{"fc-good.example.com"}, chainNames(l)) + // Bad secret removed from kept, good secret retained. + assert.ElementsMatch(t, []string{"good-secret"}, secretNames(kept)) +} + +func TestPruneInvalidTLSSecrets_MismatchedKeyChainDropped(t *testing.T) { + goodCert, goodKey := validPair(t, "good.example.com") + // Mismatch: a valid cert paired with an unrelated valid key. + mismatchCert, _ := validPair(t, "bad.example.com") + _, otherKey := validPair(t, "other.example.com") + + secrets := []*tlsv3.Secret{ + tlsSecret("good-secret", goodCert, goodKey), + tlsSecret("mismatch-secret", mismatchCert, otherKey), + } + l := listener(tlsChain(t, "good.example.com", "good-secret"), + tlsChain(t, "bad.example.com", "mismatch-secret"), + ) + + kept, prunedChains, prunedSecrets, intact, dropped := + PruneInvalidTLSSecrets([]*listenerv3.Listener{l}, secrets, fixedNow) + + assert.Equal(t, 1, prunedChains) + assert.Equal(t, 1, prunedSecrets) + assert.Equal(t, 0, intact) + assert.Equal(t, []string{"bad.example.com"}, dropped) + assert.Equal(t, []string{"fc-good.example.com"}, chainNames(l)) + assert.ElementsMatch(t, []string{"good-secret"}, secretNames(kept)) +} + +func TestPruneInvalidTLSSecrets_AllBadListenerLeftIntact(t *testing.T) { + bad1Cert, bad1Key := expiredPair(t, "bad1.example.com") + bad2Cert, bad2Key := expiredPair(t, "bad2.example.com") + + secrets := []*tlsv3.Secret{ + tlsSecret("bad1-secret", bad1Cert, bad1Key), + tlsSecret("bad2-secret", bad2Cert, bad2Key), + } + l := listener(tlsChain(t, "bad1.example.com", "bad1-secret"), + tlsChain(t, "bad2.example.com", "bad2-secret"), + ) + + kept, prunedChains, prunedSecrets, intact, _ := + PruneInvalidTLSSecrets([]*listenerv3.Listener{l}, secrets, fixedNow) + + // Listener left INTACT (not emptied), nothing pruned from an emitted listener. + assert.Equal(t, 0, prunedChains) + assert.Equal(t, 0, prunedSecrets) + assert.Equal(t, 1, intact) + // Both chains still present — never emitted empty. + assert.Equal(t, []string{"fc-bad1.example.com", "fc-bad2.example.com"}, chainNames(l)) + // Bad secrets retained because their (intact) listener still references them. + assert.ElementsMatch(t, []string{"bad1-secret", "bad2-secret"}, secretNames(kept)) +} + +func TestPruneInvalidTLSSecrets_NonTLSChainNeverDropped(t *testing.T) { + goodCert, goodKey := validPair(t, "good.example.com") + badCert, badKey := expiredPair(t, "bad.example.com") + + secrets := []*tlsv3.Secret{ + tlsSecret("good-secret", goodCert, goodKey), + tlsSecret("bad-secret", badCert, badKey), + } + // A non-TLS/default chain alongside one good and one bad TLS chain. + l := listener(plainChain("tcp-passthrough"), + tlsChain(t, "good.example.com", "good-secret"), + tlsChain(t, "bad.example.com", "bad-secret"), + ) + + _, prunedChains, _, intact, _ := + PruneInvalidTLSSecrets([]*listenerv3.Listener{l}, secrets, fixedNow) + + assert.Equal(t, 1, prunedChains) + assert.Equal(t, 0, intact) + // Only the bad TLS chain removed; non-TLS chain and good TLS chain remain. + assert.Equal(t, []string{"tcp-passthrough", "fc-good.example.com"}, chainNames(l)) +} + +func TestPruneInvalidTLSSecrets_GoodOnlyNoOp(t *testing.T) { + good1Cert, good1Key := validPair(t, "a.example.com") + good2Cert, good2Key := validPair(t, "b.example.com") + + secrets := []*tlsv3.Secret{ + tlsSecret("a-secret", good1Cert, good1Key), + tlsSecret("b-secret", good2Cert, good2Key), + } + l := listener(tlsChain(t, "a.example.com", "a-secret"), + tlsChain(t, "b.example.com", "b-secret"), + ) + + kept, prunedChains, prunedSecrets, intact, dropped := + PruneInvalidTLSSecrets([]*listenerv3.Listener{l}, secrets, fixedNow) + + assert.Equal(t, 0, prunedChains) + assert.Equal(t, 0, prunedSecrets) + assert.Equal(t, 0, intact) + assert.Nil(t, dropped) + // Same secrets passed through (identity of the fast path). + assert.ElementsMatch(t, []string{"a-secret", "b-secret"}, secretNames(kept)) + assert.Len(t, kept, 2) + // Both chains untouched. + assert.Equal(t, []string{"fc-a.example.com", "fc-b.example.com"}, chainNames(l)) +} + +func TestPruneInvalidTLSSecrets_NonTLSSecretIgnored(t *testing.T) { + // A secret with no TlsCertificate (e.g. a validation context) is never bad. + vctxSecret := &tlsv3.Secret{ + Name: "ca-secret", + Type: &tlsv3.Secret_ValidationContext{ + ValidationContext: &tlsv3.CertificateValidationContext{}, + }, + } + goodCert, goodKey := validPair(t, "good.example.com") + secrets := []*tlsv3.Secret{ + vctxSecret, + tlsSecret("good-secret", goodCert, goodKey), + } + l := listener(tlsChain(t, "good.example.com", "good-secret")) + + kept, prunedChains, _, intact, _ := + PruneInvalidTLSSecrets([]*listenerv3.Listener{l}, secrets, fixedNow) + + assert.Equal(t, 0, prunedChains) + assert.Equal(t, 0, intact) + assert.ElementsMatch(t, []string{"ca-secret", "good-secret"}, secretNames(kept)) +} + +func TestPruneInvalidTLSSecrets_NotYetValidDropped(t *testing.T) { + goodCert, goodKey := validPair(t, "good.example.com") + // Not-yet-valid: starts after fixedNow. + futureCert, futureKey := mintCert(t, "future.example.com", + fixedNow.Add(24*time.Hour), fixedNow.Add(48*time.Hour)) + + secrets := []*tlsv3.Secret{ + tlsSecret("good-secret", goodCert, goodKey), + tlsSecret("future-secret", futureCert, futureKey), + } + l := listener(tlsChain(t, "good.example.com", "good-secret"), + tlsChain(t, "future.example.com", "future-secret"), + ) + + _, prunedChains, prunedSecrets, _, dropped := + PruneInvalidTLSSecrets([]*listenerv3.Listener{l}, secrets, fixedNow) + + assert.Equal(t, 1, prunedChains) + assert.Equal(t, 1, prunedSecrets) + assert.Equal(t, []string{"future.example.com"}, dropped) + assert.Equal(t, []string{"fc-good.example.com"}, chainNames(l)) +} diff --git a/internal/extensionserver/server/server.go b/internal/extensionserver/server/server.go index 49b2f4da..5fc21315 100644 --- a/internal/extensionserver/server/server.go +++ b/internal/extensionserver/server/server.go @@ -261,6 +261,53 @@ func (s *Server) PostTranslateModify( extmetrics.PhaseDuration.WithLabelValues("mutate").Observe(time.Since(mutStart).Seconds()) + // Last line of defence (issue #212): drop only the parts of a listener that + // use a broken certificate, so one bad certificate can't take HTTPS down for + // every other hostname sharing the listener. This only removes things it can + // prove are broken and never empties a listener, so it cannot make things + // worse. + _, tlsSpan := tr.Start(ctx, "tls.prune") + keptSecrets, prunedChains, prunedSecrets, listenersLeftIntact, droppedTLSNames := + mutate.PruneInvalidTLSSecrets(listeners, req.GetSecrets(), time.Now()) + + // Collect the Envoy listener names that had chains pruned or were left + // intact so they appear in the trace and the log alongside the SNI hostnames. + var affectedListenerNames []string + if prunedChains > 0 || listenersLeftIntact > 0 { + for _, l := range listeners { + if n := l.GetName(); n != "" { + affectedListenerNames = append(affectedListenerNames, n) + } + } + } + + tlsSpan.SetAttributes( + attribute.Int("tls.pruned_chains", prunedChains), + attribute.Int("tls.pruned_secrets", prunedSecrets), + attribute.Int("tls.listeners_left_intact", listenersLeftIntact), + attribute.StringSlice("tls.dropped_names", droppedTLSNames), + ) + tlsSpan.End() + + // Counters accumulate across all hook invocations; use rate() for trending. + extmetrics.TLSPrunedChainsTotal.Add(float64(prunedChains)) + extmetrics.TLSPrunedSecretsTotal.Add(float64(prunedSecrets)) + extmetrics.TLSListenersLeftIntactTotal.Add(float64(listenersLeftIntact)) + // Active gauges reflect the current state after each invocation so operators + // can alert on "is the backstop firing right now" without needing rate(). + extmetrics.TLSPrunedChainsActive.Set(float64(prunedChains)) + extmetrics.TLSListenersLeftIntactActive.Set(float64(listenersLeftIntact)) + + if prunedChains > 0 || listenersLeftIntact > 0 { + s.log.Warn("PostTranslateModify pruned invalid TLS chains", + "pruned_chains", prunedChains, + "pruned_secrets", prunedSecrets, + "listeners_left_intact", listenersLeftIntact, + "dropped_hostnames", droppedTLSNames, + "affected_listeners", affectedListenerNames, + ) + } + // Record per-mutation-family counters. These accumulate across builds; use // rate() in Prometheus / PromQL to derive per-build averages. extmetrics.WAFHCMMutationsTotal.Add(float64(hcmCount)) @@ -285,7 +332,7 @@ func (s *Server) PostTranslateModify( return &pb.PostTranslateModifyResponse{ Clusters: clusters, - Secrets: req.GetSecrets(), // pass through unchanged + Secrets: keptSecrets, // invalid-cert secrets pruned (issue #212); see TLS prune phase Listeners: listeners, Routes: routes, }, nil diff --git a/test/e2e/gateway-invalid-certificate/chainsaw-test.yaml b/test/e2e/gateway-invalid-certificate/chainsaw-test.yaml new file mode 100644 index 00000000..5ab2e026 --- /dev/null +++ b/test/e2e/gateway-invalid-certificate/chainsaw-test.yaml @@ -0,0 +1,654 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +# +# Issue #212 — a listener whose certificate can't be used must not be sent to +# the edge, and the customer must be told why. One bad certificate must never +# take HTTPS down for the other hostnames that share the gateway. +# +# The test follows the whole story: +# - the bad hostname is held back while the good one keeps working; +# - the customer sees a plain-language reason on the bad hostname; +# - a real HTTPS request to the good hostname returns 200, and the bad one is +# not served; +# - once the certificate can be issued, the bad hostname recovers and serves. +# +# To make the failure happen reliably, the bad hostname points at a certificate +# issuer that doesn't exist yet, so its certificate is never issued. Expired and +# mismatched certificates are covered by unit tests. +# +# concurrent: false — this test shares some cluster-wide setup with the +# gateway-accepted test, so it must not run at the same time. +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: gateway-invalid-certificate +spec: + concurrent: false + bindings: + - name: clusterIssuerName + value: (join('-', ['e2e', $namespace])) + - name: gatewayClassName + value: (join('-', ['e2e', $namespace])) + # The issuer the bad listener references. It does NOT exist when the gateway + # is created (so issuance fails), and is created later to drive recovery. + - name: recoveryIssuerName + value: (join('-', ['recover', $namespace])) + cluster: nso-standard + steps: + - name: Create CA on the downstream cluster + try: + - create: + cluster: nso-infra + resource: + apiVersion: cert-manager.io/v1 + kind: ClusterIssuer + metadata: + name: (join('-', [$clusterIssuerName, 'issuer'])) + spec: + selfSigned: {} + + - create: + cluster: nso-infra + resource: + apiVersion: cert-manager.io/v1 + kind: Certificate + metadata: + name: (join('-', [$clusterIssuerName, 'ca'])) + namespace: cert-manager + spec: + isCA: true + commonName: (join('-', [$clusterIssuerName, 'ca'])) + secretName: ($clusterIssuerName) + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: (join('-', [$clusterIssuerName, 'issuer'])) + kind: ClusterIssuer + group: cert-manager.io + + - create: + cluster: nso-infra + resource: + apiVersion: cert-manager.io/v1 + kind: ClusterIssuer + metadata: + name: ($clusterIssuerName) + spec: + ca: + secretName: ($clusterIssuerName) + + # Wait for the CA to be issued before anything depends on it. + - assert: + cluster: nso-infra + resource: + apiVersion: cert-manager.io/v1 + kind: Certificate + metadata: + name: (join('-', [$clusterIssuerName, 'ca'])) + namespace: cert-manager + status: + conditions: + - type: Ready + status: "True" + + - name: Create the GatewayClasses + try: + - create: + cluster: nso-standard + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: GatewayClass + metadata: + name: ($gatewayClassName) + spec: + controllerName: gateway.networking.datumapis.com/external-global-proxy-controller + + # Shared cluster-scoped downstream config — applied (not created) so this + # test coexists with gateway-accepted regardless of run order. + - apply: + cluster: nso-infra + resource: + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: EnvoyProxy + metadata: + name: custom-proxy-config + namespace: envoy-gateway-system + spec: + provider: + type: Kubernetes + kubernetes: + envoyService: + type: ClusterIP + patch: + type: StrategicMerge + value: + spec: + ipFamilyPolicy: RequireDualStack + mergeGateways: true + + - apply: + cluster: nso-infra + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: GatewayClass + metadata: + name: datum-downstream-gateway-e2e + spec: + controllerName: gateway.envoyproxy.io/gatewayclass-controller + parametersRef: + group: gateway.envoyproxy.io + kind: EnvoyProxy + name: custom-proxy-config + namespace: envoy-gateway-system + + - apply: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Namespace + metadata: + name: datum-downstream-gateway-hostnames + + - name: Create a backend to route to + try: + # Plain HTTP backend on the downstream cluster. The gateway terminates + # TLS with the per-listener certificate and forwards plaintext here. + - create: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Pod + metadata: + name: backend-pod + namespace: default + labels: + purpose: backend-pod + spec: + containers: + - name: backend + image: ghcr.io/mccutchen/go-httpbin:2.18.1 + command: ["/bin/go-httpbin"] + args: ["-host", "0.0.0.0", "-port", "8080"] + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + terminationGracePeriodSeconds: 0 + + - create: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Service + metadata: + name: backend-service + namespace: default + spec: + ports: + - name: http + port: 8080 + targetPort: 8080 + selector: + purpose: backend-pod + + - name: Provision and verify the Domain + try: + - create: + cluster: nso-standard + resource: + apiVersion: networking.datumapis.com/v1alpha + kind: Domain + metadata: + name: test-domain + spec: + domainName: e2e.env.datum.net + + - description: | + Stand in for the Domain controller's HTTP/DNS verification so the + listener hostnames below are claimable. + script: + content: | + kubectl -n $NAMESPACE patch domain test-domain \ + --subresource=status --type=merge \ + -p '{"status":{"conditions":[{"type": "Verified", "status": "True", "reason": "Test", "message": "test", "lastTransitionTime": "2025-02-24T23:59:09Z"}]}}' + + - name: Provision a Gateway with a good and a bad certificate listener + try: + - script: + cluster: nso-standard + skipCommandOutput: true + skipLogOutput: true + content: | + kubectl get ns $NAMESPACE -o json + outputs: + - name: downstreamNamespaceName + value: (join('-', ['ns', json_parse($stdout).metadata.uid])) + + - create: + cluster: nso-standard + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + name: test-gateway + spec: + gatewayClassName: ($gatewayClassName) + listeners: + - protocol: HTTP + port: 80 + name: http + allowedRoutes: + namespaces: + from: Same + hostname: good.e2e.env.datum.net + # Healthy listener: references the working CA issuer. + - protocol: HTTPS + port: 443 + name: https-good + allowedRoutes: + namespaces: + from: Same + hostname: good.e2e.env.datum.net + tls: + mode: Terminate + options: + gateway.networking.datumapis.com/certificate-issuer: ($clusterIssuerName) + # Bad listener: references an issuer that does not exist yet, so + # cert-manager cannot issue the certificate and it stays + # Ready=False. NSO must omit this listener downstream. + - protocol: HTTPS + port: 443 + name: https-bad + allowedRoutes: + namespaces: + from: Same + hostname: bad.e2e.env.datum.net + tls: + mode: Terminate + options: + gateway.networking.datumapis.com/certificate-issuer: ($recoveryIssuerName) + + # An EndpointSlice (upstream) that NSO mirrors into a downstream Service, + # plus an HTTPRoute serving both hostnames. The route binds to whichever + # listeners exist, so it covers the good listener now and the bad listener + # after recovery. + - create: + cluster: nso-standard + resource: + apiVersion: discovery.k8s.io/v1 + kind: EndpointSlice + metadata: + name: test-slice-1 + addressType: FQDN + endpoints: + - addresses: + - backend-service.default.svc.cluster.local + conditions: + ready: true + serving: true + terminating: false + ports: + - name: http + appProtocol: http + port: 8080 + + - create: + cluster: nso-standard + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: test-route + spec: + # NSO forbids spec.hostnames on HTTPRoutes; the route inherits + # each attached listener's hostname, so it serves good now and + # the recovered bad listener later. + parentRefs: + - name: test-gateway + kind: Gateway + rules: + - matches: + - path: + type: PathPrefix + value: / + backendRefs: + - group: discovery.k8s.io + kind: EndpointSlice + name: test-slice-1 + port: 8080 + + # Wait until the operator has published the canonical hostname so we know + # the gateway has reconciled at least once. + - assert: + timeout: 120s + cluster: nso-standard + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + name: test-gateway + status: + (addresses[?type == 'Hostname'] | [0]): + type: Hostname + + # The good hostname is sent to the edge; the bad one is held back. Every + # other hostname on the gateway keeps working. + - assert: + timeout: 120s + cluster: nso-infra + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + name: test-gateway + namespace: ($downstreamNamespaceName) + spec: + # good-cert HTTPS listener present with its per-listener secret. + (listeners[?name == 'https-good']): + - port: 443 + protocol: HTTPS + tls: + mode: Terminate + certificateRefs: + - group: "" + kind: Secret + name: test-gateway-https-good + # bad-cert HTTPS listener omitted entirely. + (listeners[?name == 'https-bad'] | length(@)): 0 + # plain HTTP listener (no certificate) is never gated. + (listeners[?name == 'http'] | length(@)): 1 + + # The customer-facing status explains, in plain language, that the bad + # hostname is held back because of its certificate, while the good one + # reports healthy. Listeners are matched by name because the gateway also + # adds its own default listeners. + - assert: + timeout: 120s + cluster: nso-standard + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + name: test-gateway + status: + (listeners[?name == 'https-good'] | [0]): + (conditions[?type == 'Accepted']): + - status: "True" + (conditions[?type == 'Programmed']): + - status: "True" + (conditions[?type == 'ResolvedRefs']): + - status: "True" + (listeners[?name == 'https-bad'] | [0]): + # Config is structurally valid, so Accepted stays True. + (conditions[?type == 'Accepted']): + - status: "True" + # But the certificate isn't usable, so the listener is not + # programmed and its refs don't resolve. + (conditions[?type == 'Programmed']): + - status: "False" + reason: Invalid + (conditions[?type == 'ResolvedRefs']): + - status: "False" + reason: InvalidCertificateRef + message: "We couldn't issue a TLS certificate for bad.e2e.env.datum.net, so HTTPS for this hostname is unavailable. Please confirm the domain points to Datum." + catch: + - script: + cluster: nso-standard + content: | + kubectl -n network-services-operator-system logs -l app.kubernetes.io/name=network-services-operator --tail=200 + kubectl -n $NAMESPACE get gateway test-gateway -o yaml + - script: + cluster: nso-infra + env: + - name: DOWNSTREAM_NAMESPACE + value: ($downstreamNamespaceName) + content: | + kubectl -n $DOWNSTREAM_NAMESPACE get gateway test-gateway -o yaml + kubectl -n $DOWNSTREAM_NAMESPACE get httproute -o yaml + kubectl -n $DOWNSTREAM_NAMESPACE get certificates -o yaml + + - name: Confirm Envoy programmed and serves the good listener + try: + - script: + cluster: nso-standard + skipCommandOutput: true + skipLogOutput: true + content: | + kubectl get ns $NAMESPACE -o json + outputs: + - name: downstreamNamespaceName + value: (join('-', ['ns', json_parse($stdout).metadata.uid])) + + # This status is set by the edge gateway itself, so Programmed=True means + # the edge accepted the configuration and is ready to serve it. + - assert: + timeout: 180s + cluster: nso-infra + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + name: test-gateway + namespace: ($downstreamNamespaceName) + status: + (listeners[?name == 'https-good'] | [0]): + (conditions[?type == 'Programmed']): + - status: "True" + + # The HTTPRoute is accepted and its refs resolve on the downstream + # gateway (the data-plane route is in place). + - assert: + timeout: 120s + cluster: nso-infra + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: test-route + namespace: ($downstreamNamespaceName) + status: + parents: + - (conditions[?type == 'Accepted']): + - status: "True" + (conditions[?type == 'ResolvedRefs']): + - status: "True" + + - assert: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Pod + metadata: + name: backend-pod + namespace: default + status: + phase: Running + + # A long-lived client pod with curl preinstalled. + - create: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Pod + metadata: + name: conntest + namespace: default + spec: + containers: + - name: curl + image: curlimages/curl:8.11.1 + command: ["sleep", "infinity"] + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + terminationGracePeriodSeconds: 0 + + - assert: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Pod + metadata: + name: conntest + namespace: default + status: + phase: Running + + # A real HTTPS request to the good hostname must return 200, and the bad + # hostname must not be served. Retries give the edge a moment to pick up + # the configuration. + # Explicit timeout: the chainsaw default for script steps is 5s, too short + # for the retry loop below. + - script: + timeout: 240s + cluster: nso-infra + content: | + set -eu + IP=$(kubectl get svc -n envoy-gateway-system \ + -l gateway.envoyproxy.io/owning-gatewayclass=datum-downstream-gateway-e2e \ + -o jsonpath='{.items[0].spec.clusterIP}') + echo "gateway service clusterIP: ${IP}" + + echo "--- good hostname must serve 200 ---" + ok="" + for i in $(seq 1 24); do + code=$(kubectl -n default exec conntest -- \ + curl -ksS -o /dev/null -w '%{http_code}' --max-time 10 \ + --resolve good.e2e.env.datum.net:443:"${IP}" \ + https://good.e2e.env.datum.net/status/200 || true) + echo "attempt ${i}: good -> ${code}" + if [ "${code}" = "200" ]; then ok="yes"; break; fi + sleep 5 + done + if [ -z "${ok}" ]; then + echo "ERROR: good hostname never returned 200"; exit 1 + fi + + echo "--- bad hostname must not serve (no matching listener) ---" + code=$(kubectl -n default exec conntest -- \ + curl -ksS -o /dev/null -w '%{http_code}' --max-time 10 \ + --resolve bad.e2e.env.datum.net:443:"${IP}" \ + https://bad.e2e.env.datum.net/status/200 || true) + echo "bad -> ${code}" + if [ "${code}" = "200" ]; then + echo "ERROR: bad hostname was served despite an invalid certificate"; exit 1 + fi + echo "OK: good served, bad not served" + catch: + - script: + cluster: nso-infra + env: + - name: DOWNSTREAM_NAMESPACE + value: ($downstreamNamespaceName) + content: | + kubectl -n $DOWNSTREAM_NAMESPACE get gateway test-gateway -o yaml + kubectl -n $DOWNSTREAM_NAMESPACE get httproute -o yaml + kubectl get svc -n envoy-gateway-system -o wide + kubectl -n envoy-gateway-system logs -l gateway.envoyproxy.io/owning-gatewayclass=datum-downstream-gateway-e2e -c envoy --tail=100 + + - name: Recover the bad listener by creating its issuer + try: + - script: + cluster: nso-standard + skipCommandOutput: true + skipLogOutput: true + content: | + kubectl get ns $NAMESPACE -o json + outputs: + - name: downstreamNamespaceName + value: (join('-', ['ns', json_parse($stdout).metadata.uid])) + + # The previously-missing issuer now exists and can sign, using the same + # CA. cert-manager issues the listener's certificate, the gate sees it + # Ready, and the listener is propagated. + - create: + cluster: nso-infra + resource: + apiVersion: cert-manager.io/v1 + kind: ClusterIssuer + metadata: + name: ($recoveryIssuerName) + spec: + ca: + secretName: ($clusterIssuerName) + + # The recovered hostname is sent to the edge once its certificate issues. + - assert: + timeout: 180s + cluster: nso-infra + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + name: test-gateway + namespace: ($downstreamNamespaceName) + spec: + (listeners[?name == 'https-bad']): + - port: 443 + protocol: HTTPS + tls: + mode: Terminate + certificateRefs: + - group: "" + kind: Secret + name: test-gateway-https-bad + + # The recovered hostname reports healthy to the customer. + - assert: + timeout: 120s + cluster: nso-standard + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + name: test-gateway + status: + (listeners[?name == 'https-bad'] | [0]): + (conditions[?type == 'Accepted']): + - status: "True" + (conditions[?type == 'Programmed']): + - status: "True" + (conditions[?type == 'ResolvedRefs']): + - status: "True" + + # The recovered hostname now actually serves 200. + - script: + timeout: 240s + cluster: nso-infra + content: | + set -eu + IP=$(kubectl get svc -n envoy-gateway-system \ + -l gateway.envoyproxy.io/owning-gatewayclass=datum-downstream-gateway-e2e \ + -o jsonpath='{.items[0].spec.clusterIP}') + echo "--- recovered hostname must now serve 200 ---" + ok="" + for i in $(seq 1 24); do + code=$(kubectl -n default exec conntest -- \ + curl -ksS -o /dev/null -w '%{http_code}' --max-time 10 \ + --resolve bad.e2e.env.datum.net:443:"${IP}" \ + https://bad.e2e.env.datum.net/status/200 || true) + echo "attempt ${i}: recovered bad -> ${code}" + if [ "${code}" = "200" ]; then ok="yes"; break; fi + sleep 5 + done + if [ -z "${ok}" ]; then + echo "ERROR: recovered hostname never returned 200"; exit 1 + fi + echo "OK: recovered hostname serves 200" + catch: + - script: + cluster: nso-infra + env: + - name: DOWNSTREAM_NAMESPACE + value: ($downstreamNamespaceName) + content: | + kubectl -n $DOWNSTREAM_NAMESPACE get gateway test-gateway -o yaml + kubectl -n $DOWNSTREAM_NAMESPACE get certificates -o yaml + kubectl -n network-services-operator-system logs -l app.kubernetes.io/name=network-services-operator --tail=200 diff --git a/test/prometheus-rules/gateways/cert-health-rules.yaml b/test/prometheus-rules/gateways/cert-health-rules.yaml new file mode 100644 index 00000000..ffd109da --- /dev/null +++ b/test/prometheus-rules/gateways/cert-health-rules.yaml @@ -0,0 +1,61 @@ +groups: +- name: nso-tls-cert-health + interval: 30s + rules: + # Fires when NSO has withheld a Gateway listener because its TLS certificate + # is unusable. The customer's HTTPS hostname is dark until the cert recovers. + # If EnvoyListenerUpdateRejected is also firing without this alert, NSO's + # cert gating has regressed and a bad cert reached Envoy directly. + - alert: GatewayListenerCertUnusable + expr: | + nso_gateway_listener_cert_withheld == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Gateway listener {{ $labels.namespace }}/{{ $labels.name }}/{{ $labels.listener }} has an unusable TLS certificate" + description: "NSO has withheld listener {{ $labels.listener }} (hostname {{ $labels.hostname }}) on Gateway {{ $labels.name }} in namespace {{ $labels.namespace }} because its TLS certificate is unusable (reason: {{ $labels.reason }}). The customer cannot serve HTTPS on this hostname. Check the cert-manager Certificate and Secret in the downstream cluster." + + # Fires when a managed TLS certificate is within 7 days of expiry while it + # is still healthy. cert-manager renews automatically, but renewal fails if + # the domain's DNS no longer points to Datum. Acting here avoids a future + # GatewayListenerCertUnusable alert. + - alert: GatewayListenerCertExpiringSoon + expr: | + (nso_gateway_listener_cert_expiry_time - time()) / 86400 < 7 + for: 1h + labels: + severity: warning + annotations: + summary: "TLS certificate for Gateway listener {{ $labels.namespace }}/{{ $labels.name }}/{{ $labels.listener }} expires in less than 7 days" + description: "The cert-manager Certificate for listener {{ $labels.listener }} (hostname {{ $labels.hostname }}, secret {{ $labels.secret }}) on Gateway {{ $labels.name }} in namespace {{ $labels.namespace }} expires within 7 days. cert-manager should renew it automatically, but renewal fails if the domain's DNS no longer points to Datum. Verify the Certificate is Ready=True in the downstream cluster." + + # Fires when the extension server's data-plane backstop is actively dropping + # broken TLS filter chains from PostTranslateModify responses. This is normal + # during the window between cert failure and controller-side gating catching + # up. If only this alert fires and GatewayListenerCertUnusable does not, the + # controller may have missed the bad listener — investigate. + - alert: TLSBackstopPruningChains + expr: | + nso_extension_tls_pruned_chains_active > 0 + for: 2m + labels: + severity: warning + annotations: + summary: "Extension server TLS backstop is actively pruning {{ $value }} broken filter chain(s)" + description: "The extension server's TLS backstop is dropping {{ $value }} broken TLS filter chain(s) from PostTranslateModify responses. Check extension server logs for 'pruned invalid TLS chains' to identify affected hostnames. If GatewayListenerCertUnusable is also firing, the two-layer defense is working as expected. If only this alert fires, the controller-side cert gating may have missed the listener." + + # Fires when the extension server's backstop could not save a listener because + # every TLS filter chain on it is broken. The backstop never empties a listener, + # so Envoy will NACK the full LDS update — see EnvoyListenerUpdateRejected in + # the infra alerts for the confirming signal. This means NSO's controller-side + # cert gating did not suppress the listener before it reached the data plane. + - alert: TLSBackstopListenerAllCertsBroken + expr: | + nso_extension_tls_listeners_left_intact_active > 0 + for: 2m + labels: + severity: critical + annotations: + summary: "{{ $value }} Envoy listener(s) have all TLS certificates broken — backstop cannot protect them" + description: "The extension server left {{ $value }} Envoy listener(s) completely intact because every TLS filter chain on them referenced a broken certificate. The backstop never empties a listener, so Envoy will reject the LDS update for those listeners (see EnvoyListenerUpdateRejected). This means NSO's controller-side cert gating did not suppress the listener before it reached the data plane. Check extension server logs for 'listeners_left_intact' and investigate why evaluateListenerCertHealth did not catch the listener." diff --git a/test/prometheus-rules/gateways/cert-health-tests.yaml b/test/prometheus-rules/gateways/cert-health-tests.yaml new file mode 100644 index 00000000..391ee3ae --- /dev/null +++ b/test/prometheus-rules/gateways/cert-health-tests.yaml @@ -0,0 +1,147 @@ +rule_files: + - cert-health-rules.yaml + +evaluation_interval: 30s + +tests: + # GatewayListenerCertUnusable — listener withheld for more than 5 minutes + - interval: 1m + input_series: + - series: 'nso_gateway_listener_cert_withheld{namespace="customer-ns", name="my-gateway", listener="https-custom", hostname="example.com", reason="InvalidCertificateRef"}' + values: '1+0x5' + alert_rule_test: + - eval_time: 6m + alertname: GatewayListenerCertUnusable + exp_alerts: + - exp_labels: + severity: warning + namespace: customer-ns + name: my-gateway + listener: https-custom + hostname: example.com + reason: InvalidCertificateRef + exp_annotations: + summary: "Gateway listener customer-ns/my-gateway/https-custom has an unusable TLS certificate" + description: "NSO has withheld listener https-custom (hostname example.com) on Gateway my-gateway in namespace customer-ns because its TLS certificate is unusable (reason: InvalidCertificateRef). The customer cannot serve HTTPS on this hostname. Check the cert-manager Certificate and Secret in the downstream cluster." + + # GatewayListenerCertUnusable — listener withheld for less than 5 minutes (should NOT alert) + - interval: 1m + input_series: + - series: 'nso_gateway_listener_cert_withheld{namespace="customer-ns", name="my-gateway", listener="https-custom", hostname="example.com", reason="InvalidCertificateRef"}' + values: '1+0x2' + alert_rule_test: + - eval_time: 3m + alertname: GatewayListenerCertUnusable + exp_alerts: [] + + # GatewayListenerCertUnusable — listener is healthy (withheld=0, should NOT alert) + - interval: 1m + input_series: + - series: 'nso_gateway_listener_cert_withheld{namespace="customer-ns", name="healthy-gateway", listener="https-custom", hostname="ok.com", reason="InvalidCertificateRef"}' + values: '0+0x5' + alert_rule_test: + - eval_time: 6m + alertname: GatewayListenerCertUnusable + exp_alerts: [] + + # GatewayListenerCertExpiringSoon — cert expires in 5 days (within 7-day threshold) + # At eval_time 62m (3720s), expiry_time = 3720 + 5*86400 = 435720 + - interval: 1m + input_series: + - series: 'nso_gateway_listener_cert_expiry_time{namespace="customer-ns", name="my-gateway", listener="https-custom", hostname="example.com", secret="my-gateway-https-custom"}' + values: '435720+0x62' + alert_rule_test: + - eval_time: 62m + alertname: GatewayListenerCertExpiringSoon + exp_alerts: + - exp_labels: + severity: warning + namespace: customer-ns + name: my-gateway + listener: https-custom + hostname: example.com + secret: my-gateway-https-custom + exp_annotations: + summary: "TLS certificate for Gateway listener customer-ns/my-gateway/https-custom expires in less than 7 days" + description: "The cert-manager Certificate for listener https-custom (hostname example.com, secret my-gateway-https-custom) on Gateway my-gateway in namespace customer-ns expires within 7 days. cert-manager should renew it automatically, but renewal fails if the domain's DNS no longer points to Datum. Verify the Certificate is Ready=True in the downstream cluster." + + # GatewayListenerCertExpiringSoon — cert expires in 8 days (outside threshold, should NOT alert) + # At eval_time 62m (3720s), expiry_time = 3720 + 8*86400 = 694920 + - interval: 1m + input_series: + - series: 'nso_gateway_listener_cert_expiry_time{namespace="customer-ns", name="my-gateway", listener="https-custom", hostname="example.com", secret="my-gateway-https-custom"}' + values: '694920+0x62' + alert_rule_test: + - eval_time: 62m + alertname: GatewayListenerCertExpiringSoon + exp_alerts: [] + + # TLSBackstopPruningChains — backstop actively dropping chains for more than 2 minutes + - interval: 1m + input_series: + - series: 'nso_extension_tls_pruned_chains_active' + values: '3+0x2' + alert_rule_test: + - eval_time: 3m + alertname: TLSBackstopPruningChains + exp_alerts: + - exp_labels: + severity: warning + exp_annotations: + summary: "Extension server TLS backstop is actively pruning 3 broken filter chain(s)" + description: "The extension server's TLS backstop is dropping 3 broken TLS filter chain(s) from PostTranslateModify responses. Check extension server logs for 'pruned invalid TLS chains' to identify affected hostnames. If GatewayListenerCertUnusable is also firing, the two-layer defense is working as expected. If only this alert fires, the controller-side cert gating may have missed the listener." + + # TLSBackstopPruningChains — backstop just started, under 2 minutes (should NOT alert) + - interval: 1m + input_series: + - series: 'nso_extension_tls_pruned_chains_active' + values: '2+0x0' + alert_rule_test: + - eval_time: 1m + alertname: TLSBackstopPruningChains + exp_alerts: [] + + # TLSBackstopPruningChains — no chains pruned (should NOT alert) + - interval: 1m + input_series: + - series: 'nso_extension_tls_pruned_chains_active' + values: '0+0x2' + alert_rule_test: + - eval_time: 3m + alertname: TLSBackstopPruningChains + exp_alerts: [] + + # TLSBackstopListenerAllCertsBroken — all certs broken for more than 2 minutes (critical) + - interval: 1m + input_series: + - series: 'nso_extension_tls_listeners_left_intact_active' + values: '1+0x2' + alert_rule_test: + - eval_time: 3m + alertname: TLSBackstopListenerAllCertsBroken + exp_alerts: + - exp_labels: + severity: critical + exp_annotations: + summary: "1 Envoy listener(s) have all TLS certificates broken — backstop cannot protect them" + description: "The extension server left 1 Envoy listener(s) completely intact because every TLS filter chain on them referenced a broken certificate. The backstop never empties a listener, so Envoy will reject the LDS update for those listeners (see EnvoyListenerUpdateRejected). This means NSO's controller-side cert gating did not suppress the listener before it reached the data plane. Check extension server logs for 'listeners_left_intact' and investigate why evaluateListenerCertHealth did not catch the listener." + + # TLSBackstopListenerAllCertsBroken — condition just started, under 2 minutes (should NOT alert) + - interval: 1m + input_series: + - series: 'nso_extension_tls_listeners_left_intact_active' + values: '2+0x0' + alert_rule_test: + - eval_time: 1m + alertname: TLSBackstopListenerAllCertsBroken + exp_alerts: [] + + # TLSBackstopListenerAllCertsBroken — no intact listeners (should NOT alert) + - interval: 1m + input_series: + - series: 'nso_extension_tls_listeners_left_intact_active' + values: '0+0x2' + alert_rule_test: + - eval_time: 3m + alertname: TLSBackstopListenerAllCertsBroken + exp_alerts: []