diff --git a/Makefile b/Makefile index fb65611d..c042168f 100644 --- a/Makefile +++ b/Makefile @@ -81,7 +81,8 @@ test-e2e: chainsaw } $(KIND) get kubeconfig --name nso-standard > $(TMPDIR)/.kind-nso-standard.yaml $(KIND) get kubeconfig --name nso-infra > $(TMPDIR)/.kind-nso-infra.yaml - $(CHAINSAW) test ./test/e2e \ + $(CHAINSAW) test $(or $(TEST_DIR),./test/e2e) \ + --parallel 1 \ --cluster nso-standard=$(TMPDIR)/.kind-nso-standard.yaml \ --cluster nso-infra=$(TMPDIR)/.kind-nso-infra.yaml @@ -153,7 +154,7 @@ set-image-controller: manifests kustomize cd config/manager && $(KUSTOMIZE) edit set image ghcr.io/datum-cloud/network-services-operator=${IMG} .PHONY: prepare-infra-cluster -prepare-infra-cluster: cert-manager envoy-gateway external-dns downstream-crds +prepare-infra-cluster: cert-manager envoy-gateway external-dns downstream-crds billing-usage-collector load-image-nso-infra extension-server configure-eg-extension-manager .PHONY: downstream-crds downstream-crds: ## Install NSO CRDs on the downstream (infra) cluster that the replicator mirrors into it. @@ -168,12 +169,16 @@ prepare-e2e: chainsaw set-image-controller cert-manager load-image-all deploy-e2 prepare-dev: chainsaw set-image-controller cert-manager install .PHONY: load-image-all -load-image-all: load-image-operator +load-image-all: load-image-operator load-image-nso-infra .PHONY: load-image-operator load-image-operator: docker-build kind $(KIND) load docker-image $(IMG) -n nso-standard +.PHONY: load-image-nso-infra +load-image-nso-infra: docker-build kind ## Load operator image into nso-infra kind cluster (needed by the extension server). + $(KIND) load docker-image $(IMG) -n nso-infra + .PHONY: cert-manager cert-manager: cmctl $(KUSTOMIZE) build --enable-helm config/tools/cert-manager | kubectl apply --server-side=true --force-conflicts -f - @@ -187,6 +192,24 @@ envoy-gateway: external-dns: $(KUSTOMIZE) build --enable-helm config/tools/external-dns | kubectl apply --server-side=true --force-conflicts -f - +.PHONY: billing-usage-collector +billing-usage-collector: + $(KUSTOMIZE) build --enable-helm config/tools/billing-usage-collector | kubectl apply --server-side=true --force-conflicts -f - + +.PHONY: extension-server +extension-server: ## Deploy the NSO extension server to the infra cluster (e2e overlay with cert-manager-issued TLS). + $(KUSTOMIZE) build --enable-helm config/extension-server-e2e | kubectl apply --server-side=true --force-conflicts -f - + kubectl rollout restart deployment/network-services-operator-envoy-gateway-extension-server \ + -n network-services-operator-system + kubectl rollout status deployment/network-services-operator-envoy-gateway-extension-server \ + -n network-services-operator-system --timeout=5m + +.PHONY: configure-eg-extension-manager +configure-eg-extension-manager: ## Patch the EG ConfigMap to enable extensionManager and restart the EG controller. + $(KUSTOMIZE) build --enable-helm config/tools/envoy-gateway/overlays/e2e | kubectl apply --server-side=true --force-conflicts -f - + kubectl rollout restart deployment/envoy-gateway -n envoy-gateway-system + kubectl rollout status deployment/envoy-gateway -n envoy-gateway-system --timeout=3m + .PHONY: kind-standard-cluster kind-standard-cluster: kind $(KIND) create cluster --config=config/tools/kind/standard-cluster.yaml diff --git a/config/dev/downstream_resources/downstream-gateway.yaml b/config/dev/downstream_resources/downstream-gateway.yaml index 29b2ecb8..5654da34 100644 --- a/config/dev/downstream_resources/downstream-gateway.yaml +++ b/config/dev/downstream_resources/downstream-gateway.yaml @@ -96,6 +96,36 @@ spec: disable: false enableVirtualHostStats: true enablePerEndpointStats: true + accessLog: + settings: + - format: + type: JSON + json: + start_time: "%START_TIME%" + method: "%REQ(:METHOD)%" + path: "%REQ(X-ENVOY-ORIGINAL-PATH?:PATH)%" + protocol: "%PROTOCOL%" + response_code: "%RESPONSE_CODE%" + bytes_received: "%BYTES_RECEIVED%" + bytes_sent: "%BYTES_SENT%" + duration: "%DURATION%" + route_name: "%ROUTE_NAME%" + project_name: "%METADATA(ROUTE:datum-gateway:project_name)%" + # Push access logs straight to the node-local Vector agent over OTLP. + # The JSON fields above are delivered as OTLP log-record attributes, + # which Vector's `envoy_access_logs` opentelemetry source parses into + # billing CloudEvents. internalTrafficPolicy: Local on the Vector + # Service keeps this hop on-node. + sinks: + - type: OpenTelemetry + openTelemetry: + host: billing-usage-collector-vector.billing-system.svc.cluster.local + port: 4317 + # OTLP has no custom URL path; identify this stream via an OTel + # resource attribute instead. Surfaces in Vector under + # `.resources` and can be used to route/filter signals. + resources: + service.name: nso-httproute-signals --- apiVersion: gateway.networking.k8s.io/v1 kind: GatewayClass diff --git a/config/extension-server-e2e/client-cert-patch.yaml b/config/extension-server-e2e/client-cert-patch.yaml new file mode 100644 index 00000000..d0cd85f8 --- /dev/null +++ b/config/extension-server-e2e/client-cert-patch.yaml @@ -0,0 +1,12 @@ +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: envoy-gateway-extension-server-eg-client-tls +spec: + issuerRef: + # Switch from the production placeholder ClusterIssuer to the e2e CA Issuer. + # kustomize propagates the namePrefix so this resolves to + # network-services-operator-nso-es-ca-issuer at apply time. + name: nso-es-ca-issuer + kind: Issuer + group: cert-manager.io diff --git a/config/extension-server-e2e/deployment-patch.yaml b/config/extension-server-e2e/deployment-patch.yaml new file mode 100644 index 00000000..8be324ab --- /dev/null +++ b/config/extension-server-e2e/deployment-patch.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: envoy-gateway-extension-server +spec: + # Single-node kind clusters can't satisfy the base DoNotSchedule topology constraint. + replicas: 1 + template: + spec: + # Remove the hostname spread constraint — only 1 node in kind. + topologySpreadConstraints: null + volumes: + # Replace cert-manager CSI driver mount with a regular Secret volume. + # cert-manager Certificate nso-es-tls populates this Secret. + - name: tls + csi: null + secret: + secretName: nso-extension-server-tls + # Replace ConfigMap CA bundle with the same Secret; the ca.crt key holds + # the issuing CA cert that the extension server uses to verify EG's client cert. + - name: tls-ca + configMap: null + secret: + secretName: nso-extension-server-tls + items: + - key: ca.crt + path: ca.crt + # Extension server operator config — disables Coraza WAF injection + # so standard (non-contrib) Envoy images work in e2e. + - name: server-config + configMap: + name: extension-server-config + containers: + - name: envoy-gateway-extension-server + env: + - name: SERVER_CONFIG + value: /server-config/config.yaml + volumeMounts: + - name: server-config + mountPath: /server-config + readOnly: true diff --git a/config/extension-server-e2e/kustomization.yaml b/config/extension-server-e2e/kustomization.yaml new file mode 100644 index 00000000..001934da --- /dev/null +++ b/config/extension-server-e2e/kustomization.yaml @@ -0,0 +1,28 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: network-services-operator-system +namePrefix: network-services-operator- + +resources: + - namespace.yaml + - ../extension-server + - tls.yaml + - server-config.yaml + +patches: + # Replace CSI + ConfigMap volumes with Secret-based mounts; reduce replicas to 1 for single-node kind. + - path: deployment-patch.yaml + target: + kind: Deployment + name: envoy-gateway-extension-server + # Remove the PDB minAvailable constraint — kind clusters have only 1 node so minAvailable:1 prevents eviction. + - path: pdb-patch.yaml + target: + kind: PodDisruptionBudget + name: envoy-gateway-extension-server-pdb + # Switch the EG client cert issuer from the placeholder to the e2e CA. + - path: client-cert-patch.yaml + target: + kind: Certificate + name: envoy-gateway-extension-server-eg-client-tls diff --git a/config/extension-server-e2e/namespace.yaml b/config/extension-server-e2e/namespace.yaml new file mode 100644 index 00000000..064c2bd4 --- /dev/null +++ b/config/extension-server-e2e/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: network-services-operator-system diff --git a/config/extension-server-e2e/pdb-patch.yaml b/config/extension-server-e2e/pdb-patch.yaml new file mode 100644 index 00000000..97d2b047 --- /dev/null +++ b/config/extension-server-e2e/pdb-patch.yaml @@ -0,0 +1,8 @@ +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: envoy-gateway-extension-server-pdb +spec: + # replicas=1 in e2e; minAvailable:1 would block any eviction. Set to 0 so kind + # cluster teardown / upgrades don't stall. + minAvailable: 0 diff --git a/config/extension-server-e2e/server-config.yaml b/config/extension-server-e2e/server-config.yaml new file mode 100644 index 00000000..94cb9312 --- /dev/null +++ b/config/extension-server-e2e/server-config.yaml @@ -0,0 +1,17 @@ +--- +# ConfigMap holding the extension server operator config for e2e. +# Disables Coraza WAF injection — e2e uses the standard Envoy image which +# does not have the golang filter compiled in, so injecting coraza-waf causes +# Envoy to reject the listener configuration. +apiVersion: v1 +kind: ConfigMap +metadata: + name: extension-server-config + namespace: network-services-operator-system +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: NetworkServicesOperator + gateway: + coraza: + disabled: true diff --git a/config/extension-server-e2e/tls.yaml b/config/extension-server-e2e/tls.yaml new file mode 100644 index 00000000..6a9b0b5d --- /dev/null +++ b/config/extension-server-e2e/tls.yaml @@ -0,0 +1,59 @@ +--- +# Self-signed bootstrap issuer; creates the e2e CA cert below. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: nso-es-selfsigned +spec: + selfSigned: {} +--- +# CA cert. cert-manager places the CA cert+key in Secret nso-extension-server-ca +# (in the same namespace). The CA Issuer below references that Secret. +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nso-es-ca +spec: + isCA: true + commonName: nso-extension-server-ca + secretName: nso-extension-server-ca + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: nso-es-selfsigned + kind: Issuer + group: cert-manager.io +--- +# CA-backed issuer used by both the server cert and the EG client cert below. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: nso-es-ca-issuer +spec: + ca: + secretName: nso-extension-server-ca +--- +# Extension-server TLS cert. cert-manager writes it to Secret nso-extension-server-tls, +# which includes ca.crt = the CA cert — EG reads this field from certificateRef to verify +# the extension server's presented cert. +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nso-es-tls +spec: + secretName: nso-extension-server-tls + dnsNames: + - network-services-operator-envoy-gateway-extension-server.network-services-operator-system.svc + - network-services-operator-envoy-gateway-extension-server.network-services-operator-system.svc.cluster.local + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: nso-es-ca-issuer + kind: Issuer + group: cert-manager.io + usages: + - server auth + - digital signature + - key encipherment diff --git a/config/tools/billing-usage-collector/kustomization.yaml b/config/tools/billing-usage-collector/kustomization.yaml new file mode 100644 index 00000000..fa8ba92f --- /dev/null +++ b/config/tools/billing-usage-collector/kustomization.yaml @@ -0,0 +1,268 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: billing-system + +resources: + - namespace.yaml + +helmCharts: + - name: vector + namespace: billing-system + releaseName: billing-usage-collector + version: 0.34.0 + repo: https://helm.vector.dev + valuesInline: + role: Agent + image: + tag: "0.55.0-alpine" + tolerations: + - operator: Exists + serviceAccount: + create: true + name: billing-usage-collector + env: + - name: BILLING_GATEWAY_URL + value: "http://mock-billing-gateway.billing-system.svc.cluster.local:8080" + customConfig: + data_dir: "/vector-data-dir" + sources: + billing_sdk: + type: http_server + address: "0.0.0.0:9880" + path: "/cloudevents" + decoding: + codec: json + nso_logs: + type: http_server + address: "0.0.0.0:9881" + path: "/logs" + decoding: + codec: json + # Envoy Gateway writes access logs to stdout via its File access log + # sink (path: /dev/stdout), exactly as the production downstream + # gateway EnvoyProxy does. The node-local Vector agent tails the + # container log files through the kubernetes_logs source -- the real, + # file-based ingestion path for live traffic. The JSON access log + # line arrives as a string in `.message`. The `nso_logs` http_server + # above is only used by the e2e test to inject hand-crafted log lines. + envoy_access_logs: + type: kubernetes_logs + # Only tail the Envoy proxy pods; their access logs live on the + # `envoy` container's stdout. Other namespaces/containers are + # ignored so the agent does not parse unrelated cluster logs. + extra_field_selector: "metadata.namespace=envoy-gateway-system" + # Re-scan for new container log files every 2s instead of the 60s + # default so the agent picks up freshly-created Envoy proxy pods + # quickly -- the e2e test provisions a gateway and verifies within a + # short window. + glob_minimum_cooldown_ms: 2000 + transforms: + parse_nso_logs: + type: remap + inputs: + - nso_logs + - envoy_access_logs + source: | + # The kubernetes_logs source (Envoy File sink -> stdout) delivers + # the raw access log line as a string in `.message`; parse it as + # JSON and lift the fields to the top level. The http_server source + # (e2e test injection) already delivers them at the top level. + # Normalize both so the rest of this transform is source-agnostic. + # Lines that are not JSON (e.g. Envoy startup logs) are dropped. + if is_string(.message) { + # `.message` is known to be a string inside this guard, so + # string!() never actually aborts (hence Vector's harmless + # "can't abort infallible function" compile warning). + parsed, err = parse_json(string!(.message)) + if err != null { + abort + } + . = object(parsed) ?? {} + } + + parsed_route, err = parse_regex(.route_name, r'^httproute/(?P[^/]+)/(?P[^/]+)') + if err != null || is_null(parsed_route) { + abort + } + + httproute_namespace = parsed_route.namespace + httproute_name = parsed_route.name + + project_id = replace!(to_string(httproute_namespace), r'^ns-', "") + subject = "projects/" + project_id + + # Human-readable project name injected by the gateway access log + # (x-datum-project-name header). Absent/empty renders as "" or "-". + project_name = "" + if !is_null(.project_name) { + pn = to_string(.project_name) ?? "" + if pn != "-" { + project_name = pn + } + } + + bytes_sent = to_int(.bytes_sent) ?? 0 + bytes_received = to_int(.bytes_received) ?? 0 + duration_ms = to_int(.duration) ?? 0 + + uuid_clean = replace(uuid_v4(), "-", "") + ulid_base = "0" + upcase(slice(uuid_clean, 1, 26) ?? "") + + events = [] + region = "us-east-1" + if !is_null(.region) { + region = .region + } + gateway_name = "test-gateway" + if !is_null(.gateway_name) { + gateway_name = .gateway_name + } + gateway_namespace = "default" + if !is_null(.gateway_namespace) { + gateway_namespace = .gateway_namespace + } + gateway_class = "datum-downstream-gateway" + if !is_null(.gateway_class) { + gateway_class = .gateway_class + } + start_time = .start_time + if is_null(start_time) { + start_time = format_timestamp!(now(), "%Y-%m-%dT%H:%M:%SZ") + } + + req_event = { + "specversion": "1.0", + "id": ulid_base, + "type": "networking.datumapis.com/gateway/requests", + "source": "//networking.datumapis.com/gateway", + "subject": subject, + "time": start_time, + "datacontenttype": "application/json", + "data": { + "value": "1", + "dimensions": { + "region": region, + "gateway": gateway_name, + "gateway_namespace": gateway_namespace, + "gateway_class": gateway_class, + "httproute_name": httproute_name, + "httproute_namespace": httproute_namespace, + "project_name": project_name + } + } + } + events = push(events, req_event) + + if bytes_sent > 0 { + uuid_clean_egress = replace(uuid_v4(), "-", "") + ulid_egress = "0" + upcase(slice(uuid_clean_egress, 1, 26) ?? "") + egress_event = req_event + egress_event.id = ulid_egress + egress_event.type = "networking.datumapis.com/gateway/egress-bytes" + egress_event.data.value = to_string(bytes_sent) + events = push(events, egress_event) + } + + if bytes_received > 0 { + uuid_clean_ingress = replace(uuid_v4(), "-", "") + ulid_ingress = "0" + upcase(slice(uuid_clean_ingress, 1, 26) ?? "") + ingress_event = req_event + ingress_event.id = ulid_ingress + ingress_event.type = "networking.datumapis.com/gateway/ingress-bytes" + ingress_event.data.value = to_string(bytes_received) + events = push(events, ingress_event) + } + + if duration_ms > 0 { + duration_sec = to_float(duration_ms) / 1000.0 + if duration_sec > 0 { + uuid_clean_conn = replace(uuid_v4(), "-", "") + ulid_conn = "0" + upcase(slice(uuid_clean_conn, 1, 26) ?? "") + conn_event = req_event + conn_event.id = ulid_conn + conn_event.type = "networking.datumapis.com/gateway/connection-seconds" + conn_event.data.value = to_string(duration_sec) + events = push(events, conn_event) + } + } + + . = events + clean_cloudevents: + type: remap + inputs: + - billing_sdk + - parse_nso_logs + source: | + del(.timestamp) + del(.host) + del(.source_type) + sinks: + billing_gateway: + type: http + inputs: + - clean_cloudevents + uri: "${BILLING_GATEWAY_URL}/v1/usage/events:batchIngest" + method: post + encoding: + codec: json + batch: + max_events: 100 + timeout_secs: 1 + tls: + verify_certificate: false + buffer: + type: memory + when_full: block + request: + retry_policy: + max_retries: 30 + initial_backoff_secs: 1 + max_backoff_secs: 60 + jitter_mode: full + initContainers: + - name: init-data + image: busybox:1.36 + securityContext: + runAsUser: 0 + runAsNonRoot: false + privileged: true + command: + - sh + - -c + - chown -R 1000:1000 /var/lib/vector + volumeMounts: + - name: data + mountPath: /var/lib/vector + podSecurityContext: + runAsUser: 1000 + # Primary GID 0 (root group) so the kubernetes_logs source can read the + # Envoy container logs: the kubelet writes them as root:root with + # /var/log/pods at 0750 and each 0.log at 0640 -- both group-readable by + # the root group only. We keep runAsUser 1000 / runAsNonRoot: true so the + # process is still a non-root user; only the supplemental group is root. + runAsGroup: 0 + runAsNonRoot: true + securityContext: + allowPrivilegeEscalation: false + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 500m + memory: 256Mi + service: + enabled: true + type: ClusterIP + internalTrafficPolicy: Local + ports: + - name: cloudevents + port: 9880 + protocol: TCP + targetPort: 9880 + - name: nso-logs + port: 9881 + protocol: TCP + targetPort: 9881 + podMonitor: + enabled: true diff --git a/config/tools/billing-usage-collector/namespace.yaml b/config/tools/billing-usage-collector/namespace.yaml new file mode 100644 index 00000000..d53bf36a --- /dev/null +++ b/config/tools/billing-usage-collector/namespace.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: billing-system + labels: + app.kubernetes.io/name: billing + app.kubernetes.io/managed-by: kustomize diff --git a/config/tools/envoy-gateway/overlays/e2e/envoy-gateway-extension-manager-config.yaml b/config/tools/envoy-gateway/overlays/e2e/envoy-gateway-extension-manager-config.yaml new file mode 100644 index 00000000..cc35091c --- /dev/null +++ b/config/tools/envoy-gateway/overlays/e2e/envoy-gateway-extension-manager-config.yaml @@ -0,0 +1,83 @@ +--- +# Patches the Helm-managed envoy-gateway-config ConfigMap to enable the +# extensionManager. Applied after the EG helm chart with --server-side +# --force-conflicts so the extra field is owned by this apply, not helm. +# +# All other fields (extensionApis, runtimeFlags, provider image pins, etc.) are +# reproduced verbatim from the v1.8.1 helm chart so the full ConfigMap value is +# consistent — the data field is a single YAML string and cannot be merged at +# field granularity by server-side apply. +apiVersion: v1 +kind: ConfigMap +metadata: + name: envoy-gateway-config + namespace: envoy-gateway-system +data: + envoy-gateway.yaml: | + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: EnvoyGateway + extensionApis: + enableBackend: true + enableEnvoyPatchPolicy: true + gateway: + controllerName: gateway.envoyproxy.io/gatewayclass-controller + logging: + level: + default: info + provider: + kubernetes: + rateLimitDeployment: + container: + image: docker.io/envoyproxy/ratelimit:ff287602 + patch: + type: StrategicMerge + value: + spec: + template: + spec: + containers: + - imagePullPolicy: IfNotPresent + name: envoy-ratelimit + shutdownManager: + image: docker.io/envoyproxy/gateway:v1.8.1 + type: Kubernetes + runtimeFlags: + enabled: + - XDSNameSchemeV2 + extensionManager: + service: + fqdn: + hostname: network-services-operator-envoy-gateway-extension-server.network-services-operator-system.svc.cluster.local + port: 5005 + tls: + # certificateRef points to the Secret created by the nso-es-tls Certificate. + # EG reads ca.crt from this Secret to verify the extension server's TLS cert. + certificateRef: + name: nso-extension-server-tls + namespace: network-services-operator-system + # EG presents this client cert when dialing the extension server. + clientCertificateRef: + name: envoy-gateway-extension-server-eg-client-tls + namespace: network-services-operator-system + retry: + maxAttempts: 4 + initialBackoff: 100ms + maxBackoff: 1s + backoffMultiplier: + numerator: 200 + retryableStatusCodes: + - UNAVAILABLE + hooks: + xdsTranslator: + post: + - Translation + translation: + listener: + includeAll: true + route: + includeAll: true + cluster: + includeAll: true + secret: + includeAll: true + failOpen: false diff --git a/config/tools/envoy-gateway/overlays/e2e/kustomization.yaml b/config/tools/envoy-gateway/overlays/e2e/kustomization.yaml new file mode 100644 index 00000000..655dcf2e --- /dev/null +++ b/config/tools/envoy-gateway/overlays/e2e/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - envoy-gateway-extension-manager-config.yaml diff --git a/internal/extensionserver/cache/index.go b/internal/extensionserver/cache/index.go index 5074b502..d76e0792 100644 --- a/internal/extensionserver/cache/index.go +++ b/internal/extensionserver/cache/index.go @@ -7,6 +7,7 @@ import ( "net/url" "sort" "strconv" + "strings" corev1 "k8s.io/api/core/v1" apimeta "k8s.io/apimachinery/pkg/api/meta" @@ -30,9 +31,10 @@ import ( // they are prepended to every policy's per-rule directive list. func BuildPolicyIndexFromClient(ctx context.Context, cl client.Client, baseDirectives []string) (*PolicyIndex, error) { idx := &PolicyIndex{ - DStoUS: make(map[string]string), - TPPs: make(map[string][]TPPInfo), - Connectors: make(map[ConnectorKey]ConnectorInfo), + DStoUS: make(map[string]string), + ProjectNames: make(map[string]string), + TPPs: make(map[string][]TPPInfo), + Connectors: make(map[ConnectorKey]ConnectorInfo), } if err := populateFromClient(ctx, cl, idx, baseDirectives); err != nil { return nil, err @@ -67,6 +69,14 @@ func populateFromClient(ctx context.Context, cl client.Client, idx *PolicyIndex, // in filter_metadata, so dsNS == ns.Name. Identity entry. idx.DStoUS[ns.Name] = ns.Name } + + // Derive project name from the upstream cluster name label. The label + // value is "cluster-" (written by mappednamespace.go). In + // single-cluster deployments where no cluster name is configured the + // label is absent and ProjectNames[ns.Name] is left empty. + if clusterLabel := ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel]; clusterLabel != "" { + idx.ProjectNames[ns.Name] = strings.TrimPrefix(clusterLabel, "cluster-") + } } // --- TrafficProtectionPolicies --- diff --git a/internal/extensionserver/cache/index_test.go b/internal/extensionserver/cache/index_test.go index ed59a78d..cdd0abef 100644 --- a/internal/extensionserver/cache/index_test.go +++ b/internal/extensionserver/cache/index_test.go @@ -1153,9 +1153,10 @@ func TestPopulateFromClient_NamespaceNameCollision_LatentRisk(t *testing.T) { // Simulate what BuildPolicyIndex does: call populateFromClient once per cluster. idx := &PolicyIndex{ - DStoUS: make(map[string]string), - TPPs: make(map[string][]TPPInfo), - Connectors: make(map[ConnectorKey]ConnectorInfo), + DStoUS: make(map[string]string), + ProjectNames: make(map[string]string), + TPPs: make(map[string][]TPPInfo), + Connectors: make(map[ConnectorKey]ConnectorInfo), } require.NoError(t, populateFromClient(context.Background(), clA, idx, nil)) require.NoError(t, populateFromClient(context.Background(), clB, idx, nil)) diff --git a/internal/extensionserver/cache/types.go b/internal/extensionserver/cache/types.go index eb0262e2..b76ccf9b 100644 --- a/internal/extensionserver/cache/types.go +++ b/internal/extensionserver/cache/types.go @@ -36,6 +36,16 @@ type PolicyIndex struct { // Values are upstream namespace names; they key into TPPs and Connectors. DStoUS map[string]string + // ProjectNames maps downstream namespace names to the human-readable + // project name for that namespace. Derived from + // meta.datumapis.com/upstream-cluster-name on replica namespaces (value + // format: "cluster-"). Empty string when the label is absent + // (e.g. single-cluster dev with no cluster name configured). + // + // TODO: replace with resourcemanager.miloapis.com/project-name once that + // label is available on project namespaces. + ProjectNames map[string]string + // TPPs maps upstream namespace names to the list of // TrafficProtectionPolicies in that namespace, sorted by creation // timestamp then name to match NSO reconciler precedence order. diff --git a/internal/extensionserver/mutate/tpp.go b/internal/extensionserver/mutate/tpp.go index b876e4ed..8ae17930 100644 --- a/internal/extensionserver/mutate/tpp.go +++ b/internal/extensionserver/mutate/tpp.go @@ -155,20 +155,16 @@ func InjectCorazaListenerFilters(l *listenerv3.Listener, cfg *CorazaConfig) (int // TrafficProtectionPolicy. For each VirtualHost it: // 1. Extracts the EG filter_metadata["envoy-gateway"] gateway resource ref. // 2. Resolves the upstream namespace via idx.DStoUS. -// 3. Finds the governing TPP from idx.TPPs (route-level wins over gateway-level). -// 4. Writes typed_per_filter_config and datum-gateway metadata on governed routes. +// 3. Stamps project_name into datum-gateway route metadata on every NSO-owned route. +// 4. Finds the governing TPP from idx.TPPs (route-level wins over gateway-level). +// 5. Writes typed_per_filter_config and datum-gateway metadata on governed routes. // -// Returns the number of routes mutated. +// Returns the number of routes mutated (WAF-configured routes only). func ApplyTPPRouteConfig( rc *routev3.RouteConfiguration, idx *extcache.PolicyIndex, cfg *CorazaConfig, ) (int, error) { - // When disabled, emit no per-route WAF config — matches the no-op in - // InjectCorazaListenerFilters so the two halves are always consistent. - if cfg.Disabled { - return 0, nil - } mutated := 0 for _, vh := range rc.GetVirtualHosts() { // Extract VH-level EG resource reference; expect Kind=Gateway. @@ -184,12 +180,26 @@ func ApplyTPPRouteConfig( continue } + projectName := idx.ProjectNames[dsNS] tpps := idx.TPPs[upstreamNS] // Gateway-level governing TPP (no SectionName scoping in P1; see design §2.2 C5). gwTPP := findGatewayTPP(tpps, gwName) for _, rt := range vh.GetRoutes() { + // Stamp project_name on every NSO-owned route so the Envoy access log + // can emit it via %METADATA(ROUTE:datum-gateway:project_name)%. + // applyRouteWAFConfig overwrites this entry for TPP-governed routes, + // so it also includes project_name in the metadata it builds. + injectProjectNameMetadata(rt, projectName) + + // WAF per-route config requires the listener-level filter to be present. + // Skip when Coraza is disabled (e.g. standard Envoy image without the + // golang filter) so that project_name is still stamped above. + if cfg.Disabled { + continue + } + // Check for a route-level TPP (HTTPRoute targeting) — takes precedence. _, _, routeName, _ := extractEGResource(rt.GetMetadata()) routeTPP := findRouteTPP(tpps, routeName) @@ -202,7 +212,7 @@ func ApplyTPPRouteConfig( continue } - if err := applyRouteWAFConfig(rt, governing, cfg); err != nil { + if err := applyRouteWAFConfig(rt, governing, projectName, cfg); err != nil { return mutated, fmt.Errorf("apply WAF config to route %q: %w", rt.GetName(), err) } mutated++ @@ -211,10 +221,34 @@ func ApplyTPPRouteConfig( return mutated, nil } +// injectProjectNameMetadata writes project_name into the datum-gateway +// filter_metadata of a route. Called for every NSO-owned route regardless of +// whether a TrafficProtectionPolicy governs it, so that +// %METADATA(ROUTE:datum-gateway:project_name)% is always available in the +// Envoy access log format. +func injectProjectNameMetadata(rt *routev3.Route, projectName string) { + if rt.Metadata == nil { + rt.Metadata = &corev3.Metadata{} + } + if rt.Metadata.FilterMetadata == nil { + rt.Metadata.FilterMetadata = make(map[string]*structpb.Struct) + } + existing := rt.Metadata.FilterMetadata[datumGatewayMetadataKey] + if existing == nil { + s, _ := structpb.NewStruct(map[string]any{"project_name": projectName}) + rt.Metadata.FilterMetadata[datumGatewayMetadataKey] = s + return + } + if existing.Fields == nil { + existing.Fields = make(map[string]*structpb.Value) + } + existing.Fields["project_name"] = structpb.NewStringValue(projectName) +} + // applyRouteWAFConfig writes the datum-gateway filter_metadata and Coraza // typed_per_filter_config onto a single route. -func applyRouteWAFConfig(rt *routev3.Route, tpp *extcache.TPPInfo, cfg *CorazaConfig) error { - meta, err := buildDatumGatewayMetadata(tpp) +func applyRouteWAFConfig(rt *routev3.Route, tpp *extcache.TPPInfo, projectName string, cfg *CorazaConfig) error { + meta, err := buildDatumGatewayMetadata(tpp, projectName) if err != nil { return fmt.Errorf("build datum-gateway metadata: %w", err) } @@ -368,8 +402,9 @@ func corazaPluginConfigAny(directives []string, cfg *CorazaConfig) (*anypb.Any, // buildDatumGatewayMetadata builds the filter_metadata["datum-gateway"] struct // for a governed route. Matches the metadata contract in STATE.md. -func buildDatumGatewayMetadata(tpp *extcache.TPPInfo) (*structpb.Struct, error) { +func buildDatumGatewayMetadata(tpp *extcache.TPPInfo, projectName string) (*structpb.Struct, error) { s, err := structpb.NewStruct(map[string]any{ + "project_name": projectName, egMetaFieldResources: []any{ map[string]any{ egMetaFieldKind: "TrafficProtectionPolicy", diff --git a/internal/extensionserver/mutate/tpp_test.go b/internal/extensionserver/mutate/tpp_test.go index 22183f0a..86fc6489 100644 --- a/internal/extensionserver/mutate/tpp_test.go +++ b/internal/extensionserver/mutate/tpp_test.go @@ -145,9 +145,10 @@ func tppTargetingHTTPRoute(upstreamNS, tppName, routeName string) extcache.TPPIn // ("test-project") are fixed — all callers in this package use those values. func policyIndex(tpps ...extcache.TPPInfo) *extcache.PolicyIndex { return &extcache.PolicyIndex{ - DStoUS: map[string]string{"ns-abc-123": "test-project"}, - TPPs: map[string][]extcache.TPPInfo{"test-project": tpps}, - Connectors: make(map[extcache.ConnectorKey]extcache.ConnectorInfo), + DStoUS: map[string]string{"ns-abc-123": "test-project"}, + ProjectNames: map[string]string{"ns-abc-123": "test-project"}, + TPPs: map[string][]extcache.TPPInfo{"test-project": tpps}, + Connectors: make(map[extcache.ConnectorKey]extcache.ConnectorInfo), } } @@ -342,9 +343,10 @@ func TestApplyTPPRouteConfig_UnknownDSNamespace_Skipped(t *testing.T) { cfg := testCorazaConfig() // idx does NOT contain a mapping for dsNS. idx := &extcache.PolicyIndex{ - DStoUS: map[string]string{"ns-other": "other-project"}, - TPPs: make(map[string][]extcache.TPPInfo), - Connectors: make(map[extcache.ConnectorKey]extcache.ConnectorInfo), + DStoUS: map[string]string{"ns-other": "other-project"}, + ProjectNames: make(map[string]string), + TPPs: make(map[string][]extcache.TPPInfo), + Connectors: make(map[extcache.ConnectorKey]extcache.ConnectorInfo), } vh := buildVHWithGatewayMeta( @@ -486,9 +488,10 @@ func TestInjectCorazaListenerFilters_StaticRouteConfig_Skipped(t *testing.T) { } // TestApplyTPPRouteConfig_Disabled_NoOp verifies that setting -// CorazaConfig.Disabled=true causes ApplyTPPRouteConfig to return 0 mutations -// and leave all routes untouched, even when a governing TPP exists. -func TestApplyTPPRouteConfig_Disabled_NoOp(t *testing.T) { +// CorazaConfig.Disabled=true skips WAF per-route config but still stamps +// project_name into the datum-gateway route metadata so the access log can +// emit it even on standard Envoy images (no golang filter). +func TestApplyTPPRouteConfig_Disabled_StampsProjectName(t *testing.T) { cfg := testCorazaConfig() cfg.Disabled = true @@ -509,13 +512,16 @@ func TestApplyTPPRouteConfig_Disabled_NoOp(t *testing.T) { n, err := ApplyTPPRouteConfig(rc, idx, cfg) require.NoError(t, err) - assert.Equal(t, 0, n, "disabled Coraza must not apply any per-route WAF config") + assert.Equal(t, 0, n, "disabled Coraza must not count any WAF per-route mutations") rt := vh.Routes[0] assert.Nil(t, rt.GetTypedPerFilterConfig(), "route must have no typed_per_filter_config when Coraza is disabled") - assert.Nil(t, rt.GetMetadata().GetFilterMetadata()[datumGatewayMetadataKey], - "route must have no datum-gateway metadata when Coraza is disabled") + // project_name is stamped unconditionally so access logs always carry it. + meta := rt.GetMetadata().GetFilterMetadata()[datumGatewayMetadataKey] + require.NotNil(t, meta, "datum-gateway filter_metadata must be set even when Coraza is disabled") + assert.Equal(t, upstreamNS, meta.GetFields()["project_name"].GetStringValue(), + "project_name must be stamped on NSO-owned routes regardless of Coraza.Disabled") } func TestApplyTPPRouteConfig_RouteLevelTPPWins(t *testing.T) { diff --git a/test/e2e/billing/chainsaw-test.yaml b/test/e2e/billing/chainsaw-test.yaml new file mode 100644 index 00000000..c08bd116 --- /dev/null +++ b/test/e2e/billing/chainsaw-test.yaml @@ -0,0 +1,770 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: billing-http-metering +spec: + bindings: + - name: clusterIssuerName + value: (join('-', ['e2e-billing', $namespace])) + - name: gatewayClassName + value: (join('-', ['e2e-billing', $namespace])) + cluster: nso-standard + steps: + - name: Create CA + try: + - create: + cluster: nso-infra + resource: + apiVersion: cert-manager.io/v1 + kind: ClusterIssuer + metadata: + name: (join('-', [$clusterIssuerName, 'issuer'])) + spec: + selfSigned: {} + + - create: + cluster: nso-infra + resource: + apiVersion: cert-manager.io/v1 + kind: Certificate + metadata: + name: (join('-', [$clusterIssuerName, 'ca'])) + namespace: cert-manager + spec: + isCA: true + commonName: (join('-', [$clusterIssuerName, 'ca'])) + secretName: ($clusterIssuerName) + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: (join('-', [$clusterIssuerName, 'issuer'])) + kind: ClusterIssuer + group: cert-manager.io + + - create: + cluster: nso-infra + resource: + apiVersion: cert-manager.io/v1 + kind: ClusterIssuer + metadata: + name: ($clusterIssuerName) + spec: + ca: + secretName: ($clusterIssuerName) + + - script: + cluster: nso-infra + env: + - name: CLUSTER_ISSUER_NAME + value: ($clusterIssuerName) + content: | + kubectl get secret -n cert-manager $CLUSTER_ISSUER_NAME -o yaml | \ + sed 's/namespace: .*/namespace: envoy-gateway-system/' | \ + kubectl apply -n envoy-gateway-system -f - + + - name: Deploy Mock Billing Gateway + try: + - create: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Service + metadata: + name: mock-billing-gateway + namespace: billing-system + spec: + ports: + - name: http + port: 8080 + targetPort: 8080 + selector: + app: mock-billing-gateway + - create: + cluster: nso-infra + resource: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mock-billing-gateway + namespace: billing-system + spec: + replicas: 1 + selector: + matchLabels: + app: mock-billing-gateway + template: + metadata: + labels: + app: mock-billing-gateway + spec: + containers: + - name: mock-billing-gateway + image: python:3.11-alpine + command: ["python", "-c", "import socket; s=socket.socket(); s.bind(('0.0.0.0', 8080)); s.listen(5); print('Started');\nwhile True:\n conn, addr = s.accept(); req = conn.recv(4096); print(req.decode('utf-8', errors='ignore'), flush=True); conn.sendall(b'HTTP/1.1 200 OK\\r\\nContent-Length: 0\\r\\n\\r\\n'); conn.close()"] + resources: + requests: + cpu: 10m + memory: 16Mi + limits: + cpu: 50m + memory: 32Mi + - assert: + cluster: nso-infra + resource: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mock-billing-gateway + namespace: billing-system + status: + readyReplicas: 1 + + - name: Reset Vector + try: + - script: + timeout: 90s + cluster: nso-infra + content: | + kubectl rollout restart daemonset/billing-usage-collector-vector -n billing-system + kubectl rollout status daemonset/billing-usage-collector-vector -n billing-system --timeout=60s + + - name: Create GatewayClasses + try: + - create: + cluster: nso-standard + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: GatewayClass + metadata: + name: ($gatewayClassName) + spec: + controllerName: gateway.networking.datumapis.com/external-global-proxy-controller + + # Ensure the downstream environment's standard gateway class and config exists + - apply: + cluster: nso-infra + resource: + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: EnvoyProxy + metadata: + name: custom-proxy-config + namespace: envoy-gateway-system + spec: + telemetry: + accessLog: + settings: + - sinks: + # File sink only -- identical to the production + # downstream gateway EnvoyProxy. Envoy writes the JSON + # access log to stdout; the node-local Vector agent + # tails the container log file (kubernetes_logs source) + # and emits the CloudEvents. This exercises the real, + # file-based production ingestion path -- no OTLP push. + # The test also scrapes the same stdout to derive the + # expected metric values (bytes, duration, project_name). + - type: File + file: + path: /dev/stdout + format: + type: JSON + json: + start_time: "%START_TIME%" + method: "%REQ(:METHOD)%" + path: "%REQ(X-ENVOY-ORIGINAL-PATH?:PATH)%" + protocol: "%PROTOCOL%" + response_code: "%RESPONSE_CODE%" + bytes_received: "%BYTES_RECEIVED%" + bytes_sent: "%BYTES_SENT%" + duration: "%DURATION%" + route_name: "%ROUTE_NAME%" + project_name: "%METADATA(ROUTE:datum-gateway:project_name)%" + provider: + type: Kubernetes + kubernetes: + envoyService: + type: ClusterIP + patch: + type: StrategicMerge + value: + spec: + ipFamilyPolicy: RequireDualStack + envoyDeployment: + patch: + type: StrategicMerge + value: + spec: + template: + spec: + containers: + - name: envoy + volumeMounts: + - mountPath: /etc/ssl/certs/ + name: selfsigned-ca + readOnly: true + volumes: + - name: selfsigned-ca + secret: + secretName: ($clusterIssuerName) + items: + - key: ca.crt + path: ca-certificates.crt + mergeGateways: true + + - apply: + cluster: nso-infra + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: GatewayClass + metadata: + name: datum-downstream-gateway-e2e + spec: + controllerName: gateway.envoyproxy.io/gatewayclass-controller + parametersRef: + group: gateway.envoyproxy.io + kind: EnvoyProxy + name: custom-proxy-config + namespace: envoy-gateway-system + + - apply: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Namespace + metadata: + name: datum-downstream-gateway-hostnames + + # Get downstream namespace name by retrieving standard namespace UID + - script: + cluster: nso-standard + content: | + kubectl get ns $NAMESPACE -o json + outputs: + - name: downstreamNamespaceName + value: (join('-', ['ns', json_parse($stdout).metadata.uid])) + + - name: Create namespace on infra cluster + try: + - script: + cluster: nso-infra + content: | + kubectl create namespace $NAMESPACE --dry-run=client -o yaml | kubectl apply -f - + cleanup: + - script: + timeout: 60s + cluster: nso-infra + content: | + kubectl delete namespace $NAMESPACE --ignore-not-found=true + + - name: Create backend pod and service + try: + - create: + cluster: nso-infra + resource: + apiVersion: cert-manager.io/v1 + kind: Certificate + metadata: + name: backend-pod + spec: + dnsNames: + - (join('.', ['backend-service', $namespace, 'svc.cluster.local'])) + issuerRef: + name: ($clusterIssuerName) + kind: ClusterIssuer + secretName: backend-cert + - create: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Pod + metadata: + name: backend-pod + labels: + app: backend + spec: + containers: + - name: backend + image: ghcr.io/mccutchen/go-httpbin:2.18.1 + command: ["/bin/go-httpbin"] + args: + - -host + - "0.0.0.0" + - -port + - "8080" + resources: + requests: + cpu: 10m + memory: 16Mi + limits: + cpu: 50m + memory: 64Mi + - create: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Service + metadata: + name: backend-service + spec: + ports: + - name: http + port: 8080 + targetPort: 8080 + selector: + app: backend + + - name: Provision Domain + try: + - create: + cluster: nso-standard + resource: + apiVersion: networking.datumapis.com/v1alpha + kind: Domain + metadata: + name: test-domain + spec: + domainName: e2e.env.datum.net + - script: + content: | + kubectl -n $NAMESPACE patch domain test-domain \ + --subresource=status --type=merge \ + -p '{"status":{"conditions":[{"type": "Verified", "status": "True", "reason": "Test", "message": "test", "lastTransitionTime": "2025-02-24T23:59:09Z"}]}}' + + - name: Provision Gateway + try: + - script: + cluster: nso-standard + skipCommandOutput: true + skipLogOutput: true + content: | + kubectl get ns $NAMESPACE -o json + outputs: + - name: downstreamNamespaceName + value: (join('-', ['ns', json_parse($stdout).metadata.uid])) + - create: + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + name: test-gateway + spec: + gatewayClassName: ($gatewayClassName) + listeners: + - name: default-http + port: 80 + protocol: HTTP + allowedRoutes: + namespaces: + from: Same + - assert: + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + name: test-gateway + status: + conditions: + - type: Accepted + status: "True" + - type: Programmed + status: "True" + - assert: + cluster: nso-infra + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + name: test-gateway + namespace: ($downstreamNamespaceName) + + - name: Provision HTTPRoute + try: + - script: + cluster: nso-standard + skipCommandOutput: true + skipLogOutput: true + content: | + kubectl get ns $NAMESPACE -o json + outputs: + - name: downstreamNamespaceName + value: (join('-', ['ns', json_parse($stdout).metadata.uid])) + - create: + resource: + apiVersion: discovery.k8s.io/v1 + kind: EndpointSlice + metadata: + name: test-slice-1 + labels: + kubernetes.io/service-name: backend-service + addressType: FQDN + ports: + - name: http + port: 8080 + endpoints: + - addresses: + - (join('.', ['backend-service', $namespace, 'svc.cluster.local'])) + conditions: + ready: true + serving: true + terminating: false + - create: + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: test-route + spec: + parentRefs: + - name: test-gateway + rules: + - matches: + - path: + type: PathPrefix + value: / + backendRefs: + - group: discovery.k8s.io + kind: EndpointSlice + name: test-slice-1 + port: 8080 + - assert: + cluster: nso-infra + resource: + apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: test-route + namespace: ($downstreamNamespaceName) + + - name: Provision Pod to test connectivity + try: + - script: + cluster: nso-standard + skipCommandOutput: true + skipLogOutput: true + content: | + kubectl get ns $NAMESPACE -o json + outputs: + - name: downstreamNamespaceName + value: (join('-', ['ns', json_parse($stdout).metadata.uid])) + - script: + timeout: 45s + skipCommandOutput: true + skipLogOutput: true + cluster: nso-infra + content: | + # Wait up to 10s for the envoy service to be created + for i in $(seq 1 10); do + SVC=$(kubectl get svc -n envoy-gateway-system -l gateway.envoyproxy.io/owning-gatewayclass=datum-downstream-gateway-e2e -o json || true) + if [ ! -z "$SVC" ] && echo "$SVC" | grep -q "metadata"; then + echo "$SVC" + exit 0 + fi + sleep 1 + done + echo "ERROR: Envoy service not found for namespace ($downstreamNamespaceName)" >&2 + exit 1 + outputs: + - name: gatewayService + value: (json_parse($stdout).items | [0]) + + - create: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Pod + metadata: + name: test-pod + spec: + containers: + - name: test-pod + image: alpine:3.19 + command: ["sleep", "3600"] + env: + - name: GATEWAY_SERVICE_NAME + value: ($gatewayService.metadata.name) + - name: GATEWAY_SERVICE_NAMESPACE + value: envoy-gateway-system + lifecycle: + postStart: + exec: + command: ["apk", "add", "curl"] + terminationGracePeriodSeconds: 0 + + - assert: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Pod + metadata: + name: backend-pod + status: + phase: Running + + - assert: + cluster: nso-infra + resource: + apiVersion: v1 + kind: Pod + metadata: + name: test-pod + status: + phase: Running + + - sleep: + duration: 10s + + # Get primary hostname on nso-standard + - script: + cluster: nso-standard + content: | + PRIMARY_HOSTNAME=$(kubectl -n $NAMESPACE get gateway test-gateway -o jsonpath='{.status.addresses[0].value}' || true) + if [ -z "$PRIMARY_HOSTNAME" ]; then + PRIMARY_HOSTNAME="test-domain.prism.e2e.env.datum.net" + fi + printf "%s" "$PRIMARY_HOSTNAME" + outputs: + - name: primaryHostname + value: ($stdout) + + # Send request with a known request/response size and capture it on nso-infra + - script: + cluster: nso-infra + env: + - name: STANDARD_NAMESPACE + value: ($namespace) + - name: PRIMARY_HOSTNAME + value: ($primaryHostname) + content: | + # Send request with 11 bytes of request body ("hello world") to /delay/2 to ensure duration >= 2s + kubectl -n $STANDARD_NAMESPACE exec -i test-pod -- sh -c " \ + set -x; \ + curl -kvf -H \"Host: ${PRIMARY_HOSTNAME}\" -d 'hello world' http://\${GATEWAY_SERVICE_NAME}.\${GATEWAY_SERVICE_NAMESPACE}.svc.cluster.local/delay/2; \ + " + + # Verify Vector emitted the CloudEvents. Envoy wrote the access log to + # stdout via its File sink when the request above was served, and the + # node-local Vector agent tailed the container log file (kubernetes_logs + # source) automatically, so this step only derives the expected values + # and polls the mock billing gateway for the result. + - script: + timeout: 45s + cluster: nso-infra + env: + - name: DOWNSTREAM_NAMESPACE + value: ($downstreamNamespaceName) + content: | + # ===================================================================== + # Validates the full HTTP metering pipeline, end to end: + # + # Envoy access log --(File sink -> stdout)--> container log file + # | | + # | (captured here for | (Vector agent + # | expected values) | kubernetes_logs + # | | source tails it, + # | | VRL transform fans + # v v it into CloudEvents) + # captured here (STEP 1-2) mock-billing-gateway (STEP 4) + # + # Envoy writes the access log to stdout (File sink on + # custom-proxy-config, path: /dev/stdout) when the request is + # served; the node-local Vector agent tails the container log file + # automatically -- the test does NOT forward it manually. This is + # the same file-based ingestion path as the production gateway. + # + # A single access log line is expected to produce FOUR CloudEvents, + # one per usage dimension, all sharing the same subject + # ("projects/"): + # + # type value + # ------------------------------------------------ ------------------- + # networking.datumapis.com/gateway/requests "1" + # networking.datumapis.com/gateway/ingress-bytes bytes_received ("11") + # networking.datumapis.com/gateway/egress-bytes bytes_sent + # networking.datumapis.com/gateway/connection-seconds duration_ms / 1000 + # + # Every CloudEvent also carries a "dimensions" object that must include + # the project_name (stamped by the extension server onto route + # filter_metadata["datum-gateway"]["project_name"] at xDS build time + # and surfaced in the access log via + # %METADATA(ROUTE:datum-gateway:project_name)%). STEP 4 asserts both + # the per-type values AND that the project_name dimension is correct. + # ===================================================================== + + sleep 3 + # --- STEP 1: capture the Envoy access log line for our HTTPRoute ----- + # The route_name embeds the downstream namespace, so grepping for + # 'httproute//' isolates this test's request. + ENVOY_LOG=$(kubectl -n envoy-gateway-system logs -l gateway.envoyproxy.io/owning-gatewayclass=datum-downstream-gateway-e2e -c envoy --tail=500 | grep "httproute/$DOWNSTREAM_NAMESPACE/" | tail -n 1 || true) + if [ -z "$ENVOY_LOG" ]; then + echo "ERROR: No Envoy access log found containing 'httproute/'" >&2 + exit 1 + fi + echo "Found Envoy access log: $ENVOY_LOG" + + # --- STEP 2: derive the expected CloudEvent values from that log ---- + # These are the source-of-truth values Vector should reproduce in the + # CloudEvents it emits. They are parsed from the captured log rather + # than hardcoded so the assertions track the real request. + BYTES_SENT=$(python3 -c "import sys, json; print(json.loads(sys.argv[1])['bytes_sent'])" "$ENVOY_LOG") + DURATION_MS=$(python3 -c "import sys, json; print(json.loads(sys.argv[1])['duration'])" "$ENVOY_LOG") + EXPECTED_DURATION=$(( DURATION_MS / 1000 )) + # project_name is stamped by the extension server onto route + # filter_metadata["datum-gateway"]["project_name"] at xDS build + # time and read by Envoy via %METADATA(ROUTE:datum-gateway:project_name)%. + # A "-" or empty value means the extension server did not inject the + # metadata (extension server not running, or %METADATA()% not + # supported in this EG version for JSON access logs). + # json null → Python None → print gives "None"; coerce to "" explicitly. + PROJECT_NAME=$(python3 -c "import sys, json; v=json.loads(sys.argv[1]).get('project_name'); print('' if v is None else v)" "$ENVOY_LOG") + if [ -z "$PROJECT_NAME" ] || [ "$PROJECT_NAME" = "-" ]; then + echo "ERROR: Envoy access log has no project_name; extension server did not inject datum-gateway route metadata, or %METADATA(ROUTE:...)% is not supported in this EG version for JSON access logs. Log: $ENVOY_LOG" >&2 + exit 1 + fi + + echo "Parsed Envoy Log metrics:" + echo " - Ingress bytes (expected): 11" + echo " - Egress bytes (expected): $BYTES_SENT" + echo " - Duration ms (expected): $DURATION_MS ($EXPECTED_DURATION seconds)" + echo " - Project name (expected): $PROJECT_NAME" + + # --- STEP 3: (no manual forwarding) -------------------------------- + # The request above already caused Envoy to write this access log to + # stdout via its File sink (custom-proxy-config, path: /dev/stdout). + # The node-local Vector agent tails the Envoy container log file via + # its kubernetes_logs source, and Vector's VRL transform parses it + # and emits the four CloudEvents to the mock-billing-gateway sink, so + # there is nothing to POST here -- we just poll for the result. + + # --- STEP 4: verify the CloudEvents the mock billing gateway received + # Poll the mock gateway's request log (Vector may take a moment to + # flush) and assert that, for our subject, all four event types are + # present with the expected values AND the correct project_name + # dimension. Give up after the deadline and dump diagnostics. + DEADLINE=$(($(date +%s) + 30)) + while [ "$(date +%s)" -lt "$DEADLINE" ]; do + MOCK_LOGS=$(kubectl logs -n billing-system -l app=mock-billing-gateway --tail=100 || true) + + # Define Python verification script as a bash variable (indented safely in YAML) + PYTHON_VERIFY=" + import sys, json + expected_bytes_sent = float($BYTES_SENT) + expected_duration_sec = float($DURATION_MS) / 1000.0 + expected_project_name = \"$PROJECT_NAME\" + logs = sys.stdin.read() + diagnostics = [] + def check_events(events): + # Returns True once a single subject has the full, correct set + # of CloudEvents for our request. Records why each subject was + # rejected in 'diagnostics' for failure output. + diagnostics.clear() + # Group events by subject (projects/); a valid run has all + # four event types under one subject. + by_subject = {} + for e in events: + subj = e.get('subject', '') + by_subject.setdefault(subj, []).append(e) + if not by_subject: + diagnostics.append(\"No events found in this log chunk.\") + return False + for subj, subj_events in by_subject.items(): + # Index this subject's events by type so we can assert each + # of the four expected usage events exists exactly once. + types = {e.get('type'): e for e in subj_events} + req = types.get('networking.datumapis.com/gateway/requests') + ingress = types.get('networking.datumapis.com/gateway/ingress-bytes') + egress = types.get('networking.datumapis.com/gateway/egress-bytes') + conn = types.get('networking.datumapis.com/gateway/connection-seconds') + missing = [] + if not req: missing.append('requests') + if not ingress: missing.append('ingress-bytes') + if not egress: missing.append('egress-bytes') + if not conn: missing.append('connection-seconds') + if missing: + diagnostics.append(f\"Subject '{subj}': missing event types {missing} (found: {list(types.keys())})\") + continue + r_val = req['data'].get('value') + i_val = ingress['data'].get('value') + e_val = egress['data'].get('value') + c_val = conn['data'].get('value') + if r_val != '1': + diagnostics.append(f\"Subject '{subj}': req value expected '1', got '{r_val}'\") + continue + if i_val != '11': + diagnostics.append(f\"Subject '{subj}': ingress value expected '11', got '{i_val}'\") + continue + try: + e_val_f = float(e_val) + if e_val_f != expected_bytes_sent: + diagnostics.append(f\"Subject '{subj}': egress value expected {expected_bytes_sent}, got {e_val_f}\") + continue + except Exception as ex: + diagnostics.append(f\"Subject '{subj}': egress value '{e_val}' could not be parsed as float: {ex}\") + continue + try: + c_val_f = float(c_val) + diff = abs(c_val_f - expected_duration_sec) + if diff > 0.001: + diagnostics.append(f\"Subject '{subj}': conn value expected {expected_duration_sec} (diff <= 0.001), got {c_val_f} (diff {diff})\") + continue + except Exception as ex: + diagnostics.append(f\"Subject '{subj}': conn value '{c_val}' could not be parsed as float: {ex}\") + continue + # All per-type values matched; finally assert the shared + # project_name dimension (carried on every event; checked + # here on the 'requests' event) resolves to the project. + dims = req['data'].get('dimensions', {}) + p_val = dims.get('project_name') + if p_val != expected_project_name: + diagnostics.append(f\"Subject '{subj}': project_name dimension expected '{expected_project_name}', got '{p_val}'\") + continue + # This subject passed every check. + return True + return False + + # Try JSON array format (e.g. future codec: json_array) + found = False + for line in logs.splitlines(): + stripped = line.strip() + if stripped.startswith('[') and stripped.endswith(']'): + try: + if check_events(json.loads(stripped)): + found = True + break + except Exception: + pass + + # Try NDJSON format (Vector default: one JSON object per line) + if not found: + ndjson_events = [] + for line in logs.splitlines(): + stripped = line.strip() + if stripped.startswith('{') and stripped.endswith('}'): + try: + e = json.loads(stripped) + if 'type' in e and 'data' in e: + ndjson_events.append(e) + except Exception: + pass + if ndjson_events: + found = check_events(ndjson_events) + + if not found: + print(\"Validation Failure Details:\", file=sys.stderr) + for d in diagnostics: + print(f\" - {d}\", file=sys.stderr) + sys.exit(0 if found else 1) + " + + # Verify request count, ingress-bytes, egress-bytes, and connection-seconds CloudEvents using python + if echo "$MOCK_LOGS" | python3 -c "import sys, textwrap; exec(textwrap.dedent(sys.argv[1]))" "$PYTHON_VERIFY"; then + echo "SUCCESS: mock-billing-gateway received correct requests (1), ingress (11), egress ($BYTES_SENT), connection-seconds (equivalent to $DURATION_MS ms) and project_name ($PROJECT_NAME) CloudEvents!" + exit 0 + fi + sleep 2 + done + echo "FAIL: mock-billing-gateway did not receive correct CloudEvent metrics. Logs:" >&2 + kubectl logs -n billing-system -l app=mock-billing-gateway --tail=100 >&2 + exit 1 + +