diff --git a/Taskfile.test-infra.yml b/Taskfile.test-infra.yml new file mode 100644 index 00000000..ae577cf1 --- /dev/null +++ b/Taskfile.test-infra.yml @@ -0,0 +1,419 @@ +version: '3' + +# Production-fidelity test environment. Two kind clusters: nso-upstream (the +# control plane) and nso-downstream (the edge). Consumed by both the e2e and perf +# suites. +# +# Bring up: task test-infra:up +# Tear down: task test-infra:down +# Smoke: task test-infra:smoke + +# kustomize fetches some CRDs over https. A developer's global git config can +# rewrite https to ssh, which fails in a non-interactive fetch. Neutralize the +# global git config for kustomize's child git so the https fetches succeed; the +# repo-local config is untouched. +env: + GIT_CONFIG_GLOBAL: /dev/null + +vars: + TMP_DIR: + sh: echo "${TMPDIR:-/tmp}" + REPO: + sh: pwd + # Pinned-to-prod tooling. We use an explicit kind v0.32.0 binary because it + # ships the node image production runs, which the checked-in bin/kind predates. + KIND: '{{.REPO}}/bin/kind-v0.32.0' + KUSTOMIZE: '{{.REPO}}/bin/kustomize' + CMCTL: '{{.REPO}}/bin/cmctl' + KARMADACTL: '{{.REPO}}/bin/karmadactl' + CHAINSAW: '{{.REPO}}/bin/chainsaw' + + UPSTREAM_CLUSTER: nso-upstream + DOWNSTREAM_CLUSTER: nso-downstream + UPSTREAM_CTX: kind-nso-upstream + DOWNSTREAM_CTX: kind-nso-downstream + + # The exact node image the production edge runs. + K8S_NODE_IMAGE: kindest/node:v1.35.5@sha256:ce977ae6d65918d0b58a5f8b5e940429c2ce42fa3a5619ec2bbc60b949c0ac95 + # Gateway control-plane version, matching production. The extension-server SDK + # version is paired with it deliberately, also matching production. + ENVOY_GATEWAY_VERSION: v1.7.4 + EG_SDK_VERSION: v1.8.1 + # Multi-arch WAF image โ€” the same filter as the production edge, but it also + # loads natively on arm64 dev hosts. The data plane references this image. 
+ CORAZA_WAF_IMAGE: ghcr.io/datum-labs/coraza-envoy-go-filter/coraza-waf:v1.3.0-multiarch.1 + # CONNECT-proxy stand-in for the connector-tunnel scenario, built from + # test/e2e-edge/_fixtures/connector-tunnel and loaded into the downstream cluster. + CONNECT_PROXY_IMAGE: connect-proxy:e2e + # The WAF is enabled by default since the multi-arch image loads on every + # architecture. Set CORAZA_DISABLED=true only to deliberately skip the WAF. + CORAZA_DISABLED: '{{.CORAZA_DISABLED | default "false"}}' + # Karmada-bundled kube-apiserver, pinned to production. + KARMADA_APISERVER_VERSION: v1.33.2 + # NSO image pinned by git SHA so the loaded image is used deterministically. + IMG: + sh: echo "ghcr.io/datum-cloud/network-services-operator:$(git rev-parse --short HEAD)" + +tasks: + + up: + desc: "Bring the canonical two-cluster prod-fidelity test env ONLINE (clusters + EG v1.7.4 + ext-server + Coraza data plane + NSO manager)." + cmds: + - echo "๐Ÿš€ test-infra:up โ€” building nso-upstream + nso-downstream" + - task: clusters + # The Gateway-API and gateway CRDs must land on the downstream cluster + # before cert-manager (it probes the Gateway-API CRDs at startup) and before + # the downstream gateway (it needs its own CRDs present). + - task: eg-crds + - task: downstream-crds + - task: downstream-namespaces + - task: cert-manager-upstream + - task: cert-manager-downstream + - task: nso-image + - task: load-fixtures + - task: prepare-upstream + - task: eg-downstream + - task: extension-server + - task: external-dns + - task: link-clusters + - task: wait-ready + - echo "๐ŸŽ‰ test-infra:up complete. Run 'task test-infra:smoke' to confirm M2." + + down: + desc: "Tear down the test-infra clusters (and any Karmada host)." + cmds: + - echo "๐Ÿ’ฅ test-infra:down" + # Deleting both kind clusters removes all federation state, so this unjoin + # is only a best-effort courtesy. 
+ - '{{.KARMADACTL}} deinit --context {{.UPSTREAM_CTX}} --force --purge-namespace 2>/dev/null || true' + - rm -rf {{.TMP_DIR}}/karmada-{{.UPSTREAM_CLUSTER}} + - '{{.KIND}} delete cluster --name {{.UPSTREAM_CLUSTER}} || true' + - '{{.KIND}} delete cluster --name {{.DOWNSTREAM_CLUSTER}} || true' + # Remove every kubeconfig this Taskfile exports to TMP_DIR: the internal one + # (link-clusters), the host-reachable one (karmada-up), and the per-cluster + # pair (kubeconfigs). + - rm -f {{.TMP_DIR}}/.kind-{{.DOWNSTREAM_CLUSTER}}-internal.yaml {{.TMP_DIR}}/.kind-{{.DOWNSTREAM_CLUSTER}}-host.yaml {{.TMP_DIR}}/.kind-{{.UPSTREAM_CLUSTER}}.yaml {{.TMP_DIR}}/.kind-{{.DOWNSTREAM_CLUSTER}}.yaml + - echo "โœจ done." + + clusters: + desc: "Create the upstream + downstream kind clusters pinned to the prod edge node image." + cmds: + - echo "๐Ÿ—๏ธ creating clusters (node {{.K8S_NODE_IMAGE}})" + - '{{.KIND}} delete cluster --name {{.UPSTREAM_CLUSTER}} 2>/dev/null || true' + - '{{.KIND}} delete cluster --name {{.DOWNSTREAM_CLUSTER}} 2>/dev/null || true' + - '{{.KIND}} create cluster --image {{.K8S_NODE_IMAGE}} --config=config/tools/kind/upstream-cluster.yaml' + - '{{.KIND}} create cluster --image {{.K8S_NODE_IMAGE}} --config=config/tools/kind/downstream-cluster.yaml' + # Running the full federation control plane alongside the rest of the stack + # in one kind node exhausts the default file-watch limits on the macOS + # Docker VM, which crashloops a component with "too many open files" and + # blocks the member join. Raise the limits on both nodes. + - docker exec {{.UPSTREAM_CLUSTER}}-control-plane sysctl -w fs.inotify.max_user_instances=8192 fs.inotify.max_user_watches=1048576 + - docker exec {{.DOWNSTREAM_CLUSTER}}-control-plane sysctl -w fs.inotify.max_user_instances=8192 fs.inotify.max_user_watches=1048576 + + cert-manager-upstream: + desc: "Install cert-manager (+ CSI driver) on the upstream cluster." 
+ cmds: + - '{{.KUSTOMIZE}} build --enable-helm config/tools/cert-manager | kubectl --context {{.UPSTREAM_CTX}} apply --server-side=true --force-conflicts -f -' + - '{{.CMCTL}} check api --context {{.UPSTREAM_CTX}} --wait=5m' + + cert-manager-downstream: + desc: "Install cert-manager (+ CSI driver) on the downstream cluster." + cmds: + - '{{.KUSTOMIZE}} build --enable-helm config/tools/cert-manager | kubectl --context {{.DOWNSTREAM_CTX}} apply --server-side=true --force-conflicts -f -' + - '{{.CMCTL}} check api --context {{.DOWNSTREAM_CTX}} --wait=5m' + + nso-image: + desc: "Build the NSO operator image (git-SHA tag) and load it into both clusters." + cmds: + - echo "๐Ÿ”จ building {{.IMG}}" + - docker build -t {{.IMG}} . + - cd config/manager && {{.KUSTOMIZE}} edit set image ghcr.io/datum-cloud/network-services-operator={{.IMG}} + - '{{.KIND}} load docker-image {{.IMG}} --name {{.UPSTREAM_CLUSTER}}' + - '{{.KIND}} load docker-image {{.IMG}} --name {{.DOWNSTREAM_CLUSTER}}' + + prepare-upstream: + desc: "Deploy the NSO manager + webhook (config/e2e) on the upstream cluster, with the prod-base memory profile." + cmds: + - echo "๐Ÿ”ง deploying NSO manager (upstream)" + - '{{.KUSTOMIZE}} build config/e2e | kubectl --context {{.UPSTREAM_CTX}} apply --server-side=true --force-conflicts -f -' + # Match production: with the extension server handling proxy configuration, + # NSO must not also emit its own patch policies. The shared config file is + # also used by an older path that still relies on them, so we patch the live + # config here rather than editing that file. + - | + CFG=$(kubectl --context {{.UPSTREAM_CTX}} -n network-services-operator-system get cm network-services-operator-config -o jsonpath='{.data.config\.yaml}') + if ! 
echo "$CFG" | grep -q "eppEmissionEnabled"; then + # Insert the key with awk, not sed: BSD sed (macOS hosts) does not expand \n + # in a replacement and would append a literal n to the class name. awk also + # reuses the matched line's own indentation for the inserted key. + NEW=$(echo "$CFG" | awk '{ print } /^ *downstreamGatewayClassName: / { sub(/[^ ].*/, ""); print $0 "eppEmissionEnabled: false" }') + kubectl --context {{.UPSTREAM_CTX}} -n network-services-operator-system create cm network-services-operator-config \ + --from-literal=config.yaml="$NEW" --dry-run=client -o yaml | kubectl --context {{.UPSTREAM_CTX}} apply -f - + fi + # Raise the manager's memory limit to the production base so perf sweeps and + # webhook-under-load reflect production headroom. + - | + kubectl --context {{.UPSTREAM_CTX}} -n network-services-operator-system patch deploy network-services-operator-controller-manager \ + --type=json -p '[{"op":"replace","path":"/spec/template/spec/containers/0/resources/limits/memory","value":"2Gi"},{"op":"replace","path":"/spec/template/spec/containers/0/resources/requests/memory","value":"512Mi"}]' || true + + eg-crds: + desc: "Install the full Gateway-API + Envoy Gateway CRD set (incl ReferenceGrant + EnvoyProxy) on the downstream cluster. The downstream EG chart sets includeCRDs:false, so CRDs are applied separately (matches prod, where CRDs are managed out-of-band)." + vars: + EG_CRD_DIR: config/tools/envoy-gateway-downstream/charts/gateway-helm-{{.ENVOY_GATEWAY_VERSION}}/gateway-helm/crds + cmds: + - kubectl --context {{.DOWNSTREAM_CTX}} apply --server-side=true --force-conflicts -f {{.EG_CRD_DIR}}/gatewayapi-crds.yaml + - kubectl --context {{.DOWNSTREAM_CTX}} apply --server-side=true --force-conflicts -f {{.EG_CRD_DIR}}/generated/ + + downstream-namespaces: + desc: "Create the downstream gateway + hostname-accounting namespaces, plus the EG-watched e2e-direct namespace for hand-delivered fixtures (D1/D2). Runs BEFORE eg-downstream so e2e-direct carries its watch label before EG establishes its informer." 
+ cmds: + - kubectl --context {{.DOWNSTREAM_CTX}} apply -f config/dev/downstream_resources/namespaces.yaml + # e2e-direct carries the gateway watch label from creation so gateways + # applied directly here reconcile deterministically. The gateway only + # reliably watches a namespace labeled at creation time, so the label must + # be in the manifest, not added afterward. + - kubectl --context {{.DOWNSTREAM_CTX}} apply -f config/e2e-downstream/direct-namespace.yaml + + eg-downstream: + desc: "Install the dedicated downstream Envoy Gateway (v1.7.4) with the ext-server extensionManager wiring on the downstream cluster." + cmds: + - echo "๐Ÿ”ง installing downstream EG {{.ENVOY_GATEWAY_VERSION}} (+ extensionManager, failOpen:false, maxMessageSize:256Mi)" + # Build from the untracked config/e2e-downstream/eg-downstream copy, which + # bakes in the e2e pins, so a hard reset on the shared branch cannot revert + # them โ€” it kept clobbering the tracked copy. + - '{{.KUSTOMIZE}} build --enable-helm config/e2e-downstream/eg-downstream | kubectl --context {{.DOWNSTREAM_CTX}} apply --server-side=true --force-conflicts -f -' + + load-coraza-waf: + desc: "Fallback: pull the Coraza WAF image and kind-load it into the downstream cluster (offline-CI escape hatch)." + cmds: + - docker pull {{.CORAZA_WAF_IMAGE}} + - '{{.KIND}} load docker-image {{.CORAZA_WAF_IMAGE}} --name {{.DOWNSTREAM_CLUSTER}}' + + load-fixtures: + desc: "Build + load the e2e fixture images into the downstream cluster (CONNECT-proxy stand-in for the connector-tunnel scenario)." 
+ cmds: + - echo "๐Ÿ”ง building + loading fixture image {{.CONNECT_PROXY_IMAGE}}" + - docker build -t {{.CONNECT_PROXY_IMAGE}} test/e2e-edge/_fixtures/connector-tunnel + - '{{.KIND}} load docker-image {{.CONNECT_PROXY_IMAGE}} --name {{.DOWNSTREAM_CLUSTER}}' + + d1-mint-expired-secret: + desc: "D1 helper: mint an ALREADY-EXPIRED kubernetes.io/tls Secret and apply it to the downstream cluster, bypassing the upstream #212 cert-health gate so the ext-server prune backstop can be tested in isolation. Vars: NAMESPACE, SECRET, HOSTNAME." + vars: + # Default to the pre-provisioned, gateway-watched namespace so a gateway + # applied alongside this secret reconciles deterministically. + NAMESPACE: '{{.NAMESPACE | default "e2e-direct"}}' + SECRET: '{{.SECRET | default "d1-expired-tls"}}' + HOSTNAME: '{{.HOSTNAME | default "d1-bad.e2e.env.datum.net"}}' + cmds: + - | + config/e2e-downstream/d1-cert-bypass/mint-expired-secret.sh \ + "{{.NAMESPACE}}" "{{.SECRET}}" "{{.HOSTNAME}}" \ + | kubectl --context {{.DOWNSTREAM_CTX}} apply -f - + - echo "โœ… applied expired TLS Secret {{.NAMESPACE}}/{{.SECRET}} (host {{.HOSTNAME}}) to {{.DOWNSTREAM_CLUSTER}}" + + extension-server: + desc: "Deploy the ext-server (2 replicas + PDB + mTLS) + e2e cert chain + Coraza/branded-page config + test EnvoyProxy (Coraza+admin:19000) on the downstream cluster." + cmds: + - echo "๐Ÿ”ง deploying ext-server + e2e issuer chain + test EnvoyProxy (config/e2e-downstream)" + # Pin the ext-server image to the git-SHA build (same image as the manager). + - cd config/e2e-downstream && {{.KUSTOMIZE}} edit set image ghcr.io/datum-cloud/network-services-operator={{.IMG}} 2>/dev/null || true + - '{{.KUSTOMIZE}} build config/e2e-downstream | kubectl --context {{.DOWNSTREAM_CTX}} apply --server-side=true --force-conflicts -f -' + # WAF disable toggle. The overlay ships the WAF enabled for production + # fidelity; when CORAZA_DISABLED is set we flip the live config so listeners + # program without it. 
+ - | + if [ "{{.CORAZA_DISABLED}}" = "true" ]; then + echo "โš ๏ธ Coraza WAF disabled (host arch {{OS}}/$(uname -m); WAF .so is amd64-only)" + kubectl --context {{.DOWNSTREAM_CTX}} -n network-services-operator-system get cm extension-server-config -o yaml \ + | sed 's/disabled: false/disabled: true/' | kubectl --context {{.DOWNSTREAM_CTX}} apply -f - + fi + # The extension server's server cert and the gateway's client cert are both + # issued from the e2e certificate authority. Wait for that authority's cert, + # then publish it where each side reads it so they can verify each other. + - task: extserver-ca-bundle + + extserver-ca-bundle: + desc: "Publish the e2e CA's ca.crt into the ext-server CA-bundle ConfigMap and the EG certificateRef Secret (both in network-services-operator-system)." + cmds: + - echo "โณ waiting for the e2e CA certificate to be issued" + - | + kubectl --context {{.DOWNSTREAM_CTX}} -n cert-manager wait certificate e2e-extension-server-ca \ + --for=condition=Ready --timeout=120s + - kubectl --context {{.DOWNSTREAM_CTX}} create namespace network-services-operator-system --dry-run=client -o yaml | kubectl --context {{.DOWNSTREAM_CTX}} apply -f - + - | + CA_CRT=$(kubectl --context {{.DOWNSTREAM_CTX}} -n cert-manager get secret e2e-extension-server-ca -o jsonpath='{.data.ca\.crt}' | base64 -d) + # The certificate-authority bundle the extension server mounts. + kubectl --context {{.DOWNSTREAM_CTX}} -n network-services-operator-system create configmap extension-server-ca-bundle \ + --from-literal=ca.crt="$CA_CRT" --dry-run=client -o yaml | kubectl --context {{.DOWNSTREAM_CTX}} apply -f - + # The same authority cert the downstream gateway references. The gateway + # reads it from the tls.crt key, not ca.crt. 
+ kubectl --context {{.DOWNSTREAM_CTX}} -n network-services-operator-system create secret generic e2e-extension-server-ca \ + --from-literal=tls.crt="$CA_CRT" --dry-run=client -o yaml | kubectl --context {{.DOWNSTREAM_CTX}} apply -f - + # Restart the extension server so it picks up the freshly-published bundle. + - kubectl --context {{.DOWNSTREAM_CTX}} -n network-services-operator-system rollout restart deploy network-services-operator-envoy-gateway-extension-server || true + + downstream-crds: + desc: "Install the NSO CRDs the replicator mirrors into the downstream cluster (the Gateway-API/EG CRDs come from eg-crds)." + cmds: + - kubectl --context {{.DOWNSTREAM_CTX}} apply -f config/crd/bases/networking.datumapis.com_connectors.yaml + - kubectl --context {{.DOWNSTREAM_CTX}} apply -f config/crd/bases/networking.datumapis.com_httpproxies.yaml + - kubectl --context {{.DOWNSTREAM_CTX}} apply -f config/crd/bases/networking.datumapis.com_trafficprotectionpolicies.yaml + + external-dns: + desc: "Install external-dns CRDs (DNSEndpoint) on the downstream cluster." + cmds: + - '{{.KUSTOMIZE}} build --enable-helm config/tools/external-dns | kubectl --context {{.DOWNSTREAM_CTX}} apply --server-side=true --force-conflicts -f -' + + link-clusters: + desc: "Wire the NSO manager's downstream client to the downstream cluster via the downstream-cluster-kubeconfig secret on the upstream cluster." 
+ cmds: + - echo "๐Ÿ”— linking upstream -> downstream" + - '{{.KIND}} get kubeconfig --name {{.DOWNSTREAM_CLUSTER}} --internal > {{.TMP_DIR}}/.kind-{{.DOWNSTREAM_CLUSTER}}-internal.yaml' + - kubectl --context {{.UPSTREAM_CTX}} create namespace network-services-operator-system --dry-run=client -o yaml | kubectl --context {{.UPSTREAM_CTX}} apply -f - + - | + kubectl --context {{.UPSTREAM_CTX}} create secret -n network-services-operator-system \ + generic downstream-cluster-kubeconfig --save-config --dry-run=client -o yaml \ + --from-file=kubeconfig={{.TMP_DIR}}/.kind-{{.DOWNSTREAM_CLUSTER}}-internal.yaml | kubectl --context {{.UPSTREAM_CTX}} apply -f - + + wait-ready: + desc: "Wait for the core components (NSO manager, downstream EG, ext-server) to be Available." + cmds: + - echo "โณ waiting for NSO manager (upstream)" + - kubectl --context {{.UPSTREAM_CTX}} -n network-services-operator-system wait deploy network-services-operator-controller-manager --for=condition=Available --timeout=240s + - echo "โณ waiting for downstream EG" + # The gateway control-plane Deployment is named `envoy-gateway` by its chart, + # not after the release name. + - kubectl --context {{.DOWNSTREAM_CTX}} -n datum-downstream-gateway wait deploy envoy-gateway --for=condition=Available --timeout=240s + - echo "โณ waiting for ext-server" + - kubectl --context {{.DOWNSTREAM_CTX}} -n network-services-operator-system wait deploy network-services-operator-envoy-gateway-extension-server --for=condition=Available --timeout=240s + - echo "โœ… core components ready." + + # ---- Karmada ------------------------------------------------------------ + + karmada-up: + desc: "Stand up a real Karmada host on the upstream cluster (apiserver v1.33.2), join the downstream member, and apply the prod federation artifacts." 
+ vars: + KARMADA_KUBECONFIG: '{{.TMP_DIR}}/karmada-{{.UPSTREAM_CLUSTER}}/karmada-apiserver.config' + # The upstream node's docker-network IP, reachable from the downstream node + # on the same network, used to sign the Karmada apiserver cert. + HOST_IP: + sh: docker inspect {{.UPSTREAM_CLUSTER}}-control-plane -f '{{"{{"}}.NetworkSettings.Networks.kind.IPAddress{{"}}"}}' + # The downstream member's docker-network IP โ€” reachable from the upstream + # control-plane pods on the same network, but not from the macOS host. + MEMBER_IP: + sh: docker inspect {{.DOWNSTREAM_CLUSTER}}-control-plane -f '{{"{{"}}.NetworkSettings.Networks.kind.IPAddress{{"}}"}}' + cmds: + # Advertise 127.0.0.1: karmadactl runs on the macOS host, and init's own + # post-deploy steps dial the advertise address. The docker-network IP is not + # routable from the macOS host, so advertising it would make init time out; + # the upstream cluster maps a host port through to the apiserver, so + # 127.0.0.1 reaches it. We still sign the cert for the docker IP for any + # in-cluster path. The member is joined in push mode, so it never needs to + # reach the apiserver address โ€” advertising 127.0.0.1 is safe for the join. 
+ - echo "๐ŸŒ karmada init (apiserver {{.KARMADA_APISERVER_VERSION}}, advertise 127.0.0.1:32443, cert-ip incl {{.HOST_IP}}) on {{.UPSTREAM_CLUSTER}}" + - mkdir -p {{.TMP_DIR}}/karmada-{{.UPSTREAM_CLUSTER}} + - | + {{.KARMADACTL}} init --context {{.UPSTREAM_CTX}} \ + --kube-image-tag {{.KARMADA_APISERVER_VERSION}} \ + --karmada-apiserver-advertise-address 127.0.0.1 \ + --cert-external-ip "127.0.0.1,{{.HOST_IP}}" \ + --etcd-storage-mode hostPath \ + --karmada-data {{.TMP_DIR}}/karmada-{{.UPSTREAM_CLUSTER}} \ + --karmada-pki {{.TMP_DIR}}/karmada-{{.UPSTREAM_CLUSTER}}/pki + - echo "๐Ÿ”— joining {{.DOWNSTREAM_CLUSTER}} as a Karmada member" + # karmadactl join runs on the host and connects to the member to install its + # agent, so it needs a host-reachable member kubeconfig, not the internal + # docker-hostname form the host cannot resolve. + - '{{.KIND}} get kubeconfig --name {{.DOWNSTREAM_CLUSTER}} > {{.TMP_DIR}}/.kind-{{.DOWNSTREAM_CLUSTER}}-host.yaml' + - | + {{.KARMADACTL}} join {{.DOWNSTREAM_CLUSTER}} \ + --karmada-context karmada-apiserver \ + --kubeconfig {{.KARMADA_KUBECONFIG}} \ + --cluster-kubeconfig {{.TMP_DIR}}/.kind-{{.DOWNSTREAM_CLUSTER}}-host.yaml \ + --cluster-context {{.DOWNSTREAM_CTX}} + # The join stores the host address as the member's endpoint, which the + # control-plane pods running inside the upstream cluster cannot reach. + # Repoint it to the member's docker IP, which they can reach on the shared + # network, and the member then goes Ready. + - | + kubectl --kubeconfig {{.KARMADA_KUBECONFIG}} --context karmada-apiserver \ + patch cluster {{.DOWNSTREAM_CLUSTER}} --type=merge \ + -p '{"spec":{"apiEndpoint":"https://{{.MEMBER_IP}}:6443"}}' + # Label the member so the production propagation policy places resources onto + # it. 
+ - | + kubectl --kubeconfig {{.KARMADA_KUBECONFIG}} --context karmada-apiserver \ + label cluster {{.DOWNSTREAM_CLUSTER}} infra.datum.net/gateways=enabled --overwrite + - echo "โณ waiting for the member cluster to become Ready" + - | + kubectl --kubeconfig {{.KARMADA_KUBECONFIG}} --context karmada-apiserver \ + wait cluster {{.DOWNSTREAM_CLUSTER}} --for=condition=Ready --timeout=120s + - echo "๐Ÿ“œ applying federation artifacts (config/federation)" + - | + kubectl --kubeconfig {{.KARMADA_KUBECONFIG}} --context karmada-apiserver apply -f config/federation/ + - echo "โœ… karmada-up complete. Karmada apiserver kubeconfig at {{.KARMADA_KUBECONFIG}}" + + smoke: + desc: "M2 functional confirmation: drive an upstream Gateway+HTTPRoute through the ext-server path and curl a real 200." + # Chainsaw reads the per-cluster kubeconfigs that only the kubeconfigs task + # exports; `up` does not write them, so export them first (as e2e does). + deps: + - kubeconfigs + cmds: + - '{{.CHAINSAW}} test ./test/e2e-edge/extension-server-smoke --cluster {{.UPSTREAM_CLUSTER}}={{.TMP_DIR}}/.kind-{{.UPSTREAM_CLUSTER}}.yaml --cluster {{.DOWNSTREAM_CLUSTER}}={{.TMP_DIR}}/.kind-{{.DOWNSTREAM_CLUSTER}}.yaml' + + e2e: + desc: "Run chainsaw e2e scenarios against the live two-cluster env. Pass a scenario name or path after -- (e.g. `task test-infra:e2e -- waf-enforcement`); with no arg, runs every ext-server-path scenario that targets nso-upstream/nso-downstream. Use SCENARIOS=... to override the default set." + vars: + # Scenarios authored against this env's cluster names and downstream path. + # Older fixtures targeting the previous cluster names are excluded here. + DEFAULT_SCENARIOS: extension-server-smoke waf-enforcement branded-error-page connector-offline-503 atomic-reject-isolation + # CLI_ARGS (after --) wins; else SCENARIOS env; else the default set. + SELECTED: '{{.CLI_ARGS | default .SCENARIOS | default .DEFAULT_SCENARIOS}}' + deps: + - kubeconfigs + cmds: + - | + set -e + for s in {{.SELECTED}}; do + # Accept either a bare scenario name or a full/relative path. 
+ case "$s" in + test/e2e-edge/*|./test/e2e-edge/*) dir="$s" ;; + */*) dir="$s" ;; + *) dir="./test/e2e-edge/$s" ;; + esac + echo "๐Ÿงช chainsaw: $dir" + {{.CHAINSAW}} test "$dir" \ + --cluster {{.UPSTREAM_CLUSTER}}={{.TMP_DIR}}/.kind-{{.UPSTREAM_CLUSTER}}.yaml \ + --cluster {{.DOWNSTREAM_CLUSTER}}={{.TMP_DIR}}/.kind-{{.DOWNSTREAM_CLUSTER}}.yaml + done + + kubeconfigs: + desc: "Export per-cluster kubeconfigs to TMPDIR for chainsaw." + cmds: + - '{{.KIND}} get kubeconfig --name {{.UPSTREAM_CLUSTER}} > {{.TMP_DIR}}/.kind-{{.UPSTREAM_CLUSTER}}.yaml' + - '{{.KIND}} get kubeconfig --name {{.DOWNSTREAM_CLUSTER}} > {{.TMP_DIR}}/.kind-{{.DOWNSTREAM_CLUSTER}}.yaml' + + parity:check: + desc: "Run the config-dump parity gate against the live downstream Envoy + ext-server. Pass CLI flags after -- (PARITY owns the CLI). Exit 0 PASS / 1 parity FAIL / 2 tool error." + cmds: + - go build -o bin/parity-check ./cmd/parity-check + - ./bin/parity-check {{.CLI_ARGS}} + + parity:check-live: + desc: "Convenience: resolve the live data-plane Envoy pod from this env's labels and run parity:check in kubectl-exec mode (ext-server via selector across replicas). Extra flags pass through after --." + vars: + # In this env the data-plane proxy and the extension server live in these + # namespaces. + DP_NS: datum-downstream-gateway + DP_SELECTOR: gateway.envoyproxy.io/owning-gatewayclass=datum-downstream-gateway-e2e + EXT_NS: network-services-operator-system + EXT_SELECTOR: app.kubernetes.io/component=envoy-gateway-extension-server + # The admin side wants a single exact proxy pod name, resolved here. The + # extension-server side takes a selector and picks the authoritative replica + # itself. 
+ DP_POD: + # items[*] rather than items[0]: with no matching pod, an index lookup makes + # kubectl exit non-zero and aborts the task before the empty-pod check below + # can print its hint. Take the first name of the space-separated list. + sh: kubectl --context {{.DOWNSTREAM_CTX}} -n datum-downstream-gateway get pods -l gateway.envoyproxy.io/owning-gatewayclass=datum-downstream-gateway-e2e -o jsonpath='{.items[*].metadata.name}' | awk '{print $1}' + cmds: + - go build -o bin/parity-check ./cmd/parity-check + - | + if [ -z "{{.DP_POD}}" ]; then + echo "no data-plane Envoy pod found (label {{.DP_SELECTOR}} in {{.DP_NS}}); a Gateway must exist in a watched namespace for the merged data plane to be provisioned" >&2 + exit 2 + fi + - | + ./bin/parity-check \ + --coraza-filter=coraza-waf \ + --admin-exec-pod={{.DP_POD}} --admin-exec-namespace={{.DP_NS}} --admin-exec-container=envoy --admin-exec-context={{.DOWNSTREAM_CTX}} \ + --ext-exec-selector={{.EXT_SELECTOR}} --ext-exec-namespace={{.EXT_NS}} --ext-exec-context={{.DOWNSTREAM_CTX}} \ + {{.CLI_ARGS}} diff --git a/Taskfile.yaml b/Taskfile.yaml index 45a19142..43502ef5 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -3,6 +3,8 @@ version: '3' includes: dev: taskfile: ./Taskfile.dev.yaml + test-infra: + taskfile: ./Taskfile.test-infra.yml tasks: validate-kustomizations: diff --git a/config/e2e-downstream/d1-cert-bypass/README.md b/config/e2e-downstream/d1-cert-bypass/README.md new file mode 100644 index 00000000..1923679c --- /dev/null +++ b/config/e2e-downstream/d1-cert-bypass/README.md @@ -0,0 +1,28 @@ +# Expired-certificate isolation fixture (test-env-only) + +In production, an expired or otherwise unusable TLS certificate is caught early: +the platform withholds that listener from the edge before it is ever delivered. +The extension server *also* removes unusable certificates at the edge, as a +second line of defense โ€” but because the earlier check normally catches the +problem first, that edge-side removal rarely gets exercised on the normal path. + +This fixture lets a test exercise it directly, by handing the edge a genuinely +expired certificate and bypassing the earlier check. + +1. 
`mint-expired-secret.sh <namespace> <secret-name> <hostname>` writes a + self-signed, already-expired certificate as a TLS Secret to stdout. Apply it + into the `e2e-direct` namespace on the edge cluster. +2. The test then applies a gateway directly to the edge whose HTTPS listener + uses that certificate. The extension server removes the bad listener while a + healthy sibling keeps serving — which is what the test asserts. + +> **Use the `e2e-direct` namespace.** The gateway controller only watches +> namespaces that already carry the `meta.datumapis.com/upstream-cluster-name` +> label when they are created; a label added afterward is not reliably picked +> up, and a gateway there can stay unprogrammed. The `e2e-direct` namespace is +> created with the label up front for exactly this reason. If you must create a +> namespace inline, set the label at creation time. + +`task -t Taskfile.test-infra.yml d1-mint-expired-secret` is a thin wrapper around +the script (defaults: `NAMESPACE=e2e-direct`, `SECRET=d1-expired-tls`, +`HOSTNAME=d1-bad.e2e.env.datum.net`). diff --git a/config/e2e-downstream/d1-cert-bypass/mint-expired-secret.sh b/config/e2e-downstream/d1-cert-bypass/mint-expired-secret.sh new file mode 100755 index 00000000..8ea5e705 --- /dev/null +++ b/config/e2e-downstream/d1-cert-bypass/mint-expired-secret.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Mint an already-expired self-signed TLS certificate and emit it as a +# kubernetes.io/tls Secret on stdout. Test-env-only: it hands the gateway an +# expired certificate directly, so the extension server's removal of unusable +# certificates can be exercised on its own, without the earlier check rejecting +# it first. +# +# Usage: mint-expired-secret.sh <namespace> <secret-name> <hostname> +set -euo pipefail + +NS="${1:?namespace required}" +SECRET="${2:?secret name required}" +HOST="${3:?hostname required}" + +WORK="$(mktemp -d)" +trap 'rm -rf "$WORK"' EXIT + +# Generate a key + a self-signed cert dated entirely in the past so it is expired +# the moment it is created. 
openssl's -not_before/-not_after (LibreSSL/OpenSSL 3) +# set an explicit validity window; fall back to a 1-second window via -days 0 if +# the flags are unavailable. +openssl req -x509 -newkey rsa:2048 -nodes \ + -keyout "$WORK/tls.key" -out "$WORK/tls.crt" \ + -subj "/CN=${HOST}" \ + -addext "subjectAltName=DNS:${HOST}" \ + -not_before 20200101000000Z -not_after 20200102000000Z 2>/dev/null \ + || openssl req -x509 -newkey rsa:2048 -nodes \ + -keyout "$WORK/tls.key" -out "$WORK/tls.crt" \ + -subj "/CN=${HOST}" -addext "subjectAltName=DNS:${HOST}" -days 0 2>/dev/null + +# Fail loudly rather than emit a still-valid certificate, in case this openssl +# does not honor -days 0. -checkend 0 exits 0 only while the cert is still valid. +if openssl x509 -checkend 0 -noout -in "$WORK/tls.crt" >/dev/null 2>&1; then + echo "mint-expired-secret: generated certificate is not expired; this openssl honors neither -not_after nor -days 0" >&2 + exit 1 +fi + +CRT_B64="$(base64 < "$WORK/tls.crt" | tr -d '\n')" +KEY_B64="$(base64 < "$WORK/tls.key" | tr -d '\n')" + +cat < + + + Service Unavailable +

This service is temporarily unavailable.

+ diff --git a/config/e2e-downstream/extserver-base/kustomization.yaml b/config/e2e-downstream/extserver-base/kustomization.yaml new file mode 100644 index 00000000..14e55681 --- /dev/null +++ b/config/e2e-downstream/extserver-base/kustomization.yaml @@ -0,0 +1,32 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Ext-server, namespaced + prefixed so the Service FQDN matches the EG +# extensionManager fqdn and the CSI dns-names baked into the base deployment. +namespace: network-services-operator-system +namePrefix: network-services-operator- + +resources: + - ../../extension-server + +patches: + - path: patches/extserver-tls.yaml + target: + kind: Deployment + name: envoy-gateway-extension-server + - path: patches/extserver-serverconfig.yaml + target: + kind: Deployment + name: envoy-gateway-extension-server + - path: patches/extserver-clientcert-issuer.yaml + target: + kind: Certificate + name: envoy-gateway-extension-server-eg-client-tls + - path: patches/extserver-ca-bundle.yaml + target: + kind: Deployment + name: envoy-gateway-extension-server + - path: patches/extserver-programmed-set.yaml + target: + kind: Deployment + name: envoy-gateway-extension-server diff --git a/config/e2e-downstream/extserver-base/patches/extserver-ca-bundle.yaml b/config/e2e-downstream/extserver-base/patches/extserver-ca-bundle.yaml new file mode 100644 index 00000000..6e85fc0f --- /dev/null +++ b/config/e2e-downstream/extserver-base/patches/extserver-ca-bundle.yaml @@ -0,0 +1,18 @@ +# Point the ext-server CA bundle volume at the e2e CA ConfigMap (carrying the +# ca.crt that signed the EG client cert), replacing placeholder-ca-bundle. The +# ConfigMap is published by the bring-up (test-infra:extserver-ca-bundle) from +# the e2e-extension-server-ca cert-manager secret. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: envoy-gateway-extension-server +spec: + template: + spec: + volumes: + - name: tls-ca + configMap: + name: extension-server-ca-bundle + items: + - key: ca.crt + path: ca.crt diff --git a/config/e2e-downstream/extserver-base/patches/extserver-clientcert-issuer.yaml b/config/e2e-downstream/extserver-base/patches/extserver-clientcert-issuer.yaml new file mode 100644 index 00000000..9710265e --- /dev/null +++ b/config/e2e-downstream/extserver-base/patches/extserver-clientcert-issuer.yaml @@ -0,0 +1,11 @@ +# Point the EG client cert (CN=envoy-gateway) at the e2e CA ClusterIssuer, +# replacing the base certificate's placeholder-issuer. +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: envoy-gateway-extension-server-eg-client-tls +spec: + issuerRef: + name: e2e-extension-server-ca + kind: ClusterIssuer + group: cert-manager.io diff --git a/config/e2e-downstream/extserver-base/patches/extserver-programmed-set.yaml b/config/e2e-downstream/extserver-base/patches/extserver-programmed-set.yaml new file mode 100644 index 00000000..aa4ebfc4 --- /dev/null +++ b/config/e2e-downstream/extserver-base/patches/extserver-programmed-set.yaml @@ -0,0 +1,17 @@ +# Turn on the read-only /debug/programmed-set endpoint so the parity test can +# confirm the proxy is running exactly the set the build intended. The base +# deployment reads --enable-programmed-set from this env var, defaulting off in +# production; flip it to "true" here for the test environment. Strategic-merge +# on env (matched by name). 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: envoy-gateway-extension-server +spec: + template: + spec: + containers: + - name: envoy-gateway-extension-server + env: + - name: ENABLE_PROGRAMMED_SET + value: "true" diff --git a/config/e2e-downstream/extserver-base/patches/extserver-serverconfig.yaml b/config/e2e-downstream/extserver-base/patches/extserver-serverconfig.yaml new file mode 100644 index 00000000..2c9701d9 --- /dev/null +++ b/config/e2e-downstream/extserver-base/patches/extserver-serverconfig.yaml @@ -0,0 +1,26 @@ +# Mount the operator config ConfigMap and set SERVER_CONFIG to its path so the +# ext-server loads Coraza + branded-error-page settings. Strategic-merge on env +# (matched by name) and on volumes/volumeMounts. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: envoy-gateway-extension-server +spec: + template: + spec: + containers: + - name: envoy-gateway-extension-server + env: + - name: SERVER_CONFIG + value: /etc/datum/server-config/config.yaml + volumeMounts: + - name: server-config + mountPath: /etc/datum/server-config + readOnly: true + volumes: + - name: server-config + configMap: + name: extension-server-config + items: + - key: config.yaml + path: config.yaml diff --git a/config/e2e-downstream/extserver-base/patches/extserver-tls.yaml b/config/e2e-downstream/extserver-base/patches/extserver-tls.yaml new file mode 100644 index 00000000..c190ac81 --- /dev/null +++ b/config/e2e-downstream/extserver-base/patches/extserver-tls.yaml @@ -0,0 +1,21 @@ +# Point the ext-server server-cert CSI volume at the e2e CA-backed ClusterIssuer, +# replacing the base deployment's placeholder-issuer. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: envoy-gateway-extension-server +spec: + template: + spec: + volumes: + - name: tls + csi: + driver: csi.cert-manager.io + readOnly: true + volumeAttributes: + csi.cert-manager.io/issuer-kind: ClusterIssuer + csi.cert-manager.io/issuer-name: e2e-extension-server-ca + csi.cert-manager.io/common-name: envoy-gateway-extension-server + csi.cert-manager.io/dns-names: "network-services-operator-envoy-gateway-extension-server.network-services-operator-system.svc,network-services-operator-envoy-gateway-extension-server.network-services-operator-system.svc.cluster.local" + csi.cert-manager.io/key-usages: server auth + csi.cert-manager.io/fs-group: "65532" diff --git a/config/e2e-downstream/extserver-config.yaml b/config/e2e-downstream/extserver-config.yaml new file mode 100644 index 00000000..f5828a0c --- /dev/null +++ b/config/e2e-downstream/extserver-config.yaml @@ -0,0 +1,21 @@ +# Extension server config for the e2e edge. +# +# The WAF is enabled so its rules reach the proxy, and the branded 5xx page is +# pointed at the mounted error-pages volume so the suite can assert the branded +# body by content. The connector-tunnel listener name is left at its default so +# the connector fixtures find the listener by the same name production uses. +apiVersion: v1 +kind: ConfigMap +metadata: + name: extension-server-config + namespace: network-services-operator-system +data: + config.yaml: | + apiVersion: apiserver.config.datumapis.com/v1alpha1 + kind: NetworkServicesOperator + gateway: + coraza: + disabled: false + errorPage: + enabled: true + bodyPath: /etc/datum/error-pages/error-5xx.html diff --git a/config/e2e-downstream/issuer.yaml b/config/e2e-downstream/issuer.yaml new file mode 100644 index 00000000..35fc7a62 --- /dev/null +++ b/config/e2e-downstream/issuer.yaml @@ -0,0 +1,41 @@ +# Self-signed root + CA-backed ClusterIssuer for the e2e ext-server mTLS chain. 
+# +# This issues both sides of the EG <-> ext-server handshake: +# - the ext-server SERVER cert (via the CSI driver, see extserver-base/patches/extserver-tls.yaml) +# - the EG CLIENT cert (envoy-gateway-extension-server-eg-client-tls, CN=envoy-gateway) +# The CA's ca.crt is also published into the ext-server CA bundle ConfigMap +# (so the ext-server can verify the EG client) and into the EG certificateRef +# secret (so EG can verify the ext-server server cert). +--- +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: e2e-extension-server-selfsigned +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: e2e-extension-server-ca + namespace: cert-manager +spec: + isCA: true + commonName: e2e-extension-server-ca + secretName: e2e-extension-server-ca + duration: 8760h + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: e2e-extension-server-selfsigned + kind: ClusterIssuer + group: cert-manager.io +--- +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: e2e-extension-server-ca +spec: + ca: + secretName: e2e-extension-server-ca diff --git a/config/e2e-downstream/kustomization.yaml b/config/e2e-downstream/kustomization.yaml new file mode 100644 index 00000000..6ab183ae --- /dev/null +++ b/config/e2e-downstream/kustomization.yaml @@ -0,0 +1,38 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# The e2e edge overlay applied to nso-downstream. Composes the ext-server +# (prefixed/namespaced via extserver-base) with the e2e mTLS issuer chain, the +# Coraza/branded-page server-config, the branded error page, and the test +# EnvoyProxy (real Coraza WAF image + admin :19000) and its GatewayClass. +# +# The ConfigMaps / issuer / EnvoyProxy / GatewayClass are kept OUT of the +# name-prefix so the deployment's literal references (envoy-error-pages, +# extension-server-config) and the EG fqdn resolve unchanged.
+resources: +- namespace.yaml +- extserver-base +- issuer.yaml +- extserver-config.yaml +- error-pages.yaml +- envoyproxy.yaml + +# The NSO image tag is set by the bring-up (test-infra:extension-server) to the +# git-SHA-built image. +images: +- name: ghcr.io/datum-cloud/network-services-operator + newName: ghcr.io/datum-cloud/network-services-operator + newTag: 63fa912 + +# The default e2e path has no Prometheus operator; drop the ServiceMonitor so +# the apply doesn't fail on the missing monitoring.coreos.com CRD. (Re-add via +# OBSERVABILITY=1 when the Flux observability stack is enabled.) +patches: +- patch: | + $patch: delete + apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + name: envoy-gateway-extension-server-metrics + target: + kind: ServiceMonitor diff --git a/config/e2e-downstream/namespace.yaml b/config/e2e-downstream/namespace.yaml new file mode 100644 index 00000000..b27c33c1 --- /dev/null +++ b/config/e2e-downstream/namespace.yaml @@ -0,0 +1,6 @@ +# The ext-server namespace on the edge cluster. The base ext-server resources +# target this namespace but do not create it. +apiVersion: v1 +kind: Namespace +metadata: + name: network-services-operator-system diff --git a/config/tools/envoy-gateway-downstream/kustomization.yaml b/config/tools/envoy-gateway-downstream/kustomization.yaml index d6cce819..9f5a2a92 100644 --- a/config/tools/envoy-gateway-downstream/kustomization.yaml +++ b/config/tools/envoy-gateway-downstream/kustomization.yaml @@ -6,7 +6,7 @@ helmCharts: includeCRDs: false namespace: datum-downstream-gateway releaseName: envoy-datum-downstream-gateway - version: v1.8.1 + version: v1.7.4 repo: oci://docker.io/envoyproxy valuesInline: config: @@ -29,6 +29,10 @@ helmCharts: - key: meta.datumapis.com/upstream-cluster-name operator: Exists extensionManager: + # Match the ext-server's compiled-in 256 MiB gRPC ceiling. 
Without + # this the EG side defaults to ~4 MiB and silently freezes xDS once a + # translated snapshot exceeds it (~540 gateways in prod). + maxMessageSize: 256Mi policyResources: - group: networking.datumapis.com version: v1alpha @@ -43,9 +47,11 @@ helmCharts: port: 5005 tls: certificateRef: - # Placeholder — an overlay must patch this to the Secret holding the CA that issued the server cert. - name: placeholder-ca - namespace: placeholder-namespace + # e2e: the Secret holding the CA that issued the ext-server + # server cert. Published by the bring-up (test-infra:extserver-ca-bundle) + # from the e2e-extension-server-ca cert-manager secret. + name: e2e-extension-server-ca + namespace: network-services-operator-system clientCertificateRef: name: envoy-gateway-extension-server-eg-client-tls namespace: network-services-operator-system diff --git a/config/tools/kind/downstream-cluster.yaml b/config/tools/kind/downstream-cluster.yaml new file mode 100644 index 00000000..6339242a --- /dev/null +++ b/config/tools/kind/downstream-cluster.yaml @@ -0,0 +1,14 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +name: nso-downstream +networking: + ipFamily: dual +nodes: +- role: control-plane + extraPortMappings: + - containerPort: 30080 + hostPort: 30080 + protocol: TCP + - containerPort: 30443 + hostPort: 30443 + protocol: TCP diff --git a/config/tools/kind/upstream-cluster.yaml b/config/tools/kind/upstream-cluster.yaml new file mode 100644 index 00000000..be723669 --- /dev/null +++ b/config/tools/kind/upstream-cluster.yaml @@ -0,0 +1,15 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +name: nso-upstream +networking: + ipFamily: dual +nodes: +- role: control-plane + # Expose the Karmada apiserver NodePort (32443) to the host so karmadactl — + # which runs on the host — can reach the Karmada apiserver during init/join.
+ # On macOS the kind docker-network IP is not host-routable, so without this + # mapping karmadactl times out dialing the advertised node IP:32443. + extraPortMappings: + - containerPort: 32443 + hostPort: 32443 + protocol: TCP