From 13cffad6b87994f5666a92ef4b69c09173282c4f Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Thu, 25 Jun 2026 13:30:08 -0500 Subject: [PATCH] feat(federation): carry edge policy from the control plane to the edges Describes how firewall and connector policy authored centrally is delivered to each edge, and how each edge reads it on arrival. Named for what it does rather than the tool that moves it, so the intent stays legible if the mechanism changes. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01JbCy8vy66RdNYzGSgqH6P6 --- config/federation/README.md | 41 +++ .../federation/clusterpropagationpolicy.yaml | 158 ++++++++++++ config/federation/resourceinterpreters.yaml | 234 ++++++++++++++++++ 3 files changed, 433 insertions(+) create mode 100644 config/federation/README.md create mode 100644 config/federation/clusterpropagationpolicy.yaml create mode 100644 config/federation/resourceinterpreters.yaml diff --git a/config/federation/README.md b/config/federation/README.md new file mode 100644 index 00000000..eb464295 --- /dev/null +++ b/config/federation/README.md @@ -0,0 +1,41 @@ +# Federation + +The Datum edge runs across many clusters in many regions. Customers, though, +work against a single control plane: they create a Gateway, a route, or a +firewall policy in one place. **Federation is what carries that intent out to +the edge clusters that actually serve traffic.** + +This directory holds the federation configuration the test environment applies, +mirrored from production so the test edge fans configuration out the same way +the real one does. + +## Why it's tested as its own concern + +For most of this system's history, the test environment copied configuration +between clusters with a simple direct mechanism — nothing like production. 
But +several real incidents lived specifically in the federation layer: some +information (a backend's online/offline status) is intentionally *not* carried +to the edge, and the timing of cross-cluster delivery created races. None of +that is visible unless the test edge federates the way production does. + +So the production-fidelity environment stands up real federation and proves the +thing customers depend on: **configuration created in the control plane actually +arrives at the edge.** The test confirms a change made centrally shows up on a +downstream cluster within seconds. + +## What's here + +- A propagation policy describing *which* resources travel to the edge. +- Interpreter rules describing *how* each resource type is carried and how the + status the edges report is folded back onto the central object — including the + deliberate choice to propagate configuration but not live status, which is the + behavior that caused real "false offline" incidents and is now exercised directly. + +## Implementation + +Federation is implemented with [Karmada](https://karmada.io/). The directory is +named for the responsibility — fanning configuration out to the edge — rather +than the tool, so the intent stays clear even if the underlying mechanism +changes. The environment that applies these artifacts is described in +[`Taskfile.test-infra.yml`](../../Taskfile.test-infra.yml) +(`task test-infra:karmada-up`).
diff --git a/config/federation/clusterpropagationpolicy.yaml b/config/federation/clusterpropagationpolicy.yaml new file mode 100644 index 00000000..5299bb3b --- /dev/null +++ b/config/federation/clusterpropagationpolicy.yaml @@ -0,0 +1,158 @@ +apiVersion: policy.karmada.io/v1alpha1 +kind: ClusterPropagationPolicy +metadata: + name: nso-resources +spec: + conflictResolution: Overwrite + placement: + clusterAffinities: + - affinityName: gateway-enabled + labelSelector: + matchExpressions: + - key: infra.datum.net/gateways + operator: In + values: + - enabled + resourceSelectors: + - apiVersion: v1 + kind: Namespace + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-namespace + operator: Exists + - apiVersion: v1 + kind: ConfigMap + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-namespace + operator: Exists + - apiVersion: v1 + kind: Secret + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-namespace + operator: Exists + # TODO(jreese) clean up dupe secret policies + - apiVersion: v1 + kind: Secret + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-cluster-name + operator: Exists + - apiVersion: v1 + kind: Secret + labelSelector: + matchExpressions: + - key: cert-manager.io/issuer-name + operator: In + values: + - nso-gateway + - apiVersion: discovery.k8s.io/v1 + kind: EndpointSlice + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-cluster-name + operator: Exists + - apiVersion: v1 + kind: Service + # TODO(jreese) get labels on these Services (selector is unfiltered until then) + # labelSelector: + # matchExpressions: + # - key: meta.datumapis.com/upstream-cluster-name + # operator: Exists + + # Gateway API + - apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-cluster-name + operator: Exists + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + labelSelector: + matchExpressions: + - key:
meta.datumapis.com/upstream-cluster-name + operator: Exists + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + labelSelector: + matchExpressions: + - key: meta.datumapis.com/http01-solver + operator: Exists + - apiVersion: gateway.networking.k8s.io/v1 + kind: BackendTLSPolicy + # TODO(jreese) get labels on these when they are created by the httpproxy + # controller + # labelSelector: + # matchExpressions: + # - key: meta.datumapis.com/upstream-cluster-name + # operator: Exists + + # Envoy Gateway API Extensions + - apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: Backend + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-cluster-name + operator: Exists + - apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: BackendTrafficPolicy + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-cluster-name + operator: Exists + - apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: HTTPRouteFilter + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-cluster-name + operator: Exists + - apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: HTTPRouteFilter + labelSelector: + matchExpressions: + - key: meta.datumapis.com/http01-solver + operator: Exists + - apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: SecurityPolicy + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-cluster-name + operator: Exists + - apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: EnvoyPatchPolicy + # TODO(jreese) get labels on these patch policies + # labelSelector: + # matchExpressions: + # - key: meta.datumapis.com/upstream-cluster-name + # operator: Exists + + # Network Services Operator CRDs (replicated for the extension server) + - apiVersion: networking.datumapis.com/v1alpha1 + kind: Connector + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-cluster-name + operator: Exists + - apiVersion: networking.datumapis.com/v1alpha + kind: TrafficProtectionPolicy + labelSelector: + 
matchExpressions: + - key: meta.datumapis.com/upstream-cluster-name + operator: Exists + - apiVersion: networking.datumapis.com/v1alpha + kind: HTTPProxy + labelSelector: + matchExpressions: + - key: meta.datumapis.com/upstream-cluster-name + operator: Exists + + # External DNS + - apiVersion: externaldns.k8s.io/v1alpha1 + kind: DNSEndpoint + # TODO(jreese) get labels on these + # labelSelector: + # matchExpressions: + # - key: meta.datumapis.com/upstream-cluster-name + # operator: Exists diff --git a/config/federation/resourceinterpreters.yaml b/config/federation/resourceinterpreters.yaml new file mode 100644 index 00000000..452a19ad --- /dev/null +++ b/config/federation/resourceinterpreters.yaml @@ -0,0 +1,234 @@ +# Future test coverage for resource interpreters can leverage the test framework +# once it's merged. +# +# See: https://github.com/karmada-io/karmada/pull/6938 +--- +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: gateway.networking.k8s.io-gateway +spec: + target: + apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + customizations: + statusAggregation: + luaScript: > + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or #statusItems == 0 then + return desiredObj + end + if desiredObj.status == nil then + desiredObj.status = {} + end + + local item = statusItems[1] + if item == nil or item.status == nil then + return desiredObj + end + + -- Only statusItems[1] is copied; status from other clusters is ignored. + -- TODO(jreese) aggregate properly and represent propagation status across clusters.
+ if item.status.addresses ~= nil then + desiredObj.status.addresses = item.status.addresses + end + if item.status.conditions ~= nil then + desiredObj.status.conditions = item.status.conditions + end + if item.status.listeners ~= nil then + desiredObj.status.listeners = item.status.listeners + end + return desiredObj + end +--- +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: gateway.networking.k8s.io-httproute +spec: + target: + apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + customizations: + statusAggregation: + luaScript: > + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or #statusItems == 0 then + return desiredObj + end + if desiredObj.status == nil then + desiredObj.status = {} + end + + local item = statusItems[1] + if item == nil or item.status == nil then + return desiredObj + end + + -- Only statusItems[1] is copied; status from other clusters is ignored. + -- TODO(jreese) aggregate properly and represent propagation status across clusters. + if item.status.parents ~= nil then + desiredObj.status.parents = item.status.parents + end + return desiredObj + end +--- +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: gateway.networking.k8s.io-backendtlspolicy +spec: + target: + apiVersion: gateway.networking.k8s.io/v1 + kind: BackendTLSPolicy + customizations: + statusAggregation: + luaScript: > + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or #statusItems == 0 then + return desiredObj + end + if desiredObj.status == nil then + desiredObj.status = {} + end + + local item = statusItems[1] + if item == nil or item.status == nil then + return desiredObj + end + + -- Only statusItems[1] is copied; status from other clusters is ignored. + -- TODO(jreese) aggregate properly and represent propagation status across clusters.
+ if item.status.ancestors ~= nil then + desiredObj.status.ancestors = item.status.ancestors + end + return desiredObj + end +--- +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: gateway.envoyproxy.io-backend +spec: + target: + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: Backend + customizations: + statusAggregation: + luaScript: > + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or #statusItems == 0 then + return desiredObj + end + if desiredObj.status == nil then + desiredObj.status = {} + end + + local item = statusItems[1] + if item == nil or item.status == nil then + return desiredObj + end + + -- Only statusItems[1] is copied; status from other clusters is ignored. + -- TODO(jreese) aggregate properly and represent propagation status across clusters. + if item.status.conditions ~= nil then + desiredObj.status.conditions = item.status.conditions + end + return desiredObj + end +--- +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: gateway.envoyproxy.io-backendtrafficpolicy +spec: + target: + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: BackendTrafficPolicy + customizations: + statusAggregation: + luaScript: > + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or #statusItems == 0 then + return desiredObj + end + if desiredObj.status == nil then + desiredObj.status = {} + end + + local item = statusItems[1] + if item == nil or item.status == nil then + return desiredObj + end + + -- Only statusItems[1] is copied; status from other clusters is ignored. + -- TODO(jreese) aggregate properly and represent propagation status across clusters.
+ if item.status.ancestors ~= nil then + desiredObj.status.ancestors = item.status.ancestors + end + return desiredObj + end +--- +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: gateway.envoyproxy.io-envoypatchpolicy +spec: + target: + apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: EnvoyPatchPolicy + customizations: + statusAggregation: + luaScript: > + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or #statusItems == 0 then + return desiredObj + end + if desiredObj.status == nil then + desiredObj.status = {} + end + + local item = statusItems[1] + if item == nil or item.status == nil then + return desiredObj + end + + -- Only statusItems[1] is copied; status from other clusters is ignored. + -- TODO(jreese) aggregate properly and represent propagation status across clusters. + if item.status.ancestors ~= nil then + desiredObj.status.ancestors = item.status.ancestors + end + return desiredObj + end +--- +apiVersion: config.karmada.io/v1alpha1 +kind: ResourceInterpreterCustomization +metadata: + name: externaldns.k8s.io-dnsendpoint +spec: + target: + apiVersion: externaldns.k8s.io/v1alpha1 + kind: DNSEndpoint + customizations: + statusAggregation: + luaScript: > + function AggregateStatus(desiredObj, statusItems) + if statusItems == nil or #statusItems == 0 then + return desiredObj + end + if desiredObj.status == nil then + desiredObj.status = {} + end + + local item = statusItems[1] + if item == nil or item.status == nil then + return desiredObj + end + + -- Only statusItems[1] is copied; status from other clusters is ignored. + -- TODO(jreese) aggregate properly and represent propagation status across clusters. + if item.status.observedGeneration ~= nil then + desiredObj.status.observedGeneration = item.status.observedGeneration + end + return desiredObj + end