From 10d874641a01c8021e05edbfcbcda8c3a1f0e3c9 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Tue, 23 Jun 2026 15:05:00 -0500 Subject: [PATCH 1/3] fix(extension-server): use a unique connector domain to avoid Envoy NACK Online connector routing appended the connector's backend target host to the virtual host's domains. Tunnels overwhelmingly target "localhost", so every connector's virtual host received the same "localhost" domain. On a shared route configuration (the HTTP listener that merges all gateways) those duplicate, and Envoy rejects the entire xDS snapshot ("Only unique values for domains are permitted"). A single collision freezes config updates fleet-wide: new tunnels never program a route, and connectors that come online never leave the offline 503 program. Append a synthetic domain derived from the virtual host name instead. Envoy already requires virtual host names to be unique within a route configuration, so the domain can never collide. The internal tunnel listener routes on cluster metadata (tunnel address + endpoint id), not on this domain, so a synthetic value preserves tunnel routing. Adds a regression test asserting two online connectors that share a backend host on one route configuration produce no duplicate domain. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/extensionserver/mutate/connector.go | 26 ++++++-- .../extensionserver/mutate/connector_test.go | 65 +++++++++++++++---- .../extensionserver/server/server_test.go | 28 ++++---- 3 files changed, 89 insertions(+), 30 deletions(-) diff --git a/internal/extensionserver/mutate/connector.go b/internal/extensionserver/mutate/connector.go index 5a7011e..0d82dd7 100644 --- a/internal/extensionserver/mutate/connector.go +++ b/internal/extensionserver/mutate/connector.go @@ -73,7 +73,7 @@ func ReplaceConnectorClusters( // ApplyConnectorRoutes applies connector route mutations to a RouteConfiguration: // - Online (replaced) connector: prepend a CONNECT upgrade route targeting the -// replaced cluster and append info.TargetHost to VH domains. +// replaced cluster and append a unique per-connector domain to VH domains. // - Offline connector: prepend a CONNECT direct_response 503 (tunnel-control // clients) and rewrite the user-facing forwarding routes to a 503 // direct_response (see the offline branch for why). @@ -105,7 +105,7 @@ func ApplyConnectorRoutes( var newRoute *routev3.Route - if info, ok := replaced[connectorCluster]; ok { + if _, ok := replaced[connectorCluster]; ok { // Online: CONNECT route targeting the replaced cluster. newRoute, err = buildConnectRoute( "connector-connect-"+sanitizeID(vh.GetName()), @@ -116,10 +116,15 @@ func ApplyConnectorRoutes( } // Prepend the CONNECT route (NSO inserts at /virtual_hosts/0/routes/0). vh.Routes = append([]*routev3.Route{newRoute}, vh.Routes...) - // Append the connector's target host to VH domains. - // Production uses the actual backend hostname (conflict C1 in design plan), - // NOT a synthetic .connector.local domain. - vh.Domains = appendUnique(vh.Domains, info.TargetHost) + // Make the CONNECT route addressable with a domain that is unique per + // virtual host. The backend host itself must NOT be used here: tunnels + // overwhelmingly target "localhost", so reusing it would put the same + // domain on every connector's virtual host. On a shared route + // configuration (e.g. the HTTP listener that merges all gateways) those + // collide, and Envoy rejects the entire snapshot — freezing config + // updates fleet-wide. The internal tunnel listener routes on cluster + // metadata, not this domain, so a synthetic unique value is sufficient. + vh.Domains = appendUnique(vh.Domains, connectorMatchDomain(vh.GetName())) } else { // Offline connect_matcher route for tunnel-control clients. newRoute, err = buildOfflineRoute("connector-offline-" + sanitizeID(vh.GetName())) @@ -230,6 +235,15 @@ func buildConnectorCluster( return cl, nil } +// connectorMatchDomain returns the synthetic domain added to an online +// connector's virtual host so its CONNECT route is addressable. It is derived +// from the virtual host name, which Envoy already requires to be unique within a +// route configuration, so the domain can never collide with another connector on +// the same (possibly shared) route configuration. +func connectorMatchDomain(vhName string) string { + return sanitizeID(vhName) + ".connector.internal" +} + // buildConnectRoute builds an online CONNECT route (connect_matcher + CONNECT // upgrade) pointed at the given connector cluster. Mirrors buildConnectRoute in // test/perf/extserver/internal/mutate/connector.go. diff --git a/internal/extensionserver/mutate/connector_test.go b/internal/extensionserver/mutate/connector_test.go index 132a78a..b67d8dc 100644 --- a/internal/extensionserver/mutate/connector_test.go +++ b/internal/extensionserver/mutate/connector_test.go @@ -182,7 +182,7 @@ func TestReplaceConnectorClusters_ClusterNotInPolicyIndex_Skipped(t *testing.T) // --- ApplyConnectorRoutes tests --- -func TestApplyConnectorRoutes_Online_PrependsCONNECTRouteAndAppendsTargetHost(t *testing.T) { +func TestApplyConnectorRoutes_Online_PrependsCONNECTRouteAndAppendsUniqueDomain(t *testing.T) { idx := connectorPolicyIndex(true) clusterName := testClusterName() @@ -230,25 +230,28 @@ func TestApplyConnectorRoutes_Online_PrependsCONNECTRouteAndAppendsTargetHost(t // Original forwarding route must remain second. assert.Equal(t, "fwd", vh.Routes[1].GetName()) - // TargetHost (actual backend host) must be appended to VH domains. - // Production uses info.TargetHost, NOT a synthetic .connector.local domain (design §2.3 C1). - assert.Contains(t, vh.Domains, testTargetHost, - "TargetHost must be appended to VH domains (not a .connector.local synthetic domain)") + // A unique synthetic domain is appended so the CONNECT route is addressable. + // The raw backend host must NOT be used: tunnels commonly share a target host + // (e.g. "localhost"), which collides on shared route configurations. + assert.Contains(t, vh.Domains, connectorMatchDomain(vh.GetName()), + "a unique per-VH synthetic domain must be appended") + assert.NotContains(t, vh.Domains, testTargetHost, + "the raw backend host must not be appended (it is the collision source)") } -func TestApplyConnectorRoutes_Online_TargetHostDeduplicated(t *testing.T) { +func TestApplyConnectorRoutes_Online_DomainAppendIsIdempotent(t *testing.T) { idx := connectorPolicyIndex(true) clusterName := testClusterName() info := &extcache.ConnectorInfo{Online: true, TargetHost: testTargetHost, TargetPort: testTargetPort} replaced := map[string]*extcache.ConnectorInfo{clusterName: info} offline := map[string]*extcache.ConnectorInfo{} - // VH already has the target host in its domains. + // VH already carries the synthetic domain from a prior mutation. rc := &routev3.RouteConfiguration{ VirtualHosts: []*routev3.VirtualHost{ { Name: "vh", - Domains: []string{"app.local.test", testTargetHost}, + Domains: []string{"app.local.test", connectorMatchDomain("vh")}, Routes: []*routev3.Route{routeTargeting(clusterName)}, }, }, @@ -257,14 +260,54 @@ func TestApplyConnectorRoutes_Online_TargetHostDeduplicated(t *testing.T) { _, _, err := ApplyConnectorRoutes(rc, idx, replaced, offline) require.NoError(t, err) - // Domain must not be duplicated. + // Synthetic domain must not be duplicated. count := 0 for _, d := range rc.VirtualHosts[0].Domains { - if d == testTargetHost { + if d == connectorMatchDomain("vh") { count++ } } - assert.Equal(t, 1, count, "target host must appear exactly once in VH domains") + assert.Equal(t, 1, count, "synthetic domain must appear exactly once in VH domains") +} + +// TestApplyConnectorRoutes_Online_NoDomainCollisionAcrossConnectors is the +// regression test for the duplicate-domain Envoy NACK: two online connectors +// that share a backend host ("localhost") on the SAME (shared) route +// configuration must not produce a duplicate domain, or Envoy rejects the whole +// snapshot and freezes config updates fleet-wide. +func TestApplyConnectorRoutes_Online_NoDomainCollisionAcrossConnectors(t *testing.T) { + clusterA := "httproute/ds-ns/proxy-a/rule/0" + clusterB := "httproute/ds-ns/proxy-b/rule/0" + const sharedHost = "localhost" + + replaced := map[string]*extcache.ConnectorInfo{ + clusterA: {Online: true, TargetHost: sharedHost, TargetPort: 8099}, + clusterB: {Online: true, TargetHost: sharedHost, TargetPort: 8099}, + } + offline := map[string]*extcache.ConnectorInfo{} + + // One shared route config (e.g. the merged HTTP listener) with both tunnels. + rc := &routev3.RouteConfiguration{ + Name: "http-80", + VirtualHosts: []*routev3.VirtualHost{ + {Name: "tunnel-a", Domains: []string{"a.example.com"}, Routes: []*routev3.Route{routeTargeting(clusterA)}}, + {Name: "tunnel-b", Domains: []string{"b.example.com"}, Routes: []*routev3.Route{routeTargeting(clusterB)}}, + }, + } + + _, _, err := ApplyConnectorRoutes(rc, &extcache.PolicyIndex{}, replaced, offline) + require.NoError(t, err) + + // No domain may appear more than once across the whole route config. + seen := map[string]string{} + for _, vh := range rc.VirtualHosts { + for _, d := range vh.Domains { + if prev, dup := seen[d]; dup { + t.Fatalf("duplicate domain %q across vhosts %q and %q (Envoy would NACK)", d, prev, vh.Name) + } + seen[d] = vh.Name + } + } } func TestApplyConnectorRoutes_Offline_Prepends503Route_NoDomain(t *testing.T) { diff --git a/internal/extensionserver/server/server_test.go b/internal/extensionserver/server/server_test.go index b74b8dd..588598d 100644 --- a/internal/extensionserver/server/server_test.go +++ b/internal/extensionserver/server/server_test.go @@ -153,8 +153,8 @@ func egGatewayMeta(t *testing.T, dsNS, gwName string) *corev3.Metadata { // - Listener receives Coraza filter at HCM position 0 (disabled). // - Route configuration: CONNECT route prepended, forwarding route carries // coraza typed_per_filter_config and datum-gateway metadata. -// - VH domains include the connector's TargetHost (production behavior per -// design §2.3 C1 — NOT a synthetic .connector.local domain). +// - VH domains include a unique per-connector synthetic domain (NOT the raw +// backend host, which collides across connectors and gets NACK'd). // - ALL four resource lists appear in the response. func TestPostTranslateModify_FullSnapshot(t *testing.T) { const ( @@ -367,13 +367,14 @@ func TestPostTranslateModify_FullSnapshot(t *testing.T) { assert.Equal(t, "test-tpp", entry["name"].GetStringValue()) assert.Equal(t, upstreamNS, entry["namespace"].GetStringValue()) - // --- VH domains include connector TargetHost (NOT .connector.local) --- - assert.Contains(t, vh.Domains, targetHost, - "VH domains must contain connector TargetHost (design §2.3 C1: use actual hostname)") - for _, d := range vh.Domains { - assert.False(t, strings.HasSuffix(d, ".connector.local"), - "synthetic .connector.local domain must NOT appear (design §2.3 C1 fix)") - } + // --- VH domains carry a unique synthetic connector domain, NOT the raw + // backend host. The raw host (commonly "localhost") collides across + // connectors on shared route configs and gets the snapshot NACK'd; the + // synthetic domain is derived from the unique VH name. See connector.go. + assert.Contains(t, vh.Domains, "vh.connector.internal", + "VH domains must include the unique per-connector synthetic domain") + assert.NotContains(t, vh.Domains, targetHost, + "raw backend host must not be appended (duplicate-domain collision source)") } // TestPostTranslateModify_SecretsPassThroughUnchanged verifies that secrets are @@ -752,10 +753,11 @@ func TestPostTranslateModify_TwoClusterTopology_DistinctReplicaUID(t *testing.T) assert.Equal(t, "replica-tpp", entry["name"].GetStringValue()) assert.Equal(t, replicaNSName, entry["namespace"].GetStringValue()) - // --- VH domains include connector TargetHost --- - assert.Contains(t, vh.Domains, targetHost, - "GAP-1b regression: VH domains must include connector TargetHost; "+ - "old code never appended it because the connector was not resolved") + // --- Connector resolved: its synthetic domain is appended --- + // GAP-1b regression: old code never appended any domain because the + // connector was not resolved through the DStoUS mapping. + assert.Contains(t, vh.Domains, "vh.connector.internal", + "GAP-1b regression: connector must resolve and append its synthetic domain") } // TestPostTranslateModify_OfflineConnector_503Route verifies that an offline From 467ee9d38d8f8fa661c2fa5aa8b32bf1354b2bb7 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Tue, 23 Jun 2026 16:04:51 -0500 Subject: [PATCH 2/3] docs(connector): clarify connector domain comments (why over how) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite the connector domain comments to lead with the design intent — connector domains must be unique within a merged-listener namespace — and name the .connector.internal format, dropping the mechanical detail. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/extensionserver/mutate/connector.go | 22 ++++++++------------ 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/internal/extensionserver/mutate/connector.go b/internal/extensionserver/mutate/connector.go index 0d82dd7..40cf5f2 100644 --- a/internal/extensionserver/mutate/connector.go +++ b/internal/extensionserver/mutate/connector.go @@ -116,14 +116,10 @@ func ApplyConnectorRoutes( } // Prepend the CONNECT route (NSO inserts at /virtual_hosts/0/routes/0). vh.Routes = append([]*routev3.Route{newRoute}, vh.Routes...) - // Make the CONNECT route addressable with a domain that is unique per - // virtual host. The backend host itself must NOT be used here: tunnels - // overwhelmingly target "localhost", so reusing it would put the same - // domain on every connector's virtual host. On a shared route - // configuration (e.g. the HTTP listener that merges all gateways) those - // collide, and Envoy rejects the entire snapshot — freezing config - // updates fleet-wide. The internal tunnel listener routes on cluster - // metadata, not this domain, so a synthetic unique value is sufficient. + // Connectors share one domain namespace on merged listeners, so a + // duplicate domain breaks the whole config. The backend host won't do + // as that domain — tunnels usually target "localhost" — so give each + // connector a unique synthetic domain. vh.Domains = appendUnique(vh.Domains, connectorMatchDomain(vh.GetName())) } else { // Offline connect_matcher route for tunnel-control clients. @@ -235,11 +231,11 @@ func buildConnectorCluster( return cl, nil } -// connectorMatchDomain returns the synthetic domain added to an online -// connector's virtual host so its CONNECT route is addressable. It is derived -// from the virtual host name, which Envoy already requires to be unique within a -// route configuration, so the domain can never collide with another connector on -// the same (possibly shared) route configuration. +// connectorMatchDomain returns a per-connector domain of the form +// ".connector.internal". Uniqueness is the point: connector +// domains share a namespace on merged listeners, so a collision breaks the whole +// config. Reusing the virtual host name — which Envoy already guarantees unique +// within a route configuration — makes that guarantee free. func connectorMatchDomain(vhName string) string { return sanitizeID(vhName) + ".connector.internal" } From 7f1f513b3b4bb48a002d10f1085d44298663f458 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Tue, 23 Jun 2026 16:08:01 -0500 Subject: [PATCH 3/3] test(extension-server): drop opaque internal tags from comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace "GAP-1b"/"design §" references with plain descriptions of what each assertion guards. The comments already explain the behavior; the tags added nothing for a future reader. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../extensionserver/server/server_test.go | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/internal/extensionserver/server/server_test.go b/internal/extensionserver/server/server_test.go index 588598d..d26e8f5 100644 --- a/internal/extensionserver/server/server_test.go +++ b/internal/extensionserver/server/server_test.go @@ -122,7 +122,7 @@ func mkListenerWithStaticRoute(t *testing.T, name string) *listenerv3.Listener { } // egGatewayMeta builds the envoy-gateway filter_metadata struct for a VH. -// This simulates the metadata EG populates on VirtualHosts per design §2.1. +// This simulates the metadata EG populates on VirtualHosts. func egGatewayMeta(t *testing.T, dsNS, gwName string) *corev3.Metadata { t.Helper() s, err := structpb.NewStruct(map[string]any{ @@ -547,8 +547,9 @@ func TestPostTranslateModify_EGInternalListener_Skipped(t *testing.T) { } } -// TestPostTranslateModify_TwoClusterTopology_DistinctReplicaUID is the GAP-1b -// full-pipeline regression test. It simulates the two-cluster edge topology: +// TestPostTranslateModify_TwoClusterTopology_DistinctReplicaUID is the +// full-pipeline regression test for connector/TPP resolution in the two-cluster +// edge topology: // - Ext-server client lists EDGE cluster replica namespaces. // - Replica namespace is named "ns-" but has a DISTINCT own UID // assigned by the edge cluster's API server. @@ -704,7 +705,7 @@ func TestPostTranslateModify_TwoClusterTopology_DistinctReplicaUID(t *testing.T) require.NoError(t, err) require.NotNil(t, resp) - // --- Connector cluster must be replaced (GAP-1b: Connector resolution) --- + // --- Connector cluster must be replaced (connector resolution) --- // Old code: DStoUS["ns-upstream-uid-abc"] absent → ReplaceConnectorClusters // skipped the cluster → no internal-upstream replacement. require.Len(t, resp.Clusters, 2) @@ -712,7 +713,7 @@ func TestPostTranslateModify_TwoClusterTopology_DistinctReplicaUID(t *testing.T) assert.Equal(t, connCluster, connectorCluster.GetName(), "connector cluster name must be preserved") require.NotNil(t, connectorCluster.GetTransportSocket(), - "GAP-1b regression: connector cluster must be replaced with internal-upstream; "+ + "connector cluster must be replaced with internal-upstream; "+ "old code skipped it because DStoUS[replicaNSName] was absent") assert.Equal(t, "envoy.transport_sockets.internal_upstream", connectorCluster.GetTransportSocket().GetName()) @@ -721,12 +722,12 @@ func TestPostTranslateModify_TwoClusterTopology_DistinctReplicaUID(t *testing.T) assert.Nil(t, resp.Clusters[1].GetTransportSocket(), "non-connector cluster must not get transport_socket") - // --- Route must carry WAF config (GAP-1b: TPP resolution) --- + // --- Route must carry WAF config (TPP resolution) --- // Old code: DStoUS["ns-upstream-uid-abc"] absent → ApplyTPPRouteConfig skipped // every VH → no WAF mutations → routes_tpp_applied:0. vh := resp.Routes[0].VirtualHosts[0] require.Len(t, vh.Routes, 2, - "GAP-1b regression: CONNECT route must be prepended (connector was resolved); "+ + "CONNECT route must be prepended (connector was resolved); "+ "old code left only the original forwarding route") // CONNECT route first. @@ -737,7 +738,7 @@ func TestPostTranslateModify_TwoClusterTopology_DistinctReplicaUID(t *testing.T) fwd := vh.Routes[1] tpfc := fwd.GetTypedPerFilterConfig()["coraza-waf"] require.NotNil(t, tpfc, - "GAP-1b regression: forwarding route must have coraza typed_per_filter_config; "+ + "forwarding route must have coraza typed_per_filter_config; "+ "old code silently skipped WAF injection") assert.True(t, strings.Contains(tpfc.GetTypeUrl(), "golang.v3alpha.ConfigsPerRoute"), "tpfc type url must reference ConfigsPerRoute") @@ -754,10 +755,10 @@ func TestPostTranslateModify_TwoClusterTopology_DistinctReplicaUID(t *testing.T) assert.Equal(t, replicaNSName, entry["namespace"].GetStringValue()) // --- Connector resolved: its synthetic domain is appended --- - // GAP-1b regression: old code never appended any domain because the + // old code never appended any domain because the // connector was not resolved through the DStoUS mapping. assert.Contains(t, vh.Domains, "vh.connector.internal", - "GAP-1b regression: connector must resolve and append its synthetic domain") + "connector must resolve and append its synthetic domain") } // TestPostTranslateModify_OfflineConnector_503Route verifies that an offline