diff --git a/cmd/gmc/internal/controller/builder.go b/cmd/gmc/internal/controller/builder.go index 05d53012..7d8a2d06 100644 --- a/cmd/gmc/internal/controller/builder.go +++ b/cmd/gmc/internal/controller/builder.go @@ -86,6 +86,17 @@ const ( dnsPodLabel = "k8s-app" dnsPodValue = "kube-dns" + // dnsNodeLocalCIDR is the IPv4 link-local block (RFC 3927). On clusters + // running NodeLocal DNSCache (node-local-dns), pods send DNS to a link-local + // address — 169.254.20.10 by the kube-standard `__PILLAR__LOCAL__DNS__` + // convention — served by a hostNetwork DNSCache pod on each node, which the + // kube-dns podSelector cannot match. Allowing the whole 169.254.0.0/16 block + // is the simplest correct rule and stays within Q105's attribution property: + // link-local is non-routable and node-scoped, so it cannot reach an arbitrary + // external resolver — the DNS-exfiltration channel Q105 closed stays closed + // (Q136). See dnsEgressRule. + dnsNodeLocalCIDR = "169.254.0.0/16" + // npProxyName is the NetworkPolicy that restricts proxy pod egress to GitHub CIDRs. npProxyName = gmcnames.ProxyName // npAGCName is the NetworkPolicy that gives AGC pods Kubernetes API server access (port 443). @@ -201,9 +212,16 @@ func metricsScrapeIngressRule() networkingv1.NetworkPolicyIngressRule { } // dnsEgressRule returns a NetworkPolicy egress rule permitting DNS (UDP/TCP 53) -// to the cluster DNS service ONLY — CoreDNS / kube-dns in kube-system — not to -// any destination. It is shared by the proxy, workload, and AGC policies so the -// DNS posture cannot drift between them. +// to the cluster DNS service ONLY — never to an arbitrary destination. It is shared by +// the proxy, workload, and AGC policies so the DNS posture cannot drift between +// them. Two `To` peers (OR'd) cover the two ways a pod reaches cluster DNS: +// +// 1. The kube-dns / CoreDNS Service in kube-system, matched by an AND of +// namespace + pod selector (the direct path on a cluster without NodeLocal +// DNSCache). 
+// 2. The IPv4 link-local block 169.254.0.0/16, matched by an ipBlock (the path +// on a cluster running NodeLocal DNSCache, where pods send DNS to a +// link-local address served by a per-node hostNetwork cache — Q136). // // An unrestricted port-53 rule (To: nil ≡ any server) is an unattributed // data-exfiltration side-channel: DNS queries can smuggle data to an @@ -212,8 +230,10 @@ func metricsScrapeIngressRule() networkingv1.NetworkPolicyIngressRule { // egress path forces traffic through the tenant proxy, whose source IPs are // attributable; confining DNS to the in-cluster resolver keeps it on that // attributable path — kube-dns recurses upstream on the pod's behalf, so the -// proxy can still resolve GitHub hostnames to do its job. Only the open "any -// resolver" breadth is removed, not legitimate resolution. +// proxy can still resolve GitHub hostnames to do its job. Both peers preserve +// that property: link-local 169.254.0.0/16 is non-routable and node-scoped, so +// it cannot reach an external resolver. Only the open "any resolver" breadth is +// removed, not legitimate resolution. // // kindnet does not enforce egress NetworkPolicy (see Q7b in // docs/plan/worker-egress-proxy.md), so this restriction is guarded at the @@ -224,17 +244,26 @@ func dnsEgressRule() networkingv1.NetworkPolicyEgressRule { proto53UDP := corev1.ProtocolUDP proto53TCP := corev1.ProtocolTCP return networkingv1.NetworkPolicyEgressRule{ - // A single peer with both selectors set is an AND: kube-dns pods *within* - // kube-system. Splitting them into two peers would be an OR and would also - // admit any pod labelled k8s-app=kube-dns in any namespace. - To: []networkingv1.NetworkPolicyPeer{{ - NamespaceSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{dnsNamespaceLabel: dnsNamespaceValue}, + To: []networkingv1.NetworkPolicyPeer{ + // A single peer with both selectors set is an AND: kube-dns pods *within* + // kube-system. 
Splitting them into two peers would be an OR and would also + // admit any pod labelled k8s-app=kube-dns in any namespace. + { + NamespaceSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{dnsNamespaceLabel: dnsNamespaceValue}, + }, + PodSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{dnsPodLabel: dnsPodValue}, + }, }, - PodSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{dnsPodLabel: dnsPodValue}, + // NodeLocal DNSCache path: pods reach the per-node hostNetwork cache at a + // link-local address (169.254.20.10 by convention). hostNetwork pods are + // not matched by any pod/namespace selector, so this peer is an ipBlock. + // The block is non-routable, so it does not widen the exfil surface. + { + IPBlock: &networkingv1.IPBlock{CIDR: dnsNodeLocalCIDR}, }, - }}, + }, Ports: []networkingv1.NetworkPolicyPort{ {Protocol: &proto53UDP, Port: ptr(intstr.FromInt32(53))}, {Protocol: &proto53TCP, Port: ptr(intstr.FromInt32(53))}, diff --git a/cmd/gmc/internal/controller/builder_test.go b/cmd/gmc/internal/controller/builder_test.go index b5e9f509..6d1171ad 100644 --- a/cmd/gmc/internal/controller/builder_test.go +++ b/cmd/gmc/internal/controller/builder_test.go @@ -596,12 +596,21 @@ func TestBuildNetworkPolicy_DNSEgressAlwaysPresent(t *testing.T) { } } -// TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS locks in Q105: the port-53 -// egress rule on every GMC-managed NetworkPolicy must target the cluster DNS -// service (kube-dns / CoreDNS in kube-system) and must NOT be open (To: nil ≡ any -// resolver). An open DNS path is an unattributed exfiltration side-channel that -// bypasses the per-tenant egress-IP attribution every other egress path enforces. 
-// This is an authoring/spec-level guard because kindnet does not enforce egress +// TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS locks in Q105 + Q136: the +// port-53 egress rule on every GMC-managed NetworkPolicy must target cluster DNS +// only and must NOT be open (To: nil ≡ any resolver). An open DNS path is an +// unattributed exfiltration side-channel that bypasses the per-tenant egress-IP +// attribution every other egress path enforces. +// +// Q136 widened the rule to two OR'd peers so NodeLocal DNSCache clusters resolve: +// +// - the kube-dns / CoreDNS pods in kube-system (AND of namespace + pod +// selector — the direct path), and +// - the link-local block 169.254.0.0/16 (ipBlock — the node-local cache path). +// +// Both peers stay within Q105's attribution property: link-local is non-routable +// and node-scoped, so it cannot reach an external resolver. This is an +// authoring/spec-level guard because kindnet does not enforce egress // NetworkPolicy (see Q7b) — mirroring the egress-negative guard pattern. 
func TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS(t *testing.T) { ag := newTestAG("gateway", "team-a") @@ -624,16 +633,28 @@ func TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS(t *testing.T) { rule := dnsRules[0] require.NotEmpty(t, rule.To, "%s port-53 rule must have a To peer — an empty To opens DNS to any resolver (Q105)", np.Name) - require.Len(t, rule.To, 1, "%s DNS rule should select kube-dns via a single peer", np.Name) - - peer := rule.To[0] - assert.Nil(t, peer.IPBlock, "%s DNS peer must not be an ipBlock", np.Name) - require.NotNil(t, peer.NamespaceSelector, "%s DNS peer must select the kube-dns namespace", np.Name) - require.NotNil(t, peer.PodSelector, "%s DNS peer must select the kube-dns pods", np.Name) - assert.Equal(t, dnsNamespaceValue, peer.NamespaceSelector.MatchLabels[dnsNamespaceLabel], + require.Len(t, rule.To, 2, + "%s DNS rule must select kube-dns AND the node-local link-local block (Q136)", np.Name) + + // Peer 1: the cluster DNS Service (kube-dns / CoreDNS) selector peer. + kubeDNS := rule.To[0] + assert.Nil(t, kubeDNS.IPBlock, "%s kube-dns DNS peer must not be an ipBlock", np.Name) + require.NotNil(t, kubeDNS.NamespaceSelector, "%s DNS peer must select the kube-dns namespace", np.Name) + require.NotNil(t, kubeDNS.PodSelector, "%s DNS peer must select the kube-dns pods", np.Name) + assert.Equal(t, dnsNamespaceValue, kubeDNS.NamespaceSelector.MatchLabels[dnsNamespaceLabel], "%s DNS peer namespace selector must match kube-system", np.Name) - assert.Equal(t, dnsPodValue, peer.PodSelector.MatchLabels[dnsPodLabel], + assert.Equal(t, dnsPodValue, kubeDNS.PodSelector.MatchLabels[dnsPodLabel], "%s DNS peer pod selector must match k8s-app=kube-dns", np.Name) + + // Peer 2: the NodeLocal DNSCache link-local ipBlock (Q136). 
+ nodeLocal := rule.To[1] + require.NotNil(t, nodeLocal.IPBlock, "%s DNS rule must allow the node-local link-local block (Q136)", np.Name) + assert.Nil(t, nodeLocal.NamespaceSelector, "%s node-local DNS peer must be a bare ipBlock", np.Name) + assert.Nil(t, nodeLocal.PodSelector, "%s node-local DNS peer must be a bare ipBlock", np.Name) + assert.Equal(t, "169.254.0.0/16", nodeLocal.IPBlock.CIDR, + "%s node-local DNS peer must be the link-local block 169.254.0.0/16", np.Name) + assert.Empty(t, nodeLocal.IPBlock.Except, + "%s node-local DNS ipBlock must not carve out exceptions", np.Name) } } diff --git a/cmd/gmc/test/e2e/provisioning_test.go b/cmd/gmc/test/e2e/provisioning_test.go index 36fe52e5..6e95f3cf 100644 --- a/cmd/gmc/test/e2e/provisioning_test.go +++ b/cmd/gmc/test/e2e/provisioning_test.go @@ -6,6 +6,7 @@ package e2e import ( "fmt" "os/exec" + "regexp" "time" . "github.com/onsi/ginkgo/v2" @@ -334,8 +335,10 @@ spec: Expect(workloadYAML).NotTo(MatchRegexp(`(?m)^\s*ingress:`), "workload NP must carry no ingress rules — an ingress: block means inbound was allowed to worker pods (Q128):\n%s", workloadYAML) - // DNS egress rule: port 53 on both UDP and TCP, no `to:` peers - // (allows DNS to any destination). + // DNS egress rule: port 53 on both UDP and TCP. DNS is confined to + // cluster DNS (kube-dns / CoreDNS in kube-system) plus the node-local + // link-local block — not "any resolver" (Q105/Q136); the peer shape is + // asserted below. 
Expect(workloadYAML).To(MatchRegexp(`(?s)port:\s*53\b.*protocol:\s*UDP`), "workload NP missing DNS UDP egress rule:\n%s", workloadYAML) Expect(workloadYAML).To(MatchRegexp(`(?s)port:\s*53\b.*protocol:\s*TCP`), @@ -350,11 +353,19 @@ spec: Expect(workloadYAML).To(MatchRegexp(`(?s)port:\s*8080.*podSelector:.*matchLabels:.*app:\s*actions-gateway-proxy`), "workload NP port-8080 egress rule missing podSelector app=actions-gateway-proxy (regression: rule broadened to any destination):\n%s", workloadYAML) - // The workload NP must NOT contain any egress to GitHub CIDRs — that - // is the proxy NP's job. The most likely regression is an ipBlock - // peer (any IPv4 cidr) appearing in the workload egress. - Expect(workloadYAML).NotTo(ContainSubstring("ipBlock:"), - "workload NP must not list any ipBlock egress peers (GitHub CIDRs belong on the proxy NP):\n%s", workloadYAML) + // The workload NP must NOT contain egress to GitHub CIDRs — that is the + // proxy NP's job. The only ipBlock permitted is the link-local block + // 169.254.0.0/16 used for NodeLocal DNSCache DNS egress (Q136): it is + // non-routable and node-scoped, so it cannot reach GitHub or an external + // resolver and preserves the per-tenant egress-IP attribution (Q105). + // Any other (routable) cidr — e.g. a GitHub CIDR leaking onto the + // workload NP — is a regression. 
+ Expect(workloadYAML).To(ContainSubstring("cidr: 169.254.0.0/16"), + "workload NP missing the node-local DNS link-local ipBlock 169.254.0.0/16 (Q136):\n%s", workloadYAML) + for _, m := range regexp.MustCompile(`cidr:\s*(\S+)`).FindAllStringSubmatch(workloadYAML, -1) { + Expect(m[1]).To(Equal("169.254.0.0/16"), + "workload NP egress has an unexpected ipBlock cidr %q — only the node-local DNS link-local block is allowed; GitHub CIDRs belong on the proxy NP (Q136):\n%s", m[1], workloadYAML) + } By("dumping the AGC NetworkPolicy as YAML") agcYAML, err := utils.Run(exec.Command("kubectl", "get", "networkpolicy", agcName, diff --git a/docs/STATUS.md b/docs/STATUS.md index d9b6816b..8a7a6746 100644 --- a/docs/STATUS.md +++ b/docs/STATUS.md @@ -57,7 +57,6 @@ Specific actionable items in priority order. Pick from the top; skip 🚫 items | Q131 | Flake: TestListener_IdleNotShutdownIfLast poll-count timing | `tests` `bug` | 🔲 | S | goroutine_test.go:419 asserted poll count ≥5 but got 3 ("poll past threshold when last listener") on a local `make check`; passed on rerun (-count=3) w/o code change. Timing-sensitive idle-shutdown test; tighten synchronization, not the count. | | Q113 | Flake: eviction integration tests time out in waitForWorkerPod | `tests` `bug` | 🔲 | S | EvictionTriggersRequeue + EvictionBudgetExhausted (failure_recovery_test.go:142) timed out (20s) on CI run 27383065643, passed on rerun w/o code change. Suspects: sessions[len-1] pick on shared brokerStub; 20s budget on 2-vCPU runner. | | Q120 | Flake: SIGTERM integration test misses session-delete budget | `tests` `bug` | 🔲 | S | TestAGC_SIGTERM_DeletesAllSessions: session-39 missed the 10s WaitForSessionDelete on CI run 27422248358 (PR 209), passed on rerun w/o code change. ~40 concurrent teardowns on a 2-vCPU runner; same shared-brokerStub/budget class as Q113. 
| -| Q136 | node-local-dns (NodeLocal DNSCache) support for tenant DNS egress | `security` `infra` `1.0-gate` | 🔲 | S | Q105 DNS rule (kube-dns podSelector) breaks NodeLocal DNSCache: pods query link-local 169.254.20.10 (hostNetwork pod) → dropped on enforcing CNI, incl proxy. Fix: also allow port-53 to 169.254.0.0/16 (link-local, non-routable, keeps Q105). | | Q137 | AGC RunnerGroup not re-reconciled after a non-retriable baseline-listener exit | `bug` `infra` | 🔲 | S | runnergroup_controller.go returns RequeueAfter=reapAfter (0 with no worker pods); if the permanent baseline exits non-retriably the L317 restart only fires on the next watch event — up to the 10h resync. Requeue when ActiveCountQ138 | Bounded-by-default HTTP clients — retrofit http.DefaultClient fallbacks + lint gate | `infra` `bug` `tests` | 🔲 | M | ~8 prod clients default to http.DefaultClient (no read timeout); a slow peer wedges the goroutine (Q134 class). Add a bounded-by-default httpx client, make long-poll the explicit exception, and gate new uses with forbidigo+noctx. | | Q98 | Helm chart distribution/publishing pipeline | `infra` `1.0-gate` | 🔲 | M | Pipeline shipped: publish.yml chart-publish job packages, pushes (oci://ghcr/charts), and cosign-signs the chart. Remaining (first v* tag): live publish proof, flip prerelease annotation, oci:// in upgrade.md/README, Artifact Hub listing. | diff --git a/docs/design/05-security.md b/docs/design/05-security.md index 5490c1e8..55922277 100644 --- a/docs/design/05-security.md +++ b/docs/design/05-security.md @@ -30,7 +30,7 @@ Several mitigations below rest on the per-tenant NetworkPolicies the GMC reconci | **AGC Token Compromise** | High | The AGC never saves plaintext keys to disk. GitHub App private keys are mounted as read-only volumes with restrictive file permissions (0400). 
| | **Credential Leak via Logged Error Bodies** | Medium | The AGC, broker client, and probe interpolate upstream GitHub HTTP response bodies into errors that callers log. Some of these bodies carry credential material — the runner-token endpoint's 200 body holds an access token, and `generate-jitconfig`'s body holds the runner JIT registration credential plus RSA key. Before any upstream body is placed into an error or log line it passes through a single shared redactor (`githubapp.SanitizeBody`) that strips credential-shaped substrings (GitHub `gh*_`/`github_pat_` tokens, JWTs, `access_token`/`encoded_jit_config`/`private_key`/`secret` JSON values, and long opaque base64 blobs) and caps the result. Redaction runs before capping so a secret straddling the cap boundary cannot survive in the truncated tail. No secret is ever logged directly; this control hardens the indirect path. | | **Eviction-Retry API Misuse** | Medium | The AGC calls `POST /repos/{owner}/{repo}/actions/runs/{run_id}/rerun-failed-jobs` using the tenant's installation access token when a worker pod is evicted. The blast radius is bounded: the installation token is scoped to the GitHub App's installation on a specific organization or repository, so the AGC cannot re-run jobs belonging to other tenants or organizations. The `run_id` is extracted from the job payload delivered by GitHub's broker — the AGC cannot fabricate or substitute a run ID for a run it did not acquire. To prevent abuse of the retry path (e.g. a compromised AGC looping re-runs), `maxEvictionRetries` caps the number of automatic retries per job and is enforced before the API call is made. Operators should monitor `actions_gateway_eviction_retries_exhausted_total` to detect abnormal eviction patterns. | -| **DNS Exfiltration Side-Channel** (Unattributed Egress) | Medium | The per-tenant egress-IP attribution that isolates tenants rests on *all* real egress traversing the tenant proxy, whose source IPs are attributable. 
An unrestricted port-53 egress rule (`to: []` ≡ any resolver) would defeat that: any pod — including untrusted worker job code — could smuggle data out by encoding it into DNS queries aimed at an attacker-controlled authoritative server, an unattributed side-channel that never touches the proxy. All three per-tenant NetworkPolicies (workload, AGC, proxy) therefore confine port-53 egress to the **cluster DNS service only** (`kube-dns` / `CoreDNS` in `kube-system`, matched by `namespaceSelector` on `kubernetes.io/metadata.name: kube-system` plus `podSelector` on `k8s-app: kube-dns`). `kube-dns` recurses upstream on the pod's behalf, so legitimate resolution (including the proxy's own GitHub-hostname lookups) is unaffected — only the "any resolver" breadth is removed (Q105). Like the other egress negatives, this is enforced only by a policy-aware CNI; the reliable CI guard is the authoring-level test `TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS`, which asserts every policy's DNS rule selects kube-dns and is never open. | +| **DNS Exfiltration Side-Channel** (Unattributed Egress) | Medium | The per-tenant egress-IP attribution that isolates tenants rests on *all* real egress traversing the tenant proxy, whose source IPs are attributable. An unrestricted port-53 egress rule (`to: []` ≡ any resolver) would defeat that: any pod — including untrusted worker job code — could smuggle data out by encoding it into DNS queries aimed at an attacker-controlled authoritative server, an unattributed side-channel that never touches the proxy. 
All three per-tenant NetworkPolicies (workload, AGC, proxy) therefore confine port-53 egress to **cluster DNS only**, via two OR'd peers: the `kube-dns` / `CoreDNS` Service in `kube-system` (matched by `namespaceSelector` on `kubernetes.io/metadata.name: kube-system` plus `podSelector` on `k8s-app: kube-dns`), and — for clusters running NodeLocal DNSCache (`node-local-dns`), where pods send DNS to a per-node `hostNetwork` cache at a link-local address no selector can match — the link-local block `169.254.0.0/16` (`ipBlock`, Q136). Both peers preserve the attribution property: `kube-dns` recurses upstream on the pod's behalf so legitimate resolution (including the proxy's own GitHub-hostname lookups) is unaffected, and `169.254.0.0/16` is **non-routable and node-scoped** — it cannot reach an arbitrary external resolver, so the exfiltration channel stays closed while the "any resolver" breadth remains removed (Q105/Q136). Like the other egress negatives, this is enforced only by a policy-aware CNI; the reliable CI guard is the authoring-level test `TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS`, which asserts every policy's DNS rule selects kube-dns **and** the link-local block and is never open. | | **Proxy as Traffic Interception Point** | Medium | The proxy only handles CONNECT tunneling and does not terminate TLS. It cannot inspect or modify the encrypted payload between the AGC/worker and GitHub. Proxy pods run with a read-only root filesystem and no elevated capabilities. | | **Cross-Tenant Proxy CA Trust** | Medium | The egress proxy's TLS cert is signed by a cert-manager-issued self-signed CA stored in the per-tenant `actions-gateway-proxy-tls` Secret. The AGC pins this CA explicitly (via its trust pool) rather than trusting the cluster's root store, and worker pods install the same CA into a combined `SSL_CERT_FILE` bundle so Runner.Worker's .NET HttpClient accepts the proxy handshake. 
The cert (`tls.crt`) is projected into both AGC and worker pods via an `Items: [tls.crt]` Secret volume; the private key (`tls.key`) is mounted *only* into the proxy pod itself, so a runner compromise does not yield the ability to forge a proxy cert. Trust is tenant-scoped: each tenant's CA is independent, so a compromised CA in one namespace cannot mint a cert trusted by another tenant's AGC or workers. | | **Egress IP Change Mid-Session** | Low–Unknown | GitHub's broker protocol is token-based, not IP-bound. Session IDs and bearer tokens carry no IP affinity, so rotating across proxy pods mid-job is expected to work. The Twirp log stream is naturally sticky (long-lived HTTP/2 connection stays on one proxy pod once open). Impact is unknown because GitHub's abuse detection heuristics are undocumented. **Early mitigation: the [Milestone 1](06-implementation-phases.md#milestone-1-wire-protocol-probe-days-14) wire protocol probe explicitly tests broker API calls routed through a multi-pod proxy pool to confirm GitHub does not reject or flag IP variance across `CreateSession → GetMessage → AcquireJob`.** If the probe surfaces a problem, `ClientIP` session affinity on the proxy Service is the low-effort fallback; explicit per-goroutine proxy assignment is the higher-fidelity option if needed. | diff --git a/docs/design/network-architecture.md b/docs/design/network-architecture.md index c259898c..007086f8 100644 --- a/docs/design/network-architecture.md +++ b/docs/design/network-architecture.md @@ -78,9 +78,13 @@ spec: - Egress ingress: [] # no ingress permitted egress: - # DNS — needed for resolving the proxy Service name. Confined to the cluster - # DNS service (kube-dns / CoreDNS in kube-system), not "any resolver": an - # open port-53 rule is an unattributed exfiltration side-channel (Q105). + # DNS — needed for resolving the proxy Service name. 
Confined to cluster DNS, + # not "any resolver": an open port-53 rule is an unattributed exfiltration + # side-channel (Q105). Two OR'd peers cover both delivery paths: the kube-dns + # / CoreDNS pods in kube-system (direct path), and the link-local block + # 169.254.0.0/16 for NodeLocal DNSCache clusters where pods send DNS to a + # per-node hostNetwork cache (Q136). Link-local is non-routable, so it does + # not widen the exfil surface. - to: - namespaceSelector: matchLabels: @@ -88,6 +92,8 @@ spec: podSelector: matchLabels: k8s-app: kube-dns + - ipBlock: + cidr: 169.254.0.0/16 ports: - protocol: UDP port: 53 @@ -127,7 +133,8 @@ spec: policyTypes: - Egress egress: - # DNS — confined to cluster DNS (kube-dns / CoreDNS in kube-system); see Q105. + # DNS — confined to cluster DNS (kube-dns / CoreDNS in kube-system) plus the + # link-local block for NodeLocal DNSCache; see Q105/Q136. - to: - namespaceSelector: matchLabels: @@ -135,6 +142,8 @@ spec: podSelector: matchLabels: k8s-app: kube-dns + - ipBlock: + cidr: 169.254.0.0/16 ports: - protocol: UDP port: 53 @@ -178,10 +187,11 @@ spec: - port: 8080 protocol: TCP egress: - # DNS — proxy resolves GitHub hostnames on behalf of clients. Confined to the - # cluster DNS service (kube-dns / CoreDNS in kube-system); kube-dns recurses - # upstream so external names still resolve, but the proxy cannot reach an - # arbitrary resolver — closing the open-DNS exfiltration side-channel (Q105). + # DNS — proxy resolves GitHub hostnames on behalf of clients. Confined to + # cluster DNS (kube-dns / CoreDNS in kube-system) plus the link-local block + # for NodeLocal DNSCache; kube-dns recurses upstream so external names still + # resolve, but the proxy cannot reach an arbitrary resolver — closing the + # open-DNS exfiltration side-channel (Q105/Q136). 
- to: - namespaceSelector: matchLabels: @@ -189,6 +199,8 @@ spec: podSelector: matchLabels: k8s-app: kube-dns + - ipBlock: + cidr: 169.254.0.0/16 ports: - protocol: UDP port: 53 @@ -218,7 +230,14 @@ All in-cluster service discovery uses Kubernetes DNS (`kube-dns` / `CoreDNS`). T External DNS resolution (for GitHub hostnames) is performed by the proxy pods themselves, not by the AGC or worker pods — the AGC and workers connect to the proxy using `CONNECT <host>:<port>` and the proxy resolves the hostname on their behalf. This means the proxy pods must have egress access to the cluster's DNS resolver in addition to GitHub's IP ranges. -DNS egress on all three policies is **confined to the cluster DNS service** (`kube-dns` / `CoreDNS` in `kube-system`, matched by `namespaceSelector` on the well-known `kubernetes.io/metadata.name: kube-system` label plus a `podSelector` on the conventional `k8s-app: kube-dns` label) rather than left open to any resolver (Q105). An unrestricted port-53 rule (`to: []`) would let any pod smuggle data to an attacker-controlled resolver — an unattributed side-channel that bypasses the per-tenant egress-IP attribution every other egress path enforces. Confining DNS to the in-cluster resolver keeps resolution on the attributable path: `kube-dns` recurses upstream on the pod's behalf, so external GitHub names still resolve while no pod can reach an arbitrary DNS server directly. Operators running a DNS service under a non-standard namespace or pod label must adjust the selector accordingly (or supply their own equivalent rule under `spec.proxy.managedNetworkPolicy: false`). +DNS egress on all three policies is **confined to cluster DNS** rather than left open to any resolver (Q105). An unrestricted port-53 rule (`to: []`) would let any pod smuggle data to an attacker-controlled resolver — an unattributed side-channel that bypasses the per-tenant egress-IP attribution every other egress path enforces. 
Confining DNS to the in-cluster resolver keeps resolution on the attributable path: `kube-dns` recurses upstream on the pod's behalf, so external GitHub names still resolve while no pod can reach an arbitrary DNS server directly. + +Each DNS rule allows two OR'd peers, covering the two ways a pod reaches cluster DNS: + +- **Direct path** — the `kube-dns` / `CoreDNS` Service in `kube-system`, matched by `namespaceSelector` on the well-known `kubernetes.io/metadata.name: kube-system` label plus a `podSelector` on the conventional `k8s-app: kube-dns` label. +- **NodeLocal DNSCache path** — the IPv4 link-local block `169.254.0.0/16`, matched by an `ipBlock` (Q136). On clusters running [NodeLocal DNSCache](https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/) (`node-local-dns`), pods send DNS to a link-local address (`169.254.20.10` by the kube-standard `__PILLAR__LOCAL__DNS__` convention) served by a per-node `hostNetwork` DNSCache pod, which no pod/namespace selector can match. Allowing the whole link-local block is the simplest correct rule and **preserves Q105's attribution property**: `169.254.0.0/16` is non-routable and node-scoped, so it cannot reach an external resolver — the DNS-exfiltration channel Q105 closed stays closed. + +Operators running a DNS service under a non-standard namespace or pod label must adjust the selector accordingly (or supply their own equivalent rule under `spec.proxy.managedNetworkPolicy: false`). --- diff --git a/docs/operations/security-operations.md b/docs/operations/security-operations.md index 80c02a32..9b63413a 100644 --- a/docs/operations/security-operations.md +++ b/docs/operations/security-operations.md @@ -381,16 +381,19 @@ Triage the report against this operator's needs: relabel the DNS pods or set `spec.proxy.managedNetworkPolicy: false` and supply your own DNS egress rule. Symptom: tenant workloads time out on every lookup while non-DNS connectivity is unaffected. 
-- **Known limitation — NodeLocal DNSCache (`node-local-dns`) is not yet - supported out of the box.** With node-local-dns, pods send queries to a - link-local IP (default `169.254.20.10`) served by a `hostNetwork` - `node-local-dns` pod, not to a `k8s-app: kube-dns` CoreDNS pod — so on an - enforcing CNI the kube-dns-only rule drops those queries, breaking resolution - for workers **and** the proxy. Until first-class support lands - ([Q136](../STATUS.md#Q136)), grant it with an additive NetworkPolicy allowing - port-53 egress to `169.254.0.0/16` (link-local is non-routable off-node, so - this preserves the no-arbitrary-resolver property) for the workload and proxy - pods — see [Tenant egress posture & deliberate widening](#tenant-egress-posture--deliberate-widening). +- **NodeLocal DNSCache (`node-local-dns`) is supported.** With node-local-dns, + pods send queries to a link-local IP (default `169.254.20.10`) served by a + `hostNetwork` `node-local-dns` pod instead of a `k8s-app: kube-dns` CoreDNS pod, + and the kube-dns podSelector cannot match that cache pod. The tenant NetworkPolicies + therefore allow port-53 egress to the link-local block `169.254.0.0/16` as a + second peer alongside the kube-dns selector (Q136), so both topologies resolve + out of the box with no operator action. Link-local traffic is never forwarded + off-link, so this preserves the no-arbitrary-resolver property of Q105 — + a pod cannot address an attacker-chosen resolver through the link-local block. Note that the block may also cover a cloud provider's own link-local resolver (for example AWS `169.254.169.253`); supply a narrower rule if that resolver must stay unreachable from tenant pods. If your node-local-dns + cache listens on a non-default address *outside* `169.254.0.0/16`, set + `spec.proxy.managedNetworkPolicy: false` and supply your own DNS egress rule, + or add an additive NetworkPolicy — see + [Tenant egress posture & deliberate widening](#tenant-egress-posture--deliberate-widening). - **Findings that don't apply** (managed control plane hides the file, a check for a component you don't run) — record the justification alongside the cluster's onboarding ticket.