diff --git a/cmd/gmc/internal/controller/builder.go b/cmd/gmc/internal/controller/builder.go
index fb4ca5ab..05d53012 100644
--- a/cmd/gmc/internal/controller/builder.go
+++ b/cmd/gmc/internal/controller/builder.go
@@ -73,6 +73,19 @@ const (
metricsScrapeNamespaceLabel = "metrics"
metricsScrapeNamespaceValue = "enabled"
+ // dnsNamespaceLabel / dnsNamespaceValue and dnsPodLabel / dnsPodValue select
+ // the cluster DNS service (CoreDNS / kube-dns) as the sole permitted DNS
+ // egress peer. The namespace is matched via the well-known immutable
+ // `kubernetes.io/metadata.name` label that every Kubernetes ≥1.21 stamps on
+ // each namespace (so no manual labelling of kube-system is required); the
+ // pods via the conventional `k8s-app: kube-dns` label CoreDNS carries by
+ // default in every distribution this controller targets. See dnsEgressRule
+ // for why egress is confined to this peer (Q105).
+ dnsNamespaceLabel = "kubernetes.io/metadata.name"
+ dnsNamespaceValue = "kube-system"
+ dnsPodLabel = "k8s-app"
+ dnsPodValue = "kube-dns"
+
// npProxyName is the NetworkPolicy that restricts proxy pod egress to GitHub CIDRs.
npProxyName = gmcnames.ProxyName
// npAGCName is the NetworkPolicy that gives AGC pods Kubernetes API server access (port 443).
@@ -187,19 +200,53 @@ func metricsScrapeIngressRule() networkingv1.NetworkPolicyIngressRule {
}
}
-// buildProxyNetworkPolicy constructs the NetworkPolicy for proxy pods.
-// Proxy pods may reach GitHub CIDRs on 443 (for CONNECT tunneling) and DNS.
-// Only workload pods (AGC and workers) may initiate connections to the proxy.
-func buildProxyNetworkPolicy(ag *gmcv1alpha1.ActionsGateway, githubCIDRs []net.IPNet) *networkingv1.NetworkPolicy {
+// dnsEgressRule returns a NetworkPolicy egress rule permitting DNS (UDP/TCP 53)
+// to the cluster DNS service ONLY — CoreDNS / kube-dns in kube-system — not to
+// any destination. It is shared by the proxy, workload, and AGC policies so the
+// DNS posture cannot drift between them.
+//
+// An unrestricted port-53 rule (To: nil ≡ any server) is an unattributed
+// data-exfiltration side-channel: DNS queries can smuggle data to an
+// attacker-controlled resolver, bypassing the per-tenant egress-IP attribution
+// that is a headline isolation property of this system (Q105). Every other
+// egress path forces traffic through the tenant proxy, whose source IPs are
+// attributable; confining DNS to the in-cluster resolver keeps it on that
+// attributable path — kube-dns recurses upstream on the pod's behalf, so the
+// proxy can still resolve GitHub hostnames to do its job. Only the open "any
+// resolver" breadth is removed, not legitimate resolution.
+//
+// kindnet does not enforce egress NetworkPolicy (see Q7b in
+// docs/plan/worker-egress-proxy.md), so this restriction is guarded at the
+// spec/authoring level by TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS
+// rather than by a live e2e deny test; a runtime negative needs a
+// policy-enforcing CNI such as Calico.
+func dnsEgressRule() networkingv1.NetworkPolicyEgressRule {
proto53UDP := corev1.ProtocolUDP
proto53TCP := corev1.ProtocolTCP
-
- egress := []networkingv1.NetworkPolicyEgressRule{{
+ return networkingv1.NetworkPolicyEgressRule{
+ // A single peer with both selectors set is an AND: kube-dns pods *within*
+ // kube-system. Splitting them into two peers would be an OR and would also
+ // admit any pod labelled k8s-app=kube-dns in any namespace.
+ To: []networkingv1.NetworkPolicyPeer{{
+ NamespaceSelector: &metav1.LabelSelector{
+ MatchLabels: map[string]string{dnsNamespaceLabel: dnsNamespaceValue},
+ },
+ PodSelector: &metav1.LabelSelector{
+ MatchLabels: map[string]string{dnsPodLabel: dnsPodValue},
+ },
+ }},
Ports: []networkingv1.NetworkPolicyPort{
{Protocol: &proto53UDP, Port: ptr(intstr.FromInt32(53))},
{Protocol: &proto53TCP, Port: ptr(intstr.FromInt32(53))},
},
- }}
+ }
+}
+
+// buildProxyNetworkPolicy constructs the NetworkPolicy for proxy pods.
+// Proxy pods may reach GitHub CIDRs on 443 (for CONNECT tunneling) and DNS.
+// Only workload pods (AGC and workers) may initiate connections to the proxy.
+func buildProxyNetworkPolicy(ag *gmcv1alpha1.ActionsGateway, githubCIDRs []net.IPNet) *networkingv1.NetworkPolicy {
+ egress := []networkingv1.NetworkPolicyEgressRule{dnsEgressRule()}
managed := ag.Spec.Proxy.ManagedNetworkPolicy == nil || *ag.Spec.Proxy.ManagedNetworkPolicy
if managed && len(githubCIDRs) > 0 {
peers := make([]networkingv1.NetworkPolicyPeer, 0, len(githubCIDRs))
@@ -261,16 +308,8 @@ func buildProxyNetworkPolicy(ag *gmcv1alpha1.ActionsGateway, githubCIDRs []net.I
// additively re-admits the monitoring metrics scrape, so default-deny here costs it
// nothing.
func buildWorkloadNetworkPolicy(ag *gmcv1alpha1.ActionsGateway) *networkingv1.NetworkPolicy {
- proto53UDP := corev1.ProtocolUDP
- proto53TCP := corev1.ProtocolTCP
-
egress := []networkingv1.NetworkPolicyEgressRule{
- {
- Ports: []networkingv1.NetworkPolicyPort{
- {Protocol: &proto53UDP, Port: ptr(intstr.FromInt32(53))},
- {Protocol: &proto53TCP, Port: ptr(intstr.FromInt32(53))},
- },
- },
+ dnsEgressRule(),
{
Ports: []networkingv1.NetworkPolicyPort{{Port: ptr(intstr.FromInt32(proxyPort))}},
To: []networkingv1.NetworkPolicyPeer{{
@@ -322,9 +361,6 @@ func buildWorkloadNetworkPolicy(ag *gmcv1alpha1.ActionsGateway) *networkingv1.Ne
// this, the AGC NP carried no ingress policy type and any pod in the namespace
// could scrape per-tenant metrics off the controller-runtime metrics server.
func buildAGCNetworkPolicy(ag *gmcv1alpha1.ActionsGateway) *networkingv1.NetworkPolicy {
- proto53UDP := corev1.ProtocolUDP
- proto53TCP := corev1.ProtocolTCP
-
return &networkingv1.NetworkPolicy{
ObjectMeta: metav1.ObjectMeta{Name: npAGCName, Namespace: ag.Namespace, Labels: managedLabels(ag)},
Spec: networkingv1.NetworkPolicySpec{
@@ -332,12 +368,7 @@ func buildAGCNetworkPolicy(ag *gmcv1alpha1.ActionsGateway) *networkingv1.Network
PolicyTypes: []networkingv1.PolicyType{networkingv1.PolicyTypeEgress, networkingv1.PolicyTypeIngress},
Ingress: []networkingv1.NetworkPolicyIngressRule{metricsScrapeIngressRule()},
Egress: []networkingv1.NetworkPolicyEgressRule{
- {
- Ports: []networkingv1.NetworkPolicyPort{
- {Protocol: &proto53UDP, Port: ptr(intstr.FromInt32(53))},
- {Protocol: &proto53TCP, Port: ptr(intstr.FromInt32(53))},
- },
- },
+ dnsEgressRule(),
{
Ports: []networkingv1.NetworkPolicyPort{
{Port: ptr(intstr.FromInt32(443))},
diff --git a/cmd/gmc/internal/controller/builder_test.go b/cmd/gmc/internal/controller/builder_test.go
index 0ae24049..b5e9f509 100644
--- a/cmd/gmc/internal/controller/builder_test.go
+++ b/cmd/gmc/internal/controller/builder_test.go
@@ -596,6 +596,47 @@ func TestBuildNetworkPolicy_DNSEgressAlwaysPresent(t *testing.T) {
}
}
+// TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS locks in Q105: the port-53
+// egress rule on every GMC-managed NetworkPolicy must target the cluster DNS
+// service (kube-dns / CoreDNS in kube-system) and must NOT be open (To: nil ≡ any
+// resolver). An open DNS path is an unattributed exfiltration side-channel that
+// bypasses the per-tenant egress-IP attribution every other egress path enforces.
+// This is an authoring/spec-level guard because kindnet does not enforce egress
+// NetworkPolicy (see Q7b) — mirroring the egress-negative guard pattern.
+func TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS(t *testing.T) {
+ ag := newTestAG("gateway", "team-a")
+ for _, np := range []*networkingv1.NetworkPolicy{
+ buildProxyNetworkPolicy(ag, nil),
+ buildWorkloadNetworkPolicy(ag),
+ buildAGCNetworkPolicy(ag),
+ } {
+ var dnsRules []networkingv1.NetworkPolicyEgressRule
+ for _, rule := range np.Spec.Egress {
+ for _, port := range rule.Ports {
+ if port.Port != nil && port.Port.IntVal == 53 {
+ dnsRules = append(dnsRules, rule)
+ break
+ }
+ }
+ }
+ require.Len(t, dnsRules, 1, "%s must carry exactly one port-53 egress rule", np.Name)
+
+ rule := dnsRules[0]
+ require.NotEmpty(t, rule.To,
+ "%s port-53 rule must have a To peer — an empty To opens DNS to any resolver (Q105)", np.Name)
+ require.Len(t, rule.To, 1, "%s DNS rule should select kube-dns via a single peer", np.Name)
+
+ peer := rule.To[0]
+ assert.Nil(t, peer.IPBlock, "%s DNS peer must not be an ipBlock", np.Name)
+ require.NotNil(t, peer.NamespaceSelector, "%s DNS peer must select the kube-dns namespace", np.Name)
+ require.NotNil(t, peer.PodSelector, "%s DNS peer must select the kube-dns pods", np.Name)
+ assert.Equal(t, dnsNamespaceValue, peer.NamespaceSelector.MatchLabels[dnsNamespaceLabel],
+ "%s DNS peer namespace selector must match kube-system", np.Name)
+ assert.Equal(t, dnsPodValue, peer.PodSelector.MatchLabels[dnsPodLabel],
+ "%s DNS peer pod selector must match k8s-app=kube-dns", np.Name)
+ }
+}
+
func TestBuildAGCNetworkPolicy_PodSelectorIsAGCOnly(t *testing.T) {
ag := newTestAG("gateway", "team-a")
np := buildAGCNetworkPolicy(ag)
diff --git a/docs/STATUS.md b/docs/STATUS.md
index 75790331..ded9359d 100644
--- a/docs/STATUS.md
+++ b/docs/STATUS.md
@@ -56,6 +56,7 @@ Specific actionable items in priority order. Pick from the top; skip 🚫 items
| Q131 | Flake: TestListener_IdleNotShutdownIfLast poll-count timing | `tests` `bug` | 🔲 | S | goroutine_test.go:419 asserted poll count ≥5 but got 3 ("poll past threshold when last listener") on a local `make check`; passed on rerun (-count=3) w/o code change. Timing-sensitive idle-shutdown test; tighten synchronization, not the count. |
| Q113 | Flake: eviction integration tests time out in waitForWorkerPod | `tests` `bug` | 🔲 | S | EvictionTriggersRequeue + EvictionBudgetExhausted (failure_recovery_test.go:142) timed out (20s) on CI run 27383065643, passed on rerun w/o code change. Suspects: sessions[len-1] pick on shared brokerStub; 20s budget on 2-vCPU runner. |
| Q120 | Flake: SIGTERM integration test misses session-delete budget | `tests` `bug` | 🔲 | S | TestAGC_SIGTERM_DeletesAllSessions: session-39 missed the 10s WaitForSessionDelete on CI run 27422248358 (PR 209), passed on rerun w/o code change. ~40 concurrent teardowns on a 2-vCPU runner; same shared-brokerStub/budget class as Q113. |
+| Q136 | node-local-dns (NodeLocal DNSCache) support for tenant DNS egress | `security` `infra` `1.0-gate` | 🔲 | S | Q105 DNS rule (kube-dns podSelector) breaks NodeLocal DNSCache: pods query link-local 169.254.20.10 (hostNetwork pod) → dropped on enforcing CNI, incl proxy. Fix: also allow port-53 to 169.254.0.0/16 (link-local, non-routable, keeps Q105). |
| Q98 | Helm chart distribution/publishing pipeline | `infra` `1.0-gate` | 🔲 | M | Pipeline shipped: publish.yml chart-publish job packages, pushes (oci://ghcr/charts), and cosign-signs the chart. Remaining (first v* tag): live publish proof, flip prerelease annotation, oci:// in upgrade.md/README, Artifact Hub listing. |
| Q112 | GMC Events silently 403'd: recorder writes events.k8s.io, RBAC grants core only | `bug` `infra` | 🔲 | S | Same root cause as the AGC fix in PR 202 (Q95): GMC uses mgr.GetEventRecorder (writes events.k8s.io/v1) but its kubebuilder marker grants only core "" events, so every GMC Event is dropped. Fix marker + `make manifests`; assert one event in e2e. |
| Q9 | [M3-tests remaining items (H2/M/L)](plan/milestone-3-tests.md) | `milestone` `tests` | 🔲 | M | **Unblocked** — M3 metric assertions (H1) landed. Highest-leverage remaining: **H2** (rerun-API 5xx contract), **H3** (decryption-failure fallback), **M3** (`activePodCount` Pending branch). Worth picking up after 5c–5g. |
@@ -86,7 +87,6 @@ Specific actionable items in priority order. Pick from the top; skip 🚫 items
| Q89 | Per-tenant `spec.logLevel` CRD knob | `infra` | 🔲 | M | Logging-audit Theme G (post-1.0, after F1): add `spec.logLevel` (info\|debug) to ActionsGateway, threaded to AGC+proxy like `securityProfile` (rolling restart). Needs CRD+operator docs. See [logging-audit](plan/logging-audit.md). |
| Q103 | No SLSA build provenance attestation on images | `security` `infra` | 🔲 | S | publish.yml signs + SBOM-attests (cosign) but emits no provenance predicate: no provenance: on build-push, no actions/attest-build-provenance. Dockerfiles advertise SLSA-L3 reproducibility with nothing backing it. Add provenance attestation. |
| Q104 | ServiceMonitor scrapes metrics with insecureSkipVerify:true | `security` `infra` | 🔲 | S | templates/servicemonitor.yaml sets tlsConfig.insecureSkipVerify:true (self-signed metrics cert) — MITM-able scrape. cert-manager is wired for the webhook but not metrics. Offer a cert-manager-issued metrics cert toggle. Overlaps [Q72](#Q72). |
-| Q105 | Worker/proxy DNS egress is unrestricted (port 53 to any) | `security` `docs` | 🔲 | S | builder.go:197,254 emit a port-53 egress rule with no To peer, so workers/proxy can resolve via any server — a DNS exfil channel bypassing per-tenant egress-IP attribution. Restrict to kube-dns or document the gap in 05-security.md. |
| Q127 | [Security-hardening batch from audit 2](plan/security-audit-2026-06.md) | `security` | 🔲 | M | 8 small items (see plan doc): PSA-guard hardcoded SA name; AG singleton guard; validate noProxyCIDRs; CONNECT TLS MinVersion; checksum tool downloads (cosign!); no GHA cache on release builds; AGC any-dest 443; privileged-webhook incoherence. |
| Q133 | Platform-gated eligibility for securityProfile: privileged | `security` | 🔲 | M | A tenant self-selects securityProfile: privileged at create; only downgrades are webhook-gated. Eligibility to run privileged should be a platform call — gate it behind a platform-applied namespace label. Extends Q127 item 8 (profile-aware webhook) |
| Q106 | Non-atomic eviction-retry counter race | `bug` | 🔲 | S | handleEviction reads-modifies-writes the count with no per-key lock (provisioner.go:451). Two concurrent evictions of the same run_id both pass the budget check and both call rerun-failed-jobs, exceeding the budget. Serialize per runID (atomic CAS). |
diff --git a/docs/design/05-security.md b/docs/design/05-security.md
index 1084d433..6634c469 100644
--- a/docs/design/05-security.md
+++ b/docs/design/05-security.md
@@ -20,7 +20,7 @@ The two-tier architecture introduces both stronger isolation guarantees and new
## 5.2. AGC & Proxy-Level Threats (Namespace-Scoped)
-Several mitigations below rest on the per-tenant NetworkPolicies the GMC reconciles (workload egress restricted to DNS + proxy; only AGC-labelled pods get apiserver egress; **workload pods default-deny all ingress** — Q128). The ingress default-deny matters because worker pods run untrusted GitHub Actions job code and are outbound-only by design (they long-poll/dial out to GitHub via the proxy and to the AGC); nothing legitimately initiates a connection *to* a worker, so the workload NP declares `policyTypes: [Ingress, Egress]` with an empty ingress rule set. Without it, worker pods were default-allow ingress and any pod in the cluster could open connections to untrusted job code — a lateral-movement / cross-tenant channel. NetworkPolicy objects are inert unless the cluster CNI enforces them — kind's default kindnet does **not** drop traffic, so production clusters must run a policy-enforcing CNI (Calico, Cilium, or equivalent). Runtime enforcement of the egress negatives was observed on a Calico kind cluster on 2026-06-11 (Q7b; see [network-architecture.md § How to Validate Network Isolation](network-architecture.md#how-to-validate-network-isolation)).
+Several mitigations below rest on the per-tenant NetworkPolicies the GMC reconciles (workload egress restricted to DNS + proxy, with **DNS itself confined to the cluster DNS service** rather than any resolver — Q105; only AGC-labelled pods get apiserver egress; **workload pods default-deny all ingress** — Q128). The ingress default-deny matters because worker pods run untrusted GitHub Actions job code and are outbound-only by design (they long-poll/dial out to GitHub via the proxy and to the AGC); nothing legitimately initiates a connection *to* a worker, so the workload NP declares `policyTypes: [Ingress, Egress]` with an empty ingress rule set. Without it, worker pods were default-allow ingress and any pod in the cluster could open connections to untrusted job code — a lateral-movement / cross-tenant channel. NetworkPolicy objects are inert unless the cluster CNI enforces them — kind's default kindnet does **not** drop traffic, so production clusters must run a policy-enforcing CNI (Calico, Cilium, or equivalent). Runtime enforcement of the egress negatives was observed on a Calico kind cluster on 2026-06-11 (Q7b; see [network-architecture.md § How to Validate Network Isolation](network-architecture.md#how-to-validate-network-isolation)).
| Threat Vector | Impact | Mitigation Strategy |
| --- | --- | --- |
@@ -30,6 +30,7 @@ Several mitigations below rest on the per-tenant NetworkPolicies the GMC reconci
| **AGC Token Compromise** | High | The AGC never saves plaintext keys to disk. GitHub App private keys are mounted as read-only volumes with restrictive file permissions (0400). |
| **Credential Leak via Logged Error Bodies** | Medium | The AGC, broker client, and probe interpolate upstream GitHub HTTP response bodies into errors that callers log. Some of these bodies carry credential material — the runner-token endpoint's 200 body holds an access token, and `generate-jitconfig`'s body holds the runner JIT registration credential plus RSA key. Before any upstream body is placed into an error or log line it passes through a single shared redactor (`githubapp.SanitizeBody`) that strips credential-shaped substrings (GitHub `gh*_`/`github_pat_` tokens, JWTs, `access_token`/`encoded_jit_config`/`private_key`/`secret` JSON values, and long opaque base64 blobs) and caps the result. Redaction runs before capping so a secret straddling the cap boundary cannot survive in the truncated tail. No secret is ever logged directly; this control hardens the indirect path. |
| **Eviction-Retry API Misuse** | Medium | The AGC calls `POST /repos/{owner}/{repo}/actions/runs/{run_id}/rerun-failed-jobs` using the tenant's installation access token when a worker pod is evicted. The blast radius is bounded: the installation token is scoped to the GitHub App's installation on a specific organization or repository, so the AGC cannot re-run jobs belonging to other tenants or organizations. The `run_id` is extracted from the job payload delivered by GitHub's broker — the AGC cannot fabricate or substitute a run ID for a run it did not acquire. To prevent abuse of the retry path (e.g. a compromised AGC looping re-runs), `maxEvictionRetries` caps the number of automatic retries per job and is enforced before the API call is made. Operators should monitor `actions_gateway_eviction_retries_exhausted_total` to detect abnormal eviction patterns. |
+| **DNS Exfiltration Side-Channel** (Unattributed Egress) | Medium | The per-tenant egress-IP attribution that isolates tenants rests on *all* real egress traversing the tenant proxy, whose source IPs are attributable. An unrestricted port-53 egress rule (`to: []` ≡ any resolver) would defeat that: any pod — including untrusted worker job code — could smuggle data out by sending DNS queries directly to an attacker-controlled DNS server, an unattributed side-channel that never touches the proxy. All three per-tenant NetworkPolicies (workload, AGC, proxy) therefore confine port-53 egress to the **cluster DNS service only** (`kube-dns` / `CoreDNS` in `kube-system`, matched by `namespaceSelector` on `kubernetes.io/metadata.name: kube-system` plus `podSelector` on `k8s-app: kube-dns`). `kube-dns` recurses upstream on the pod's behalf, so legitimate resolution (including the proxy's own GitHub-hostname lookups) is unaffected — only the "any resolver" breadth is removed (Q105). Residual risk: because `kube-dns` still recurses, data can be tunnelled inside queries for an attacker-owned domain; this rule does not close that path, which must be handled at the cluster resolver (e.g. query logging or upstream filtering). Like the other egress negatives, this is enforced only by a policy-aware CNI; the reliable CI guard is the authoring-level test `TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS`, which asserts every policy's DNS rule selects kube-dns and is never open. |
| **Proxy as Traffic Interception Point** | Medium | The proxy only handles CONNECT tunneling and does not terminate TLS. It cannot inspect or modify the encrypted payload between the AGC/worker and GitHub. Proxy pods run with a read-only root filesystem and no elevated capabilities. |
| **Cross-Tenant Proxy CA Trust** | Medium | The egress proxy's TLS cert is signed by a cert-manager-issued self-signed CA stored in the per-tenant `actions-gateway-proxy-tls` Secret. The AGC pins this CA explicitly (via its trust pool) rather than trusting the cluster's root store, and worker pods install the same CA into a combined `SSL_CERT_FILE` bundle so Runner.Worker's .NET HttpClient accepts the proxy handshake. The cert (`tls.crt`) is projected into both AGC and worker pods via an `Items: [tls.crt]` Secret volume; the private key (`tls.key`) is mounted *only* into the proxy pod itself, so a runner compromise does not yield the ability to forge a proxy cert. Trust is tenant-scoped: each tenant's CA is independent, so a compromised CA in one namespace cannot mint a cert trusted by another tenant's AGC or workers. |
| **Egress IP Change Mid-Session** | Low–Unknown | GitHub's broker protocol is token-based, not IP-bound. Session IDs and bearer tokens carry no IP affinity, so rotating across proxy pods mid-job is expected to work. The Twirp log stream is naturally sticky (long-lived HTTP/2 connection stays on one proxy pod once open). Impact is unknown because GitHub's abuse detection heuristics are undocumented. **Early mitigation: the [Milestone 1](06-implementation-phases.md#milestone-1-wire-protocol-probe-days-14) wire protocol probe explicitly tests broker API calls routed through a multi-pod proxy pool to confirm GitHub does not reject or flag IP variance across `CreateSession → GetMessage → AcquireJob`.** If the probe surfaces a problem, `ClientIP` session affinity on the proxy Service is the low-effort fallback; explicit per-goroutine proxy assignment is the higher-fidelity option if needed. |
diff --git a/docs/design/network-architecture.md b/docs/design/network-architecture.md
index 59c55a61..c259898c 100644
--- a/docs/design/network-architecture.md
+++ b/docs/design/network-architecture.md
@@ -78,8 +78,17 @@ spec:
- Egress
ingress: [] # no ingress permitted
egress:
- # DNS — needed for resolving the proxy Service name
- - ports:
+ # DNS — needed for resolving the proxy Service name. Confined to the cluster
+ # DNS service (kube-dns / CoreDNS in kube-system), not "any resolver": an
+ # open port-53 rule is an unattributed exfiltration side-channel (Q105).
+ - to:
+ - namespaceSelector:
+ matchLabels:
+ kubernetes.io/metadata.name: kube-system
+ podSelector:
+ matchLabels:
+ k8s-app: kube-dns
+ ports:
- protocol: UDP
port: 53
- protocol: TCP
@@ -118,8 +127,15 @@ spec:
policyTypes:
- Egress
egress:
- # DNS
- - ports:
+ # DNS — confined to cluster DNS (kube-dns / CoreDNS in kube-system); see Q105.
+ - to:
+ - namespaceSelector:
+ matchLabels:
+ kubernetes.io/metadata.name: kube-system
+ podSelector:
+ matchLabels:
+ k8s-app: kube-dns
+ ports:
- protocol: UDP
port: 53
- protocol: TCP
@@ -162,8 +178,18 @@ spec:
- port: 8080
protocol: TCP
egress:
- # DNS — proxy resolves GitHub hostnames on behalf of clients
- - ports:
+ # DNS — proxy resolves GitHub hostnames on behalf of clients. Confined to the
+ # cluster DNS service (kube-dns / CoreDNS in kube-system); kube-dns recurses
+ # upstream so external names still resolve, but the proxy cannot reach an
+ # arbitrary resolver — closing the open-DNS exfiltration side-channel (Q105).
+ - to:
+ - namespaceSelector:
+ matchLabels:
+ kubernetes.io/metadata.name: kube-system
+ podSelector:
+ matchLabels:
+ k8s-app: kube-dns
+ ports:
- protocol: UDP
port: 53
- protocol: TCP
@@ -190,7 +216,9 @@ If `spec.proxy.managedNetworkPolicy: false` is set, the GMC omits the GitHub-CID
All in-cluster service discovery uses Kubernetes DNS (`kube-dns` / `CoreDNS`). The proxy pool is reachable from the AGC and worker pods via the `ClusterIP` Service name: `actions-gateway-proxy..svc.cluster.local`. The `NO_PROXY` env var includes `kubernetes.default.svc.cluster.local` and the cluster service CIDR so that Kubernetes API calls are never routed through the egress proxy.
-External DNS resolution (for GitHub hostnames) is performed by the proxy pods themselves, not by the AGC or worker pods — the AGC and workers connect to the proxy using `CONNECT :` and the proxy resolves the hostname on their behalf. This means the proxy pods must have egress access to the cluster's DNS resolver in addition to GitHub's IP ranges. In practice, DNS egress is typically covered by the cluster's default network policy or a separate allow-all DNS rule.
+External DNS resolution (for GitHub hostnames) is performed by the proxy pods themselves, not by the AGC or worker pods — the AGC and workers connect to the proxy using `CONNECT <host>:<port>` and the proxy resolves the hostname on their behalf. This means the proxy pods must have egress access to the cluster's DNS resolver in addition to GitHub's IP ranges.
+
+DNS egress on all three policies is **confined to the cluster DNS service** (`kube-dns` / `CoreDNS` in `kube-system`, matched by `namespaceSelector` on the well-known `kubernetes.io/metadata.name: kube-system` label plus a `podSelector` on the conventional `k8s-app: kube-dns` label) rather than left open to any resolver (Q105). An unrestricted port-53 rule (`to: []`) would let any pod smuggle data to an attacker-controlled resolver — an unattributed side-channel that bypasses the per-tenant egress-IP attribution every other egress path enforces. Confining DNS to the in-cluster resolver keeps resolution on the attributable path: `kube-dns` recurses upstream on the pod's behalf, so external GitHub names still resolve while no pod can reach an arbitrary DNS server directly. The selector is fixed by the controller and the managed policies are reconciled, so operators running a DNS service under a non-standard namespace or pod label must either relabel their DNS pods to match or grant DNS egress with an additional, additive NetworkPolicy in the tenant namespace (`spec.proxy.managedNetworkPolicy: false` only omits the proxy's GitHub-CIDR rule; it does not remove the DNS rule).
---
diff --git a/docs/operations/security-operations.md b/docs/operations/security-operations.md
index b51101df..521caaa8 100644
--- a/docs/operations/security-operations.md
+++ b/docs/operations/security-operations.md
@@ -42,6 +42,8 @@ Two detection substrates are used:
- [Posture scanning (preventive)](#posture-scanning-preventive)
- [Manifest posture — polaris (automated, in CI)](#manifest-posture--polaris-automated-in-ci)
- [CIS-benchmark posture — kube-bench (manual, pre-production)](#cis-benchmark-posture--kube-bench-manual-pre-production)
+- [Tenant egress posture & deliberate widening](#tenant-egress-posture--deliberate-widening)
+ - [Managing egress at scale](#managing-egress-at-scale)
- [License attribution in images](#license-attribution-in-images)
- [Image provenance: signature & SBOM verification](#image-provenance-signature--sbom-verification)
- [Verify a signature](#verify-a-signature)
@@ -366,6 +368,28 @@ Triage the report against this operator's needs:
[network-architecture.md § How to Validate Network Isolation](../design/network-architecture.md#how-to-validate-network-isolation) —
the "blocked" probes must actually time out (validated under Calico on a
kind cluster, Q7b 2026-06-11).
+- **Cluster DNS must be labelled `k8s-app=kube-dns` in `kube-system`.** Tenant
+ NetworkPolicies confine port-53 egress to the cluster DNS service rather than
+ leaving DNS open to any resolver (Q105 — an open DNS path is an unattributed
+ exfiltration side-channel). The selector matches the conventional CoreDNS
+ deployment: pods labelled `k8s-app: kube-dns` in the `kube-system` namespace
+ (matched via the immutable `kubernetes.io/metadata.name` namespace label). This
+ is the default on every mainstream distribution and managed control plane. If
+ your cluster runs DNS under a different label or namespace **and** uses an
+ enforcing CNI, tenant pods will fail to resolve any name until you either
+ relabel the DNS pods or add an additive NetworkPolicy allowing port-53 egress
+ to them (`spec.proxy.managedNetworkPolicy: false` does not lift the DNS rule).
+ Symptom: tenant workloads time out on every lookup while non-DNS connectivity is unaffected.
+- **Known limitation — NodeLocal DNSCache (`node-local-dns`) is not yet
+ supported out of the box.** With node-local-dns, pods send queries to a
+ link-local IP (default `169.254.20.10`) served by a `hostNetwork`
+ `node-local-dns` pod, not to a `k8s-app: kube-dns` CoreDNS pod — so on an
+ enforcing CNI the kube-dns-only rule drops those queries, breaking resolution
+ for workers **and** the proxy. Until first-class support lands
+ ([Q136](../STATUS.md#Q136)), grant it with an additive NetworkPolicy allowing
+ port-53 egress to `169.254.0.0/16` (link-local is non-routable off-node, so
+ this preserves the no-arbitrary-resolver property) for the workload and proxy
+ pods — see [Tenant egress posture & deliberate widening](#tenant-egress-posture--deliberate-widening).
- **Findings that don't apply** (managed control plane hides the file, a check
for a component you don't run) — record the justification alongside the
cluster's onboarding ticket.
@@ -374,6 +398,114 @@ The goal is **zero critical (`[FAIL]`) findings that this stack depends on**
before the first production tenant (per
[milestone-5.md](../plan/milestone-5.md) §3).
+## Tenant egress posture & deliberate widening
+
+**The secure default is controller-managed and not opt-in.** For every tenant,
+the GMC reconciles three NetworkPolicies that confine worker (and AGC) egress to
+exactly what the design requires: DNS to the cluster DNS service only, and all
+GitHub-bound traffic through the per-tenant egress proxy (whose source IPs are
+attributable). Worker pods cannot reach arbitrary destinations directly — that is
+the per-tenant egress-IP isolation property, and it is present automatically the
+moment a tenant is provisioned. Do **not** hand-edit the GMC-managed policies
+(`actions-gateway-workload`, `actions-gateway-controller`, `actions-gateway-proxy`):
+the controller reconciles them back, and the proxy policy's GitHub-CIDR rule is
+refreshed from `api.github.com/meta` every 24h, so a hand-edit would be reverted
+or go stale. See [network-architecture.md](../design/network-architecture.md#networkpolicy-rules)
+for the full policy set.
+
+Some jobs legitimately need egress the proxy cannot carry — the CONNECT proxy
+tunnels HTTP/HTTPS to GitHub CIDRs only, so a non-HTTP protocol (a database, SSH,
+a raw TCP/UDP service), an internal artifact store or package mirror, or a
+specific custom DNS resolver is unreachable by default. **Grant that egress with
+an *additional*, additive NetworkPolicy in the tenant namespace — not by relaxing
+the managed defaults.** NetworkPolicies are additive (a union of allows), so an
+extra policy widens egress for the pods it selects without touching the floor.
+
+Worker pods carry two selectable labels, so you can target all workers or a single
+runner type:
+
+- `actions-gateway/component: workload` — every worker (and the AGC) in the tenant
+- `actions-gateway/runner-group: <runner-group-name>` — workers of one specific RunnerGroup
+
+```yaml
+# Applied by a platform admin (requires NetworkPolicy write in the tenant
+# namespace) — grants ONE runner type extra egress. CIDR + port + protocol.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: gpu-builders-extra-egress
+  namespace: team-a
+spec:
+  podSelector:
+    matchLabels:
+      actions-gateway/component: workload
+      actions-gateway/runner-group: gpu-builders   # omit this line for tenant-wide
+  policyTypes: [Egress]
+  egress:
+    - to:
+        - ipBlock: {cidr: 10.50.0.0/24}   # internal registry / artifact store
+      ports:
+        - {protocol: TCP, port: 443}
+        - {protocol: TCP, port: 5432}     # e.g. Postgres
+    - to:
+        - ipBlock: {cidr: 10.50.0.53/32}  # custom DNS resolver
+      ports:
+        - {protocol: UDP, port: 53}
+        - {protocol: TCP, port: 53}
+```
+
+**This is a deliberate, documented trade-off, not a routine knob.** Egress to the
+listed destinations leaves with the worker's own pod IP and therefore **bypasses
+the per-tenant proxy egress-IP attribution** for those flows. Untrusted job code
+(e.g. fork-PR workflows) can use any hole you open, so:
+
+- Keep the allowlist as narrow as the use case requires — specific CIDRs and
+ ports, never a `0.0.0.0/0` catch-all.
+- Authoring it requires namespace NetworkPolicy-write, so it is inherently a
+ platform/admin decision — which is the correct authority for relaxing
+ attribution. Track each grant in the tenant's onboarding ticket.
+- **For a custom DNS resolver specifically, prefer a cluster-level CoreDNS
+ `forward` zone** over reopening worker DNS: that keeps resolution on the
+ attributable in-cluster path while still resolving the names you need.
+
+If instead you want to take over the **proxy's** own egress policy (for example
+to express GitHub egress as FQDN rules under Cilium/Calico), set
+`spec.proxy.managedNetworkPolicy: false` on the `ActionsGateway` — the GMC then
+stops managing the proxy GitHub-CIDR rule and you own keeping it current. That is
+the supported, explicit hand-off; the managed path remains the default.
+
+### Managing egress at scale
+
+This project deliberately does **not** ship tooling to manage the *widening*
+policies — that is a cluster/platform concern with a mature ecosystem, and owning
+it here would re-create the coupling the managed-floor split avoids. What the
+project commits to instead is a stable **integration surface**: every worker pod
+carries two labels you can target from any policy engine, and these are a
+supported contract (they will not be renamed without a migration note):
+
+- `actions-gateway/component: workload` — all worker (and AGC) pods in the tenant
+- `actions-gateway/runner-group: <runner-group-name>` — workers of one specific RunnerGroup
+
+For anything beyond a handful of static CIDRs, prefer the ecosystem over
+hand-written `NetworkPolicy`:
+
+- **Your CNI's richer egress** — `CiliumNetworkPolicy` `toFQDNs` (DNS-aware,
+ hostname allowlists), Calico `NetworkSet` (reusable CIDR groups) / DNS policy,
+ and policy tiers. This is the right tool for "let `gpu-builders` reach
+ `*.internal.corp` and a database." It pairs with the
+ `spec.proxy.managedNetworkPolicy: false` hand-off above.
+- **`AdminNetworkPolicy`** ([sig-network `network-policy-api`](https://network-policy-api.sigs.k8s.io/))
+ — cluster-admin-level, cross-namespace egress baselines (`AdminNetworkPolicy` /
+ `BaselineAdminNetworkPolicy`), implemented by Cilium/Calico/OVN-Kubernetes. The
+ most direct fit for "platform admin governs egress across all tenant
+ namespaces" — maturing (alpha→beta), so confirm your CNI's support level.
+- **Kyverno / OPA Gatekeeper** — policy-as-code to *generate* per-namespace NPs
+ (e.g. a templated default-deny or egress allowance keyed off a namespace label)
+ and to *validate* that any admin-added egress conforms to your guardrails.
+
+The labels above are what make all of these targetable; the secure floor stays
+GMC-managed regardless.
+
---
## License attribution in images
diff --git a/docs/plan/release-1.0.md b/docs/plan/release-1.0.md
index 246aa5f1..3477198c 100644
--- a/docs/plan/release-1.0.md
+++ b/docs/plan/release-1.0.md
@@ -330,11 +330,14 @@ release-integrity claim false if shipped silently — the exact failure the
1.0 bar exists to prevent — so "resolved" means a real fix *or* an honest
docs caveat, not omission.
-- **Unrestricted port-53 egress** ([Q105](../STATUS.md)): `builder.go`
+- ~~**Unrestricted port-53 egress** ([Q105](../STATUS.md)): `builder.go`
emits a port-53 egress rule with no `To` peer, so workers/proxy can
resolve via any DNS server — a DNS-exfil channel that undercuts the
- per-tenant egress-IP isolation claim. Restrict to kube-dns, or document
- the gap in `05-security.md`.
+ per-tenant egress-IP isolation claim.~~ **Resolved:** all three per-tenant
+ NetworkPolicies (workload, AGC, proxy) now confine port-53 egress to the
+ cluster DNS service (`k8s-app: kube-dns` in `kube-system`); guarded by the
+ authoring test `TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS`. See
+ [05-security.md](../design/05-security.md) § DNS Exfiltration Side-Channel.
- **Release-integrity siblings of [Q123](../STATUS.md)/[Q124](../STATUS.md)** —
vendored deps are never hash-verified against `go.sum`
([Q126](../STATUS.md)), and the cosign binary in the signing pipeline is
diff --git a/docs/plan/security-audit-2026-06.md b/docs/plan/security-audit-2026-06.md
index c3ea518c..bb37e35c 100644
--- a/docs/plan/security-audit-2026-06.md
+++ b/docs/plan/security-audit-2026-06.md
@@ -35,7 +35,7 @@ unimplemented at the authorization layer.
| GMC teardown fail-open (`deleteIfExists` swallows errors, finalizer removed) | Medium | **Resolved (Q125):** `deleteIfExists` returns its error (NotFound = success); `reconcileDelete` collects delete errors, emits a `TeardownIncomplete` event, and requeues without removing the finalizer until every delete is confirmed gone. Fail-closed and idempotent. |
| Vendored deps never integrity-checked against go.sum in CI | Medium | **New → [Q126](../STATUS.md#Q126)** |
| 8 smaller hardening items (see below) | Low | **New → [Q127](../STATUS.md#Q127)** (batch) |
-| DNS egress allows port 53 to any destination | Medium | Known — [Q105](../STATUS.md#Q105) |
+| DNS egress allows port 53 to any destination | Medium | **Resolved (Q105):** port-53 egress is confined to the cluster DNS service (`k8s-app: kube-dns` in `kube-system`) across all three per-tenant NetworkPolicies. |
| Proxy has no app-layer destination allowlist / connection cap | Medium | Accepted by design — security.md M-2, Appendix G §G.1, [Q19](../STATUS.md#Q19). This audit adds the metadata-service/SSRF framing as a revisit argument |
| ResourceQuota is optional and tenant-controlled | Medium | **Resolved (Q130, 2026-06-14):** the tenant-authored `spec.namespaceQuota` was removed; the `ResourceQuota` is now platform-owned (the platform admin must provision it on the namespace), so it is no longer tenant-controlled. Remaining per-cluster proxy HPA-max guard work stays in [Q82](../STATUS.md#Q82). |
| No SLSA provenance attestation | Info | Known — [Q103](../STATUS.md#Q103) |