Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 56 additions & 25 deletions cmd/gmc/internal/controller/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,19 @@ const (
metricsScrapeNamespaceLabel = "metrics"
metricsScrapeNamespaceValue = "enabled"

// dnsNamespaceLabel / dnsNamespaceValue and dnsPodLabel / dnsPodValue select
// the cluster DNS service (CoreDNS / kube-dns) as the sole permitted DNS
// egress peer. The namespace is matched via the well-known immutable
// `kubernetes.io/metadata.name` label that every Kubernetes ≥1.21 stamps on
// each namespace (so no manual labelling of kube-system is required); the
// pods via the conventional `k8s-app: kube-dns` label CoreDNS carries by
// default in every distribution this controller targets. See dnsEgressRule
// for why egress is confined to this peer (Q105).
dnsNamespaceLabel = "kubernetes.io/metadata.name"
dnsNamespaceValue = "kube-system"
dnsPodLabel = "k8s-app"
dnsPodValue = "kube-dns"

// npProxyName is the NetworkPolicy that restricts proxy pod egress to GitHub CIDRs.
npProxyName = gmcnames.ProxyName
// npAGCName is the NetworkPolicy that gives AGC pods Kubernetes API server access (port 443).
Expand Down Expand Up @@ -187,19 +200,53 @@ func metricsScrapeIngressRule() networkingv1.NetworkPolicyIngressRule {
}
}

// buildProxyNetworkPolicy constructs the NetworkPolicy for proxy pods.
// Proxy pods may reach GitHub CIDRs on 443 (for CONNECT tunneling) and DNS.
// Only workload pods (AGC and workers) may initiate connections to the proxy.
func buildProxyNetworkPolicy(ag *gmcv1alpha1.ActionsGateway, githubCIDRs []net.IPNet) *networkingv1.NetworkPolicy {
// dnsEgressRule returns a NetworkPolicy egress rule permitting DNS (UDP/TCP 53)
// to the cluster DNS service ONLY — CoreDNS / kube-dns in kube-system — not to
// any destination. It is shared by the proxy, workload, and AGC policies so the
// DNS posture cannot drift between them.
//
// An unrestricted port-53 rule (To: nil ≡ any server) is an unattributed
// data-exfiltration side-channel: DNS queries can smuggle data to an
// attacker-controlled resolver, bypassing the per-tenant egress-IP attribution
// that is a headline isolation property of this system (Q105). Every other
// egress path forces traffic through the tenant proxy, whose source IPs are
// attributable; confining DNS to the in-cluster resolver keeps it on that
// attributable path — kube-dns recurses upstream on the pod's behalf, so the
// proxy can still resolve GitHub hostnames to do its job. Only the open "any
// resolver" breadth is removed, not legitimate resolution.
//
// kindnet does not enforce egress NetworkPolicy (see Q7b in
// docs/plan/worker-egress-proxy.md), so this restriction is guarded at the
// spec/authoring level by TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS
// rather than by a live e2e deny test; a runtime negative needs a
// policy-enforcing CNI such as Calico.
func dnsEgressRule() networkingv1.NetworkPolicyEgressRule {
proto53UDP := corev1.ProtocolUDP
proto53TCP := corev1.ProtocolTCP

egress := []networkingv1.NetworkPolicyEgressRule{{
return networkingv1.NetworkPolicyEgressRule{
// A single peer with both selectors set is an AND: kube-dns pods *within*
// kube-system. Splitting them into two peers would be an OR and would also
// admit any pod labelled k8s-app=kube-dns in any namespace.
To: []networkingv1.NetworkPolicyPeer{{
NamespaceSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{dnsNamespaceLabel: dnsNamespaceValue},
},
PodSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{dnsPodLabel: dnsPodValue},
},
}},
Ports: []networkingv1.NetworkPolicyPort{
{Protocol: &proto53UDP, Port: ptr(intstr.FromInt32(53))},
{Protocol: &proto53TCP, Port: ptr(intstr.FromInt32(53))},
},
}}
}
}

// buildProxyNetworkPolicy constructs the NetworkPolicy for proxy pods.
// Proxy pods may reach GitHub CIDRs on 443 (for CONNECT tunneling) and DNS.
// Only workload pods (AGC and workers) may initiate connections to the proxy.
func buildProxyNetworkPolicy(ag *gmcv1alpha1.ActionsGateway, githubCIDRs []net.IPNet) *networkingv1.NetworkPolicy {
egress := []networkingv1.NetworkPolicyEgressRule{dnsEgressRule()}
managed := ag.Spec.Proxy.ManagedNetworkPolicy == nil || *ag.Spec.Proxy.ManagedNetworkPolicy
if managed && len(githubCIDRs) > 0 {
peers := make([]networkingv1.NetworkPolicyPeer, 0, len(githubCIDRs))
Expand Down Expand Up @@ -261,16 +308,8 @@ func buildProxyNetworkPolicy(ag *gmcv1alpha1.ActionsGateway, githubCIDRs []net.I
// additively re-admits the monitoring metrics scrape, so default-deny here costs it
// nothing.
func buildWorkloadNetworkPolicy(ag *gmcv1alpha1.ActionsGateway) *networkingv1.NetworkPolicy {
proto53UDP := corev1.ProtocolUDP
proto53TCP := corev1.ProtocolTCP

egress := []networkingv1.NetworkPolicyEgressRule{
{
Ports: []networkingv1.NetworkPolicyPort{
{Protocol: &proto53UDP, Port: ptr(intstr.FromInt32(53))},
{Protocol: &proto53TCP, Port: ptr(intstr.FromInt32(53))},
},
},
dnsEgressRule(),
{
Ports: []networkingv1.NetworkPolicyPort{{Port: ptr(intstr.FromInt32(proxyPort))}},
To: []networkingv1.NetworkPolicyPeer{{
Expand Down Expand Up @@ -322,22 +361,14 @@ func buildWorkloadNetworkPolicy(ag *gmcv1alpha1.ActionsGateway) *networkingv1.Ne
// this, the AGC NP carried no ingress policy type and any pod in the namespace
// could scrape per-tenant metrics off the controller-runtime metrics server.
func buildAGCNetworkPolicy(ag *gmcv1alpha1.ActionsGateway) *networkingv1.NetworkPolicy {
proto53UDP := corev1.ProtocolUDP
proto53TCP := corev1.ProtocolTCP

return &networkingv1.NetworkPolicy{
ObjectMeta: metav1.ObjectMeta{Name: npAGCName, Namespace: ag.Namespace, Labels: managedLabels(ag)},
Spec: networkingv1.NetworkPolicySpec{
PodSelector: metav1.LabelSelector{MatchLabels: map[string]string{"app": agcAppName}},
PolicyTypes: []networkingv1.PolicyType{networkingv1.PolicyTypeEgress, networkingv1.PolicyTypeIngress},
Ingress: []networkingv1.NetworkPolicyIngressRule{metricsScrapeIngressRule()},
Egress: []networkingv1.NetworkPolicyEgressRule{
{
Ports: []networkingv1.NetworkPolicyPort{
{Protocol: &proto53UDP, Port: ptr(intstr.FromInt32(53))},
{Protocol: &proto53TCP, Port: ptr(intstr.FromInt32(53))},
},
},
dnsEgressRule(),
{
Ports: []networkingv1.NetworkPolicyPort{
{Port: ptr(intstr.FromInt32(443))},
Expand Down
41 changes: 41 additions & 0 deletions cmd/gmc/internal/controller/builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,47 @@ func TestBuildNetworkPolicy_DNSEgressAlwaysPresent(t *testing.T) {
}
}

// TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS locks in Q105: the port-53
// egress rule on every GMC-managed NetworkPolicy must target the cluster DNS
// service (kube-dns / CoreDNS in kube-system) and must NOT be open (To: nil ≡ any
// resolver). An open DNS path is an unattributed exfiltration side-channel that
// bypasses the per-tenant egress-IP attribution every other egress path enforces.
// This is an authoring/spec-level guard because kindnet does not enforce egress
// NetworkPolicy (see Q7b) — mirroring the egress-negative guard pattern.
func TestBuildNetworkPolicy_DNSEgressRestrictedToKubeDNS(t *testing.T) {
ag := newTestAG("gateway", "team-a")
for _, np := range []*networkingv1.NetworkPolicy{
buildProxyNetworkPolicy(ag, nil),
buildWorkloadNetworkPolicy(ag),
buildAGCNetworkPolicy(ag),
} {
var dnsRules []networkingv1.NetworkPolicyEgressRule
for _, rule := range np.Spec.Egress {
for _, port := range rule.Ports {
if port.Port != nil && port.Port.IntVal == 53 {
dnsRules = append(dnsRules, rule)
break
}
}
}
require.Len(t, dnsRules, 1, "%s must carry exactly one port-53 egress rule", np.Name)

rule := dnsRules[0]
require.NotEmpty(t, rule.To,
"%s port-53 rule must have a To peer — an empty To opens DNS to any resolver (Q105)", np.Name)
require.Len(t, rule.To, 1, "%s DNS rule should select kube-dns via a single peer", np.Name)

peer := rule.To[0]
assert.Nil(t, peer.IPBlock, "%s DNS peer must not be an ipBlock", np.Name)
require.NotNil(t, peer.NamespaceSelector, "%s DNS peer must select the kube-dns namespace", np.Name)
require.NotNil(t, peer.PodSelector, "%s DNS peer must select the kube-dns pods", np.Name)
assert.Equal(t, dnsNamespaceValue, peer.NamespaceSelector.MatchLabels[dnsNamespaceLabel],
"%s DNS peer namespace selector must match kube-system", np.Name)
assert.Equal(t, dnsPodValue, peer.PodSelector.MatchLabels[dnsPodLabel],
"%s DNS peer pod selector must match k8s-app=kube-dns", np.Name)
}
}

func TestBuildAGCNetworkPolicy_PodSelectorIsAGCOnly(t *testing.T) {
ag := newTestAG("gateway", "team-a")
np := buildAGCNetworkPolicy(ag)
Expand Down
2 changes: 1 addition & 1 deletion docs/STATUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Specific actionable items in priority order. Pick from the top; skip 🚫 items
| <a id="Q131"></a>Q131 | Flake: TestListener_IdleNotShutdownIfLast poll-count timing | `tests` `bug` | 🔲 | S | goroutine_test.go:419 asserted poll count ≥5 but got 3 ("poll past threshold when last listener") on a local `make check`; passed on rerun (-count=3) w/o code change. Timing-sensitive idle-shutdown test; tighten synchronization, not the count. |
| <a id="Q113"></a>Q113 | Flake: eviction integration tests time out in waitForWorkerPod | `tests` `bug` | 🔲 | S | EvictionTriggersRequeue + EvictionBudgetExhausted (failure_recovery_test.go:142) timed out (20s) on CI run 27383065643, passed on rerun w/o code change. Suspects: sessions[len-1] pick on shared brokerStub; 20s budget on 2-vCPU runner. |
| <a id="Q120"></a>Q120 | Flake: SIGTERM integration test misses session-delete budget | `tests` `bug` | 🔲 | S | TestAGC_SIGTERM_DeletesAllSessions: session-39 missed the 10s WaitForSessionDelete on CI run 27422248358 (PR 209), passed on rerun w/o code change. ~40 concurrent teardowns on a 2-vCPU runner; same shared-brokerStub/budget class as Q113. |
| <a id="Q136"></a>Q136 | node-local-dns (NodeLocal DNSCache) support for tenant DNS egress | `security` `infra` `1.0-gate` | 🔲 | S | Q105 DNS rule (kube-dns podSelector) breaks NodeLocal DNSCache: pods query link-local 169.254.20.10 (hostNetwork pod) → dropped on enforcing CNI, incl proxy. Fix: also allow port-53 to 169.254.0.0/16 (link-local, non-routable, keeps Q105). |
| <a id="Q98"></a>Q98 | Helm chart distribution/publishing pipeline | `infra` `1.0-gate` | 🔲 | M | Pipeline shipped: publish.yml chart-publish job packages, pushes (oci://ghcr/charts), and cosign-signs the chart. Remaining (first v* tag): live publish proof, flip prerelease annotation, oci:// in upgrade.md/README, Artifact Hub listing. |
| <a id="Q112"></a>Q112 | GMC Events silently 403'd: recorder writes events.k8s.io, RBAC grants core only | `bug` `infra` | 🔲 | S | Same root cause as the AGC fix in PR 202 (Q95): GMC uses mgr.GetEventRecorder (writes events.k8s.io/v1) but its kubebuilder marker grants only core "" events, so every GMC Event is dropped. Fix marker + `make manifests`; assert one event in e2e. |
| <a id="Q9"></a>Q9 | [M3-tests remaining items (H2/M/L)](plan/milestone-3-tests.md) | `milestone` `tests` | 🔲 | M | **Unblocked** — M3 metric assertions (H1) landed. Highest-leverage remaining: **H2** (rerun-API 5xx contract), **H3** (decryption-failure fallback), **M3** (`activePodCount` Pending branch). Worth picking up after 5c–5g. |
Expand Down Expand Up @@ -86,7 +87,6 @@ Specific actionable items in priority order. Pick from the top; skip 🚫 items
| <a id="Q89"></a>Q89 | Per-tenant `spec.logLevel` CRD knob | `infra` | 🔲 | M | Logging-audit Theme G (post-1.0, after F1): add `spec.logLevel` (info\|debug) to ActionsGateway, threaded to AGC+proxy like `securityProfile` (rolling restart). Needs CRD+operator docs. See [logging-audit](plan/logging-audit.md). |
| <a id="Q103"></a>Q103 | No SLSA build provenance attestation on images | `security` `infra` | 🔲 | S | publish.yml signs + SBOM-attests (cosign) but emits no provenance predicate: no provenance: on build-push, no actions/attest-build-provenance. Dockerfiles advertise SLSA-L3 reproducibility with nothing backing it. Add provenance attestation. |
| <a id="Q104"></a>Q104 | ServiceMonitor scrapes metrics with insecureSkipVerify:true | `security` `infra` | 🔲 | S | templates/servicemonitor.yaml sets tlsConfig.insecureSkipVerify:true (self-signed metrics cert) — MITM-able scrape. cert-manager is wired for the webhook but not metrics. Offer a cert-manager-issued metrics cert toggle. Overlaps [Q72](#Q72). |
| <a id="Q105"></a>Q105 | Worker/proxy DNS egress is unrestricted (port 53 to any) | `security` `docs` | 🔲 | S | builder.go:197,254 emit a port-53 egress rule with no To peer, so workers/proxy can resolve via any server — a DNS exfil channel bypassing per-tenant egress-IP attribution. Restrict to kube-dns or document the gap in 05-security.md. |
| <a id="Q127"></a>Q127 | [Security-hardening batch from audit 2](plan/security-audit-2026-06.md) | `security` | 🔲 | M | 8 small items (see plan doc): PSA-guard hardcoded SA name; AG singleton guard; validate noProxyCIDRs; CONNECT TLS MinVersion; checksum tool downloads (cosign!); no GHA cache on release builds; AGC any-dest 443; privileged-webhook incoherence. |
| <a id="Q133"></a>Q133 | Platform-gated eligibility for securityProfile: privileged | `security` | 🔲 | M | A tenant self-selects securityProfile: privileged at create; only downgrades are webhook-gated. Eligibility to run privileged should be a platform call — gate it behind a platform-applied namespace label. Extends Q127 item 8 (profile-aware webhook) |
| <a id="Q106"></a>Q106 | Non-atomic eviction-retry counter race | `bug` | 🔲 | S | handleEviction reads-modifies-writes the count with no per-key lock (provisioner.go:451). Two concurrent evictions of the same run_id both pass the budget check and both call rerun-failed-jobs, exceeding the budget. Serialize per runID (atomic CAS). |
Expand Down
Loading
Loading