From 83a0e540b3950efaa51c43bc0914b9214d1c6bce Mon Sep 17 00:00:00 2001 From: Karl Date: Sun, 14 Jun 2026 12:28:46 -0700 Subject: [PATCH 1/4] fix(agentpool)!: default empty agent keyType to RSA-3072, not Ed25519 (Q109) generateKey treated an empty/unspecified KeyType as Ed25519, a secure-by-default regression: Ed25519 agents cannot decrypt the RSA-OAEP session key the broker sends, so any caller constructing a key with an empty keyType silently dropped the AES-256-CBC defense-in-depth layer. Only the CLI flag default (rsa) held the secure choice. Make the library default match the CLI: an empty or unrecognised keyType now yields RSA-3072. Ed25519 requires the explicit KeyTypeEd25519 opt-in. Adds a unit test pinning the contract and updates the godoc/comments at all construction sites; release-1.0.md bucket G marks the regression resolved. --- cmd/agc/internal/agentpool/crypto.go | 19 +++++--- cmd/agc/internal/agentpool/crypto_test.go | 48 +++++++++++++++++++ cmd/agc/internal/agentpool/pool.go | 2 +- .../controller/runnergroup_controller.go | 2 +- docs/plan/release-1.0.md | 8 ++-- 5 files changed, 68 insertions(+), 11 deletions(-) create mode 100644 cmd/agc/internal/agentpool/crypto_test.go diff --git a/cmd/agc/internal/agentpool/crypto.go b/cmd/agc/internal/agentpool/crypto.go index afeb2823..60a824f6 100644 --- a/cmd/agc/internal/agentpool/crypto.go +++ b/cmd/agc/internal/agentpool/crypto.go @@ -14,21 +14,28 @@ import ( type KeyType string const ( - // KeyTypeEd25519 generates an Ed25519 key (default). + // KeyTypeEd25519 generates an Ed25519 key (explicit opt-in). Ed25519 agents + // cannot decrypt RSA-OAEP session keys, so selecting it loses a layer of + // session-key encryption — see docs/design/05-security.md. KeyTypeEd25519 KeyType = "ed25519" - // KeyTypeRSA generates an RSA-3072 key. + // KeyTypeRSA generates an RSA-3072 key (secure default). KeyTypeRSA KeyType = "rsa" ) // generateKey returns a new private key of the requested type. -// An empty KeyType defaults to KeyTypeEd25519. +// +// An empty KeyType defaults to KeyTypeRSA (RSA-3072), the secure default that +// preserves RSA-OAEP session-key encryption. Ed25519 must be requested +// explicitly via KeyTypeEd25519: defaulting empty→Ed25519 would be a +// secure-by-default regression (any caller that omits the key type would +// silently lose a layer of encryption). func generateKey(kt KeyType) (crypto.Signer, error) { switch kt { - case KeyTypeRSA: - return rsa.GenerateKey(rand.Reader, 3072) - default: + case KeyTypeEd25519: _, priv, err := ed25519.GenerateKey(rand.Reader) return priv, err + default: + return rsa.GenerateKey(rand.Reader, 3072) } } diff --git a/cmd/agc/internal/agentpool/crypto_test.go b/cmd/agc/internal/agentpool/crypto_test.go new file mode 100644 index 00000000..3afd43a7 --- /dev/null +++ b/cmd/agc/internal/agentpool/crypto_test.go @@ -0,0 +1,48 @@ +package agentpool + +import ( + "crypto/ed25519" + "crypto/rsa" + "testing" +) + +// TestGenerateKeyDefaultIsRSA pins the secure-by-default contract: an empty +// (unspecified) KeyType must yield an RSA-3072 key, never Ed25519. Defaulting +// empty→Ed25519 would silently drop RSA-OAEP session-key encryption for any +// caller that omits the key type (Q109 regression). RSA stays the default; +// Ed25519 is an explicit opt-in only (Q11). +func TestGenerateKeyDefaultIsRSA(t *testing.T) { + tests := []struct { + name string + keyType KeyType + wantRSA bool + }{ + {name: "empty defaults to RSA", keyType: "", wantRSA: true}, + {name: "explicit rsa", keyType: KeyTypeRSA, wantRSA: true}, + {name: "unrecognised value defaults to RSA", keyType: KeyType("bogus"), wantRSA: true}, + {name: "explicit ed25519 opt-in", keyType: KeyTypeEd25519, wantRSA: false}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + signer, err := generateKey(tc.keyType) + if err != nil { + t.Fatalf("generateKey(%q): %v", tc.keyType, err) + } + switch key := signer.(type) { + case *rsa.PrivateKey: + if !tc.wantRSA { + t.Fatalf("generateKey(%q) = RSA, want Ed25519", tc.keyType) + } + if got := key.N.BitLen(); got != 3072 { + t.Errorf("RSA key size = %d bits, want 3072", got) + } + case ed25519.PrivateKey: + if tc.wantRSA { + t.Fatalf("generateKey(%q) = Ed25519, want RSA", tc.keyType) + } + default: + t.Fatalf("generateKey(%q) = %T, want *rsa.PrivateKey or ed25519.PrivateKey", tc.keyType, signer) + } + }) + } +} diff --git a/cmd/agc/internal/agentpool/pool.go b/cmd/agc/internal/agentpool/pool.go index 4fa3c70b..93024cb4 100644 --- a/cmd/agc/internal/agentpool/pool.go +++ b/cmd/agc/internal/agentpool/pool.go @@ -138,7 +138,7 @@ type Pool struct { // NewPool creates a Pool for the given RunnerGroup. // runnerLabels is the label set passed to GitHub during runner registration. -// keyType selects the algorithm for newly-generated agent keys; empty defaults to KeyTypeEd25519. +// keyType selects the algorithm for newly-generated agent keys; empty defaults to KeyTypeRSA (the secure default). func NewPool(c client.Client, namespace, groupName, runnerVersion string, runnerLabels []string, registrar Registrar, keyType KeyType) *Pool { return &Pool{ client: c, diff --git a/cmd/agc/internal/controller/runnergroup_controller.go b/cmd/agc/internal/controller/runnergroup_controller.go index f368c15a..831fca3c 100644 --- a/cmd/agc/internal/controller/runnergroup_controller.go +++ b/cmd/agc/internal/controller/runnergroup_controller.go @@ -73,7 +73,7 @@ type RunnerGroupReconciler struct { Metrics *listener.Metrics Log *slog.Logger Provisioner *provisioner.Provisioner - AgentKeyType agentpool.KeyType // defaults to KeyTypeEd25519 when empty + AgentKeyType agentpool.KeyType // defaults to KeyTypeRSA (the secure default) when empty // Recorder emits Kubernetes Events on the reconciled RunnerGroup so that // credential, agent-pool, and listener failures surface in `kubectl describe diff --git a/docs/plan/release-1.0.md b/docs/plan/release-1.0.md index 2d49ba38..246aa5f1 100644 --- a/docs/plan/release-1.0.md +++ b/docs/plan/release-1.0.md @@ -342,11 +342,13 @@ docs caveat, not omission. Both admit an unverified artifact into a signed release. Fix (re-vendor + `git diff --exit-code` CI gate; checksum-pin cosign as the `KIND_BINARY_SHA256` pattern does) or caveat. -- **Library `generateKey` empty-keyType default is Ed25519** +- ~~**Library `generateKey` empty-keyType default is Ed25519** ([Q109](../STATUS.md)): an empty `keyType` yields Ed25519 — the secure-by-default regression `CLAUDE.md` explicitly forbids (Ed25519 agents can't decrypt RSA-OAEP session keys). Only the CLI default holds - RSA; the **library** default must be RSA too. + RSA; the **library** default must be RSA too.~~ **Resolved (Q109):** + `generateKey` now defaults an empty/unrecognised `keyType` to RSA-3072, + matching the CLI; Ed25519 requires the explicit `KeyTypeEd25519` opt-in. --- @@ -415,7 +417,7 @@ Suggested sequence: | D. Operability | Q35 (logging+probes) | Q34 HA, Q51, Q72, Q35 logger unify | | E. Docs honesty | capacity reframe, egress + sandbox caveats, ops install flow, Q103 SLSA-L3 claim | — | | F. Engineering quality | Q77 coverage, Q78 dup-check, Q79 `-race` unit, Q80 gosec, Q81 errcheck, Q84 shellcheck, Q66 install-artifact validation | — | -| G. Must-resolve (fix **or** caveat → Q99) | Q105 port-53 egress, Q126 + Q127-cosign release integrity, Q109 Ed25519 library default | — | +| G. Must-resolve (fix **or** caveat → Q99) | Q105 port-53 egress, Q126 + Q127-cosign release integrity, ~~Q109 Ed25519 library default~~ (fixed) | — | **1.0 = all gating boxes ticked**, and every bucket-G item either fixed or honestly caveated. Recommended items that slip become ordinary post-1.0 From 68098744aeb50a4eb3c085f1353ce508eba5cba4 Mon Sep 17 00:00:00 2001 From: Karl Date: Sun, 14 Jun 2026 12:28:51 -0700 Subject: [PATCH 2/4] docs(status): remove completed Q109 row (RSA library default) --- docs/STATUS.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/STATUS.md b/docs/STATUS.md index c9ceb892..53456fa8 100644 --- a/docs/STATUS.md +++ b/docs/STATUS.md @@ -94,7 +94,6 @@ Specific actionable items in priority order. Pick from the top; skip 🚫 items | Q106 | Non-atomic eviction-retry counter race | `bug` | 🔲 | S | handleEviction reads-modifies-writes the count with no per-key lock (provisioner.go:451). Two concurrent evictions of the same run_id both pass the budget check and both call rerun-failed-jobs, exceeding the budget. Serialize per runID (atomic CAS). | | Q107 | Token manager tight retry loop when TTL ≤ refreshLeadTime | `bug` `infra` | 🔲 | S | On success wait=ExpiresAt-5min-now clamped to 0 (token/manager.go:174); a near-expiry token (skew/short TTL) → fetch→wait 0→fetch spin, a retry storm vs the token endpoint. Floor the post-success wait (e.g. 30s) when wait<=0. | | Q108 | Broker long-poll has no per-request deadline | `infra` | 🔲 | S | AGC leaves BrokerConfig.HTTPClient nil, so GetMessage long-polls with no ResponseHeaderTimeout (broker/client.go:163; agc/main.go:416). A black-holed broker conn blocks a listener until the OS TCP timeout. Set a tuned client or per-poll ctx (~55s). | -| Q109 | Library agent-key-type default is ed25519, not the secure rsa | `security` | 🔲 | S | generateKey defaults empty keyType→Ed25519 (agentpool/crypto.go:24), the regressed path. Only the CLI default (rsa) holds the secure choice; any new constructor with empty keyType silently regresses. Make the library default RSA. See [Q11](#Q11). | | Q110 | coverage-baseline folds in untested test-helper packages | `tests` `infra` | 🔲 | S | coverage-baseline.txt floors (broker 48.3, gmc 48.2) fold in untested helpers (brokertest) and miss envtest coverage, so the ratchet misleads while core code is ~80%. Exclude helper pkgs so floors track production code. Sibling of Q77. | | Q55 | Verify provisioner-test goleak cascade fix held in CI | `tests` `bug` | 🔲 | S | Intermittent ~20-test goleak cascade in `internal/provisioner` fixed by `waitForPodCreated` helper in 59c0714; delete row once CI is clean. If flakes recur, migrate remaining ~18 Eventually-on-Pod sites to the helper. | | Q129 | [Public GitHub Pages website](plan/website.md) | `docs` `infra` | ▶ | L | [Public site](plan/website.md): MkDocs Material scaffold + landing + vs-ARC page + Pages workflow shipped, builds in CI. Remaining: public launch (gated on Q99 + enabling Pages); link reconcile + strict build folds into [Q52](#Q52). | From c7127c9bb2fc61f85852c23d7626d779fe2a8b21 Mon Sep 17 00:00:00 2001 From: Karl Date: Sun, 14 Jun 2026 12:54:40 -0700 Subject: [PATCH 3/4] test(agc): pin integration suite to Ed25519 agent keys (Q109) The AGC integration suite constructed RunnerGroupReconciler with a zero-value AgentKeyType. Before Q109 that meant fast Ed25519 keys; now empty defaults to RSA-3072, whose per-agent generation cost blows the suite's Eventually timeouts on CI (TestAGC_SIGTERM_DeletesAllSessions timed out). These suites exercise the session/reconcile lifecycle, not crypto, so set AgentKeyType explicitly to Ed25519 to keep them fast. The production default stays RSA-3072. --- .../controller/integration/suite_integration_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmd/agc/internal/controller/integration/suite_integration_test.go b/cmd/agc/internal/controller/integration/suite_integration_test.go index a29c08e1..12df2c2d 100644 --- a/cmd/agc/internal/controller/integration/suite_integration_test.go +++ b/cmd/agc/internal/controller/integration/suite_integration_test.go @@ -176,6 +176,11 @@ func startAGCReconcilerOpts(t *testing.T, opts provisionerOptions) (*controller. Client: mgr.GetClient(), TokenManager: tm, Registrar: reg, + // Use Ed25519 agent keys: these suites exercise the session/reconcile + // lifecycle, not crypto, and Ed25519 generation is near-instant. The + // production default is RSA-3072 (Q109), whose per-agent generation cost + // is high enough to blow these tests' Eventually timeouts on CI. + AgentKeyType: agentpool.KeyTypeEd25519, BrokerConfig: controller.BrokerConfig{ BrokerURL: brokerStub.URL, RunnerVersion: "2.335.1", From d9c9e4be0bb48d606cbfbe1e451dffbbca33bd7b Mon Sep 17 00:00:00 2001 From: Karl Date: Sun, 14 Jun 2026 13:19:53 -0700 Subject: [PATCH 4/4] docs(status): file Q135 e2e MultipleJobsQueued flake (flakes-go-first) --- docs/STATUS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/STATUS.md b/docs/STATUS.md index 53456fa8..372b0a6f 100644 --- a/docs/STATUS.md +++ b/docs/STATUS.md @@ -51,6 +51,7 @@ Specific actionable items in priority order. Pick from the top; skip 🚫 items | ID | Item | Labels | St | Sz | Notes | |---|---|---|---|---|---| +| Q135 | Flake: e2e E2E_AGC_MultipleJobsQueued — 4min timeout waiting for 2nd worker pod | `tests` `bug` | 🔲 | S | job_lifecycle_test.go:179 timed out waiting for the 2nd queued job's worker pod on PR 227 run 27510232006; passed on rerun w/o code change. Timing-sensitive worker-pod spawn under load; same shared-brokerStub/session-recycle class as Q120. | | Q134 | Flake: e2e E2E_AGC_WorkerPodAdmittedWithNonNumericUserImage — "no session registered" 180s timeout | `tests` `bug` `1.0-gate` | 🔲 | M | Heaviest e2e spec (real worker image) times out waiting for an AGC session (worker_securitycontext_test.go:96). Hit main + 2 PRs ~2026-06-14 17:00; other runs passed — intermittent. Likely AGC session/image-pull timing; blocks PRs when it fires. | | Q131 | Flake: TestListener_IdleNotShutdownIfLast poll-count timing | `tests` `bug` | 🔲 | S | goroutine_test.go:419 asserted poll count ≥5 but got 3 ("poll past threshold when last listener") on a local `make check`; passed on rerun (-count=3) w/o code change. Timing-sensitive idle-shutdown test; tighten synchronization, not the count. | | Q113 | Flake: eviction integration tests time out in waitForWorkerPod | `tests` `bug` | 🔲 | S | EvictionTriggersRequeue + EvictionBudgetExhausted (failure_recovery_test.go:142) timed out (20s) on CI run 27383065643, passed on rerun w/o code change. Suspects: sessions[len-1] pick on shared brokerStub; 20s budget on 2-vCPU runner. |