Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions cmd/agc/internal/agentpool/crypto.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,28 @@ import (
type KeyType string

const (
// KeyTypeEd25519 generates an Ed25519 key (default).
// KeyTypeEd25519 generates an Ed25519 key (explicit opt-in). Ed25519 agents
// cannot decrypt RSA-OAEP session keys, so selecting it loses a layer of
// session-key encryption — see docs/design/05-security.md.
KeyTypeEd25519 KeyType = "ed25519"
// KeyTypeRSA generates an RSA-3072 key.
// KeyTypeRSA generates an RSA-3072 key (secure default).
KeyTypeRSA KeyType = "rsa"
)

// generateKey returns a new private key of the requested type.
// An empty KeyType defaults to KeyTypeEd25519.
//
// An empty KeyType defaults to KeyTypeRSA (RSA-3072), the secure default that
// preserves RSA-OAEP session-key encryption. Ed25519 must be requested
// explicitly via KeyTypeEd25519: defaulting empty→Ed25519 would be a
// secure-by-default regression (any caller that omits the key type would
// silently lose a layer of encryption).
func generateKey(kt KeyType) (crypto.Signer, error) {
switch kt {
case KeyTypeRSA:
return rsa.GenerateKey(rand.Reader, 3072)
default:
case KeyTypeEd25519:
_, priv, err := ed25519.GenerateKey(rand.Reader)
return priv, err
default:
return rsa.GenerateKey(rand.Reader, 3072)
}
}

Expand Down
48 changes: 48 additions & 0 deletions cmd/agc/internal/agentpool/crypto_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package agentpool

import (
"crypto/ed25519"
"crypto/rsa"
"testing"
)

// TestGenerateKeyDefaultIsRSA pins the secure-by-default contract: an empty
// (unspecified) KeyType must yield an RSA-3072 key, never Ed25519. Defaulting
// empty→Ed25519 would silently drop RSA-OAEP session-key encryption for any
// caller that omits the key type (Q109 regression). RSA stays the default;
// Ed25519 is an explicit opt-in only (Q11).
func TestGenerateKeyDefaultIsRSA(t *testing.T) {
tests := []struct {
name string
keyType KeyType
wantRSA bool
}{
{name: "empty defaults to RSA", keyType: "", wantRSA: true},
{name: "explicit rsa", keyType: KeyTypeRSA, wantRSA: true},
{name: "unrecognised value defaults to RSA", keyType: KeyType("bogus"), wantRSA: true},
{name: "explicit ed25519 opt-in", keyType: KeyTypeEd25519, wantRSA: false},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
signer, err := generateKey(tc.keyType)
if err != nil {
t.Fatalf("generateKey(%q): %v", tc.keyType, err)
}
switch key := signer.(type) {
case *rsa.PrivateKey:
if !tc.wantRSA {
t.Fatalf("generateKey(%q) = RSA, want Ed25519", tc.keyType)
}
if got := key.N.BitLen(); got != 3072 {
t.Errorf("RSA key size = %d bits, want 3072", got)
}
case ed25519.PrivateKey:
if tc.wantRSA {
t.Fatalf("generateKey(%q) = Ed25519, want RSA", tc.keyType)
}
default:
t.Fatalf("generateKey(%q) = %T, want *rsa.PrivateKey or ed25519.PrivateKey", tc.keyType, signer)
}
})
}
}
2 changes: 1 addition & 1 deletion cmd/agc/internal/agentpool/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ type Pool struct {

// NewPool creates a Pool for the given RunnerGroup.
// runnerLabels is the label set passed to GitHub during runner registration.
// keyType selects the algorithm for newly-generated agent keys; empty defaults to KeyTypeEd25519.
// keyType selects the algorithm for newly-generated agent keys; empty defaults to KeyTypeRSA (the secure default).
func NewPool(c client.Client, namespace, groupName, runnerVersion string, runnerLabels []string, registrar Registrar, keyType KeyType) *Pool {
return &Pool{
client: c,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,11 @@ func startAGCReconcilerOpts(t *testing.T, opts provisionerOptions) (*controller.
Client: mgr.GetClient(),
TokenManager: tm,
Registrar: reg,
// Use Ed25519 agent keys: these suites exercise the session/reconcile
// lifecycle, not crypto, and Ed25519 generation is near-instant. The
// production default is RSA-3072 (Q109), whose per-agent generation cost
// is high enough to blow these tests' Eventually timeouts on CI.
AgentKeyType: agentpool.KeyTypeEd25519,
BrokerConfig: controller.BrokerConfig{
BrokerURL: brokerStub.URL,
RunnerVersion: "2.335.1",
Expand Down
2 changes: 1 addition & 1 deletion cmd/agc/internal/controller/runnergroup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ type RunnerGroupReconciler struct {
Metrics *listener.Metrics
Log *slog.Logger
Provisioner *provisioner.Provisioner
AgentKeyType agentpool.KeyType // defaults to KeyTypeEd25519 when empty
AgentKeyType agentpool.KeyType // defaults to KeyTypeRSA (the secure default) when empty

// Recorder emits Kubernetes Events on the reconciled RunnerGroup so that
// credential, agent-pool, and listener failures surface in `kubectl describe
Expand Down
2 changes: 1 addition & 1 deletion docs/STATUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ Specific actionable items in priority order. Pick from the top; skip 🚫 items

| ID | Item | Labels | St | Sz | Notes |
|---|---|---|---|---|---|
| <a id="Q135"></a>Q135 | Flake: e2e E2E_AGC_MultipleJobsQueued — 4min timeout waiting for 2nd worker pod | `tests` `bug` | 🔲 | S | job_lifecycle_test.go:179 timed out waiting for the 2nd queued job's worker pod on PR 227 run 27510232006; passed on rerun w/o code change. Timing-sensitive worker-pod spawn under load; same shared-brokerStub/session-recycle class as Q120. |
| <a id="Q134"></a>Q134 | Flake: e2e E2E_AGC_WorkerPodAdmittedWithNonNumericUserImage — "no session registered" 180s timeout | `tests` `bug` `1.0-gate` | 🔲 | M | Heaviest e2e spec (real worker image) times out waiting for an AGC session (worker_securitycontext_test.go:96). Hit main + 2 PRs ~2026-06-14 17:00; other runs passed — intermittent. Likely AGC session/image-pull timing; blocks PRs when it fires. |
| <a id="Q131"></a>Q131 | Flake: TestListener_IdleNotShutdownIfLast poll-count timing | `tests` `bug` | 🔲 | S | goroutine_test.go:419 asserted poll count ≥5 but got 3 ("poll past threshold when last listener") on a local `make check`; passed on rerun (-count=3) w/o code change. Timing-sensitive idle-shutdown test; tighten synchronization, not the count. |
| <a id="Q113"></a>Q113 | Flake: eviction integration tests time out in waitForWorkerPod | `tests` `bug` | 🔲 | S | EvictionTriggersRequeue + EvictionBudgetExhausted (failure_recovery_test.go:142) timed out (20s) on CI run 27383065643, passed on rerun w/o code change. Suspects: sessions[len-1] pick on shared brokerStub; 20s budget on 2-vCPU runner. |
Expand Down Expand Up @@ -94,7 +95,6 @@ Specific actionable items in priority order. Pick from the top; skip 🚫 items
| <a id="Q106"></a>Q106 | Non-atomic eviction-retry counter race | `bug` | 🔲 | S | handleEviction reads-modifies-writes the count with no per-key lock (provisioner.go:451). Two concurrent evictions of the same run_id both pass the budget check and both call rerun-failed-jobs, exceeding the budget. Serialize per runID (atomic CAS). |
| <a id="Q107"></a>Q107 | Token manager tight retry loop when TTL ≤ refreshLeadTime | `bug` `infra` | 🔲 | S | On success wait=ExpiresAt-5min-now clamped to 0 (token/manager.go:174); a near-expiry token (skew/short TTL) → fetch→wait 0→fetch spin, a retry storm vs the token endpoint. Floor the post-success wait (e.g. 30s) when wait<=0. |
| <a id="Q108"></a>Q108 | Broker long-poll has no per-request deadline | `infra` | 🔲 | S | AGC leaves BrokerConfig.HTTPClient nil, so GetMessage long-polls with no ResponseHeaderTimeout (broker/client.go:163; agc/main.go:416). A black-holed broker conn blocks a listener until the OS TCP timeout. Set a tuned client or per-poll ctx (~55s). |
| <a id="Q109"></a>Q109 | Library agent-key-type default is ed25519, not the secure rsa | `security` | 🔲 | S | generateKey defaults empty keyType→Ed25519 (agentpool/crypto.go:24), the regressed path. Only the CLI default (rsa) holds the secure choice; any new constructor with empty keyType silently regresses. Make the library default RSA. See [Q11](#Q11). |
| <a id="Q110"></a>Q110 | coverage-baseline folds in untested test-helper packages | `tests` `infra` | 🔲 | S | coverage-baseline.txt floors (broker 48.3, gmc 48.2) fold in untested helpers (brokertest) and miss envtest coverage, so the ratchet misleads while core code is ~80%. Exclude helper pkgs so floors track production code. Sibling of Q77. |
| <a id="Q55"></a>Q55 | Verify provisioner-test goleak cascade fix held in CI | `tests` `bug` | 🔲 | S | Intermittent ~20-test goleak cascade in `internal/provisioner` fixed by `waitForPodCreated` helper in 59c0714; delete row once CI is clean. If flakes recur, migrate remaining ~18 Eventually-on-Pod sites to the helper. |
| <a id="Q129"></a>Q129 | [Public GitHub Pages website](plan/website.md) | `docs` `infra` | ▶ | L | [Public site](plan/website.md): MkDocs Material scaffold + landing + vs-ARC page + Pages workflow shipped, builds in CI. Remaining: public launch (gated on Q99 + enabling Pages); link reconcile + strict build folds into [Q52](#Q52). |
Expand Down
8 changes: 5 additions & 3 deletions docs/plan/release-1.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -342,11 +342,13 @@ docs caveat, not omission.
Both admit an unverified artifact into a signed release. Fix
(re-vendor + `git diff --exit-code` CI gate; checksum-pin cosign as the
`KIND_BINARY_SHA256` pattern does) or caveat.
- **Library `generateKey` empty-keyType default is Ed25519**
- ~~**Library `generateKey` empty-keyType default is Ed25519**
([Q109](../STATUS.md)): an empty `keyType` yields Ed25519 — the
secure-by-default regression `CLAUDE.md` explicitly forbids (Ed25519
agents can't decrypt RSA-OAEP session keys). Only the CLI default holds
RSA; the **library** default must be RSA too.
RSA; the **library** default must be RSA too.~~ **Resolved (Q109):**
`generateKey` now defaults an empty/unrecognised `keyType` to RSA-3072,
matching the CLI; Ed25519 requires the explicit `KeyTypeEd25519` opt-in.

---

Expand Down Expand Up @@ -415,7 +417,7 @@ Suggested sequence:
| D. Operability | Q35 (logging+probes) | Q34 HA, Q51, Q72, Q35 logger unify |
| E. Docs honesty | capacity reframe, egress + sandbox caveats, ops install flow, Q103 SLSA-L3 claim | — |
| F. Engineering quality | Q77 coverage, Q78 dup-check, Q79 `-race` unit, Q80 gosec, Q81 errcheck, Q84 shellcheck, Q66 install-artifact validation | — |
| G. Must-resolve (fix **or** caveat → Q99) | Q105 port-53 egress, Q126 + Q127-cosign release integrity, Q109 Ed25519 library default | — |
| G. Must-resolve (fix **or** caveat → Q99) | Q105 port-53 egress, Q126 + Q127-cosign release integrity, ~~Q109 Ed25519 library default~~ (fixed) | — |

**1.0 = all gating boxes ticked**, and every bucket-G item either fixed or
honestly caveated. Recommended items that slip become ordinary post-1.0
Expand Down
Loading