diff --git a/cmd/agc/internal/listener/goroutine.go b/cmd/agc/internal/listener/goroutine.go
index 254f6073..24013a74 100644
--- a/cmd/agc/internal/listener/goroutine.go
+++ b/cmd/agc/internal/listener/goroutine.go
@@ -66,9 +66,22 @@ type Config struct {
IdleThreshold int // consecutive 202s before idle shutdown; 0 means 50
// RenewInterval is the cadence of the per-job RenewJob loop. 0 means 60s.
RenewInterval time.Duration
- JobHandler JobHandlerFunc
- Clock Clock
- Log *slog.Logger
+ // ControlPlaneTimeout bounds each non-long-poll broker call on the
+ // session-establishment path — the OAuth token exchange and CreateSession —
+ // so a slow or unresponsive broker cannot wedge the goroutine indefinitely.
+ // Without it those calls inherit only the long-lived manager context (the
+ // broker uses http.DefaultClient, which has no read timeout), so a broker
+ // that accepts the connection but is slow to respond — e.g. an overloaded
+ // shared fakegithub under parallel CI load — blocks the goroutine inside a
+ // single attempt and the RunnerGroup never registers a session (Q134). With
+ // a deadline the call fails fast and retriably, so the Multiplexer restarts
+ // the baseline and retries. Zero selects defaultControlPlaneTimeout. The
+ // GetMessage long-poll is deliberately excluded — it holds the connection
+ // open for the broker's poll interval by design.
+ ControlPlaneTimeout time.Duration
+ JobHandler JobHandlerFunc
+ Clock Clock
+ Log *slog.Logger
// RunnerOS is passed to AcquireJob (e.g. "Linux").
RunnerOS string
@@ -112,6 +125,22 @@ type Config struct {
// traffic.
const staleEOFThreshold = 3
+// defaultControlPlaneTimeout is the per-call deadline applied to the listener's
+// non-long-poll broker operations (OAuth token exchange, CreateSession) when
+// Config.ControlPlaneTimeout is unset. 30s is generous for a healthy round-trip
+// to GitHub yet tight enough that several retries fit inside the e2e
+// session-registration budget when the broker stalls (Q134).
+const defaultControlPlaneTimeout = 30 * time.Second
+
+// controlPlaneTimeout returns the per-call deadline for the goroutine's
+// non-long-poll broker operations, defaulting when unset.
+func (cfg Config) controlPlaneTimeout() time.Duration {
+ if cfg.ControlPlaneTimeout > 0 {
+ return cfg.ControlPlaneTimeout
+ }
+ return defaultControlPlaneTimeout
+}
+
// Run executes the listener goroutine. It blocks until the context is cancelled
// or an unrecoverable error occurs (VersionTooOldError, unauthorized).
// The caller (Multiplexer) is responsible for restarting it after a recoverable exit.
@@ -321,7 +350,9 @@ func Run(ctx context.Context, cfg Config) error {
// refreshBrokerToken fetches a fresh OAuth token and sets it on cfg.Broker.
func refreshBrokerToken(ctx context.Context, cfg Config) error {
- token, err := githubapp.FetchRunnerOAuthToken(ctx, cfg.Agent.Creds, cfg.Agent.PrivateKey, cfg.HTTPClient)
+ cctx, cancel := context.WithTimeout(ctx, cfg.controlPlaneTimeout())
+ defer cancel()
+ token, err := githubapp.FetchRunnerOAuthToken(cctx, cfg.Agent.Creds, cfg.Agent.PrivateKey, cfg.HTTPClient)
if err != nil {
return fmt.Errorf("refresh broker token: %w", err)
}
@@ -350,7 +381,9 @@ type sessionState struct {
// the AES-256-CBC message key from the server's RSA-encrypted session key.
func createSession(ctx context.Context, cfg Config, log *slog.Logger) (sessionState, error) {
agentName := fmt.Sprintf("%s-%d", cfg.Group, cfg.Agent.Index)
- sess, err := cfg.Broker.CreateSession(ctx, cfg.Agent.AgentID, agentName, cfg.Agent.RunnerVersion)
+ cctx, cancel := context.WithTimeout(ctx, cfg.controlPlaneTimeout())
+ defer cancel()
+ sess, err := cfg.Broker.CreateSession(cctx, cfg.Agent.AgentID, agentName, cfg.Agent.RunnerVersion)
if err != nil {
var vtooOld *broker.VersionTooOldError
if errors.As(err, &vtooOld) {
@@ -421,13 +454,20 @@ func handleJob(ctx context.Context, cfg Config, log *slog.Logger, aesKey []byte,
runServiceURL = jobBody.RunServiceURL
)
- // Call AcquireJob if we have a runServiceURL.
+ // Call AcquireJob if we have a runServiceURL. Bounded by the control-plane
+ // timeout for the same reason as createSession: it is a short request/response
+ // call (not the long-poll), so an unresponsive broker here must not wedge the
+ // goroutine — that would block job pickup and the worker pod would never spawn
+ // (Q134 class). A timeout surfaces as a recoverable AcquireJob error; the poll
+ // loop logs it and continues, re-acquiring on the next delivery.
if runServiceURL != "" {
- resp, rawBytes, acqErr := cfg.Broker.AcquireJob(ctx, runServiceURL, broker.JobAcquisitionRequest{
+ acqCtx, cancelAcq := context.WithTimeout(ctx, cfg.controlPlaneTimeout())
+ resp, rawBytes, acqErr := cfg.Broker.AcquireJob(acqCtx, runServiceURL, broker.JobAcquisitionRequest{
JobMessageID: jobBody.RunnerRequestID,
RunnerOS: cfg.RunnerOS,
BillingOwnerID: jobBody.BillingOwnerID,
})
+ cancelAcq()
if acqErr != nil {
if cfg.Metrics != nil {
cfg.Metrics.JobAcquisitionErrors.WithLabelValues(cfg.Namespace, "acquirejob_failed").Inc()
diff --git a/cmd/agc/internal/listener/goroutine_test.go b/cmd/agc/internal/listener/goroutine_test.go
index 4105c6d6..5bd95ada 100644
--- a/cmd/agc/internal/listener/goroutine_test.go
+++ b/cmd/agc/internal/listener/goroutine_test.go
@@ -10,6 +10,7 @@ import (
"crypto/sha1" //nolint:gosec // SHA-1 required by RSA-OAEP to match server side
"encoding/base64"
"encoding/json"
+ "errors"
"io"
"net/http"
"net/http/httptest"
@@ -353,6 +354,64 @@ func TestListener_CreateSessionUnauthorized(t *testing.T) {
goleak.VerifyNone(t)
}
+// TestListener_CreateSessionStallDoesNotWedge proves that a broker which
+// accepts the connection but never responds to CreateSession cannot wedge the
+// goroutine: the per-call ControlPlaneTimeout fires, Run returns a *retriable*
+// error well before the outer context deadline, and the Multiplexer is free to
+// restart the baseline and retry. This is the Q134 regression guard — before
+// the control-plane timeout the goroutine inherited only the long-lived manager
+// context and blocked inside a single CreateSession indefinitely, so the
+// RunnerGroup never registered a session within the e2e budget.
+func TestListener_CreateSessionStallDoesNotWedge(t *testing.T) {
+ oauthSrv := oauthStub()
+
+ // The Create handler simulates an overloaded broker that accepts the
+ // connection but is slow to respond: it blocks until the test releases it
+ // (stop), with a hard-cap fallback so it can never wedge the run. Note we do
+ // NOT rely on r.Context() cancellation here — server-side observation of a
+ // client's context-deadline disconnect is not prompt or reliable, so the
+ // test drives the handler's lifetime directly via stop.
+ stop := make(chan struct{})
+ var once sync.Once
+ handlerReturned := make(chan struct{})
+ mux := &brokerMux{}
+ mux.SetCreate(func(_ http.ResponseWriter, _ *http.Request) {
+ select {
+ case <-stop:
+ case <-time.After(30 * time.Second): // safety cap; close(stop) fires first
+ }
+ once.Do(func() { close(handlerReturned) })
+ })
+ brokerSrv := httptest.NewServer(mux)
+
+ cfg := makeCfg(t, oauthSrv, brokerSrv)
+ cfg.ControlPlaneTimeout = 200 * time.Millisecond
+
+ // Generous outer deadline: the assertion is that Run returns far sooner,
+ // proving the per-call timeout fired rather than the outer context.
+ ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+ defer cancel()
+
+ start := time.Now()
+ err := listener.Run(ctx, cfg)
+ elapsed := time.Since(start)
+
+ require.Error(t, err, "Run must surface the stalled CreateSession, not hang")
+ assert.Less(t, elapsed, 3*time.Second,
+ "Run should fail fast on a stalled CreateSession (got %s)", elapsed)
+ // The failure must be retriable so the Multiplexer restarts the baseline; a
+ // NonRetriableError would permanently park it.
+ var nre *listener.NonRetriableError
+ assert.False(t, errors.As(err, &nre),
+ "a control-plane timeout must be retriable, got %v", err)
+
+ close(stop) // release the blocked handler
+ <-handlerReturned // ...and wait for it to return before the leak check
+ closeHTTP(oauthSrv)
+ closeHTTP(brokerSrv)
+ goleak.VerifyNone(t)
+}
+
func TestListener_GetMessage202Loop(t *testing.T) {
oauthSrv := oauthStub()
var polls atomic.Int32
@@ -526,6 +585,64 @@ func TestListener_AcquireJobThenReuse(t *testing.T) {
goleak.VerifyNone(t)
}
+// TestListener_AcquireJobStallDoesNotWedge proves the control-plane timeout also
+// guards the job-pickup path: a broker that accepts the connection but never
+// responds to AcquireJob must not wedge the goroutine inside handleJob (which
+// would block job pickup so the worker pod never spawns — the Q134 class at the
+// AcquireJob call site). The bounded AcquireJob fails fast, handleJob returns a
+// recoverable error, and the poll loop re-enters GetMessage.
+func TestListener_AcquireJobStallDoesNotWedge(t *testing.T) {
+ oauthSrv := oauthStub()
+ var delivered atomic.Bool
+ var pollsAfter atomic.Int32
+ mux := &brokerMux{}
+ brokerSrv := httptest.NewServer(mux)
+
+ mux.SetGetMessage(func(w http.ResponseWriter, _ *http.Request) {
+ if !delivered.Swap(true) {
+ w.Header().Set("Content-Type", "application/json")
+ _ = json.NewEncoder(w).Encode(jobMsgWithURL(brokerSrv.URL))
+ return
+ }
+ pollsAfter.Add(1)
+ w.WriteHeader(http.StatusAccepted)
+ })
+
+ // AcquireJob accepts the connection but is slow to respond. Without the
+ // per-call timeout the listener would wedge in handleJob here and never poll
+ // again; with it, AcquireJob fails fast and the poll loop continues.
+ stop := make(chan struct{})
+ var once sync.Once
+ handlerReturned := make(chan struct{})
+ mux.SetAcquire(func(_ http.ResponseWriter, _ *http.Request) {
+ select {
+ case <-stop:
+ case <-time.After(30 * time.Second): // safety cap; close(stop) fires first
+ }
+ once.Do(func() { close(handlerReturned) })
+ })
+
+ cfg := makeCfg(t, oauthSrv, brokerSrv)
+ cfg.IsLastListener = func() bool { return true }
+ cfg.ControlPlaneTimeout = 200 * time.Millisecond
+ cfg.JobHandler = func(_ context.Context, _, _ string, _ []byte, _ string) error { return nil }
+
+ ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+ defer cancel()
+
+ done := runAndWait(ctx, cfg)
+ assert.Eventually(t, func() bool { return pollsAfter.Load() > 0 }, 3*time.Second, 10*time.Millisecond,
+ "listener should re-poll after a stalled AcquireJob times out, not wedge in handleJob")
+ cancel()
+ <-done
+
+ close(stop)
+ <-handlerReturned
+ closeHTTP(oauthSrv)
+ closeHTTP(brokerSrv)
+ goleak.VerifyNone(t)
+}
+
func TestListener_SpawnReplacementOnAcquire(t *testing.T) {
oauthSrv := oauthStub()
var spawnCalls atomic.Int32
diff --git a/docs/STATUS.md b/docs/STATUS.md
index ded9359d..6a4c8be1 100644
--- a/docs/STATUS.md
+++ b/docs/STATUS.md
@@ -51,12 +51,15 @@ Specific actionable items in priority order. Pick from the top; skip 🚫 items
| ID | Item | Labels | St | Sz | Notes |
|---|---|---|---|---|---|
+| Q134 | Flake: e2e "no session registered" 180s timeout (WorkerPodAdmitted / SessionRegistered) | `tests` `bug` `1.0-gate` | 🔲 | S | Root-caused: listener createSession+OAuth fetch run on the manager ctx with no per-call deadline → a slow shared fakegithub wedges the goroutine past the 180s budget. Fix bounds control-plane broker calls; verify green post-merge. |
| Q135 | Flake: e2e E2E_AGC_MultipleJobsQueued — 4min timeout waiting for 2nd worker pod | `tests` `bug` | 🔲 | S | job_lifecycle_test.go:179 timed out waiting for the 2nd queued job's worker pod on PR 227 run 27510232006; passed on rerun w/o code change. Timing-sensitive worker-pod spawn under load; same shared-brokerStub/session-recycle class as Q120. |
-| Q134 | Flake: e2e E2E_AGC_WorkerPodAdmittedWithNonNumericUserImage — "no session registered" 180s timeout | `tests` `bug` `1.0-gate` | 🔲 | M | Heaviest e2e spec (real worker image) times out waiting for an AGC session (worker_securitycontext_test.go:96). Hit main + 2 PRs ~2026-06-14 17:00; other runs passed — intermittent. Likely AGC session/image-pull timing; blocks PRs when it fires. |
+| Q139 | Flake: e2e E2E_GMC_TenantProvisioning_ProxyConnectWorks — curl through proxy gets HTTP 504 | `tests` `bug` `infra` | 🔲 | M | provisioning_test.go:282: curl CONNECT through the per-tenant egress proxy to GitHub returned 504 on PR 231 e2e; other specs passed — intermittent. Proxy upstream/tunnel timeout under CI load; independent of the AGC session flakes (Q134). |
| Q131 | Flake: TestListener_IdleNotShutdownIfLast poll-count timing | `tests` `bug` | 🔲 | S | goroutine_test.go:419 asserted poll count ≥5 but got 3 ("poll past threshold when last listener") on a local `make check`; passed on rerun (-count=3) w/o code change. Timing-sensitive idle-shutdown test; tighten synchronization, not the count. |
| Q113 | Flake: eviction integration tests time out in waitForWorkerPod | `tests` `bug` | 🔲 | S | EvictionTriggersRequeue + EvictionBudgetExhausted (failure_recovery_test.go:142) timed out (20s) on CI run 27383065643, passed on rerun w/o code change. Suspects: sessions[len-1] pick on shared brokerStub; 20s budget on 2-vCPU runner. |
| Q120 | Flake: SIGTERM integration test misses session-delete budget | `tests` `bug` | 🔲 | S | TestAGC_SIGTERM_DeletesAllSessions: session-39 missed the 10s WaitForSessionDelete on CI run 27422248358 (PR 209), passed on rerun w/o code change. ~40 concurrent teardowns on a 2-vCPU runner; same shared-brokerStub/budget class as Q113. |
| Q136 | node-local-dns (NodeLocal DNSCache) support for tenant DNS egress | `security` `infra` `1.0-gate` | 🔲 | S | Q105 DNS rule (kube-dns podSelector) breaks NodeLocal DNSCache: pods query link-local 169.254.20.10 (hostNetwork pod) → dropped on enforcing CNI, incl proxy. Fix: also allow port-53 to 169.254.0.0/16 (link-local, non-routable, keeps Q105). |
+| Q137 | AGC RunnerGroup not re-reconciled after a non-retriable baseline-listener exit | `bug` `infra` | 🔲 | S | runnergroup_controller.go returns RequeueAfter=reapAfter (0 with no worker pods); if the permanent baseline exits non-retriably the L317 restart only fires on the next watch event — up to the 10h resync. Requeue when ActiveCountQ138 | Bounded-by-default HTTP clients — retrofit http.DefaultClient fallbacks + lint gate | `infra` `bug` `tests` | 🔲 | M | ~8 prod clients default to http.DefaultClient (no read timeout); a slow peer wedges the goroutine (Q134 class). Add a bounded-by-default httpx client, make long-poll the explicit exception, and gate new uses with forbidigo+noctx. |
| Q98 | Helm chart distribution/publishing pipeline | `infra` `1.0-gate` | 🔲 | M | Pipeline shipped: publish.yml chart-publish job packages, pushes (oci://ghcr/charts), and cosign-signs the chart. Remaining (first v* tag): live publish proof, flip prerelease annotation, oci:// in upgrade.md/README, Artifact Hub listing. |
| Q112 | GMC Events silently 403'd: recorder writes events.k8s.io, RBAC grants core only | `bug` `infra` | 🔲 | S | Same root cause as the AGC fix in PR 202 (Q95): GMC uses mgr.GetEventRecorder (writes events.k8s.io/v1) but its kubebuilder marker grants only core "" events, so every GMC Event is dropped. Fix marker + `make manifests`; assert one event in e2e. |
| Q9 | [M3-tests remaining items (H2/M/L)](plan/milestone-3-tests.md) | `milestone` `tests` | 🔲 | M | **Unblocked** — M3 metric assertions (H1) landed. Highest-leverage remaining: **H2** (rerun-API 5xx contract), **H3** (decryption-failure fallback), **M3** (`activePodCount` Pending branch). Worth picking up after 5c–5g. |
diff --git a/docs/plan/q134-session-registration-flake.md b/docs/plan/q134-session-registration-flake.md
new file mode 100644
index 00000000..ccf877a1
--- /dev/null
+++ b/docs/plan/q134-session-registration-flake.md
@@ -0,0 +1,88 @@
+# Q134 — e2e "no session registered" flake: root cause and fix
+
+Intermittent failure of the AGC→fakegithub session-registration e2e specs
+(`E2E_AGC_WorkerPodAdmittedWithNonNumericUserImage`, `E2E_AGC_SessionRegistered`,
+`E2E_AGC_MultipleJobsQueued`). One spec per run times out (180–240s) on
+`fakegithubActiveSessionsForOwner(...)` staying empty — the AGC never registers
+a broker session with the shared in-cluster `fakegithub`.
+
+## What it is NOT
+
+- **Not a regression from `5e490d9` (Q134's own "fix", PR #226).** The identical
+ failure (`worker_securitycontext_test.go:96`, "no session registered", 180s)
+ occurred on `main` at `5bab2d50e` (run 27505615811), *before* the "last green"
+ `ac54a78`. The apparent `ac54a78`-green / `5e490d9`-red bisect boundary is an
+ artifact of an intermittent flake landing on a green run by luck. It predates
+ the commit.
+- **Not the gate added by `5e490d9`.** `WaitForRunnerGroupReconciled` waits for
+ `RunnerGroup.status.observedGeneration`, which the reconciler sets
+ (`runnergroup_controller.go:327`) *synchronously, right after `mux.Start()`
+ returns*. `mux.Start()` only *spawns* the listener goroutine — it does not wait
+ for `createSession` to succeed. So `observedGeneration` is set ~9s after the
+ tenant is applied (CI timeline) while the session is still not registered; the
+ gate passes in <1s and the 180s session wait then fails. The gate's premise
+ ("observedGeneration ⇒ a broker session is imminent") is incomplete. It is
+ harmless but did not address the flake.
+- **Not DNS/NetworkPolicy.** `EnsureAgents` (agent registration, an HTTP
+ round-trip to fakegithub) *succeeds* at reconcile time — that is what sets
+ `observedGeneration` the gate observes. So AGC→fakegithub connectivity is fine
+ when the session subsequently fails to register.
+
+## Root cause
+
+The listener's session-establishment broker calls have **no per-call deadline**.
+`createSession` (`goroutine.go`) and the OAuth token exchange
+(`refreshBrokerToken` → `githubapp.FetchRunnerOAuthToken`) run on the goroutine's
+context, which derives from the **long-lived manager context** (cancelled only at
+AGC shutdown). The per-agent `broker.Client` is built with `HTTPClient: nil`
+(`BrokerConfig.HTTPClient` is never set in `cmd/agc/main.go`), so it falls back to
+`http.DefaultClient` — which has no overall/read timeout.
+
+Consequently, if fakegithub *accepts the TCP connection but is slow to respond*
+— plausible for a single-replica service under 6-proc parallel CI load — the
+goroutine blocks **inside a single `createSession`/token call indefinitely**
+rather than failing and retrying. The Multiplexer restarts a baseline listener
+every ~1s on a *returned* (retriable) error, but a wedged call never returns, so
+no retry happens and the RunnerGroup never registers a session within the test
+budget. The missing timeout is what converts transient slowness into a
+budget-exhausting hang.
+
+## Fix
+
+Bound the two session-establishment broker calls with a per-call deadline
+(`Config.ControlPlaneTimeout`, default 30s). A stalled call now fails fast and
+*retriably*, so the Multiplexer restarts the baseline and retries — many attempts
+fit inside the 180s budget. The `GetMessage` long-poll is deliberately left
+unbounded (it holds the connection open for the broker's poll interval by
+design).
+
+- `cmd/agc/internal/listener/goroutine.go`: `Config.ControlPlaneTimeout` +
+ `controlPlaneTimeout()` default; wrap `createSession` and `refreshBrokerToken`.
+- Test: `TestListener_CreateSessionStallDoesNotWedge` — a broker that never
+ responds to `CreateSession` makes `Run` return a *retriable* error well before
+ the outer deadline, instead of hanging.
+
+## CI validation and the AcquireJob extension
+
+The first push (createSession + OAuth bound) got the suite **past session
+registration** in CI: the next e2e run logged `job 1: picking a live session →
+enqueuing onto session-4`, then still timed out at `job_lifecycle_test.go:179`
+with `expected >= 1 new worker pods, have 0`. A job was enqueued onto a *live*
+session but no worker pod spawned for 240s — the **same unbounded-control-plane
+class at the next call site**: `AcquireJob` (the request between job delivery and
+pod provisioning) also ran on the manager ctx with `http.DefaultClient`. It is
+now bound by the same `ControlPlaneTimeout`; a stalled `AcquireJob` fails fast
+and the poll loop re-acquires (its error is already handled as recoverable).
+
+The e2e suite is **multi-flaky**, not single-cause. The same run also failed
+`E2E_GMC_TenantProvisioning_ProxyConnectWorks` (`provisioning_test.go:282`, a
+curl-through-proxy egress test) — a fast failure unrelated to AGC sessions or
+this change. That is a separate flake outside this PR's scope.
+
+## Follow-ups (filed separately)
+
+- **Q136** — `runnergroup_controller.go` returns `RequeueAfter=reapAfter`, which
+ is 0 when the RunnerGroup has no worker pods. If the permanent baseline exits
+ *non-retriably* (pool-exhausted / unauthorized), the recovery at L317
+ (`if mux.ActiveCount()==0 { mux.Start() }`) only fires on the next watch event —
+ up to the 10h resync. Requeue when `ActiveCount < desired`.