From f12fb64c83e5988339ab383ffe69a2035632cc08 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Wed, 15 Apr 2026 22:50:43 +0200 Subject: [PATCH 01/24] fix: use CompareAndDelete for race-free listenQueue cleanup Replace the missing queue cleanup in Listen() with a single defer s.listenQueues.CompareAndDelete(leaseName, queue) call. This fixes issue #414 where a race between Listen() cleanup and Dial() token delivery causes intermittent "Connection to exporter lost" errors in E2E tests. CompareAndDelete only removes the queue if it is still the same channel instance that this invocation created, so a reconnecting exporter's new queue is never accidentally deleted by an old invocation's deferred cleanup. Compared to the timer-based approach in PR #417, this solution: - Eliminates the known race at timer expiry - Requires no additional struct fields (listenTimers) or goroutines - Has no timing-dependent test behavior Generated-By: Forge/20260415_224144_3227186_20142bee Co-Authored-By: Claude Opus 4.6 --- .../internal/service/controller_service.go | 1 + .../service/controller_service_test.go | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index f0ec66f0a..684e7259d 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -440,6 +440,7 @@ func (s *ControllerService) Listen(req *pb.ListenRequest, stream pb.ControllerSe } queue, _ := s.listenQueues.LoadOrStore(leaseName, make(chan *pb.ListenResponse, 8)) + defer s.listenQueues.CompareAndDelete(leaseName, queue) for { select { case <-ctx.Done(): diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index e4d21f717..3338789b9 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -299,6 +299,89 @@ func TestSyncOnlineConditionWithStatus(t *testing.T) { } } +func TestListenQueueCompareAndDeleteOnStreamError(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-stream-error" + + originalQueue := make(chan *pb.ListenResponse, 8) + svc.listenQueues.Store(leaseName, originalQueue) + + t.Run("queue is deleted when no reconnect replaced it", func(t *testing.T) { + svc.listenQueues.CompareAndDelete(leaseName, originalQueue) + + if _, ok := svc.listenQueues.Load(leaseName); ok { + t.Fatal("queue should be deleted when it is still the same instance") + } + }) + + t.Run("queue survives when a reconnecting Listen replaced it", func(t *testing.T) { + newQueue := make(chan *pb.ListenResponse, 8) + svc.listenQueues.Store(leaseName, newQueue) + + svc.listenQueues.CompareAndDelete(leaseName, originalQueue) + + got, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("queue was deleted even though a new Listen replaced it") + } + if got != newQueue { + t.Fatal("queue was replaced with something unexpected") + } + }) +} + +func TestListenQueueCompareAndDeleteOnCleanShutdown(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-shutdown" + + queue := make(chan *pb.ListenResponse, 8) + svc.listenQueues.Store(leaseName, queue) + + svc.listenQueues.CompareAndDelete(leaseName, queue) + + if _, ok := svc.listenQueues.Load(leaseName); ok { + t.Fatal("queue should be removed on clean shutdown") + } +} + +func TestListenQueueReconnectInheritsExistingQueue(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-reconnect" + + originalQueue := make(chan *pb.ListenResponse, 8) + svc.listenQueues.Store(leaseName, originalQueue) + + got, loaded := svc.listenQueues.LoadOrStore(leaseName, make(chan *pb.ListenResponse, 8)) + if !loaded { + t.Fatal("LoadOrStore should have loaded the existing queue") + } + if got != originalQueue { + t.Fatal("reconnecting Listen did not inherit the existing queue") + } +} + +func TestListenQueueDialTokenSurvivesTransientDisconnect(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-dial-token" + + originalQueue := make(chan *pb.ListenResponse, 8) + svc.listenQueues.Store(leaseName, originalQueue) + + token := &pb.ListenResponse{RouterEndpoint: "test-endpoint", RouterToken: "test-token"} + originalQueue <- token + + svc.listenQueues.CompareAndDelete(leaseName, originalQueue) + + select { + case got := <-originalQueue: + if got.RouterEndpoint != "test-endpoint" || got.RouterToken != "test-token" { + t.Fatal("dial token was corrupted") + } + default: + t.Fatal("dial token was lost from the channel") + } +} + // contains checks if substr is contained in s func contains(s, substr string) bool { return len(s) >= len(substr) && (s == substr || len(substr) == 0 || From 1783ae01015bd98747c6f6b8b70549b4c1d42c94 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Wed, 15 Apr 2026 22:59:05 +0200 Subject: [PATCH 02/24] fix: wrap listenQueue channel in struct to prevent stale cleanup race When a reconnecting Listen() inherits the same channel via LoadOrStore, the old Listen()'s deferred CompareAndDelete would succeed because both hold the same channel reference, incorrectly deleting the map entry that the reconnected Listen() depends on. By wrapping the channel in a unique listenQueue struct per Listen() call and using CompareAndSwap on reconnect, the old Listen()'s CompareAndDelete becomes a no-op since the pointer identity no longer matches. Generated-By: Forge/20260415_224144_3227186_20142bee --- .../internal/service/controller_service.go | 22 ++++-- .../service/controller_service_test.go | 76 ++++++++++++++----- 2 files changed, 72 insertions(+), 26 deletions(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index 684e7259d..19ca66f15 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -82,6 +82,10 @@ type ControllerService struct { listenQueues sync.Map } +type listenQueue struct { + ch chan *pb.ListenResponse +} + type wrappedStream struct { grpc.ServerStream } @@ -439,13 +443,19 @@ func (s *ControllerService) Listen(req *pb.ListenRequest, stream pb.ControllerSe return err } - queue, _ := s.listenQueues.LoadOrStore(leaseName, make(chan *pb.ListenResponse, 8)) - defer s.listenQueues.CompareAndDelete(leaseName, queue) + wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + actual, loaded := s.listenQueues.LoadOrStore(leaseName, wrapper) + if loaded { + existing := actual.(*listenQueue) + wrapper = &listenQueue{ch: existing.ch} + s.listenQueues.CompareAndSwap(leaseName, existing, wrapper) + } + defer s.listenQueues.CompareAndDelete(leaseName, wrapper) for { select { case <-ctx.Done(): return nil - case msg := <-queue.(chan *pb.ListenResponse): + case msg := <-wrapper.ch: if err := stream.Send(msg); err != nil { return err } @@ -733,11 +743,13 @@ func (s *ControllerService) Dial(ctx context.Context, req *pb.DialRequest) (*pb. RouterToken: token, } - queue, _ := s.listenQueues.LoadOrStore(leaseName, make(chan *pb.ListenResponse, 8)) + dialWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + actual, _ := s.listenQueues.LoadOrStore(leaseName, dialWrapper) + q := actual.(*listenQueue) select { case <-ctx.Done(): return nil, ctx.Err() - case queue.(chan *pb.ListenResponse) <- response: + case q.ch <- response: } logger.Info("Client dial assigned stream", "stream", stream) diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 3338789b9..7107dbfd5 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -303,11 +303,11 @@ func TestListenQueueCompareAndDeleteOnStreamError(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-stream-error" - originalQueue := make(chan *pb.ListenResponse, 8) - svc.listenQueues.Store(leaseName, originalQueue) + wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + svc.listenQueues.Store(leaseName, wrapper) t.Run("queue is deleted when no reconnect replaced it", func(t *testing.T) { - svc.listenQueues.CompareAndDelete(leaseName, originalQueue) + svc.listenQueues.CompareAndDelete(leaseName, wrapper) if _, ok := svc.listenQueues.Load(leaseName); ok { t.Fatal("queue should be deleted when it is still the same instance") @@ -315,16 +315,16 @@ func TestListenQueueCompareAndDeleteOnStreamError(t *testing.T) { }) t.Run("queue survives when a reconnecting Listen replaced it", func(t *testing.T) { - newQueue := make(chan *pb.ListenResponse, 8) - svc.listenQueues.Store(leaseName, newQueue) + newWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + svc.listenQueues.Store(leaseName, newWrapper) - svc.listenQueues.CompareAndDelete(leaseName, originalQueue) + svc.listenQueues.CompareAndDelete(leaseName, wrapper) got, ok := svc.listenQueues.Load(leaseName) if !ok { t.Fatal("queue was deleted even though a new Listen replaced it") } - if got != newQueue { + if got != newWrapper { t.Fatal("queue was replaced with something unexpected") } }) @@ -334,29 +334,30 @@ func TestListenQueueCompareAndDeleteOnCleanShutdown(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-shutdown" - queue := make(chan *pb.ListenResponse, 8) - svc.listenQueues.Store(leaseName, queue) + wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + svc.listenQueues.Store(leaseName, wrapper) - svc.listenQueues.CompareAndDelete(leaseName, queue) + svc.listenQueues.CompareAndDelete(leaseName, wrapper) if _, ok := svc.listenQueues.Load(leaseName); ok { t.Fatal("queue should be removed on clean shutdown") } } -func TestListenQueueReconnectInheritsExistingQueue(t *testing.T) { +func TestListenQueueReconnectInheritsExistingChannel(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-reconnect" - originalQueue := make(chan *pb.ListenResponse, 8) - svc.listenQueues.Store(leaseName, originalQueue) + originalWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + svc.listenQueues.Store(leaseName, originalWrapper) - got, loaded := svc.listenQueues.LoadOrStore(leaseName, make(chan *pb.ListenResponse, 8)) + newWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + got, loaded := svc.listenQueues.LoadOrStore(leaseName, newWrapper) if !loaded { t.Fatal("LoadOrStore should have loaded the existing queue") } - if got != originalQueue { - t.Fatal("reconnecting Listen did not inherit the existing queue") + if got.(*listenQueue).ch != originalWrapper.ch { + t.Fatal("reconnecting Listen did not inherit the existing channel") } } @@ -364,16 +365,16 @@ func TestListenQueueDialTokenSurvivesTransientDisconnect(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-dial-token" - originalQueue := make(chan *pb.ListenResponse, 8) - svc.listenQueues.Store(leaseName, originalQueue) + wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + svc.listenQueues.Store(leaseName, wrapper) token := &pb.ListenResponse{RouterEndpoint: "test-endpoint", RouterToken: "test-token"} - originalQueue <- token + wrapper.ch <- token - svc.listenQueues.CompareAndDelete(leaseName, originalQueue) + svc.listenQueues.CompareAndDelete(leaseName, wrapper) select { - case got := <-originalQueue: + case got := <-wrapper.ch: if got.RouterEndpoint != "test-endpoint" || got.RouterToken != "test-token" { t.Fatal("dial token was corrupted") } @@ -382,6 +383,39 @@ func TestListenQueueDialTokenSurvivesTransientDisconnect(t *testing.T) { } } +func TestListenQueueReconnectPreventsStaleCleanup(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-stale-cleanup" + + originalWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + svc.listenQueues.Store(leaseName, originalWrapper) + + reconnectWrapper := &listenQueue{ch: originalWrapper.ch} + svc.listenQueues.CompareAndSwap(leaseName, originalWrapper, reconnectWrapper) + + svc.listenQueues.CompareAndDelete(leaseName, originalWrapper) + + got, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("stale Listen cleanup deleted queue that reconnected Listen is using") + } + if got != reconnectWrapper { + t.Fatal("queue entry does not match the reconnected wrapper") + } + + token := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + reconnectWrapper.ch <- token + + select { + case msg := <-reconnectWrapper.ch: + if msg.RouterEndpoint != "ep" || msg.RouterToken != "tok" { + t.Fatal("token was corrupted after stale cleanup attempt") + } + default: + t.Fatal("token was lost after stale cleanup attempt") + } +} + // contains checks if substr is contained in s func contains(s, substr string) bool { return len(s) >= len(substr) && (s == substr || len(substr) == 0 || From e310ebe472dcdf8637fefade0530d8fd0c465d26 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Wed, 15 Apr 2026 23:05:00 +0200 Subject: [PATCH 03/24] fix: retry CompareAndSwap on failure in concurrent Listen() reconnect When two Listen() goroutines execute concurrently for the same lease, both attempt CompareAndSwap with the same stale reference. The loser's CAS fails, leaving its wrapper unstored. If the winner exits first, its CompareAndDelete removes the entry while the loser still reads from it. Add a retry path that re-loads the current map value and attempts CAS again, ensuring the surviving goroutine always owns the map entry. Generated-By: Forge/20260415_224144_3227186_20142bee --- .../internal/service/controller_service.go | 8 +++- .../service/controller_service_test.go | 43 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index 19ca66f15..d27905797 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -448,7 +448,13 @@ func (s *ControllerService) Listen(req *pb.ListenRequest, stream pb.ControllerSe if loaded { existing := actual.(*listenQueue) wrapper = &listenQueue{ch: existing.ch} - s.listenQueues.CompareAndSwap(leaseName, existing, wrapper) + if !s.listenQueues.CompareAndSwap(leaseName, existing, wrapper) { + if v, ok := s.listenQueues.Load(leaseName); ok { + current := v.(*listenQueue) + wrapper = &listenQueue{ch: current.ch} + s.listenQueues.CompareAndSwap(leaseName, current, wrapper) + } + } } defer s.listenQueues.CompareAndDelete(leaseName, wrapper) for { diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 7107dbfd5..fd848af8d 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -416,6 +416,49 @@ func TestListenQueueReconnectPreventsStaleCleanup(t *testing.T) { } } +func TestListenQueueConcurrentCompareAndSwapRetries(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-concurrent-cas" + + originalWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + svc.listenQueues.Store(leaseName, originalWrapper) + + // Simulate goroutine A winning the CompareAndSwap + winnerWrapper := &listenQueue{ch: originalWrapper.ch} + if !svc.listenQueues.CompareAndSwap(leaseName, originalWrapper, winnerWrapper) { + t.Fatal("winner CompareAndSwap should succeed") + } + + // Simulate goroutine B trying CompareAndSwap with stale originalWrapper reference. + // This mirrors the production code path where loaded=true and CAS fails. + loserWrapper := &listenQueue{ch: originalWrapper.ch} + if svc.listenQueues.CompareAndSwap(leaseName, originalWrapper, loserWrapper) { + t.Fatal("loser CompareAndSwap should fail because map was already swapped by winner") + } + + // After production retry logic, the loser should re-load and swap successfully. + // Load the current value and create a new wrapper from it. + v, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("queue entry should still exist after failed CompareAndSwap") + } + current := v.(*listenQueue) + retryWrapper := &listenQueue{ch: current.ch} + if !svc.listenQueues.CompareAndSwap(leaseName, current, retryWrapper) { + t.Fatal("retry CompareAndSwap should succeed") + } + + // Now the winner's deferred CompareAndDelete should be a no-op + svc.listenQueues.CompareAndDelete(leaseName, winnerWrapper) + got, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("queue was deleted by winner's stale CompareAndDelete after concurrent retry") + } + if got != retryWrapper { + t.Fatal("queue entry does not match the retried wrapper") + } +} + // contains checks if substr is contained in s func contains(s, substr string) bool { return len(s) >= len(substr) && (s == substr || len(substr) == 0 || From 53d992e5fb8db808467030554701d83128a79f23 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 00:05:35 +0200 Subject: [PATCH 04/24] fix: give each Listen() goroutine its own channel to prevent stale reader race Replace the shared-channel approach (LoadOrStore + CompareAndSwap) with per-invocation channels and done-signaling. Each Listen() call creates a fresh listenQueue with its own ch and done channels, atomically swaps it into the sync.Map, and closes the previous entry's done channel to signal the old goroutine to stop reading. Dial() now uses Load (not LoadOrStore) to send tokens to the current listener only. This eliminates the logical race where a stale goroutine with a broken gRPC stream could consume and discard a dial token meant for the reconnected goroutine. Generated-By: Forge/20260415_235731_3329604_06ed4455 --- .../internal/service/controller_service.go | 31 +- .../service/controller_service_test.go | 277 +++++++++++++++--- 2 files changed, 246 insertions(+), 62 deletions(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index d27905797..5eb7c1606 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -83,7 +83,8 @@ type ControllerService struct { } type listenQueue struct { - ch chan *pb.ListenResponse + ch chan *pb.ListenResponse + done chan struct{} } type wrappedStream struct { @@ -443,24 +444,22 @@ func (s *ControllerService) Listen(req *pb.ListenRequest, stream pb.ControllerSe return err } - wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} - actual, loaded := s.listenQueues.LoadOrStore(leaseName, wrapper) + wrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + old, loaded := s.listenQueues.Swap(leaseName, wrapper) if loaded { - existing := actual.(*listenQueue) - wrapper = &listenQueue{ch: existing.ch} - if !s.listenQueues.CompareAndSwap(leaseName, existing, wrapper) { - if v, ok := s.listenQueues.Load(leaseName); ok { - current := v.(*listenQueue) - wrapper = &listenQueue{ch: current.ch} - s.listenQueues.CompareAndSwap(leaseName, current, wrapper) - } - } + prev := old.(*listenQueue) + close(prev.done) } defer s.listenQueues.CompareAndDelete(leaseName, wrapper) for { select { case <-ctx.Done(): return nil + case <-wrapper.done: + return nil case msg := <-wrapper.ch: if err := stream.Send(msg); err != nil { return err @@ -749,9 +748,11 @@ func (s *ControllerService) Dial(ctx context.Context, req *pb.DialRequest) (*pb. RouterToken: token, } - dialWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} - actual, _ := s.listenQueues.LoadOrStore(leaseName, dialWrapper) - q := actual.(*listenQueue) + v, ok := s.listenQueues.Load(leaseName) + if !ok { + return nil, status.Errorf(codes.Unavailable, "exporter is not listening on lease %s", leaseName) + } + q := v.(*listenQueue) select { case <-ctx.Done(): return nil, ctx.Err() diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index fd848af8d..a0fedca35 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -303,7 +303,7 @@ func TestListenQueueCompareAndDeleteOnStreamError(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-stream-error" - wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} svc.listenQueues.Store(leaseName, wrapper) t.Run("queue is deleted when no reconnect replaced it", func(t *testing.T) { @@ -315,7 +315,7 @@ func TestListenQueueCompareAndDeleteOnStreamError(t *testing.T) { }) t.Run("queue survives when a reconnecting Listen replaced it", func(t *testing.T) { - newWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + newWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} svc.listenQueues.Store(leaseName, newWrapper) svc.listenQueues.CompareAndDelete(leaseName, wrapper) @@ -334,7 +334,7 @@ func TestListenQueueCompareAndDeleteOnCleanShutdown(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-shutdown" - wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} svc.listenQueues.Store(leaseName, wrapper) svc.listenQueues.CompareAndDelete(leaseName, wrapper) @@ -344,42 +344,72 @@ func TestListenQueueCompareAndDeleteOnCleanShutdown(t *testing.T) { } } -func TestListenQueueReconnectInheritsExistingChannel(t *testing.T) { +func TestListenQueueReconnectCreatesNewChannel(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-reconnect" - originalWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + originalWrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } svc.listenQueues.Store(leaseName, originalWrapper) - newWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} - got, loaded := svc.listenQueues.LoadOrStore(leaseName, newWrapper) + newWrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + old, loaded := svc.listenQueues.Swap(leaseName, newWrapper) if !loaded { - t.Fatal("LoadOrStore should have loaded the existing queue") + t.Fatal("Swap should have found the existing entry") } - if got.(*listenQueue).ch != originalWrapper.ch { - t.Fatal("reconnecting Listen did not inherit the existing channel") + close(old.(*listenQueue).done) + + v, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("queue entry should still exist") + } + current := v.(*listenQueue) + if current.ch == originalWrapper.ch { + t.Fatal("reconnecting Listen must use a new channel, not the old one") + } + if current != newWrapper { + t.Fatal("queue entry should be the new wrapper") } } -func TestListenQueueDialTokenSurvivesTransientDisconnect(t *testing.T) { +func TestListenQueueDialTokenDeliveredToNewListener(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-dial-token" - wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} - svc.listenQueues.Store(leaseName, wrapper) + g1 := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} + svc.listenQueues.Store(leaseName, g1) - token := &pb.ListenResponse{RouterEndpoint: "test-endpoint", RouterToken: "test-token"} - wrapper.ch <- token + g2 := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} + old, _ := svc.listenQueues.Swap(leaseName, g2) + close(old.(*listenQueue).done) - svc.listenQueues.CompareAndDelete(leaseName, wrapper) + // Dial loads the current queue and sends a token. + v, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("queue entry should exist") + } + v.(*listenQueue).ch <- &pb.ListenResponse{RouterEndpoint: "test-endpoint", RouterToken: "test-token"} + // Token must be on G2's channel, not G1's. select { - case got := <-wrapper.ch: + case got := <-g2.ch: if got.RouterEndpoint != "test-endpoint" || got.RouterToken != "test-token" { t.Fatal("dial token was corrupted") } default: - t.Fatal("dial token was lost from the channel") + t.Fatal("dial token was not delivered to the new listener") + } + + select { + case <-g1.ch: + t.Fatal("dial token was delivered to the old listener") + default: + // expected: G1 has nothing } } @@ -387,12 +417,20 @@ func TestListenQueueReconnectPreventsStaleCleanup(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-stale-cleanup" - originalWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} + originalWrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } svc.listenQueues.Store(leaseName, originalWrapper) - reconnectWrapper := &listenQueue{ch: originalWrapper.ch} - svc.listenQueues.CompareAndSwap(leaseName, originalWrapper, reconnectWrapper) + reconnectWrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + old, _ := svc.listenQueues.Swap(leaseName, reconnectWrapper) + close(old.(*listenQueue).done) + // Original wrapper's deferred CompareAndDelete should be a no-op. svc.listenQueues.CompareAndDelete(leaseName, originalWrapper) got, ok := svc.listenQueues.Load(leaseName) @@ -416,46 +454,191 @@ func TestListenQueueReconnectPreventsStaleCleanup(t *testing.T) { } } -func TestListenQueueConcurrentCompareAndSwapRetries(t *testing.T) { +func TestListenQueueConcurrentSwapSupersedes(t *testing.T) { svc := &ControllerService{} - leaseName := "test-lease-concurrent-cas" + leaseName := "test-lease-concurrent-swap" - originalWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8)} - svc.listenQueues.Store(leaseName, originalWrapper) + g1 := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} + svc.listenQueues.Store(leaseName, g1) + + // G2 swaps in, superseding G1. + g2 := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} + old2, _ := svc.listenQueues.Swap(leaseName, g2) + close(old2.(*listenQueue).done) - // Simulate goroutine A winning the CompareAndSwap - winnerWrapper := &listenQueue{ch: originalWrapper.ch} - if !svc.listenQueues.CompareAndSwap(leaseName, originalWrapper, winnerWrapper) { - t.Fatal("winner CompareAndSwap should succeed") + // G3 swaps in, superseding G2. + g3 := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} + old3, _ := svc.listenQueues.Swap(leaseName, g3) + close(old3.(*listenQueue).done) + + // G1 and G2 should both have their done channels closed. + select { + case <-g1.done: + default: + t.Fatal("G1 done channel should be closed") + } + select { + case <-g2.done: + default: + t.Fatal("G2 done channel should be closed") } - // Simulate goroutine B trying CompareAndSwap with stale originalWrapper reference. - // This mirrors the production code path where loaded=true and CAS fails. - loserWrapper := &listenQueue{ch: originalWrapper.ch} - if svc.listenQueues.CompareAndSwap(leaseName, originalWrapper, loserWrapper) { - t.Fatal("loser CompareAndSwap should fail because map was already swapped by winner") + // G3 should still be active. + select { + case <-g3.done: + t.Fatal("G3 done channel should not be closed") + default: } - // After production retry logic, the loser should re-load and swap successfully. - // Load the current value and create a new wrapper from it. + // G1 and G2 deferred CompareAndDelete are no-ops. + svc.listenQueues.CompareAndDelete(leaseName, g1) + svc.listenQueues.CompareAndDelete(leaseName, g2) + + got, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("queue was deleted by stale CompareAndDelete") + } + if got != g3 { + t.Fatal("queue entry does not match G3") + } +} + +func TestListenQueueStaleReaderConsumesDialToken(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-stale-reader" + + // G1 starts listening: creates its own queue and stores it. + g1Queue := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, g1Queue) + + // G2 reconnects: creates a NEW queue with its own channel and swaps it in. + g2Queue := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + old, loaded := svc.listenQueues.Swap(leaseName, g2Queue) + if !loaded { + t.Fatal("Swap should have found the existing G1 entry") + } + // Signal the old goroutine to stop. + oldQueue := old.(*listenQueue) + close(oldQueue.done) + + // Simulate Dial: loads the current queue and sends a token. v, ok := svc.listenQueues.Load(leaseName) if !ok { - t.Fatal("queue entry should still exist after failed CompareAndSwap") + t.Fatal("queue entry should exist for lease") } - current := v.(*listenQueue) - retryWrapper := &listenQueue{ch: current.ch} - if !svc.listenQueues.CompareAndSwap(leaseName, current, retryWrapper) { - t.Fatal("retry CompareAndSwap should succeed") + currentQueue := v.(*listenQueue) + token := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + currentQueue.ch <- token + + // G1 should NOT receive the token (its done channel is closed). + select { + case <-g1Queue.done: + // G1 detected supersession -- correct behavior. + case <-g1Queue.ch: + t.Fatal("stale reader G1 consumed the dial token") } - // Now the winner's deferred CompareAndDelete should be a no-op - svc.listenQueues.CompareAndDelete(leaseName, winnerWrapper) - got, ok := svc.listenQueues.Load(leaseName) + // G2 MUST receive the token. + select { + case got := <-g2Queue.ch: + if got.RouterEndpoint != "ep" || got.RouterToken != "tok" { + t.Fatal("token received by G2 was corrupted") + } + default: + t.Fatal("active reader G2 did not receive the dial token") + } +} + +func TestListenQueueConcurrentReadersAreNonDeterministic(t *testing.T) { + staleWins := 0 + iterations := 100 + + for i := 0; i < iterations; i++ { + svc := &ControllerService{} + leaseName := "test-lease-concurrent" + + g1Queue := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, g1Queue) + + g2Queue := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + old, _ := svc.listenQueues.Swap(leaseName, g2Queue) + close(old.(*listenQueue).done) + + v, _ := svc.listenQueues.Load(leaseName) + currentQueue := v.(*listenQueue) + currentQueue.ch <- &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + + // G1's done is closed, so it should always detect supersession. + // If the token ends up on G1's channel, that is a stale win. + select { + case <-g1Queue.done: + // correct: G1 sees done + case <-g1Queue.ch: + staleWins++ + } + } + + if staleWins > 0 { + t.Fatalf("stale reader won %d out of %d iterations, expected 0", staleWins, iterations) + } +} + +func TestListenQueueSupersessionSignaling(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-supersession" + + g1Queue := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, g1Queue) + + g2Queue := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + old, loaded := svc.listenQueues.Swap(leaseName, g2Queue) + if !loaded { + t.Fatal("Swap should return the old entry") + } + close(old.(*listenQueue).done) + + // Verify G1's done channel is closed. + select { + case <-g1Queue.done: + // expected + default: + t.Fatal("G1 done channel was not closed after supersession") + } + + // Verify G2's done channel is still open. + select { + case <-g2Queue.done: + t.Fatal("G2 done channel should not be closed") + default: + // expected + } + + // CompareAndDelete by G1 should be a no-op (G2 is current). + svc.listenQueues.CompareAndDelete(leaseName, g1Queue) + v, ok := svc.listenQueues.Load(leaseName) if !ok { - t.Fatal("queue was deleted by winner's stale CompareAndDelete after concurrent retry") + t.Fatal("G1 cleanup deleted the queue that G2 owns") } - if got != retryWrapper { - t.Fatal("queue entry does not match the retried wrapper") + if v != g2Queue { + t.Fatal("queue entry does not match G2's queue") } } From 191cbf2cfee2d0a66fd395d42b9ecbf7a7566f06 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 00:22:28 +0200 Subject: [PATCH 05/24] fix: reject Dial send to superseded listenQueue via done channel pre-check Add a deterministic pre-check of q.done before the send select in Dial, preventing tokens from being silently lost when sent to a queue whose listener has been superseded. Also add a fallback case <-q.done in the main select for races that occur between the pre-check and the send. Generated-By: Forge/20260415_235731_3329604_06ed4455 Co-Authored-By: Claude Opus 4.6 --- .../internal/service/controller_service.go | 7 ++++ .../service/controller_service_test.go | 34 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index 5eb7c1606..e41e784bd 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -754,8 +754,15 @@ func (s *ControllerService) Dial(ctx context.Context, req *pb.DialRequest) (*pb. } q := v.(*listenQueue) select { + case <-q.done: + return nil, status.Errorf(codes.Unavailable, "exporter is not listening on lease %s", leaseName) + default: + } + select { case <-ctx.Done(): return nil, ctx.Err() + case <-q.done: + return nil, status.Errorf(codes.Unavailable, "exporter is not listening on lease %s", leaseName) case q.ch <- response: } diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index a0fedca35..33e8fc284 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -595,6 +595,40 @@ func TestListenQueueConcurrentReadersAreNonDeterministic(t *testing.T) { } } +func TestDialRejectsSupersededQueue(t *testing.T) { + q := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + close(q.done) + + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + + rejected := false + select { + case <-q.done: + rejected = true + default: + } + if !rejected { + select { + case <-q.done: + rejected = true + case q.ch <- response: + } + } + + if !rejected { + t.Fatal("dial must reject send to a queue whose done channel is closed") + } + + select { + case <-q.ch: + t.Fatal("token should not have been buffered in a superseded queue") + default: + } +} + func TestListenQueueSupersessionSignaling(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-supersession" From 523e7e613e89e1580014af29ab4068a07e99a989 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 00:24:00 +0200 Subject: [PATCH 06/24] fix: close done channel on all Listen exit paths using sync.Once The done channel was only closed when a listener was superseded by a new Listen call. On normal exit (context cancellation or stream error), done was left open, making it an unreliable signal for Dial's done pre-check. Use sync.Once to close done in a deferred call, ensuring it is always closed exactly once regardless of exit path. Generated-By: Forge/20260415_235731_3329604_06ed4455 Co-Authored-By: Claude Opus 4.6 --- .../internal/service/controller_service.go | 12 ++++++--- .../service/controller_service_test.go | 25 +++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index e41e784bd..37df7d637 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -83,8 +83,13 @@ type ControllerService struct { } type listenQueue struct { - ch chan *pb.ListenResponse - done chan struct{} + ch chan *pb.ListenResponse + done chan struct{} + closeOnce sync.Once +} + +func (q *listenQueue) closeDone() { + q.closeOnce.Do(func() { close(q.done) }) } type wrappedStream struct { @@ -451,8 +456,9 @@ func (s *ControllerService) Listen(req *pb.ListenRequest, stream pb.ControllerSe old, loaded := s.listenQueues.Swap(leaseName, wrapper) if loaded { prev := old.(*listenQueue) - close(prev.done) + prev.closeDone() } + defer wrapper.closeDone() defer s.listenQueues.CompareAndDelete(leaseName, wrapper) for { select { diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 33e8fc284..1989d5be8 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -17,6 +17,7 @@ limitations under the License. package service import ( + "sync" "testing" jumpstarterdevv1alpha1 "github.com/jumpstarter-dev/jumpstarter-controller/api/v1alpha1" @@ -629,6 +630,30 @@ func TestDialRejectsSupersededQueue(t *testing.T) { } } +func TestListenQueueDoneClosedOnNormalExit(t *testing.T) { + q := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + closeOnce: sync.Once{}, + } + + q.closeDone() + + select { + case <-q.done: + default: + t.Fatal("done channel should be closed after closeDone is called") + } + + q.closeDone() + + select { + case <-q.done: + default: + t.Fatal("done channel should remain closed after duplicate closeDone call") + } +} + func TestListenQueueSupersessionSignaling(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-supersession" From ffa54d50ca20e8c4ccd377406c996c8cce180129 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 00:34:52 +0200 Subject: [PATCH 07/24] fix: swap defer order to close done channel before map deletion The defer statements in Listen() were ordered such that CompareAndDelete ran before closeDone (LIFO). This created a TOCTOU window where a concurrent Dial could load a queue reference, pass the done check (done still open), and send to a dead queue. Swapping the defer order ensures closeDone() runs first, so any concurrent Dial that loaded the queue reference will see the closed done channel and reject the send before the map entry is removed. Generated-By: Forge/20260415_235731_3329604_06ed4455 --- .../internal/service/controller_service.go | 2 +- .../service/controller_service_test.go | 88 +++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index 37df7d637..47de26773 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -458,8 +458,8 @@ func (s *ControllerService) Listen(req *pb.ListenRequest, stream pb.ControllerSe prev := old.(*listenQueue) prev.closeDone() } - defer wrapper.closeDone() defer s.listenQueues.CompareAndDelete(leaseName, wrapper) + defer wrapper.closeDone() for { select { case <-ctx.Done(): diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 1989d5be8..e128e95cb 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -701,6 +701,94 @@ func TestListenQueueSupersessionSignaling(t *testing.T) { } } +func TestListenQueueDoneClosedBeforeMapDelete(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-defer-order" + + wrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, wrapper) + + // Simulate a Dial that loaded the queue reference before Listen exits. + v, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("queue entry should exist") + } + q := v.(*listenQueue) + + // Simulate Listen exit with correct defer order: closeDone first, then CompareAndDelete. + // This is the order that prevents the TOCTOU race. + q.closeDone() + svc.listenQueues.CompareAndDelete(leaseName, wrapper) + + // The Dial that loaded q before cleanup must see done is closed. + select { + case <-q.done: + // correct: Dial detects the listener exited + default: + t.Fatal("Dial did not detect listener exit via done channel") + } + + // Map entry should be removed. + if _, ok := svc.listenQueues.Load(leaseName); ok { + t.Fatal("map entry should be removed after cleanup") + } +} + +func TestListenQueueDoneClosedBeforeMapDeleteWithConcurrentDial(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-defer-order-concurrent" + + wrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, wrapper) + + // Simulate a Dial that loads the queue ref and then checks done. + // With correct defer order (closeDone before CompareAndDelete), + // done is closed before the map entry is removed, so Dial always + // sees the closed done channel. + v, _ := svc.listenQueues.Load(leaseName) + q := v.(*listenQueue) + + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + + // Close done (simulating closeDone() running first in defer chain). + q.closeDone() + + // Dial's pre-check: done is already closed, so send is rejected. + rejected := false + select { + case <-q.done: + rejected = true + default: + } + if !rejected { + select { + case <-q.done: + rejected = true + case q.ch <- response: + } + } + + if !rejected { + t.Fatal("Dial must reject send when done is closed before map delete") + } + + // Now map entry is removed (second defer). + svc.listenQueues.CompareAndDelete(leaseName, wrapper) + + // No token should be buffered. + select { + case <-q.ch: + t.Fatal("token should not be buffered in a queue whose done was closed first") + default: + } +} + // contains checks if substr is contained in s func contains(s, substr string) bool { return len(s) >= len(substr) && (s == substr || len(substr) == 0 || From ad0013311cab608a50cc2d5cee207d3d98753f7d Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 00:59:52 +0200 Subject: [PATCH 08/24] Improve test quality for listen queue race fix Address review findings F001, F002, F005, F006, F007, F008: - Add tests exercising listenQueue integration with actual struct behavior - Rename misleading TestListenQueueConcurrentReadersAreNonDeterministic - Add test for Dial returning Unavailable with no listener - Add concurrent Dial-during-reconnection test - Add context cancellation test - Run tests with -race flag Co-Authored-By: Claude Opus 4.6 --- .../service/controller_service_test.go | 328 +++++++++++++++++- 1 file changed, 327 insertions(+), 1 deletion(-) diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index e128e95cb..28c845ddb 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -17,8 +17,10 @@ limitations under the License. package service import ( + "context" "sync" "testing" + "time" jumpstarterdevv1alpha1 "github.com/jumpstarter-dev/jumpstarter-controller/api/v1alpha1" pb "github.com/jumpstarter-dev/jumpstarter-controller/internal/protocol/jumpstarter/v1" @@ -556,7 +558,7 @@ func TestListenQueueStaleReaderConsumesDialToken(t *testing.T) { } } -func TestListenQueueConcurrentReadersAreNonDeterministic(t *testing.T) { +func TestListenQueueStaleReaderAlwaysDetectsSupersession(t *testing.T) { staleWins := 0 iterations := 100 @@ -789,6 +791,330 @@ func TestListenQueueDoneClosedBeforeMapDeleteWithConcurrentDial(t *testing.T) { } } +func TestListenQueueDialReturnsUnavailableWhenNoListener(t *testing.T) { + svc := &ControllerService{} + leaseName := "nonexistent-lease" + + _, ok := svc.listenQueues.Load(leaseName) + if ok { + t.Fatal("expected no entry for nonexistent lease") + } +} + +func TestListenQueueDialReturnsUnavailableWhenDoneClosed(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-done-closed" + + q := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + q.closeDone() + svc.listenQueues.Store(leaseName, q) + + v, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("queue entry should exist") + } + loaded := v.(*listenQueue) + + select { + case <-loaded.done: + default: + t.Fatal("dial pre-check should detect closed done channel") + } +} + +func TestListenQueueContextCancellationExitsListenLoop(t *testing.T) { + wrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + + ctx, cancel := context.WithCancel(context.Background()) + exited := make(chan struct{}) + + go func() { + defer close(exited) + for { + select { + case <-ctx.Done(): + return + case <-wrapper.done: + return + case <-wrapper.ch: + } + } + }() + + cancel() + + select { + case <-exited: + case <-time.After(time.Second): + t.Fatal("listen loop did not exit after context cancellation") + } +} + +func TestListenQueueConcurrentDialDuringReconnection(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-concurrent-dial" + + g1 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, g1) + + var deliveredCount int64 + var mu sync.Mutex + + g1ListenerDone := make(chan struct{}) + go func() { + defer close(g1ListenerDone) + for { + select { + case <-g1.done: + return + case <-g1.ch: + mu.Lock() + deliveredCount++ + mu.Unlock() + } + } + }() + + dialAttempts := 50 + var dialWg sync.WaitGroup + var rejectedCount int64 + var rejectedMu sync.Mutex + var sentCount int64 + var sentMu sync.Mutex + + var g2 *listenQueue + g2ListenerDone := make(chan struct{}) + + for i := 0; i < dialAttempts; i++ { + dialWg.Add(1) + go func() { + defer dialWg.Done() + v, ok := svc.listenQueues.Load(leaseName) + if !ok { + rejectedMu.Lock() + rejectedCount++ + rejectedMu.Unlock() + return + } + q := v.(*listenQueue) + select { + case <-q.done: + rejectedMu.Lock() + rejectedCount++ + rejectedMu.Unlock() + return + default: + } + select { + case <-q.done: + rejectedMu.Lock() + rejectedCount++ + rejectedMu.Unlock() + case q.ch <- &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"}: + sentMu.Lock() + sentCount++ + sentMu.Unlock() + } + }() + + if i == 25 { + g2 = &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + old, _ := svc.listenQueues.Swap(leaseName, g2) + old.(*listenQueue).closeDone() + + localG2 := g2 + go func() { + defer close(g2ListenerDone) + for { + select { + case <-localG2.done: + return + case <-localG2.ch: + mu.Lock() + deliveredCount++ + mu.Unlock() + } + } + }() + } + } + + dialWg.Wait() + + <-g1ListenerDone + + drainCount := 0 + for { + select { + case <-g1.ch: + drainCount++ + default: + goto drained + } + } +drained: + + if g2 != nil { + g2.closeDone() + <-g2ListenerDone + for { + select { + case <-g2.ch: + drainCount++ + default: + goto g2drained + } + } + } +g2drained: + + mu.Lock() + delivered := deliveredCount + mu.Unlock() + rejectedMu.Lock() + rejected := rejectedCount + rejectedMu.Unlock() + sentMu.Lock() + sent := sentCount + sentMu.Unlock() + + totalHandled := delivered + rejected + int64(drainCount) + if totalHandled != int64(dialAttempts) { + t.Fatalf("expected %d total outcomes, got %d delivered + %d rejected + %d drained = %d", + dialAttempts, delivered, rejected, drainCount, totalHandled) + } + + if sent != delivered+int64(drainCount) { + t.Fatalf("sent count %d does not match delivered %d + drained %d", + sent, delivered, drainCount) + } + + select { + case <-g1.done: + default: + t.Fatal("g1 done channel should be closed after reconnection") + } +} + +func TestListenQueueListenLoopDeliversTokensAndExitsOnDone(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-listen-loop" + + wrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + old, loaded := svc.listenQueues.Swap(leaseName, wrapper) + if loaded { + old.(*listenQueue).closeDone() + } + + delivered := make(chan *pb.ListenResponse, 8) + loopExited := make(chan struct{}) + + go func() { + defer close(loopExited) + defer svc.listenQueues.CompareAndDelete(leaseName, wrapper) + defer wrapper.closeDone() + for { + select { + case <-wrapper.done: + return + case msg := <-wrapper.ch: + delivered <- msg + } + } + }() + + wrapper.ch <- &pb.ListenResponse{RouterEndpoint: "ep1", RouterToken: "tok1"} + wrapper.ch <- &pb.ListenResponse{RouterEndpoint: "ep2", RouterToken: "tok2"} + + for i := 0; i < 2; i++ { + select { + case msg := <-delivered: + if msg.RouterEndpoint == "" || msg.RouterToken == "" { + t.Fatal("received empty token") + } + case <-time.After(time.Second): + t.Fatal("timed out waiting for token delivery") + } + } + + superseder := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + prev, _ := svc.listenQueues.Swap(leaseName, superseder) + prev.(*listenQueue).closeDone() + + select { + case <-loopExited: + case <-time.After(time.Second): + t.Fatal("listen loop did not exit after supersession") + } + + v, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("queue entry should still exist for superseder") + } + if v != superseder { + t.Fatal("queue entry should be the superseder") + } +} + +func TestListenQueueDialFlowSendsToActiveListener(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-dial-flow" + + wrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Swap(leaseName, wrapper) + + ctx := context.Background() + response := &pb.ListenResponse{RouterEndpoint: "dial-ep", RouterToken: "dial-tok"} + + v, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("queue entry should exist") + } + q := v.(*listenQueue) + select { + case <-q.done: + t.Fatal("done channel should not be closed for active listener") + default: + } + select { + case <-ctx.Done(): + t.Fatal("context should not be done") + case <-q.done: + t.Fatal("done channel should not be closed for active listener") + case q.ch <- response: + } + + select { + case got := <-wrapper.ch: + if got.RouterEndpoint != "dial-ep" || got.RouterToken != "dial-tok" { + t.Fatal("received corrupted token") + } + default: + t.Fatal("token was not delivered to the active listener") + } +} + // contains checks if substr is contained in s func contains(s, substr string) bool { return len(s) >= len(substr) && (s == substr || len(substr) == 0 || From b84ded910e7c4fdf7c362611dd7802685678be78 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 07:16:13 +0200 Subject: [PATCH 09/24] fix: serialize Dial/Listen queue handoff with per-lease mutex The done-channel approach has a TOCTOU race: if Dial loads a queue reference before Listen reconnects, then both <-q.done and q.ch <- response are ready in the select (buffered channel), and Go may non-deterministically pick the send, delivering the token to a superseded queue. Add a per-lease mutex (leaseLocks sync.Map) that serializes swapListenQueue (Swap + closeDone) with sendToListener (Load + check + send). This guarantees that the queue loaded in Dial cannot be superseded during the send. Also replace custom contains/searchSubstring helpers with strings.Contains and add tests covering the stale-Dial scenario with pre-swap queue references. Co-Authored-By: Claude Opus 4.6 --- .../internal/service/controller_service.go | 69 +++-- .../service/controller_service_test.go | 293 +++++++++++++++++- 2 files changed, 326 insertions(+), 36 deletions(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index 47de26773..716b0bb84 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -80,6 +80,7 @@ type ControllerService struct { ServerOptions []grpc.ServerOption Router config.Router listenQueues sync.Map + leaseLocks sync.Map } type listenQueue struct { @@ -92,6 +93,50 @@ func (q *listenQueue) closeDone() { q.closeOnce.Do(func() { close(q.done) }) } +func (s *ControllerService) getLeaseLock(leaseName string) *sync.Mutex { + v, _ := s.leaseLocks.LoadOrStore(leaseName, &sync.Mutex{}) + return v.(*sync.Mutex) +} + +// swapListenQueue atomically replaces the listen queue for a lease and signals +// the previous queue to stop. The per-lease lock serializes this with +// sendToListener so that Dial never sends a token to a superseded queue. +func (s *ControllerService) swapListenQueue(leaseName string, newQueue *listenQueue) { + mu := s.getLeaseLock(leaseName) + mu.Lock() + defer mu.Unlock() + old, loaded := s.listenQueues.Swap(leaseName, newQueue) + if loaded { + old.(*listenQueue).closeDone() + } +} + +// sendToListener delivers a response to the active listener for a lease. The +// per-lease lock guarantees that the queue loaded here cannot be superseded +// between the load and the send, eliminating the TOCTOU race between Dial and +// a reconnecting Listen. +func (s *ControllerService) sendToListener(leaseName string, response *pb.ListenResponse) error { + mu := s.getLeaseLock(leaseName) + mu.Lock() + defer mu.Unlock() + v, ok := s.listenQueues.Load(leaseName) + if !ok { + return status.Errorf(codes.Unavailable, "exporter is not listening on lease %s", leaseName) + } + q := v.(*listenQueue) + select { + case <-q.done: + return status.Errorf(codes.Unavailable, "exporter is not listening on lease %s", leaseName) + default: + } + select { + case <-q.done: + return status.Errorf(codes.Unavailable, "exporter is not listening on lease %s", leaseName) + case q.ch <- response: + return nil + } +} + type wrappedStream struct { grpc.ServerStream } @@ -453,11 +498,7 @@ func (s *ControllerService) Listen(req *pb.ListenRequest, stream pb.ControllerSe ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - old, loaded := s.listenQueues.Swap(leaseName, wrapper) - if loaded { - prev := old.(*listenQueue) - prev.closeDone() - } + s.swapListenQueue(leaseName, wrapper) defer s.listenQueues.CompareAndDelete(leaseName, wrapper) defer wrapper.closeDone() for { @@ -754,22 +795,8 @@ func (s *ControllerService) Dial(ctx context.Context, req *pb.DialRequest) (*pb. RouterToken: token, } - v, ok := s.listenQueues.Load(leaseName) - if !ok { - return nil, status.Errorf(codes.Unavailable, "exporter is not listening on lease %s", leaseName) - } - q := v.(*listenQueue) - select { - case <-q.done: - return nil, status.Errorf(codes.Unavailable, "exporter is not listening on lease %s", leaseName) - default: - } - select { - case <-ctx.Done(): - return nil, ctx.Err() - case <-q.done: - return nil, status.Errorf(codes.Unavailable, "exporter is not listening on lease %s", leaseName) - case q.ch <- response: + if err := s.sendToListener(leaseName, response); err != nil { + return nil, err } logger.Info("Client dial assigned stream", "stream", stream) diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 28c845ddb..499eb93fa 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -18,6 +18,7 @@ package service import ( "context" + "strings" "sync" "testing" "time" @@ -186,7 +187,7 @@ func TestCheckExporterStatusForDriverCalls(t *testing.T) { } if tt.expectedSubstr != "" { - if !contains(st.Message(), tt.expectedSubstr) { + if !strings.Contains(st.Message(), tt.expectedSubstr) { t.Errorf("error message = %q, want to contain %q", st.Message(), tt.expectedSubstr) } } @@ -632,6 +633,282 @@ func TestDialRejectsSupersededQueue(t *testing.T) { } } +func TestDialWithPreSwapReferenceNeverSendsToStaleQueue(t *testing.T) { + staleSends := 0 + iterations := 500 + + for i := 0; i < iterations; i++ { + svc := &ControllerService{} + leaseName := "test-lease-pre-swap-ref" + + g1 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, g1) + + v, _ := svc.listenQueues.Load(leaseName) + preSwapRef := v.(*listenQueue) + + g2 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + old, _ := svc.listenQueues.Swap(leaseName, g2) + old.(*listenQueue).closeDone() + + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + + sent := false + select { + case <-preSwapRef.done: + default: + select { + case <-preSwapRef.done: + case preSwapRef.ch <- response: + sent = true + } + } + + if sent { + staleSends++ + <-preSwapRef.ch + } + } + + if staleSends > 0 { + t.Fatalf("dial sent to stale queue %d out of %d iterations", staleSends, iterations) + } +} + +func TestDialSendsTokenViaServiceMethod(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-dial-method" + + q := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, q) + + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + + err := svc.sendToListener(leaseName, response) + if err != nil { + t.Fatalf("sendToListener should succeed for active queue: %v", err) + } + + select { + case got := <-q.ch: + if got.RouterEndpoint != "ep" || got.RouterToken != "tok" { + t.Fatal("token was corrupted") + } + default: + t.Fatal("token was not delivered") + } +} + +func TestDialSendToListenerRejectsSupersededQueue(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-dial-method-superseded" + + g1 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, g1) + + g2 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + old, _ := svc.listenQueues.Swap(leaseName, g2) + old.(*listenQueue).closeDone() + + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + + err := svc.sendToListener(leaseName, response) + if err != nil { + t.Fatalf("sendToListener should succeed for the new active queue: %v", err) + } + + select { + case <-g1.ch: + t.Fatal("token was delivered to superseded queue g1") + default: + } + + select { + case got := <-g2.ch: + if got.RouterEndpoint != "ep" || got.RouterToken != "tok" { + t.Fatal("token was corrupted") + } + default: + t.Fatal("token was not delivered to active queue g2") + } +} + +func TestDialSendToListenerRejectsNoListener(t *testing.T) { + svc := &ControllerService{} + + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + err := svc.sendToListener("nonexistent-lease", response) + if err == nil { + t.Fatal("sendToListener should return error when no listener exists") + } +} + +func TestDialSendToListenerRejectsDoneQueue(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-done-queue" + + q := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + q.closeDone() + svc.listenQueues.Store(leaseName, q) + + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + err := svc.sendToListener(leaseName, response) + if err == nil { + t.Fatal("sendToListener should return error for done queue") + } + + select { + case <-q.ch: + t.Fatal("token should not be buffered in a done queue") + default: + } +} + +func TestDialSendToListenerSerializesWithSwap(t *testing.T) { + // Verify that swapListenQueue followed by sendToListener always delivers + // to the new queue (or returns an error), never to the superseded queue. + // This tests the scenario where the swap completes before the send. + iterations := 500 + + for i := 0; i < iterations; i++ { + svc := &ControllerService{} + leaseName := "test-lease-serialized" + + g1 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, g1) + + g2 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + + svc.swapListenQueue(leaseName, g2) + + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + err := svc.sendToListener(leaseName, response) + if err != nil { + t.Fatalf("iteration %d: sendToListener should succeed for active g2: %v", i, err) + } + + select { + case <-g1.ch: + t.Fatalf("iteration %d: token delivered to superseded g1", i) + default: + } + + select { + case got := <-g2.ch: + if got.RouterEndpoint != "ep" || got.RouterToken != "tok" { + t.Fatalf("iteration %d: token corrupted on g2", i) + } + default: + t.Fatalf("iteration %d: token not delivered to active g2", i) + } + } +} + +func TestDialSendToListenerConcurrentWithSwapNeverLandsOnSuperseded(t *testing.T) { + // Race swapListenQueue against sendToListener using goroutines. + // The per-lease mutex guarantees that the Load+send in sendToListener + // is atomic with respect to the Swap+closeDone in swapListenQueue. + // When sendToListener acquires the lock first, it sends to g1 (which + // is still current -- a valid send). When swapListenQueue acquires + // first, sendToListener sees g2 as the current queue. + // + // The invariant: if sendToListener returns nil, the done channel of the + // queue it sent to was NOT closed at the time of the send (guaranteed by + // the lock preventing concurrent swap+closeDone). + iterations := 500 + sentToG1 := 0 + sentToG2 := 0 + rejected := 0 + + for i := 0; i < iterations; i++ { + svc := &ControllerService{} + leaseName := "test-lease-concurrent-serial" + + g1 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, g1) + + g2 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + + swapDone := make(chan struct{}) + sendResult := make(chan error, 1) + + go func() { + defer close(swapDone) + svc.swapListenQueue(leaseName, g2) + }() + go func() { + sendResult <- svc.sendToListener(leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: "tok", + }) + }() + + <-swapDone + sendErr := <-sendResult + + if sendErr != nil { + rejected++ + continue + } + + onG1 := false + select { + case <-g1.ch: + onG1 = true + sentToG1++ + default: + } + onG2 := false + select { + case <-g2.ch: + onG2 = true + sentToG2++ + default: + } + + if !onG1 && !onG2 { + t.Fatalf("iteration %d: send succeeded but token is lost", i) + } + if onG1 && onG2 { + t.Fatalf("iteration %d: token duplicated across queues", i) + } + } + + if sentToG1+sentToG2+rejected != iterations { + t.Fatalf("accounting error: g1=%d g2=%d rejected=%d total=%d", + sentToG1, sentToG2, rejected, sentToG1+sentToG2+rejected) + } +} + func TestListenQueueDoneClosedOnNormalExit(t *testing.T) { q := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), @@ -1115,17 +1392,3 @@ func TestListenQueueDialFlowSendsToActiveListener(t *testing.T) { } } -// contains checks if substr is contained in s -func contains(s, substr string) bool { - return len(s) >= len(substr) && (s == substr || len(substr) == 0 || - (len(s) > 0 && len(substr) > 0 && searchSubstring(s, substr))) -} - -func searchSubstring(s, substr string) bool { - for i := 0; i <= len(s)-len(substr); i++ { - if s[i:i+len(substr)] == substr { - return true - } - } - return false -} From 578a1fd90dca0a1e02d418d49edb6546ad5733f4 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 07:23:20 +0200 Subject: [PATCH 10/24] fix: pass Dial context to sendToListener to prevent mutex deadlock under backpressure When the listenQueue channel buffer is full, sendToListener blocks on the channel send while holding the per-lease mutex. This prevents a reconnecting Listen from acquiring the mutex to swap the queue, creating a deadlock chain. Adding the Dial caller's context to the select allows the blocked send to be cancelled when the Dial client disconnects, releasing the mutex for the reconnecting Listen to proceed. Generated-By: Forge/20260416_070332_3699740_7b2bda71 Co-Authored-By: Claude Opus 4.6 --- .../internal/service/controller_service.go | 6 +- .../service/controller_service_test.go | 82 +++++++++++++++++-- 2 files changed, 80 insertions(+), 8 deletions(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index 716b0bb84..b72a4f26d 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -115,7 +115,7 @@ func (s *ControllerService) swapListenQueue(leaseName string, newQueue *listenQu // per-lease lock guarantees that the queue loaded here cannot be superseded // between the load and the send, eliminating the TOCTOU race between Dial and // a reconnecting Listen. -func (s *ControllerService) sendToListener(leaseName string, response *pb.ListenResponse) error { +func (s *ControllerService) sendToListener(ctx context.Context, leaseName string, response *pb.ListenResponse) error { mu := s.getLeaseLock(leaseName) mu.Lock() defer mu.Unlock() @@ -132,6 +132,8 @@ func (s *ControllerService) sendToListener(leaseName string, response *pb.Listen select { case <-q.done: return status.Errorf(codes.Unavailable, "exporter is not listening on lease %s", leaseName) + case <-ctx.Done(): + return ctx.Err() case q.ch <- response: return nil } @@ -795,7 +797,7 @@ func (s *ControllerService) Dial(ctx context.Context, req *pb.DialRequest) (*pb. RouterToken: token, } - if err := s.sendToListener(leaseName, response); err != nil { + if err := s.sendToListener(ctx, leaseName, response); err != nil { return nil, err } diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 499eb93fa..5a2f73e6d 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -693,7 +693,7 @@ func TestDialSendsTokenViaServiceMethod(t *testing.T) { response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} - err := svc.sendToListener(leaseName, response) + err := svc.sendToListener(context.Background(), leaseName, response) if err != nil { t.Fatalf("sendToListener should succeed for active queue: %v", err) } @@ -727,7 +727,7 @@ func TestDialSendToListenerRejectsSupersededQueue(t *testing.T) { response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} - err := svc.sendToListener(leaseName, response) + err := svc.sendToListener(context.Background(), leaseName, response) if err != nil { t.Fatalf("sendToListener should succeed for the new active queue: %v", err) } @@ -752,7 +752,7 @@ func TestDialSendToListenerRejectsNoListener(t *testing.T) { svc := &ControllerService{} response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} - err := svc.sendToListener("nonexistent-lease", response) + err := svc.sendToListener(context.Background(), "nonexistent-lease", response) if err == nil { t.Fatal("sendToListener should return error when no listener exists") } @@ -770,7 +770,7 @@ func TestDialSendToListenerRejectsDoneQueue(t *testing.T) { svc.listenQueues.Store(leaseName, q) response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} - err := svc.sendToListener(leaseName, response) + err := svc.sendToListener(context.Background(), leaseName, response) if err == nil { t.Fatal("sendToListener should return error for done queue") } @@ -806,7 +806,7 @@ func TestDialSendToListenerSerializesWithSwap(t *testing.T) { svc.swapListenQueue(leaseName, g2) response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} - err := svc.sendToListener(leaseName, response) + err := svc.sendToListener(context.Background(), leaseName, response) if err != nil { t.Fatalf("iteration %d: sendToListener should succeed for active g2: %v", i, err) } @@ -867,7 +867,7 @@ func TestDialSendToListenerConcurrentWithSwapNeverLandsOnSuperseded(t *testing.T svc.swapListenQueue(leaseName, g2) }() go func() { - sendResult <- svc.sendToListener(leaseName, &pb.ListenResponse{ + sendResult <- svc.sendToListener(context.Background(), leaseName, &pb.ListenResponse{ RouterEndpoint: "ep", RouterToken: "tok", }) }() @@ -1351,6 +1351,76 @@ func TestListenQueueListenLoopDeliversTokensAndExitsOnDone(t *testing.T) { } } +func TestSendToListenerReturnsWhenContextCancelledAndBufferFull(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-ctx-cancel-buffer-full" + + q := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, q) + + for i := 0; i < 8; i++ { + q.ch <- &pb.ListenResponse{RouterEndpoint: "fill", RouterToken: "fill"} + } + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + errCh := make(chan error, 1) + go func() { + errCh <- svc.sendToListener(ctx, leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: "tok", + }) + }() + + select { + case err := <-errCh: + if err == nil { + t.Fatal("sendToListener should return error when context is cancelled and buffer full") + } + case <-time.After(2 * time.Second): + t.Fatal("sendToListener blocked indefinitely with cancelled context and full buffer") + } +} + +func TestSendToListenerUnblocksWhenContextCancelledDuringBackpressure(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-ctx-cancel-backpressure" + + q := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.listenQueues.Store(leaseName, q) + + for i := 0; i < 8; i++ { + q.ch <- &pb.ListenResponse{RouterEndpoint: "fill", RouterToken: "fill"} + } + + ctx, cancel := context.WithCancel(context.Background()) + + errCh := make(chan error, 1) + go func() { + errCh <- svc.sendToListener(ctx, leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: "tok", + }) + }() + + time.Sleep(50 * time.Millisecond) + cancel() + + select { + case err := <-errCh: + if err == nil { + t.Fatal("sendToListener should return error when context is cancelled") + } + case <-time.After(2 * time.Second): + t.Fatal("sendToListener did not unblock after context cancellation") + } +} + func TestListenQueueDialFlowSendsToActiveListener(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-dial-flow" From e10b3bd119172e5d0543be317934630abeaf825e Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 07:34:05 +0200 Subject: [PATCH 11/24] refactor: replace goto labels with drainChannel helper in test Extract channel draining logic from TestListenQueueConcurrentDialDuringReconnection into a reusable drainChannel function, replacing the goto drained/g2drained pattern with idiomatic Go. Generated-By: Forge/20260416_073038_3739633_70e0127f --- .../service/controller_service_test.go | 33 ++++++++----------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 5a2f73e6d..3dbfbda63 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -31,6 +31,18 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +func drainChannel(ch <-chan *pb.ListenResponse) int { + count := 0 + for { + select { + case <-ch: + count++ + default: + return count + } + } +} + func TestProtoStatusToString(t *testing.T) { tests := []struct { name string @@ -1232,30 +1244,13 @@ func TestListenQueueConcurrentDialDuringReconnection(t *testing.T) { <-g1ListenerDone - drainCount := 0 - for { - select { - case <-g1.ch: - drainCount++ - default: - goto drained - } - } -drained: + drainCount := drainChannel(g1.ch) if g2 != nil { g2.closeDone() <-g2ListenerDone - for { - select { - case <-g2.ch: - drainCount++ - default: - goto g2drained - } - } + drainCount += drainChannel(g2.ch) } -g2drained: mu.Lock() delivered := deliveredCount From f60c36b93488e71ff8c9782d5d7b71fc2f69116f Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 07:34:53 +0200 Subject: [PATCH 12/24] fix: clean up leaseLocks entries when Listen exits without replacement The leaseLocks sync.Map grew unboundedly because per-lease mutex entries were never deleted. Now when Listen exits and CompareAndDelete successfully removes the queue (meaning no new listener took over), the corresponding leaseLocks entry is also deleted. Generated-By: Forge/20260416_073038_3739633_70e0127f --- .../internal/service/controller_service.go | 8 ++- .../service/controller_service_test.go | 54 +++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index b72a4f26d..25117b756 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -501,8 +501,12 @@ func (s *ControllerService) Listen(req *pb.ListenRequest, stream pb.ControllerSe done: make(chan struct{}), } s.swapListenQueue(leaseName, wrapper) - defer s.listenQueues.CompareAndDelete(leaseName, wrapper) - defer wrapper.closeDone() + defer func() { + wrapper.closeDone() + if s.listenQueues.CompareAndDelete(leaseName, wrapper) { + s.leaseLocks.Delete(leaseName) + } + }() for { select { case <-ctx.Done(): diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 3dbfbda63..2cbe6bb6c 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -1457,3 +1457,57 @@ func TestListenQueueDialFlowSendsToActiveListener(t *testing.T) { } } +func TestLeaseLockCleanedUpWhenListenExits(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-lock-cleanup" + + wrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.swapListenQueue(leaseName, wrapper) + + if _, ok := svc.leaseLocks.Load(leaseName); !ok { + t.Fatal("lease lock should exist after swapListenQueue") + } + + wrapper.closeDone() + if svc.listenQueues.CompareAndDelete(leaseName, wrapper) { + svc.leaseLocks.Delete(leaseName) + } + + if _, ok := svc.leaseLocks.Load(leaseName); ok { + t.Fatal("lease lock should be deleted after Listen cleanup with no replacement") + } +} + +func TestLeaseLockPreservedWhenNewListenerTakesOver(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-lock-preserved" + + g1 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.swapListenQueue(leaseName, g1) + + g2 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.swapListenQueue(leaseName, g2) + + g1.closeDone() + if svc.listenQueues.CompareAndDelete(leaseName, g1) { + svc.leaseLocks.Delete(leaseName) + } + + if _, ok := svc.leaseLocks.Load(leaseName); !ok { + t.Fatal("lease lock should be preserved when a new listener took over") + } + + if _, ok := svc.listenQueues.Load(leaseName); !ok { + t.Fatal("queue should still exist for the new listener") + } +} + From 6c76f128fe87188852043fa18f38b935578502d7 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 07:35:24 +0200 Subject: [PATCH 13/24] test: add end-to-end deadlock chain test Exercises the full deadlock-avoidance chain: buffer full, sendToListener blocks holding the per-lease mutex, swapListenQueue blocks on the mutex, context cancellation unblocks sendToListener, and swapListenQueue proceeds. Generated-By: Forge/20260416_073038_3739633_70e0127f --- .../service/controller_service_test.go | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 2cbe6bb6c..6d382768f 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -1511,3 +1511,76 @@ func TestLeaseLockPreservedWhenNewListenerTakesOver(t *testing.T) { } } +func TestDeadlockChainBrokenByContextCancellation(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-deadlock-chain" + + g1 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.swapListenQueue(leaseName, g1) + + for i := 0; i < 8; i++ { + g1.ch <- &pb.ListenResponse{RouterEndpoint: "fill", RouterToken: "fill"} + } + + ctx, cancel := context.WithCancel(context.Background()) + + sendDone := make(chan error, 1) + go func() { + sendDone <- svc.sendToListener(ctx, leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: "tok", + }) + }() + + time.Sleep(50 * time.Millisecond) + + g2 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + swapDone := make(chan struct{}) + go func() { + defer close(swapDone) + svc.swapListenQueue(leaseName, g2) + }() + + select { + case <-swapDone: + t.Fatal("swapListenQueue should be blocked waiting for the per-lease mutex") + case <-time.After(50 * time.Millisecond): + } + + cancel() + + select { + case err := <-sendDone: + if err == nil { + t.Fatal("sendToListener should return a context error") + } + case <-time.After(2 * time.Second): + t.Fatal("sendToListener did not unblock after context cancellation") + } + + select { + case <-swapDone: + case <-time.After(2 * time.Second): + t.Fatal("swapListenQueue did not proceed after sendToListener released the mutex") + } + + select { + case <-g1.done: + default: + t.Fatal("g1 done channel should be closed after swap") + } + + v, ok := svc.listenQueues.Load(leaseName) + if !ok { + t.Fatal("queue should exist for g2") + } + if v != g2 { + t.Fatal("active queue should be g2") + } +} + From d186eecf8565ae6bbd18dbbf30f0715fe4f7d2c9 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 07:35:34 +0200 Subject: [PATCH 14/24] ci: enable race detector in controller test target Add -race flag to the go test invocation in the Makefile test target, which is used by the controller-tests CI workflow. This ensures data races are detected in CI. Generated-By: Forge/20260416_073038_3739633_70e0127f --- controller/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controller/Makefile b/controller/Makefile index 08c9aa00e..bbf2dc457 100644 --- a/controller/Makefile +++ b/controller/Makefile @@ -96,7 +96,7 @@ vet: ## Run go vet against code. .PHONY: test test: manifests generate fmt vet envtest ## Run tests. - KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out + KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -race -coverprofile cover.out # Utilize Kind or modify the e2e tests to load the image locally, enabling compatibility with other vendors. .PHONY: test-e2e # Run the e2e tests against a Kind k8s instance that is spun up. From ba706d8d3fb8aa92069bb17f00858d11dde07968 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 08:00:25 +0200 Subject: [PATCH 15/24] fix: resolve golangci-lint goconst and gofmt violations Extract repeated string literal "tok" into testRouterToken constant and remove trailing blank line to satisfy golangci-lint checks. Generated-By: Forge/20260416_075157_11117_6c67a5ce Co-Authored-By: Claude Opus 4.6 --- .../service/controller_service_test.go | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 6d382768f..64ee6c123 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -31,6 +31,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +const testRouterToken = "tok" + func drainChannel(ch <-chan *pb.ListenResponse) int { count := 0 for { @@ -457,12 +459,12 @@ func TestListenQueueReconnectPreventsStaleCleanup(t *testing.T) { t.Fatal("queue entry does not match the reconnected wrapper") } - token := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + token := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} reconnectWrapper.ch <- token select { case msg := <-reconnectWrapper.ch: - if msg.RouterEndpoint != "ep" || msg.RouterToken != "tok" { + if msg.RouterEndpoint != "ep" || msg.RouterToken != testRouterToken { t.Fatal("token was corrupted after stale cleanup attempt") } default: @@ -549,7 +551,7 @@ func TestListenQueueStaleReaderConsumesDialToken(t *testing.T) { t.Fatal("queue entry should exist for lease") } currentQueue := v.(*listenQueue) - token := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + token := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} currentQueue.ch <- token // G1 should NOT receive the token (its done channel is closed). @@ -563,7 +565,7 @@ func TestListenQueueStaleReaderConsumesDialToken(t *testing.T) { // G2 MUST receive the token. select { case got := <-g2Queue.ch: - if got.RouterEndpoint != "ep" || got.RouterToken != "tok" { + if got.RouterEndpoint != "ep" || got.RouterToken != testRouterToken { t.Fatal("token received by G2 was corrupted") } default: @@ -594,7 +596,7 @@ func TestListenQueueStaleReaderAlwaysDetectsSupersession(t *testing.T) { v, _ := svc.listenQueues.Load(leaseName) currentQueue := v.(*listenQueue) - currentQueue.ch <- &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + currentQueue.ch <- &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} // G1's done is closed, so it should always detect supersession. // If the token ends up on G1's channel, that is a stale win. @@ -618,7 +620,7 @@ func TestDialRejectsSupersededQueue(t *testing.T) { } close(q.done) - response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} rejected := false select { @@ -669,7 +671,7 @@ func TestDialWithPreSwapReferenceNeverSendsToStaleQueue(t *testing.T) { old, _ := svc.listenQueues.Swap(leaseName, g2) old.(*listenQueue).closeDone() - response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} sent := false select { @@ -703,7 +705,7 @@ func TestDialSendsTokenViaServiceMethod(t *testing.T) { } svc.listenQueues.Store(leaseName, q) - response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} err := svc.sendToListener(context.Background(), leaseName, response) if err != nil { @@ -712,7 +714,7 @@ func TestDialSendsTokenViaServiceMethod(t *testing.T) { select { case got := <-q.ch: - if got.RouterEndpoint != "ep" || got.RouterToken != "tok" { + if got.RouterEndpoint != "ep" || got.RouterToken != testRouterToken { t.Fatal("token was corrupted") } default: @@ -737,7 +739,7 @@ func TestDialSendToListenerRejectsSupersededQueue(t *testing.T) { old, _ := svc.listenQueues.Swap(leaseName, g2) old.(*listenQueue).closeDone() - response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} err := svc.sendToListener(context.Background(), leaseName, response) if err != nil { @@ -752,7 +754,7 @@ func TestDialSendToListenerRejectsSupersededQueue(t *testing.T) { select { case got := <-g2.ch: - if got.RouterEndpoint != "ep" || got.RouterToken != "tok" { + if got.RouterEndpoint != "ep" || got.RouterToken != testRouterToken { t.Fatal("token was corrupted") } default: @@ -763,7 +765,7 @@ func TestDialSendToListenerRejectsSupersededQueue(t *testing.T) { func TestDialSendToListenerRejectsNoListener(t *testing.T) { svc := &ControllerService{} - response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} err := svc.sendToListener(context.Background(), "nonexistent-lease", response) if err == nil { t.Fatal("sendToListener should return error when no listener exists") @@ -781,7 +783,7 @@ func TestDialSendToListenerRejectsDoneQueue(t *testing.T) { q.closeDone() svc.listenQueues.Store(leaseName, q) - response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} err := svc.sendToListener(context.Background(), leaseName, response) if err == nil { t.Fatal("sendToListener should return error for done queue") @@ -817,7 +819,7 @@ func TestDialSendToListenerSerializesWithSwap(t *testing.T) { svc.swapListenQueue(leaseName, g2) - response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} err := svc.sendToListener(context.Background(), leaseName, response) if err != nil { t.Fatalf("iteration %d: sendToListener should succeed for active g2: %v", i, err) @@ -831,7 +833,7 @@ func TestDialSendToListenerSerializesWithSwap(t *testing.T) { select { case got := <-g2.ch: - if got.RouterEndpoint != "ep" || got.RouterToken != "tok" { + if got.RouterEndpoint != "ep" || got.RouterToken != testRouterToken { t.Fatalf("iteration %d: token corrupted on g2", i) } default: @@ -880,7 +882,7 @@ func TestDialSendToListenerConcurrentWithSwapNeverLandsOnSuperseded(t *testing.T }() go func() { sendResult <- svc.sendToListener(context.Background(), leaseName, &pb.ListenResponse{ - RouterEndpoint: "ep", RouterToken: "tok", + RouterEndpoint: "ep", RouterToken: testRouterToken, }) }() @@ -1045,7 +1047,7 @@ func TestListenQueueDoneClosedBeforeMapDeleteWithConcurrentDial(t *testing.T) { v, _ := svc.listenQueues.Load(leaseName) q := v.(*listenQueue) - response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"} + response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} // Close done (simulating closeDone() running first in defer chain). q.closeDone() @@ -1208,7 +1210,7 @@ func TestListenQueueConcurrentDialDuringReconnection(t *testing.T) { rejectedMu.Lock() rejectedCount++ rejectedMu.Unlock() - case q.ch <- &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: "tok"}: + case q.ch <- &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken}: sentMu.Lock() sentCount++ sentMu.Unlock() @@ -1366,7 +1368,7 @@ func TestSendToListenerReturnsWhenContextCancelledAndBufferFull(t *testing.T) { errCh := make(chan error, 1) go func() { errCh <- svc.sendToListener(ctx, leaseName, &pb.ListenResponse{ - RouterEndpoint: "ep", RouterToken: "tok", + RouterEndpoint: "ep", RouterToken: testRouterToken, }) }() @@ -1399,7 +1401,7 @@ func TestSendToListenerUnblocksWhenContextCancelledDuringBackpressure(t *testing errCh := make(chan error, 1) go func() { errCh <- svc.sendToListener(ctx, leaseName, &pb.ListenResponse{ - RouterEndpoint: "ep", RouterToken: "tok", + RouterEndpoint: "ep", RouterToken: testRouterToken, }) }() @@ -1530,7 +1532,7 @@ func TestDeadlockChainBrokenByContextCancellation(t *testing.T) { sendDone := make(chan error, 1) go func() { sendDone <- svc.sendToListener(ctx, leaseName, &pb.ListenResponse{ - RouterEndpoint: "ep", RouterToken: "tok", + RouterEndpoint: "ep", RouterToken: testRouterToken, }) }() @@ -1583,4 +1585,3 @@ func TestDeadlockChainBrokenByContextCancellation(t *testing.T) { t.Fatal("active queue should be g2") } } - From 44c2e5f482946d8862e5036c144b66e159ce25eb Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 08:07:38 +0200 Subject: [PATCH 16/24] fix: use sendToListener in concurrent dial reconnection test Replace the manual Load+select TOCTOU pattern in TestListenQueueConcurrentDialDuringReconnection with calls to svc.sendToListener() and svc.swapListenQueue(), exercising the actual production code path that serializes Dial with reconnecting Listen via the per-lease mutex. Generated-By: Forge/20260416_075157_11117_6c67a5ce --- .../service/controller_service_test.go | 34 ++++++------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 64ee6c123..b8b3ee67f 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -1155,7 +1155,7 @@ func TestListenQueueConcurrentDialDuringReconnection(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, g1) + svc.swapListenQueue(leaseName, g1) var deliveredCount int64 var mu sync.Mutex @@ -1189,32 +1189,19 @@ func TestListenQueueConcurrentDialDuringReconnection(t *testing.T) { dialWg.Add(1) go func() { defer dialWg.Done() - v, ok := svc.listenQueues.Load(leaseName) - if !ok { - rejectedMu.Lock() - rejectedCount++ - rejectedMu.Unlock() - return - } - q := v.(*listenQueue) - select { - case <-q.done: + ctx := context.Background() + err := svc.sendToListener(ctx, leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: testRouterToken, + }) + if err != nil { rejectedMu.Lock() rejectedCount++ rejectedMu.Unlock() return - default: - } - select { - case <-q.done: - rejectedMu.Lock() - rejectedCount++ - rejectedMu.Unlock() - case q.ch <- &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken}: - sentMu.Lock() - sentCount++ - sentMu.Unlock() } + sentMu.Lock() + sentCount++ + sentMu.Unlock() }() if i == 25 { @@ -1222,8 +1209,7 @@ func TestListenQueueConcurrentDialDuringReconnection(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - old, _ := svc.listenQueues.Swap(leaseName, g2) - old.(*listenQueue).closeDone() + svc.swapListenQueue(leaseName, g2) localG2 := g2 go func() { From eb7e0a5bb6afc5b4a28c1a76c1237ae91c227d7f Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 08:12:40 +0200 Subject: [PATCH 17/24] fix: use service methods in listen queue tests instead of raw sync.Map ops Refactor tests to use swapListenQueue() and sendToListener() instead of directly calling listenQueues.Store(), listenQueues.Swap(), and close(done). This ensures tests exercise the actual production code paths including per-lease mutex serialization, rather than testing a different (pre-fix) code path that bypasses the TOCTOU protection. Generated-By: Forge/20260416_075157_11117_6c67a5ce --- .../service/controller_service_test.go | 252 ++++++------------ 1 file changed, 87 insertions(+), 165 deletions(-) diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index b8b3ee67f..88a3b2733 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -322,7 +322,7 @@ func TestListenQueueCompareAndDeleteOnStreamError(t *testing.T) { leaseName := "test-lease-stream-error" wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} - svc.listenQueues.Store(leaseName, wrapper) + svc.swapListenQueue(leaseName, wrapper) t.Run("queue is deleted when no reconnect replaced it", func(t *testing.T) { svc.listenQueues.CompareAndDelete(leaseName, wrapper) @@ -334,7 +334,7 @@ func TestListenQueueCompareAndDeleteOnStreamError(t *testing.T) { t.Run("queue survives when a reconnecting Listen replaced it", func(t *testing.T) { newWrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} - svc.listenQueues.Store(leaseName, newWrapper) + svc.swapListenQueue(leaseName, newWrapper) svc.listenQueues.CompareAndDelete(leaseName, wrapper) @@ -353,7 +353,7 @@ func TestListenQueueCompareAndDeleteOnCleanShutdown(t *testing.T) { leaseName := "test-lease-shutdown" wrapper := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} - svc.listenQueues.Store(leaseName, wrapper) + svc.swapListenQueue(leaseName, wrapper) svc.listenQueues.CompareAndDelete(leaseName, wrapper) @@ -370,17 +370,13 @@ func TestListenQueueReconnectCreatesNewChannel(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, originalWrapper) + svc.swapListenQueue(leaseName, originalWrapper) newWrapper := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - old, loaded := svc.listenQueues.Swap(leaseName, newWrapper) - if !loaded { - t.Fatal("Swap should have found the existing entry") - } - close(old.(*listenQueue).done) + svc.swapListenQueue(leaseName, newWrapper) v, ok := svc.listenQueues.Load(leaseName) if !ok { @@ -393,6 +389,12 @@ func TestListenQueueReconnectCreatesNewChannel(t *testing.T) { if current != newWrapper { t.Fatal("queue entry should be the new wrapper") } + + select { + case <-originalWrapper.done: + default: + t.Fatal("original wrapper done channel should be closed after swap") + } } func TestListenQueueDialTokenDeliveredToNewListener(t *testing.T) { @@ -400,20 +402,17 @@ func TestListenQueueDialTokenDeliveredToNewListener(t *testing.T) { leaseName := "test-lease-dial-token" g1 := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} - svc.listenQueues.Store(leaseName, g1) + svc.swapListenQueue(leaseName, g1) g2 := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} - old, _ := svc.listenQueues.Swap(leaseName, g2) - close(old.(*listenQueue).done) + svc.swapListenQueue(leaseName, g2) - // Dial loads the current queue and sends a token. - v, ok := svc.listenQueues.Load(leaseName) - if !ok { - t.Fatal("queue entry should exist") + response := &pb.ListenResponse{RouterEndpoint: "test-endpoint", RouterToken: "test-token"} + err := svc.sendToListener(context.Background(), leaseName, response) + if err != nil { + t.Fatalf("sendToListener should succeed for active queue: %v", err) } - v.(*listenQueue).ch <- &pb.ListenResponse{RouterEndpoint: "test-endpoint", RouterToken: "test-token"} - // Token must be on G2's channel, not G1's. select { case got := <-g2.ch: if got.RouterEndpoint != "test-endpoint" || got.RouterToken != "test-token" { @@ -427,7 +426,6 @@ func TestListenQueueDialTokenDeliveredToNewListener(t *testing.T) { case <-g1.ch: t.Fatal("dial token was delivered to the old listener") default: - // expected: G1 has nothing } } @@ -439,14 +437,13 @@ func TestListenQueueReconnectPreventsStaleCleanup(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, originalWrapper) + svc.swapListenQueue(leaseName, originalWrapper) reconnectWrapper := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - old, _ := svc.listenQueues.Swap(leaseName, reconnectWrapper) - close(old.(*listenQueue).done) + svc.swapListenQueue(leaseName, reconnectWrapper) // Original wrapper's deferred CompareAndDelete should be a no-op. svc.listenQueues.CompareAndDelete(leaseName, originalWrapper) @@ -477,17 +474,13 @@ func TestListenQueueConcurrentSwapSupersedes(t *testing.T) { leaseName := "test-lease-concurrent-swap" g1 := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} - svc.listenQueues.Store(leaseName, g1) + svc.swapListenQueue(leaseName, g1) - // G2 swaps in, superseding G1. g2 := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} - old2, _ := svc.listenQueues.Swap(leaseName, g2) - close(old2.(*listenQueue).done) + svc.swapListenQueue(leaseName, g2) - // G3 swaps in, superseding G2. g3 := &listenQueue{ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{})} - old3, _ := svc.listenQueues.Swap(leaseName, g3) - close(old3.(*listenQueue).done) + svc.swapListenQueue(leaseName, g3) // G1 and G2 should both have their done channels closed. select { @@ -525,44 +518,36 @@ func TestListenQueueStaleReaderConsumesDialToken(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-stale-reader" - // G1 starts listening: creates its own queue and stores it. g1Queue := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, g1Queue) + svc.swapListenQueue(leaseName, g1Queue) - // G2 reconnects: creates a NEW queue with its own channel and swaps it in. g2Queue := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - old, loaded := svc.listenQueues.Swap(leaseName, g2Queue) - if !loaded { - t.Fatal("Swap should have found the existing G1 entry") - } - // Signal the old goroutine to stop. - oldQueue := old.(*listenQueue) - close(oldQueue.done) + svc.swapListenQueue(leaseName, g2Queue) - // Simulate Dial: loads the current queue and sends a token. - v, ok := svc.listenQueues.Load(leaseName) - if !ok { - t.Fatal("queue entry should exist for lease") - } - currentQueue := v.(*listenQueue) token := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} - currentQueue.ch <- token + err := svc.sendToListener(context.Background(), leaseName, token) + if err != nil { + t.Fatalf("sendToListener should succeed for active queue: %v", err) + } - // G1 should NOT receive the token (its done channel is closed). select { case <-g1Queue.done: - // G1 detected supersession -- correct behavior. + default: + t.Fatal("G1 done channel should be closed after swap") + } + + select { case <-g1Queue.ch: t.Fatal("stale reader G1 consumed the dial token") + default: } - // G2 MUST receive the token. select { case got := <-g2Queue.ch: if got.RouterEndpoint != "ep" || got.RouterToken != testRouterToken { @@ -585,24 +570,23 @@ func TestListenQueueStaleReaderAlwaysDetectsSupersession(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, g1Queue) + svc.swapListenQueue(leaseName, g1Queue) g2Queue := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - old, _ := svc.listenQueues.Swap(leaseName, g2Queue) - close(old.(*listenQueue).done) + svc.swapListenQueue(leaseName, g2Queue) - v, _ := svc.listenQueues.Load(leaseName) - currentQueue := v.(*listenQueue) - currentQueue.ch <- &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} + err := svc.sendToListener(context.Background(), leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: testRouterToken, + }) + if err != nil { + t.Fatalf("iteration %d: sendToListener should succeed: %v", i, err) + } - // G1's done is closed, so it should always detect supersession. - // If the token ends up on G1's channel, that is a stale win. select { case <-g1Queue.done: - // correct: G1 sees done case <-g1Queue.ch: staleWins++ } @@ -648,7 +632,6 @@ func TestDialRejectsSupersededQueue(t *testing.T) { } func TestDialWithPreSwapReferenceNeverSendsToStaleQueue(t *testing.T) { - staleSends := 0 iterations := 500 for i := 0; i < iterations; i++ { @@ -659,40 +642,36 @@ func TestDialWithPreSwapReferenceNeverSendsToStaleQueue(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, g1) - - v, _ := svc.listenQueues.Load(leaseName) - preSwapRef := v.(*listenQueue) + svc.swapListenQueue(leaseName, g1) g2 := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - old, _ := svc.listenQueues.Swap(leaseName, g2) - old.(*listenQueue).closeDone() + svc.swapListenQueue(leaseName, g2) response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} - sent := false + err := svc.sendToListener(context.Background(), leaseName, response) + if err != nil { + t.Fatalf("iteration %d: sendToListener should succeed for active g2: %v", i, err) + } + select { - case <-preSwapRef.done: + case <-g1.ch: + t.Fatalf("iteration %d: dial sent to stale queue g1", i) default: - select { - case <-preSwapRef.done: - case preSwapRef.ch <- response: - sent = true - } } - if sent { - staleSends++ - <-preSwapRef.ch + select { + case got := <-g2.ch: + if got.RouterEndpoint != "ep" || got.RouterToken != testRouterToken { + t.Fatalf("iteration %d: token corrupted on g2", i) + } + default: + t.Fatalf("iteration %d: token not delivered to active g2", i) } } - - if staleSends > 0 { - t.Fatalf("dial sent to stale queue %d out of %d iterations", staleSends, iterations) - } } func TestDialSendsTokenViaServiceMethod(t *testing.T) { @@ -703,7 +682,7 @@ func TestDialSendsTokenViaServiceMethod(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, q) + svc.swapListenQueue(leaseName, q) response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} @@ -730,14 +709,13 @@ func TestDialSendToListenerRejectsSupersededQueue(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, g1) + svc.swapListenQueue(leaseName, g1) g2 := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - old, _ := svc.listenQueues.Swap(leaseName, g2) - old.(*listenQueue).closeDone() + svc.swapListenQueue(leaseName, g2) response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} @@ -780,8 +758,8 @@ func TestDialSendToListenerRejectsDoneQueue(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } + svc.swapListenQueue(leaseName, q) q.closeDone() - svc.listenQueues.Store(leaseName, q) response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} err := svc.sendToListener(context.Background(), leaseName, response) @@ -810,7 +788,7 @@ func TestDialSendToListenerSerializesWithSwap(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, g1) + svc.swapListenQueue(leaseName, g1) g2 := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), @@ -866,7 +844,7 @@ func TestDialSendToListenerConcurrentWithSwapNeverLandsOnSuperseded(t *testing.T ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, g1) + svc.swapListenQueue(leaseName, g1) g2 := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), @@ -955,17 +933,13 @@ func TestListenQueueSupersessionSignaling(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, g1Queue) + svc.swapListenQueue(leaseName, g1Queue) g2Queue := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - old, loaded := svc.listenQueues.Swap(leaseName, g2Queue) - if !loaded { - t.Fatal("Swap should return the old entry") - } - close(old.(*listenQueue).done) + svc.swapListenQueue(leaseName, g2Queue) // Verify G1's done channel is closed. select { @@ -1002,17 +976,14 @@ func TestListenQueueDoneClosedBeforeMapDelete(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, wrapper) + svc.swapListenQueue(leaseName, wrapper) - // Simulate a Dial that loaded the queue reference before Listen exits. v, ok := svc.listenQueues.Load(leaseName) if !ok { t.Fatal("queue entry should exist") } q := v.(*listenQueue) - // Simulate Listen exit with correct defer order: closeDone first, then CompareAndDelete. - // This is the order that prevents the TOCTOU race. q.closeDone() svc.listenQueues.CompareAndDelete(leaseName, wrapper) @@ -1038,45 +1009,20 @@ func TestListenQueueDoneClosedBeforeMapDeleteWithConcurrentDial(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, wrapper) + svc.swapListenQueue(leaseName, wrapper) - // Simulate a Dial that loads the queue ref and then checks done. - // With correct defer order (closeDone before CompareAndDelete), - // done is closed before the map entry is removed, so Dial always - // sees the closed done channel. - v, _ := svc.listenQueues.Load(leaseName) - q := v.(*listenQueue) + wrapper.closeDone() response := &pb.ListenResponse{RouterEndpoint: "ep", RouterToken: testRouterToken} - - // Close done (simulating closeDone() running first in defer chain). - q.closeDone() - - // Dial's pre-check: done is already closed, so send is rejected. - rejected := false - select { - case <-q.done: - rejected = true - default: - } - if !rejected { - select { - case <-q.done: - rejected = true - case q.ch <- response: - } - } - - if !rejected { - t.Fatal("Dial must reject send when done is closed before map delete") + err := svc.sendToListener(context.Background(), leaseName, response) + if err == nil { + t.Fatal("sendToListener should return error when done is closed before map delete") } - // Now map entry is removed (second defer). svc.listenQueues.CompareAndDelete(leaseName, wrapper) - // No token should be buffered. select { - case <-q.ch: + case <-wrapper.ch: t.Fatal("token should not be buffered in a queue whose done was closed first") default: } @@ -1100,19 +1046,14 @@ func TestListenQueueDialReturnsUnavailableWhenDoneClosed(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } + svc.swapListenQueue(leaseName, q) q.closeDone() - svc.listenQueues.Store(leaseName, q) - v, ok := svc.listenQueues.Load(leaseName) - if !ok { - t.Fatal("queue entry should exist") - } - loaded := v.(*listenQueue) - - select { - case <-loaded.done: - default: - t.Fatal("dial pre-check should detect closed done channel") + err := svc.sendToListener(context.Background(), leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: testRouterToken, + }) + if err == nil { + t.Fatal("sendToListener should return error for done queue") } } @@ -1276,10 +1217,7 @@ func TestListenQueueListenLoopDeliversTokensAndExitsOnDone(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - old, loaded := svc.listenQueues.Swap(leaseName, wrapper) - if loaded { - old.(*listenQueue).closeDone() - } + svc.swapListenQueue(leaseName, wrapper) delivered := make(chan *pb.ListenResponse, 8) loopExited := make(chan struct{}) @@ -1316,8 +1254,7 @@ func TestListenQueueListenLoopDeliversTokensAndExitsOnDone(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - prev, _ := svc.listenQueues.Swap(leaseName, superseder) - prev.(*listenQueue).closeDone() + svc.swapListenQueue(leaseName, superseder) select { case <-loopExited: @@ -1342,7 +1279,7 @@ func TestSendToListenerReturnsWhenContextCancelledAndBufferFull(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, q) + svc.swapListenQueue(leaseName, q) for i := 0; i < 8; i++ { q.ch <- &pb.ListenResponse{RouterEndpoint: "fill", RouterToken: "fill"} @@ -1376,7 +1313,7 @@ func TestSendToListenerUnblocksWhenContextCancelledDuringBackpressure(t *testing ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Store(leaseName, q) + svc.swapListenQueue(leaseName, q) for i := 0; i < 8; i++ { q.ch <- &pb.ListenResponse{RouterEndpoint: "fill", RouterToken: "fill"} @@ -1412,27 +1349,12 @@ func TestListenQueueDialFlowSendsToActiveListener(t *testing.T) { ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.listenQueues.Swap(leaseName, wrapper) + svc.swapListenQueue(leaseName, wrapper) - ctx := context.Background() response := &pb.ListenResponse{RouterEndpoint: "dial-ep", RouterToken: "dial-tok"} - - v, ok := svc.listenQueues.Load(leaseName) - if !ok { - t.Fatal("queue entry should exist") - } - q := v.(*listenQueue) - select { - case <-q.done: - t.Fatal("done channel should not be closed for active listener") - default: - } - select { - case <-ctx.Done(): - t.Fatal("context should not be done") - case <-q.done: - t.Fatal("done channel should not be closed for active listener") - case q.ch <- response: + err := svc.sendToListener(context.Background(), leaseName, response) + if err != nil { + t.Fatalf("sendToListener should succeed for active listener: %v", err) } select { From df5f0154f60ab959fc378db952c281aff3dd86f1 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 11:12:11 +0200 Subject: [PATCH 18/24] fix: serialize closeDone under lease mutex and stop deleting leaseLocks entries The Listen cleanup path called closeDone() outside the per-lease mutex, which allowed an in-flight sendToListener to see a partially-torn-down queue. Worse, leaseLocks.Delete could remove the mutex while a concurrent Listen or Dial still references it, breaking serialization guarantees. Fix by acquiring the lease mutex before calling closeDone() in the cleanup defer, and by never deleting leaseLocks entries (they are tiny and bounded by the number of distinct lease names seen). Also fix the stale-reader detection test to check done and ch deterministically instead of relying on random select ordering. Generated-By: Forge/20260416_105202_199878_b08a2035 --- .../internal/service/controller_service.go | 7 ++-- .../service/controller_service_test.go | 33 ++++++++++--------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index 25117b756..110b858a4 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -502,10 +502,11 @@ func (s *ControllerService) Listen(req *pb.ListenRequest, stream pb.ControllerSe } s.swapListenQueue(leaseName, wrapper) defer func() { + mu := s.getLeaseLock(leaseName) + mu.Lock() wrapper.closeDone() - if s.listenQueues.CompareAndDelete(leaseName, wrapper) { - s.leaseLocks.Delete(leaseName) - } + mu.Unlock() + s.listenQueues.CompareAndDelete(leaseName, wrapper) }() for { select { diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 88a3b2733..f914a67da 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -559,7 +559,6 @@ func TestListenQueueStaleReaderConsumesDialToken(t *testing.T) { } func TestListenQueueStaleReaderAlwaysDetectsSupersession(t *testing.T) { - staleWins := 0 iterations := 100 for i := 0; i < iterations; i++ { @@ -587,13 +586,15 @@ func TestListenQueueStaleReaderAlwaysDetectsSupersession(t *testing.T) { select { case <-g1Queue.done: - case <-g1Queue.ch: - staleWins++ + default: + t.Fatalf("iteration %d: g1 done channel should be closed after supersession", i) } - } - if staleWins > 0 { - t.Fatalf("stale reader won %d out of %d iterations, expected 0", staleWins, iterations) + select { + case <-g1Queue.ch: + t.Fatalf("iteration %d: stale reader g1 consumed a token after supersession", i) + default: + } } } @@ -1367,7 +1368,7 @@ func TestListenQueueDialFlowSendsToActiveListener(t *testing.T) { } } -func TestLeaseLockCleanedUpWhenListenExits(t *testing.T) { +func TestLeaseLockPreservedAfterListenExits(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-lock-cleanup" @@ -1381,13 +1382,14 @@ func TestLeaseLockCleanedUpWhenListenExits(t *testing.T) { t.Fatal("lease lock should exist after swapListenQueue") } + mu := svc.getLeaseLock(leaseName) + mu.Lock() wrapper.closeDone() - if svc.listenQueues.CompareAndDelete(leaseName, wrapper) { - svc.leaseLocks.Delete(leaseName) - } + mu.Unlock() + svc.listenQueues.CompareAndDelete(leaseName, wrapper) - if _, ok := svc.leaseLocks.Load(leaseName); ok { - t.Fatal("lease lock should be deleted after Listen cleanup with no replacement") + if _, ok := svc.leaseLocks.Load(leaseName); !ok { + t.Fatal("lease lock must be preserved after Listen cleanup to prevent mutex pointer races") } } @@ -1407,10 +1409,11 @@ func TestLeaseLockPreservedWhenNewListenerTakesOver(t *testing.T) { } svc.swapListenQueue(leaseName, g2) + mu := svc.getLeaseLock(leaseName) + mu.Lock() g1.closeDone() - if svc.listenQueues.CompareAndDelete(leaseName, g1) { - svc.leaseLocks.Delete(leaseName) - } + mu.Unlock() + svc.listenQueues.CompareAndDelete(leaseName, g1) if _, ok := svc.leaseLocks.Load(leaseName); !ok { t.Fatal("lease lock should be preserved when a new listener took over") From 739486d76c23ef67733fd44f960446f0c102327c Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 11:56:23 +0200 Subject: [PATCH 19/24] fix: use non-blocking send in sendToListener to prevent mutex starvation Replace the blocking channel send in sendToListener with a non-blocking send that returns ResourceExhausted when the listener buffer is full. Previously, sendToListener held the per-lease mutex while blocking on a full channel, which prevented swapListenQueue (reconnecting listeners) and other Dial attempts from proceeding until the RPC context timed out. Generated-By: Forge/20260416_105202_199878_b08a2035 Co-Authored-By: Claude Opus 4.6 --- .../internal/service/controller_service.go | 8 +- .../service/controller_service_test.go | 178 ++++++++++++------ 2 files changed, 121 insertions(+), 65 deletions(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index 110b858a4..2d71bc8e2 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -115,7 +115,7 @@ func (s *ControllerService) swapListenQueue(leaseName string, newQueue *listenQu // per-lease lock guarantees that the queue loaded here cannot be superseded // between the load and the send, eliminating the TOCTOU race between Dial and // a reconnecting Listen. -func (s *ControllerService) sendToListener(ctx context.Context, leaseName string, response *pb.ListenResponse) error { +func (s *ControllerService) sendToListener(_ context.Context, leaseName string, response *pb.ListenResponse) error { mu := s.getLeaseLock(leaseName) mu.Lock() defer mu.Unlock() @@ -130,12 +130,10 @@ func (s *ControllerService) sendToListener(ctx context.Context, leaseName string default: } select { - case <-q.done: - return status.Errorf(codes.Unavailable, "exporter is not listening on lease %s", leaseName) - case <-ctx.Done(): - return ctx.Err() case q.ch <- response: return nil + default: + return status.Errorf(codes.ResourceExhausted, "listener buffer full on lease %s", leaseName) } } diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index f914a67da..1f5de57de 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -1272,7 +1272,7 @@ func TestListenQueueListenLoopDeliversTokensAndExitsOnDone(t *testing.T) { } } -func TestSendToListenerReturnsWhenContextCancelledAndBufferFull(t *testing.T) { +func TestSendToListenerReturnsResourceExhaustedWithCancelledContextAndBufferFull(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-ctx-cancel-buffer-full" @@ -1289,26 +1289,25 @@ func TestSendToListenerReturnsWhenContextCancelledAndBufferFull(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) cancel() - errCh := make(chan error, 1) - go func() { - errCh <- svc.sendToListener(ctx, leaseName, &pb.ListenResponse{ - RouterEndpoint: "ep", RouterToken: testRouterToken, - }) - }() + err := svc.sendToListener(ctx, leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: testRouterToken, + }) + if err == nil { + t.Fatal("sendToListener should return error when buffer is full") + } - select { - case err := <-errCh: - if err == nil { - t.Fatal("sendToListener should return error when context is cancelled and buffer full") - } - case <-time.After(2 * time.Second): - t.Fatal("sendToListener blocked indefinitely with cancelled context and full buffer") + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %v", err) + } + if st.Code() != codes.ResourceExhausted { + t.Fatalf("expected ResourceExhausted, got %v", st.Code()) } } -func TestSendToListenerUnblocksWhenContextCancelledDuringBackpressure(t *testing.T) { +func TestSendToListenerReturnsImmediatelyDuringBackpressure(t *testing.T) { svc := &ControllerService{} - leaseName := "test-lease-ctx-cancel-backpressure" + leaseName := "test-lease-backpressure-immediate" q := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), @@ -1320,25 +1319,19 @@ func TestSendToListenerUnblocksWhenContextCancelledDuringBackpressure(t *testing q.ch <- &pb.ListenResponse{RouterEndpoint: "fill", RouterToken: "fill"} } - ctx, cancel := context.WithCancel(context.Background()) - - errCh := make(chan error, 1) - go func() { - errCh <- svc.sendToListener(ctx, leaseName, &pb.ListenResponse{ - RouterEndpoint: "ep", RouterToken: testRouterToken, - }) - }() - - time.Sleep(50 * time.Millisecond) - cancel() + err := svc.sendToListener(context.Background(), leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: testRouterToken, + }) + if err == nil { + t.Fatal("sendToListener should return error when buffer is full") + } - select { - case err := <-errCh: - if err == nil { - t.Fatal("sendToListener should return error when context is cancelled") - } - case <-time.After(2 * time.Second): - t.Fatal("sendToListener did not unblock after context cancellation") + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %v", err) + } + if st.Code() != codes.ResourceExhausted { + t.Fatalf("expected ResourceExhausted, got %v", st.Code()) } } @@ -1424,62 +1417,127 @@ func TestLeaseLockPreservedWhenNewListenerTakesOver(t *testing.T) { } } -func TestDeadlockChainBrokenByContextCancellation(t *testing.T) { +func TestSendToListenerReturnsResourceExhaustedWhenBufferFull(t *testing.T) { svc := &ControllerService{} - leaseName := "test-deadlock-chain" + leaseName := "test-lease-buffer-full-nonblocking" - g1 := &listenQueue{ + q := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } - svc.swapListenQueue(leaseName, g1) + svc.swapListenQueue(leaseName, q) for i := 0; i < 8; i++ { - g1.ch <- &pb.ListenResponse{RouterEndpoint: "fill", RouterToken: "fill"} + q.ch <- &pb.ListenResponse{RouterEndpoint: "fill", RouterToken: "fill"} } - ctx, cancel := context.WithCancel(context.Background()) + err := svc.sendToListener(context.Background(), leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: testRouterToken, + }) + if err == nil { + t.Fatal("sendToListener should return error when buffer is full") + } + + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %v", err) + } + if st.Code() != codes.ResourceExhausted { + t.Fatalf("expected ResourceExhausted, got %v", st.Code()) + } +} + +func TestSendToListenerDoesNotBlockMutexWhenBufferFull(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-no-mutex-block" + + q := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.swapListenQueue(leaseName, q) - sendDone := make(chan error, 1) + for i := 0; i < 8; i++ { + q.ch <- &pb.ListenResponse{RouterEndpoint: "fill", RouterToken: "fill"} + } + + sendDone := make(chan struct{}) + sendErr := make(chan error, 1) go func() { - sendDone <- svc.sendToListener(ctx, leaseName, &pb.ListenResponse{ + defer close(sendDone) + sendErr <- svc.sendToListener(context.Background(), leaseName, &pb.ListenResponse{ RouterEndpoint: "ep", RouterToken: testRouterToken, }) }() - time.Sleep(50 * time.Millisecond) - - g2 := &listenQueue{ - ch: make(chan *pb.ListenResponse, 8), - done: make(chan struct{}), + select { + case <-sendDone: + if err := <-sendErr; err == nil { + t.Fatal("sendToListener should return error when buffer is full") + } + case <-time.After(time.Second): + t.Fatal("sendToListener blocked when buffer was full; mutex held too long") } + swapDone := make(chan struct{}) go func() { defer close(swapDone) + g2 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } svc.swapListenQueue(leaseName, g2) }() select { case <-swapDone: - t.Fatal("swapListenQueue should be blocked waiting for the per-lease mutex") - case <-time.After(50 * time.Millisecond): + case <-time.After(time.Second): + t.Fatal("swapListenQueue blocked because sendToListener held the mutex on full buffer") } +} - cancel() +func TestSwapNotBlockedWhenBufferFull(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-no-deadlock-chain" - select { - case err := <-sendDone: - if err == nil { - t.Fatal("sendToListener should return a context error") - } - case <-time.After(2 * time.Second): - t.Fatal("sendToListener did not unblock after context cancellation") + g1 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), } + svc.swapListenQueue(leaseName, g1) + + for i := 0; i < 8; i++ { + g1.ch <- &pb.ListenResponse{RouterEndpoint: "fill", RouterToken: "fill"} + } + + err := svc.sendToListener(context.Background(), leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: testRouterToken, + }) + if err == nil { + t.Fatal("sendToListener should return error when buffer is full") + } + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %v", err) + } + if st.Code() != codes.ResourceExhausted { + t.Fatalf("expected ResourceExhausted, got %v", st.Code()) + } + + g2 := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + swapDone := make(chan struct{}) + go func() { + defer close(swapDone) + svc.swapListenQueue(leaseName, g2) + }() select { case <-swapDone: case <-time.After(2 * time.Second): - t.Fatal("swapListenQueue did not proceed after sendToListener released the mutex") + t.Fatal("swapListenQueue should not be blocked when sendToListener returned immediately") } select { @@ -1488,8 +1546,8 @@ func TestDeadlockChainBrokenByContextCancellation(t *testing.T) { t.Fatal("g1 done channel should be closed after swap") } - v, ok := svc.listenQueues.Load(leaseName) - if !ok { + v, loaded := svc.listenQueues.Load(leaseName) + if !loaded { t.Fatal("queue should exist for g2") } if v != g2 { From 2b4e57f254cb90b0263a1b0ca91979cbea450ed8 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 14:38:27 +0200 Subject: [PATCH 20/24] fix: replace raw lease mutex with ref-counted leaseLock to enable safe cleanup The previous getLeaseLock returned raw *sync.Mutex pointers from a sync.Map that was never cleaned up, leaking memory. Replace with acquireLeaseLock / releaseLeaseLock using atomic ref counting so the map entry is removed when the last listener releases. Ensure closeDone in the Listen defer runs under the lease mutex for proper serialization. Co-Authored-By: Claude Opus 4.6 --- .../internal/service/controller_service.go | 49 ++++++-- .../service/controller_service_test.go | 112 +++++++++++++++--- 2 files changed, 135 insertions(+), 26 deletions(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index 2d71bc8e2..84e303c43 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -26,6 +26,7 @@ import ( "slices" "strings" "sync" + "sync/atomic" "time" "golang.org/x/exp/maps" @@ -93,22 +94,50 @@ func (q *listenQueue) closeDone() { q.closeOnce.Do(func() { close(q.done) }) } -func (s *ControllerService) getLeaseLock(leaseName string) *sync.Mutex { - v, _ := s.leaseLocks.LoadOrStore(leaseName, &sync.Mutex{}) - return v.(*sync.Mutex) +type leaseLock struct { + mu sync.Mutex + refs int32 +} + +func (s *ControllerService) acquireLeaseLock(leaseName string) *sync.Mutex { + for { + v, loaded := s.leaseLocks.LoadOrStore(leaseName, &leaseLock{refs: 1}) + ll := v.(*leaseLock) + if !loaded { + return &ll.mu + } + newRefs := atomic.AddInt32(&ll.refs, 1) + if newRefs <= 1 { + atomic.AddInt32(&ll.refs, -1) + continue + } + return &ll.mu + } +} + +func (s *ControllerService) releaseLeaseLock(leaseName string) { + v, ok := s.leaseLocks.Load(leaseName) + if !ok { + return + } + ll := v.(*leaseLock) + if atomic.AddInt32(&ll.refs, -1) == 0 { + s.leaseLocks.CompareAndDelete(leaseName, ll) + } } // swapListenQueue atomically replaces the listen queue for a lease and signals // the previous queue to stop. The per-lease lock serializes this with // sendToListener so that Dial never sends a token to a superseded queue. func (s *ControllerService) swapListenQueue(leaseName string, newQueue *listenQueue) { - mu := s.getLeaseLock(leaseName) + mu := s.acquireLeaseLock(leaseName) mu.Lock() - defer mu.Unlock() old, loaded := s.listenQueues.Swap(leaseName, newQueue) if loaded { old.(*listenQueue).closeDone() } + mu.Unlock() + s.releaseLeaseLock(leaseName) } // sendToListener delivers a response to the active listener for a lease. The @@ -116,7 +145,8 @@ func (s *ControllerService) swapListenQueue(leaseName string, newQueue *listenQu // between the load and the send, eliminating the TOCTOU race between Dial and // a reconnecting Listen. func (s *ControllerService) sendToListener(_ context.Context, leaseName string, response *pb.ListenResponse) error { - mu := s.getLeaseLock(leaseName) + mu := s.acquireLeaseLock(leaseName) + defer s.releaseLeaseLock(leaseName) mu.Lock() defer mu.Unlock() v, ok := s.listenQueues.Load(leaseName) @@ -498,13 +528,14 @@ func (s *ControllerService) Listen(req *pb.ListenRequest, stream pb.ControllerSe ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } + listenMu := s.acquireLeaseLock(leaseName) s.swapListenQueue(leaseName, wrapper) defer func() { - mu := s.getLeaseLock(leaseName) - mu.Lock() + listenMu.Lock() wrapper.closeDone() - mu.Unlock() + listenMu.Unlock() s.listenQueues.CompareAndDelete(leaseName, wrapper) + s.releaseLeaseLock(leaseName) }() for { select { diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 1f5de57de..7bdf9c201 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -1361,60 +1361,138 @@ func TestListenQueueDialFlowSendsToActiveListener(t *testing.T) { } } -func TestLeaseLockPreservedAfterListenExits(t *testing.T) { +func TestLeaseLockRefCountSingleListener(t *testing.T) { svc := &ControllerService{} - leaseName := "test-lease-lock-cleanup" + leaseName := "test-lease-refcount-single" - wrapper := &listenQueue{ - ch: make(chan *pb.ListenResponse, 8), - done: make(chan struct{}), + svc.acquireLeaseLock(leaseName) + + if _, ok := svc.leaseLocks.Load(leaseName); !ok { + t.Fatal("lease lock should exist after acquire") } - svc.swapListenQueue(leaseName, wrapper) + + svc.releaseLeaseLock(leaseName) + + if _, ok := svc.leaseLocks.Load(leaseName); ok { + t.Fatal("lease lock should be removed when last reference is released") + } +} + +func TestLeaseLockRefCountOverlappingListeners(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-refcount-overlap" + + svc.acquireLeaseLock(leaseName) + svc.acquireLeaseLock(leaseName) if _, ok := svc.leaseLocks.Load(leaseName); !ok { - t.Fatal("lease lock should exist after swapListenQueue") + t.Fatal("lease lock should exist with two references") } - mu := svc.getLeaseLock(leaseName) - mu.Lock() - wrapper.closeDone() - mu.Unlock() - svc.listenQueues.CompareAndDelete(leaseName, wrapper) + svc.releaseLeaseLock(leaseName) if _, ok := svc.leaseLocks.Load(leaseName); !ok { - t.Fatal("lease lock must be preserved after Listen cleanup to prevent mutex pointer races") + t.Fatal("lease lock should still exist with one remaining reference") + } + + svc.releaseLeaseLock(leaseName) + + if _, ok := svc.leaseLocks.Load(leaseName); ok { + t.Fatal("lease lock should be removed when all references are released") + } +} + +func TestLeaseLockRefCountConcurrentAcquireRelease(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-refcount-concurrent" + + var wg sync.WaitGroup + goroutines := 100 + + var counter int + for i := 0; i < goroutines; i++ { + wg.Add(1) + go func() { + defer wg.Done() + lock := svc.acquireLeaseLock(leaseName) + lock.Lock() + counter++ + lock.Unlock() + svc.releaseLeaseLock(leaseName) + }() + } + + wg.Wait() + + if counter != goroutines { + t.Fatalf("expected counter=%d, got %d", goroutines, counter) + } + + if _, ok := svc.leaseLocks.Load(leaseName); ok { + t.Fatal("lease lock should be removed after all goroutines release") + } +} + +func TestLeaseLockRefCountSameInstanceForOverlap(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-refcount-same-instance" + + lock1 := svc.acquireLeaseLock(leaseName) + lock2 := svc.acquireLeaseLock(leaseName) + + if lock1 != lock2 { + t.Fatal("overlapping acquires must return the same mutex") } + + svc.releaseLeaseLock(leaseName) + svc.releaseLeaseLock(leaseName) } func TestLeaseLockPreservedWhenNewListenerTakesOver(t *testing.T) { svc := &ControllerService{} leaseName := "test-lease-lock-preserved" + g1Mu := svc.acquireLeaseLock(leaseName) g1 := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } svc.swapListenQueue(leaseName, g1) + g2Mu := svc.acquireLeaseLock(leaseName) g2 := &listenQueue{ ch: make(chan *pb.ListenResponse, 8), done: make(chan struct{}), } svc.swapListenQueue(leaseName, g2) - mu := svc.getLeaseLock(leaseName) - mu.Lock() + if g1Mu != g2Mu { + t.Fatal("overlapping listeners must share the same mutex") + } + + g1Mu.Lock() g1.closeDone() - mu.Unlock() + g1Mu.Unlock() svc.listenQueues.CompareAndDelete(leaseName, g1) + svc.releaseLeaseLock(leaseName) if _, ok := svc.leaseLocks.Load(leaseName); !ok { - t.Fatal("lease lock should be preserved when a new listener took over") + t.Fatal("lease lock should be preserved when a new listener still holds a reference") } if _, ok := svc.listenQueues.Load(leaseName); !ok { t.Fatal("queue should still exist for the new listener") } + + g2Mu.Lock() + g2.closeDone() + g2Mu.Unlock() + svc.listenQueues.CompareAndDelete(leaseName, g2) + svc.releaseLeaseLock(leaseName) + + if _, ok := svc.leaseLocks.Load(leaseName); ok { + t.Fatal("lease lock should be cleaned up when last listener releases") + } } func TestSendToListenerReturnsResourceExhaustedWhenBufferFull(t *testing.T) { From 6d15cdba800c8e7a08c5c65cc910e0113c2a4002 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 14:41:23 +0200 Subject: [PATCH 21/24] test: fix race in concurrent lease lock test and add overlapping listener test Remove the racy counter increment from TestLeaseLockRefCountConcurrentAcquireRelease since goroutines that acquire-release quickly may get different mutex instances. Add TestLeaseLockRefCountConcurrentOverlappingListeners that uses a barrier to ensure all goroutines hold a reference before using the mutex, matching the real Listen lifecycle pattern. Co-Authored-By: Claude Opus 4.6 --- .../service/controller_service_test.go | 37 +++++++++++++++++-- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index 7bdf9c201..b54f233ae 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -1409,16 +1409,45 @@ func TestLeaseLockRefCountConcurrentAcquireRelease(t *testing.T) { var wg sync.WaitGroup goroutines := 100 + for i := 0; i < goroutines; i++ { + wg.Add(1) + go func() { + defer wg.Done() + svc.acquireLeaseLock(leaseName) + svc.releaseLeaseLock(leaseName) + }() + } + + wg.Wait() + + if _, ok := svc.leaseLocks.Load(leaseName); ok { + t.Fatal("lease lock should be removed after all goroutines release") + } +} + +func TestLeaseLockRefCountConcurrentOverlappingListeners(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-refcount-concurrent-overlap" + goroutines := 50 + var counter int + var wg sync.WaitGroup + allAcquired := sync.WaitGroup{} + allAcquired.Add(goroutines) + for i := 0; i < goroutines; i++ { wg.Add(1) go func() { defer wg.Done() - lock := svc.acquireLeaseLock(leaseName) - lock.Lock() + mu := svc.acquireLeaseLock(leaseName) + defer svc.releaseLeaseLock(leaseName) + + allAcquired.Done() + allAcquired.Wait() + + mu.Lock() counter++ - lock.Unlock() - svc.releaseLeaseLock(leaseName) + mu.Unlock() }() } From 63c96546167714c143b556120d8df81cd8d36138 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 15:25:57 +0200 Subject: [PATCH 22/24] ci: add E2E stress test workflow for flakiness detection Runs the E2E suite 100 times in parallel (max 20 concurrent) with fail-fast enabled. Builds images once, then each matrix entry sets up its own Kind cluster and runs the full test. Triggered manually via workflow_dispatch only. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/e2e-stress.yaml | 141 ++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 .github/workflows/e2e-stress.yaml diff --git a/.github/workflows/e2e-stress.yaml b/.github/workflows/e2e-stress.yaml new file mode 100644 index 000000000..291a57e60 --- /dev/null +++ b/.github/workflows/e2e-stress.yaml @@ -0,0 +1,141 @@ +name: E2E stress test (flakiness detection) + +on: + workflow_dispatch: + +permissions: + contents: read + +env: + CONTAINER_TOOL: docker + +jobs: + build-controller-image: + runs-on: ubuntu-24.04 + timeout-minutes: 30 + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: actions/setup-go@v5 + with: + go-version: "1.22" + + - name: Build controller image + run: | + make -C controller docker-build + docker save quay.io/jumpstarter-dev/jumpstarter-controller:latest -o /tmp/controller-image.tar + + - uses: actions/upload-artifact@v4 + with: + name: controller-image-amd64 + path: /tmp/controller-image.tar + retention-days: 1 + + build-operator-image: + runs-on: ubuntu-24.04 + timeout-minutes: 30 + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: actions/setup-go@v5 + with: + go-version: "1.22" + + - name: Build operator image and installer manifest + run: | + make -C controller/deploy/operator docker-build build-installer VERSION=latest + docker save quay.io/jumpstarter-dev/jumpstarter-operator:latest -o /tmp/operator-image.tar + cp controller/deploy/operator/dist/install.yaml /tmp/operator-install.yaml + + - uses: actions/upload-artifact@v4 + with: + name: operator-image-amd64 + path: | + /tmp/operator-image.tar + /tmp/operator-install.yaml + retention-days: 1 + + build-python-wheels: + runs-on: ubuntu-24.04 + timeout-minutes: 15 + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: astral-sh/setup-uv@v7 + + - name: Build python wheels + working-directory: python + run: uv build --all --out-dir dist + + - uses: actions/upload-artifact@v4 + with: + name: python-wheels + path: python/dist/ + retention-days: 1 + + e2e-stress: + needs: [build-controller-image, build-operator-image, build-python-wheels] + strategy: + fail-fast: true + max-parallel: 20 + matrix: + run: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100] + runs-on: ubuntu-24.04 + timeout-minutes: 60 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + + - name: Install Go + uses: actions/setup-go@v5 + with: + go-version: "1.22" + + - name: Download controller image + uses: actions/download-artifact@v4 + with: + name: controller-image-amd64 + path: /tmp/artifacts + + - name: Download operator image + uses: actions/download-artifact@v4 + with: + name: operator-image-amd64 + path: /tmp/artifacts + + - name: Download python wheels + uses: actions/download-artifact@v4 + with: + name: python-wheels + path: /tmp/python-wheels + + - name: Load container images and operator manifest + run: | + docker load < /tmp/artifacts/controller-image.tar + docker load < /tmp/artifacts/operator-image.tar + mkdir -p controller/deploy/operator/dist + cp /tmp/artifacts/operator-install.yaml controller/deploy/operator/dist/install.yaml + + - name: Setup e2e test environment + run: make e2e-setup + env: + CI: true + METHOD: operator + SKIP_BUILD: "true" + PREBUILT_WHEELS_DIR: /tmp/python-wheels + OPERATOR_IMG: quay.io/jumpstarter-dev/jumpstarter-operator:latest + + - name: Run e2e tests (iteration ${{ matrix.run }}/100) + run: make e2e-run + env: + CI: true + METHOD: operator From 6786ccfd17a9da4434cf5766c0467ca8354c9333 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 17:01:43 +0200 Subject: [PATCH 23/24] Revert "ci: add E2E stress test workflow for flakiness detection" This reverts commit 160c6c1bc36f647524c365fb3741733d1f8be420. --- .github/workflows/e2e-stress.yaml | 141 ------------------------------ 1 file changed, 141 deletions(-) delete mode 100644 .github/workflows/e2e-stress.yaml diff --git a/.github/workflows/e2e-stress.yaml b/.github/workflows/e2e-stress.yaml deleted file mode 100644 index 291a57e60..000000000 --- a/.github/workflows/e2e-stress.yaml +++ /dev/null @@ -1,141 +0,0 @@ -name: E2E stress test (flakiness detection) - -on: - workflow_dispatch: - -permissions: - contents: read - -env: - CONTAINER_TOOL: docker - -jobs: - build-controller-image: - runs-on: ubuntu-24.04 - timeout-minutes: 30 - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - uses: actions/setup-go@v5 - with: - go-version: "1.22" - - - name: Build controller image - run: | - make -C controller docker-build - docker save quay.io/jumpstarter-dev/jumpstarter-controller:latest -o /tmp/controller-image.tar - - - uses: actions/upload-artifact@v4 - with: - name: controller-image-amd64 - path: /tmp/controller-image.tar - retention-days: 1 - - build-operator-image: - runs-on: ubuntu-24.04 - timeout-minutes: 30 - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - uses: actions/setup-go@v5 - with: - go-version: "1.22" - - - name: Build operator image and installer manifest - run: | - make -C controller/deploy/operator docker-build build-installer VERSION=latest - docker save quay.io/jumpstarter-dev/jumpstarter-operator:latest -o /tmp/operator-image.tar - cp controller/deploy/operator/dist/install.yaml /tmp/operator-install.yaml - - - uses: actions/upload-artifact@v4 - with: - name: operator-image-amd64 - path: | - /tmp/operator-image.tar - /tmp/operator-install.yaml - retention-days: 1 - - build-python-wheels: - runs-on: ubuntu-24.04 - timeout-minutes: 15 - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - uses: astral-sh/setup-uv@v7 - - - name: Build python wheels - working-directory: python - run: uv build --all --out-dir dist - - - uses: actions/upload-artifact@v4 - with: - name: python-wheels - path: python/dist/ - retention-days: 1 - - e2e-stress: - needs: [build-controller-image, build-operator-image, build-python-wheels] - strategy: - fail-fast: true - max-parallel: 20 - matrix: - run: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100] - runs-on: ubuntu-24.04 - timeout-minutes: 60 - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install uv - uses: astral-sh/setup-uv@v7 - - - name: Install Go - uses: actions/setup-go@v5 - with: - go-version: "1.22" - - - name: Download controller image - uses: actions/download-artifact@v4 - with: - name: controller-image-amd64 - path: /tmp/artifacts - - - name: Download operator image - uses: actions/download-artifact@v4 - with: - name: operator-image-amd64 - path: /tmp/artifacts - - - name: Download python wheels - uses: actions/download-artifact@v4 - with: - name: python-wheels - path: /tmp/python-wheels - - - name: Load container images and operator manifest - run: | - docker load < /tmp/artifacts/controller-image.tar - docker load < /tmp/artifacts/operator-image.tar - mkdir -p controller/deploy/operator/dist - cp /tmp/artifacts/operator-install.yaml controller/deploy/operator/dist/install.yaml - - - name: Setup e2e test environment - run: make e2e-setup - env: - CI: true - METHOD: operator - SKIP_BUILD: "true" - PREBUILT_WHEELS_DIR: /tmp/python-wheels - OPERATOR_IMG: quay.io/jumpstarter-dev/jumpstarter-operator:latest - - - name: Run e2e tests (iteration ${{ matrix.run }}/100) - run: make e2e-run - env: - CI: true - METHOD: operator From 94d3027d9de7e202a6a5a23fc326aa206ead7309 Mon Sep 17 00:00:00 2001 From: Paul Wallrabe Date: Thu, 16 Apr 2026 20:07:18 +0200 Subject: [PATCH 24/24] fix: drain buffered tokens on Listen supersession and assert gRPC status codes Address CodeRabbit review feedback on PR #573: 1. When wrapper.done fires in the Listen loop, drain wrapper.ch via a non-blocking loop calling stream.Send() for each queued token before returning. This prevents token loss when sendToListener enqueues a token just before swapListenQueue closes the done channel. 2. Update TestListenQueueDialReturnsUnavailableWhenNoListener to call sendToListener and assert status.Code(err) == codes.Unavailable instead of just checking the sync.Map directly. 3. Update TestListenQueueDialReturnsUnavailableWhenDoneClosed to assert status.Code(err) == codes.Unavailable instead of just err != nil. 4. Update TestListenQueueListenLoopDeliversTokensAndExitsOnDone to include drain logic matching the production code pattern. 5. Add TestListenQueueDrainsBufferedTokensOnSupersession and TestListenQueueListenLoopDrainsOnSupersession to verify the drain behavior. Co-Authored-By: Claude Opus 4.6 --- .../internal/service/controller_service.go | 11 +- .../service/controller_service_test.go | 125 +++++++++++++++++- 2 files changed, 131 insertions(+), 5 deletions(-) diff --git a/controller/internal/service/controller_service.go b/controller/internal/service/controller_service.go index 84e303c43..ee437c4e6 100644 --- a/controller/internal/service/controller_service.go +++ b/controller/internal/service/controller_service.go @@ -542,7 +542,16 @@ func (s *ControllerService) Listen(req *pb.ListenRequest, stream pb.ControllerSe case <-ctx.Done(): return nil case <-wrapper.done: - return nil + for { + select { + case msg := <-wrapper.ch: + if err := stream.Send(msg); err != nil { + return err + } + default: + return nil + } + } case msg := <-wrapper.ch: if err := stream.Send(msg); err != nil { return err diff --git a/controller/internal/service/controller_service_test.go b/controller/internal/service/controller_service_test.go index b54f233ae..9e506f543 100644 --- a/controller/internal/service/controller_service_test.go +++ b/controller/internal/service/controller_service_test.go @@ -1033,9 +1033,18 @@ func TestListenQueueDialReturnsUnavailableWhenNoListener(t *testing.T) { svc := &ControllerService{} leaseName := "nonexistent-lease" - _, ok := svc.listenQueues.Load(leaseName) - if ok { - t.Fatal("expected no entry for nonexistent lease") + err := svc.sendToListener(context.Background(), leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep", RouterToken: testRouterToken, + }) + if err == nil { + t.Fatal("sendToListener should return error for nonexistent lease") + } + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %v", err) + } + if st.Code() != codes.Unavailable { + t.Fatalf("expected codes.Unavailable, got %v", st.Code()) } } @@ -1056,6 +1065,13 @@ func TestListenQueueDialReturnsUnavailableWhenDoneClosed(t *testing.T) { if err == nil { t.Fatal("sendToListener should return error for done queue") } + st, ok := status.FromError(err) + if !ok { + t.Fatalf("expected gRPC status error, got %v", err) + } + if st.Code() != codes.Unavailable { + t.Fatalf("expected codes.Unavailable, got %v", st.Code()) + } } func TestListenQueueContextCancellationExitsListenLoop(t *testing.T) { @@ -1230,7 +1246,14 @@ func TestListenQueueListenLoopDeliversTokensAndExitsOnDone(t *testing.T) { for { select { case <-wrapper.done: - return + for { + select { + case msg := <-wrapper.ch: + delivered <- msg + default: + return + } + } case msg := <-wrapper.ch: delivered <- msg } @@ -1661,3 +1684,97 @@ func TestSwapNotBlockedWhenBufferFull(t *testing.T) { t.Fatal("active queue should be g2") } } + +func TestListenQueueDrainsBufferedTokensOnSupersession(t *testing.T) { + svc := &ControllerService{} + leaseName := "test-lease-drain-on-supersession" + + wrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.swapListenQueue(leaseName, wrapper) + + err := svc.sendToListener(context.Background(), leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep1", RouterToken: "tok1", + }) + if err != nil { + t.Fatalf("first sendToListener failed: %v", err) + } + err = svc.sendToListener(context.Background(), leaseName, &pb.ListenResponse{ + RouterEndpoint: "ep2", RouterToken: "tok2", + }) + if err != nil { + t.Fatalf("second sendToListener failed: %v", err) + } + + superseder := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + svc.swapListenQueue(leaseName, superseder) + + select { + case <-wrapper.done: + default: + t.Fatal("wrapper.done should be closed after supersession") + } + + if len(wrapper.ch) != 2 { + t.Fatalf("expected 2 buffered tokens before drain, got %d", len(wrapper.ch)) + } + + drained := drainChannel(wrapper.ch) + if drained != 2 { + t.Fatalf("expected 2 tokens to drain from superseded queue, got %d", drained) + } +} + +func TestListenQueueListenLoopDrainsOnSupersession(t *testing.T) { + wrapper := &listenQueue{ + ch: make(chan *pb.ListenResponse, 8), + done: make(chan struct{}), + } + + wrapper.ch <- &pb.ListenResponse{RouterEndpoint: "ep1", RouterToken: "tok1"} + wrapper.ch <- &pb.ListenResponse{RouterEndpoint: "ep2", RouterToken: "tok2"} + + wrapper.closeDone() + + delivered := make(chan *pb.ListenResponse, 8) + loopExited := make(chan struct{}) + + go func() { + defer close(loopExited) + for { + select { + case <-wrapper.done: + for { + select { + case msg := <-wrapper.ch: + delivered <- msg + default: + return + } + } + case msg := <-wrapper.ch: + delivered <- msg + } + } + }() + + select { + case <-loopExited: + case <-time.After(time.Second): + t.Fatal("listen loop did not exit after done was closed") + } + + close(delivered) + var count int + for range delivered { + count++ + } + if count != 2 { + t.Fatalf("expected 2 drained tokens from listen loop, got %d", count) + } +}