From f780bbb47738bdce16330620a57d09587b3be2f8 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Tue, 14 Apr 2026 09:25:29 +0000 Subject: [PATCH 1/4] refactor: extract StaleCutoff as shared constant --- packages/api/internal/sandbox/states.go | 4 ++++ packages/api/internal/sandbox/storage/redis/items.go | 3 ++- packages/api/internal/sandbox/storage/redis/operations.go | 7 +------ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/packages/api/internal/sandbox/states.go b/packages/api/internal/sandbox/states.go index 0a59155310..aa267ed9db 100644 --- a/packages/api/internal/sandbox/states.go +++ b/packages/api/internal/sandbox/states.go @@ -4,6 +4,10 @@ import ( "time" ) +// StaleCutoff is how long a team entry must be expired before it can be pruned +// This prevents races where TTL is extended, but the sandbox would still be removed +const StaleCutoff = 10 * time.Minute + // TransitionEffect describes what happens to the sandbox when a state action completes. type TransitionEffect int diff --git a/packages/api/internal/sandbox/storage/redis/items.go b/packages/api/internal/sandbox/storage/redis/items.go index 77c8b860d8..6539aa8aa9 100644 --- a/packages/api/internal/sandbox/storage/redis/items.go +++ b/packages/api/internal/sandbox/storage/redis/items.go @@ -114,7 +114,8 @@ func (s *Storage) ExpiredItems(ctx context.Context) ([]sandbox.Sandbox, error) { // Only evict running sandboxes if sbx.State != sandbox.StateRunning { - if time.Since(sbx.EndTime) <= staleCutoff { + // If the sandbox is in transitioning state for more than stale cutoff, it's likely failed removal. Let it be cleaned up by the regular expiration process. + if time.Since(sbx.EndTime) <= sandbox.StaleCutoff { // Let the current removal finish continue diff --git a/packages/api/internal/sandbox/storage/redis/operations.go b/packages/api/internal/sandbox/storage/redis/operations.go index f67db89ee0..87ae8ae790 100644 --- a/packages/api/internal/sandbox/storage/redis/operations.go +++ b/packages/api/internal/sandbox/storage/redis/operations.go @@ -224,11 +224,6 @@ func (s *Storage) Update(ctx context.Context, teamID uuid.UUID, sandboxID string return updatedSbx, nil } -// staleCutoff is how long a team entry must be idle (no Add calls) before it -// can be pruned from the global teams ZSET when its sandbox count is zero. -// This prevents races where a Remove sees SCARD==0 right before an Add. -const staleCutoff = time.Hour - func (s *Storage) TeamsWithSandboxCount(ctx context.Context) (map[uuid.UUID]int64, error) { members, err := s.redisClient.ZRangeWithScores(ctx, globalTeamsSet, 0, -1).Result() if err != nil { @@ -269,7 +264,7 @@ func (s *Storage) TeamsWithSandboxCount(ctx context.Context) (map[uuid.UUID]int6 } now := time.Now().Unix() - cutoff := now - int64(staleCutoff.Seconds()) + cutoff := now - int64(sandbox.StaleCutoff.Seconds()) teams := make(map[uuid.UUID]int64, len(entries)) var stale []any From 35c3ed4f2294df50b14abcea5f0a97268aa53077 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Tue, 14 Apr 2026 09:25:52 +0000 Subject: [PATCH 2/4] fix: use context.WithoutCancel for sandbox store removal to prevent cleanup failure on client disconnect --- packages/api/internal/orchestrator/delete_instance.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/api/internal/orchestrator/delete_instance.go b/packages/api/internal/orchestrator/delete_instance.go index 5d1b979bcd..8adb7889f4 100644 --- a/packages/api/internal/orchestrator/delete_instance.go +++ b/packages/api/internal/orchestrator/delete_instance.go @@ -84,7 +84,8 @@ func (o *Orchestrator) RemoveSandbox(ctx context.Context, teamID uuid.UUID, sand } defer func() { go o.analyticsRemove(context.WithoutCancel(ctx), sbx, opts.Action) }() - defer o.sandboxStore.Remove(ctx, teamID, sandboxID) + // Once we start the removal process, we want to make sure it gets removed from the store + defer o.sandboxStore.Remove(context.WithoutCancel(ctx), teamID, sandboxID) err = o.removeSandboxFromNode(ctx, sbx, opts.Action) if err != nil { logger.L().Error(ctx, "Error pausing sandbox", zap.Error(err), logger.WithSandboxID(sbx.SandboxID)) From 1032a5ebf0b8682ee7eff683dc2bba3f959b9013 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Tue, 14 Apr 2026 09:26:05 +0000 Subject: [PATCH 3/4] fix: clean up zombie sandboxes stuck in terminal state after StaleCutoff --- packages/api/internal/orchestrator/delete_instance.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/packages/api/internal/orchestrator/delete_instance.go b/packages/api/internal/orchestrator/delete_instance.go index 8adb7889f4..89e6dd3458 100644 --- a/packages/api/internal/orchestrator/delete_instance.go +++ b/packages/api/internal/orchestrator/delete_instance.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "time" "github.com/google/uuid" "go.uber.org/zap" @@ -80,6 +81,10 @@ func (o *Orchestrator) RemoveSandbox(ctx context.Context, teamID uuid.UUID, sand if alreadyDone { logger.L().Info(ctx, "Sandbox was already in the process of being removed", logger.WithSandboxID(sandboxID), zap.String("state", string(sbx.State))) + if time.Since(sbx.EndTime) > sandbox.StaleCutoff && opts.Action.Effect == sandbox.TransitionExpires { + o.sandboxStore.Remove(context.WithoutCancel(ctx), teamID, sandboxID) + } + return nil } From 55150caed59b4f54799d96c00a41af12257c0c33 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Tue, 14 Apr 2026 12:26:30 +0000 Subject: [PATCH 4/4] fix: add missing analytics action --- packages/api/internal/orchestrator/delete_instance.go | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/api/internal/orchestrator/delete_instance.go b/packages/api/internal/orchestrator/delete_instance.go index 89e6dd3458..1528c386e7 100644 --- a/packages/api/internal/orchestrator/delete_instance.go +++ b/packages/api/internal/orchestrator/delete_instance.go @@ -83,6 +83,7 @@ func (o *Orchestrator) RemoveSandbox(ctx context.Context, teamID uuid.UUID, sand if time.Since(sbx.EndTime) > sandbox.StaleCutoff && opts.Action.Effect == sandbox.TransitionExpires { o.sandboxStore.Remove(context.WithoutCancel(ctx), teamID, sandboxID) + go o.analyticsRemove(context.WithoutCancel(ctx), sbx, opts.Action) } return nil