diff --git a/packages/api/internal/orchestrator/delete_instance.go b/packages/api/internal/orchestrator/delete_instance.go index 5d1b979bcd..1528c386e7 100644 --- a/packages/api/internal/orchestrator/delete_instance.go +++ b/packages/api/internal/orchestrator/delete_instance.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "time" "github.com/google/uuid" "go.uber.org/zap" @@ -80,11 +81,17 @@ func (o *Orchestrator) RemoveSandbox(ctx context.Context, teamID uuid.UUID, sand if alreadyDone { logger.L().Info(ctx, "Sandbox was already in the process of being removed", logger.WithSandboxID(sandboxID), zap.String("state", string(sbx.State))) + if time.Since(sbx.EndTime) > sandbox.StaleCutoff && opts.Action.Effect == sandbox.TransitionExpires { + o.sandboxStore.Remove(context.WithoutCancel(ctx), teamID, sandboxID) + go o.analyticsRemove(context.WithoutCancel(ctx), sbx, opts.Action) + } + return nil } defer func() { go o.analyticsRemove(context.WithoutCancel(ctx), sbx, opts.Action) }() - defer o.sandboxStore.Remove(ctx, teamID, sandboxID) + // Once we start the removal process, we want to make sure it gets removed from the store + defer o.sandboxStore.Remove(context.WithoutCancel(ctx), teamID, sandboxID) err = o.removeSandboxFromNode(ctx, sbx, opts.Action) if err != nil { logger.L().Error(ctx, "Error pausing sandbox", zap.Error(err), logger.WithSandboxID(sbx.SandboxID)) diff --git a/packages/api/internal/sandbox/states.go b/packages/api/internal/sandbox/states.go index 0a59155310..aa267ed9db 100644 --- a/packages/api/internal/sandbox/states.go +++ b/packages/api/internal/sandbox/states.go @@ -4,6 +4,10 @@ import ( "time" ) +// StaleCutoff is how long a team entry must be expired before it can be pruned +// This prevents races where TTL is extended, but the sandbox would still be removed +const StaleCutoff = 10 * time.Minute + // TransitionEffect describes what happens to the sandbox when a state action completes. type TransitionEffect int diff --git a/packages/api/internal/sandbox/storage/redis/items.go b/packages/api/internal/sandbox/storage/redis/items.go index 77c8b860d8..6539aa8aa9 100644 --- a/packages/api/internal/sandbox/storage/redis/items.go +++ b/packages/api/internal/sandbox/storage/redis/items.go @@ -114,7 +114,8 @@ func (s *Storage) ExpiredItems(ctx context.Context) ([]sandbox.Sandbox, error) { // Only evict running sandboxes if sbx.State != sandbox.StateRunning { - if time.Since(sbx.EndTime) <= staleCutoff { + // If the sandbox is in transitioning state for more than stale cutoff, it's likely failed removal. Let it be cleaned up by the regular expiration process. + if time.Since(sbx.EndTime) <= sandbox.StaleCutoff { // Let the current removal finish continue diff --git a/packages/api/internal/sandbox/storage/redis/operations.go b/packages/api/internal/sandbox/storage/redis/operations.go index f67db89ee0..87ae8ae790 100644 --- a/packages/api/internal/sandbox/storage/redis/operations.go +++ b/packages/api/internal/sandbox/storage/redis/operations.go @@ -224,11 +224,6 @@ func (s *Storage) Update(ctx context.Context, teamID uuid.UUID, sandboxID string return updatedSbx, nil } -// staleCutoff is how long a team entry must be idle (no Add calls) before it -// can be pruned from the global teams ZSET when its sandbox count is zero. -// This prevents races where a Remove sees SCARD==0 right before an Add. -const staleCutoff = time.Hour - func (s *Storage) TeamsWithSandboxCount(ctx context.Context) (map[uuid.UUID]int64, error) { members, err := s.redisClient.ZRangeWithScores(ctx, globalTeamsSet, 0, -1).Result() if err != nil { @@ -269,7 +264,7 @@ func (s *Storage) TeamsWithSandboxCount(ctx context.Context) (map[uuid.UUID]int6 } now := time.Now().Unix() - cutoff := now - int64(staleCutoff.Seconds()) + cutoff := now - int64(sandbox.StaleCutoff.Seconds()) teams := make(map[uuid.UUID]int64, len(entries)) var stale []any