From 893cfac31f6cdc8063ad6af2879462313b4603b3 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Tue, 14 Apr 2026 09:43:34 +0000 Subject: [PATCH 1/4] fix(uffd): add retry with exponential backoff on source.Slice() errors in faultPage Transient Slice errors (network blips, temporary GCS/S3 failures) previously caused immediate sandbox termination. Retry up to 3 times with exponential backoff (50ms-500ms + jitter) before signaling uffd exit, giving transient errors a chance to recover. --- .../sandbox/uffd/userfaultfd/userfaultfd.go | 54 +++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go index 51bcc4f17d..ca049bd1dc 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go @@ -4,8 +4,10 @@ import ( "context" "errors" "fmt" + "math/rand" "sync" "syscall" + "time" "unsafe" "go.opentelemetry.io/otel" @@ -25,6 +27,17 @@ var tracer = otel.Tracer("github.com/e2b-dev/infra/packages/orchestrator/pkg/san const maxRequestsInProgress = 4096 +const ( + // sliceMaxAttempts is the number of times to retry source.Slice() after the initial attempt. + // Total attempts = sliceMaxAttempts + 1. + sliceMaxAttempts = 3 + // sliceRetryBaseDelay is the initial backoff delay before the first retry. + // Subsequent retries double the delay (exponential backoff), capped at sliceRetryMaxDelay. + sliceRetryBaseDelay = 50 * time.Millisecond + // sliceRetryMaxDelay is the maximum backoff delay between retries. + sliceRetryMaxDelay = 500 * time.Millisecond +) + var ErrUnexpectedEventType = errors.New("unexpected event type") // hasEvent checks if a specific poll event flag is set in revents. @@ -331,7 +344,39 @@ func (u *Userfaultfd) faultPage( } }() - b, dataErr := source.Slice(ctx, offset, int64(pagesize)) + var b []byte + var dataErr error + var attempt int + +retryLoop: + for attempt = range sliceMaxAttempts { + b, dataErr = source.Slice(ctx, offset, int64(pagesize)) + if dataErr == nil { + break + } + + if attempt >= sliceMaxAttempts-1 { + break + } + + u.logger.Warn(ctx, "UFFD serve slice error, retrying", + zap.Int("attempt", attempt+1), + zap.Int("max_attempts", sliceMaxAttempts), + zap.Error(dataErr), + ) + + delay := min(sliceRetryBaseDelay< Date: Tue, 14 Apr 2026 09:49:11 +0000 Subject: [PATCH 2/4] refactor(uffd): rename sliceMaxAttempts to sliceMaxRetries for clarity The constant represents the number of retries, not total attempts. Adjusted the loop range and log fields to be consistent with the new naming. --- .../pkg/sandbox/uffd/userfaultfd/userfaultfd.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go index ca049bd1dc..b2406d4c3e 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go @@ -28,9 +28,9 @@ var tracer = otel.Tracer("github.com/e2b-dev/infra/packages/orchestrator/pkg/san const maxRequestsInProgress = 4096 const ( - // sliceMaxAttempts is the number of times to retry source.Slice() after the initial attempt. - // Total attempts = sliceMaxAttempts + 1. - sliceMaxAttempts = 3 + // sliceMaxRetries is the number of times to retry source.Slice() after the initial attempt. + // Total attempts = sliceMaxRetries + 1. + sliceMaxRetries = 3 // sliceRetryBaseDelay is the initial backoff delay before the first retry. // Subsequent retries double the delay (exponential backoff), capped at sliceRetryMaxDelay. sliceRetryBaseDelay = 50 * time.Millisecond @@ -349,19 +349,19 @@ func (u *Userfaultfd) faultPage( var attempt int retryLoop: - for attempt = range sliceMaxAttempts { + for attempt = range sliceMaxRetries + 1 { b, dataErr = source.Slice(ctx, offset, int64(pagesize)) if dataErr == nil { break } - if attempt >= sliceMaxAttempts-1 { + if attempt >= sliceMaxRetries { break } u.logger.Warn(ctx, "UFFD serve slice error, retrying", zap.Int("attempt", attempt+1), - zap.Int("max_attempts", sliceMaxAttempts), + zap.Int("max_attempts", sliceMaxRetries+1), zap.Error(dataErr), ) @@ -387,7 +387,7 @@ retryLoop: span.RecordError(joinedErr) u.logger.Error(ctx, "UFFD serve data fetch error after retries", - zap.Int("attempts", sliceMaxAttempts), + zap.Int("attempts", attempt+1), zap.Error(joinedErr), ) From a7ed495819883076fee6077ad354d06d80796975 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Tue, 14 Apr 2026 10:47:18 +0000 Subject: [PATCH 3/4] fix(uffd): skip retry logging when context is already cancelled When the context is cancelled (e.g. during shutdown), source.Slice() fails immediately with a context error. Without this check, up to 4096 concurrent fault handlers would each log a misleading 'retrying' warning before the select detects ctx.Done(). Break the retry loop early when ctx.Err() is non-nil to avoid the log noise. --- .../orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go index b2406d4c3e..60e970b184 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go @@ -355,7 +355,7 @@ retryLoop: break } - if attempt >= sliceMaxRetries { + if attempt >= sliceMaxRetries || ctx.Err() != nil { break } From 5fa2cb0b99360ab373d2775322970def4d19dc41 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Tue, 14 Apr 2026 10:49:31 +0000 Subject: [PATCH 4/4] fix(uffd): stop backoff timer explicitly on context cancellation time.After creates a timer that lives until expiry even if the select takes the ctx.Done path. Under high failure load with 4096 concurrent fault handlers each retrying 3 times, this could produce many abandoned timers. Use time.NewTimer and call Stop() on context cancellation to release the timer immediately. --- .../pkg/sandbox/uffd/userfaultfd/userfaultfd.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go index 60e970b184..c69b65781b 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go @@ -368,12 +368,16 @@ retryLoop: delay := min(sliceRetryBaseDelay<