diff --git a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go index 51bcc4f17d..c69b65781b 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go +++ b/packages/orchestrator/pkg/sandbox/uffd/userfaultfd/userfaultfd.go @@ -4,8 +4,10 @@ import ( "context" "errors" "fmt" + "math/rand" "sync" "syscall" + "time" "unsafe" "go.opentelemetry.io/otel" @@ -25,6 +27,17 @@ var tracer = otel.Tracer("github.com/e2b-dev/infra/packages/orchestrator/pkg/san const maxRequestsInProgress = 4096 +const ( + // sliceMaxRetries is the number of times to retry source.Slice() after the initial attempt. + // Total attempts = sliceMaxRetries + 1. + sliceMaxRetries = 3 + // sliceRetryBaseDelay is the initial backoff delay before the first retry. + // Subsequent retries double the delay (exponential backoff), capped at sliceRetryMaxDelay. + sliceRetryBaseDelay = 50 * time.Millisecond + // sliceRetryMaxDelay is the maximum backoff delay between retries. + sliceRetryMaxDelay = 500 * time.Millisecond +) + var ErrUnexpectedEventType = errors.New("unexpected event type") // hasEvent checks if a specific poll event flag is set in revents. @@ -331,7 +344,43 @@ func (u *Userfaultfd) faultPage( } }() - b, dataErr := source.Slice(ctx, offset, int64(pagesize)) + var b []byte + var dataErr error + var attempt int + +retryLoop: + for attempt = range sliceMaxRetries + 1 { + b, dataErr = source.Slice(ctx, offset, int64(pagesize)) + if dataErr == nil { + break + } + + if attempt >= sliceMaxRetries || ctx.Err() != nil { + break + } + + u.logger.Warn(ctx, "UFFD serve slice error, retrying", + zap.Int("attempt", attempt+1), + zap.Int("max_attempts", sliceMaxRetries+1), + zap.Error(dataErr), + ) + + delay := min(sliceRetryBaseDelay<