-
Notifications
You must be signed in to change notification settings - Fork 206
[retry] Add centralized retry/backoff abstraction #1007
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
02affd4
3f5e74e
f189bc6
6e8e9e7
702f0a1
08b2ca7
628892a
eba9ca5
1718c39
df31ddc
0deb97d
cfcf5b4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,94 @@ | ||
| package retry | ||
|
|
||
| import ( | ||
| "errors" | ||
|
|
||
| "github.com/cenkalti/backoff/v5" | ||
| meshkiterrors "github.com/meshery/meshkit/errors" | ||
| ) | ||
|
|
||
| // ErrInvalidConfig is returned when retry configuration validation fails. | ||
| // Use errors.Is(err, ErrInvalidConfig) to distinguish config errors from | ||
| // operation failures. | ||
| var ErrInvalidConfig = errors.New("retry: invalid config") | ||
|
|
||
| const ( | ||
| ErrRetryCode = "meshkit-10001" | ||
| ErrContextCode = "meshkit-10002" | ||
| ErrConfigCode = "meshkit-10003" | ||
| ) | ||
|
|
||
| type retryError struct { | ||
| inner error | ||
| meshkit *meshkiterrors.Error | ||
| } | ||
|
|
||
| func (e *retryError) Error() string { | ||
| return e.meshkit.Error() | ||
| } | ||
|
|
||
| func (e *retryError) Unwrap() []error { | ||
| return []error{e.inner, e.meshkit} | ||
| } | ||
|
|
||
| func ErrRetry(err error) error { | ||
| return &retryError{ | ||
| inner: err, | ||
| meshkit: meshkiterrors.New(ErrRetryCode, meshkiterrors.Alert, | ||
| []string{"Retry operation failed"}, | ||
| []string{err.Error()}, | ||
| []string{"Operation did not succeed within retry limits"}, | ||
| []string{"Check the underlying operation and retry configuration"}), | ||
| } | ||
| } | ||
|
|
||
| func ErrContext(err error) error { | ||
| return &retryError{ | ||
| inner: err, | ||
| meshkit: meshkiterrors.New(ErrContextCode, meshkiterrors.Alert, | ||
| []string{"Context canceled or deadline exceeded"}, | ||
| []string{err.Error()}, | ||
| []string{"Operation timed out or context was canceled"}, | ||
| []string{"Check context timeout and ensure the operation completes in time"}), | ||
| } | ||
| } | ||
|
|
||
| func ErrConfig(err error) error { | ||
| return &retryError{ | ||
| inner: err, | ||
| meshkit: meshkiterrors.New(ErrConfigCode, meshkiterrors.Alert, | ||
| []string{"Invalid retry configuration"}, | ||
| []string{err.Error()}, | ||
| []string{"One or more config values are invalid"}, | ||
| []string{"Ensure all retry configuration values are correct"}), | ||
| } | ||
| } | ||
|
|
||
| // ErrorDecision controls retry behaviour for a single error. | ||
| type ErrorDecision int | ||
|
|
||
| const ( | ||
| DecisionRetry ErrorDecision = iota | ||
| DecisionStop | ||
| ) | ||
|
|
||
| // ErrorClassifier returns the retry decision for a given error. | ||
| // Return DecisionStop for errors that should not be retried (e.g. HTTP 4xx, | ||
| // validation failures, auth errors). Return DecisionRetry for transient | ||
| // errors (timeouts, 5xx, rate limits). | ||
| // | ||
| // Ignored when the operation explicitly returns Permanent(err). | ||
| type ErrorClassifier func(err error) ErrorDecision | ||
|
|
||
| // Permanent wraps err to signal no further retries should be attempted. | ||
| // Use for non-transient errors (HTTP 4xx, auth failures, validation errors). | ||
| // Do NOT use for context-cancellation; return ctx.Err() directly. | ||
| func Permanent(err error) error { | ||
| return backoff.Permanent(err) | ||
| } | ||
|
|
||
| // IsPermanent reports whether err is (or wraps) a PermanentError. | ||
| func IsPermanent(err error) bool { | ||
| var pErr *backoff.PermanentError | ||
| return errors.As(err, &pErr) | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| package retry | ||
|
Pranav-d33 marked this conversation as resolved.
|
||
|
|
||
| import ( | ||
| "time" | ||
|
|
||
| "github.com/meshery/meshkit/logger" | ||
| ) | ||
|
|
||
| const ( | ||
| DefaultInitialInterval = 500 * time.Millisecond | ||
| DefaultMaxInterval = 30 * time.Second | ||
| DefaultMaxElapsedTime = 2 * time.Minute | ||
| DefaultMultiplier = 1.5 | ||
| DefaultRandomizationFactor = 0.3 // Never set to 0 in production | ||
| ) | ||
|
|
||
| type Config struct { | ||
| MaxAttempts uint | ||
| InitialInterval time.Duration | ||
| MaxInterval time.Duration | ||
| MaxElapsedTime time.Duration | ||
| Multiplier float64 | ||
| RandomizationFactor float64 | ||
| Notifier func(err error, wait time.Duration) | ||
| ErrorClassifier ErrorClassifier | ||
| } | ||
|
|
||
| func defaultConfig() Config { | ||
| return Config{ | ||
| InitialInterval: DefaultInitialInterval, | ||
| MaxInterval: DefaultMaxInterval, | ||
| MaxElapsedTime: DefaultMaxElapsedTime, | ||
| Multiplier: DefaultMultiplier, | ||
| RandomizationFactor: DefaultRandomizationFactor, | ||
| } | ||
| } | ||
|
|
||
| type Option func(*Config) | ||
|
|
||
| // WithMaxAttempts sets a hard cap on total calls (includes first attempt). | ||
| func WithMaxAttempts(n uint) Option { | ||
| return func(c *Config) { c.MaxAttempts = n } | ||
| } | ||
|
|
||
| func WithInitialInterval(d time.Duration) Option { | ||
| return func(c *Config) { c.InitialInterval = d } | ||
| } | ||
|
|
||
| func WithMaxInterval(d time.Duration) Option { | ||
| return func(c *Config) { c.MaxInterval = d } | ||
| } | ||
|
|
||
| // WithMaxElapsedTime sets wall-clock deadline. Pass 0 to disable. | ||
| func WithMaxElapsedTime(d time.Duration) Option { | ||
| return func(c *Config) { c.MaxElapsedTime = d } | ||
| } | ||
|
|
||
| func WithMultiplier(m float64) Option { | ||
| return func(c *Config) { c.Multiplier = m } | ||
| } | ||
|
|
||
| // WithJitter overrides randomization factor (range: 0.0-1.0). Do not set to 0.0 in production. | ||
| func WithJitter(f float64) Option { | ||
| return func(c *Config) { c.RandomizationFactor = f } | ||
| } | ||
|
|
||
| // WithErrorClassifier provides a decision function for classifying errors as | ||
| // retryable (DecisionRetry) or terminal (DecisionStop). When set, every error | ||
| // returned by the operation (except those explicitly wrapped with Permanent) | ||
| // is passed to this function. If it returns DecisionStop, the error is treated | ||
| // as permanent and the retry loop stops immediately. | ||
| // | ||
| // Example: | ||
| // | ||
| // retry.Do(ctx, op, | ||
| // retry.WithErrorClassifier(func(err error) retry.ErrorDecision { | ||
| // var status *myHTTPError | ||
| // if errors.As(err, &status) { | ||
| // if status.Code >= 500 { | ||
| // return retry.DecisionRetry | ||
| // } | ||
| // return retry.DecisionStop | ||
| // } | ||
| // return retry.DecisionRetry | ||
| // }), | ||
| // ) | ||
| func WithErrorClassifier(classifier ErrorClassifier) Option { | ||
| return func(c *Config) { c.ErrorClassifier = classifier } | ||
| } | ||
|
Pranav-d33 marked this conversation as resolved.
|
||
|
|
||
| func WithNotifier(n func(err error, wait time.Duration)) Option { | ||
| return func(c *Config) { c.Notifier = n } | ||
| } | ||
|
|
||
| // WithLogNotifier emits a Warn log entry on each retry via MeshKit's logger.Handler. | ||
| func WithLogNotifier(log logger.Handler) Option { | ||
| return WithNotifier(func(err error, wait time.Duration) { | ||
| log.Infof("retry: transient error; retrying in %s", wait.Round(time.Millisecond)) | ||
| log.Warn(err) | ||
| }) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| package retry | ||
|
Pranav-d33 marked this conversation as resolved.
|
||
|
|
||
| import ( | ||
| "context" | ||
| "errors" | ||
| "fmt" | ||
| "math" | ||
|
|
||
| "github.com/cenkalti/backoff/v5" | ||
| ) | ||
|
|
||
| type Operation func(ctx context.Context) error | ||
|
|
||
| // Do executes op with exponential backoff until success, permanent error, | ||
| // context cancellation, or budget exhaustion. Config via opts (default: | ||
| // 500ms initial, 1.5x growth, 30% jitter, 2min cap). | ||
| // | ||
| // When a ErrorClassifier is configured via WithErrorClassifier, every non-nil | ||
| // error from op (except those explicitly wrapped with Permanent) is passed to | ||
| // the classifier before the retry decision is made. | ||
| func Do(ctx context.Context, op Operation, opts ...Option) error { | ||
|
Pranav-d33 marked this conversation as resolved.
|
||
| if err := ctx.Err(); err != nil { | ||
| return ErrContext(err) | ||
| } | ||
| cfg := defaultConfig() | ||
|
Pranav-d33 marked this conversation as resolved.
|
||
| for _, o := range opts { | ||
| o(&cfg) | ||
| } | ||
|
|
||
| if err := validateConfig(cfg); err != nil { | ||
| return ErrConfig(err) | ||
| } | ||
|
|
||
| apply := op | ||
| if cfg.ErrorClassifier != nil { | ||
| apply = func(ctx context.Context) error { | ||
| err := op(ctx) | ||
| if err == nil { | ||
| return nil | ||
| } | ||
| var pErr *backoff.PermanentError | ||
| if errors.As(err, &pErr) { | ||
| return err | ||
| } | ||
| if cfg.ErrorClassifier(err) == DecisionStop { | ||
| return backoff.Permanent(err) | ||
| } | ||
| return err | ||
| } | ||
| } | ||
|
|
||
| retryOpts := []backoff.RetryOption{ | ||
| backoff.WithBackOff(buildBackOff(cfg)), | ||
| backoff.WithMaxElapsedTime(cfg.MaxElapsedTime), | ||
| backoff.WithNotify(cfg.Notifier), | ||
| } | ||
| if cfg.MaxAttempts > 0 { | ||
| retryOpts = append(retryOpts, backoff.WithMaxTries(cfg.MaxAttempts)) | ||
| } | ||
|
|
||
| _, err := backoff.Retry(ctx, func() (struct{}, error) { | ||
| return struct{}{}, apply(ctx) | ||
| }, retryOpts...) | ||
| if err != nil { | ||
| return ErrRetry(err) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func validateConfig(cfg Config) error { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The config validation is a good addition, but the returned errors are currently string-only. Right now callers can only distinguish invalid retry configuration from an operation failure by inspecting the error message. That is brittle. Since I’d suggest exposing a sentinel error, for example: var ErrInvalidConfig = errors.New("retry: invalid config")Then wrap it from validateConfig: return fmt.Errorf("%w: InitialInterval must be > 0, got %v", ErrInvalidConfig, cfg.InitialInterval)This lets consumers write: if errors.Is(err, retry.ErrInvalidConfig) {
// fail startup / reject bad config
} |
||
| if cfg.InitialInterval <= 0 { | ||
| return fmt.Errorf("%w: InitialInterval must be > 0, got %v", ErrInvalidConfig, cfg.InitialInterval) | ||
| } | ||
| if cfg.MaxInterval <= 0 { | ||
| return fmt.Errorf("%w: MaxInterval must be > 0, got %v", ErrInvalidConfig, cfg.MaxInterval) | ||
| } | ||
| if cfg.MaxInterval < cfg.InitialInterval { | ||
| return fmt.Errorf("%w: MaxInterval (%v) must be >= InitialInterval (%v)", ErrInvalidConfig, cfg.MaxInterval, cfg.InitialInterval) | ||
| } | ||
| if cfg.MaxElapsedTime < 0 { | ||
| return fmt.Errorf("%w: MaxElapsedTime must be >= 0, got %v", ErrInvalidConfig, cfg.MaxElapsedTime) | ||
| } | ||
| if math.IsNaN(cfg.Multiplier) || math.IsInf(cfg.Multiplier, 0) || cfg.Multiplier < 1 { | ||
| return fmt.Errorf("%w: Multiplier must be finite and >= 1, got %v", ErrInvalidConfig, cfg.Multiplier) | ||
| } | ||
| if math.IsNaN(cfg.RandomizationFactor) || cfg.RandomizationFactor < 0 || cfg.RandomizationFactor > 1 { | ||
| return fmt.Errorf("%w: RandomizationFactor must be finite and in [0,1], got %v", ErrInvalidConfig, cfg.RandomizationFactor) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| // buildBackOff constructs a backoff policy from Config. | ||
| func buildBackOff(cfg Config) backoff.BackOff { | ||
| b := backoff.NewExponentialBackOff() | ||
| b.InitialInterval = cfg.InitialInterval | ||
| b.MaxInterval = cfg.MaxInterval | ||
| b.Multiplier = cfg.Multiplier | ||
| b.RandomizationFactor = cfg.RandomizationFactor | ||
|
|
||
| return b | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.