From 2361fc8a0cc52e74eac807959599983f1cd7625f Mon Sep 17 00:00:00 2001 From: dreamgeneX Date: Mon, 29 Jun 2026 14:15:58 -0700 Subject: [PATCH] Add retry with exponential backoff for transient external service failures Wrap CdnService.invalidate, EmailService, and PaymentProviderService with a shared RetryPolicy utility (3 retries, 1s base delay, 2x multiplier, 30s max, full jitter). Transient 5xx/network errors are retried; 4xx client errors propagate immediately. Tracks retries via the external_call_retry_total{service,attempt} Prometheus counter. Closes #886 --- src/cdn/cdn.service.spec.ts | 47 ++++++++ src/cdn/cdn.service.ts | 27 +++-- src/common/utils/retry-policy.spec.ts | 89 ++++++++++++++ src/common/utils/retry-policy.ts | 111 ++++++++++++++++++ src/notifications/email/email.service.spec.ts | 46 ++++++++ src/notifications/email/email.service.ts | 33 ++++++ .../payment-provider.service.spec.ts | 59 ++++++++++ .../providers/payment-provider.service.ts | 40 +++++++ 8 files changed, 445 insertions(+), 7 deletions(-) create mode 100644 src/cdn/cdn.service.spec.ts create mode 100644 src/common/utils/retry-policy.spec.ts create mode 100644 src/common/utils/retry-policy.ts create mode 100644 src/notifications/email/email.service.spec.ts create mode 100644 src/notifications/email/email.service.ts create mode 100644 src/payments/providers/payment-provider.service.spec.ts create mode 100644 src/payments/providers/payment-provider.service.ts diff --git a/src/cdn/cdn.service.spec.ts b/src/cdn/cdn.service.spec.ts new file mode 100644 index 00000000..7886c1e1 --- /dev/null +++ b/src/cdn/cdn.service.spec.ts @@ -0,0 +1,47 @@ +import { CdnService } from './cdn.service'; +import { externalCallRetryCounter } from '../common/utils/retry-policy'; + +describe('CdnService invalidate retry behaviour', () => { + const originalEnv = { ...process.env }; + + beforeEach(() => { + process.env.CDN_ENABLED = 'true'; + process.env.CLOUDFRONT_DISTRIBUTION_ID = 'dist-123'; + externalCallRetryCounter.reset(); + }); + + afterEach(() => { + process.env = { ...originalEnv }; + }); + + it( + 'retries transparently on a single transient failure and still succeeds', + async () => { + const service = new CdnService(); + const transientError = { status: 503 }; + jest + .spyOn(service as any, 'callInvalidationProvider') + .mockRejectedValueOnce(transientError) + .mockResolvedValue(undefined); + + const result = await service.invalidate(['/index.html']); + + expect(result.success).toBe(true); + expect((service as any).callInvalidationProvider).toHaveBeenCalledTimes(2); + }, + 10000, + ); + + it( + 'propagates the error to the caller after 3 consecutive failures', + async () => { + const service = new CdnService(); + const transientError = { status: 503 }; + jest.spyOn(service as any, 'callInvalidationProvider').mockRejectedValue(transientError); + + await expect(service.invalidate(['/index.html'])).rejects.toEqual(transientError); + expect((service as any).callInvalidationProvider).toHaveBeenCalledTimes(4); // initial + 3 retries + }, + 15000, + ); +}); diff --git a/src/cdn/cdn.service.ts b/src/cdn/cdn.service.ts index 2cd5d400..ceff694f 100644 --- a/src/cdn/cdn.service.ts +++ b/src/cdn/cdn.service.ts @@ -1,5 +1,6 @@ import { Injectable, Logger } from '@nestjs/common'; import { resolveCdnConfig, resolveCacheHeaderConfig } from './cdn.config'; +import { RetryPolicy } from '../common/utils/retry-policy'; export interface CacheHeaders { 'Cache-Control': string; @@ -17,6 +18,7 @@ export class CdnService { private readonly logger = new Logger(CdnService.name); private readonly cdn = resolveCdnConfig(); private readonly cacheHeaders = resolveCacheHeaderConfig(); + private readonly retryPolicy = new RetryPolicy({ service: 'cdn' }); /** * Returns optimised Cache-Control headers for a given asset path. @@ -56,13 +58,7 @@ export class CdnService { `Invalidating ${paths.length} path(s) on distribution ${this.cdn.distributionId}: ${paths.join(', ')}`, ); - // Placeholder: wire up AWS SDK CloudFront.createInvalidation here when credentials are available. - // Example: - // const cf = new CloudFrontClient({}); - // await cf.send(new CreateInvalidationCommand({ - // DistributionId: this.cdn.distributionId, - // InvalidationBatch: { Paths: { Quantity: paths.length, Items: paths }, CallerReference: Date.now().toString() }, - // })); + await this.retryPolicy.execute(() => this.callInvalidationProvider(paths)); return { success: true, @@ -71,6 +67,23 @@ export class CdnService { }; } + /** + * Calls the CDN provider's invalidation API. Wrapped by RetryPolicy in + * `invalidate()` since this network call can fail transiently (5xx, + * connection resets). + * + * Placeholder: wire up AWS SDK CloudFront.createInvalidation here when + * credentials are available, e.g.: + * const cf = new CloudFrontClient({}); + * await cf.send(new CreateInvalidationCommand({ + * DistributionId: this.cdn.distributionId, + * InvalidationBatch: { Paths: { Quantity: paths.length, Items: paths }, CallerReference: Date.now().toString() }, + * })); + */ + protected async callInvalidationProvider(_paths: string[]): Promise { + // No-op until the CloudFront SDK call is wired up. + } + /** Returns the CDN URL for a given asset path. */ getAssetUrl(assetPath: string): string { if (!this.cdn.enabled || !this.cdn.domain) return assetPath; diff --git a/src/common/utils/retry-policy.spec.ts b/src/common/utils/retry-policy.spec.ts new file mode 100644 index 00000000..060d9043 --- /dev/null +++ b/src/common/utils/retry-policy.spec.ts @@ -0,0 +1,89 @@ +import { RetryPolicy, isTransientError, externalCallRetryCounter } from './retry-policy'; + +describe('RetryPolicy', () => { + beforeEach(() => { + externalCallRetryCounter.reset(); + }); + + it('returns the result immediately on success without retrying', async () => { + const policy = new RetryPolicy({ service: 'test', baseDelayMs: 1, maxDelayMs: 1 }); + const fn = jest.fn().mockResolvedValue('ok'); + + const result = await policy.execute(fn); + + expect(result).toBe('ok'); + expect(fn).toHaveBeenCalledTimes(1); + }); + + it('retries on a transient failure and eventually succeeds', async () => { + const policy = new RetryPolicy({ service: 'email', baseDelayMs: 1, maxDelayMs: 1 }); + const transientError = { status: 503 }; + const fn = jest + .fn() + .mockRejectedValueOnce(transientError) + .mockRejectedValueOnce(transientError) + .mockResolvedValue('ok'); + + const result = await policy.execute(fn); + + expect(result).toBe('ok'); + expect(fn).toHaveBeenCalledTimes(3); + }); + + it('propagates the error after exhausting retries', async () => { + const policy = new RetryPolicy({ service: 'payment', maxRetries: 3, baseDelayMs: 1, maxDelayMs: 1 }); + const transientError = { status: 503 }; + const fn = jest.fn().mockRejectedValue(transientError); + + await expect(policy.execute(fn)).rejects.toBe(transientError); + expect(fn).toHaveBeenCalledTimes(4); // initial attempt + 3 retries + }); + + it('does not retry client errors (4xx)', async () => { + const policy = new RetryPolicy({ service: 'payment', baseDelayMs: 1, maxDelayMs: 1 }); + const clientError = { status: 400 }; + const fn = jest.fn().mockRejectedValue(clientError); + + await expect(policy.execute(fn)).rejects.toBe(clientError); + expect(fn).toHaveBeenCalledTimes(1); + }); + + it('retries network errors with no status code', async () => { + const policy = new RetryPolicy({ service: 'cdn', baseDelayMs: 1, maxDelayMs: 1 }); + const networkError = new Error('ECONNRESET'); + const fn = jest.fn().mockRejectedValueOnce(networkError).mockResolvedValue('ok'); + + const result = await policy.execute(fn); + + expect(result).toBe('ok'); + expect(fn).toHaveBeenCalledTimes(2); + }); + + it('increments the external_call_retry_total counter per retry attempt', async () => { + const policy = new RetryPolicy({ service: 'cdn', baseDelayMs: 1, maxDelayMs: 1 }); + const transientError = { status: 503 }; + const fn = jest.fn().mockRejectedValueOnce(transientError).mockResolvedValue('ok'); + + await policy.execute(fn); + + const metric = await externalCallRetryCounter.get(); + const sample = metric.values.find( + (v) => v.labels.service === 'cdn' && v.labels.attempt === '1', + ); + expect(sample?.value).toBe(1); + }); +}); + +describe('isTransientError', () => { + it('treats 5xx as transient', () => { + expect(isTransientError({ status: 503 })).toBe(true); + }); + + it('treats 4xx as non-transient', () => { + expect(isTransientError({ status: 404 })).toBe(false); + }); + + it('treats errors with no status as transient (network errors)', () => { + expect(isTransientError(new Error('ETIMEDOUT'))).toBe(true); + }); +}); diff --git a/src/common/utils/retry-policy.ts b/src/common/utils/retry-policy.ts new file mode 100644 index 00000000..16da3694 --- /dev/null +++ b/src/common/utils/retry-policy.ts @@ -0,0 +1,111 @@ +import { Counter } from 'prom-client'; + +/** + * Tracks retry attempts for external service calls, labelled by service name + * and attempt number, so retry pressure per integration is visible in + * Grafana/Prometheus. + */ +export const externalCallRetryCounter = new Counter({ + name: 'external_call_retry_total', + help: 'Total number of retry attempts made for external service calls', + labelNames: ['service', 'attempt'] as const, +}); + +export interface RetryPolicyOptions { + /** Label used on the Prometheus counter, e.g. "email", "payment", "cdn". */ + service: string; + maxRetries?: number; + baseDelayMs?: number; + multiplier?: number; + maxDelayMs?: number; + /** Returns true if the error is transient and should be retried. */ + isRetryable?: (error: unknown) => boolean; +} + +const DEFAULT_MAX_RETRIES = 3; +const DEFAULT_BASE_DELAY_MS = 1000; +const DEFAULT_MULTIPLIER = 2; +const DEFAULT_MAX_DELAY_MS = 30000; + +/** + * Client errors (4xx) indicate a problem with the request itself, not a + * transient blip, so they are never retried. Everything else (5xx, network + * errors with no status code) is treated as transient. + */ +export function isTransientError(error: unknown): boolean { + const status = + (error as { status?: number; statusCode?: number; response?: { status?: number } })?.status ?? + (error as { statusCode?: number })?.statusCode ?? + (error as { response?: { status?: number } })?.response?.status; + + if (typeof status === 'number') { + return status >= 500; + } + + // No HTTP status (e.g. ECONNRESET, ETIMEDOUT) — assume transient. + return true; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +/** + * Full-jitter exponential backoff: a random delay between 0 and the + * capped exponential value for this attempt. + */ +function computeDelayMs( + attempt: number, + baseDelayMs: number, + multiplier: number, + maxDelayMs: number, +): number { + const exponential = baseDelayMs * multiplier ** attempt; + const capped = Math.min(exponential, maxDelayMs); + return Math.random() * capped; +} + +/** + * Retries a transient-failing async operation with exponential backoff and + * full jitter. Client errors (4xx) are propagated immediately without retry. + */ +export class RetryPolicy { + private readonly maxRetries: number; + private readonly baseDelayMs: number; + private readonly multiplier: number; + private readonly maxDelayMs: number; + private readonly isRetryable: (error: unknown) => boolean; + private readonly service: string; + + constructor(options: RetryPolicyOptions) { + this.service = options.service; + this.maxRetries = options.maxRetries ?? DEFAULT_MAX_RETRIES; + this.baseDelayMs = options.baseDelayMs ?? DEFAULT_BASE_DELAY_MS; + this.multiplier = options.multiplier ?? DEFAULT_MULTIPLIER; + this.maxDelayMs = options.maxDelayMs ?? DEFAULT_MAX_DELAY_MS; + this.isRetryable = options.isRetryable ?? isTransientError; + } + + async execute(fn: () => Promise): Promise { + let lastError: unknown; + + for (let attempt = 0; attempt <= this.maxRetries; attempt++) { + try { + return await fn(); + } catch (error) { + lastError = error; + + if (!this.isRetryable(error) || attempt === this.maxRetries) { + throw error; + } + + externalCallRetryCounter.inc({ service: this.service, attempt: String(attempt + 1) }); + + const delay = computeDelayMs(attempt, this.baseDelayMs, this.multiplier, this.maxDelayMs); + await sleep(delay); + } + } + + throw lastError; + } +} diff --git a/src/notifications/email/email.service.spec.ts b/src/notifications/email/email.service.spec.ts new file mode 100644 index 00000000..19ba1b00 --- /dev/null +++ b/src/notifications/email/email.service.spec.ts @@ -0,0 +1,46 @@ +import { EmailService, EmailProvider, EmailMessage } from './email.service'; +import { externalCallRetryCounter } from '../../common/utils/retry-policy'; + +describe('EmailService', () => { + const message: EmailMessage = { to: 'user@example.com', subject: 'Hi', html: '

Hi

' }; + + beforeEach(() => { + externalCallRetryCounter.reset(); + }); + + it( + 'retries transparently on a single transient 503 from the provider', + async () => { + const provider: EmailProvider = { send: jest.fn() }; + (provider.send as jest.Mock) + .mockRejectedValueOnce({ status: 503 }) + .mockResolvedValue(undefined); + const service = new EmailService(provider); + + await service.send(message); + + expect(provider.send).toHaveBeenCalledTimes(2); + }, + 10000, + ); + + it( + 'propagates the error after 3 consecutive failures', + async () => { + const provider: EmailProvider = { send: jest.fn().mockRejectedValue({ status: 503 }) }; + const service = new EmailService(provider); + + await expect(service.send(message)).rejects.toEqual({ status: 503 }); + expect(provider.send).toHaveBeenCalledTimes(4); + }, + 15000, + ); + + it('does not retry a 4xx client error', async () => { + const provider: EmailProvider = { send: jest.fn().mockRejectedValue({ status: 422 }) }; + const service = new EmailService(provider); + + await expect(service.send(message)).rejects.toEqual({ status: 422 }); + expect(provider.send).toHaveBeenCalledTimes(1); + }); +}); diff --git a/src/notifications/email/email.service.ts b/src/notifications/email/email.service.ts new file mode 100644 index 00000000..3656e405 --- /dev/null +++ b/src/notifications/email/email.service.ts @@ -0,0 +1,33 @@ +import { Injectable } from '@nestjs/common'; +import { RetryPolicy } from '../../common/utils/retry-policy'; + +export interface EmailMessage { + to: string; + subject: string; + html: string; +} + +/** + * Sends an email through the configured provider (SMTP/SendGrid). Actual + * transport wiring is injected so this service stays provider-agnostic and + * testable without a live SMTP connection. + */ +export interface EmailProvider { + send(message: EmailMessage): Promise; +} + +@Injectable() +export class EmailService { + private readonly retryPolicy = new RetryPolicy({ service: 'email' }); + + constructor(private readonly provider: EmailProvider) {} + + /** + * Sends an email, transparently retrying transient provider failures + * (5xx, network errors) with exponential backoff. 4xx errors (e.g. + * invalid recipient) propagate immediately. + */ + async send(message: EmailMessage): Promise { + await this.retryPolicy.execute(() => this.provider.send(message)); + } +} diff --git a/src/payments/providers/payment-provider.service.spec.ts b/src/payments/providers/payment-provider.service.spec.ts new file mode 100644 index 00000000..b71a57fc --- /dev/null +++ b/src/payments/providers/payment-provider.service.spec.ts @@ -0,0 +1,59 @@ +import { PaymentProviderService } from './payment-provider.service'; +import { IPaymentProvider } from './payment-provider.interface'; +import { externalCallRetryCounter } from '../../common/utils/retry-policy'; + +function buildProvider(overrides: Partial = {}): IPaymentProvider { + return { + name: 'stripe', + createPaymentIntent: jest.fn(), + createSubscription: jest.fn(), + cancelSubscription: jest.fn(), + refundPayment: jest.fn(), + handleWebhook: jest.fn(), + verifyWebhookSignature: jest.fn(), + ...overrides, + }; +} + +describe('PaymentProviderService', () => { + beforeEach(() => { + externalCallRetryCounter.reset(); + }); + + it( + 'retries createPaymentIntent transparently on a single transient 503', + async () => { + const createPaymentIntent = jest + .fn() + .mockRejectedValueOnce({ status: 503 }) + .mockResolvedValue({ clientSecret: 'cs', paymentIntentId: 'pi' }); + const service = new PaymentProviderService(buildProvider({ createPaymentIntent })); + + const result = await service.createPaymentIntent(1000, 'usd'); + + expect(result.paymentIntentId).toBe('pi'); + expect(createPaymentIntent).toHaveBeenCalledTimes(2); + }, + 10000, + ); + + it( + 'propagates the error to the caller after 3 consecutive failures', + async () => { + const createPaymentIntent = jest.fn().mockRejectedValue({ status: 503 }); + const service = new PaymentProviderService(buildProvider({ createPaymentIntent })); + + await expect(service.createPaymentIntent(1000, 'usd')).rejects.toEqual({ status: 503 }); + expect(createPaymentIntent).toHaveBeenCalledTimes(4); + }, + 15000, + ); + + it('does not retry a 4xx client error from the gateway', async () => { + const refundPayment = jest.fn().mockRejectedValue({ status: 400 }); + const service = new PaymentProviderService(buildProvider({ refundPayment })); + + await expect(service.refundPayment('pay_1')).rejects.toEqual({ status: 400 }); + expect(refundPayment).toHaveBeenCalledTimes(1); + }); +}); diff --git a/src/payments/providers/payment-provider.service.ts b/src/payments/providers/payment-provider.service.ts new file mode 100644 index 00000000..3d0065b8 --- /dev/null +++ b/src/payments/providers/payment-provider.service.ts @@ -0,0 +1,40 @@ +import { Injectable } from '@nestjs/common'; +import { IPaymentProvider } from './payment-provider.interface'; +import { RetryPolicy } from '../../common/utils/retry-policy'; + +/** + * Wraps an IPaymentProvider implementation (Stripe, etc.) with retry logic + * for the network calls that hit the payment gateway. Webhook handling is + * intentionally excluded — those are inbound calls, not outbound ones we + * control retries for. + */ +@Injectable() +export class PaymentProviderService { + private readonly retryPolicy = new RetryPolicy({ service: 'payment' }); + + constructor(private readonly provider: IPaymentProvider) {} + + async createPaymentIntent( + amount: number, + currency: string, + metadata?: Record, + ) { + return this.retryPolicy.execute(() => + this.provider.createPaymentIntent(amount, currency, metadata), + ); + } + + async createSubscription(customerId: string, priceId: string, metadata?: Record) { + return this.retryPolicy.execute(() => + this.provider.createSubscription(customerId, priceId, metadata), + ); + } + + async cancelSubscription(subscriptionId: string) { + return this.retryPolicy.execute(() => this.provider.cancelSubscription(subscriptionId)); + } + + async refundPayment(paymentId: string, amount?: number) { + return this.retryPolicy.execute(() => this.provider.refundPayment(paymentId, amount)); + } +}