Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions src/cdn/cdn.service.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { CdnService } from './cdn.service';
import { externalCallRetryCounter } from '../common/utils/retry-policy';

describe('CdnService invalidate retry behaviour', () => {
const originalEnv = { ...process.env };

beforeEach(() => {
process.env.CDN_ENABLED = 'true';
process.env.CLOUDFRONT_DISTRIBUTION_ID = 'dist-123';
externalCallRetryCounter.reset();
});

afterEach(() => {
process.env = { ...originalEnv };
});

it(

Check failure on line 17 in src/cdn/cdn.service.spec.ts

View workflow job for this annotation

GitHub Actions / validate

Replace `⏎····'retries·transparently·on·a·single·transient·failure·and·still·succeeds',⏎···` with `'retries·transparently·on·a·single·transient·failure·and·still·succeeds',`
'retries transparently on a single transient failure and still succeeds',
async () => {
const service = new CdnService();

Check failure on line 20 in src/cdn/cdn.service.spec.ts

View workflow job for this annotation

GitHub Actions / validate

Delete `··`
const transientError = { status: 503 };

Check failure on line 21 in src/cdn/cdn.service.spec.ts

View workflow job for this annotation

GitHub Actions / validate

Delete `··`
jest

Check failure on line 22 in src/cdn/cdn.service.spec.ts

View workflow job for this annotation

GitHub Actions / validate

Delete `··`
.spyOn(service as any, 'callInvalidationProvider')

Check failure on line 23 in src/cdn/cdn.service.spec.ts

View workflow job for this annotation

GitHub Actions / validate

Delete `··`
.mockRejectedValueOnce(transientError)

Check failure on line 24 in src/cdn/cdn.service.spec.ts

View workflow job for this annotation

GitHub Actions / validate

Delete `··`
.mockResolvedValue(undefined);

Check failure on line 25 in src/cdn/cdn.service.spec.ts

View workflow job for this annotation

GitHub Actions / validate

Delete `··`

const result = await service.invalidate(['/index.html']);

Check failure on line 27 in src/cdn/cdn.service.spec.ts

View workflow job for this annotation

GitHub Actions / validate

Delete `··`

expect(result.success).toBe(true);

Check failure on line 29 in src/cdn/cdn.service.spec.ts

View workflow job for this annotation

GitHub Actions / validate

Delete `··`
expect((service as any).callInvalidationProvider).toHaveBeenCalledTimes(2);

Check failure on line 30 in src/cdn/cdn.service.spec.ts

View workflow job for this annotation

GitHub Actions / validate

Delete `··`
},
10000,
);

it(
'propagates the error to the caller after 3 consecutive failures',
async () => {
const service = new CdnService();
const transientError = { status: 503 };
jest.spyOn(service as any, 'callInvalidationProvider').mockRejectedValue(transientError);

await expect(service.invalidate(['/index.html'])).rejects.toEqual(transientError);
expect((service as any).callInvalidationProvider).toHaveBeenCalledTimes(4); // initial + 3 retries
},
15000,
);
});
27 changes: 20 additions & 7 deletions src/cdn/cdn.service.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { Injectable, Logger } from '@nestjs/common';
import { resolveCdnConfig, resolveCacheHeaderConfig } from './cdn.config';
import { RetryPolicy } from '../common/utils/retry-policy';

export interface CacheHeaders {
'Cache-Control': string;
Expand All @@ -17,6 +18,7 @@ export class CdnService {
private readonly logger = new Logger(CdnService.name);
private readonly cdn = resolveCdnConfig();
private readonly cacheHeaders = resolveCacheHeaderConfig();
private readonly retryPolicy = new RetryPolicy({ service: 'cdn' });

/**
* Returns optimised Cache-Control headers for a given asset path.
Expand Down Expand Up @@ -56,13 +58,7 @@ export class CdnService {
`Invalidating ${paths.length} path(s) on distribution ${this.cdn.distributionId}: ${paths.join(', ')}`,
);

// Placeholder: wire up AWS SDK CloudFront.createInvalidation here when credentials are available.
// Example:
// const cf = new CloudFrontClient({});
// await cf.send(new CreateInvalidationCommand({
// DistributionId: this.cdn.distributionId,
// InvalidationBatch: { Paths: { Quantity: paths.length, Items: paths }, CallerReference: Date.now().toString() },
// }));
await this.retryPolicy.execute(() => this.callInvalidationProvider(paths));

return {
success: true,
Expand All @@ -71,6 +67,23 @@ export class CdnService {
};
}

/**
* Calls the CDN provider's invalidation API. Wrapped by RetryPolicy in
* `invalidate()` since this network call can fail transiently (5xx,
* connection resets).
*
* Placeholder: wire up AWS SDK CloudFront.createInvalidation here when
* credentials are available, e.g.:
* const cf = new CloudFrontClient({});
* await cf.send(new CreateInvalidationCommand({
* DistributionId: this.cdn.distributionId,
* InvalidationBatch: { Paths: { Quantity: paths.length, Items: paths }, CallerReference: Date.now().toString() },
* }));
*/
protected async callInvalidationProvider(_paths: string[]): Promise<void> {
// No-op until the CloudFront SDK call is wired up.
}

/** Returns the CDN URL for a given asset path. */
getAssetUrl(assetPath: string): string {
if (!this.cdn.enabled || !this.cdn.domain) return assetPath;
Expand Down
89 changes: 89 additions & 0 deletions src/common/utils/retry-policy.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import { RetryPolicy, isTransientError, externalCallRetryCounter } from './retry-policy';

describe('RetryPolicy', () => {
beforeEach(() => {
externalCallRetryCounter.reset();
});

it('returns the result immediately on success without retrying', async () => {
const policy = new RetryPolicy({ service: 'test', baseDelayMs: 1, maxDelayMs: 1 });
const fn = jest.fn().mockResolvedValue('ok');

const result = await policy.execute(fn);

expect(result).toBe('ok');
expect(fn).toHaveBeenCalledTimes(1);
});

it('retries on a transient failure and eventually succeeds', async () => {
const policy = new RetryPolicy({ service: 'email', baseDelayMs: 1, maxDelayMs: 1 });
const transientError = { status: 503 };
const fn = jest
.fn()
.mockRejectedValueOnce(transientError)
.mockRejectedValueOnce(transientError)
.mockResolvedValue('ok');

const result = await policy.execute(fn);

expect(result).toBe('ok');
expect(fn).toHaveBeenCalledTimes(3);
});

it('propagates the error after exhausting retries', async () => {
const policy = new RetryPolicy({ service: 'payment', maxRetries: 3, baseDelayMs: 1, maxDelayMs: 1 });
const transientError = { status: 503 };
const fn = jest.fn().mockRejectedValue(transientError);

await expect(policy.execute(fn)).rejects.toBe(transientError);
expect(fn).toHaveBeenCalledTimes(4); // initial attempt + 3 retries
});

it('does not retry client errors (4xx)', async () => {
const policy = new RetryPolicy({ service: 'payment', baseDelayMs: 1, maxDelayMs: 1 });
const clientError = { status: 400 };
const fn = jest.fn().mockRejectedValue(clientError);

await expect(policy.execute(fn)).rejects.toBe(clientError);
expect(fn).toHaveBeenCalledTimes(1);
});

it('retries network errors with no status code', async () => {
const policy = new RetryPolicy({ service: 'cdn', baseDelayMs: 1, maxDelayMs: 1 });
const networkError = new Error('ECONNRESET');
const fn = jest.fn().mockRejectedValueOnce(networkError).mockResolvedValue('ok');

const result = await policy.execute(fn);

expect(result).toBe('ok');
expect(fn).toHaveBeenCalledTimes(2);
});

it('increments the external_call_retry_total counter per retry attempt', async () => {
const policy = new RetryPolicy({ service: 'cdn', baseDelayMs: 1, maxDelayMs: 1 });
const transientError = { status: 503 };
const fn = jest.fn().mockRejectedValueOnce(transientError).mockResolvedValue('ok');

await policy.execute(fn);

const metric = await externalCallRetryCounter.get();
const sample = metric.values.find(
(v) => v.labels.service === 'cdn' && v.labels.attempt === '1',
);
expect(sample?.value).toBe(1);
});
});

describe('isTransientError', () => {
it('treats 5xx as transient', () => {
expect(isTransientError({ status: 503 })).toBe(true);
});

it('treats 4xx as non-transient', () => {
expect(isTransientError({ status: 404 })).toBe(false);
});

it('treats errors with no status as transient (network errors)', () => {
expect(isTransientError(new Error('ETIMEDOUT'))).toBe(true);
});
});
111 changes: 111 additions & 0 deletions src/common/utils/retry-policy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import { Counter } from 'prom-client';

/**
* Tracks retry attempts for external service calls, labelled by service name
* and attempt number, so retry pressure per integration is visible in
* Grafana/Prometheus.
*/
export const externalCallRetryCounter = new Counter({
name: 'external_call_retry_total',
help: 'Total number of retry attempts made for external service calls',
labelNames: ['service', 'attempt'] as const,
});

export interface RetryPolicyOptions {
/** Label used on the Prometheus counter, e.g. "email", "payment", "cdn". */
service: string;
maxRetries?: number;
baseDelayMs?: number;
multiplier?: number;
maxDelayMs?: number;
/** Returns true if the error is transient and should be retried. */
isRetryable?: (error: unknown) => boolean;
}

const DEFAULT_MAX_RETRIES = 3;
const DEFAULT_BASE_DELAY_MS = 1000;
const DEFAULT_MULTIPLIER = 2;
const DEFAULT_MAX_DELAY_MS = 30000;

/**
* Client errors (4xx) indicate a problem with the request itself, not a
* transient blip, so they are never retried. Everything else (5xx, network
* errors with no status code) is treated as transient.
*/
export function isTransientError(error: unknown): boolean {
const status =
(error as { status?: number; statusCode?: number; response?: { status?: number } })?.status ??
(error as { statusCode?: number })?.statusCode ??
(error as { response?: { status?: number } })?.response?.status;

if (typeof status === 'number') {
return status >= 500;
}

// No HTTP status (e.g. ECONNRESET, ETIMEDOUT) — assume transient.
return true;
}

function sleep(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
* Full-jitter exponential backoff: a random delay between 0 and the
* capped exponential value for this attempt.
*/
function computeDelayMs(
attempt: number,
baseDelayMs: number,
multiplier: number,
maxDelayMs: number,
): number {
const exponential = baseDelayMs * multiplier ** attempt;
const capped = Math.min(exponential, maxDelayMs);
return Math.random() * capped;
}

/**
* Retries a transient-failing async operation with exponential backoff and
* full jitter. Client errors (4xx) are propagated immediately without retry.
*/
export class RetryPolicy {
private readonly maxRetries: number;
private readonly baseDelayMs: number;
private readonly multiplier: number;
private readonly maxDelayMs: number;
private readonly isRetryable: (error: unknown) => boolean;
private readonly service: string;

constructor(options: RetryPolicyOptions) {
this.service = options.service;
this.maxRetries = options.maxRetries ?? DEFAULT_MAX_RETRIES;
this.baseDelayMs = options.baseDelayMs ?? DEFAULT_BASE_DELAY_MS;
this.multiplier = options.multiplier ?? DEFAULT_MULTIPLIER;
this.maxDelayMs = options.maxDelayMs ?? DEFAULT_MAX_DELAY_MS;
this.isRetryable = options.isRetryable ?? isTransientError;
}

async execute<T>(fn: () => Promise<T>): Promise<T> {
let lastError: unknown;

for (let attempt = 0; attempt <= this.maxRetries; attempt++) {
try {
return await fn();
} catch (error) {
lastError = error;

if (!this.isRetryable(error) || attempt === this.maxRetries) {
throw error;
}

externalCallRetryCounter.inc({ service: this.service, attempt: String(attempt + 1) });

const delay = computeDelayMs(attempt, this.baseDelayMs, this.multiplier, this.maxDelayMs);
await sleep(delay);
}
}

throw lastError;
}
}
46 changes: 46 additions & 0 deletions src/notifications/email/email.service.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import { EmailService, EmailProvider, EmailMessage } from './email.service';
import { externalCallRetryCounter } from '../../common/utils/retry-policy';

describe('EmailService', () => {
const message: EmailMessage = { to: 'user@example.com', subject: 'Hi', html: '<p>Hi</p>' };

beforeEach(() => {
externalCallRetryCounter.reset();
});

it(
'retries transparently on a single transient 503 from the provider',
async () => {
const provider: EmailProvider = { send: jest.fn() };
(provider.send as jest.Mock)
.mockRejectedValueOnce({ status: 503 })
.mockResolvedValue(undefined);
const service = new EmailService(provider);

await service.send(message);

expect(provider.send).toHaveBeenCalledTimes(2);
},
10000,
);

it(
'propagates the error after 3 consecutive failures',
async () => {
const provider: EmailProvider = { send: jest.fn().mockRejectedValue({ status: 503 }) };
const service = new EmailService(provider);

await expect(service.send(message)).rejects.toEqual({ status: 503 });
expect(provider.send).toHaveBeenCalledTimes(4);
},
15000,
);

it('does not retry a 4xx client error', async () => {
const provider: EmailProvider = { send: jest.fn().mockRejectedValue({ status: 422 }) };
const service = new EmailService(provider);

await expect(service.send(message)).rejects.toEqual({ status: 422 });
expect(provider.send).toHaveBeenCalledTimes(1);
});
});
33 changes: 33 additions & 0 deletions src/notifications/email/email.service.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { Injectable } from '@nestjs/common';
import { RetryPolicy } from '../../common/utils/retry-policy';

export interface EmailMessage {
to: string;
subject: string;
html: string;
}

/**
* Sends an email through the configured provider (SMTP/SendGrid). Actual
* transport wiring is injected so this service stays provider-agnostic and
* testable without a live SMTP connection.
*/
export interface EmailProvider {
send(message: EmailMessage): Promise<void>;
}

@Injectable()
export class EmailService {
private readonly retryPolicy = new RetryPolicy({ service: 'email' });

constructor(private readonly provider: EmailProvider) {}

/**
* Sends an email, transparently retrying transient provider failures
* (5xx, network errors) with exponential backoff. 4xx errors (e.g.
* invalid recipient) propagate immediately.
*/
async send(message: EmailMessage): Promise<void> {
await this.retryPolicy.execute(() => this.provider.send(message));
}
}
Loading
Loading