diff --git a/Dockerfile b/Dockerfile index 4dffd06..25a7b20 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,7 +23,13 @@ RUN apk add --no-cache curl ENV NODE_ENV=production COPY --from=build /usr/src/app ./ -RUN pnpm install --frozen-lockfile +RUN pnpm install --frozen-lockfile \ + # Run as the non-root `node` user (uid 1000, shipped by node:22-alpine). + # Ponder writes its cache under the workdir and pnpm reads /pnpm at runtime, + # so both must be owned by `node`. + && chown -R node:node /usr/src/app /pnpm + +USER node HEALTHCHECK \ --start-period=24h \ diff --git a/src/application/helpers/orderbookClient.ts b/src/application/helpers/orderbookClient.ts index a599282..33dc24c 100644 --- a/src/application/helpers/orderbookClient.ts +++ b/src/application/helpers/orderbookClient.ts @@ -23,7 +23,16 @@ import { pgSchema, integer, text } from "drizzle-orm/pg-core"; import { encodeAbiParameters, keccak256, type Hex } from "viem"; import { type OrderType } from "../../utils/order-types"; import { COMPOSABLE_COW_HANDLER_ADDRESSES, ORDERBOOK_API_URLS } from "../../data"; -import { BOOTSTRAP_MAX_PAGES, BOOTSTRAP_PAGE_SIZE, ORDERBOOK_HTTP_TIMEOUT_MS, SIGNING_SCHEME_EIP1271 } from "../../constants"; +import { + BOOTSTRAP_MAX_PAGES, + BOOTSTRAP_PAGE_SIZE, + ORDERBOOK_HTTP_TIMEOUT_MS, + ORDERBOOK_MAX_RETRIES, + ORDERBOOK_RETRY_BASE_MS, + ORDERBOOK_RETRY_BUDGET_MS, + ORDERBOOK_RETRY_MAX_DELAY_MS, + SIGNING_SCHEME_EIP1271, +} from "../../constants"; import { decodeEip1271Signature } from "../decoders/erc1271Signature"; import { fetchWithTimeout, TimeoutError, withTimeout } from "./withTimeout"; import { log } from "./logger"; @@ -300,6 +309,75 @@ export async function fetchOwnerOrderStatuses( // ─── API calls ─────────────────────────────────────────────────────────────── +/** + * The orderbook API refused to answer (HTTP 429 or 5xx) after bounded retries. 
+ * Also thrown, without retrying, for any other non-2xx status. Distinct from
+ * "the API has no such order" (a UID simply absent from a 2xx body) so callers /
+ * dashboards can alarm on a failing API rather than assume "order not on API yet".
+ */
+export class OrderbookUnavailableError extends Error {
+  constructor(
+    public readonly status: number,
+    public readonly endpoint: string,
+  ) {
+    super(`[COW:orderbook-unavailable] ${endpoint} responded ${status}`);
+    this.name = "OrderbookUnavailableError";
+  }
+}
+
+const sleep = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));
+
+/** Parse a `Retry-After` header (delta-seconds or HTTP-date) into milliseconds; null if absent/unparseable. */
+function parseRetryAfter(value: string | null): number | null {
+  if (!value) return null;
+  const seconds = Number(value);
+  if (Number.isFinite(seconds)) return Math.max(0, seconds * 1000);
+  const date = Date.parse(value);
+  if (!Number.isNaN(date)) return Math.max(0, date - Date.now());
+  return null;
+}
+
+/**
+ * `fetchWithTimeout` plus bounded retry/backoff for transient orderbook errors.
+ *
+ * Returns the Response on a 2xx. On 429 it honors `Retry-After` (capped at
+ * ORDERBOOK_RETRY_MAX_DELAY_MS; backoff if absent/unparseable); on 5xx it uses
+ * exponential backoff. Retries stop once ORDERBOOK_MAX_RETRIES is reached or the
+ * summed sleeps would exceed ORDERBOOK_RETRY_BUDGET_MS (request time is not counted);
+ * it then throws OrderbookUnavailableError, as it does at once for any other non-2xx.
+ * A TimeoutError from the underlying fetch propagates unchanged.
+ */
+async function fetchOrderbook(
+  url: string,
+  init: RequestInit | undefined,
+  endpoint: string,
+): Promise<Response> {
+  let spent = 0;
+  for (let attempt = 0; ; attempt++) {
+    const response = await fetchWithTimeout(url, init, ORDERBOOK_HTTP_TIMEOUT_MS, endpoint);
+    if (response.ok) return response;
+    await response.body?.cancel().catch(() => undefined); // error body is never read; free the connection
+
+    const retryable = response.status === 429 || response.status >= 500;
+    if (!retryable || attempt >= ORDERBOOK_MAX_RETRIES) {
+      throw new OrderbookUnavailableError(response.status, endpoint);
+    }
+
+    const retryAfterMs = response.status === 429 ? parseRetryAfter(response.headers.get("retry-after")) : null;
+    const backoffMs = ORDERBOOK_RETRY_BASE_MS * 2 ** attempt;
+    const delay = Math.min(retryAfterMs ?? backoffMs, ORDERBOOK_RETRY_MAX_DELAY_MS);
+
+    // Fail fast rather than hold the block transaction open past our budget.
+    if (spent + delay > ORDERBOOK_RETRY_BUDGET_MS) {
+      throw new OrderbookUnavailableError(response.status, endpoint);
+    }
+
+    log("warn", "ob:retry", { endpoint, status: response.status, attempt: attempt + 1, delayMs: delay, retryAfterMs });
+    await sleep(delay);
+    spent += delay;
+  }
+}
+
 /** Fetch orders for an owner with pagination. maxPages limits how many pages are fetched (0 = unlimited).
  * signingScheme, if provided, is appended as a query param — the API filters server-side when supported,
  * reducing payload for owners with many ECDSA orders mixed with composable ones.
@@ -321,16 +399,7 @@ async function fetchAccountOrders( if (signingScheme) params.set("signingScheme", signingScheme); const url = `${apiBaseUrl}/api/v1/account/${owner}/orders?${params.toString()}`; try { - const response = await fetchWithTimeout( - url, - undefined, - ORDERBOOK_HTTP_TIMEOUT_MS, - "ob:account", - ); - if (!response.ok) { - log("warn", "ob:accountError", { status: response.status, owner }); - break; - } + const response = await fetchOrderbook(url, undefined, "ob:account"); const page = (await response.json()) as OrderbookOrder[]; allOrders.push(...page); pagesFetched++; @@ -338,6 +407,10 @@ async function fetchAccountOrders( if (maxPages > 0 && pagesFetched >= maxPages) break; // page cap reached offset += page.length; } catch (err) { + if (err instanceof OrderbookUnavailableError) { + log("error", "ob:unavailable", { endpoint: "ob:account", status: err.status, owner }); + break; + } if (err instanceof TimeoutError) { log("warn", "ob:accountTimeout", { owner, offset, after: ORDERBOOK_HTTP_TIMEOUT_MS }); break; @@ -368,23 +441,22 @@ async function fetchOrdersByUids( const chunkResults = await Promise.all( chunks.map(async (chunk, idx) => { try { - const response = await fetchWithTimeout( + const response = await fetchOrderbook( url, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify(chunk), }, - ORDERBOOK_HTTP_TIMEOUT_MS, "ob:byUids", ); - if (!response.ok) { - log("warn", "ob:batchFetchError", { status: response.status, uids: chunk.length, offset: idx * BATCH_SIZE }); - return [] as OrderbookOrder[]; - } const raw = (await response.json()) as { order: OrderbookOrder }[]; return raw.flatMap((item) => (item?.order != null ? 
[item.order] : [])); } catch (err) { + if (err instanceof OrderbookUnavailableError) { + log("error", "ob:unavailable", { endpoint: "ob:byUids", status: err.status, uids: chunk.length, offset: idx * BATCH_SIZE }); + return [] as OrderbookOrder[]; + } if (err instanceof TimeoutError) { log("warn", "ob:batchFetchTimeout", { uids: chunk.length, offset: idx * BATCH_SIZE, after: ORDERBOOK_HTTP_TIMEOUT_MS }); return [] as OrderbookOrder[]; diff --git a/src/constants.ts b/src/constants.ts index ef27867..40c6e61 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -65,6 +65,22 @@ export const DETERMINISTIC_CANCEL_SWEEP_INTERVAL = 100n; */ export const ORDERBOOK_HTTP_TIMEOUT_MS = 10_000; +/** + * Bounded retry for transient orderbook failures (HTTP 429 / 5xx). + * + * These calls run inside Ponder block handlers that hold a DB transaction open, + * so the retry loop must stay short — we cannot honor a large `Retry-After` by + * sleeping (Postgres would terminate the connection). The loop sleeps at most + * ORDERBOOK_RETRY_BUDGET_MS in total (each attempt may still take up to + * ORDERBOOK_HTTP_TIMEOUT_MS); if a `Retry-After` (or backoff) would exceed the budget, + * we fail fast and let the next poll (~ORDERBOOK_POLL_INTERVAL blocks later) retry + * naturally — the failure is logged as `ob:unavailable`, not as "order not on API yet". + */ +export const ORDERBOOK_MAX_RETRIES = 2; // ≤ 3 attempts total +export const ORDERBOOK_RETRY_BASE_MS = 250; // exponential backoff base +export const ORDERBOOK_RETRY_MAX_DELAY_MS = 2_000; // cap on a single sleep (incl. Retry-After) +export const ORDERBOOK_RETRY_BUDGET_MS = 4_000; // total sleep time the retry loop may add + /** * Hard wall-clock cap for a block handler's aggregate `context.client.multicall` * call (OrderDiscoveryPoller, CancellationWatcher). 
viem has no per-call signal; the timer races the promise and
diff --git a/tests/helpers/orderbookClient.test.ts b/tests/helpers/orderbookClient.test.ts
index b1afc95..6454ad6 100644
--- a/tests/helpers/orderbookClient.test.ts
+++ b/tests/helpers/orderbookClient.test.ts
@@ -17,6 +17,7 @@ vi.mock("ponder", () => ({
 }));
 
 import * as data from "../../src/data";
+import { ORDERBOOK_MAX_RETRIES } from "../../src/constants";
 import { fetchOrderStatusByUids, fetchOwnerOrderStatuses } from "../../src/application/helpers/orderbookClient";
 
 // ─── Helpers ─────────────────────────────────────────────────────────────────
@@ -211,6 +212,81 @@ describe("fetchOrderStatusByUids", () => {
   });
 });
 
+// ─── Resilience: 429 / 5xx handling ───────────────────────────────────────────
+
+/** Capture structured `log()` output — the logger writes warn/error as JSON via console.error. */
+function captureErrorLogs() {
+  const lines: Record<string, unknown>[] = [];
+  const spy = vi.spyOn(console, "error").mockImplementation((line: unknown) => {
+    try {
+      lines.push(JSON.parse(String(line)));
+    } catch {
+      /* non-JSON line — ignore */
+    }
+  });
+  return {
+    has: (msg: string) => lines.some((l) => l.msg === msg),
+    find: (msg: string) => lines.find((l) => l.msg === msg),
+    restore: () => spy.mockRestore(),
+  };
+}
+
+describe("orderbook resilience (429 / 5xx)", () => {
+  beforeAll(() => {
+    data.ORDERBOOK_API_URLS[TEST_CHAIN_ID] = "http://placeholder";
+  });
+
+  afterAll(() => {
+    delete (data.ORDERBOOK_API_URLS as Record<number, string>)[TEST_CHAIN_ID];
+  });
+
+  it("retries a 429 (honoring Retry-After) and succeeds on a later attempt", async () => {
+    let calls = 0;
+    const { url, close } = await startServer((_req, res) => {
+      calls++;
+      if (calls === 1) {
+        res.writeHead(429, { "retry-after": "0", "content-type": "application/json" });
+        res.end(JSON.stringify({ message: "rate limited" }));
+        return;
+      }
+      res.writeHead(200, { "content-type": "application/json" });
+      res.end(JSON.stringify([makeWrappedOrder(UID_A, "fulfilled")]));
+    });
+    data.ORDERBOOK_API_URLS[TEST_CHAIN_ID] = url;
+    const logs = captureErrorLogs();
+    try {
+      const result = await fetchOrderStatusByUids(makeContext(), TEST_CHAIN_ID, [UID_A]);
+      expect(calls).toBe(2);
+      expect(result.get(UID_A)?.status).toBe("fulfilled");
+      expect(logs.has("ob:unavailable")).toBe(false);
+    } finally {
+      logs.restore();
+      await close();
+    }
+  });
+
+  it("classifies a persistent 429 as ob:unavailable and stops after bounded retries", async () => {
+    let calls = 0;
+    const { url, close } = await startServer((_req, res) => {
+      calls++;
+      res.writeHead(429, { "retry-after": "0", "content-type": "application/json" });
+      res.end(JSON.stringify({ message: "rate limited" }));
+    });
+    data.ORDERBOOK_API_URLS[TEST_CHAIN_ID] = url;
+    const logs = captureErrorLogs();
+    try {
+      const result = await fetchOrderStatusByUids(makeContext(), TEST_CHAIN_ID, [UID_A]);
+      expect(calls).toBe(ORDERBOOK_MAX_RETRIES + 1); // bounded: 1 initial + retries
+      expect(result.has(UID_A)).toBe(false); // absent from map…
+      expect(logs.find("ob:unavailable")?.status).toBe(429); // …but the cause is logged distinctly
+    } finally {
+      logs.restore();
+      await close();
+    }
+  });
+
+});
+
 // ─── fetchOwnerOrderStatuses tests ────────────────────────────────────────────
 
 const FAKE_OWNER = "0xaabbccddEEff0011223344556677889900aabbcc" as Hex;