From c358145ddd770e9a1ea1d55665bcb7938c6aaef5 Mon Sep 17 00:00:00 2001
From: mrdulasolutions
Date: Wed, 13 May 2026 14:10:04 -0400
Subject: [PATCH] feat(openrouter): client-side rate limit for free-tier models only
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User report: `bridge.call(drafts.rerunAgent)` failed with
`OpenRouter HTTP 429: Rate limit exceeded: free-models-per-min.
X-RateLimit-Limit: 16`. Concurrent agent flows (analyzer, draft
generator, rerunAgent, sender lookup) burst through OpenRouter's
16 req/min cap on free models in seconds, and the existing retry
logic — 3 attempts with backoff capped at 10s — can't outlast a 60s
rate window.

Two complementary changes, both isolated to `:free` model ids:

1. Sliding-window rate limiter in front of the fetch. A module-scope
   timestamp queue tracks the last N successful "issue this call"
   decisions; before each free-model call we drop expired timestamps
   and either claim a slot or sleep until the oldest ages out.
   Awaiters wake in approximately arrival order (best-effort; there
   is no explicit FIFO queue). Paid models bypass entirely — adding
   latency to a request that doesn't need throttling helps nothing.

2. On 429 responses, parse `X-RateLimit-Reset` and sleep until the
   window opens (capped at 90s) instead of falling back to
   exponential backoff that wouldn't survive 60s. Acts as a
   server-truthing safety net for the in-process limiter, which
   resets across sidecar restarts.

Free vs paid detection is by model-id convention: OpenRouter
distributes free models with a `:free` suffix; the bare model id is
the paid tier.

No other changes.

Co-Authored-By: Claude Opus 4.7
---
 sidecar/src/services/providers/openrouter.ts | 111 ++++++++++++++++++-
 1 file changed, 110 insertions(+), 1 deletion(-)

diff --git a/sidecar/src/services/providers/openrouter.ts b/sidecar/src/services/providers/openrouter.ts
index ebd10bf..3b47e7c 100644
--- a/sidecar/src/services/providers/openrouter.ts
+++ b/sidecar/src/services/providers/openrouter.ts
@@ -46,6 +46,72 @@ const MAX_RETRIES = 3;
 const INITIAL_DELAY_MS = 1000;
 const MAX_DELAY_MS = 10_000;
 
+// ─── Free-model rate limiting ────────────────────────────────────────────
+//
+// OpenRouter caps free-tier models at 16 requests/minute (a value returned
+// in the `X-RateLimit-Limit` response header). Multiple concurrent agent
+// flows in this app — analyzer, draft generator, rerunAgent, sender lookup
+// — can each fire OpenRouter calls in parallel, so the burst can blow
+// through 16/min in seconds and the user sees an HTTP 429.
+//
+// We solve this client-side, but ONLY for free models. Paid models on
+// OpenRouter have much higher (effectively unbounded for our load) limits
+// and don't need the gate; threading them through a 16/min bucket would
+// just add latency for no reason.
+//
+// Detection is by model-id convention: OpenRouter free models carry a
+// `:free` suffix (e.g. `meta-llama/llama-3.3-70b-instruct:free`); the
+// bare model id with no suffix is the paid tier.
+//
+// Algorithm is a sliding-window counter — a queue of timestamps of the
+// last `FREE_RATE_LIMIT_PER_MIN` successful "issue this call" decisions.
+// Before issuing a new free-model call, we drop timestamps older than
+// `RATE_WINDOW_MS`, and if the remaining list is still at capacity, sleep
+// until the oldest timestamp ages out. Waiters wake in roughly arrival
+// order (timer order); there is no explicit FIFO queue, so it's best-effort.
+//
+// This is purely in-process — sidecar restarts reset the counter, which
+// is fine: the next 429 from the server would just push us back into
+// rate-respecting mode via the X-RateLimit-Reset handler below.
+const FREE_RATE_LIMIT_PER_MIN = 16;
+const RATE_WINDOW_MS = 60_000;
+const recentFreeCallTimestamps: number[] = [];
+
+function isFreeModel(model: string): boolean {
+  return model.endsWith(":free");
+}
+
+async function acquireFreeRateSlot(model: string): Promise<void> {
+  if (!isFreeModel(model)) return;
+  // Loop because a queued caller may need to wait multiple windows if a
+  // burst arrived first. Each iteration either claims a slot or sleeps
+  // until the next slot frees up.
+  for (;;) {
+    const now = Date.now();
+    while (
+      recentFreeCallTimestamps.length > 0 &&
+      recentFreeCallTimestamps[0]! < now - RATE_WINDOW_MS
+    ) {
+      recentFreeCallTimestamps.shift();
+    }
+    if (recentFreeCallTimestamps.length < FREE_RATE_LIMIT_PER_MIN) {
+      recentFreeCallTimestamps.push(now);
+      return;
+    }
+    // Wait until the oldest tracked call ages out of the window, plus a
+    // 50ms cushion to avoid a thundering-herd retry exactly at the
+    // boundary.
+    const oldest = recentFreeCallTimestamps[0]!;
+    const waitMs = oldest + RATE_WINDOW_MS - now + 50;
+    log.info("free-model rate limit reached, queueing", {
+      model,
+      waitMs,
+      queued: recentFreeCallTimestamps.length,
+    });
+    await sleep(Math.max(waitMs, 100));
+  }
+}
+
 export interface OpenRouterFreeModel {
   id: string;
   name: string;
@@ -207,6 +273,11 @@ export async function createMessageOpenRouter(
     }
 
     try {
+      // Client-side rate gate for free models — see top of file. No-op
+      // for paid models. Runs INSIDE the retry loop so a retry after a
+      // 429 also goes through the queue on the next attempt.
+      await acquireFreeRateSlot(params.model);
+
       const res = await fetch(`${OPENROUTER_BASE_URL}/chat/completions`, {
         method: "POST",
         headers: buildHeaders(key),
@@ -222,7 +293,25 @@ export async function createMessageOpenRouter(
         const err = new Error(`OpenRouter HTTP ${res.status}: ${text || "no body"}`);
         if (!retryable || attempt >= MAX_RETRIES) throw err;
         lastError = err;
-        await sleep(backoff(attempt));
+        // Server-side safety net: if OpenRouter sent X-RateLimit-Reset
+        // (epoch milliseconds), respect it instead of falling back to a
+        // capped exponential backoff that wouldn't outlast a 60s window.
+        // Cap the sleep at 90s so a misconfigured/giant Reset header
+        // can't strand the call.
+        const resetMs = parseRateLimitResetMs(res);
+        const delay =
+          res.status === 429 && resetMs !== null
+            ? Math.min(Math.max(resetMs - Date.now() + 250, 100), 90_000)
+            : backoff(attempt);
+        if (res.status === 429) {
+          log.warn("OpenRouter 429 — sleeping before retry", {
+            model: params.model,
+            resetMs,
+            delay,
+            attempt,
+          });
+        }
+        await sleep(delay);
         continue;
       }
 
@@ -310,6 +399,26 @@ async function safeText(res: Response): Promise<string> {
   }
 }
 
+// OpenRouter returns `X-RateLimit-Reset` as a Unix-epoch value for when the
+// current rate window resets. It's documented as milliseconds, but accept
+// both ms- and s-since-epoch so a spec change doesn't land us in 1970-time.
+// Returns null if the header is absent, not a positive number, or not a
+// plausible reset time (over one window stale, or over 5 minutes ahead).
+function parseRateLimitResetMs(res: Response): number | null {
+  const raw = res.headers.get("X-RateLimit-Reset");
+  if (!raw) return null;
+  const n = Number(raw);
+  if (!Number.isFinite(n) || n <= 0) return null;
+  // Heuristic: a value < 10^12 is almost certainly seconds (epoch-ms values
+  // have been >= 10^12 since 2001). Above that, it's milliseconds.
+  const ms = n < 1e12 ? n * 1000 : n;
+  // Sanity: a live reset is at most one window behind us (skew/latency) and
+  // at most 5 minutes ahead; anything else is a bogus header we ignore.
+  const aheadMs = ms - Date.now();
+  if (aheadMs < -RATE_WINDOW_MS || aheadMs > 5 * 60_000) return null;
+  return ms;
+}
+
 function sleep(ms: number): Promise<void> {
   return new Promise((r) => setTimeout(r, ms));
 }