From c358145ddd770e9a1ea1d55665bcb7938c6aaef5 Mon Sep 17 00:00:00 2001
From: mrdulasolutions <matt@mrdula.solutions>
Date: Wed, 13 May 2026 14:10:04 -0400
Subject: [PATCH] feat(openrouter): client-side rate limit for free-tier models
 only
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User report: `bridge.call(drafts.rerunAgent)` failed with
`OpenRouter HTTP 429: Rate limit exceeded: free-models-per-min.
X-RateLimit-Limit: 16`. Concurrent agent flows (analyzer, draft
generator, rerunAgent, sender lookup) burst through OpenRouter's
16 req/min cap on free models in seconds, and the existing retry
logic — 3 attempts with backoff capped at 10s — can't outlast a
60s rate window.

Two complementary changes, both isolated to `:free` model ids:

1. Sliding-window rate limiter in front of the fetch. A
   module-scope timestamp queue tracks the last N successful
   "issue this call" decisions; before each free-model call we
   drop expired timestamps and either claim a slot or sleep until
   the oldest ages out. Ordering is best-effort, not strict FIFO. Paid
   models bypass entirely — adding latency to a request that
   doesn't need throttling helps nothing.

2. On 429 responses, parse `X-RateLimit-Reset` and sleep until the
   window opens (capped at 90s) instead of falling back to
   exponential backoff that wouldn't survive 60s. Acts as a
   server-truthing safety net for the in-process limiter, which
   resets across sidecar restarts.

Free vs paid detection is by model-id convention: OpenRouter
distributes free models with a `:free` suffix; the bare model id
is the paid tier. No other changes.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 sidecar/src/services/providers/openrouter.ts | 111 ++++++++++++++++++-
 1 file changed, 110 insertions(+), 1 deletion(-)

diff --git a/sidecar/src/services/providers/openrouter.ts b/sidecar/src/services/providers/openrouter.ts
index ebd10bf..3b47e7c 100644
--- a/sidecar/src/services/providers/openrouter.ts
+++ b/sidecar/src/services/providers/openrouter.ts
@@ -46,6 +46,72 @@ const MAX_RETRIES = 3;
 const INITIAL_DELAY_MS = 1000;
 const MAX_DELAY_MS = 10_000;
 
+// ─── Free-model rate limiting ────────────────────────────────────────────
+//
+// OpenRouter caps free-tier models at 16 requests/minute (a value returned
+// in the `X-RateLimit-Limit` response header). Multiple concurrent agent
+// flows in this app — analyzer, draft generator, rerunAgent, sender lookup
+// — can each fire OpenRouter calls in parallel, so the burst can blow
+// through 16/min in seconds and the user sees an HTTP 429.
+//
+// We solve this client-side, but ONLY for free models. Paid models on
+// OpenRouter have much higher (effectively unbounded for our load) limits
+// and don't need the gate; threading them through a 16/min bucket would
+// just add latency for no reason.
+//
+// Detection is by model-id convention: OpenRouter free models carry a
+// `:free` suffix (e.g. `meta-llama/llama-3.3-70b-instruct:free`). The
+// bare model id with no suffix is the paid tier.
+//
+// Algorithm is a sliding-window counter — a queue of timestamps of the
+// last `FREE_RATE_LIMIT_PER_MIN` successful "issue this call" decisions.
+// Before issuing a new free-model call, we drop timestamps older than
+// `RATE_WINDOW_MS`, and if the remaining list is still at capacity, sleep
+// until the oldest timestamp ages out. Ordering is best-effort, not strict
+// FIFO: a caller arriving just as a slot frees can beat sleeping waiters.
+//
+// This is purely in-process — sidecar restarts reset the counter, which
+// is fine: the next 429 from the server would just push us back into
+// rate-respecting mode via the X-RateLimit-Reset handler below.
+const FREE_RATE_LIMIT_PER_MIN = 16; // max free-model claims per window
+const RATE_WINDOW_MS = 60_000; // sliding-window length in milliseconds
+const recentFreeCallTimestamps: number[] = []; // Date.now() per claim, oldest first
+
+function isFreeModel(model: string): boolean {
+  return model.endsWith(":free"); // free-tier ids carry a ":free" suffix
+}
+
+// Resolves once `model` may issue a call: immediately for paid models,
+// otherwise after claiming one of FREE_RATE_LIMIT_PER_MIN window slots.
+async function acquireFreeRateSlot(model: string): Promise<void> {
+  if (!isFreeModel(model)) return;
+  // Loop: after sleeping, other callers may have taken the freed slot.
+  for (;;) {
+    const now = Date.now();
+    // Drop claims that have aged out of the window.
+    while (
+      recentFreeCallTimestamps.length > 0 &&
+      recentFreeCallTimestamps[0]! < now - RATE_WINDOW_MS
+    ) {
+      recentFreeCallTimestamps.shift();
+    }
+    if (recentFreeCallTimestamps.length < FREE_RATE_LIMIT_PER_MIN) {
+      recentFreeCallTimestamps.push(now);
+      return;
+    }
+    // Full: sleep until the oldest claim leaves the window (+50ms cushion).
+    // NOTE: `queued` logs the in-window claim count, not waiting callers.
+    const oldest = recentFreeCallTimestamps[0]!;
+    const waitMs = oldest + RATE_WINDOW_MS - now + 50;
+    log.info("free-model rate limit reached, queueing", {
+      model,
+      waitMs,
+      queued: recentFreeCallTimestamps.length,
+    });
+    await sleep(Math.max(waitMs, 100));
+  }
+}
+
 export interface OpenRouterFreeModel {
   id: string;
   name: string;
@@ -207,6 +273,11 @@ export async function createMessageOpenRouter(
     }
 
     try {
+      // Client-side rate gate for free models — see top of file. No-op
+      // for paid models. Runs INSIDE the retry loop so a retry after a
+      // 429 (X-RateLimit-Reset sleep) also goes through the queue.
+      await acquireFreeRateSlot(params.model);
+
       const res = await fetch(`${OPENROUTER_BASE_URL}/chat/completions`, {
         method: "POST",
         headers: buildHeaders(key),
@@ -222,7 +293,25 @@ export async function createMessageOpenRouter(
         const err = new Error(`OpenRouter HTTP ${res.status}: ${text || "no body"}`);
         if (!retryable || attempt >= MAX_RETRIES) throw err;
         lastError = err;
-        await sleep(backoff(attempt));
+        // Server-side safety net: if OpenRouter sent X-RateLimit-Reset
+        // (epoch milliseconds), respect it instead of falling back to a
+        // capped exponential backoff that wouldn't outlast a 60s window.
+        // Cap the sleep at 90s so a misconfigured/giant Reset header
+        // can't strand the call.
+        const resetMs = parseRateLimitResetMs(res);
+        const delay =
+          res.status === 429 && resetMs !== null
+            ? Math.min(Math.max(resetMs - Date.now() + 250, 100), 90_000)
+            : backoff(attempt);
+        if (res.status === 429) {
+          log.warn("OpenRouter 429 — sleeping before retry", {
+            model: params.model,
+            resetMs,
+            delay,
+            attempt,
+          });
+        }
+        await sleep(delay);
         continue;
       }
 
@@ -310,6 +399,26 @@ async function safeText(res: Response): Promise<string> {
   }
 }
 
+// OpenRouter returns `X-RateLimit-Reset` as a Unix-epoch value indicating
+// when the current rate window will reset. It's documented as
+// milliseconds, but both ms-since-epoch and s-since-epoch are accepted.
+// Returns the reset time in epoch ms, or null if the header is missing,
+// not a positive finite number, or more than 5 minutes ahead. Timestamps
+// already in the past are returned as-is — callers must clamp the delay.
+function parseRateLimitResetMs(res: Response): number | null {
+  const raw = res.headers.get("X-RateLimit-Reset");
+  if (!raw) return null;
+  const n = Number(raw);
+  if (!Number.isFinite(n) || n <= 0) return null;
+  // Heuristic: values below 1e12 are treated as epoch seconds (1e12 ms is
+  // Sept 2001, so real ms timestamps are larger). Result is always ms.
+  const ms = n < 1e12 ? n * 1000 : n;
+  // Sanity: reject anything more than 5 minutes out — that is either a
+  // clock skew issue or a header bug we shouldn't honour.
+  if (ms - Date.now() > 5 * 60_000) return null;
+  return ms;
+}
+
 function sleep(ms: number): Promise<void> {
   return new Promise((r) => setTimeout(r, ms));
 }