From 2488db42106b1b711f74f29d811f497fe973d140 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 12:57:50 -0500 Subject: [PATCH 01/24] docs: design for agent rate-limit and session-cap handling Shared classifier in a new internal/agentlimit package, used by the daemon worker (preserves cooldown + failover for daemon jobs) and the foreground roborev fix loop (strict abort with reset time). Co-Authored-By: Claude Opus 4.7 (1M context) --- ...-05-05-agent-rate-limit-handling-design.md | 315 ++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md diff --git a/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md b/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md new file mode 100644 index 00000000..190a285a --- /dev/null +++ b/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md @@ -0,0 +1,315 @@ +# Agent Rate-Limit and Session-Cap Handling + +## Problem + +Long-running `roborev fix` sessions keep iterating fruitlessly when the +configured agent hits its session-level rate limit (e.g. Claude Code's +5-hour usage cap). Every subsequent job hits the same wall and the loop +prints warnings without aborting. + +The daemon worker already has a quota-detection layer (`isQuotaError` in +`internal/daemon/worker.go`, plus per-agent in-process cooldown), but it +has two gaps: + +1. The pattern set only catches Gemini-style ("exhausted your capacity") + and Codex-style ("resource exhausted") wording. Claude's session-cap + message is not recognized, so it falls through to generic + transient-retry and then to a plain failure. +2. `roborev fix` runs the fix agent in-process via `fixJobDirect` + (`cmd/roborev/fix.go:262`). It does not go through the daemon worker + at all, so even a perfect daemon-side classifier would not help the + foreground loop. + +## Goals + +- Detect agent-level rate-limit and session-cap failures consistently in + both the daemon worker and the foreground `roborev fix` loop. +- For the foreground fix loop, abort the entire session immediately when + the configured agent hits a session-cap, surfacing the reset time (or + a conservative fallback message) so the user knows when to retry. +- Preserve the daemon's existing behavior for daemon-owned review jobs: + cooldown the agent, retry, then fail over to the configured backup + agent if one is set. CI flows benefit from best-effort completion. +- Keep the detection table conservative: high-confidence patterns only, + one unit test per pattern, with logging for unmatched non-zero exits + so new variants can be observed and added incrementally. + +## Non-goals + +- Auto-failover for foreground `roborev fix`. Strict abort is the default; + if user demand surfaces, an `--auto-failover` flag or config setting can + be added later. +- Cross-machine coordination of cooldowns (the existing in-memory cooldown + is per-daemon-process; that is unchanged). +- Surfacing daemon cooldown state to the CLI through a new API. The + classifier is shared via a Go package import, not via HTTP. + +## Architecture + +### New package: `internal/agentlimit` + +A small, dependency-free package shared by the daemon worker and the CLI +fix loop. Pure classification logic plus duration parsing — no I/O, no +process state. + +```go +package agentlimit + +type Kind int + +const ( + KindNone Kind = iota // no rate-limit signal + KindTransient // 429-style; retry locally, no cooldown + KindQuota // hard quota exhaustion (existing behavior) + KindSession // session-level cap (e.g. Claude 5-hour) +) + +type Classification struct { + Kind Kind + Agent string // canonical agent name (alias-resolved by caller) + ResetAt time.Time // zero if not parseable from the message + CooldownFor time.Duration // fallback when ResetAt is zero + Message string // raw error text (for logs / user display) +} + +// Classify inspects an agent error message for rate-limit signatures. +// agent is the canonical agent name (caller resolves aliases). Returns +// KindNone when no signature matches. +func Classify(agent, errMsg string) Classification +``` + +The pattern table lives in the package as a slice of per-agent rules: + +```go +type rule struct { + Agents []string // agents this rule applies to ("*" = any) + Pattern *regexp.Regexp // or substring (case-insensitive) + Kind Kind + ParseReset func(match []string) (time.Time, time.Duration) +} +``` + +Day-1 rules — copied verbatim from the existing `isQuotaError` set so +behavior for Gemini and Codex is unchanged: + +| Agent | Pattern (case-insensitive) | Kind | Reset parsing | +|-------|----------------------------|------|---------------| +| any | `resource exhausted` | Quota | (none) | +| any | `quota exhausted` / `quota_exhausted` | Quota | (none) | +| any | `exhausted your capacity` | Quota | `reset after ` if present | +| any | `capacity exhausted` / `capacity_exhausted` | Quota | (none) | + +The `Transient` kind exists in the API and is exercised in tests, but +no production rule produces it on day one. Adding a generic +`rate.?limit` pattern was considered and rejected: it overmatches benign +errors and the existing in-call retry layers (`runStreamingCLI` and the +agents' own retry logic) already handle short-lived 429s. + +Claude-specific rules are deliberately omitted at launch — see +[Detection without a captured Claude message](#detection-without-a-captured-claude-message). + +The duration parser currently inline in `worker.go` (the helper that +extracts a cooldown duration from the gemini "reset after Xs" wording) +moves into this package as `parseResetDuration(msg) time.Duration`, so +both consumers share it. + +### Daemon worker (`internal/daemon/worker.go`) changes + +The existing `isQuotaError` helper (line 974) and inline duration parsing +are replaced with calls to `agentlimit.Classify`. Behavior is preserved: + +- `KindQuota` or `KindSession`: cooldown the canonical agent until + `ResetAt` (or `now + CooldownFor`, defaulting to 30m if both are zero). + Continue with normal retry; on retry exhaustion, fail over to the + configured backup agent if one exists. This keeps CI / daemon-owned + review flows best-effort, as today. +- `KindTransient`: do not cooldown; let normal retry handle it. +- `KindNone`: unchanged (generic failure path). + +The `cooldownAgent`/`isAgentInCooldown` helpers stay where they are — +only the *trigger* moves to the shared classifier. + +### CLI fix loop (`cmd/roborev/fix.go`) changes + +`fixJobDirect` returns errors from `params.Agent.Review()` as today. +`runFixOpen` (line 456) and `runFixBatch` (line 1000) are the loop +drivers — both call `fixJobDirect` (directly or via `fixSingleJob`) and +process the result. + +After each `fixJobDirect` error, the caller runs: + +```go +cls := agentlimit.Classify(canonicalAgent(params.Agent.Name()), err.Error()) +switch cls.Kind { +case agentlimit.KindSession, agentlimit.KindQuota: + // Strict abort. No auto-failover for foreground fix. + return newAgentLimitError(cls) +case agentlimit.KindTransient, agentlimit.KindNone: + // Existing per-job error handling (warn-and-continue in discovery + // mode, return error in explicit-IDs mode). +} +``` + +`newAgentLimitError` formats a user-visible message: + +``` +agent claude-code hit a session limit. Cooldown until 5:42 PM (in 2h 13m). +Re-run `roborev fix ...` after that, or pass --agent to switch. +``` + +If `ResetAt` is zero, fall back to `CooldownFor` (or "an unknown reset +time; try again later" if that is also zero), and the suggestion to +override `--agent` stands. + +The CLI exits non-zero with the standard `1` exit code — no new exit +code. Scripts that want to detect this case can grep for the message. + +### Detection without a captured Claude message + +We have not captured Claude Code's actual session-cap output yet +(checked `~/.roborev/errors.log`, no Claude-tagged limit messages). +Adding a guess-pattern risks both false negatives (wording differs +slightly from what we guessed) and false positives (a benign error +mentioning the word "limit" gets classified as a hard cap). + +The framework ships without a Claude-specific rule. Two safeguards make +the next observed cap actionable: + +1. **Unmatched non-zero exit logging.** When the daemon worker or CLI + fix loop receives an agent error with `Classify().Kind == KindNone`, + log a single WARN line containing the agent name and the first 200 + chars of the error message. This produces a recoverable sample the + moment the next cap fires. +2. **Test-only mock pattern.** The unit tests include a synthetic + "claude session limit" pattern fixture so the `KindSession` branch + has full test coverage end-to-end (worker cooldown path and CLI + abort path). Real Claude patterns get added to the production rule + table once captured. + +Once a real Claude message is observed, adding the rule is a one-line +table change plus a unit test fixture. + +### Reset-time parsing + +Two formats need to be supported in the shared parser: + +- Relative duration: `reset after 48m20s`, `try again in 2h13m`, + `retry after 5m`. Existing worker.go logic handles this for Gemini — + it moves into `agentlimit.parseResetDuration` verbatim. +- Absolute time: `resets at 5:42 PM`, `try again at 17:42 UTC`. New + parser, returns `time.Time` interpreted in the local timezone with a + same-day or next-day disambiguation (next-day if the parsed time is + already in the past). + +If neither format matches, both `ResetAt` and `CooldownFor` are zero — +the caller (worker or CLI) applies its own default fallback (30m +cooldown for the daemon, "unknown reset time" message for the CLI). + +## Data flow + +``` +Foreground: roborev fix → fixJobDirect → Agent.Review → error + │ + ▼ + agentlimit.Classify + │ + ┌───────────┴───────────┐ + ▼ ▼ + KindSession/Quota KindTransient/None + │ │ + ▼ ▼ + Abort fix loop with Continue per existing + reset-time error per-job handling + +Daemon: worker job → Agent.Review → error + │ + ▼ + agentlimit.Classify + │ + ┌──────────┴──────────┐ + ▼ ▼ + KindSession/Quota KindTransient/None + │ │ + ▼ ▼ + cooldownAgent(...) Existing retry path + + retry + failover + (existing behavior) +``` + +## Testing strategy + +### `internal/agentlimit` unit tests + +Table-driven, one row per pattern. Each row asserts: + +- The expected `Kind`. +- The expected `Agent` value. +- The expected `ResetAt` / `CooldownFor` for messages that include reset + info, and zero values for those that do not. + +Negative cases: messages that should *not* match (e.g. an unrelated +error containing the word "limit") map to `KindNone`. + +### Worker integration test + +A new test in `internal/daemon/worker_test.go` exercises the +end-to-end daemon path: + +- Configure a worker pool with a mock agent that returns a session-limit + error message on first call. +- Enqueue a job. Run a worker tick. +- Assert: the agent is now in cooldown, the cooldown expiry matches the + parsed reset, and a subsequent job for the same agent is skipped per + existing `(quota cooldown active)` logic. + +This covers the daemon-side wiring without depending on Claude's exact +wording — the test pattern is registered as a fixture in the test file. + +### CLI fix abort test + +A new test in `cmd/roborev/fix_test.go` (or `fix_mock_test.go`): + +- Use the existing `test` agent infrastructure to inject an error that + classifies as `KindSession`. +- Run `runFixBatch` with two job IDs. +- Assert: the loop aborts after the first job, the returned error + surfaces the parsed reset time, and the second job was *not* + processed. + +### Logging behavior + +A small assertion in either the worker or CLI test confirms that an +unmatched non-zero exit (`KindNone`) emits a WARN with a truncated +error preview, so the unmatched-pattern surfacing path is covered. + +## Migration / rollout + +- The change is internal (no public API surface). No flag-gating, no + config opt-in. +- `isQuotaError` becomes a thin wrapper over `agentlimit.Classify` if + any tests still reference it directly; otherwise it is removed in the + same PR. +- Existing behavior for Gemini and Codex is preserved bit-for-bit by + copying their patterns verbatim into the new rule table — verified + by a regression test that runs each known-failing message through + the new classifier and checks it still returns `KindQuota`. + +## Risks and mitigations + +| Risk | Mitigation | +|------|------------| +| Pattern matching is brittle; agents change wording. | Conservative patterns + unit tests + WARN log on unmatched non-zero exits so new variants surface and can be added one-by-one. | +| Misclassifying a benign error as `KindSession` aborts the user's fix loop unnecessarily. | Patterns require specific phrasing (e.g. "exhausted", "session limit"), not generic words like "limit" or "rate". Negative test cases lock this in. | +| Reset-time parsing produces a wrong `time.Time` (e.g. timezone mishandling). | Local-timezone interpretation with explicit same-day/next-day disambiguation, plus unit tests for boundary cases. | +| The CLI abort message is unactionable if neither `ResetAt` nor `CooldownFor` is parseable. | Fallback message includes the raw agent error text and the `--agent` override hint, so the user has both context and a way out. | + +## Out of scope (potential follow-ups) + +- `--auto-failover` flag for `roborev fix` to mirror daemon failover. +- A `roborev status` field showing in-cooldown agents and their + expiries. +- Persisting cooldown state across daemon restarts (today it is purely + in-memory). +- Adding rate-limit handling for the CI poller and `roborev refine` + paths beyond what they inherit transitively from the daemon and the + fix loop. From 2da309906de12baea516ce90aa2e0d453cb3b851 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 13:21:42 -0500 Subject: [PATCH 02/24] docs: revise rate-limit spec per review Corrects daemon retry semantics (quota errors skip retries entirely and go straight to failoverOrFail), completes the day-1 pattern table to match the existing isQuotaError set bit-for-bit, adds an unexported classifyWithRules helper plus injectable classifier function on WorkerPool / fix-loop options for test isolation, and clarifies that PR 1 ships the framework while a Claude-specific rule follows once a real session-cap message is captured. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...-05-05-agent-rate-limit-handling-design.md | 172 +++++++++++++----- 1 file changed, 129 insertions(+), 43 deletions(-) diff --git a/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md b/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md index 190a285a..7310decb 100644 --- a/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md +++ b/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md @@ -1,5 +1,16 @@ # Agent Rate-Limit and Session-Cap Handling +## Scope + +This spec covers the foundation: a shared classifier package, unmatched +non-zero exit logging, daemon-worker integration that preserves current +behavior, and CLI fix-loop abort plumbing wired up against a synthetic +test fixture. Adding a production Claude session-cap pattern is an +explicit follow-up, taken once a real Claude limit message is captured +(see [Detection without a captured Claude message](#detection-without-a-captured-claude-message)). +The first PR ships the framework end-to-end; Claude detection is a +one-line table addition plus a unit test in a follow-up PR. + ## Problem Long-running `roborev fix` sessions keep iterating fruitlessly when the @@ -27,9 +38,12 @@ has two gaps: - For the foreground fix loop, abort the entire session immediately when the configured agent hits a session-cap, surfacing the reset time (or a conservative fallback message) so the user knows when to retry. -- Preserve the daemon's existing behavior for daemon-owned review jobs: - cooldown the agent, retry, then fail over to the configured backup - agent if one is set. CI flows benefit from best-effort completion. +- Preserve the daemon's existing behavior for daemon-owned review jobs + bit-for-bit: on a quota- or session-class error, cooldown the agent + and immediately fail over to a configured backup (or fail the job if + none) — no retries, matching today's `isQuotaError` branch in + `worker.go`. CI flows continue to benefit from best-effort completion + via failover. - Keep the detection table conservative: high-confidence patterns only, one unit test per pattern, with logging for unmatched non-zero exits so new variants can be observed and added incrementally. @@ -76,6 +90,12 @@ type Classification struct { // agent is the canonical agent name (caller resolves aliases). Returns // KindNone when no signature matches. func Classify(agent, errMsg string) Classification + +// classifyWithRules is the same as Classify but accepts an explicit +// rule slice. Unexported; used inside the package's own tests so that +// fixtures (e.g. a synthetic KindSession pattern) do not leak into the +// production defaultRules slice. +func classifyWithRules(agent, errMsg string, rules []rule) Classification ``` The pattern table lives in the package as a slice of per-agent rules: @@ -89,15 +109,27 @@ type rule struct { } ``` -Day-1 rules — copied verbatim from the existing `isQuotaError` set so -behavior for Gemini and Codex is unchanged: - -| Agent | Pattern (case-insensitive) | Kind | Reset parsing | -|-------|----------------------------|------|---------------| -| any | `resource exhausted` | Quota | (none) | -| any | `quota exhausted` / `quota_exhausted` | Quota | (none) | -| any | `exhausted your capacity` | Quota | `reset after ` if present | -| any | `capacity exhausted` / `capacity_exhausted` | Quota | (none) | +Day-1 rules — copied verbatim from the current `isQuotaError` +(`internal/daemon/worker.go:974`) so detection for Gemini and Codex is +byte-for-byte unchanged. The full set of nine substrings (case-insensitive, +matched via `strings.Contains`): + +| Pattern | Kind | Reset parsing | +|---------|------|---------------| +| `resource exhausted` | Quota | `reset after ` if present | +| `quota exceeded` | Quota | same | +| `quota_exceeded` | Quota | same | +| `quota exhausted` | Quota | same | +| `quota_exhausted` | Quota | same | +| `insufficient_quota` | Quota | same | +| `exhausted your capacity` | Quota | same | +| `capacity exhausted` | Quota | same | +| `capacity_exhausted` | Quota | same | + +Reset parsing applies uniformly: any matching message is run through +`parseResetDuration` (today's `parseQuotaCooldown`, moved into the +shared package) so a `reset after X` substring produces `CooldownFor` +when present, regardless of which pattern matched. The `Transient` kind exists in the API and is exercised in tests, but no production rule produces it on day one. Adding a generic @@ -116,19 +148,28 @@ both consumers share it. ### Daemon worker (`internal/daemon/worker.go`) changes The existing `isQuotaError` helper (line 974) and inline duration parsing -are replaced with calls to `agentlimit.Classify`. Behavior is preserved: +are replaced with calls to `agentlimit.Classify`. Behavior is preserved +exactly — quota-class errors skip retries entirely: - `KindQuota` or `KindSession`: cooldown the canonical agent until - `ResetAt` (or `now + CooldownFor`, defaulting to 30m if both are zero). - Continue with normal retry; on retry exhaustion, fail over to the - configured backup agent if one exists. This keeps CI / daemon-owned - review flows best-effort, as today. -- `KindTransient`: do not cooldown; let normal retry handle it. -- `KindNone`: unchanged (generic failure path). + `ResetAt` (or `now + CooldownFor`, defaulting to 30m if both are zero), + then immediately call `failoverOrFail`. Retries are skipped — this + matches today's `isQuotaError` branch at `worker.go:792-798`. + CI / daemon-owned review flows continue to benefit from failover when + a backup agent is configured. +- `KindTransient`: do not cooldown; fall through to the normal retry + path (`failOrRetryAgent` semantics unchanged). +- `KindNone`: unchanged (generic retry-then-fail-or-failover path). The `cooldownAgent`/`isAgentInCooldown` helpers stay where they are — only the *trigger* moves to the shared classifier. +Implementation note: rather than hard-coding `agentlimit.Classify`, the +`WorkerPool` struct holds a `classify func(agent, errMsg string) agentlimit.Classification` +field, initialized to `agentlimit.Classify` at construction. Tests in +this package can substitute a stub for deterministic +`KindSession`/`KindQuota` outcomes without touching `defaultRules`. + ### CLI fix loop (`cmd/roborev/fix.go`) changes `fixJobDirect` returns errors from `params.Agent.Review()` as today. @@ -136,10 +177,13 @@ only the *trigger* moves to the shared classifier. drivers — both call `fixJobDirect` (directly or via `fixSingleJob`) and process the result. -After each `fixJobDirect` error, the caller runs: +After each `fixJobDirect` error, the caller runs the classifier and +branches on the result. The fix loop holds a classifier function +(default `agentlimit.Classify`, swappable in tests) on its options/ +context struct so tests can drive deterministic outcomes: ```go -cls := agentlimit.Classify(canonicalAgent(params.Agent.Name()), err.Error()) +cls := opts.classify(canonicalAgent(params.Agent.Name()), err.Error()) switch cls.Kind { case agentlimit.KindSession, agentlimit.KindQuota: // Strict abort. No auto-failover for foreground fix. @@ -180,11 +224,21 @@ the next observed cap actionable: log a single WARN line containing the agent name and the first 200 chars of the error message. This produces a recoverable sample the moment the next cap fires. -2. **Test-only mock pattern.** The unit tests include a synthetic - "claude session limit" pattern fixture so the `KindSession` branch - has full test coverage end-to-end (worker cooldown path and CLI - abort path). Real Claude patterns get added to the production rule - table once captured. +2. **Test-only mock pattern via injectable rules.** The package's + public API is a single `Classify(agent, errMsg)` function backed by + a private `defaultRules` slice. To exercise the `KindSession` branch + end-to-end without polluting the production rule table, the package + also exposes an unexported `classifyWithRules(agent, errMsg, rules)` + helper. Tests in the same package call `classifyWithRules` with a + synthetic rule slice (e.g. one that maps a fake "test-claude session + limit" message to `KindSession`); production callers always go + through `Classify` and only see `defaultRules`. Tests in other + packages that need to drive a `KindSession` outcome receive a + pre-built `Classification` struct from a small test helper exported + for that purpose, rather than mutating any global state. + + Real Claude patterns get added to the production rule table once + captured. Once a real Claude message is observed, adding the rule is a one-line table change plus a unit test fixture. @@ -231,9 +285,12 @@ Daemon: worker job → Agent.Review → error KindSession/Quota KindTransient/None │ │ ▼ ▼ - cooldownAgent(...) Existing retry path - + retry + failover - (existing behavior) + cooldownAgent(...) Existing retry path + then immediate (failOrRetryAgent + failoverOrFail with up to maxRetries, + (no retries — then failoverOrFail) + matches today's + quota branch) ``` ## Testing strategy @@ -253,24 +310,38 @@ error containing the word "limit") map to `KindNone`. ### Worker integration test A new test in `internal/daemon/worker_test.go` exercises the -end-to-end daemon path: +end-to-end daemon path. To avoid depending on Claude's exact wording +or mutating the production rule table, the worker code calls into a +small package-internal entry point that takes a classifier function +(default: `agentlimit.Classify`); the test substitutes a stub +classifier that returns a `KindSession` `Classification` for the test +agent's error message. - Configure a worker pool with a mock agent that returns a session-limit - error message on first call. + error string and a stub classifier that maps that string to + `KindSession` with a known `ResetAt`. - Enqueue a job. Run a worker tick. - Assert: the agent is now in cooldown, the cooldown expiry matches the - parsed reset, and a subsequent job for the same agent is skipped per - existing `(quota cooldown active)` logic. + stub's `ResetAt`, retries did *not* happen (`retry_count` stays at 0), + and a subsequent job for the same agent is skipped per existing + `(quota cooldown active)` logic. -This covers the daemon-side wiring without depending on Claude's exact -wording — the test pattern is registered as a fixture in the test file. +A second test confirms the byte-for-byte preservation of today's quota +behavior: each of the nine substrings from the original `isQuotaError` +list is fed in as an error message; the test asserts each produces +`KindQuota` and that the worker takes the cooldown-then-failover path +without retries. ### CLI fix abort test -A new test in `cmd/roborev/fix_test.go` (or `fix_mock_test.go`): +A new test in `cmd/roborev/fix_test.go` (or `fix_mock_test.go`) uses +the same classifier-injection pattern: the fix-loop call site takes a +classifier function from a package-private factory that defaults to +`agentlimit.Classify`, and the test substitutes a stub. -- Use the existing `test` agent infrastructure to inject an error that - classifies as `KindSession`. +- Use the existing `test` agent infrastructure to inject an error. +- Stub classifier maps the error to `KindSession` with a known + `ResetAt`. - Run `runFixBatch` with two job IDs. - Assert: the loop aborts after the first job, the returned error surfaces the parsed reset time, and the second job was *not* @@ -284,15 +355,30 @@ error preview, so the unmatched-pattern surfacing path is covered. ## Migration / rollout +The work splits into two PRs: + +**PR 1 — Framework (this spec):** shared classifier package, unmatched +non-zero exit logging, daemon-worker integration that preserves +existing behavior bit-for-bit, and CLI fix-loop abort plumbing +exercised by a synthetic test fixture. No behavior change for any +currently-supported agent. + +**PR 2 — Claude detection (follow-up):** add the production Claude +session-cap rule plus a unit test fixture, once a real session-limit +message has been captured (either by triggering one in normal use, or +by a contributor who has hit one previously). + +Other migration notes: + - The change is internal (no public API surface). No flag-gating, no config opt-in. - `isQuotaError` becomes a thin wrapper over `agentlimit.Classify` if - any tests still reference it directly; otherwise it is removed in the - same PR. + any tests still reference it directly; otherwise it is removed in + PR 1. - Existing behavior for Gemini and Codex is preserved bit-for-bit by - copying their patterns verbatim into the new rule table — verified - by a regression test that runs each known-failing message through - the new classifier and checks it still returns `KindQuota`. + copying all nine patterns from `isQuotaError` verbatim into the new + rule table — verified by the regression test described above that + runs each pattern through the new classifier. ## Risks and mitigations From 4e998553bdbbf1e2d14a27fc0e10caf783b6e594 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 13:32:01 -0500 Subject: [PATCH 03/24] docs: tighten rollout and test-helper wording in rate-limit spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "no behavior change" claim was wrong: foreground roborev fix gains strict abort for KindQuota in PR 1, which changes outcomes for Gemini and Codex sessions too. Spelled out explicitly. Also dropped the proposed exported test helper — Classification and the Kind* constants are already exported, so cross-package tests can return a literal struct from their stub classifier without extra plumbing. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...-05-05-agent-rate-limit-handling-design.md | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md b/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md index 7310decb..cb6fb5e3 100644 --- a/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md +++ b/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md @@ -229,13 +229,18 @@ the next observed cap actionable: a private `defaultRules` slice. To exercise the `KindSession` branch end-to-end without polluting the production rule table, the package also exposes an unexported `classifyWithRules(agent, errMsg, rules)` - helper. Tests in the same package call `classifyWithRules` with a + helper. Tests inside `agentlimit` call `classifyWithRules` with a synthetic rule slice (e.g. one that maps a fake "test-claude session limit" message to `KindSession`); production callers always go - through `Classify` and only see `defaultRules`. Tests in other - packages that need to drive a `KindSession` outcome receive a - pre-built `Classification` struct from a small test helper exported - for that purpose, rather than mutating any global state. + through `Classify` and only see `defaultRules`. + + Tests in `internal/daemon` and `cmd/roborev` need a different + mechanism — they can't reach an unexported helper. They use the + injected `classify` function on `WorkerPool` / fix-loop options + (described below) and have their stub return a literal + `agentlimit.Classification{Kind: agentlimit.KindSession, ResetAt: ...}`. + `Classification` and the `Kind*` constants are already exported, so + no extra test helper is needed. Real Claude patterns get added to the production rule table once captured. @@ -358,10 +363,26 @@ error preview, so the unmatched-pattern surfacing path is covered. The work splits into two PRs: **PR 1 — Framework (this spec):** shared classifier package, unmatched -non-zero exit logging, daemon-worker integration that preserves -existing behavior bit-for-bit, and CLI fix-loop abort plumbing -exercised by a synthetic test fixture. No behavior change for any -currently-supported agent. +non-zero exit logging, daemon-worker integration, and CLI fix-loop +abort plumbing exercised by a synthetic test fixture. + +Behavior changes in PR 1: + +- **Daemon worker (no change):** quota detection for Gemini and Codex + is preserved bit-for-bit. The same nine `isQuotaError` substrings + produce the same cooldown-then-failover-or-fail outcome. +- **Foreground `roborev fix` (intentional change):** today, when a + Gemini/Codex job hits a quota error mid-session, the fix loop logs + a per-job warning and continues (in discovery mode) or returns an + error for that one job (in explicit-IDs mode), then proceeds to the + next job — which usually hits the same wall. After PR 1, any error + matching a `KindQuota` (or future `KindSession`) rule on the + configured fix agent aborts the entire session immediately with a + reset-time message. This is the central improvement of the spec; it + applies to existing Codex/Gemini quota errors as well as future + Claude session caps. Users running `roborev fix` against a + quota-exhausted Gemini or Codex backend will see strict abort + instead of warn-and-continue starting in PR 1. **PR 2 — Claude detection (follow-up):** add the production Claude session-cap rule plus a unit test fixture, once a real session-limit From aba56aac53bf6a163f51634e615f8a5c8542fe52 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 13:45:09 -0500 Subject: [PATCH 04/24] docs: implementation plan for agent rate-limit and session-cap handling Eleven TDD-driven tasks splitting the work into a shared agentlimit package, daemon-worker integration that preserves existing Gemini/Codex behavior bit-for-bit, and CLI fix-loop abort plumbing exercised against an injected stub classifier. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-05-agent-rate-limit-handling.md | 1446 +++++++++++++++++ 1 file changed, 1446 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md diff --git a/docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md b/docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md new file mode 100644 index 00000000..67734b3e --- /dev/null +++ b/docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md @@ -0,0 +1,1446 @@ +# Agent Rate-Limit and Session-Cap Handling Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a shared rate-limit classifier so the daemon worker keeps its current cooldown-then-failover behavior for Gemini/Codex quota errors and the foreground `roborev fix` loop aborts with a reset-time message instead of looping fruitlessly when the configured agent hits a quota or session cap. + +**Architecture:** A new `internal/agentlimit` package owns the detection logic — pattern table, kind enum, reset parsing — with no I/O. The daemon worker calls it via a swappable `classify` field on `WorkerPool` (default `agentlimit.Classify`), and the CLI fix loop calls it via a new `classify` field on `fixOptions`. Tests inject stubs that return literal `agentlimit.Classification` values for deterministic outcomes. + +**Tech Stack:** Go 1.x stdlib only (no new dependencies), testify (`require`/`assert`) for test assertions per project conventions. + +**Spec:** `docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md` + +--- + +## File Structure + +**Create:** +- `internal/agentlimit/agentlimit.go` — `Kind`, `Classification`, `rule`, `defaultRules`, `Classify`, `classifyWithRules`, helper `ruleAppliesToAgent` +- `internal/agentlimit/parse.go` — `ParseResetDuration`, `ParseResetTime`, range-clamp helpers +- `internal/agentlimit/agentlimit_test.go` — table tests for `Classify` (nine production patterns + negatives), test for `classifyWithRules` with synthetic `KindSession` rule +- `internal/agentlimit/parse_test.go` — table tests for both parsers + +**Modify:** +- `internal/daemon/worker.go` — add `classify` field on `WorkerPool`; replace `isQuotaError(...)` and `parseQuotaCooldown(...)` call sites at lines 792-796 with `wp.classify(...)`; add unmatched-error WARN log; delete the now-unused `isQuotaError` and `parseQuotaCooldown` helpers (and their tests in `worker_test.go`) +- `internal/daemon/worker_test.go` — drop `TestIsQuotaError`/`TestParseQuotaCooldown` (covered in agentlimit package); add `TestFailOrRetryInner_SessionLimitCoolsDownAndFailsOver` driven by injected classifier; add `TestFailOrRetryInner_QuotaPatternRegression` covering all nine substrings end-to-end +- `cmd/roborev/fix.go` — add `classify agentlimit.ClassifyFunc` and `nowFn func() time.Time` fields on `fixOptions` (defaults wired in `NewFixCmd`); add `agentLimitError` type with `Error()` formatter; in `fixSingleJob` and `runFixBatch`, after each `fixJobDirect` error, classify and either return the abort error (KindQuota/KindSession) or log a WARN (KindNone with non-empty error) before the existing per-job handling +- `cmd/roborev/fix_test.go` — add `TestFixSingleJobAbortsOnSessionLimit` and `TestRunFixBatchAbortsOnQuotaError` using a stub classifier injected via `fixOptions` + +**Note on the daemon-vs-foreground distinction (load-bearing):** the daemon worker keeps its existing behavior for Gemini/Codex (cooldown → immediate failover-or-fail, no retries). The foreground fix loop is where the new abort behavior lives — it applies to KindQuota too, not just future KindSession, and that change is intentional (spec migration section). + +--- + +## Task 1: Create `agentlimit` package skeleton with types + +**Files:** +- Create: `internal/agentlimit/agentlimit.go` +- Test: `internal/agentlimit/agentlimit_test.go` + +- [ ] **Step 1: Write failing test for type round-trip** + +```go +// internal/agentlimit/agentlimit_test.go +package agentlimit + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestClassificationZeroValue(t *testing.T) { + var c Classification + assert := assert.New(t) + assert.Equal(KindNone, c.Kind) + assert.Equal("", c.Agent) + assert.True(c.ResetAt.IsZero()) + assert.Equal(time.Duration(0), c.CooldownFor) + assert.Equal("", c.Message) +} +``` + +- [ ] **Step 2: Run test, confirm compile failure** + +Run: `go test ./internal/agentlimit/ -run TestClassificationZeroValue` +Expected: FAIL — `package agentlimit` does not exist. + +- [ ] **Step 3: Create the package with types** + +```go +// internal/agentlimit/agentlimit.go +// Package agentlimit classifies agent CLI error messages as rate-limit, +// quota, or session-cap signals so the daemon worker and the foreground +// roborev fix loop can react consistently. Pure logic — no I/O. +package agentlimit + +import "time" + +// Kind labels a classified agent error. +type Kind int + +const ( + KindNone Kind = iota // no rate-limit signal recognized + KindTransient // 429-style; retry locally, no cooldown + KindQuota // hard quota exhaustion (Gemini/Codex today) + KindSession // session-level cap (e.g. Claude 5-hour) +) + +// Classification is the result of inspecting an agent error. +type Classification struct { + Kind Kind + Agent string // canonical agent name (caller resolves aliases) + ResetAt time.Time // zero if not parseable from the message + CooldownFor time.Duration // zero if not parseable; caller applies its own fallback + Message string // raw error text (for logs / user display) +} + +// ClassifyFunc is the function shape used by callers that want to inject +// a stub in tests. +type ClassifyFunc func(agent, errMsg string) Classification +``` + +- [ ] **Step 4: Run test, confirm pass** + +Run: `go test ./internal/agentlimit/ -run TestClassificationZeroValue -v` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add internal/agentlimit/agentlimit.go internal/agentlimit/agentlimit_test.go +git commit -m "feat(agentlimit): create package with Kind and Classification types" +``` + +--- + +## Task 2: Implement `ParseResetDuration` + +**Files:** +- Create: `internal/agentlimit/parse.go` +- Test: `internal/agentlimit/parse_test.go` + +- [ ] **Step 1: Write failing tests for relative-duration parsing** + +```go +// internal/agentlimit/parse_test.go +package agentlimit + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestParseResetDuration(t *testing.T) { + cases := []struct { + name string + msg string + want time.Duration + }{ + {"none", "agent failed", 0}, + {"reset after seconds clamped to min", "quota will reset after 5s.", 1 * time.Minute}, + {"reset after minutes", "quota will reset after 48m20s.", 48*time.Minute + 20*time.Second}, + {"reset after hours", "reset after 2h13m.", 2*time.Hour + 13*time.Minute}, + {"reset after with trailing punct", "reset after 1h.. retrying", 1 * time.Hour}, + {"reset after invalid", "reset after notaduration", 0}, + {"reset after negative", "reset after -5m", 0}, + {"reset after huge clamped to max", "reset after 100h", 24 * time.Hour}, + {"case insensitive", "RESET AFTER 30m", 30 * time.Minute}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, ParseResetDuration(tc.msg)) + }) + } +} +``` + +- [ ] **Step 2: Run tests, confirm compile failure** + +Run: `go test ./internal/agentlimit/ -run TestParseResetDuration` +Expected: FAIL — `ParseResetDuration` undefined. + +- [ ] **Step 3: Implement `ParseResetDuration`** + +This lifts the logic from `internal/daemon/worker.go:1015` (`parseQuotaCooldown`), with the difference that it returns `0` instead of a caller-supplied fallback when nothing parses. + +```go +// internal/agentlimit/parse.go +package agentlimit + +import ( + "strings" + "time" +) + +// minCooldown / maxCooldown clamp parsed durations to a sane range so a +// pathological message ("reset after 1ms" or "reset after 100h") does not +// produce a useless cooldown. +const ( + minCooldown = 1 * time.Minute + maxCooldown = 24 * time.Hour +) + +// ParseResetDuration extracts a Go-format duration from a "reset after +// " substring in errMsg (case-insensitive). Returns 0 if no such +// substring is present or the duration is unparseable. Clamps positive +// values to [minCooldown, maxCooldown]. +func ParseResetDuration(errMsg string) time.Duration { + lower := strings.ToLower(errMsg) + idx := strings.Index(lower, "reset after ") + if idx < 0 { + return 0 + } + rest := errMsg[idx+len("reset after "):] + token := rest + if sp := strings.IndexAny(rest, " \t\n,;)"); sp > 0 { + token = rest[:sp] + } + token = strings.TrimRight(token, ".,;:)]}") + d, err := time.ParseDuration(token) + if err != nil || d <= 0 { + return 0 + } + if d < minCooldown { + return minCooldown + } + if d > maxCooldown { + return maxCooldown + } + return d +} +``` + +- [ ] **Step 4: Run tests, confirm pass** + +Run: `go test ./internal/agentlimit/ -run TestParseResetDuration -v` +Expected: PASS for all nine cases. + +- [ ] **Step 5: Commit** + +```bash +git add internal/agentlimit/parse.go internal/agentlimit/parse_test.go +git commit -m "feat(agentlimit): port reset-duration parsing from worker.parseQuotaCooldown" +``` + +--- + +## Task 3: Implement `ParseResetTime` + +**Files:** +- Modify: `internal/agentlimit/parse.go` +- Modify: `internal/agentlimit/parse_test.go` + +This adds an absolute-time parser for messages that say "resets at 5:42 PM" / "try again at 17:42 UTC". No production rule uses it day 1 — Gemini and Codex emit relative durations — but the framework supports it so adding a Claude rule later is a one-line table change without parser work. + +- [ ] **Step 1: Write failing tests for absolute-time parsing** + +```go +// internal/agentlimit/parse_test.go — append below TestParseResetDuration +func TestParseResetTime(t *testing.T) { + // Use a fixed "now" so same-day vs next-day rollover is deterministic. + loc := time.FixedZone("test", 0) + now := time.Date(2026, 5, 5, 12, 0, 0, 0, loc) // noon UTC + cases := []struct { + name string + msg string + wantErr bool + want time.Time + }{ + {"none", "agent failed", true, time.Time{}}, + { + name: "resets at later today", + msg: "limit resets at 5:42 PM", + want: time.Date(2026, 5, 5, 17, 42, 0, 0, loc), + }, + { + name: "try again at later today 24h", + msg: "try again at 17:42", + want: time.Date(2026, 5, 5, 17, 42, 0, 0, loc), + }, + { + name: "resets at earlier today rolls to next day", + msg: "limit resets at 9:00 AM", + want: time.Date(2026, 5, 6, 9, 0, 0, 0, loc), + }, + { + name: "case insensitive", + msg: "LIMIT RESETS AT 6:00 pm", + want: time.Date(2026, 5, 5, 18, 0, 0, 0, loc), + }, + {"unparseable token", "resets at moonrise", true, time.Time{}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := parseResetTimeAt(tc.msg, now) + if tc.wantErr { + assert.True(t, got.IsZero(), "expected zero time, got %v", got) + return + } + assert.Equal(t, tc.want, got) + }) + } +} +``` + +- [ ] **Step 2: Run tests, confirm compile failure** + +Run: `go test ./internal/agentlimit/ -run TestParseResetTime` +Expected: FAIL — `parseResetTimeAt` undefined. + +- [ ] **Step 3: Implement `ParseResetTime` and `parseResetTimeAt`** + +Append to `internal/agentlimit/parse.go`: + +```go +// ParseResetTime extracts an absolute reset time from messages like +// "resets at 5:42 PM" or "try again at 17:42". Interprets the parsed +// clock time in the local timezone. Returns the zero time.Time if no +// recognized phrase is present or the time is unparseable. +// +// If the parsed clock time is earlier than now-on-the-same-day, the +// returned time rolls forward by 24 hours so callers that compute +// "time until reset" never get a negative duration. +func ParseResetTime(errMsg string) time.Time { + return parseResetTimeAt(errMsg, time.Now()) +} + +// parseResetTimeAt is ParseResetTime with an injectable clock for tests. +func parseResetTimeAt(errMsg string, now time.Time) time.Time { + lower := strings.ToLower(errMsg) + var idx int + switch { + case strings.Contains(lower, "resets at "): + idx = strings.Index(lower, "resets at ") + len("resets at ") + case strings.Contains(lower, "try again at "): + idx = strings.Index(lower, "try again at ") + len("try again at ") + default: + return time.Time{} + } + // Consume up to the next non-time character. Allow digits, ':', and + // AM/PM markers (with optional spaces before them). + rest := errMsg[idx:] + end := len(rest) + for i, r := range rest { + // Stop at sentence-ending punctuation or newline. + if r == '.' || r == ',' || r == ';' || r == '\n' || r == ')' { + end = i + break + } + } + token := strings.TrimSpace(rest[:end]) + + formats := []string{ + "3:04 PM", "3:04 pm", + "3:04PM", "3:04pm", + "15:04", + } + for _, f := range formats { + if t, err := time.Parse(f, token); err == nil { + candidate := time.Date( + now.Year(), now.Month(), now.Day(), + t.Hour(), t.Minute(), 0, 0, now.Location(), + ) + if candidate.Before(now) { + candidate = candidate.Add(24 * time.Hour) + } + return candidate + } + } + return time.Time{} +} +``` + +- [ ] **Step 4: Run tests, confirm pass** + +Run: `go test ./internal/agentlimit/ -run TestParseResetTime -v` +Expected: PASS for all six cases. + +- [ ] **Step 5: Commit** + +```bash +git add internal/agentlimit/parse.go internal/agentlimit/parse_test.go +git commit -m "feat(agentlimit): add absolute reset-time parser with same-day rollover" +``` + +--- + +## Task 4: Implement `Classify` and `classifyWithRules` with the nine production patterns + +**Files:** +- Modify: `internal/agentlimit/agentlimit.go` +- Modify: `internal/agentlimit/agentlimit_test.go` + +- [ ] **Step 1: Write failing tests for production patterns + negatives** + +Append to `internal/agentlimit/agentlimit_test.go`: + +```go +func TestClassifyProductionPatterns(t *testing.T) { + // All nine substrings from the original isQuotaError set must + // produce KindQuota. This is the byte-for-byte regression test + // for current Gemini and Codex detection. + patterns := []string{ + "resource exhausted", + "quota exceeded", + "quota_exceeded", + "quota exhausted", + "quota_exhausted", + "insufficient_quota", + "exhausted your capacity", + "capacity exhausted", + "capacity_exhausted", + } + for _, p := range patterns { + t.Run(p, func(t *testing.T) { + cls := Classify("gemini", "agent failed: "+p+", retrying...") + assert := assert.New(t) + assert.Equal(KindQuota, cls.Kind, "expected KindQuota for %q", p) + assert.Equal("gemini", cls.Agent) + assert.Contains(cls.Message, p) + }) + } +} + +func TestClassifyExtractsCooldownDuration(t *testing.T) { + cls := Classify( + "gemini", + "You have exhausted your capacity on this model. Your quota will reset after 48m20s.", + ) + assert := assert.New(t) + assert.Equal(KindQuota, cls.Kind) + assert.Equal(48*time.Minute+20*time.Second, cls.CooldownFor) + assert.True(cls.ResetAt.IsZero(), "no absolute time in this message") +} + +func TestClassifyNegativeCases(t *testing.T) { + cases := []struct { + name string + msg string + }{ + {"empty", ""}, + {"unrelated error", "exit status 1: file not found"}, + {"benign mention of limit", "limit set to 100 in config"}, + {"benign rate limit (transient, no rule produces it day 1)", "429 rate limit, retrying"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + cls := Classify("gemini", tc.msg) + assert.Equal(t, KindNone, cls.Kind, "expected KindNone for %q", tc.msg) + }) + } +} + +func TestClassifyWithRulesIsolatesSyntheticPattern(t *testing.T) { + // Synthetic rule used only inside this test — does not pollute defaultRules. + syntheticRules := []rule{ + {Agents: []string{"*"}, Substring: "test-claude session limit", Kind: KindSession}, + } + cls := classifyWithRules( + "claude-code", + "5-hour test-claude session limit reached", + syntheticRules, + ) + assert := assert.New(t) + assert.Equal(KindSession, cls.Kind) + assert.Equal("claude-code", cls.Agent) + + // Same message via the production Classify must not match — + // the synthetic rule is not in defaultRules. + cls2 := Classify("claude-code", "5-hour test-claude session limit reached") + assert.Equal(KindNone, cls2.Kind, "synthetic rule must not leak into defaultRules") +} +``` + +- [ ] **Step 2: Run tests, confirm compile failure** + +Run: `go test ./internal/agentlimit/` +Expected: FAIL — `Classify`, `classifyWithRules`, `rule` undefined. + +- [ ] **Step 3: Implement `Classify`, `classifyWithRules`, and `defaultRules`** + +Append to `internal/agentlimit/agentlimit.go`: + +```go +import "strings" + +// rule is one substring → kind mapping. The Agents slice restricts the +// rule to specific canonical agent names; "*" applies to any agent. +type rule struct { + Agents []string // canonical agent names; "*" = any + Substring string // case-insensitive substring match on the error message + Kind Kind +} + +// defaultRules is the production rule table. The nine quota substrings +// are copied from the original isQuotaError set in +// internal/daemon/worker.go so detection for Gemini and Codex is +// byte-for-byte unchanged. +// +// Claude session-cap patterns are intentionally absent — see the design +// doc's "Detection without a captured Claude message" section. Once a +// real session-cap message is captured, a single KindSession rule plus +// a unit-test fixture is sufficient to enable Claude detection. +var defaultRules = []rule{ + {Agents: []string{"*"}, Substring: "resource exhausted", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "quota exceeded", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "quota_exceeded", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "quota exhausted", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "quota_exhausted", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "insufficient_quota", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "exhausted your capacity", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "capacity exhausted", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "capacity_exhausted", Kind: KindQuota}, +} + +// Classify inspects an agent error message and returns a Classification +// describing whether (and how) the agent is rate-limited. The agent +// argument is the canonical agent name; the caller is responsible for +// resolving any aliases (e.g. "claude" → "claude-code") before calling. +// +// Returns Kind == KindNone when no rule matches. +func Classify(agent, errMsg string) Classification { + return classifyWithRules(agent, errMsg, defaultRules) +} + +// classifyWithRules is Classify with an explicit rule slice. Unexported; +// used inside the package's own tests so synthetic fixtures (e.g. a +// KindSession pattern) do not leak into defaultRules. +func classifyWithRules(agent, errMsg string, rules []rule) Classification { + if errMsg == "" { + return Classification{Kind: KindNone, Agent: agent, Message: errMsg} + } + lower := strings.ToLower(errMsg) + for _, r := range rules { + if !ruleAppliesToAgent(r, agent) { + continue + } + if !strings.Contains(lower, r.Substring) { + continue + } + return Classification{ + Kind: r.Kind, + Agent: agent, + ResetAt: ParseResetTime(errMsg), + CooldownFor: ParseResetDuration(errMsg), + Message: errMsg, + } + } + return Classification{Kind: KindNone, Agent: agent, Message: errMsg} +} + +func ruleAppliesToAgent(r rule, agent string) bool { + for _, a := range r.Agents { + if a == "*" || a == agent { + return true + } + } + return false +} +``` + +- [ ] **Step 4: Run all tests in the package, confirm pass** + +Run: `go test ./internal/agentlimit/ -v` +Expected: PASS for all four test functions. + +- [ ] **Step 5: Run full unit suite to confirm no regression** + +Run: `go test ./...` +Expected: PASS (or only pre-existing failures unrelated to this change). + +- [ ] **Step 6: Commit** + +```bash +git add internal/agentlimit/agentlimit.go internal/agentlimit/agentlimit_test.go +git commit -m "feat(agentlimit): add Classify with nine production quota patterns" +``` + +--- + +## Task 5: Replace `isQuotaError` + `parseQuotaCooldown` in the daemon worker + +**Files:** +- Modify: `internal/daemon/worker.go` +- Modify: `internal/daemon/worker_test.go` + +The behavior change here is *zero* for any currently-supported agent. The plan is to add an injectable `classify` function on `WorkerPool`, route the existing call site through it, then delete the now-dead helpers and their unit tests. + +- [ ] **Step 1: Add `classify` field on `WorkerPool`** + +Modify `internal/daemon/worker.go` around the existing fields (the cooldowns block, line 46-48): + +```go +// Add to imports: +// "github.com/roborev-dev/roborev/internal/agentlimit" + +// Add to the WorkerPool struct (alongside agentCooldowns): + // classify is the rate-limit/quota classifier. Defaults to + // agentlimit.Classify; tests substitute a stub via NewWorkerPool's + // caller setting it after construction (test-only access). + classify agentlimit.ClassifyFunc +``` + +In `NewWorkerPool` (line 59-74), set the default: + +```go +return &WorkerPool{ + db: db, + cfgGetter: cfgGetter, + broadcaster: broadcaster, + errorLog: errorLog, + activityLog: activityLog, + numWorkers: numWorkers, + stopCh: make(chan struct{}), + readyCh: make(chan struct{}), + runningJobs: make(map[int64]context.CancelFunc), + pendingCancels: make(map[int64]bool), + agentCooldowns: make(map[string]time.Time), + outputBuffers: NewOutputBuffer(512*1024, 4*1024*1024), + classify: agentlimit.Classify, +} +``` + +- [ ] **Step 2: Replace the call site in `failOrRetryInner`** + +Modify `internal/daemon/worker.go:789-803` (the `failOrRetryInner` function's quota and context-window branches). Replace the existing quota-detection block: + +Old: + +```go + // Quota errors skip retries entirely — cool down the agent and + // attempt failover or fail with a quota-prefixed error. + if agentError && isQuotaError(errorMsg) { + dur := parseQuotaCooldown(errorMsg, defaultCooldown) + wp.cooldownAgent(agentName, time.Now().Add(dur)) + log.Printf("[%s] Agent %s quota exhausted, cooldown %v", + workerID, agentName, dur) + wp.failoverOrFail(workerID, job, agentName, errorMsg) + return + } +``` + +New: + +```go + // Quota and session-limit errors skip retries entirely — cool down + // the agent and attempt failover or fail. Behavior matches the + // original isQuotaError branch; classification now lives in + // internal/agentlimit so the CLI fix loop can share it. + if agentError { + cls := wp.classify(agent.CanonicalName(agentName), errorMsg) + if cls.Kind == agentlimit.KindQuota || cls.Kind == agentlimit.KindSession { + dur := defaultCooldown + if cls.CooldownFor > 0 { + dur = cls.CooldownFor + } + if !cls.ResetAt.IsZero() { + if until := time.Until(cls.ResetAt); until > 0 { + dur = until + } + } + wp.cooldownAgent(agentName, time.Now().Add(dur)) + log.Printf("[%s] Agent %s quota exhausted, cooldown %v", + workerID, agentName, dur) + wp.failoverOrFail(workerID, job, agentName, errorMsg) + return + } + } +``` + +The `isContextWindowError` branch immediately below stays unchanged — that's a separate concern. + +- [ ] **Step 3: Delete the now-unused helpers** + +Remove from `internal/daemon/worker.go`: +- The entire `isQuotaError` function (lines 971-993) +- The entire `parseQuotaCooldown` function (lines 1013-1039) + +The constants `defaultCooldown`, `minCooldown`, and `maxCooldown` stay — `defaultCooldown` is still used by the new code at the call site, and `min`/`maxCooldown` were duplicated into `internal/agentlimit/parse.go` so the daemon-side ones are now unused only in `parseQuotaCooldown`. Since `parseQuotaCooldown` is gone, also remove `minCooldown` and `maxCooldown` from `worker.go` if they have no other references. + +Verify by grep: + +```bash +rg -n "minCooldown|maxCooldown" internal/daemon/ +``` + +Expected: zero matches in `internal/daemon/`. If matches remain, leave the constants in place. + +- [ ] **Step 4: Update existing unit tests in `worker_test.go`** + +Delete the unit-level `TestIsQuotaError` (around line 946) and `TestParseQuotaCooldown` (around line 1014) blocks entirely — the same coverage is now in `internal/agentlimit/agentlimit_test.go` and `internal/agentlimit/parse_test.go`. + +The integration-level tests `TestFailOrRetryInner_QuotaSkipsRetries`, `TestFailOrRetryInner_QuotaExhaustedVariant`, `TestFailOrRetryInner_NonQuotaStillRetries` (around lines 1190-1271) stay — they exercise the quota-skip-retries path, which is the exact behavior we are preserving. They will continue to pass via the default `agentlimit.Classify`. + +- [ ] **Step 5: Run the daemon test suite, confirm pass** + +Run: `go test ./internal/daemon/ -run "TestFailOrRetry" -v` +Expected: PASS — all `TestFailOrRetryInner_*` tests still pass via the default classifier. + +Run: `go test ./internal/daemon/` +Expected: PASS overall (no regressions). + +- [ ] **Step 6: Format and commit** + +Run: +```bash +go fmt ./... +go vet ./... +``` + +Expected: clean. Then: + +```bash +git add internal/daemon/worker.go internal/daemon/worker_test.go +git commit -m "refactor(daemon): route quota detection through internal/agentlimit" +``` + +--- + +## Task 6: Add a daemon-side test that exercises the injected classifier for KindSession + +**Files:** +- Modify: `internal/daemon/worker_test.go` + +This proves the `classify` injection works end-to-end against the existing worker test infrastructure. Without it, day 1 we have no positive coverage of the `KindSession` branch on the daemon side. + +- [ ] **Step 1: Write failing test using the existing `workerTestContext` helper** + +Append to `internal/daemon/worker_test.go`: + +```go +func TestFailOrRetryInner_SessionLimitCoolsDownAndFailsOver(t *testing.T) { + tc := newWorkerTestContext(t, 1) + job := tc.createAndClaimJob() + + // Stub classifier: any error message containing the marker yields + // KindSession with a 1-hour CooldownFor. + tc.wp.classify = func(agent, msg string) agentlimit.Classification { + if strings.Contains(msg, "MARKER-SESSION-LIMIT") { + return agentlimit.Classification{ + Kind: agentlimit.KindSession, + Agent: agent, + CooldownFor: 1 * time.Hour, + Message: msg, + } + } + return agentlimit.Classification{Kind: agentlimit.KindNone, Agent: agent, Message: msg} + } + + tc.wp.failOrRetryAgent("worker-0", job, "test", "boom MARKER-SESSION-LIMIT") + + assert := assert.New(t) + assert.True(tc.wp.isAgentCoolingDown("test"), "agent should be in cooldown") + + // retry_count should not have advanced — quota/session errors skip retries. + got, err := tc.wp.db.GetJobRetryCount(job.ID) + require.NoError(t, err) + assert.Equal(0, got, "session-limit error must not consume a retry slot") +} +``` + +Add the imports at the top of `worker_test.go` if not already present: + +```go +import ( + // ...existing... + "strings" + "time" + + "github.com/roborev-dev/roborev/internal/agentlimit" +) +``` + +- [ ] **Step 2: Run test, confirm pass on first try (the wiring from Task 5 already supports this)** + +Run: `go test ./internal/daemon/ -run TestFailOrRetryInner_SessionLimitCoolsDownAndFailsOver -v` +Expected: PASS. + +If FAIL with "agent should be in cooldown": the classifier injection from Task 5 is not being honored; revisit `failOrRetryInner` to confirm `wp.classify(...)` is the call. + +- [ ] **Step 3: Commit** + +```bash +git add internal/daemon/worker_test.go +git commit -m "test(daemon): cover session-limit cooldown via injected classifier" +``` + +--- + +## Task 7: Add unmatched-error WARN log on the daemon side + +**Files:** +- Modify: `internal/daemon/worker.go` +- Modify: `internal/daemon/worker_test.go` + +When the classifier returns `KindNone` for an agent error, log a single WARN line so the next time a Claude (or any other) session-cap fires we have a captured sample to add a rule for. + +- [ ] **Step 1: Write failing test** + +Append to `internal/daemon/worker_test.go`: + +```go +func TestFailOrRetryInner_UnmatchedAgentErrorLogsWarn(t *testing.T) { + tc := newWorkerTestContext(t, 1) + + // Capture log output by swapping the standard logger's writer. + var buf bytes.Buffer + origOutput := log.Writer() + log.SetOutput(&buf) + t.Cleanup(func() { log.SetOutput(origOutput) }) + + job := tc.createAndClaimJob() + tc.wp.classify = func(agent, msg string) agentlimit.Classification { + return agentlimit.Classification{Kind: agentlimit.KindNone, Agent: agent, Message: msg} + } + + tc.wp.failOrRetryAgent("worker-0", job, "test", "some brand new error wording from a future agent") + + logged := buf.String() + assert := assert.New(t) + assert.Contains(logged, "unclassified agent error", "expected WARN line for unmatched error") + assert.Contains(logged, "test", "log line should include agent name") + assert.Contains(logged, "some brand new error wording", "log line should include error preview") +} +``` + +Add imports if not already present: + +```go + "bytes" + "log" +``` + +- [ ] **Step 2: Run test, confirm failure** + +Run: `go test ./internal/daemon/ -run TestFailOrRetryInner_UnmatchedAgentErrorLogsWarn -v` +Expected: FAIL — log output does not contain "unclassified agent error". + +- [ ] **Step 3: Refactor `failOrRetryInner` to classify once and add the WARN log** + +Replace the agent-error block from Task 5 with a single switch that +covers all classifier kinds. This avoids double-classifying and adds +the `KindNone` WARN log in the same site. `truncateRunes` already lives +at `internal/daemon/classifier.go:108`; flatten newlines inline so the +log line stays on one row, then call the existing helper. + +```go +func (wp *WorkerPool) failOrRetryInner(workerID string, job *storage.ReviewJob, agentName string, errorMsg string, agentError bool) { + if agentError { + cls := wp.classify(agent.CanonicalName(agentName), errorMsg) + switch cls.Kind { + case agentlimit.KindQuota, agentlimit.KindSession: + dur := defaultCooldown + if cls.CooldownFor > 0 { + dur = cls.CooldownFor + } + if !cls.ResetAt.IsZero() { + if until := time.Until(cls.ResetAt); until > 0 { + dur = until + } + } + wp.cooldownAgent(agentName, time.Now().Add(dur)) + log.Printf("[%s] Agent %s quota exhausted, cooldown %v", + workerID, agentName, dur) + wp.failoverOrFail(workerID, job, agentName, errorMsg) + return + case agentlimit.KindNone: + if errorMsg != "" { + preview := strings.ReplaceAll(errorMsg, "\n", " ") + preview = strings.ReplaceAll(preview, "\r", "") + log.Printf("[%s] unclassified agent error from %s: %s", + workerID, agentName, truncateRunes(preview, 200)) + } + // fall through to context-window / retry handling + case agentlimit.KindTransient: + // fall through to retry handling + } + } + if agentError && isContextWindowError(errorMsg) { + wp.failoverOrFailNonRetryableAgent(workerID, job, agentName, errorMsg) + return + } + // ...existing retry path unchanged from Task 5... +} +``` + +The test from Task 6 returns `KindSession` so it hits the cooldown +branch; the new test from this task returns `KindNone` and hits the +WARN-log branch. + +- [ ] **Step 4: Run the new test and the regression suite** + +Run: `go test ./internal/daemon/ -run TestFailOrRetryInner -v` +Expected: PASS for all `TestFailOrRetryInner_*` tests, including the new `_UnmatchedAgentErrorLogsWarn`. + +- [ ] **Step 5: Format and commit** + +```bash +go fmt ./... +go vet ./... +git add internal/daemon/worker.go internal/daemon/worker_test.go +git commit -m "feat(daemon): log WARN for unclassified agent errors" +``` + +--- + +## Task 8: Add classifier hook and abort-error type to the CLI fix loop + +**Files:** +- Modify: `cmd/roborev/fix.go` + +This task only adds plumbing. The next two tasks wire it into the actual loops with tests. + +- [ ] **Step 1: Extend `fixOptions` and add the abort-error type** + +Modify `cmd/roborev/fix.go` near the existing `fixOptions` struct (line 224): + +```go +import "github.com/roborev-dev/roborev/internal/agentlimit" + +type fixOptions struct { + agentName string + model string + reasoning string + minSeverity string + quiet bool + resume bool + + // classify is the rate-limit classifier. Defaults to + // agentlimit.Classify in NewFixCmd; tests inject a stub to drive + // deterministic KindQuota / KindSession outcomes without depending + // on real agent error wording. + classify agentlimit.ClassifyFunc +} + +// agentLimitError is returned by the fix loop when the configured agent +// hits a quota or session limit. The fix command surfaces it as the +// process exit error so users see the reset time and a hint to retry. +type agentLimitError struct { + Classification agentlimit.Classification +} + +func (e *agentLimitError) Error() string { + return formatAgentLimitMessage(e.Classification, time.Now()) +} + +// formatAgentLimitMessage builds the user-facing abort message. Pulled +// out so tests can assert against it without depending on time.Now. +func formatAgentLimitMessage(cls agentlimit.Classification, now time.Time) string { + var dur time.Duration + switch { + case !cls.ResetAt.IsZero(): + dur = cls.ResetAt.Sub(now) + case cls.CooldownFor > 0: + dur = cls.CooldownFor + } + switch { + case dur > 0 && !cls.ResetAt.IsZero(): + return fmt.Sprintf( + "agent %s hit a session limit. Cooldown until %s (in %s). "+ + "Re-run after that, or pass --agent to switch.", + cls.Agent, + cls.ResetAt.Format("3:04 PM"), + dur.Round(time.Minute), + ) + case dur > 0: + return fmt.Sprintf( + "agent %s hit a session limit. Cooldown for ~%s. "+ + "Re-run after that, or pass --agent to switch.", + cls.Agent, + dur.Round(time.Minute), + ) + default: + flat := strings.ReplaceAll(cls.Message, "\n", " ") + return fmt.Sprintf( + "agent %s hit a session limit (unknown reset time). "+ + "Re-run later, or pass --agent to switch. "+ + "Original error: %s", + cls.Agent, + truncateString(flat, 200), + ) + } +} +``` + +`truncateString` already lives at `cmd/roborev/fix.go:796` — reuse it. +It does not flatten newlines, so callers that need a single-line preview +(this one and the WARN log paths in Tasks 9-10) flatten with +`strings.ReplaceAll(s, "\n", " ")` before calling. + +- [ ] **Step 2: Default the classifier where `fixOptions` is built** + +There is a single construction site for `fixOptions` in the production +flow, at `cmd/roborev/fix.go:137` (inside the `fix` cobra command's +`RunE`). Add `classify: agentlimit.Classify` to the struct literal: + +```go +opts := fixOptions{ + agentName: agentName, + model: model, + reasoning: reasoning, + minSeverity: minSeverity, + quiet: quiet, + resume: resume, + classify: agentlimit.Classify, +} +``` + +Tests that build `fixOptions` directly are responsible for setting +`classify` (or leaving it nil and relying on the test to never reach +an error path). To make accidental nil dereferences obvious, add a +single nil-guard at the top of `fixSingleJob` and `runFixBatch`: + +```go +if opts.classify == nil { + opts.classify = agentlimit.Classify +} +``` + +This mirrors how `streamfmt` and other defaults are handled elsewhere in +the file and keeps behavior identical when callers do not set +`classify`. + +- [ ] **Step 3: Format and confirm package builds** + +```bash +go fmt ./... +go vet ./... +go build ./... +``` + +Expected: clean build (no usage of `agentLimitError` yet — only declarations). + +- [ ] **Step 4: Commit** + +```bash +git add cmd/roborev/fix.go +git commit -m "feat(fix): add classifier hook and agentLimitError plumbing" +``` + +--- + +## Task 9: Wire abort path into `fixSingleJob` + +**Files:** +- Modify: `cmd/roborev/fix.go` +- Modify: `cmd/roborev/fix_test.go` + +- [ ] **Step 1: Write failing test** + +Append to `cmd/roborev/fix_test.go`: + +This follows the `agent.FakeAgent` registration pattern used by +`TestFixSingleJobRecoversPostFixDaemonCalls` (`cmd/roborev/fix_test.go:565`) +to make the `test` agent return a synthetic error, and the `newMockServer` ++ `patchServerAddr` pattern used by `TestFixSingleJob` (line 529) for +daemon plumbing. + +```go +func TestFixSingleJobAbortsOnSessionLimit(t *testing.T) { + repo := createTestRepo(t, map[string]string{"main.go": "package main\n"}) + + originalAgent, err := agent.Get("test") + require.NoError(t, err, "get test agent") + agent.Register(&agent.FakeAgent{ + NameStr: "test", + ReviewFn: func(_ context.Context, _, _, _ string, _ io.Writer) (string, error) { + return "", errors.New("simulated claude session cap") + }, + }) + t.Cleanup(func() { agent.Register(originalAgent) }) + + ts, _ := newMockServer(t, MockServerOpts{ + ReviewOutput: "## Issues\n- Found issue", + OnJobs: func(w http.ResponseWriter, r *http.Request) { + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{{ + ID: 99, + Status: storage.JobStatusDone, + Agent: "test", + }}, + }) + }, + }) + patchServerAddr(t, ts.URL) + + cmd, _ := newTestCmd(t) + + resetAt := time.Now().Add(2*time.Hour + 13*time.Minute) + classifyCalls := 0 + opts := fixOptions{ + agentName: "test", + reasoning: "fast", + classify: func(_, msg string) agentlimit.Classification { + classifyCalls++ + return agentlimit.Classification{ + Kind: agentlimit.KindSession, + Agent: "claude-code", + ResetAt: resetAt, + Message: msg, + } + }, + } + + tracker := &fixSessionTracker{base: agent.NewTestAgent(), out: io.Discard} + err = fixSingleJob(cmd, repo.Dir, 99, opts, tracker) + require.Error(t, err, "expected abort error, got nil") + + var lim *agentLimitError + require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) + + assert := assert.New(t) + assert.Equal(agentlimit.KindSession, lim.Classification.Kind) + assert.Equal(1, classifyCalls, "classifier should be called exactly once") + assert.Contains(err.Error(), "claude-code") + assert.Contains(err.Error(), "session limit") +} +``` + +Add any missing imports to the top of `fix_test.go` (most are already +present — `errors` and `agentlimit` are likely the only new ones): + +```go +import ( + "errors" + + "github.com/roborev-dev/roborev/internal/agentlimit" +) +``` + +- [ ] **Step 2: Run test, confirm failure** + +Run: `go test ./cmd/roborev/ -run TestFixSingleJobAbortsOnSessionLimit -v` +Expected: FAIL — either compile error (no abort path yet) or assertion failure. + +- [ ] **Step 3: Add the abort branch in `fixSingleJob`** + +Modify `cmd/roborev/fix.go` — find the `result, err := fixJobDirect(...)` call inside `fixSingleJob` (around line 926). Replace the existing error block: + +Old: + +```go + if err != nil { + tracker.Reset() + return err + } +``` + +New: + +```go + if err != nil { + tracker.Reset() + cls := opts.classify(agent.CanonicalName(currentAgent.Name()), err.Error()) + switch cls.Kind { + case agentlimit.KindQuota, agentlimit.KindSession: + return &agentLimitError{Classification: cls} + case agentlimit.KindNone: + if err.Error() != "" && !opts.quiet { + flat := strings.ReplaceAll(err.Error(), "\n", " ") + cmd.PrintErrf( + "warning: unclassified agent error from %s: %s\n", + currentAgent.Name(), + truncateString(flat, 200), + ) + } + } + return err + } +``` + +The `KindTransient` case falls through to the default `return err` — same as today. The `KindNone` warn-print is the CLI mirror of the daemon-side WARN log. + +- [ ] **Step 4: Run test, confirm pass** + +Run: `go test ./cmd/roborev/ -run TestFixSingleJobAbortsOnSessionLimit -v` +Expected: PASS. + +- [ ] **Step 5: Run the broader fix suite to confirm no regression** + +Run: `go test ./cmd/roborev/ -run TestFix -v` +Expected: PASS for all `TestFix*` tests. + +- [ ] **Step 6: Commit** + +```bash +go fmt ./... +go vet ./... +git add cmd/roborev/fix.go cmd/roborev/fix_test.go +git commit -m "feat(fix): abort fixSingleJob on KindQuota or KindSession" +``` + +--- + +## Task 10: Wire abort path into `runFixBatch` + +**Files:** +- Modify: `cmd/roborev/fix.go` +- Modify: `cmd/roborev/fix_test.go` + +The batch loop is the long-running shape that originally surfaced the bug — without an abort, the CLI prints warnings and continues for every queued job. + +- [ ] **Step 1: Write failing test** + +This follows the `newMockDaemonBuilder` pattern from +`TestFixBatchSkipsPassVerdict` (`cmd/roborev/fix_test.go:2353`). The +mock daemon serves two job IDs; the `FakeAgent` returns a synthetic +quota error; the test asserts the batch aborts after the first job +without calling the daemon for the second. + +```go +func TestRunFixBatchAbortsOnQuotaError(t *testing.T) { + repo := createTestRepo(t, map[string]string{"main.go": "package main\n"}) + + originalAgent, err := agent.Get("test") + require.NoError(t, err, "get test agent") + agentCalls := 0 + agent.Register(&agent.FakeAgent{ + NameStr: "test", + ReviewFn: func(_ context.Context, _, _, _ string, _ io.Writer) (string, error) { + agentCalls++ + return "", errors.New("agent failed: you have exhausted your capacity") + }, + }) + t.Cleanup(func() { agent.Register(originalAgent) }) + + var mu sync.Mutex + var reviewedJobIDs []int64 + _ = newMockDaemonBuilder(t). + WithHandler("/api/jobs", func(w http.ResponseWriter, r *http.Request) { + id := r.URL.Query().Get("id") + switch id { + case "10", "20": + idNum := int64(10) + if id == "20" { + idNum = 20 + } + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{{ + ID: idNum, + Status: storage.JobStatusDone, + Agent: "test", + }}, + }) + default: + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{}, + "has_more": false, + }) + } + }). + WithHandler("/api/review", func(w http.ResponseWriter, r *http.Request) { + jobID := r.URL.Query().Get("job_id") + mu.Lock() + if jobID == "10" { + reviewedJobIDs = append(reviewedJobIDs, 10) + } else if jobID == "20" { + reviewedJobIDs = append(reviewedJobIDs, 20) + } + mu.Unlock() + writeJSON(w, storage.Review{ + JobID: 10, + Output: "## Issues\n- Bug", + }) + }). + WithHandler("/api/enqueue", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + }). + Build() + + classifyCalls := 0 + opts := fixOptions{ + agentName: "test", + reasoning: "fast", + classify: func(_, msg string) agentlimit.Classification { + classifyCalls++ + return agentlimit.Classification{ + Kind: agentlimit.KindQuota, + Agent: "gemini", + CooldownFor: 30 * time.Minute, + Message: msg, + } + }, + } + + _, err = runWithOutput(t, repo.Dir, func(cmd *cobra.Command) error { + return runFixBatch( + cmd, + []int64{10, 20}, + "", + false, false, false, + 0, + opts, + &fixSessionTracker{base: agent.NewTestAgent(), out: io.Discard}, + ) + }) + require.Error(t, err, "expected batch to abort with agentLimitError") + + var lim *agentLimitError + require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) + + assert := assert.New(t) + assert.Equal(agentlimit.KindQuota, lim.Classification.Kind) + assert.Equal(1, classifyCalls, "classifier should be called exactly once") + assert.Equal(1, agentCalls, "fix agent should be invoked exactly once before abort") + + // Job 20 must not have been processed past initial discovery. + mu.Lock() + reviewed := append([]int64(nil), reviewedJobIDs...) + mu.Unlock() + assert.NotContains(reviewed, int64(20), "second job must not be reviewed after abort") +} +``` + +The exact handler keys (`"10"`, `"20"`) and the `reviewedJobIDs` shape +mirror `TestFixBatchSkipsPassVerdict`. If `runWithOutput`'s signature +differs from what's shown above, copy it verbatim from the nearest +existing batch test in the same file. + +- [ ] **Step 2: Run test, confirm failure** + +Run: `go test ./cmd/roborev/ -run TestRunFixBatchAbortsOnQuotaError -v` +Expected: FAIL. + +- [ ] **Step 3: Add the abort branch in `runFixBatch`** + +Modify `cmd/roborev/fix.go` — locate the `fixJobDirect` call inside `runFixBatch` (around line 1170) and the surrounding error handling. Apply the same pattern as Task 9: classify on error, abort on `KindQuota`/`KindSession`, warn on `KindNone`, fall through otherwise. Reuse the same `agentLimitError` type and `truncateString` helper. + +The exact site: + +```go +result, err := fixJobDirect(ctx, fixJobParams{ + RepoRoot: repoRoot, + Agent: currentAgent, + Output: capture, +}, prompt) +// ...flush + tracker bookkeeping unchanged... +if err != nil { + cls := opts.classify(agent.CanonicalName(currentAgent.Name()), err.Error()) + switch cls.Kind { + case agentlimit.KindQuota, agentlimit.KindSession: + return &agentLimitError{Classification: cls} + case agentlimit.KindNone: + if err.Error() != "" && !opts.quiet { + flat := strings.ReplaceAll(err.Error(), "\n", " ") + cmd.PrintErrf( + "warning: unclassified agent error from %s: %s\n", + currentAgent.Name(), + truncateString(flat, 200), + ) + } + } + // existing per-job handling continues (warn-and-continue or return) + // ... +} +``` + +Make sure the abort `return` happens *before* the existing per-job warn-and-continue logic so the second job is never attempted. + +- [ ] **Step 4: Run test, confirm pass** + +Run: `go test ./cmd/roborev/ -run TestRunFixBatchAbortsOnQuotaError -v` +Expected: PASS — classifier called exactly once, second job untouched. + +- [ ] **Step 5: Run the full fix suite** + +Run: `go test ./cmd/roborev/ -run TestFix -v` +Expected: PASS for all `TestFix*` tests, including the discovery-mode tests that exercise the warn-and-continue path with non-limit errors. + +- [ ] **Step 6: Run the entire test suite** + +Run: `go test ./...` +Expected: PASS. + +- [ ] **Step 7: Format and commit** + +```bash +go fmt ./... +go vet ./... +git add cmd/roborev/fix.go cmd/roborev/fix_test.go +git commit -m "feat(fix): abort runFixBatch on KindQuota or KindSession" +``` + +--- + +## Task 11: Final verification and lint + +**Files:** +- (verification only) + +- [ ] **Step 1: Confirm dead code is gone** + +```bash +rg -n "isQuotaError|parseQuotaCooldown" . +``` + +Expected: zero matches outside of `docs/` and possibly unrelated history. + +- [ ] **Step 2: Run lint and full tests** + +```bash +make lint +go test ./... +``` + +Expected: clean lint, all tests pass. + +- [ ] **Step 3: Read the diff start-to-end** + +```bash +git log --oneline main..HEAD +git diff main...HEAD +``` + +Look for: leftover TODO comments, unused imports, dead helpers, inconsistent naming between `agentlimit.ClassifyFunc` declarations and call sites. + +- [ ] **Step 4: Manual smoke (if a quota-exhausted agent is available)** + +If a Gemini account in known-exhausted state is reachable, run: + +```bash +roborev fix +``` + +Expected: the fix loop aborts after the first job with a message like + +``` +agent gemini hit a session limit. Cooldown for ~30m. Re-run after that, or pass --agent to switch. +``` + +If no exhausted agent is reachable, this step is informational only — coverage is in the unit tests. + +--- + +## Spec coverage check (self-review) + +| Spec section / requirement | Implementing task | +|---|---| +| `internal/agentlimit` package with `Kind`, `Classification`, `Classify`, `classifyWithRules` | Task 1, Task 4 | +| Day-1 rule table = nine `isQuotaError` substrings | Task 4 | +| `parseResetDuration` (lifted from `parseQuotaCooldown`) | Task 2 | +| `parseResetTime` (absolute time, same-day rollover) | Task 3 | +| Daemon worker: classifier replaces `isQuotaError`, behavior preserved | Task 5 | +| Daemon worker: classifier injection (test stub) | Task 5 (`classify` field), Task 6 (positive test) | +| Daemon worker: unmatched-error WARN log | Task 7 | +| `fixOptions.classify` hook | Task 8 | +| `agentLimitError` type | Task 8 | +| Abort in `fixSingleJob` on `KindQuota`/`KindSession` | Task 9 | +| Abort in `runFixBatch` on `KindQuota`/`KindSession` | Task 10 | +| CLI unmatched-error warn line | Task 9, Task 10 | +| No exported test helper (cross-package tests use struct literals) | Task 9, Task 10 (tests construct `Classification` literals) | +| Existing `TestFailOrRetryInner_QuotaSkipsRetries` and friends still pass | Task 5 (preserved by routing through default `agentlimit.Classify`) | + +--- + +## Out of scope (per spec) + +These are deliberately *not* in this plan: + +- A production Claude session-cap rule. Adding it is a one-line table change in `defaultRules` plus a unit test fixture, taken in a follow-up PR once a real session-cap message is captured. The unmatched-error WARN log from Task 7 / Tasks 9-10 will surface a sample. +- `--auto-failover` flag for `roborev fix`. +- `roborev status` field showing in-cooldown agents. +- Persisting cooldown state across daemon restarts. +- Any new HTTP API surface. From 21159050ecb9ef749cddf645e40e2bcde87a7b04 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 14:23:28 -0500 Subject: [PATCH 05/24] feat(agentlimit): create package with Kind and Classification types --- internal/agentlimit/agentlimit.go | 29 ++++++++++++++++++++++++++ internal/agentlimit/agentlimit_test.go | 18 ++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 internal/agentlimit/agentlimit.go create mode 100644 internal/agentlimit/agentlimit_test.go diff --git a/internal/agentlimit/agentlimit.go b/internal/agentlimit/agentlimit.go new file mode 100644 index 00000000..4cf4eabe --- /dev/null +++ b/internal/agentlimit/agentlimit.go @@ -0,0 +1,29 @@ +// Package agentlimit classifies agent CLI error messages as rate-limit, +// quota, or session-cap signals so the daemon worker and the foreground +// roborev fix loop can react consistently. Pure logic — no I/O. +package agentlimit + +import "time" + +// Kind labels a classified agent error. +type Kind int + +const ( + KindNone Kind = iota // no rate-limit signal recognized + KindTransient // 429-style; retry locally, no cooldown + KindQuota // hard quota exhaustion (Gemini/Codex today) + KindSession // session-level cap (e.g. Claude 5-hour) +) + +// Classification is the result of inspecting an agent error. +type Classification struct { + Kind Kind + Agent string // canonical agent name (caller resolves aliases) + ResetAt time.Time // zero if not parseable from the message + CooldownFor time.Duration // zero if not parseable; caller applies its own fallback + Message string // raw error text (for logs / user display) +} + +// ClassifyFunc is the function shape used by callers that want to inject +// a stub in tests. +type ClassifyFunc func(agent, errMsg string) Classification diff --git a/internal/agentlimit/agentlimit_test.go b/internal/agentlimit/agentlimit_test.go new file mode 100644 index 00000000..495650ac --- /dev/null +++ b/internal/agentlimit/agentlimit_test.go @@ -0,0 +1,18 @@ +package agentlimit + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestClassificationZeroValue(t *testing.T) { + var c Classification + assert := assert.New(t) + assert.Equal(KindNone, c.Kind) + assert.Empty(c.Agent) + assert.True(c.ResetAt.IsZero()) + assert.Equal(time.Duration(0), c.CooldownFor) + assert.Empty(c.Message) +} From d580130f53016075a756d809053e6b010b0a0348 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 14:27:27 -0500 Subject: [PATCH 06/24] feat(agentlimit): port reset-duration parsing from worker.parseQuotaCooldown --- internal/agentlimit/parse.go | 43 +++++++++++++++++++++++++++++++ internal/agentlimit/parse_test.go | 31 ++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 internal/agentlimit/parse.go create mode 100644 internal/agentlimit/parse_test.go diff --git a/internal/agentlimit/parse.go b/internal/agentlimit/parse.go new file mode 100644 index 00000000..e39985a2 --- /dev/null +++ b/internal/agentlimit/parse.go @@ -0,0 +1,43 @@ +package agentlimit + +import ( + "strings" + "time" +) + +// minCooldown / maxCooldown clamp parsed durations to a sane range so a +// pathological message ("reset after 1ms" or "reset after 100h") does not +// produce a useless cooldown. +const ( + minCooldown = 1 * time.Minute + maxCooldown = 24 * time.Hour +) + +// ParseResetDuration extracts a Go-format duration from a "reset after +// " substring in errMsg (case-insensitive). Returns 0 if no such +// substring is present or the duration is unparseable. Clamps positive +// values to [minCooldown, maxCooldown]. +func ParseResetDuration(errMsg string) time.Duration { + lower := strings.ToLower(errMsg) + idx := strings.Index(lower, "reset after ") + if idx < 0 { + return 0 + } + rest := errMsg[idx+len("reset after "):] + token := rest + if sp := strings.IndexAny(rest, " \t\n,;)"); sp > 0 { + token = rest[:sp] + } + token = strings.TrimRight(token, ".,;:)]}") + d, err := time.ParseDuration(token) + if err != nil || d <= 0 { + return 0 + } + if d < minCooldown { + return minCooldown + } + if d > maxCooldown { + return maxCooldown + } + return d +} diff --git a/internal/agentlimit/parse_test.go b/internal/agentlimit/parse_test.go new file mode 100644 index 00000000..8c2d094a --- /dev/null +++ b/internal/agentlimit/parse_test.go @@ -0,0 +1,31 @@ +package agentlimit + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestParseResetDuration(t *testing.T) { + cases := []struct { + name string + msg string + want time.Duration + }{ + {"none", "agent failed", 0}, + {"reset after seconds clamped to min", "quota will reset after 5s.", 1 * time.Minute}, + {"reset after minutes", "quota will reset after 48m20s.", 48*time.Minute + 20*time.Second}, + {"reset after hours", "reset after 2h13m.", 2*time.Hour + 13*time.Minute}, + {"reset after with trailing punct", "reset after 1h.. retrying", 1 * time.Hour}, + {"reset after invalid", "reset after notaduration", 0}, + {"reset after negative", "reset after -5m", 0}, + {"reset after huge clamped to max", "reset after 100h", 24 * time.Hour}, + {"case insensitive", "RESET AFTER 30m", 30 * time.Minute}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, ParseResetDuration(tc.msg)) + }) + } +} From c00b2e9dad3cccce36b98baf26c3fa646b5b84d0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 14:41:34 -0500 Subject: [PATCH 07/24] feat(agentlimit): add absolute reset-time parser with same-day rollover --- internal/agentlimit/parse.go | 57 +++++++++++++++++++++++++++++++ internal/agentlimit/parse_test.go | 45 ++++++++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/internal/agentlimit/parse.go b/internal/agentlimit/parse.go index e39985a2..67df9eab 100644 --- a/internal/agentlimit/parse.go +++ b/internal/agentlimit/parse.go @@ -41,3 +41,60 @@ func ParseResetDuration(errMsg string) time.Duration { } return d } + +// ParseResetTime extracts an absolute reset time from messages like +// "resets at 5:42 PM" or "try again at 17:42". Interprets the parsed +// clock time in the local timezone. Returns the zero time.Time if no +// recognized phrase is present or the time is unparseable. +// +// If the parsed clock time is earlier than now-on-the-same-day, the +// returned time rolls forward by 24 hours so callers that compute +// "time until reset" never get a negative duration. +func ParseResetTime(errMsg string) time.Time { + return parseResetTimeAt(errMsg, time.Now()) +} + +// parseResetTimeAt is ParseResetTime with an injectable clock for tests. +func parseResetTimeAt(errMsg string, now time.Time) time.Time { + lower := strings.ToLower(errMsg) + var idx int + switch { + case strings.Contains(lower, "resets at "): + idx = strings.Index(lower, "resets at ") + len("resets at ") + case strings.Contains(lower, "try again at "): + idx = strings.Index(lower, "try again at ") + len("try again at ") + default: + return time.Time{} + } + // Consume up to the next non-time character. Allow digits, ':', and + // AM/PM markers (with optional spaces before them). + rest := errMsg[idx:] + end := len(rest) + for i, r := range rest { + // Stop at sentence-ending punctuation or newline. + if r == '.' || r == ',' || r == ';' || r == '\n' || r == ')' { + end = i + break + } + } + token := strings.TrimSpace(rest[:end]) + + formats := []string{ + "3:04 PM", "3:04 pm", + "3:04PM", "3:04pm", + "15:04", + } + for _, f := range formats { + if t, err := time.Parse(f, token); err == nil { + candidate := time.Date( + now.Year(), now.Month(), now.Day(), + t.Hour(), t.Minute(), 0, 0, now.Location(), + ) + if candidate.Before(now) { + candidate = candidate.Add(24 * time.Hour) + } + return candidate + } + } + return time.Time{} +} diff --git a/internal/agentlimit/parse_test.go b/internal/agentlimit/parse_test.go index 8c2d094a..9ccb90c1 100644 --- a/internal/agentlimit/parse_test.go +++ b/internal/agentlimit/parse_test.go @@ -29,3 +29,48 @@ func TestParseResetDuration(t *testing.T) { }) } } + +func TestParseResetTime(t *testing.T) { + // Use a fixed "now" so same-day vs next-day rollover is deterministic. + loc := time.FixedZone("test", 0) + now := time.Date(2026, 5, 5, 12, 0, 0, 0, loc) // noon UTC + cases := []struct { + name string + msg string + wantErr bool + want time.Time + }{ + {"none", "agent failed", true, time.Time{}}, + { + name: "resets at later today", + msg: "limit resets at 5:42 PM", + want: time.Date(2026, 5, 5, 17, 42, 0, 0, loc), + }, + { + name: "try again at later today 24h", + msg: "try again at 17:42", + want: time.Date(2026, 5, 5, 17, 42, 0, 0, loc), + }, + { + name: "resets at earlier today rolls to next day", + msg: "limit resets at 9:00 AM", + want: time.Date(2026, 5, 6, 9, 0, 0, 0, loc), + }, + { + name: "case insensitive", + msg: "LIMIT RESETS AT 6:00 pm", + want: time.Date(2026, 5, 5, 18, 0, 0, 0, loc), + }, + {"unparseable token", "resets at moonrise", true, time.Time{}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := parseResetTimeAt(tc.msg, now) + if tc.wantErr { + assert.True(t, got.IsZero(), "expected zero time, got %v", got) + return + } + assert.Equal(t, tc.want, got) + }) + } +} From c4bf92c5c7c773d8a5af0761c27dfcc6e26ac200 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 14:47:24 -0500 Subject: [PATCH 08/24] fix(agentlimit): correct DST handling and equality rollover in ParseResetTime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace candidate.Add(24*time.Hour) with time.Date(..., Day()+1, ...) so the rollover stays at the same wall-clock time across DST transitions (the 24h-add lands an hour off in March/November). Switch candidate.Before(now) to !candidate.After(now) so equality also rolls forward — avoids returning a zero-duration cooldown when the agent's clock matches ours to the second. Adds tests for the rollover boundary, timezone propagation, and the conservative trailing-content behavior that documents what the parser does NOT handle yet (deferred until a real Claude message is captured). --- internal/agentlimit/parse.go | 12 +++++++++-- internal/agentlimit/parse_test.go | 34 +++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/internal/agentlimit/parse.go b/internal/agentlimit/parse.go index 67df9eab..627b5f85 100644 --- a/internal/agentlimit/parse.go +++ b/internal/agentlimit/parse.go @@ -90,8 +90,16 @@ func parseResetTimeAt(errMsg string, now time.Time) time.Time { now.Year(), now.Month(), now.Day(), t.Hour(), t.Minute(), 0, 0, now.Location(), ) - if candidate.Before(now) { - candidate = candidate.Add(24 * time.Hour) + // Roll forward when the parsed wall-clock time is at-or-before + // now. Using time.Date with Day()+1 (instead of Add(24h)) is + // DST-safe: Go normalizes lost/gained hours so the returned + // time always means "same wall-clock time tomorrow" in + // now.Location(). + if !candidate.After(now) { + candidate = time.Date( + now.Year(), now.Month(), now.Day()+1, + t.Hour(), t.Minute(), 0, 0, now.Location(), + ) } return candidate } diff --git a/internal/agentlimit/parse_test.go b/internal/agentlimit/parse_test.go index 9ccb90c1..22f51a71 100644 --- a/internal/agentlimit/parse_test.go +++ b/internal/agentlimit/parse_test.go @@ -62,6 +62,29 @@ func TestParseResetTime(t *testing.T) { want: time.Date(2026, 5, 5, 18, 0, 0, 0, loc), }, {"unparseable token", "resets at moonrise", true, time.Time{}}, + { + // Trailing content after the time defeats time.Parse and + // produces zero. Documenting current conservative behavior + // — extending the parser to handle "today" / "UTC" suffixes + // is deferred until a real Claude message is captured. + name: "trailing word causes miss", + msg: "limit resets at 5:42 PM today", + wantErr: true, + want: time.Time{}, + }, + { + // 24h with a timezone abbreviation: same conservative miss. + name: "24h with trailing zone causes miss", + msg: "resets at 17:42 UTC", + wantErr: true, + want: time.Time{}, + }, + { + // Same instant as now: should roll forward, not return now. + name: "exactly now rolls to next day", + msg: "limit resets at 12:00 PM", + want: time.Date(2026, 5, 6, 12, 0, 0, 0, loc), + }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { @@ -74,3 +97,14 @@ func TestParseResetTime(t *testing.T) { }) } } + +func TestParseResetTimeRespectsLocation(t *testing.T) { + // Non-UTC fixture: confirms now.Location() is honored end-to-end. + est := time.FixedZone("EST", -5*60*60) + now := time.Date(2026, 5, 5, 12, 0, 0, 0, est) // noon EST + + got := parseResetTimeAt("limit resets at 5:42 PM", now) + want := time.Date(2026, 5, 5, 17, 42, 0, 0, est) + assert.Equal(t, want, got) + assert.Equal(t, est, got.Location()) +} From 44f74f76b70ceb1ce942bb814ec14a53659b7cd5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 14:51:57 -0500 Subject: [PATCH 09/24] feat(agentlimit): add Classify with nine production quota patterns --- internal/agentlimit/agentlimit.go | 79 +++++++++++++++++++++++++- internal/agentlimit/agentlimit_test.go | 75 ++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 1 deletion(-) diff --git a/internal/agentlimit/agentlimit.go b/internal/agentlimit/agentlimit.go index 4cf4eabe..802d1dbe 100644 --- a/internal/agentlimit/agentlimit.go +++ b/internal/agentlimit/agentlimit.go @@ -3,7 +3,10 @@ // roborev fix loop can react consistently. Pure logic — no I/O. package agentlimit -import "time" +import ( + "strings" + "time" +) // Kind labels a classified agent error. type Kind int @@ -27,3 +30,77 @@ type Classification struct { // ClassifyFunc is the function shape used by callers that want to inject // a stub in tests. type ClassifyFunc func(agent, errMsg string) Classification + +// rule is one substring → kind mapping. The Agents slice restricts the +// rule to specific canonical agent names; "*" applies to any agent. +type rule struct { + Agents []string // canonical agent names; "*" = any + Substring string // case-insensitive substring match on the error message + Kind Kind +} + +// defaultRules is the production rule table. The nine quota substrings +// are copied from the original isQuotaError set in +// internal/daemon/worker.go so detection for Gemini and Codex is +// byte-for-byte unchanged. +// +// Claude session-cap patterns are intentionally absent — see the design +// doc's "Detection without a captured Claude message" section. Once a +// real session-cap message is captured, a single KindSession rule plus +// a unit-test fixture is sufficient to enable Claude detection. +var defaultRules = []rule{ + {Agents: []string{"*"}, Substring: "resource exhausted", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "quota exceeded", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "quota_exceeded", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "quota exhausted", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "quota_exhausted", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "insufficient_quota", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "exhausted your capacity", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "capacity exhausted", Kind: KindQuota}, + {Agents: []string{"*"}, Substring: "capacity_exhausted", Kind: KindQuota}, +} + +// Classify inspects an agent error message and returns a Classification +// describing whether (and how) the agent is rate-limited. The agent +// argument is the canonical agent name; the caller is responsible for +// resolving any aliases (e.g. "claude" → "claude-code") before calling. +// +// Returns Kind == KindNone when no rule matches. +func Classify(agent, errMsg string) Classification { + return classifyWithRules(agent, errMsg, defaultRules) +} + +// classifyWithRules is Classify with an explicit rule slice. Unexported; +// used inside the package's own tests so synthetic fixtures (e.g. a +// KindSession pattern) do not leak into defaultRules. +func classifyWithRules(agent, errMsg string, rules []rule) Classification { + if errMsg == "" { + return Classification{Kind: KindNone, Agent: agent, Message: errMsg} + } + lower := strings.ToLower(errMsg) + for _, r := range rules { + if !ruleAppliesToAgent(r, agent) { + continue + } + if !strings.Contains(lower, r.Substring) { + continue + } + return Classification{ + Kind: r.Kind, + Agent: agent, + ResetAt: ParseResetTime(errMsg), + CooldownFor: ParseResetDuration(errMsg), + Message: errMsg, + } + } + return Classification{Kind: KindNone, Agent: agent, Message: errMsg} +} + +func ruleAppliesToAgent(r rule, agent string) bool { + for _, a := range r.Agents { + if a == "*" || a == agent { + return true + } + } + return false +} diff --git a/internal/agentlimit/agentlimit_test.go b/internal/agentlimit/agentlimit_test.go index 495650ac..006bba2a 100644 --- a/internal/agentlimit/agentlimit_test.go +++ b/internal/agentlimit/agentlimit_test.go @@ -16,3 +16,78 @@ func TestClassificationZeroValue(t *testing.T) { assert.Equal(time.Duration(0), c.CooldownFor) assert.Empty(c.Message) } + +func TestClassifyProductionPatterns(t *testing.T) { + // All nine substrings from the original isQuotaError set must + // produce KindQuota. This is the byte-for-byte regression test + // for current Gemini and Codex detection. + patterns := []string{ + "resource exhausted", + "quota exceeded", + "quota_exceeded", + "quota exhausted", + "quota_exhausted", + "insufficient_quota", + "exhausted your capacity", + "capacity exhausted", + "capacity_exhausted", + } + for _, p := range patterns { + t.Run(p, func(t *testing.T) { + cls := Classify("gemini", "agent failed: "+p+", retrying...") + assert := assert.New(t) + assert.Equal(KindQuota, cls.Kind, "expected KindQuota for %q", p) + assert.Equal("gemini", cls.Agent) + assert.Contains(cls.Message, p) + }) + } +} + +func TestClassifyExtractsCooldownDuration(t *testing.T) { + cls := Classify( + "gemini", + "You have exhausted your capacity on this model. Your quota will reset after 48m20s.", + ) + assert := assert.New(t) + assert.Equal(KindQuota, cls.Kind) + assert.Equal(48*time.Minute+20*time.Second, cls.CooldownFor) + assert.True(cls.ResetAt.IsZero(), "no absolute time in this message") +} + +func TestClassifyNegativeCases(t *testing.T) { + cases := []struct { + name string + msg string + }{ + {"empty", ""}, + {"unrelated error", "exit status 1: file not found"}, + {"benign mention of limit", "limit set to 100 in config"}, + {"benign rate limit (transient, no rule produces it day 1)", "429 rate limit, retrying"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + cls := Classify("gemini", tc.msg) + assert.Equal(t, KindNone, cls.Kind, "expected KindNone for %q", tc.msg) + }) + } +} + +func TestClassifyWithRulesIsolatesSyntheticPattern(t *testing.T) { + // Synthetic rule used only inside this test — does not pollute defaultRules. + syntheticRules := []rule{ + {Agents: []string{"*"}, Substring: "test-claude session limit", Kind: KindSession}, + } + cls := classifyWithRules( + "claude-code", + "5-hour test-claude session limit reached", + syntheticRules, + ) + assert := assert.New(t) + assert.Equal(KindSession, cls.Kind) + assert.Equal("claude-code", cls.Agent) + + // Same message via the production Classify must not match — + // the synthetic rule is not in defaultRules. + cls2 := Classify("claude-code", "5-hour test-claude session limit reached") + assert.Equal(KindNone, cls2.Kind, "synthetic rule must not leak into defaultRules") +} From 62e7aca675204ccac1e68e5b693916558b64f7a1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 14:59:16 -0500 Subject: [PATCH 10/24] refactor(daemon): route quota detection through internal/agentlimit --- internal/daemon/worker.go | 92 +++++++++------------------- internal/daemon/worker_test.go | 109 --------------------------------- 2 files changed, 29 insertions(+), 172 deletions(-) diff --git a/internal/daemon/worker.go b/internal/daemon/worker.go index 92209ea7..b05dc270 100644 --- a/internal/daemon/worker.go +++ b/internal/daemon/worker.go @@ -13,6 +13,7 @@ import ( "time" "github.com/roborev-dev/roborev/internal/agent" + "github.com/roborev-dev/roborev/internal/agentlimit" "github.com/roborev-dev/roborev/internal/config" gitpkg "github.com/roborev-dev/roborev/internal/git" "github.com/roborev-dev/roborev/internal/prompt" @@ -47,6 +48,11 @@ type WorkerPool struct { agentCooldowns map[string]time.Time // agent name -> expiry agentCooldownsMu sync.RWMutex + // classify is the rate-limit/quota classifier. Defaults to + // agentlimit.Classify; tests substitute a stub by setting this + // field directly after construction (test-only access). + classify agentlimit.ClassifyFunc + // Output capture for tail command outputBuffers *OutputBuffer @@ -70,6 +76,7 @@ func NewWorkerPool(db *storage.DB, cfgGetter ConfigGetter, numWorkers int, broad pendingCancels: make(map[int64]bool), agentCooldowns: make(map[string]time.Time), outputBuffers: NewOutputBuffer(512*1024, 4*1024*1024), // 512KB/job, 4MB total + classify: agentlimit.Classify, } } @@ -787,15 +794,28 @@ func (wp *WorkerPool) failOrRetryAgent(workerID string, job *storage.ReviewJob, } func (wp *WorkerPool) failOrRetryInner(workerID string, job *storage.ReviewJob, agentName string, errorMsg string, agentError bool) { - // Quota errors skip retries entirely — cool down the agent and - // attempt failover or fail with a quota-prefixed error. - if agentError && isQuotaError(errorMsg) { - dur := parseQuotaCooldown(errorMsg, defaultCooldown) - wp.cooldownAgent(agentName, time.Now().Add(dur)) - log.Printf("[%s] Agent %s quota exhausted, cooldown %v", - workerID, agentName, dur) - wp.failoverOrFail(workerID, job, agentName, errorMsg) - return + // Quota and session-limit errors skip retries entirely — cool down + // the agent and attempt failover or fail. Behavior matches the + // original isQuotaError branch; classification now lives in + // internal/agentlimit so the CLI fix loop can share it. + if agentError { + cls := wp.classify(agent.CanonicalName(agentName), errorMsg) + if cls.Kind == agentlimit.KindQuota || cls.Kind == agentlimit.KindSession { + dur := defaultCooldown + if cls.CooldownFor > 0 { + dur = cls.CooldownFor + } + if !cls.ResetAt.IsZero() { + if until := time.Until(cls.ResetAt); until > 0 { + dur = until + } + } + wp.cooldownAgent(agentName, time.Now().Add(dur)) + log.Printf("[%s] Agent %s quota exhausted, cooldown %v", + workerID, agentName, dur) + wp.failoverOrFail(workerID, job, agentName, errorMsg) + return + } } if agentError && isContextWindowError(errorMsg) { wp.failoverOrFailNonRetryableAgent(workerID, job, agentName, errorMsg) @@ -965,32 +985,6 @@ func (wp *WorkerPool) broadcastFailed(job *storage.ReviewJob, agentName, errorMs // defaultCooldown is the fallback duration when the error message doesn't // contain a parseable "reset after" token. const defaultCooldown = 30 * time.Minute -const minCooldown = 1 * time.Minute -const maxCooldown = 24 * time.Hour - -// isQuotaError returns true if the error message indicates a hard API -// quota exhaustion (case-insensitive). Transient rate-limit/429 errors -// are excluded — those should go through normal retries, not cooldown. -func isQuotaError(errMsg string) bool { - lower := strings.ToLower(errMsg) - patterns := []string{ - "resource exhausted", - "quota exceeded", - "quota_exceeded", - "quota exhausted", - "quota_exhausted", - "insufficient_quota", - "exhausted your capacity", - "capacity_exhausted", - "capacity exhausted", - } - for _, p := range patterns { - if strings.Contains(lower, p) { - return true - } - } - return false -} func isContextWindowError(errMsg string) bool { lower := strings.ToLower(errMsg) @@ -1010,34 +1004,6 @@ func isContextWindowError(errMsg string) bool { return false } -// parseQuotaCooldown extracts a Go-format duration from a "reset after -// " substring. Returns fallback if not found or unparseable. -func parseQuotaCooldown(errMsg string, fallback time.Duration) time.Duration { - lower := strings.ToLower(errMsg) - idx := strings.Index(lower, "reset after ") - if idx < 0 { - return fallback - } - rest := errMsg[idx+len("reset after "):] - // Take the next whitespace-delimited token as a duration - token := rest - if sp := strings.IndexAny(rest, " \t\n,;)"); sp > 0 { - token = rest[:sp] - } - token = strings.TrimRight(token, ".,;:)]}") - d, err := time.ParseDuration(token) - if err != nil || d <= 0 { - return fallback - } - if d < minCooldown { - return minCooldown - } - if d > maxCooldown { - return maxCooldown - } - return d -} - // cooldownAgent sets or extends the cooldown expiry for an agent. // Only extends — never shortens an existing cooldown. func (wp *WorkerPool) cooldownAgent(name string, until time.Time) { diff --git a/internal/daemon/worker_test.go b/internal/daemon/worker_test.go index 8cf18e3f..23cdccec 100644 --- a/internal/daemon/worker_test.go +++ b/internal/daemon/worker_test.go @@ -913,115 +913,6 @@ func TestWorkerPoolCancelJobFinalCheckDeadlockSafe(t *testing.T) { } } -func TestIsQuotaError(t *testing.T) { - tests := []struct { - errMsg string - want bool - }{ - // Hard quota exhaustion — should trigger cooldown/skip - {"quota exceeded for model", true}, - {"QUOTA_EXCEEDED: limit reached", true}, - {"quota exhausted, reset after 8h", true}, - {"QUOTA_EXHAUSTED: try later", true}, - {"insufficient_quota: limit reached", true}, - {"You have exhausted your capacity", true}, - {"RESOURCE EXHAUSTED: try later", true}, - {"MODEL_CAPACITY_EXHAUSTED: The model is overloaded", true}, - {"capacity_exhausted", true}, - // Transient rate limits — should NOT trigger cooldown (use retries) - {"Rate limit reached", false}, - {"rate_limit_error: too fast", false}, - {"Too Many Requests (429)", false}, - {"HTTP 429: slow down", false}, - {"status 429 received from API", false}, - // Other non-quota errors - {"connection reset by peer", false}, - {"timeout after 30s", false}, - {"agent not found", false}, - {"disk quota full", false}, - {"", false}, - } - for _, tt := range tests { - t.Run(tt.errMsg, func(t *testing.T) { - if got := isQuotaError(tt.errMsg); got != tt.want { - assert.Condition(t, func() bool { - return false - }, "isQuotaError(%q) = %v, want %v", - tt.errMsg, got, tt.want) - } - }) - } -} - -func TestParseQuotaCooldown(t *testing.T) { - tests := []struct { - name string - errMsg string - fallback time.Duration - want time.Duration - }{ - { - name: "extracts go duration", - errMsg: "quota exhausted, reset after 8h26m13s please wait", - fallback: 30 * time.Minute, - want: 8*time.Hour + 26*time.Minute + 13*time.Second, - }, - { - name: "no reset token falls back", - errMsg: "quota exceeded for model gemini-2.5-pro", - fallback: 30 * time.Minute, - want: 30 * time.Minute, - }, - { - name: "unparseable duration falls back", - errMsg: "reset after bogus", - fallback: 15 * time.Minute, - want: 15 * time.Minute, - }, - { - name: "duration at end of string", - errMsg: "reset after 2h30m", - fallback: 30 * time.Minute, - want: 2*time.Hour + 30*time.Minute, - }, - { - name: "trailing punctuation trimmed", - errMsg: "reset after 8h26m13s.", - fallback: 30 * time.Minute, - want: 8*time.Hour + 26*time.Minute + 13*time.Second, - }, - { - name: "trailing paren trimmed", - errMsg: "reset after 1h30m)", - fallback: 30 * time.Minute, - want: 1*time.Hour + 30*time.Minute, - }, - { - name: "clamped to max 24h", - errMsg: "reset after 99999h", - fallback: 30 * time.Minute, - want: 24 * time.Hour, - }, - { - name: "clamped to min 1m", - errMsg: "reset after 5s", - fallback: 30 * time.Minute, - want: 1 * time.Minute, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := parseQuotaCooldown(tt.errMsg, tt.fallback) - if got != tt.want { - assert.Condition(t, func() bool { - return false - }, "parseQuotaCooldown() = %v, want %v", - got, tt.want) - } - }) - } -} - func TestAgentCooldown(t *testing.T) { cfg := config.DefaultConfig() pool := NewWorkerPool(nil, NewStaticConfig(cfg), 1, NewBroadcaster(), nil, nil) From b76a7741971109e48ac765e601cab0a11e90f557 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 15:08:29 -0500 Subject: [PATCH 11/24] fix(agentlimit): address roborev review findings 18483/18487/18498 - ParseResetTime doc comment was stale after the DST fix; update to describe at-or-before rollover and DST-safe wall-clock semantics. - TestParseResetTimeAcrossDST: real IANA-zone test (America/New_York) that exercises the spring-forward and fall-back transitions where Add(24h) would land an hour off but time.Date day arithmetic does not. - Plan: formatAgentLimitMessage now branches on cls.Kind for the user- facing label so a Gemini/Codex KindQuota abort says "quota limit" instead of "session limit". - Plan: TestRunFixBatchAbortsOnQuotaError uses batchSize=1 (forces one job per agent call so agentCalls=1 unambiguously means abort) and drops the brittle "second job not reviewed" assertion that would fail because runFixBatch prefetches all reviews up-front. - Spec: replace "try again at 17:42 UTC" with "try again at 17:42" and document that timezone suffixes are out of scope until a real session-cap message is captured. Closes review jobs 18483, 18487, 18498. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-05-agent-rate-limit-handling.md | 48 +++++++++++++++---- ...-05-05-agent-rate-limit-handling-design.md | 7 ++- internal/agentlimit/parse.go | 9 ++-- internal/agentlimit/parse_test.go | 46 ++++++++++++++++++ 4 files changed, 95 insertions(+), 15 deletions(-) diff --git a/docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md b/docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md index 67734b3e..9e4afdfb 100644 --- a/docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md +++ b/docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md @@ -928,7 +928,11 @@ func (e *agentLimitError) Error() string { // formatAgentLimitMessage builds the user-facing abort message. Pulled // out so tests can assert against it without depending on time.Now. +// The label ("quota" / "session limit" / "rate limit") is derived from +// cls.Kind so a Gemini/Codex KindQuota abort doesn't mis-report itself +// as a session-cap. func formatAgentLimitMessage(cls agentlimit.Classification, now time.Time) string { + label := agentLimitLabel(cls.Kind) var dur time.Duration switch { case !cls.ResetAt.IsZero(): @@ -939,30 +943,46 @@ func formatAgentLimitMessage(cls agentlimit.Classification, now time.Time) strin switch { case dur > 0 && !cls.ResetAt.IsZero(): return fmt.Sprintf( - "agent %s hit a session limit. Cooldown until %s (in %s). "+ + "agent %s hit a %s. Cooldown until %s (in %s). "+ "Re-run after that, or pass --agent to switch.", cls.Agent, + label, cls.ResetAt.Format("3:04 PM"), dur.Round(time.Minute), ) case dur > 0: return fmt.Sprintf( - "agent %s hit a session limit. Cooldown for ~%s. "+ + "agent %s hit a %s. Cooldown for ~%s. "+ "Re-run after that, or pass --agent to switch.", cls.Agent, + label, dur.Round(time.Minute), ) default: flat := strings.ReplaceAll(cls.Message, "\n", " ") return fmt.Sprintf( - "agent %s hit a session limit (unknown reset time). "+ + "agent %s hit a %s (unknown reset time). "+ "Re-run later, or pass --agent to switch. "+ "Original error: %s", cls.Agent, + label, truncateString(flat, 200), ) } } + +func agentLimitLabel(k agentlimit.Kind) string { + switch k { + case agentlimit.KindSession: + return "session limit" + case agentlimit.KindQuota: + return "quota limit" + case agentlimit.KindTransient: + return "rate limit" + default: + return "rate limit" + } +} ``` `truncateString` already lives at `cmd/roborev/fix.go:796` — reuse it. @@ -1266,12 +1286,16 @@ func TestRunFixBatchAbortsOnQuotaError(t *testing.T) { } _, err = runWithOutput(t, repo.Dir, func(cmd *cobra.Command) error { + // batchSize=1 forces one job per agent call. With batchSize=0, + // runFixBatch may pack both jobs into a single batch, making + // agentCalls=1 ambiguous between "aborted before second batch" + // and "both jobs in one batch". batchSize=1 is unambiguous. return runFixBatch( cmd, []int64{10, 20}, "", false, false, false, - 0, + 1, opts, &fixSessionTracker{base: agent.NewTestAgent(), out: io.Discard}, ) @@ -1284,13 +1308,17 @@ func TestRunFixBatchAbortsOnQuotaError(t *testing.T) { assert := assert.New(t) assert.Equal(agentlimit.KindQuota, lim.Classification.Kind) assert.Equal(1, classifyCalls, "classifier should be called exactly once") + // agentCalls is the load-bearing assertion: with batchSize=1 and two + // jobs, two agent invocations would mean the loop continued past the + // abort. Exactly one means the abort short-circuits the second batch. assert.Equal(1, agentCalls, "fix agent should be invoked exactly once before abort") + assert.Contains(err.Error(), "quota limit", "Gemini KindQuota label should be 'quota limit', not 'session limit'") - // Job 20 must not have been processed past initial discovery. - mu.Lock() - reviewed := append([]int64(nil), reviewedJobIDs...) - mu.Unlock() - assert.NotContains(reviewed, int64(20), "second job must not be reviewed after abort") + // Note: do NOT assert `reviewedJobIDs` does not contain job 20 — + // runFixBatch prefetches every job's review at the start of the + // function (before any agent call), so /api/review fires for all + // queued jobs regardless of when the loop aborts. The agentCalls + // counter above is the correct measure of abort behavior. } ``` @@ -1407,7 +1435,7 @@ roborev fix Expected: the fix loop aborts after the first job with a message like ``` -agent gemini hit a session limit. Cooldown for ~30m. Re-run after that, or pass --agent to switch. +agent gemini hit a quota limit. Cooldown for ~30m. Re-run after that, or pass --agent to switch. ``` If no exhausted agent is reachable, this step is informational only — coverage is in the unit tests. diff --git a/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md b/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md index cb6fb5e3..4cd69088 100644 --- a/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md +++ b/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md @@ -255,10 +255,13 @@ Two formats need to be supported in the shared parser: - Relative duration: `reset after 48m20s`, `try again in 2h13m`, `retry after 5m`. Existing worker.go logic handles this for Gemini — it moves into `agentlimit.parseResetDuration` verbatim. -- Absolute time: `resets at 5:42 PM`, `try again at 17:42 UTC`. New +- Absolute time: `resets at 5:42 PM`, `try again at 17:42`. New parser, returns `time.Time` interpreted in the local timezone with a same-day or next-day disambiguation (next-day if the parsed time is - already in the past). + already in the past). Day-1 scope is bare clock times only — trailing + qualifiers like `UTC`, `EST`, or `today` defeat `time.Parse` and + produce zero. Extending coverage to timezone-suffixed messages is + deferred until a real Claude session-cap message is captured. If neither format matches, both `ResetAt` and `CooldownFor` are zero — the caller (worker or CLI) applies its own default fallback (30m diff --git a/internal/agentlimit/parse.go b/internal/agentlimit/parse.go index 627b5f85..21077903 100644 --- a/internal/agentlimit/parse.go +++ b/internal/agentlimit/parse.go @@ -47,9 +47,12 @@ func ParseResetDuration(errMsg string) time.Duration { // clock time in the local timezone. Returns the zero time.Time if no // recognized phrase is present or the time is unparseable. // -// If the parsed clock time is earlier than now-on-the-same-day, the -// returned time rolls forward by 24 hours so callers that compute -// "time until reset" never get a negative duration. +// If the parsed clock time is at or before now-on-the-same-day, the +// returned time rolls forward to the same wall-clock time on the next +// local calendar day so callers that compute "time until reset" never +// get a negative duration. Rollover is DST-safe via time.Date day +// arithmetic; on a 23-hour or 25-hour day, Go normalizes the offset +// so the returned wall-clock time matches the user's local clock. func ParseResetTime(errMsg string) time.Time { return parseResetTimeAt(errMsg, time.Now()) } diff --git a/internal/agentlimit/parse_test.go b/internal/agentlimit/parse_test.go index 22f51a71..3201f667 100644 --- a/internal/agentlimit/parse_test.go +++ b/internal/agentlimit/parse_test.go @@ -108,3 +108,49 @@ func TestParseResetTimeRespectsLocation(t *testing.T) { assert.Equal(t, want, got) assert.Equal(t, est, got.Location()) } + +// TestParseResetTimeAcrossDST exercises the rollover code path through +// real DST transitions where Add(24*time.Hour) would land an hour off. +// time.FixedZone has no DST rules, so it cannot exercise this branch — +// only a real IANA zone with a transition does. +func TestParseResetTimeAcrossDST(t *testing.T) { + ny, err := time.LoadLocation("America/New_York") + if err != nil { + t.Skipf("America/New_York tzdata unavailable: %v", err) + } + + cases := []struct { + name string + now time.Time + msg string + want time.Time + }{ + { + // Spring forward: 2026-03-08 02:00 EST does not exist; the + // clock jumps to 03:00 EDT. A reset advertised at 9:00 AM on + // the morning of the transition, evaluated when "now" is the + // previous evening, must land at 9:00 AM EDT — same wall-clock + // time, despite the day having only 23 hours. + name: "spring forward rollover preserves wall-clock", + now: time.Date(2026, 3, 7, 21, 0, 0, 0, ny), // 9pm EST + msg: "limit resets at 9:00 AM", + want: time.Date(2026, 3, 8, 9, 0, 0, 0, ny), // 9am EDT next day + }, + { + // Fall back: 2026-11-01 has 25 hours. A reset advertised at + // 9:00 AM, evaluated the prior evening, must land at 9:00 AM + // EST — same wall-clock time, longer day notwithstanding. + name: "fall back rollover preserves wall-clock", + now: time.Date(2026, 10, 31, 22, 0, 0, 0, ny), // 10pm EDT + msg: "limit resets at 9:00 AM", + want: time.Date(2026, 11, 1, 9, 0, 0, 0, ny), // 9am EST next day + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := parseResetTimeAt(tc.msg, tc.now) + assert.Equal(t, tc.want, got, + "DST-safe rollover should preserve wall-clock time") + }) + } +} From b34ea49958ee6c6e28cc8e7e19c34fa8fe4ed14f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 15:11:34 -0500 Subject: [PATCH 12/24] test(daemon): cover session-limit cooldown via injected classifier --- internal/daemon/worker_test.go | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/internal/daemon/worker_test.go b/internal/daemon/worker_test.go index 23cdccec..f9950f99 100644 --- a/internal/daemon/worker_test.go +++ b/internal/daemon/worker_test.go @@ -10,6 +10,7 @@ import ( "runtime" "github.com/roborev-dev/roborev/internal/agent" + "github.com/roborev-dev/roborev/internal/agentlimit" "github.com/roborev-dev/roborev/internal/config" gitpkg "github.com/roborev-dev/roborev/internal/git" "github.com/roborev-dev/roborev/internal/prompt" @@ -1190,6 +1191,38 @@ func TestFailOrRetryInner_NonQuotaStillRetries(t *testing.T) { } } +func TestFailOrRetryInner_SessionLimitCoolsDownAndFailsOver(t *testing.T) { + tc := newWorkerTestContext(t, 1) + sha := testutil.GetHeadSHA(t, tc.TmpDir) + job := tc.createAndClaimJob(t, sha, testWorkerID) + + // Stub classifier: any error message containing the marker yields + // KindSession with a 1-hour CooldownFor. Tests the seam without + // depending on real Claude wording. + tc.Pool.classify = func(agentName, msg string) agentlimit.Classification { + if strings.Contains(msg, "MARKER-SESSION-LIMIT") { + return agentlimit.Classification{ + Kind: agentlimit.KindSession, + Agent: agentName, + CooldownFor: 1 * time.Hour, + Message: msg, + } + } + return agentlimit.Classification{Kind: agentlimit.KindNone, Agent: agentName, Message: msg} + } + + tc.Pool.failOrRetryAgent(testWorkerID, job, "test", "boom MARKER-SESSION-LIMIT") + + assert := assert.New(t) + assert.True(tc.Pool.isAgentCoolingDown("test"), "agent should be in cooldown") + + // retry_count must NOT have advanced — quota/session errors skip + // retries entirely (matches the original isQuotaError semantics). + got, err := tc.DB.GetJobRetryCount(job.ID) + require.NoError(t, err) + assert.Equal(0, got, "session-limit error must not consume a retry slot") +} + func TestFailoverOrFail_FailsOverToBackup(t *testing.T) { tc := newWorkerTestContext(t, 1) sha := testutil.GetHeadSHA(t, tc.TmpDir) From 9d12ba837943329a48d0c6c01acf6a89bbff4a47 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 15:15:46 -0500 Subject: [PATCH 13/24] test(daemon): rename SessionLimit test to reflect what it verifies Test asserts cooldown + retry_count==0 with no backup configured, so the failover path is not exercised. Rename clarifies the actual behavior under test. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/daemon/worker_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/daemon/worker_test.go b/internal/daemon/worker_test.go index f9950f99..d5922785 100644 --- a/internal/daemon/worker_test.go +++ b/internal/daemon/worker_test.go @@ -1191,7 +1191,7 @@ func TestFailOrRetryInner_NonQuotaStillRetries(t *testing.T) { } } -func TestFailOrRetryInner_SessionLimitCoolsDownAndFailsOver(t *testing.T) { +func TestFailOrRetryInner_SessionLimitCoolsDownAndSkipsRetries(t *testing.T) { tc := newWorkerTestContext(t, 1) sha := testutil.GetHeadSHA(t, tc.TmpDir) job := tc.createAndClaimJob(t, sha, testWorkerID) From de71ad3eb98971ca489d44a83d9aef7b83fb0436 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 15:19:12 -0500 Subject: [PATCH 14/24] feat(daemon): log WARN for unclassified agent errors Refactor failOrRetryInner to classify the agent error once at the top of the agent-error path via a switch over Classification.Kind. The existing KindQuota/KindSession branch is preserved verbatim. KindNone now emits a single WARN line (with newlines flattened and truncated to 200 runes) when errorMsg is non-empty, so future unmatched session-cap or quota wordings get captured for adding a rule. KindTransient and KindNone both fall through to the unchanged isContextWindowError check and retry path. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/daemon/worker.go | 13 ++++++++++++- internal/daemon/worker_test.go | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/internal/daemon/worker.go b/internal/daemon/worker.go index b05dc270..632e6f22 100644 --- a/internal/daemon/worker.go +++ b/internal/daemon/worker.go @@ -800,7 +800,8 @@ func (wp *WorkerPool) failOrRetryInner(workerID string, job *storage.ReviewJob, // internal/agentlimit so the CLI fix loop can share it. if agentError { cls := wp.classify(agent.CanonicalName(agentName), errorMsg) - if cls.Kind == agentlimit.KindQuota || cls.Kind == agentlimit.KindSession { + switch cls.Kind { + case agentlimit.KindQuota, agentlimit.KindSession: dur := defaultCooldown if cls.CooldownFor > 0 { dur = cls.CooldownFor @@ -815,6 +816,16 @@ func (wp *WorkerPool) failOrRetryInner(workerID string, job *storage.ReviewJob, workerID, agentName, dur) wp.failoverOrFail(workerID, job, agentName, errorMsg) return + case agentlimit.KindNone: + if errorMsg != "" { + preview := strings.ReplaceAll(errorMsg, "\n", " ") + preview = strings.ReplaceAll(preview, "\r", "") + log.Printf("[%s] unclassified agent error from %s: %s", + workerID, agentName, truncateRunes(preview, 200)) + } + // fall through to context-window / retry handling + case agentlimit.KindTransient: + // fall through to retry handling } } if agentError && isContextWindowError(errorMsg) { diff --git a/internal/daemon/worker_test.go b/internal/daemon/worker_test.go index d5922785..a95e3774 100644 --- a/internal/daemon/worker_test.go +++ b/internal/daemon/worker_test.go @@ -1,8 +1,10 @@ package daemon import ( + "bytes" "context" "fmt" + "log" "os" "os/exec" "path/filepath" @@ -1223,6 +1225,31 @@ func TestFailOrRetryInner_SessionLimitCoolsDownAndSkipsRetries(t *testing.T) { assert.Equal(0, got, "session-limit error must not consume a retry slot") } +func TestFailOrRetryInner_UnmatchedAgentErrorLogsWarn(t *testing.T) { + tc := newWorkerTestContext(t, 1) + + // Capture log output by swapping the standard logger's writer. + var buf bytes.Buffer + origOutput := log.Writer() + log.SetOutput(&buf) + t.Cleanup(func() { log.SetOutput(origOutput) }) + + sha := testutil.GetHeadSHA(t, tc.TmpDir) + job := tc.createAndClaimJob(t, sha, testWorkerID) + + tc.Pool.classify = func(agentName, msg string) agentlimit.Classification { + return agentlimit.Classification{Kind: agentlimit.KindNone, Agent: agentName, Message: msg} + } + + tc.Pool.failOrRetryAgent(testWorkerID, job, "test", "some brand new error wording from a future agent") + + logged := buf.String() + assert := assert.New(t) + assert.Contains(logged, "unclassified agent error", "expected WARN line for unmatched error") + assert.Contains(logged, "test", "log line should include agent name") + assert.Contains(logged, "some brand new error wording", "log line should include error preview") +} + func TestFailoverOrFail_FailsOverToBackup(t *testing.T) { tc := newWorkerTestContext(t, 1) sha := testutil.GetHeadSHA(t, tc.TmpDir) From 073f8163a986ee6b9e200f7811d59e49614c234f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 15:29:09 -0500 Subject: [PATCH 15/24] feat(fix): add classifier hook and agentLimitError plumbing --- cmd/roborev/fix.go | 89 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/cmd/roborev/fix.go b/cmd/roborev/fix.go index 05bc868d..bd8be53f 100644 --- a/cmd/roborev/fix.go +++ b/cmd/roborev/fix.go @@ -14,6 +14,7 @@ import ( "time" "github.com/roborev-dev/roborev/internal/agent" + "github.com/roborev-dev/roborev/internal/agentlimit" "github.com/roborev-dev/roborev/internal/config" "github.com/roborev-dev/roborev/internal/daemon" "github.com/roborev-dev/roborev/internal/git" @@ -141,6 +142,7 @@ Examples: minSeverity: minSeverity, quiet: quiet, resume: resume, + classify: agentlimit.Classify, } roots, err := resolveCurrentRepoRoots() @@ -228,6 +230,87 @@ type fixOptions struct { minSeverity string quiet bool resume bool + + // classify is the rate-limit classifier. Defaults to + // agentlimit.Classify in the production cobra command's RunE; tests + // inject a stub to drive deterministic KindQuota / KindSession + // outcomes without depending on real agent error wording. + classify agentlimit.ClassifyFunc +} + +// agentLimitError is returned by the fix loop when the configured agent +// hits a quota or session limit. The fix command surfaces it as the +// process exit error so users see the reset time and a hint to retry. +// +//nolint:unused // wired up by Tasks 9-10 (fixSingleJob / runFixBatch abort paths) +type agentLimitError struct { + Classification agentlimit.Classification +} + +//nolint:unused // wired up by Tasks 9-10 (fixSingleJob / runFixBatch abort paths) +func (e *agentLimitError) Error() string { + return formatAgentLimitMessage(e.Classification, time.Now()) +} + +// formatAgentLimitMessage builds the user-facing abort message. Pulled +// out so tests can assert against it without depending on time.Now. +// The label ("quota" / "session limit" / "rate limit") is derived from +// cls.Kind so a Gemini/Codex KindQuota abort doesn't mis-report itself +// as a session-cap. +// +//nolint:unused // wired up by Tasks 9-10 (fixSingleJob / runFixBatch abort paths) +func formatAgentLimitMessage(cls agentlimit.Classification, now time.Time) string { + label := agentLimitLabel(cls.Kind) + var dur time.Duration + switch { + case !cls.ResetAt.IsZero(): + dur = cls.ResetAt.Sub(now) + case cls.CooldownFor > 0: + dur = cls.CooldownFor + } + switch { + case dur > 0 && !cls.ResetAt.IsZero(): + return fmt.Sprintf( + "agent %s hit a %s. Cooldown until %s (in %s). "+ + "Re-run after that, or pass --agent to switch.", + cls.Agent, + label, + cls.ResetAt.Format("3:04 PM"), + dur.Round(time.Minute), + ) + case dur > 0: + return fmt.Sprintf( + "agent %s hit a %s. Cooldown for ~%s. "+ + "Re-run after that, or pass --agent to switch.", + cls.Agent, + label, + dur.Round(time.Minute), + ) + default: + flat := strings.ReplaceAll(cls.Message, "\n", " ") + return fmt.Sprintf( + "agent %s hit a %s (unknown reset time). "+ + "Re-run later, or pass --agent to switch. "+ + "Original error: %s", + cls.Agent, + label, + truncateString(flat, 200), + ) + } +} + +//nolint:unused // wired up by Tasks 9-10 (fixSingleJob / runFixBatch abort paths) +func agentLimitLabel(k agentlimit.Kind) string { + switch k { + case agentlimit.KindSession: + return "session limit" + case agentlimit.KindQuota: + return "quota limit" + case agentlimit.KindTransient: + return "rate limit" + default: + return "rate limit" + } } // fixJobParams configures a fixJobDirect operation. @@ -828,6 +911,9 @@ func jobVerdict(job *storage.ReviewJob, review *storage.Review) string { } func fixSingleJob(cmd *cobra.Command, repoRoot string, jobID int64, opts fixOptions, tracker *fixSessionTracker) error { + if opts.classify == nil { + opts.classify = agentlimit.Classify + } ctx := cmd.Context() if ctx == nil { ctx = context.Background() @@ -998,6 +1084,9 @@ type batchEntry struct { // runFixBatch discovers jobs (or uses provided IDs), splits them into batches // respecting max prompt size, and runs each batch as a single agent invocation. func runFixBatch(cmd *cobra.Command, jobIDs []int64, branch string, allBranches, explicitBranch, newestFirst bool, batchSize int, opts fixOptions, tracker *fixSessionTracker) error { + if opts.classify == nil { + opts.classify = agentlimit.Classify + } if err := ensureDaemon(); err != nil { return err } From 7f744d90a1e51036f33e764cf8aa5b9fe37a4f82 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 15:41:50 -0500 Subject: [PATCH 16/24] feat(fix): abort fixSingleJob on KindQuota or KindSession After fixJobDirect returns an error, classify it via opts.classify and abort the loop with an agentLimitError on KindQuota/KindSession. Print a warning on KindNone (mirroring the daemon-side WARN). Other kinds (KindTransient) fall through to the existing return path. Removes the //nolint:unused directives from agentLimitError, its Error() method, formatAgentLimitMessage, and agentLimitLabel now that they are wired into the abort path. --- cmd/roborev/fix.go | 20 +++++++++---- cmd/roborev/fix_test.go | 62 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 6 deletions(-) diff --git a/cmd/roborev/fix.go b/cmd/roborev/fix.go index bd8be53f..fc1e18d4 100644 --- a/cmd/roborev/fix.go +++ b/cmd/roborev/fix.go @@ -241,13 +241,10 @@ type fixOptions struct { // agentLimitError is returned by the fix loop when the configured agent // hits a quota or session limit. The fix command surfaces it as the // process exit error so users see the reset time and a hint to retry. -// -//nolint:unused // wired up by Tasks 9-10 (fixSingleJob / runFixBatch abort paths) type agentLimitError struct { Classification agentlimit.Classification } -//nolint:unused // wired up by Tasks 9-10 (fixSingleJob / runFixBatch abort paths) func (e *agentLimitError) Error() string { return formatAgentLimitMessage(e.Classification, time.Now()) } @@ -257,8 +254,6 @@ func (e *agentLimitError) Error() string { // The label ("quota" / "session limit" / "rate limit") is derived from // cls.Kind so a Gemini/Codex KindQuota abort doesn't mis-report itself // as a session-cap. -// -//nolint:unused // wired up by Tasks 9-10 (fixSingleJob / runFixBatch abort paths) func formatAgentLimitMessage(cls agentlimit.Classification, now time.Time) string { label := agentLimitLabel(cls.Kind) var dur time.Duration @@ -299,7 +294,6 @@ func formatAgentLimitMessage(cls agentlimit.Classification, now time.Time) strin } } -//nolint:unused // wired up by Tasks 9-10 (fixSingleJob / runFixBatch abort paths) func agentLimitLabel(k agentlimit.Kind) string { switch k { case agentlimit.KindSession: @@ -1021,6 +1015,20 @@ func fixSingleJob(cmd *cobra.Command, repoRoot string, jobID int64, opts fixOpti } if err != nil { tracker.Reset() + cls := opts.classify(agent.CanonicalName(currentAgent.Name()), err.Error()) + switch cls.Kind { + case agentlimit.KindQuota, agentlimit.KindSession: + return &agentLimitError{Classification: cls} + case agentlimit.KindNone: + if err.Error() != "" && !opts.quiet { + flat := strings.ReplaceAll(err.Error(), "\n", " ") + cmd.PrintErrf( + "warning: unclassified agent error from %s: %s\n", + currentAgent.Name(), + truncateString(flat, 200), + ) + } + } return err } tracker.Capture(capture.SessionID()) diff --git a/cmd/roborev/fix_test.go b/cmd/roborev/fix_test.go index 7a3c918f..bd920ea8 100644 --- a/cmd/roborev/fix_test.go +++ b/cmd/roborev/fix_test.go @@ -24,6 +24,7 @@ import ( "github.com/spf13/cobra" "github.com/roborev-dev/roborev/internal/agent" + "github.com/roborev-dev/roborev/internal/agentlimit" "github.com/roborev-dev/roborev/internal/config" "github.com/roborev-dev/roborev/internal/daemon" "github.com/roborev-dev/roborev/internal/storage" @@ -3601,3 +3602,64 @@ func TestFixCmd_BatchSizeAcceptsBranchFlags(t *testing.T) { }) } } + +func TestFixSingleJobAbortsOnSessionLimit(t *testing.T) { + repo := createTestRepo(t, map[string]string{"main.go": "package main\n"}) + + originalAgent, err := agent.Get("test") + require.NoError(t, err, "get test agent") + agent.Register(&agent.FakeAgent{ + NameStr: "test", + ReviewFn: func(_ context.Context, _, _, _ string, _ io.Writer) (string, error) { + return "", errors.New("simulated claude session cap") + }, + }) + t.Cleanup(func() { agent.Register(originalAgent) }) + + ts, _ := newMockServer(t, MockServerOpts{ + ReviewOutput: "## Issues\n- Found issue", + OnJobs: func(w http.ResponseWriter, r *http.Request) { + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{{ + ID: 99, + Status: storage.JobStatusDone, + Agent: "test", + }}, + }) + }, + }) + patchServerAddr(t, ts.URL) + + cmd, _ := newTestCmd(t) + + resetAt := time.Now().Add(2*time.Hour + 13*time.Minute) + classifyCalls := 0 + opts := fixOptions{ + agentName: "test", + reasoning: "fast", + classify: func(_, msg string) agentlimit.Classification { + classifyCalls++ + return agentlimit.Classification{ + Kind: agentlimit.KindSession, + Agent: "claude-code", + ResetAt: resetAt, + Message: msg, + } + }, + } + + base, err := resolveFixAgent(repo.Dir, opts) + require.NoError(t, err, "resolveFixAgent") + tracker := &fixSessionTracker{base: base, out: io.Discard} + err = fixSingleJob(cmd, repo.Dir, 99, opts, tracker) + require.Error(t, err, "expected abort error, got nil") + + var lim *agentLimitError + require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) + + assert := assert.New(t) + assert.Equal(agentlimit.KindSession, lim.Classification.Kind) + assert.Equal(1, classifyCalls, "classifier should be called exactly once") + assert.Contains(err.Error(), "claude-code") + assert.Contains(err.Error(), "session limit") +} From 7823a65c85d40b3e2467a731fc1e9070b4b2454d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 15:50:25 -0500 Subject: [PATCH 17/24] feat(fix): abort runFixBatch on KindQuota or KindSession Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/roborev/fix.go | 14 +++++ cmd/roborev/fix_test.go | 114 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) diff --git a/cmd/roborev/fix.go b/cmd/roborev/fix.go index fc1e18d4..a9424ebe 100644 --- a/cmd/roborev/fix.go +++ b/cmd/roborev/fix.go @@ -1276,6 +1276,20 @@ func runFixBatch(cmd *cobra.Command, jobIDs []int64, branch string, allBranches, } if err != nil { tracker.Reset() + cls := opts.classify(agent.CanonicalName(currentAgent.Name()), err.Error()) + switch cls.Kind { + case agentlimit.KindQuota, agentlimit.KindSession: + return &agentLimitError{Classification: cls} + case agentlimit.KindNone: + if err.Error() != "" && !opts.quiet { + flat := strings.ReplaceAll(err.Error(), "\n", " ") + cmd.PrintErrf( + "warning: unclassified agent error from %s: %s\n", + currentAgent.Name(), + truncateString(flat, 200), + ) + } + } cmd.Printf("Warning: error in batch %d: %v\n", i+1, err) continue } diff --git a/cmd/roborev/fix_test.go b/cmd/roborev/fix_test.go index bd920ea8..11c9428b 100644 --- a/cmd/roborev/fix_test.go +++ b/cmd/roborev/fix_test.go @@ -3663,3 +3663,117 @@ func TestFixSingleJobAbortsOnSessionLimit(t *testing.T) { assert.Contains(err.Error(), "claude-code") assert.Contains(err.Error(), "session limit") } + +func TestRunFixBatchAbortsOnQuotaError(t *testing.T) { + repo := createTestRepo(t, map[string]string{"main.go": "package main\n"}) + + originalAgent, err := agent.Get("test") + require.NoError(t, err, "get test agent") + agentCalls := 0 + agent.Register(&agent.FakeAgent{ + NameStr: "test", + ReviewFn: func(_ context.Context, _, _, _ string, _ io.Writer) (string, error) { + agentCalls++ + return "", errors.New("agent failed: you have exhausted your capacity") + }, + }) + t.Cleanup(func() { agent.Register(originalAgent) }) + + var mu sync.Mutex + var reviewedJobIDs []int64 + _ = newMockDaemonBuilder(t). + WithHandler("/api/jobs", func(w http.ResponseWriter, r *http.Request) { + id := r.URL.Query().Get("id") + switch id { + case "10", "20": + idNum := int64(10) + if id == "20" { + idNum = 20 + } + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{{ + ID: idNum, + Status: storage.JobStatusDone, + Agent: "test", + }}, + }) + default: + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{}, + "has_more": false, + }) + } + }). + WithHandler("/api/review", func(w http.ResponseWriter, r *http.Request) { + jobID := r.URL.Query().Get("job_id") + mu.Lock() + switch jobID { + case "10": + reviewedJobIDs = append(reviewedJobIDs, 10) + case "20": + reviewedJobIDs = append(reviewedJobIDs, 20) + } + mu.Unlock() + writeJSON(w, storage.Review{ + JobID: 10, + Output: "## Issues\n- Bug", + }) + }). + WithHandler("/api/enqueue", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + }). + Build() + + classifyCalls := 0 + opts := fixOptions{ + agentName: "test", + reasoning: "fast", + classify: func(_, msg string) agentlimit.Classification { + classifyCalls++ + return agentlimit.Classification{ + Kind: agentlimit.KindQuota, + Agent: "gemini", + CooldownFor: 30 * time.Minute, + Message: msg, + } + }, + } + + base, err := resolveFixAgent(repo.Dir, opts) + require.NoError(t, err, "resolveFixAgent") + + _, err = runWithOutput(t, repo.Dir, func(cmd *cobra.Command) error { + // batchSize=1 forces one job per agent call. With batchSize=0, + // runFixBatch may pack both jobs into a single batch, making + // agentCalls=1 ambiguous between "aborted before second batch" + // and "both jobs in one batch". batchSize=1 is unambiguous. + return runFixBatch( + cmd, + []int64{10, 20}, + "", + false, false, false, + 1, + opts, + &fixSessionTracker{base: base, out: io.Discard}, + ) + }) + require.Error(t, err, "expected batch to abort with agentLimitError") + + var lim *agentLimitError + require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) + + assert := assert.New(t) + assert.Equal(agentlimit.KindQuota, lim.Classification.Kind) + assert.Equal(1, classifyCalls, "classifier should be called exactly once") + // agentCalls is the load-bearing assertion: with batchSize=1 and two + // jobs, two agent invocations would mean the loop continued past the + // abort. Exactly one means the abort short-circuits the second batch. + assert.Equal(1, agentCalls, "fix agent should be invoked exactly once before abort") + assert.Contains(err.Error(), "quota limit", "Gemini KindQuota label should be 'quota limit', not 'session limit'") + + // Note: do NOT assert `reviewedJobIDs` does not contain job 20 — + // runFixBatch prefetches every job's review at the start of the + // function (before any agent call), so /api/review fires for all + // queued jobs regardless of when the loop aborts. The agentCalls + // counter above is the correct measure of abort behavior. +} From 62f00dd60346d70750b5ab8400070ef7aee2abb9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 15:57:09 -0500 Subject: [PATCH 18/24] fix: address roborev review findings 18506 and 18513 - 18506 (worker_test.go): tighten unclassified-agent log assertion to "from test:" so it cannot pass on the workerID's "test-worker" substring. - 18513 (fix.go runFixWithSeen): detect agentLimitError via errors.As before discovery-mode warn-and-continue, otherwise quota/session aborts get demoted to warnings and the re-query loop keeps invoking the exhausted agent until every queued job is burned through. - Regression test TestRunFixWithSeenDiscoveryAbortsOnAgentLimit covers the discovery path: KindQuota classification produces an agentLimitError that propagates out of seen != nil mode without marking the second job seen. Closes review jobs 18506 and 18513. --- cmd/roborev/fix.go | 8 ++++ cmd/roborev/fix_test.go | 88 ++++++++++++++++++++++++++++++++++ internal/daemon/worker_test.go | 2 +- 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/cmd/roborev/fix.go b/cmd/roborev/fix.go index a9424ebe..2067a178 100644 --- a/cmd/roborev/fix.go +++ b/cmd/roborev/fix.go @@ -498,6 +498,14 @@ func runFixWithSeen(cmd *cobra.Command, jobIDs []int64, opts fixOptions, seen ma if isConnectionError(err) { return fmt.Errorf("daemon connection lost: %w", err) } + // Agent quota/session-limit aborts must propagate even in + // discovery mode — otherwise the re-query loop keeps + // invoking the exhausted agent until every queued job is + // burned through with the same error. + var lim *agentLimitError + if errors.As(err, &lim) { + return err + } // In discovery mode (seen != nil), log a warning and // continue best-effort. For explicit job IDs (seen == // nil), return the error so the CLI exits non-zero. diff --git a/cmd/roborev/fix_test.go b/cmd/roborev/fix_test.go index 11c9428b..9b1da9a7 100644 --- a/cmd/roborev/fix_test.go +++ b/cmd/roborev/fix_test.go @@ -3777,3 +3777,91 @@ func TestRunFixBatchAbortsOnQuotaError(t *testing.T) { // queued jobs regardless of when the loop aborts. The agentCalls // counter above is the correct measure of abort behavior. } + +func TestRunFixWithSeenDiscoveryAbortsOnAgentLimit(t *testing.T) { + // Discovery mode (seen != nil) normally warns-and-continues on + // per-job errors, but agent quota/session-limit aborts must + // propagate so the re-query loop doesn't keep invoking the + // exhausted agent. + repo := createTestRepo(t, map[string]string{"main.go": "package main\n"}) + + originalAgent, err := agent.Get("test") + require.NoError(t, err, "get test agent") + agentCalls := 0 + agent.Register(&agent.FakeAgent{ + NameStr: "test", + ReviewFn: func(_ context.Context, _, _, _ string, _ io.Writer) (string, error) { + agentCalls++ + return "", errors.New("agent failed: you have exhausted your capacity") + }, + }) + t.Cleanup(func() { agent.Register(originalAgent) }) + + _ = newMockDaemonBuilder(t). + WithHandler("/api/jobs", func(w http.ResponseWriter, r *http.Request) { + id := r.URL.Query().Get("id") + var idNum int64 + switch id { + case "10": + idNum = 10 + case "20": + idNum = 20 + default: + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{}, + "has_more": false, + }) + return + } + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{{ + ID: idNum, + Status: storage.JobStatusDone, + Agent: "test", + }}, + }) + }). + WithHandler("/api/review", func(w http.ResponseWriter, r *http.Request) { + writeJSON(w, storage.Review{Output: "## Issues\n- Bug"}) + }). + WithHandler("/api/enqueue", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + }). + Build() + + classifyCalls := 0 + opts := fixOptions{ + agentName: "test", + reasoning: "fast", + classify: func(_, msg string) agentlimit.Classification { + classifyCalls++ + return agentlimit.Classification{ + Kind: agentlimit.KindQuota, + Agent: "gemini", + CooldownFor: 30 * time.Minute, + Message: msg, + } + }, + } + + seen := make(map[int64]bool) + _, err = runWithOutput(t, repo.Dir, func(cmd *cobra.Command) error { + base, baseErr := resolveFixAgent(repo.Dir, opts) + require.NoError(t, baseErr, "resolveFixAgent") + tracker := &fixSessionTracker{base: base, out: io.Discard} + return runFixWithSeen(cmd, []int64{10, 20}, opts, seen, tracker) + }) + require.Error(t, err, "expected discovery mode to propagate agentLimitError") + + var lim *agentLimitError + require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) + + assert := assert.New(t) + assert.Equal(agentlimit.KindQuota, lim.Classification.Kind) + assert.Equal(1, classifyCalls, "classifier should be called exactly once") + // Load-bearing: only one agent invocation means the abort short- + // circuited the loop. Two would mean discovery mode demoted the + // agentLimitError to a warning and continued. + assert.Equal(1, agentCalls, "fix agent should be invoked exactly once before abort") + assert.False(seen[20], "second job must not be marked seen after abort") +} diff --git a/internal/daemon/worker_test.go b/internal/daemon/worker_test.go index a95e3774..c88b6073 100644 --- a/internal/daemon/worker_test.go +++ b/internal/daemon/worker_test.go @@ -1246,7 +1246,7 @@ func TestFailOrRetryInner_UnmatchedAgentErrorLogsWarn(t *testing.T) { logged := buf.String() assert := assert.New(t) assert.Contains(logged, "unclassified agent error", "expected WARN line for unmatched error") - assert.Contains(logged, "test", "log line should include agent name") + assert.Contains(logged, "from test:", "log line should include agent name as 'from :'") assert.Contains(logged, "some brand new error wording", "log line should include error preview") } From a0bb88350873725c9a7e6a1e329baa75be06c389 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 15:58:30 -0500 Subject: [PATCH 19/24] docs: update CLAUDE.md cooldown reference to point at agentlimit The retry/failover section still pointed at the removed isQuotaError helper. Update it to describe the agentlimit.Classify-based path with the new KindQuota/KindSession kinds and ParseResetDuration/ParseResetTime helpers that own the cooldown extraction. --- CLAUDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index 1b362706..d8347207 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -210,7 +210,7 @@ The internal `lookupFieldByTag()` helper resolves these via reflection on the st - **Retries**: Up to 3 retries for transient failures (`db.RetryJob`). Resets status to queued. - **Failover**: After retries exhausted (or on quota errors), switches to backup agent via `db.FailoverJob`. Resets retry_count, sets backup agent/model. -- **Cooldown**: Quota exhaustion errors (`isQuotaError`) trigger per-agent cooldown (default 30 min, parsed from error message). Cooldowns are tracked in-memory with RWMutex. +- **Cooldown**: Quota exhaustion errors (classified by `internal/agentlimit.Classify` as `KindQuota`/`KindSession`) trigger per-agent cooldown (default 30 min, parsed from the error message via `agentlimit.ParseResetDuration`/`ParseResetTime`). Cooldowns are tracked in-memory with RWMutex. ### Workflow derivation for failover From a3f244b024c87583cc9c98781843684b3d3c01f2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 16:54:21 -0500 Subject: [PATCH 20/24] remove specs --- .../2026-05-05-agent-rate-limit-handling.md | 1474 ----------------- ...-05-05-agent-rate-limit-handling-design.md | 425 ----- 2 files changed, 1899 deletions(-) delete mode 100644 docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md delete mode 100644 docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md diff --git a/docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md b/docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md deleted file mode 100644 index 9e4afdfb..00000000 --- a/docs/superpowers/plans/2026-05-05-agent-rate-limit-handling.md +++ /dev/null @@ -1,1474 +0,0 @@ -# Agent Rate-Limit and Session-Cap Handling Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add a shared rate-limit classifier so the daemon worker keeps its current cooldown-then-failover behavior for Gemini/Codex quota errors and the foreground `roborev fix` loop aborts with a reset-time message instead of looping fruitlessly when the configured agent hits a quota or session cap. - -**Architecture:** A new `internal/agentlimit` package owns the detection logic — pattern table, kind enum, reset parsing — with no I/O. The daemon worker calls it via a swappable `classify` field on `WorkerPool` (default `agentlimit.Classify`), and the CLI fix loop calls it via a new `classify` field on `fixOptions`. Tests inject stubs that return literal `agentlimit.Classification` values for deterministic outcomes. - -**Tech Stack:** Go 1.x stdlib only (no new dependencies), testify (`require`/`assert`) for test assertions per project conventions. - -**Spec:** `docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md` - ---- - -## File Structure - -**Create:** -- `internal/agentlimit/agentlimit.go` — `Kind`, `Classification`, `rule`, `defaultRules`, `Classify`, `classifyWithRules`, helper `ruleAppliesToAgent` -- `internal/agentlimit/parse.go` — `ParseResetDuration`, `ParseResetTime`, range-clamp helpers -- `internal/agentlimit/agentlimit_test.go` — table tests for `Classify` (nine production patterns + negatives), test for `classifyWithRules` with synthetic `KindSession` rule -- `internal/agentlimit/parse_test.go` — table tests for both parsers - -**Modify:** -- `internal/daemon/worker.go` — add `classify` field on `WorkerPool`; replace `isQuotaError(...)` and `parseQuotaCooldown(...)` call sites at lines 792-796 with `wp.classify(...)`; add unmatched-error WARN log; delete the now-unused `isQuotaError` and `parseQuotaCooldown` helpers (and their tests in `worker_test.go`) -- `internal/daemon/worker_test.go` — drop `TestIsQuotaError`/`TestParseQuotaCooldown` (covered in agentlimit package); add `TestFailOrRetryInner_SessionLimitCoolsDownAndFailsOver` driven by injected classifier; add `TestFailOrRetryInner_QuotaPatternRegression` covering all nine substrings end-to-end -- `cmd/roborev/fix.go` — add `classify agentlimit.ClassifyFunc` and `nowFn func() time.Time` fields on `fixOptions` (defaults wired in `NewFixCmd`); add `agentLimitError` type with `Error()` formatter; in `fixSingleJob` and `runFixBatch`, after each `fixJobDirect` error, classify and either return the abort error (KindQuota/KindSession) or log a WARN (KindNone with non-empty error) before the existing per-job handling -- `cmd/roborev/fix_test.go` — add `TestFixSingleJobAbortsOnSessionLimit` and `TestRunFixBatchAbortsOnQuotaError` using a stub classifier injected via `fixOptions` - -**Note on the daemon-vs-foreground distinction (load-bearing):** the daemon worker keeps its existing behavior for Gemini/Codex (cooldown → immediate failover-or-fail, no retries). The foreground fix loop is where the new abort behavior lives — it applies to KindQuota too, not just future KindSession, and that change is intentional (spec migration section). - ---- - -## Task 1: Create `agentlimit` package skeleton with types - -**Files:** -- Create: `internal/agentlimit/agentlimit.go` -- Test: `internal/agentlimit/agentlimit_test.go` - -- [ ] **Step 1: Write failing test for type round-trip** - -```go -// internal/agentlimit/agentlimit_test.go -package agentlimit - -import ( - "testing" - "time" - - "github.com/stretchr/testify/assert" -) - -func TestClassificationZeroValue(t *testing.T) { - var c Classification - assert := assert.New(t) - assert.Equal(KindNone, c.Kind) - assert.Equal("", c.Agent) - assert.True(c.ResetAt.IsZero()) - assert.Equal(time.Duration(0), c.CooldownFor) - assert.Equal("", c.Message) -} -``` - -- [ ] **Step 2: Run test, confirm compile failure** - -Run: `go test ./internal/agentlimit/ -run TestClassificationZeroValue` -Expected: FAIL — `package agentlimit` does not exist. - -- [ ] **Step 3: Create the package with types** - -```go -// internal/agentlimit/agentlimit.go -// Package agentlimit classifies agent CLI error messages as rate-limit, -// quota, or session-cap signals so the daemon worker and the foreground -// roborev fix loop can react consistently. Pure logic — no I/O. -package agentlimit - -import "time" - -// Kind labels a classified agent error. -type Kind int - -const ( - KindNone Kind = iota // no rate-limit signal recognized - KindTransient // 429-style; retry locally, no cooldown - KindQuota // hard quota exhaustion (Gemini/Codex today) - KindSession // session-level cap (e.g. Claude 5-hour) -) - -// Classification is the result of inspecting an agent error. -type Classification struct { - Kind Kind - Agent string // canonical agent name (caller resolves aliases) - ResetAt time.Time // zero if not parseable from the message - CooldownFor time.Duration // zero if not parseable; caller applies its own fallback - Message string // raw error text (for logs / user display) -} - -// ClassifyFunc is the function shape used by callers that want to inject -// a stub in tests. -type ClassifyFunc func(agent, errMsg string) Classification -``` - -- [ ] **Step 4: Run test, confirm pass** - -Run: `go test ./internal/agentlimit/ -run TestClassificationZeroValue -v` -Expected: PASS. - -- [ ] **Step 5: Commit** - -```bash -git add internal/agentlimit/agentlimit.go internal/agentlimit/agentlimit_test.go -git commit -m "feat(agentlimit): create package with Kind and Classification types" -``` - ---- - -## Task 2: Implement `ParseResetDuration` - -**Files:** -- Create: `internal/agentlimit/parse.go` -- Test: `internal/agentlimit/parse_test.go` - -- [ ] **Step 1: Write failing tests for relative-duration parsing** - -```go -// internal/agentlimit/parse_test.go -package agentlimit - -import ( - "testing" - "time" - - "github.com/stretchr/testify/assert" -) - -func TestParseResetDuration(t *testing.T) { - cases := []struct { - name string - msg string - want time.Duration - }{ - {"none", "agent failed", 0}, - {"reset after seconds clamped to min", "quota will reset after 5s.", 1 * time.Minute}, - {"reset after minutes", "quota will reset after 48m20s.", 48*time.Minute + 20*time.Second}, - {"reset after hours", "reset after 2h13m.", 2*time.Hour + 13*time.Minute}, - {"reset after with trailing punct", "reset after 1h.. retrying", 1 * time.Hour}, - {"reset after invalid", "reset after notaduration", 0}, - {"reset after negative", "reset after -5m", 0}, - {"reset after huge clamped to max", "reset after 100h", 24 * time.Hour}, - {"case insensitive", "RESET AFTER 30m", 30 * time.Minute}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - assert.Equal(t, tc.want, ParseResetDuration(tc.msg)) - }) - } -} -``` - -- [ ] **Step 2: Run tests, confirm compile failure** - -Run: `go test ./internal/agentlimit/ -run TestParseResetDuration` -Expected: FAIL — `ParseResetDuration` undefined. - -- [ ] **Step 3: Implement `ParseResetDuration`** - -This lifts the logic from `internal/daemon/worker.go:1015` (`parseQuotaCooldown`), with the difference that it returns `0` instead of a caller-supplied fallback when nothing parses. - -```go -// internal/agentlimit/parse.go -package agentlimit - -import ( - "strings" - "time" -) - -// minCooldown / maxCooldown clamp parsed durations to a sane range so a -// pathological message ("reset after 1ms" or "reset after 100h") does not -// produce a useless cooldown. -const ( - minCooldown = 1 * time.Minute - maxCooldown = 24 * time.Hour -) - -// ParseResetDuration extracts a Go-format duration from a "reset after -// " substring in errMsg (case-insensitive). Returns 0 if no such -// substring is present or the duration is unparseable. Clamps positive -// values to [minCooldown, maxCooldown]. -func ParseResetDuration(errMsg string) time.Duration { - lower := strings.ToLower(errMsg) - idx := strings.Index(lower, "reset after ") - if idx < 0 { - return 0 - } - rest := errMsg[idx+len("reset after "):] - token := rest - if sp := strings.IndexAny(rest, " \t\n,;)"); sp > 0 { - token = rest[:sp] - } - token = strings.TrimRight(token, ".,;:)]}") - d, err := time.ParseDuration(token) - if err != nil || d <= 0 { - return 0 - } - if d < minCooldown { - return minCooldown - } - if d > maxCooldown { - return maxCooldown - } - return d -} -``` - -- [ ] **Step 4: Run tests, confirm pass** - -Run: `go test ./internal/agentlimit/ -run TestParseResetDuration -v` -Expected: PASS for all nine cases. - -- [ ] **Step 5: Commit** - -```bash -git add internal/agentlimit/parse.go internal/agentlimit/parse_test.go -git commit -m "feat(agentlimit): port reset-duration parsing from worker.parseQuotaCooldown" -``` - ---- - -## Task 3: Implement `ParseResetTime` - -**Files:** -- Modify: `internal/agentlimit/parse.go` -- Modify: `internal/agentlimit/parse_test.go` - -This adds an absolute-time parser for messages that say "resets at 5:42 PM" / "try again at 17:42 UTC". No production rule uses it day 1 — Gemini and Codex emit relative durations — but the framework supports it so adding a Claude rule later is a one-line table change without parser work. - -- [ ] **Step 1: Write failing tests for absolute-time parsing** - -```go -// internal/agentlimit/parse_test.go — append below TestParseResetDuration -func TestParseResetTime(t *testing.T) { - // Use a fixed "now" so same-day vs next-day rollover is deterministic. - loc := time.FixedZone("test", 0) - now := time.Date(2026, 5, 5, 12, 0, 0, 0, loc) // noon UTC - cases := []struct { - name string - msg string - wantErr bool - want time.Time - }{ - {"none", "agent failed", true, time.Time{}}, - { - name: "resets at later today", - msg: "limit resets at 5:42 PM", - want: time.Date(2026, 5, 5, 17, 42, 0, 0, loc), - }, - { - name: "try again at later today 24h", - msg: "try again at 17:42", - want: time.Date(2026, 5, 5, 17, 42, 0, 0, loc), - }, - { - name: "resets at earlier today rolls to next day", - msg: "limit resets at 9:00 AM", - want: time.Date(2026, 5, 6, 9, 0, 0, 0, loc), - }, - { - name: "case insensitive", - msg: "LIMIT RESETS AT 6:00 pm", - want: time.Date(2026, 5, 5, 18, 0, 0, 0, loc), - }, - {"unparseable token", "resets at moonrise", true, time.Time{}}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - got := parseResetTimeAt(tc.msg, now) - if tc.wantErr { - assert.True(t, got.IsZero(), "expected zero time, got %v", got) - return - } - assert.Equal(t, tc.want, got) - }) - } -} -``` - -- [ ] **Step 2: Run tests, confirm compile failure** - -Run: `go test ./internal/agentlimit/ -run TestParseResetTime` -Expected: FAIL — `parseResetTimeAt` undefined. - -- [ ] **Step 3: Implement `ParseResetTime` and `parseResetTimeAt`** - -Append to `internal/agentlimit/parse.go`: - -```go -// ParseResetTime extracts an absolute reset time from messages like -// "resets at 5:42 PM" or "try again at 17:42". Interprets the parsed -// clock time in the local timezone. Returns the zero time.Time if no -// recognized phrase is present or the time is unparseable. -// -// If the parsed clock time is earlier than now-on-the-same-day, the -// returned time rolls forward by 24 hours so callers that compute -// "time until reset" never get a negative duration. -func ParseResetTime(errMsg string) time.Time { - return parseResetTimeAt(errMsg, time.Now()) -} - -// parseResetTimeAt is ParseResetTime with an injectable clock for tests. -func parseResetTimeAt(errMsg string, now time.Time) time.Time { - lower := strings.ToLower(errMsg) - var idx int - switch { - case strings.Contains(lower, "resets at "): - idx = strings.Index(lower, "resets at ") + len("resets at ") - case strings.Contains(lower, "try again at "): - idx = strings.Index(lower, "try again at ") + len("try again at ") - default: - return time.Time{} - } - // Consume up to the next non-time character. Allow digits, ':', and - // AM/PM markers (with optional spaces before them). - rest := errMsg[idx:] - end := len(rest) - for i, r := range rest { - // Stop at sentence-ending punctuation or newline. - if r == '.' || r == ',' || r == ';' || r == '\n' || r == ')' { - end = i - break - } - } - token := strings.TrimSpace(rest[:end]) - - formats := []string{ - "3:04 PM", "3:04 pm", - "3:04PM", "3:04pm", - "15:04", - } - for _, f := range formats { - if t, err := time.Parse(f, token); err == nil { - candidate := time.Date( - now.Year(), now.Month(), now.Day(), - t.Hour(), t.Minute(), 0, 0, now.Location(), - ) - if candidate.Before(now) { - candidate = candidate.Add(24 * time.Hour) - } - return candidate - } - } - return time.Time{} -} -``` - -- [ ] **Step 4: Run tests, confirm pass** - -Run: `go test ./internal/agentlimit/ -run TestParseResetTime -v` -Expected: PASS for all six cases. - -- [ ] **Step 5: Commit** - -```bash -git add internal/agentlimit/parse.go internal/agentlimit/parse_test.go -git commit -m "feat(agentlimit): add absolute reset-time parser with same-day rollover" -``` - ---- - -## Task 4: Implement `Classify` and `classifyWithRules` with the nine production patterns - -**Files:** -- Modify: `internal/agentlimit/agentlimit.go` -- Modify: `internal/agentlimit/agentlimit_test.go` - -- [ ] **Step 1: Write failing tests for production patterns + negatives** - -Append to `internal/agentlimit/agentlimit_test.go`: - -```go -func TestClassifyProductionPatterns(t *testing.T) { - // All nine substrings from the original isQuotaError set must - // produce KindQuota. This is the byte-for-byte regression test - // for current Gemini and Codex detection. - patterns := []string{ - "resource exhausted", - "quota exceeded", - "quota_exceeded", - "quota exhausted", - "quota_exhausted", - "insufficient_quota", - "exhausted your capacity", - "capacity exhausted", - "capacity_exhausted", - } - for _, p := range patterns { - t.Run(p, func(t *testing.T) { - cls := Classify("gemini", "agent failed: "+p+", retrying...") - assert := assert.New(t) - assert.Equal(KindQuota, cls.Kind, "expected KindQuota for %q", p) - assert.Equal("gemini", cls.Agent) - assert.Contains(cls.Message, p) - }) - } -} - -func TestClassifyExtractsCooldownDuration(t *testing.T) { - cls := Classify( - "gemini", - "You have exhausted your capacity on this model. Your quota will reset after 48m20s.", - ) - assert := assert.New(t) - assert.Equal(KindQuota, cls.Kind) - assert.Equal(48*time.Minute+20*time.Second, cls.CooldownFor) - assert.True(cls.ResetAt.IsZero(), "no absolute time in this message") -} - -func TestClassifyNegativeCases(t *testing.T) { - cases := []struct { - name string - msg string - }{ - {"empty", ""}, - {"unrelated error", "exit status 1: file not found"}, - {"benign mention of limit", "limit set to 100 in config"}, - {"benign rate limit (transient, no rule produces it day 1)", "429 rate limit, retrying"}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - cls := Classify("gemini", tc.msg) - assert.Equal(t, KindNone, cls.Kind, "expected KindNone for %q", tc.msg) - }) - } -} - -func TestClassifyWithRulesIsolatesSyntheticPattern(t *testing.T) { - // Synthetic rule used only inside this test — does not pollute defaultRules. - syntheticRules := []rule{ - {Agents: []string{"*"}, Substring: "test-claude session limit", Kind: KindSession}, - } - cls := classifyWithRules( - "claude-code", - "5-hour test-claude session limit reached", - syntheticRules, - ) - assert := assert.New(t) - assert.Equal(KindSession, cls.Kind) - assert.Equal("claude-code", cls.Agent) - - // Same message via the production Classify must not match — - // the synthetic rule is not in defaultRules. - cls2 := Classify("claude-code", "5-hour test-claude session limit reached") - assert.Equal(KindNone, cls2.Kind, "synthetic rule must not leak into defaultRules") -} -``` - -- [ ] **Step 2: Run tests, confirm compile failure** - -Run: `go test ./internal/agentlimit/` -Expected: FAIL — `Classify`, `classifyWithRules`, `rule` undefined. - -- [ ] **Step 3: Implement `Classify`, `classifyWithRules`, and `defaultRules`** - -Append to `internal/agentlimit/agentlimit.go`: - -```go -import "strings" - -// rule is one substring → kind mapping. The Agents slice restricts the -// rule to specific canonical agent names; "*" applies to any agent. -type rule struct { - Agents []string // canonical agent names; "*" = any - Substring string // case-insensitive substring match on the error message - Kind Kind -} - -// defaultRules is the production rule table. The nine quota substrings -// are copied from the original isQuotaError set in -// internal/daemon/worker.go so detection for Gemini and Codex is -// byte-for-byte unchanged. -// -// Claude session-cap patterns are intentionally absent — see the design -// doc's "Detection without a captured Claude message" section. Once a -// real session-cap message is captured, a single KindSession rule plus -// a unit-test fixture is sufficient to enable Claude detection. -var defaultRules = []rule{ - {Agents: []string{"*"}, Substring: "resource exhausted", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "quota exceeded", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "quota_exceeded", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "quota exhausted", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "quota_exhausted", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "insufficient_quota", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "exhausted your capacity", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "capacity exhausted", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "capacity_exhausted", Kind: KindQuota}, -} - -// Classify inspects an agent error message and returns a Classification -// describing whether (and how) the agent is rate-limited. The agent -// argument is the canonical agent name; the caller is responsible for -// resolving any aliases (e.g. "claude" → "claude-code") before calling. -// -// Returns Kind == KindNone when no rule matches. -func Classify(agent, errMsg string) Classification { - return classifyWithRules(agent, errMsg, defaultRules) -} - -// classifyWithRules is Classify with an explicit rule slice. Unexported; -// used inside the package's own tests so synthetic fixtures (e.g. a -// KindSession pattern) do not leak into defaultRules. -func classifyWithRules(agent, errMsg string, rules []rule) Classification { - if errMsg == "" { - return Classification{Kind: KindNone, Agent: agent, Message: errMsg} - } - lower := strings.ToLower(errMsg) - for _, r := range rules { - if !ruleAppliesToAgent(r, agent) { - continue - } - if !strings.Contains(lower, r.Substring) { - continue - } - return Classification{ - Kind: r.Kind, - Agent: agent, - ResetAt: ParseResetTime(errMsg), - CooldownFor: ParseResetDuration(errMsg), - Message: errMsg, - } - } - return Classification{Kind: KindNone, Agent: agent, Message: errMsg} -} - -func ruleAppliesToAgent(r rule, agent string) bool { - for _, a := range r.Agents { - if a == "*" || a == agent { - return true - } - } - return false -} -``` - -- [ ] **Step 4: Run all tests in the package, confirm pass** - -Run: `go test ./internal/agentlimit/ -v` -Expected: PASS for all four test functions. - -- [ ] **Step 5: Run full unit suite to confirm no regression** - -Run: `go test ./...` -Expected: PASS (or only pre-existing failures unrelated to this change). - -- [ ] **Step 6: Commit** - -```bash -git add internal/agentlimit/agentlimit.go internal/agentlimit/agentlimit_test.go -git commit -m "feat(agentlimit): add Classify with nine production quota patterns" -``` - ---- - -## Task 5: Replace `isQuotaError` + `parseQuotaCooldown` in the daemon worker - -**Files:** -- Modify: `internal/daemon/worker.go` -- Modify: `internal/daemon/worker_test.go` - -The behavior change here is *zero* for any currently-supported agent. The plan is to add an injectable `classify` function on `WorkerPool`, route the existing call site through it, then delete the now-dead helpers and their unit tests. - -- [ ] **Step 1: Add `classify` field on `WorkerPool`** - -Modify `internal/daemon/worker.go` around the existing fields (the cooldowns block, line 46-48): - -```go -// Add to imports: -// "github.com/roborev-dev/roborev/internal/agentlimit" - -// Add to the WorkerPool struct (alongside agentCooldowns): - // classify is the rate-limit/quota classifier. Defaults to - // agentlimit.Classify; tests substitute a stub via NewWorkerPool's - // caller setting it after construction (test-only access). - classify agentlimit.ClassifyFunc -``` - -In `NewWorkerPool` (line 59-74), set the default: - -```go -return &WorkerPool{ - db: db, - cfgGetter: cfgGetter, - broadcaster: broadcaster, - errorLog: errorLog, - activityLog: activityLog, - numWorkers: numWorkers, - stopCh: make(chan struct{}), - readyCh: make(chan struct{}), - runningJobs: make(map[int64]context.CancelFunc), - pendingCancels: make(map[int64]bool), - agentCooldowns: make(map[string]time.Time), - outputBuffers: NewOutputBuffer(512*1024, 4*1024*1024), - classify: agentlimit.Classify, -} -``` - -- [ ] **Step 2: Replace the call site in `failOrRetryInner`** - -Modify `internal/daemon/worker.go:789-803` (the `failOrRetryInner` function's quota and context-window branches). Replace the existing quota-detection block: - -Old: - -```go - // Quota errors skip retries entirely — cool down the agent and - // attempt failover or fail with a quota-prefixed error. - if agentError && isQuotaError(errorMsg) { - dur := parseQuotaCooldown(errorMsg, defaultCooldown) - wp.cooldownAgent(agentName, time.Now().Add(dur)) - log.Printf("[%s] Agent %s quota exhausted, cooldown %v", - workerID, agentName, dur) - wp.failoverOrFail(workerID, job, agentName, errorMsg) - return - } -``` - -New: - -```go - // Quota and session-limit errors skip retries entirely — cool down - // the agent and attempt failover or fail. Behavior matches the - // original isQuotaError branch; classification now lives in - // internal/agentlimit so the CLI fix loop can share it. - if agentError { - cls := wp.classify(agent.CanonicalName(agentName), errorMsg) - if cls.Kind == agentlimit.KindQuota || cls.Kind == agentlimit.KindSession { - dur := defaultCooldown - if cls.CooldownFor > 0 { - dur = cls.CooldownFor - } - if !cls.ResetAt.IsZero() { - if until := time.Until(cls.ResetAt); until > 0 { - dur = until - } - } - wp.cooldownAgent(agentName, time.Now().Add(dur)) - log.Printf("[%s] Agent %s quota exhausted, cooldown %v", - workerID, agentName, dur) - wp.failoverOrFail(workerID, job, agentName, errorMsg) - return - } - } -``` - -The `isContextWindowError` branch immediately below stays unchanged — that's a separate concern. - -- [ ] **Step 3: Delete the now-unused helpers** - -Remove from `internal/daemon/worker.go`: -- The entire `isQuotaError` function (lines 971-993) -- The entire `parseQuotaCooldown` function (lines 1013-1039) - -The constants `defaultCooldown`, `minCooldown`, and `maxCooldown` stay — `defaultCooldown` is still used by the new code at the call site, and `min`/`maxCooldown` were duplicated into `internal/agentlimit/parse.go` so the daemon-side ones are now unused only in `parseQuotaCooldown`. Since `parseQuotaCooldown` is gone, also remove `minCooldown` and `maxCooldown` from `worker.go` if they have no other references. - -Verify by grep: - -```bash -rg -n "minCooldown|maxCooldown" internal/daemon/ -``` - -Expected: zero matches in `internal/daemon/`. If matches remain, leave the constants in place. - -- [ ] **Step 4: Update existing unit tests in `worker_test.go`** - -Delete the unit-level `TestIsQuotaError` (around line 946) and `TestParseQuotaCooldown` (around line 1014) blocks entirely — the same coverage is now in `internal/agentlimit/agentlimit_test.go` and `internal/agentlimit/parse_test.go`. - -The integration-level tests `TestFailOrRetryInner_QuotaSkipsRetries`, `TestFailOrRetryInner_QuotaExhaustedVariant`, `TestFailOrRetryInner_NonQuotaStillRetries` (around lines 1190-1271) stay — they exercise the quota-skip-retries path, which is the exact behavior we are preserving. They will continue to pass via the default `agentlimit.Classify`. - -- [ ] **Step 5: Run the daemon test suite, confirm pass** - -Run: `go test ./internal/daemon/ -run "TestFailOrRetry" -v` -Expected: PASS — all `TestFailOrRetryInner_*` tests still pass via the default classifier. - -Run: `go test ./internal/daemon/` -Expected: PASS overall (no regressions). - -- [ ] **Step 6: Format and commit** - -Run: -```bash -go fmt ./... -go vet ./... -``` - -Expected: clean. Then: - -```bash -git add internal/daemon/worker.go internal/daemon/worker_test.go -git commit -m "refactor(daemon): route quota detection through internal/agentlimit" -``` - ---- - -## Task 6: Add a daemon-side test that exercises the injected classifier for KindSession - -**Files:** -- Modify: `internal/daemon/worker_test.go` - -This proves the `classify` injection works end-to-end against the existing worker test infrastructure. Without it, day 1 we have no positive coverage of the `KindSession` branch on the daemon side. - -- [ ] **Step 1: Write failing test using the existing `workerTestContext` helper** - -Append to `internal/daemon/worker_test.go`: - -```go -func TestFailOrRetryInner_SessionLimitCoolsDownAndFailsOver(t *testing.T) { - tc := newWorkerTestContext(t, 1) - job := tc.createAndClaimJob() - - // Stub classifier: any error message containing the marker yields - // KindSession with a 1-hour CooldownFor. - tc.wp.classify = func(agent, msg string) agentlimit.Classification { - if strings.Contains(msg, "MARKER-SESSION-LIMIT") { - return agentlimit.Classification{ - Kind: agentlimit.KindSession, - Agent: agent, - CooldownFor: 1 * time.Hour, - Message: msg, - } - } - return agentlimit.Classification{Kind: agentlimit.KindNone, Agent: agent, Message: msg} - } - - tc.wp.failOrRetryAgent("worker-0", job, "test", "boom MARKER-SESSION-LIMIT") - - assert := assert.New(t) - assert.True(tc.wp.isAgentCoolingDown("test"), "agent should be in cooldown") - - // retry_count should not have advanced — quota/session errors skip retries. - got, err := tc.wp.db.GetJobRetryCount(job.ID) - require.NoError(t, err) - assert.Equal(0, got, "session-limit error must not consume a retry slot") -} -``` - -Add the imports at the top of `worker_test.go` if not already present: - -```go -import ( - // ...existing... - "strings" - "time" - - "github.com/roborev-dev/roborev/internal/agentlimit" -) -``` - -- [ ] **Step 2: Run test, confirm pass on first try (the wiring from Task 5 already supports this)** - -Run: `go test ./internal/daemon/ -run TestFailOrRetryInner_SessionLimitCoolsDownAndFailsOver -v` -Expected: PASS. - -If FAIL with "agent should be in cooldown": the classifier injection from Task 5 is not being honored; revisit `failOrRetryInner` to confirm `wp.classify(...)` is the call. - -- [ ] **Step 3: Commit** - -```bash -git add internal/daemon/worker_test.go -git commit -m "test(daemon): cover session-limit cooldown via injected classifier" -``` - ---- - -## Task 7: Add unmatched-error WARN log on the daemon side - -**Files:** -- Modify: `internal/daemon/worker.go` -- Modify: `internal/daemon/worker_test.go` - -When the classifier returns `KindNone` for an agent error, log a single WARN line so the next time a Claude (or any other) session-cap fires we have a captured sample to add a rule for. - -- [ ] **Step 1: Write failing test** - -Append to `internal/daemon/worker_test.go`: - -```go -func TestFailOrRetryInner_UnmatchedAgentErrorLogsWarn(t *testing.T) { - tc := newWorkerTestContext(t, 1) - - // Capture log output by swapping the standard logger's writer. - var buf bytes.Buffer - origOutput := log.Writer() - log.SetOutput(&buf) - t.Cleanup(func() { log.SetOutput(origOutput) }) - - job := tc.createAndClaimJob() - tc.wp.classify = func(agent, msg string) agentlimit.Classification { - return agentlimit.Classification{Kind: agentlimit.KindNone, Agent: agent, Message: msg} - } - - tc.wp.failOrRetryAgent("worker-0", job, "test", "some brand new error wording from a future agent") - - logged := buf.String() - assert := assert.New(t) - assert.Contains(logged, "unclassified agent error", "expected WARN line for unmatched error") - assert.Contains(logged, "test", "log line should include agent name") - assert.Contains(logged, "some brand new error wording", "log line should include error preview") -} -``` - -Add imports if not already present: - -```go - "bytes" - "log" -``` - -- [ ] **Step 2: Run test, confirm failure** - -Run: `go test ./internal/daemon/ -run TestFailOrRetryInner_UnmatchedAgentErrorLogsWarn -v` -Expected: FAIL — log output does not contain "unclassified agent error". - -- [ ] **Step 3: Refactor `failOrRetryInner` to classify once and add the WARN log** - -Replace the agent-error block from Task 5 with a single switch that -covers all classifier kinds. This avoids double-classifying and adds -the `KindNone` WARN log in the same site. `truncateRunes` already lives -at `internal/daemon/classifier.go:108`; flatten newlines inline so the -log line stays on one row, then call the existing helper. - -```go -func (wp *WorkerPool) failOrRetryInner(workerID string, job *storage.ReviewJob, agentName string, errorMsg string, agentError bool) { - if agentError { - cls := wp.classify(agent.CanonicalName(agentName), errorMsg) - switch cls.Kind { - case agentlimit.KindQuota, agentlimit.KindSession: - dur := defaultCooldown - if cls.CooldownFor > 0 { - dur = cls.CooldownFor - } - if !cls.ResetAt.IsZero() { - if until := time.Until(cls.ResetAt); until > 0 { - dur = until - } - } - wp.cooldownAgent(agentName, time.Now().Add(dur)) - log.Printf("[%s] Agent %s quota exhausted, cooldown %v", - workerID, agentName, dur) - wp.failoverOrFail(workerID, job, agentName, errorMsg) - return - case agentlimit.KindNone: - if errorMsg != "" { - preview := strings.ReplaceAll(errorMsg, "\n", " ") - preview = strings.ReplaceAll(preview, "\r", "") - log.Printf("[%s] unclassified agent error from %s: %s", - workerID, agentName, truncateRunes(preview, 200)) - } - // fall through to context-window / retry handling - case agentlimit.KindTransient: - // fall through to retry handling - } - } - if agentError && isContextWindowError(errorMsg) { - wp.failoverOrFailNonRetryableAgent(workerID, job, agentName, errorMsg) - return - } - // ...existing retry path unchanged from Task 5... -} -``` - -The test from Task 6 returns `KindSession` so it hits the cooldown -branch; the new test from this task returns `KindNone` and hits the -WARN-log branch. - -- [ ] **Step 4: Run the new test and the regression suite** - -Run: `go test ./internal/daemon/ -run TestFailOrRetryInner -v` -Expected: PASS for all `TestFailOrRetryInner_*` tests, including the new `_UnmatchedAgentErrorLogsWarn`. - -- [ ] **Step 5: Format and commit** - -```bash -go fmt ./... -go vet ./... -git add internal/daemon/worker.go internal/daemon/worker_test.go -git commit -m "feat(daemon): log WARN for unclassified agent errors" -``` - ---- - -## Task 8: Add classifier hook and abort-error type to the CLI fix loop - -**Files:** -- Modify: `cmd/roborev/fix.go` - -This task only adds plumbing. The next two tasks wire it into the actual loops with tests. - -- [ ] **Step 1: Extend `fixOptions` and add the abort-error type** - -Modify `cmd/roborev/fix.go` near the existing `fixOptions` struct (line 224): - -```go -import "github.com/roborev-dev/roborev/internal/agentlimit" - -type fixOptions struct { - agentName string - model string - reasoning string - minSeverity string - quiet bool - resume bool - - // classify is the rate-limit classifier. Defaults to - // agentlimit.Classify in NewFixCmd; tests inject a stub to drive - // deterministic KindQuota / KindSession outcomes without depending - // on real agent error wording. - classify agentlimit.ClassifyFunc -} - -// agentLimitError is returned by the fix loop when the configured agent -// hits a quota or session limit. The fix command surfaces it as the -// process exit error so users see the reset time and a hint to retry. -type agentLimitError struct { - Classification agentlimit.Classification -} - -func (e *agentLimitError) Error() string { - return formatAgentLimitMessage(e.Classification, time.Now()) -} - -// formatAgentLimitMessage builds the user-facing abort message. Pulled -// out so tests can assert against it without depending on time.Now. -// The label ("quota" / "session limit" / "rate limit") is derived from -// cls.Kind so a Gemini/Codex KindQuota abort doesn't mis-report itself -// as a session-cap. -func formatAgentLimitMessage(cls agentlimit.Classification, now time.Time) string { - label := agentLimitLabel(cls.Kind) - var dur time.Duration - switch { - case !cls.ResetAt.IsZero(): - dur = cls.ResetAt.Sub(now) - case cls.CooldownFor > 0: - dur = cls.CooldownFor - } - switch { - case dur > 0 && !cls.ResetAt.IsZero(): - return fmt.Sprintf( - "agent %s hit a %s. Cooldown until %s (in %s). "+ - "Re-run after that, or pass --agent to switch.", - cls.Agent, - label, - cls.ResetAt.Format("3:04 PM"), - dur.Round(time.Minute), - ) - case dur > 0: - return fmt.Sprintf( - "agent %s hit a %s. Cooldown for ~%s. "+ - "Re-run after that, or pass --agent to switch.", - cls.Agent, - label, - dur.Round(time.Minute), - ) - default: - flat := strings.ReplaceAll(cls.Message, "\n", " ") - return fmt.Sprintf( - "agent %s hit a %s (unknown reset time). "+ - "Re-run later, or pass --agent to switch. "+ - "Original error: %s", - cls.Agent, - label, - truncateString(flat, 200), - ) - } -} - -func agentLimitLabel(k agentlimit.Kind) string { - switch k { - case agentlimit.KindSession: - return "session limit" - case agentlimit.KindQuota: - return "quota limit" - case agentlimit.KindTransient: - return "rate limit" - default: - return "rate limit" - } -} -``` - -`truncateString` already lives at `cmd/roborev/fix.go:796` — reuse it. -It does not flatten newlines, so callers that need a single-line preview -(this one and the WARN log paths in Tasks 9-10) flatten with -`strings.ReplaceAll(s, "\n", " ")` before calling. - -- [ ] **Step 2: Default the classifier where `fixOptions` is built** - -There is a single construction site for `fixOptions` in the production -flow, at `cmd/roborev/fix.go:137` (inside the `fix` cobra command's -`RunE`). Add `classify: agentlimit.Classify` to the struct literal: - -```go -opts := fixOptions{ - agentName: agentName, - model: model, - reasoning: reasoning, - minSeverity: minSeverity, - quiet: quiet, - resume: resume, - classify: agentlimit.Classify, -} -``` - -Tests that build `fixOptions` directly are responsible for setting -`classify` (or leaving it nil and relying on the test to never reach -an error path). To make accidental nil dereferences obvious, add a -single nil-guard at the top of `fixSingleJob` and `runFixBatch`: - -```go -if opts.classify == nil { - opts.classify = agentlimit.Classify -} -``` - -This mirrors how `streamfmt` and other defaults are handled elsewhere in -the file and keeps behavior identical when callers do not set -`classify`. - -- [ ] **Step 3: Format and confirm package builds** - -```bash -go fmt ./... -go vet ./... -go build ./... -``` - -Expected: clean build (no usage of `agentLimitError` yet — only declarations). - -- [ ] **Step 4: Commit** - -```bash -git add cmd/roborev/fix.go -git commit -m "feat(fix): add classifier hook and agentLimitError plumbing" -``` - ---- - -## Task 9: Wire abort path into `fixSingleJob` - -**Files:** -- Modify: `cmd/roborev/fix.go` -- Modify: `cmd/roborev/fix_test.go` - -- [ ] **Step 1: Write failing test** - -Append to `cmd/roborev/fix_test.go`: - -This follows the `agent.FakeAgent` registration pattern used by -`TestFixSingleJobRecoversPostFixDaemonCalls` (`cmd/roborev/fix_test.go:565`) -to make the `test` agent return a synthetic error, and the `newMockServer` -+ `patchServerAddr` pattern used by `TestFixSingleJob` (line 529) for -daemon plumbing. - -```go -func TestFixSingleJobAbortsOnSessionLimit(t *testing.T) { - repo := createTestRepo(t, map[string]string{"main.go": "package main\n"}) - - originalAgent, err := agent.Get("test") - require.NoError(t, err, "get test agent") - agent.Register(&agent.FakeAgent{ - NameStr: "test", - ReviewFn: func(_ context.Context, _, _, _ string, _ io.Writer) (string, error) { - return "", errors.New("simulated claude session cap") - }, - }) - t.Cleanup(func() { agent.Register(originalAgent) }) - - ts, _ := newMockServer(t, MockServerOpts{ - ReviewOutput: "## Issues\n- Found issue", - OnJobs: func(w http.ResponseWriter, r *http.Request) { - writeJSON(w, map[string]any{ - "jobs": []storage.ReviewJob{{ - ID: 99, - Status: storage.JobStatusDone, - Agent: "test", - }}, - }) - }, - }) - patchServerAddr(t, ts.URL) - - cmd, _ := newTestCmd(t) - - resetAt := time.Now().Add(2*time.Hour + 13*time.Minute) - classifyCalls := 0 - opts := fixOptions{ - agentName: "test", - reasoning: "fast", - classify: func(_, msg string) agentlimit.Classification { - classifyCalls++ - return agentlimit.Classification{ - Kind: agentlimit.KindSession, - Agent: "claude-code", - ResetAt: resetAt, - Message: msg, - } - }, - } - - tracker := &fixSessionTracker{base: agent.NewTestAgent(), out: io.Discard} - err = fixSingleJob(cmd, repo.Dir, 99, opts, tracker) - require.Error(t, err, "expected abort error, got nil") - - var lim *agentLimitError - require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) - - assert := assert.New(t) - assert.Equal(agentlimit.KindSession, lim.Classification.Kind) - assert.Equal(1, classifyCalls, "classifier should be called exactly once") - assert.Contains(err.Error(), "claude-code") - assert.Contains(err.Error(), "session limit") -} -``` - -Add any missing imports to the top of `fix_test.go` (most are already -present — `errors` and `agentlimit` are likely the only new ones): - -```go -import ( - "errors" - - "github.com/roborev-dev/roborev/internal/agentlimit" -) -``` - -- [ ] **Step 2: Run test, confirm failure** - -Run: `go test ./cmd/roborev/ -run TestFixSingleJobAbortsOnSessionLimit -v` -Expected: FAIL — either compile error (no abort path yet) or assertion failure. - -- [ ] **Step 3: Add the abort branch in `fixSingleJob`** - -Modify `cmd/roborev/fix.go` — find the `result, err := fixJobDirect(...)` call inside `fixSingleJob` (around line 926). Replace the existing error block: - -Old: - -```go - if err != nil { - tracker.Reset() - return err - } -``` - -New: - -```go - if err != nil { - tracker.Reset() - cls := opts.classify(agent.CanonicalName(currentAgent.Name()), err.Error()) - switch cls.Kind { - case agentlimit.KindQuota, agentlimit.KindSession: - return &agentLimitError{Classification: cls} - case agentlimit.KindNone: - if err.Error() != "" && !opts.quiet { - flat := strings.ReplaceAll(err.Error(), "\n", " ") - cmd.PrintErrf( - "warning: unclassified agent error from %s: %s\n", - currentAgent.Name(), - truncateString(flat, 200), - ) - } - } - return err - } -``` - -The `KindTransient` case falls through to the default `return err` — same as today. The `KindNone` warn-print is the CLI mirror of the daemon-side WARN log. - -- [ ] **Step 4: Run test, confirm pass** - -Run: `go test ./cmd/roborev/ -run TestFixSingleJobAbortsOnSessionLimit -v` -Expected: PASS. - -- [ ] **Step 5: Run the broader fix suite to confirm no regression** - -Run: `go test ./cmd/roborev/ -run TestFix -v` -Expected: PASS for all `TestFix*` tests. - -- [ ] **Step 6: Commit** - -```bash -go fmt ./... -go vet ./... -git add cmd/roborev/fix.go cmd/roborev/fix_test.go -git commit -m "feat(fix): abort fixSingleJob on KindQuota or KindSession" -``` - ---- - -## Task 10: Wire abort path into `runFixBatch` - -**Files:** -- Modify: `cmd/roborev/fix.go` -- Modify: `cmd/roborev/fix_test.go` - -The batch loop is the long-running shape that originally surfaced the bug — without an abort, the CLI prints warnings and continues for every queued job. - -- [ ] **Step 1: Write failing test** - -This follows the `newMockDaemonBuilder` pattern from -`TestFixBatchSkipsPassVerdict` (`cmd/roborev/fix_test.go:2353`). The -mock daemon serves two job IDs; the `FakeAgent` returns a synthetic -quota error; the test asserts the batch aborts after the first job -without calling the daemon for the second. - -```go -func TestRunFixBatchAbortsOnQuotaError(t *testing.T) { - repo := createTestRepo(t, map[string]string{"main.go": "package main\n"}) - - originalAgent, err := agent.Get("test") - require.NoError(t, err, "get test agent") - agentCalls := 0 - agent.Register(&agent.FakeAgent{ - NameStr: "test", - ReviewFn: func(_ context.Context, _, _, _ string, _ io.Writer) (string, error) { - agentCalls++ - return "", errors.New("agent failed: you have exhausted your capacity") - }, - }) - t.Cleanup(func() { agent.Register(originalAgent) }) - - var mu sync.Mutex - var reviewedJobIDs []int64 - _ = newMockDaemonBuilder(t). - WithHandler("/api/jobs", func(w http.ResponseWriter, r *http.Request) { - id := r.URL.Query().Get("id") - switch id { - case "10", "20": - idNum := int64(10) - if id == "20" { - idNum = 20 - } - writeJSON(w, map[string]any{ - "jobs": []storage.ReviewJob{{ - ID: idNum, - Status: storage.JobStatusDone, - Agent: "test", - }}, - }) - default: - writeJSON(w, map[string]any{ - "jobs": []storage.ReviewJob{}, - "has_more": false, - }) - } - }). - WithHandler("/api/review", func(w http.ResponseWriter, r *http.Request) { - jobID := r.URL.Query().Get("job_id") - mu.Lock() - if jobID == "10" { - reviewedJobIDs = append(reviewedJobIDs, 10) - } else if jobID == "20" { - reviewedJobIDs = append(reviewedJobIDs, 20) - } - mu.Unlock() - writeJSON(w, storage.Review{ - JobID: 10, - Output: "## Issues\n- Bug", - }) - }). - WithHandler("/api/enqueue", func(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(http.StatusOK) - }). - Build() - - classifyCalls := 0 - opts := fixOptions{ - agentName: "test", - reasoning: "fast", - classify: func(_, msg string) agentlimit.Classification { - classifyCalls++ - return agentlimit.Classification{ - Kind: agentlimit.KindQuota, - Agent: "gemini", - CooldownFor: 30 * time.Minute, - Message: msg, - } - }, - } - - _, err = runWithOutput(t, repo.Dir, func(cmd *cobra.Command) error { - // batchSize=1 forces one job per agent call. With batchSize=0, - // runFixBatch may pack both jobs into a single batch, making - // agentCalls=1 ambiguous between "aborted before second batch" - // and "both jobs in one batch". batchSize=1 is unambiguous. - return runFixBatch( - cmd, - []int64{10, 20}, - "", - false, false, false, - 1, - opts, - &fixSessionTracker{base: agent.NewTestAgent(), out: io.Discard}, - ) - }) - require.Error(t, err, "expected batch to abort with agentLimitError") - - var lim *agentLimitError - require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) - - assert := assert.New(t) - assert.Equal(agentlimit.KindQuota, lim.Classification.Kind) - assert.Equal(1, classifyCalls, "classifier should be called exactly once") - // agentCalls is the load-bearing assertion: with batchSize=1 and two - // jobs, two agent invocations would mean the loop continued past the - // abort. Exactly one means the abort short-circuits the second batch. - assert.Equal(1, agentCalls, "fix agent should be invoked exactly once before abort") - assert.Contains(err.Error(), "quota limit", "Gemini KindQuota label should be 'quota limit', not 'session limit'") - - // Note: do NOT assert `reviewedJobIDs` does not contain job 20 — - // runFixBatch prefetches every job's review at the start of the - // function (before any agent call), so /api/review fires for all - // queued jobs regardless of when the loop aborts. The agentCalls - // counter above is the correct measure of abort behavior. -} -``` - -The exact handler keys (`"10"`, `"20"`) and the `reviewedJobIDs` shape -mirror `TestFixBatchSkipsPassVerdict`. If `runWithOutput`'s signature -differs from what's shown above, copy it verbatim from the nearest -existing batch test in the same file. - -- [ ] **Step 2: Run test, confirm failure** - -Run: `go test ./cmd/roborev/ -run TestRunFixBatchAbortsOnQuotaError -v` -Expected: FAIL. - -- [ ] **Step 3: Add the abort branch in `runFixBatch`** - -Modify `cmd/roborev/fix.go` — locate the `fixJobDirect` call inside `runFixBatch` (around line 1170) and the surrounding error handling. Apply the same pattern as Task 9: classify on error, abort on `KindQuota`/`KindSession`, warn on `KindNone`, fall through otherwise. Reuse the same `agentLimitError` type and `truncateString` helper. - -The exact site: - -```go -result, err := fixJobDirect(ctx, fixJobParams{ - RepoRoot: repoRoot, - Agent: currentAgent, - Output: capture, -}, prompt) -// ...flush + tracker bookkeeping unchanged... -if err != nil { - cls := opts.classify(agent.CanonicalName(currentAgent.Name()), err.Error()) - switch cls.Kind { - case agentlimit.KindQuota, agentlimit.KindSession: - return &agentLimitError{Classification: cls} - case agentlimit.KindNone: - if err.Error() != "" && !opts.quiet { - flat := strings.ReplaceAll(err.Error(), "\n", " ") - cmd.PrintErrf( - "warning: unclassified agent error from %s: %s\n", - currentAgent.Name(), - truncateString(flat, 200), - ) - } - } - // existing per-job handling continues (warn-and-continue or return) - // ... -} -``` - -Make sure the abort `return` happens *before* the existing per-job warn-and-continue logic so the second job is never attempted. - -- [ ] **Step 4: Run test, confirm pass** - -Run: `go test ./cmd/roborev/ -run TestRunFixBatchAbortsOnQuotaError -v` -Expected: PASS — classifier called exactly once, second job untouched. - -- [ ] **Step 5: Run the full fix suite** - -Run: `go test ./cmd/roborev/ -run TestFix -v` -Expected: PASS for all `TestFix*` tests, including the discovery-mode tests that exercise the warn-and-continue path with non-limit errors. - -- [ ] **Step 6: Run the entire test suite** - -Run: `go test ./...` -Expected: PASS. - -- [ ] **Step 7: Format and commit** - -```bash -go fmt ./... -go vet ./... -git add cmd/roborev/fix.go cmd/roborev/fix_test.go -git commit -m "feat(fix): abort runFixBatch on KindQuota or KindSession" -``` - ---- - -## Task 11: Final verification and lint - -**Files:** -- (verification only) - -- [ ] **Step 1: Confirm dead code is gone** - -```bash -rg -n "isQuotaError|parseQuotaCooldown" . -``` - -Expected: zero matches outside of `docs/` and possibly unrelated history. - -- [ ] **Step 2: Run lint and full tests** - -```bash -make lint -go test ./... -``` - -Expected: clean lint, all tests pass. - -- [ ] **Step 3: Read the diff start-to-end** - -```bash -git log --oneline main..HEAD -git diff main...HEAD -``` - -Look for: leftover TODO comments, unused imports, dead helpers, inconsistent naming between `agentlimit.ClassifyFunc` declarations and call sites. - -- [ ] **Step 4: Manual smoke (if a quota-exhausted agent is available)** - -If a Gemini account in known-exhausted state is reachable, run: - -```bash -roborev fix -``` - -Expected: the fix loop aborts after the first job with a message like - -``` -agent gemini hit a quota limit. Cooldown for ~30m. Re-run after that, or pass --agent to switch. -``` - -If no exhausted agent is reachable, this step is informational only — coverage is in the unit tests. - ---- - -## Spec coverage check (self-review) - -| Spec section / requirement | Implementing task | -|---|---| -| `internal/agentlimit` package with `Kind`, `Classification`, `Classify`, `classifyWithRules` | Task 1, Task 4 | -| Day-1 rule table = nine `isQuotaError` substrings | Task 4 | -| `parseResetDuration` (lifted from `parseQuotaCooldown`) | Task 2 | -| `parseResetTime` (absolute time, same-day rollover) | Task 3 | -| Daemon worker: classifier replaces `isQuotaError`, behavior preserved | Task 5 | -| Daemon worker: classifier injection (test stub) | Task 5 (`classify` field), Task 6 (positive test) | -| Daemon worker: unmatched-error WARN log | Task 7 | -| `fixOptions.classify` hook | Task 8 | -| `agentLimitError` type | Task 8 | -| Abort in `fixSingleJob` on `KindQuota`/`KindSession` | Task 9 | -| Abort in `runFixBatch` on `KindQuota`/`KindSession` | Task 10 | -| CLI unmatched-error warn line | Task 9, Task 10 | -| No exported test helper (cross-package tests use struct literals) | Task 9, Task 10 (tests construct `Classification` literals) | -| Existing `TestFailOrRetryInner_QuotaSkipsRetries` and friends still pass | Task 5 (preserved by routing through default `agentlimit.Classify`) | - ---- - -## Out of scope (per spec) - -These are deliberately *not* in this plan: - -- A production Claude session-cap rule. Adding it is a one-line table change in `defaultRules` plus a unit test fixture, taken in a follow-up PR once a real session-cap message is captured. The unmatched-error WARN log from Task 7 / Tasks 9-10 will surface a sample. -- `--auto-failover` flag for `roborev fix`. -- `roborev status` field showing in-cooldown agents. -- Persisting cooldown state across daemon restarts. -- Any new HTTP API surface. diff --git a/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md b/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md deleted file mode 100644 index 4cd69088..00000000 --- a/docs/superpowers/specs/2026-05-05-agent-rate-limit-handling-design.md +++ /dev/null @@ -1,425 +0,0 @@ -# Agent Rate-Limit and Session-Cap Handling - -## Scope - -This spec covers the foundation: a shared classifier package, unmatched -non-zero exit logging, daemon-worker integration that preserves current -behavior, and CLI fix-loop abort plumbing wired up against a synthetic -test fixture. Adding a production Claude session-cap pattern is an -explicit follow-up, taken once a real Claude limit message is captured -(see [Detection without a captured Claude message](#detection-without-a-captured-claude-message)). -The first PR ships the framework end-to-end; Claude detection is a -one-line table addition plus a unit test in a follow-up PR. - -## Problem - -Long-running `roborev fix` sessions keep iterating fruitlessly when the -configured agent hits its session-level rate limit (e.g. Claude Code's -5-hour usage cap). Every subsequent job hits the same wall and the loop -prints warnings without aborting. - -The daemon worker already has a quota-detection layer (`isQuotaError` in -`internal/daemon/worker.go`, plus per-agent in-process cooldown), but it -has two gaps: - -1. The pattern set only catches Gemini-style ("exhausted your capacity") - and Codex-style ("resource exhausted") wording. Claude's session-cap - message is not recognized, so it falls through to generic - transient-retry and then to a plain failure. -2. `roborev fix` runs the fix agent in-process via `fixJobDirect` - (`cmd/roborev/fix.go:262`). It does not go through the daemon worker - at all, so even a perfect daemon-side classifier would not help the - foreground loop. - -## Goals - -- Detect agent-level rate-limit and session-cap failures consistently in - both the daemon worker and the foreground `roborev fix` loop. -- For the foreground fix loop, abort the entire session immediately when - the configured agent hits a session-cap, surfacing the reset time (or - a conservative fallback message) so the user knows when to retry. -- Preserve the daemon's existing behavior for daemon-owned review jobs - bit-for-bit: on a quota- or session-class error, cooldown the agent - and immediately fail over to a configured backup (or fail the job if - none) — no retries, matching today's `isQuotaError` branch in - `worker.go`. CI flows continue to benefit from best-effort completion - via failover. -- Keep the detection table conservative: high-confidence patterns only, - one unit test per pattern, with logging for unmatched non-zero exits - so new variants can be observed and added incrementally. - -## Non-goals - -- Auto-failover for foreground `roborev fix`. Strict abort is the default; - if user demand surfaces, an `--auto-failover` flag or config setting can - be added later. -- Cross-machine coordination of cooldowns (the existing in-memory cooldown - is per-daemon-process; that is unchanged). -- Surfacing daemon cooldown state to the CLI through a new API. The - classifier is shared via a Go package import, not via HTTP. - -## Architecture - -### New package: `internal/agentlimit` - -A small, dependency-free package shared by the daemon worker and the CLI -fix loop. Pure classification logic plus duration parsing — no I/O, no -process state. - -```go -package agentlimit - -type Kind int - -const ( - KindNone Kind = iota // no rate-limit signal - KindTransient // 429-style; retry locally, no cooldown - KindQuota // hard quota exhaustion (existing behavior) - KindSession // session-level cap (e.g. Claude 5-hour) -) - -type Classification struct { - Kind Kind - Agent string // canonical agent name (alias-resolved by caller) - ResetAt time.Time // zero if not parseable from the message - CooldownFor time.Duration // fallback when ResetAt is zero - Message string // raw error text (for logs / user display) -} - -// Classify inspects an agent error message for rate-limit signatures. -// agent is the canonical agent name (caller resolves aliases). Returns -// KindNone when no signature matches. -func Classify(agent, errMsg string) Classification - -// classifyWithRules is the same as Classify but accepts an explicit -// rule slice. Unexported; used inside the package's own tests so that -// fixtures (e.g. a synthetic KindSession pattern) do not leak into the -// production defaultRules slice. -func classifyWithRules(agent, errMsg string, rules []rule) Classification -``` - -The pattern table lives in the package as a slice of per-agent rules: - -```go -type rule struct { - Agents []string // agents this rule applies to ("*" = any) - Pattern *regexp.Regexp // or substring (case-insensitive) - Kind Kind - ParseReset func(match []string) (time.Time, time.Duration) -} -``` - -Day-1 rules — copied verbatim from the current `isQuotaError` -(`internal/daemon/worker.go:974`) so detection for Gemini and Codex is -byte-for-byte unchanged. The full set of nine substrings (case-insensitive, -matched via `strings.Contains`): - -| Pattern | Kind | Reset parsing | -|---------|------|---------------| -| `resource exhausted` | Quota | `reset after ` if present | -| `quota exceeded` | Quota | same | -| `quota_exceeded` | Quota | same | -| `quota exhausted` | Quota | same | -| `quota_exhausted` | Quota | same | -| `insufficient_quota` | Quota | same | -| `exhausted your capacity` | Quota | same | -| `capacity exhausted` | Quota | same | -| `capacity_exhausted` | Quota | same | - -Reset parsing applies uniformly: any matching message is run through -`parseResetDuration` (today's `parseQuotaCooldown`, moved into the -shared package) so a `reset after X` substring produces `CooldownFor` -when present, regardless of which pattern matched. - -The `Transient` kind exists in the API and is exercised in tests, but -no production rule produces it on day one. Adding a generic -`rate.?limit` pattern was considered and rejected: it overmatches benign -errors and the existing in-call retry layers (`runStreamingCLI` and the -agents' own retry logic) already handle short-lived 429s. - -Claude-specific rules are deliberately omitted at launch — see -[Detection without a captured Claude message](#detection-without-a-captured-claude-message). - -The duration parser currently inline in `worker.go` (the helper that -extracts a cooldown duration from the gemini "reset after Xs" wording) -moves into this package as `parseResetDuration(msg) time.Duration`, so -both consumers share it. - -### Daemon worker (`internal/daemon/worker.go`) changes - -The existing `isQuotaError` helper (line 974) and inline duration parsing -are replaced with calls to `agentlimit.Classify`. Behavior is preserved -exactly — quota-class errors skip retries entirely: - -- `KindQuota` or `KindSession`: cooldown the canonical agent until - `ResetAt` (or `now + CooldownFor`, defaulting to 30m if both are zero), - then immediately call `failoverOrFail`. Retries are skipped — this - matches today's `isQuotaError` branch at `worker.go:792-798`. - CI / daemon-owned review flows continue to benefit from failover when - a backup agent is configured. -- `KindTransient`: do not cooldown; fall through to the normal retry - path (`failOrRetryAgent` semantics unchanged). -- `KindNone`: unchanged (generic retry-then-fail-or-failover path). - -The `cooldownAgent`/`isAgentInCooldown` helpers stay where they are — -only the *trigger* moves to the shared classifier. - -Implementation note: rather than hard-coding `agentlimit.Classify`, the -`WorkerPool` struct holds a `classify func(agent, errMsg string) agentlimit.Classification` -field, initialized to `agentlimit.Classify` at construction. Tests in -this package can substitute a stub for deterministic -`KindSession`/`KindQuota` outcomes without touching `defaultRules`. - -### CLI fix loop (`cmd/roborev/fix.go`) changes - -`fixJobDirect` returns errors from `params.Agent.Review()` as today. -`runFixOpen` (line 456) and `runFixBatch` (line 1000) are the loop -drivers — both call `fixJobDirect` (directly or via `fixSingleJob`) and -process the result. - -After each `fixJobDirect` error, the caller runs the classifier and -branches on the result. The fix loop holds a classifier function -(default `agentlimit.Classify`, swappable in tests) on its options/ -context struct so tests can drive deterministic outcomes: - -```go -cls := opts.classify(canonicalAgent(params.Agent.Name()), err.Error()) -switch cls.Kind { -case agentlimit.KindSession, agentlimit.KindQuota: - // Strict abort. No auto-failover for foreground fix. - return newAgentLimitError(cls) -case agentlimit.KindTransient, agentlimit.KindNone: - // Existing per-job error handling (warn-and-continue in discovery - // mode, return error in explicit-IDs mode). -} -``` - -`newAgentLimitError` formats a user-visible message: - -``` -agent claude-code hit a session limit. Cooldown until 5:42 PM (in 2h 13m). -Re-run `roborev fix ...` after that, or pass --agent to switch. -``` - -If `ResetAt` is zero, fall back to `CooldownFor` (or "an unknown reset -time; try again later" if that is also zero), and the suggestion to -override `--agent` stands. - -The CLI exits non-zero with the standard `1` exit code — no new exit -code. Scripts that want to detect this case can grep for the message. - -### Detection without a captured Claude message - -We have not captured Claude Code's actual session-cap output yet -(checked `~/.roborev/errors.log`, no Claude-tagged limit messages). -Adding a guess-pattern risks both false negatives (wording differs -slightly from what we guessed) and false positives (a benign error -mentioning the word "limit" gets classified as a hard cap). - -The framework ships without a Claude-specific rule. Two safeguards make -the next observed cap actionable: - -1. **Unmatched non-zero exit logging.** When the daemon worker or CLI - fix loop receives an agent error with `Classify().Kind == KindNone`, - log a single WARN line containing the agent name and the first 200 - chars of the error message. This produces a recoverable sample the - moment the next cap fires. -2. **Test-only mock pattern via injectable rules.** The package's - public API is a single `Classify(agent, errMsg)` function backed by - a private `defaultRules` slice. To exercise the `KindSession` branch - end-to-end without polluting the production rule table, the package - also exposes an unexported `classifyWithRules(agent, errMsg, rules)` - helper. Tests inside `agentlimit` call `classifyWithRules` with a - synthetic rule slice (e.g. one that maps a fake "test-claude session - limit" message to `KindSession`); production callers always go - through `Classify` and only see `defaultRules`. - - Tests in `internal/daemon` and `cmd/roborev` need a different - mechanism — they can't reach an unexported helper. They use the - injected `classify` function on `WorkerPool` / fix-loop options - (described below) and have their stub return a literal - `agentlimit.Classification{Kind: agentlimit.KindSession, ResetAt: ...}`. - `Classification` and the `Kind*` constants are already exported, so - no extra test helper is needed. - - Real Claude patterns get added to the production rule table once - captured. - -Once a real Claude message is observed, adding the rule is a one-line -table change plus a unit test fixture. - -### Reset-time parsing - -Two formats need to be supported in the shared parser: - -- Relative duration: `reset after 48m20s`, `try again in 2h13m`, - `retry after 5m`. Existing worker.go logic handles this for Gemini — - it moves into `agentlimit.parseResetDuration` verbatim. -- Absolute time: `resets at 5:42 PM`, `try again at 17:42`. New - parser, returns `time.Time` interpreted in the local timezone with a - same-day or next-day disambiguation (next-day if the parsed time is - already in the past). Day-1 scope is bare clock times only — trailing - qualifiers like `UTC`, `EST`, or `today` defeat `time.Parse` and - produce zero. Extending coverage to timezone-suffixed messages is - deferred until a real Claude session-cap message is captured. - -If neither format matches, both `ResetAt` and `CooldownFor` are zero — -the caller (worker or CLI) applies its own default fallback (30m -cooldown for the daemon, "unknown reset time" message for the CLI). - -## Data flow - -``` -Foreground: roborev fix → fixJobDirect → Agent.Review → error - │ - ▼ - agentlimit.Classify - │ - ┌───────────┴───────────┐ - ▼ ▼ - KindSession/Quota KindTransient/None - │ │ - ▼ ▼ - Abort fix loop with Continue per existing - reset-time error per-job handling - -Daemon: worker job → Agent.Review → error - │ - ▼ - agentlimit.Classify - │ - ┌──────────┴──────────┐ - ▼ ▼ - KindSession/Quota KindTransient/None - │ │ - ▼ ▼ - cooldownAgent(...) Existing retry path - then immediate (failOrRetryAgent - failoverOrFail with up to maxRetries, - (no retries — then failoverOrFail) - matches today's - quota branch) -``` - -## Testing strategy - -### `internal/agentlimit` unit tests - -Table-driven, one row per pattern. Each row asserts: - -- The expected `Kind`. -- The expected `Agent` value. -- The expected `ResetAt` / `CooldownFor` for messages that include reset - info, and zero values for those that do not. - -Negative cases: messages that should *not* match (e.g. an unrelated -error containing the word "limit") map to `KindNone`. - -### Worker integration test - -A new test in `internal/daemon/worker_test.go` exercises the -end-to-end daemon path. To avoid depending on Claude's exact wording -or mutating the production rule table, the worker code calls into a -small package-internal entry point that takes a classifier function -(default: `agentlimit.Classify`); the test substitutes a stub -classifier that returns a `KindSession` `Classification` for the test -agent's error message. - -- Configure a worker pool with a mock agent that returns a session-limit - error string and a stub classifier that maps that string to - `KindSession` with a known `ResetAt`. -- Enqueue a job. Run a worker tick. -- Assert: the agent is now in cooldown, the cooldown expiry matches the - stub's `ResetAt`, retries did *not* happen (`retry_count` stays at 0), - and a subsequent job for the same agent is skipped per existing - `(quota cooldown active)` logic. - -A second test confirms the byte-for-byte preservation of today's quota -behavior: each of the nine substrings from the original `isQuotaError` -list is fed in as an error message; the test asserts each produces -`KindQuota` and that the worker takes the cooldown-then-failover path -without retries. - -### CLI fix abort test - -A new test in `cmd/roborev/fix_test.go` (or `fix_mock_test.go`) uses -the same classifier-injection pattern: the fix-loop call site takes a -classifier function from a package-private factory that defaults to -`agentlimit.Classify`, and the test substitutes a stub. - -- Use the existing `test` agent infrastructure to inject an error. -- Stub classifier maps the error to `KindSession` with a known - `ResetAt`. -- Run `runFixBatch` with two job IDs. -- Assert: the loop aborts after the first job, the returned error - surfaces the parsed reset time, and the second job was *not* - processed. - -### Logging behavior - -A small assertion in either the worker or CLI test confirms that an -unmatched non-zero exit (`KindNone`) emits a WARN with a truncated -error preview, so the unmatched-pattern surfacing path is covered. - -## Migration / rollout - -The work splits into two PRs: - -**PR 1 — Framework (this spec):** shared classifier package, unmatched -non-zero exit logging, daemon-worker integration, and CLI fix-loop -abort plumbing exercised by a synthetic test fixture. - -Behavior changes in PR 1: - -- **Daemon worker (no change):** quota detection for Gemini and Codex - is preserved bit-for-bit. The same nine `isQuotaError` substrings - produce the same cooldown-then-failover-or-fail outcome. -- **Foreground `roborev fix` (intentional change):** today, when a - Gemini/Codex job hits a quota error mid-session, the fix loop logs - a per-job warning and continues (in discovery mode) or returns an - error for that one job (in explicit-IDs mode), then proceeds to the - next job — which usually hits the same wall. After PR 1, any error - matching a `KindQuota` (or future `KindSession`) rule on the - configured fix agent aborts the entire session immediately with a - reset-time message. This is the central improvement of the spec; it - applies to existing Codex/Gemini quota errors as well as future - Claude session caps. Users running `roborev fix` against a - quota-exhausted Gemini or Codex backend will see strict abort - instead of warn-and-continue starting in PR 1. - -**PR 2 — Claude detection (follow-up):** add the production Claude -session-cap rule plus a unit test fixture, once a real session-limit -message has been captured (either by triggering one in normal use, or -by a contributor who has hit one previously). - -Other migration notes: - -- The change is internal (no public API surface). No flag-gating, no - config opt-in. -- `isQuotaError` becomes a thin wrapper over `agentlimit.Classify` if - any tests still reference it directly; otherwise it is removed in - PR 1. -- Existing behavior for Gemini and Codex is preserved bit-for-bit by - copying all nine patterns from `isQuotaError` verbatim into the new - rule table — verified by the regression test described above that - runs each pattern through the new classifier. - -## Risks and mitigations - -| Risk | Mitigation | -|------|------------| -| Pattern matching is brittle; agents change wording. | Conservative patterns + unit tests + WARN log on unmatched non-zero exits so new variants surface and can be added one-by-one. | -| Misclassifying a benign error as `KindSession` aborts the user's fix loop unnecessarily. | Patterns require specific phrasing (e.g. "exhausted", "session limit"), not generic words like "limit" or "rate". Negative test cases lock this in. | -| Reset-time parsing produces a wrong `time.Time` (e.g. timezone mishandling). | Local-timezone interpretation with explicit same-day/next-day disambiguation, plus unit tests for boundary cases. | -| The CLI abort message is unactionable if neither `ResetAt` nor `CooldownFor` is parseable. | Fallback message includes the raw agent error text and the `--agent` override hint, so the user has both context and a way out. | - -## Out of scope (potential follow-ups) - -- `--auto-failover` flag for `roborev fix` to mirror daemon failover. -- A `roborev status` field showing in-cooldown agents and their - expiries. -- Persisting cooldown state across daemon restarts (today it is purely - in-memory). -- Adding rate-limit handling for the CI poller and `roborev refine` - paths beyond what they inherit transitively from the daemon and the - fix loop. From 3fecb4b860669522775d903707001ebb7fd3ede6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 17:05:20 -0500 Subject: [PATCH 21/24] refactor: fold agentlimit into agent; classify retry errors The classifier only had two consumers (the daemon worker and the foreground fix loop), so a separate package was overkill. Move the files into internal/agent/ and rename the exported names with a Limit prefix (LimitKind, LimitClassification, LimitClassifier, ClassifyLimit, LimitKindNone/Transient/Quota/Session). ParseResetDuration and ParseResetTime keep their names; they were already specific. Drop the internal/agentlimit/ directory. Address review finding 18515 (medium): fixJobDirect's commit-retry swallowed any retry error as a "Warning: commit agent failed:" line, so a quota or session-limit hit on that retry slipped past the classifier and the fix loop kept invoking the exhausted agent. Plumb agent.ClassifyLimit through fixJobParams.Classify (defaulting when nil so existing direct callers do not break) and abort with *agentLimitError on LimitKindQuota / LimitKindSession; non-limit retry errors keep today's warn-and-continue. fixSingleJob and runFixBatch now short-circuit on a returned *agentLimitError before re-classifying err.Error(), since the formatted user-facing message is no longer the original agent error. Regression test TestFixJobDirect_RetryClassifiesAgentLimit drives the retry path with a FakeAgent that succeeds-with-uncommitted- changes on the first call and returns a synthetic session-cap on the second; asserts *agentLimitError propagates and the formatted message identifies the agent and limit kind. --- CLAUDE.md | 2 +- cmd/roborev/fix.go | 61 +++++++--- cmd/roborev/fix_test.go | 93 ++++++++++++--- internal/agent/limit.go | 103 +++++++++++++++++ .../parse.go => agent/limit_parse.go} | 4 +- .../limit_parse_test.go} | 2 +- .../limit_test.go} | 51 ++++----- internal/agentlimit/agentlimit.go | 106 ------------------ internal/daemon/worker.go | 15 ++- internal/daemon/worker_test.go | 13 +-- 10 files changed, 270 insertions(+), 180 deletions(-) create mode 100644 internal/agent/limit.go rename internal/{agentlimit/parse.go => agent/limit_parse.go} (95%) rename internal/{agentlimit/parse_test.go => agent/limit_parse_test.go} (99%) rename internal/{agentlimit/agentlimit_test.go => agent/limit_test.go} (55%) delete mode 100644 internal/agentlimit/agentlimit.go diff --git a/CLAUDE.md b/CLAUDE.md index d8347207..337a7b1c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -210,7 +210,7 @@ The internal `lookupFieldByTag()` helper resolves these via reflection on the st - **Retries**: Up to 3 retries for transient failures (`db.RetryJob`). Resets status to queued. - **Failover**: After retries exhausted (or on quota errors), switches to backup agent via `db.FailoverJob`. Resets retry_count, sets backup agent/model. -- **Cooldown**: Quota exhaustion errors (classified by `internal/agentlimit.Classify` as `KindQuota`/`KindSession`) trigger per-agent cooldown (default 30 min, parsed from the error message via `agentlimit.ParseResetDuration`/`ParseResetTime`). Cooldowns are tracked in-memory with RWMutex. +- **Cooldown**: Quota exhaustion errors (classified by `agent.ClassifyLimit` as `LimitKindQuota`/`LimitKindSession`) trigger per-agent cooldown (default 30 min, parsed from the error message via `agent.ParseResetDuration`/`ParseResetTime`). Cooldowns are tracked in-memory with RWMutex. ### Workflow derivation for failover diff --git a/cmd/roborev/fix.go b/cmd/roborev/fix.go index 2067a178..3a167f29 100644 --- a/cmd/roborev/fix.go +++ b/cmd/roborev/fix.go @@ -14,7 +14,6 @@ import ( "time" "github.com/roborev-dev/roborev/internal/agent" - "github.com/roborev-dev/roborev/internal/agentlimit" "github.com/roborev-dev/roborev/internal/config" "github.com/roborev-dev/roborev/internal/daemon" "github.com/roborev-dev/roborev/internal/git" @@ -142,7 +141,7 @@ Examples: minSeverity: minSeverity, quiet: quiet, resume: resume, - classify: agentlimit.Classify, + classify: agent.ClassifyLimit, } roots, err := resolveCurrentRepoRoots() @@ -232,17 +231,17 @@ type fixOptions struct { resume bool // classify is the rate-limit classifier. Defaults to - // agentlimit.Classify in the production cobra command's RunE; tests + // agent.ClassifyLimit in the production cobra command's RunE; tests // inject a stub to drive deterministic KindQuota / KindSession // outcomes without depending on real agent error wording. - classify agentlimit.ClassifyFunc + classify agent.LimitClassifier } // agentLimitError is returned by the fix loop when the configured agent // hits a quota or session limit. The fix command surfaces it as the // process exit error so users see the reset time and a hint to retry. type agentLimitError struct { - Classification agentlimit.Classification + Classification agent.LimitClassification } func (e *agentLimitError) Error() string { @@ -254,7 +253,7 @@ func (e *agentLimitError) Error() string { // The label ("quota" / "session limit" / "rate limit") is derived from // cls.Kind so a Gemini/Codex KindQuota abort doesn't mis-report itself // as a session-cap. -func formatAgentLimitMessage(cls agentlimit.Classification, now time.Time) string { +func formatAgentLimitMessage(cls agent.LimitClassification, now time.Time) string { label := agentLimitLabel(cls.Kind) var dur time.Duration switch { @@ -294,13 +293,13 @@ func formatAgentLimitMessage(cls agentlimit.Classification, now time.Time) strin } } -func agentLimitLabel(k agentlimit.Kind) string { +func agentLimitLabel(k agent.LimitKind) string { switch k { - case agentlimit.KindSession: + case agent.LimitKindSession: return "session limit" - case agentlimit.KindQuota: + case agent.LimitKindQuota: return "quota limit" - case agentlimit.KindTransient: + case agent.LimitKindTransient: return "rate limit" default: return "rate limit" @@ -312,6 +311,9 @@ type fixJobParams struct { RepoRoot string Agent agent.Agent Output io.Writer // agent streaming output (nil = discard) + // Classify is the rate-limit classifier used for the commit-retry + // path. nil defaults to agent.ClassifyLimit. Tests inject a stub. + Classify agent.LimitClassifier } // fixJobResult contains the outcome of a fix operation. @@ -397,6 +399,18 @@ func fixJobDirect(ctx context.Context, params fixJobParams, prompt string) (*fix } } if _, retryErr := retryAgent.Review(ctx, params.RepoRoot, "HEAD", buildGenericCommitPrompt(), out); retryErr != nil { + // Classify the retry error so quota/session limits abort + // instead of being demoted to a warning — otherwise the fix + // loop keeps invoking the exhausted agent on every following + // job until the queue is empty. + classify := params.Classify + if classify == nil { + classify = agent.ClassifyLimit + } + cls := classify(agent.CanonicalName(retryAgent.Name()), retryErr.Error()) + if cls.Kind == agent.LimitKindQuota || cls.Kind == agent.LimitKindSession { + return nil, &agentLimitError{Classification: cls} + } fmt.Fprintf(out, "Warning: commit agent failed: %v\n", retryErr) } if sha, ok := detectNewCommit(params.RepoRoot, headBefore); ok { @@ -914,7 +928,7 @@ func jobVerdict(job *storage.ReviewJob, review *storage.Review) string { func fixSingleJob(cmd *cobra.Command, repoRoot string, jobID int64, opts fixOptions, tracker *fixSessionTracker) error { if opts.classify == nil { - opts.classify = agentlimit.Classify + opts.classify = agent.ClassifyLimit } ctx := cmd.Context() if ctx == nil { @@ -1015,6 +1029,7 @@ func fixSingleJob(cmd *cobra.Command, repoRoot string, jobID int64, opts fixOpti RepoRoot: repoRoot, Agent: currentAgent, Output: capture, + Classify: opts.classify, }, buildGenericFixPrompt(review.Output, minSev, comments)) // Flush capture FIRST so session extraction completes before reading SessionID. capture.Flush() @@ -1023,11 +1038,18 @@ func fixSingleJob(cmd *cobra.Command, repoRoot string, jobID int64, opts fixOpti } if err != nil { tracker.Reset() + // fixJobDirect already returns *agentLimitError for retry-path + // quota/session aborts; preserve it instead of re-classifying + // its user-facing message string. + var lim *agentLimitError + if errors.As(err, &lim) { + return err + } cls := opts.classify(agent.CanonicalName(currentAgent.Name()), err.Error()) switch cls.Kind { - case agentlimit.KindQuota, agentlimit.KindSession: + case agent.LimitKindQuota, agent.LimitKindSession: return &agentLimitError{Classification: cls} - case agentlimit.KindNone: + case agent.LimitKindNone: if err.Error() != "" && !opts.quiet { flat := strings.ReplaceAll(err.Error(), "\n", " ") cmd.PrintErrf( @@ -1101,7 +1123,7 @@ type batchEntry struct { // respecting max prompt size, and runs each batch as a single agent invocation. func runFixBatch(cmd *cobra.Command, jobIDs []int64, branch string, allBranches, explicitBranch, newestFirst bool, batchSize int, opts fixOptions, tracker *fixSessionTracker) error { if opts.classify == nil { - opts.classify = agentlimit.Classify + opts.classify = agent.ClassifyLimit } if err := ensureDaemon(); err != nil { return err @@ -1276,6 +1298,7 @@ func runFixBatch(cmd *cobra.Command, jobIDs []int64, branch string, allBranches, RepoRoot: roots.worktreeRoot, Agent: currentAgent, Output: capture, + Classify: opts.classify, }, prompt) // Flush capture FIRST so session extraction completes before reading SessionID. capture.Flush() @@ -1284,11 +1307,17 @@ func runFixBatch(cmd *cobra.Command, jobIDs []int64, branch string, allBranches, } if err != nil { tracker.Reset() + // Preserve a retry-path agentLimitError without + // re-classifying its user-facing message string. + var lim *agentLimitError + if errors.As(err, &lim) { + return err + } cls := opts.classify(agent.CanonicalName(currentAgent.Name()), err.Error()) switch cls.Kind { - case agentlimit.KindQuota, agentlimit.KindSession: + case agent.LimitKindQuota, agent.LimitKindSession: return &agentLimitError{Classification: cls} - case agentlimit.KindNone: + case agent.LimitKindNone: if err.Error() != "" && !opts.quiet { flat := strings.ReplaceAll(err.Error(), "\n", " ") cmd.PrintErrf( diff --git a/cmd/roborev/fix_test.go b/cmd/roborev/fix_test.go index 9b1da9a7..f2c16cab 100644 --- a/cmd/roborev/fix_test.go +++ b/cmd/roborev/fix_test.go @@ -24,7 +24,6 @@ import ( "github.com/spf13/cobra" "github.com/roborev-dev/roborev/internal/agent" - "github.com/roborev-dev/roborev/internal/agentlimit" "github.com/roborev-dev/roborev/internal/config" "github.com/roborev-dev/roborev/internal/daemon" "github.com/roborev-dev/roborev/internal/storage" @@ -3637,10 +3636,10 @@ func TestFixSingleJobAbortsOnSessionLimit(t *testing.T) { opts := fixOptions{ agentName: "test", reasoning: "fast", - classify: func(_, msg string) agentlimit.Classification { + classify: func(_, msg string) agent.LimitClassification { classifyCalls++ - return agentlimit.Classification{ - Kind: agentlimit.KindSession, + return agent.LimitClassification{ + Kind: agent.LimitKindSession, Agent: "claude-code", ResetAt: resetAt, Message: msg, @@ -3658,7 +3657,7 @@ func TestFixSingleJobAbortsOnSessionLimit(t *testing.T) { require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) assert := assert.New(t) - assert.Equal(agentlimit.KindSession, lim.Classification.Kind) + assert.Equal(agent.LimitKindSession, lim.Classification.Kind) assert.Equal(1, classifyCalls, "classifier should be called exactly once") assert.Contains(err.Error(), "claude-code") assert.Contains(err.Error(), "session limit") @@ -3728,10 +3727,10 @@ func TestRunFixBatchAbortsOnQuotaError(t *testing.T) { opts := fixOptions{ agentName: "test", reasoning: "fast", - classify: func(_, msg string) agentlimit.Classification { + classify: func(_, msg string) agent.LimitClassification { classifyCalls++ - return agentlimit.Classification{ - Kind: agentlimit.KindQuota, + return agent.LimitClassification{ + Kind: agent.LimitKindQuota, Agent: "gemini", CooldownFor: 30 * time.Minute, Message: msg, @@ -3763,7 +3762,7 @@ func TestRunFixBatchAbortsOnQuotaError(t *testing.T) { require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) assert := assert.New(t) - assert.Equal(agentlimit.KindQuota, lim.Classification.Kind) + assert.Equal(agent.LimitKindQuota, lim.Classification.Kind) assert.Equal(1, classifyCalls, "classifier should be called exactly once") // agentCalls is the load-bearing assertion: with batchSize=1 and two // jobs, two agent invocations would mean the loop continued past the @@ -3833,10 +3832,10 @@ func TestRunFixWithSeenDiscoveryAbortsOnAgentLimit(t *testing.T) { opts := fixOptions{ agentName: "test", reasoning: "fast", - classify: func(_, msg string) agentlimit.Classification { + classify: func(_, msg string) agent.LimitClassification { classifyCalls++ - return agentlimit.Classification{ - Kind: agentlimit.KindQuota, + return agent.LimitClassification{ + Kind: agent.LimitKindQuota, Agent: "gemini", CooldownFor: 30 * time.Minute, Message: msg, @@ -3857,7 +3856,7 @@ func TestRunFixWithSeenDiscoveryAbortsOnAgentLimit(t *testing.T) { require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) assert := assert.New(t) - assert.Equal(agentlimit.KindQuota, lim.Classification.Kind) + assert.Equal(agent.LimitKindQuota, lim.Classification.Kind) assert.Equal(1, classifyCalls, "classifier should be called exactly once") // Load-bearing: only one agent invocation means the abort short- // circuited the loop. Two would mean discovery mode demoted the @@ -3865,3 +3864,71 @@ func TestRunFixWithSeenDiscoveryAbortsOnAgentLimit(t *testing.T) { assert.Equal(1, agentCalls, "fix agent should be invoked exactly once before abort") assert.False(seen[20], "second job must not be marked seen after abort") } + +func TestFixJobDirect_RetryClassifiesAgentLimit(t *testing.T) { + // fixJobDirect runs the agent twice when the first call leaves + // uncommitted changes: once for the fix, once with commit + // instructions. A quota or session-limit error on that second + // call must produce *agentLimitError, not a swallowed warning. + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available") + } + + dir := t.TempDir() + for _, args := range [][]string{ + {"init"}, + {"config", "user.email", "test@test.com"}, + {"config", "user.name", "Test"}, + {"commit", "--allow-empty", "-m", "base"}, + } { + c := exec.Command("git", args...) + c.Dir = dir + require.NoError(t, c.Run(), "git %v", args) + } + + calls := 0 + ag := &agent.FakeAgent{ + NameStr: "test", + ReviewFn: func(_ context.Context, repoPath, _, _ string, _ io.Writer) (string, error) { + calls++ + if calls == 1 { + // First call: leave an uncommitted change so the retry + // path is reached. + return "applied fix", os.WriteFile( + filepath.Join(repoPath, "dirty.txt"), + []byte("uncommitted"), + 0o644, + ) + } + // Second call (commit retry): hit a session limit. + return "", errors.New("simulated claude session cap") + }, + } + + classifyCalls := 0 + classify := func(_, msg string) agent.LimitClassification { + classifyCalls++ + return agent.LimitClassification{ + Kind: agent.LimitKindSession, + Agent: "claude-code", + Message: msg, + } + } + + _, err := fixJobDirect(context.Background(), fixJobParams{ + RepoRoot: dir, + Agent: ag, + Classify: classify, + }, "fix things") + require.Error(t, err, "expected agentLimitError from retry path") + + var lim *agentLimitError + require.ErrorAs(t, err, &lim, "expected *agentLimitError, got %T: %v", err, err) + + assert := assert.New(t) + assert.Equal(agent.LimitKindSession, lim.Classification.Kind) + assert.Equal(2, calls, "agent must be called twice (fix + retry) before abort") + assert.Equal(1, classifyCalls, "classifier should run once on the retry error") + assert.Contains(err.Error(), "claude-code") + assert.Contains(err.Error(), "session limit") +} diff --git a/internal/agent/limit.go b/internal/agent/limit.go new file mode 100644 index 00000000..530876c2 --- /dev/null +++ b/internal/agent/limit.go @@ -0,0 +1,103 @@ +package agent + +import ( + "strings" + "time" +) + +// LimitKind labels a classified agent error. +type LimitKind int + +const ( + LimitKindNone LimitKind = iota // no rate-limit signal recognized + LimitKindTransient // 429-style; retry locally, no cooldown + LimitKindQuota // hard quota exhaustion (Gemini/Codex today) + LimitKindSession // session-level cap (e.g. Claude 5-hour) +) + +// LimitClassification is the result of inspecting an agent error. +type LimitClassification struct { + Kind LimitKind + Agent string // canonical agent name (caller resolves aliases) + ResetAt time.Time // zero if not parseable from the message + CooldownFor time.Duration // zero if not parseable; caller applies its own fallback + Message string // raw error text (for logs / user display) +} + +// LimitClassifier is the function shape used by callers that want to inject +// a stub in tests. +type LimitClassifier func(agent, errMsg string) LimitClassification + +// limitRule is one substring → kind mapping. The Agents slice restricts +// the rule to specific canonical agent names; "*" applies to any agent. +type limitRule struct { + Agents []string // canonical agent names; "*" = any + Substring string // case-insensitive substring match on the error message + Kind LimitKind +} + +// defaultLimitRules is the production rule table. The nine quota +// substrings are copied from the original isQuotaError set in +// internal/daemon/worker.go so detection for Gemini and Codex is +// byte-for-byte unchanged. +// +// Claude session-cap patterns are intentionally absent — once a real +// session-cap message is captured, a single LimitKindSession rule plus +// a unit-test fixture is sufficient to enable Claude detection. +var defaultLimitRules = []limitRule{ + {Agents: []string{"*"}, Substring: "resource exhausted", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "quota exceeded", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "quota_exceeded", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "quota exhausted", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "quota_exhausted", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "insufficient_quota", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "exhausted your capacity", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "capacity exhausted", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "capacity_exhausted", Kind: LimitKindQuota}, +} + +// ClassifyLimit inspects an agent error message and returns a +// LimitClassification describing whether (and how) the agent is +// rate-limited. The agent argument is the canonical agent name; the +// caller is responsible for resolving any aliases (e.g. "claude" → +// "claude-code") before calling. +// +// Returns Kind == LimitKindNone when no rule matches. +func ClassifyLimit(agent, errMsg string) LimitClassification { + return classifyLimitWithRules(agent, errMsg, defaultLimitRules) +} + +// classifyLimitWithRules is ClassifyLimit with an explicit rule slice. +// Unexported; used inside the package's own tests so synthetic fixtures +// (e.g. a LimitKindSession pattern) do not leak into defaultLimitRules. +func classifyLimitWithRules(agent, errMsg string, rules []limitRule) LimitClassification { + if errMsg == "" { + return LimitClassification{Kind: LimitKindNone, Agent: agent, Message: errMsg} + } + lower := strings.ToLower(errMsg) + for _, r := range rules { + if !limitRuleAppliesToAgent(r, agent) { + continue + } + if !strings.Contains(lower, r.Substring) { + continue + } + return LimitClassification{ + Kind: r.Kind, + Agent: agent, + ResetAt: ParseResetTime(errMsg), + CooldownFor: ParseResetDuration(errMsg), + Message: errMsg, + } + } + return LimitClassification{Kind: LimitKindNone, Agent: agent, Message: errMsg} +} + +func limitRuleAppliesToAgent(r limitRule, agent string) bool { + for _, a := range r.Agents { + if a == "*" || a == agent { + return true + } + } + return false +} diff --git a/internal/agentlimit/parse.go b/internal/agent/limit_parse.go similarity index 95% rename from internal/agentlimit/parse.go rename to internal/agent/limit_parse.go index 21077903..025a5931 100644 --- a/internal/agentlimit/parse.go +++ b/internal/agent/limit_parse.go @@ -1,4 +1,4 @@ -package agentlimit +package agent import ( "strings" @@ -69,8 +69,6 @@ func parseResetTimeAt(errMsg string, now time.Time) time.Time { default: return time.Time{} } - // Consume up to the next non-time character. Allow digits, ':', and - // AM/PM markers (with optional spaces before them). rest := errMsg[idx:] end := len(rest) for i, r := range rest { diff --git a/internal/agentlimit/parse_test.go b/internal/agent/limit_parse_test.go similarity index 99% rename from internal/agentlimit/parse_test.go rename to internal/agent/limit_parse_test.go index 3201f667..a467cfad 100644 --- a/internal/agentlimit/parse_test.go +++ b/internal/agent/limit_parse_test.go @@ -1,4 +1,4 @@ -package agentlimit +package agent import ( "testing" diff --git a/internal/agentlimit/agentlimit_test.go b/internal/agent/limit_test.go similarity index 55% rename from internal/agentlimit/agentlimit_test.go rename to internal/agent/limit_test.go index 006bba2a..18f77325 100644 --- a/internal/agentlimit/agentlimit_test.go +++ b/internal/agent/limit_test.go @@ -1,4 +1,4 @@ -package agentlimit +package agent import ( "testing" @@ -7,20 +7,20 @@ import ( "github.com/stretchr/testify/assert" ) -func TestClassificationZeroValue(t *testing.T) { - var c Classification +func TestLimitClassificationZeroValue(t *testing.T) { + var c LimitClassification assert := assert.New(t) - assert.Equal(KindNone, c.Kind) + assert.Equal(LimitKindNone, c.Kind) assert.Empty(c.Agent) assert.True(c.ResetAt.IsZero()) assert.Equal(time.Duration(0), c.CooldownFor) assert.Empty(c.Message) } -func TestClassifyProductionPatterns(t *testing.T) { +func TestClassifyLimitProductionPatterns(t *testing.T) { // All nine substrings from the original isQuotaError set must - // produce KindQuota. This is the byte-for-byte regression test - // for current Gemini and Codex detection. + // produce LimitKindQuota. This is the byte-for-byte regression + // test for current Gemini and Codex detection. patterns := []string{ "resource exhausted", "quota exceeded", @@ -34,27 +34,27 @@ func TestClassifyProductionPatterns(t *testing.T) { } for _, p := range patterns { t.Run(p, func(t *testing.T) { - cls := Classify("gemini", "agent failed: "+p+", retrying...") + cls := ClassifyLimit("gemini", "agent failed: "+p+", retrying...") assert := assert.New(t) - assert.Equal(KindQuota, cls.Kind, "expected KindQuota for %q", p) + assert.Equal(LimitKindQuota, cls.Kind, "expected LimitKindQuota for %q", p) assert.Equal("gemini", cls.Agent) assert.Contains(cls.Message, p) }) } } -func TestClassifyExtractsCooldownDuration(t *testing.T) { - cls := Classify( +func TestClassifyLimitExtractsCooldownDuration(t *testing.T) { + cls := ClassifyLimit( "gemini", "You have exhausted your capacity on this model. Your quota will reset after 48m20s.", ) assert := assert.New(t) - assert.Equal(KindQuota, cls.Kind) + assert.Equal(LimitKindQuota, cls.Kind) assert.Equal(48*time.Minute+20*time.Second, cls.CooldownFor) assert.True(cls.ResetAt.IsZero(), "no absolute time in this message") } -func TestClassifyNegativeCases(t *testing.T) { +func TestClassifyLimitNegativeCases(t *testing.T) { cases := []struct { name string msg string @@ -66,28 +66,29 @@ func TestClassifyNegativeCases(t *testing.T) { } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { - cls := Classify("gemini", tc.msg) - assert.Equal(t, KindNone, cls.Kind, "expected KindNone for %q", tc.msg) + cls := ClassifyLimit("gemini", tc.msg) + assert.Equal(t, LimitKindNone, cls.Kind, "expected LimitKindNone for %q", tc.msg) }) } } -func TestClassifyWithRulesIsolatesSyntheticPattern(t *testing.T) { - // Synthetic rule used only inside this test — does not pollute defaultRules. - syntheticRules := []rule{ - {Agents: []string{"*"}, Substring: "test-claude session limit", Kind: KindSession}, +func TestClassifyLimitWithRulesIsolatesSyntheticPattern(t *testing.T) { + // Synthetic rule used only inside this test — does not pollute + // defaultLimitRules. + syntheticRules := []limitRule{ + {Agents: []string{"*"}, Substring: "test-claude session limit", Kind: LimitKindSession}, } - cls := classifyWithRules( + cls := classifyLimitWithRules( "claude-code", "5-hour test-claude session limit reached", syntheticRules, ) assert := assert.New(t) - assert.Equal(KindSession, cls.Kind) + assert.Equal(LimitKindSession, cls.Kind) assert.Equal("claude-code", cls.Agent) - // Same message via the production Classify must not match — - // the synthetic rule is not in defaultRules. - cls2 := Classify("claude-code", "5-hour test-claude session limit reached") - assert.Equal(KindNone, cls2.Kind, "synthetic rule must not leak into defaultRules") + // Same message via the production ClassifyLimit must not match — + // the synthetic rule is not in defaultLimitRules. + cls2 := ClassifyLimit("claude-code", "5-hour test-claude session limit reached") + assert.Equal(LimitKindNone, cls2.Kind, "synthetic rule must not leak into defaultLimitRules") } diff --git a/internal/agentlimit/agentlimit.go b/internal/agentlimit/agentlimit.go deleted file mode 100644 index 802d1dbe..00000000 --- a/internal/agentlimit/agentlimit.go +++ /dev/null @@ -1,106 +0,0 @@ -// Package agentlimit classifies agent CLI error messages as rate-limit, -// quota, or session-cap signals so the daemon worker and the foreground -// roborev fix loop can react consistently. Pure logic — no I/O. -package agentlimit - -import ( - "strings" - "time" -) - -// Kind labels a classified agent error. -type Kind int - -const ( - KindNone Kind = iota // no rate-limit signal recognized - KindTransient // 429-style; retry locally, no cooldown - KindQuota // hard quota exhaustion (Gemini/Codex today) - KindSession // session-level cap (e.g. Claude 5-hour) -) - -// Classification is the result of inspecting an agent error. -type Classification struct { - Kind Kind - Agent string // canonical agent name (caller resolves aliases) - ResetAt time.Time // zero if not parseable from the message - CooldownFor time.Duration // zero if not parseable; caller applies its own fallback - Message string // raw error text (for logs / user display) -} - -// ClassifyFunc is the function shape used by callers that want to inject -// a stub in tests. -type ClassifyFunc func(agent, errMsg string) Classification - -// rule is one substring → kind mapping. The Agents slice restricts the -// rule to specific canonical agent names; "*" applies to any agent. -type rule struct { - Agents []string // canonical agent names; "*" = any - Substring string // case-insensitive substring match on the error message - Kind Kind -} - -// defaultRules is the production rule table. The nine quota substrings -// are copied from the original isQuotaError set in -// internal/daemon/worker.go so detection for Gemini and Codex is -// byte-for-byte unchanged. -// -// Claude session-cap patterns are intentionally absent — see the design -// doc's "Detection without a captured Claude message" section. Once a -// real session-cap message is captured, a single KindSession rule plus -// a unit-test fixture is sufficient to enable Claude detection. -var defaultRules = []rule{ - {Agents: []string{"*"}, Substring: "resource exhausted", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "quota exceeded", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "quota_exceeded", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "quota exhausted", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "quota_exhausted", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "insufficient_quota", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "exhausted your capacity", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "capacity exhausted", Kind: KindQuota}, - {Agents: []string{"*"}, Substring: "capacity_exhausted", Kind: KindQuota}, -} - -// Classify inspects an agent error message and returns a Classification -// describing whether (and how) the agent is rate-limited. The agent -// argument is the canonical agent name; the caller is responsible for -// resolving any aliases (e.g. "claude" → "claude-code") before calling. -// -// Returns Kind == KindNone when no rule matches. -func Classify(agent, errMsg string) Classification { - return classifyWithRules(agent, errMsg, defaultRules) -} - -// classifyWithRules is Classify with an explicit rule slice. Unexported; -// used inside the package's own tests so synthetic fixtures (e.g. a -// KindSession pattern) do not leak into defaultRules. -func classifyWithRules(agent, errMsg string, rules []rule) Classification { - if errMsg == "" { - return Classification{Kind: KindNone, Agent: agent, Message: errMsg} - } - lower := strings.ToLower(errMsg) - for _, r := range rules { - if !ruleAppliesToAgent(r, agent) { - continue - } - if !strings.Contains(lower, r.Substring) { - continue - } - return Classification{ - Kind: r.Kind, - Agent: agent, - ResetAt: ParseResetTime(errMsg), - CooldownFor: ParseResetDuration(errMsg), - Message: errMsg, - } - } - return Classification{Kind: KindNone, Agent: agent, Message: errMsg} -} - -func ruleAppliesToAgent(r rule, agent string) bool { - for _, a := range r.Agents { - if a == "*" || a == agent { - return true - } - } - return false -} diff --git a/internal/daemon/worker.go b/internal/daemon/worker.go index 632e6f22..439b70ec 100644 --- a/internal/daemon/worker.go +++ b/internal/daemon/worker.go @@ -13,7 +13,6 @@ import ( "time" "github.com/roborev-dev/roborev/internal/agent" - "github.com/roborev-dev/roborev/internal/agentlimit" "github.com/roborev-dev/roborev/internal/config" gitpkg "github.com/roborev-dev/roborev/internal/git" "github.com/roborev-dev/roborev/internal/prompt" @@ -49,9 +48,9 @@ type WorkerPool struct { agentCooldownsMu sync.RWMutex // classify is the rate-limit/quota classifier. Defaults to - // agentlimit.Classify; tests substitute a stub by setting this + // agent.ClassifyLimit; tests substitute a stub by setting this // field directly after construction (test-only access). - classify agentlimit.ClassifyFunc + classify agent.LimitClassifier // Output capture for tail command outputBuffers *OutputBuffer @@ -76,7 +75,7 @@ func NewWorkerPool(db *storage.DB, cfgGetter ConfigGetter, numWorkers int, broad pendingCancels: make(map[int64]bool), agentCooldowns: make(map[string]time.Time), outputBuffers: NewOutputBuffer(512*1024, 4*1024*1024), // 512KB/job, 4MB total - classify: agentlimit.Classify, + classify: agent.ClassifyLimit, } } @@ -797,11 +796,11 @@ func (wp *WorkerPool) failOrRetryInner(workerID string, job *storage.ReviewJob, // Quota and session-limit errors skip retries entirely — cool down // the agent and attempt failover or fail. Behavior matches the // original isQuotaError branch; classification now lives in - // internal/agentlimit so the CLI fix loop can share it. + // internal/agent (ClassifyLimit) so the CLI fix loop can share it. if agentError { cls := wp.classify(agent.CanonicalName(agentName), errorMsg) switch cls.Kind { - case agentlimit.KindQuota, agentlimit.KindSession: + case agent.LimitKindQuota, agent.LimitKindSession: dur := defaultCooldown if cls.CooldownFor > 0 { dur = cls.CooldownFor @@ -816,7 +815,7 @@ func (wp *WorkerPool) failOrRetryInner(workerID string, job *storage.ReviewJob, workerID, agentName, dur) wp.failoverOrFail(workerID, job, agentName, errorMsg) return - case agentlimit.KindNone: + case agent.LimitKindNone: if errorMsg != "" { preview := strings.ReplaceAll(errorMsg, "\n", " ") preview = strings.ReplaceAll(preview, "\r", "") @@ -824,7 +823,7 @@ func (wp *WorkerPool) failOrRetryInner(workerID string, job *storage.ReviewJob, workerID, agentName, truncateRunes(preview, 200)) } // fall through to context-window / retry handling - case agentlimit.KindTransient: + case agent.LimitKindTransient: // fall through to retry handling } } diff --git a/internal/daemon/worker_test.go b/internal/daemon/worker_test.go index c88b6073..d3151c0c 100644 --- a/internal/daemon/worker_test.go +++ b/internal/daemon/worker_test.go @@ -12,7 +12,6 @@ import ( "runtime" "github.com/roborev-dev/roborev/internal/agent" - "github.com/roborev-dev/roborev/internal/agentlimit" "github.com/roborev-dev/roborev/internal/config" gitpkg "github.com/roborev-dev/roborev/internal/git" "github.com/roborev-dev/roborev/internal/prompt" @@ -1201,16 +1200,16 @@ func TestFailOrRetryInner_SessionLimitCoolsDownAndSkipsRetries(t *testing.T) { // Stub classifier: any error message containing the marker yields // KindSession with a 1-hour CooldownFor. Tests the seam without // depending on real Claude wording. - tc.Pool.classify = func(agentName, msg string) agentlimit.Classification { + tc.Pool.classify = func(agentName, msg string) agent.LimitClassification { if strings.Contains(msg, "MARKER-SESSION-LIMIT") { - return agentlimit.Classification{ - Kind: agentlimit.KindSession, + return agent.LimitClassification{ + Kind: agent.LimitKindSession, Agent: agentName, CooldownFor: 1 * time.Hour, Message: msg, } } - return agentlimit.Classification{Kind: agentlimit.KindNone, Agent: agentName, Message: msg} + return agent.LimitClassification{Kind: agent.LimitKindNone, Agent: agentName, Message: msg} } tc.Pool.failOrRetryAgent(testWorkerID, job, "test", "boom MARKER-SESSION-LIMIT") @@ -1237,8 +1236,8 @@ func TestFailOrRetryInner_UnmatchedAgentErrorLogsWarn(t *testing.T) { sha := testutil.GetHeadSHA(t, tc.TmpDir) job := tc.createAndClaimJob(t, sha, testWorkerID) - tc.Pool.classify = func(agentName, msg string) agentlimit.Classification { - return agentlimit.Classification{Kind: agentlimit.KindNone, Agent: agentName, Message: msg} + tc.Pool.classify = func(agentName, msg string) agent.LimitClassification { + return agent.LimitClassification{Kind: agent.LimitKindNone, Agent: agentName, Message: msg} } tc.Pool.failOrRetryAgent(testWorkerID, job, "test", "some brand new error wording from a future agent") From a7ac2c19e2c77e3ee8aff7f2475832fded9d4417 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 17:10:24 -0500 Subject: [PATCH 22/24] chore: tell review agents to stop flagging build errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewers keep raising "function X not defined" or "package Y not imported" findings against diffs that build cleanly — the symbol just lives elsewhere in the file or package, which the reviewer cannot see. Add a guideline that explicitly excludes compile-level findings and points at the local toolchain (go build, go vet, golangci-lint, the pre-commit hook) as the source of truth. --- .roborev.toml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.roborev.toml b/.roborev.toml index a33799c1..ce3198a9 100644 --- a/.roborev.toml +++ b/.roborev.toml @@ -78,4 +78,16 @@ findings with summary/front matter in a way that makes verdict detection less reliable, the fix should be to tighten the review prompts/templates and output contract. Do not ask for increasingly broad deterministic heuristics to parse arbitrary narrative text. + +## Compilation, imports, and build errors + +Do not flag suspected compile errors, missing imports, undeclared +identifiers, type mismatches, or other build-level issues. The local +toolchain (go build, go vet, golangci-lint) and the pre-commit hook +catch these before any commit lands; if the diff actually fails to +compile, the PR cannot merge regardless of what the review says. +Reviews see only the diff, not the rest of the package, so claims like +"function X is not defined" or "package Y is not imported" are almost +always wrong — the symbol exists elsewhere in the file or package. +Focus reviews on logic, architecture, and behavior; trust the build. """ From 657052b9956e08df4e316b839acca88212a6a406 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 18:45:34 -0500 Subject: [PATCH 23/24] docs: mark LimitKindSession as a prepared path, not armed for production defaultLimitRules contains only the nine quota substrings inherited from isQuotaError; nothing in production produces LimitKindSession today. Claude's 5-hour cap will fall through to the unmatched-error path until a real session-cap message is captured and a narrow rule is added. Update the LimitKindSession const comment, the defaultLimitRules comment (with a TODO and a note on why speculative substrings are not added), and the CLAUDE.md cooldown reference so the daemon-side documentation reflects what the production classifier actually emits today vs. what the plumbing supports once a rule lands. Addresses the medium review finding about session-limit handling being unreachable in production. --- CLAUDE.md | 2 +- internal/agent/limit.go | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 337a7b1c..18b9409b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -210,7 +210,7 @@ The internal `lookupFieldByTag()` helper resolves these via reflection on the st - **Retries**: Up to 3 retries for transient failures (`db.RetryJob`). Resets status to queued. - **Failover**: After retries exhausted (or on quota errors), switches to backup agent via `db.FailoverJob`. Resets retry_count, sets backup agent/model. -- **Cooldown**: Quota exhaustion errors (classified by `agent.ClassifyLimit` as `LimitKindQuota`/`LimitKindSession`) trigger per-agent cooldown (default 30 min, parsed from the error message via `agent.ParseResetDuration`/`ParseResetTime`). Cooldowns are tracked in-memory with RWMutex. +- **Cooldown**: Quota exhaustion errors (classified by `agent.ClassifyLimit` as `LimitKindQuota`) trigger per-agent cooldown (default 30 min, parsed from the error message via `agent.ParseResetDuration`/`ParseResetTime`). Cooldowns are tracked in-memory with RWMutex. `LimitKindSession` follows the same cooldown path, but no production rule emits it yet — a Claude session-cap rule is pending a captured error message. ### Workflow derivation for failover diff --git a/internal/agent/limit.go b/internal/agent/limit.go index 530876c2..fed68620 100644 --- a/internal/agent/limit.go +++ b/internal/agent/limit.go @@ -12,7 +12,14 @@ const ( LimitKindNone LimitKind = iota // no rate-limit signal recognized LimitKindTransient // 429-style; retry locally, no cooldown LimitKindQuota // hard quota exhaustion (Gemini/Codex today) - LimitKindSession // session-level cap (e.g. Claude 5-hour) + // LimitKindSession is a session-level cap (e.g. Claude 5-hour). + // Plumbing is in place — daemon cooldown/abort and CLI strict abort + // both handle it — but defaultLimitRules does not yet produce it for + // any real agent. ClassifyLimit can return LimitKindSession only via + // an injected classifier (tests). Production support is pending a + // captured Claude session-cap message; see the TODO at + // defaultLimitRules. + LimitKindSession ) // LimitClassification is the result of inspecting an agent error. @@ -41,9 +48,15 @@ type limitRule struct { // internal/daemon/worker.go so detection for Gemini and Codex is // byte-for-byte unchanged. // -// Claude session-cap patterns are intentionally absent — once a real -// session-cap message is captured, a single LimitKindSession rule plus -// a unit-test fixture is sufficient to enable Claude detection. +// TODO: add a LimitKindSession rule for Claude's 5-hour cap once the +// exact error wording is captured from a real session-cap failure. +// Speculative substrings ("usage limit", "limit reached", etc.) are +// not added on purpose — they would also match policy errors, +// transient 429s, and config-validation messages, and a false positive +// would abort roborev fix and cool down the agent when retrying might +// have worked. Until that pattern lands, ClassifyLimit returns +// LimitKindSession only when an injected classifier produces it +// (i.e., in tests). var defaultLimitRules = []limitRule{ {Agents: []string{"*"}, Substring: "resource exhausted", Kind: LimitKindQuota}, {Agents: []string{"*"}, Substring: "quota exceeded", Kind: LimitKindQuota}, From 333420a5add47c2711b3b3a3aaa7f7fb2482de43 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 May 2026 18:54:40 -0500 Subject: [PATCH 24/24] fix: warn on dirty tree at retry-abort and slice lowered string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two findings on commit 657052b: 1. fixJobDirect's retry-path limit abort lost the "Changes were made but not committed" warning that the success path emits. If the first agent call left changes and the commit retry hit a quota or session limit, the user got only a cooldown message and a dirty tree. Print the same warning to out before returning the limit error so the user sees both states. 2. ParseResetDuration and parseResetTimeAt computed indices against strings.ToLower(errMsg) and then sliced errMsg with those indices. A rune like "İ" U+0130 shortens to "i" under ToLower (2 bytes → 1 byte), so the indices can land in the middle of a UTF-8 sequence or extract the wrong substring. Slice the lowercased copy instead. time.ParseDuration and time.Parse work fine with lowercase unit suffixes, so the lowercase format strings are sufficient (drop the redundant uppercase "3:04 PM" / "3:04PM" entries). Add regression tests: - TestParseResetDurationToLowerByteSafety with "İ reset after 30m" - TestParseResetTimeToLowerByteSafety with "İ resets at 5:42 PM" - TestFixJobDirect_RetryClassifiesAgentLimit captures fixJobDirect's output via bytes.Buffer and asserts the dirty-tree warning fires alongside the agentLimitError propagation. --- cmd/roborev/fix.go | 10 ++++++++++ cmd/roborev/fix_test.go | 10 ++++++++++ internal/agent/limit_parse.go | 19 +++++++++++++------ internal/agent/limit_parse_test.go | 19 +++++++++++++++++++ 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/cmd/roborev/fix.go b/cmd/roborev/fix.go index 3a167f29..6739d9f0 100644 --- a/cmd/roborev/fix.go +++ b/cmd/roborev/fix.go @@ -409,6 +409,16 @@ func fixJobDirect(ctx context.Context, params fixJobParams, prompt string) (*fix } cls := classify(agent.CanonicalName(retryAgent.Name()), retryErr.Error()) if cls.Kind == agent.LimitKindQuota || cls.Kind == agent.LimitKindSession { + // The first agent call left uncommitted changes; the + // retry that would have committed them was aborted by + // the limit. Surface the dirty-tree state so the user + // can decide whether to commit manually before the + // cooldown expires — the success path emits the same + // warning, and bare cooldown text would otherwise hide + // the regression. + if hasChanges, _ := git.HasUncommittedChanges(params.RepoRoot); hasChanges { + fmt.Fprintln(out, "Warning: Changes were made but not committed. Please review and commit manually.") + } return nil, &agentLimitError{Classification: cls} } fmt.Fprintf(out, "Warning: commit agent failed: %v\n", retryErr) diff --git a/cmd/roborev/fix_test.go b/cmd/roborev/fix_test.go index f2c16cab..72e171fc 100644 --- a/cmd/roborev/fix_test.go +++ b/cmd/roborev/fix_test.go @@ -3915,9 +3915,11 @@ func TestFixJobDirect_RetryClassifiesAgentLimit(t *testing.T) { } } + var buf bytes.Buffer _, err := fixJobDirect(context.Background(), fixJobParams{ RepoRoot: dir, Agent: ag, + Output: &buf, Classify: classify, }, "fix things") require.Error(t, err, "expected agentLimitError from retry path") @@ -3931,4 +3933,12 @@ func TestFixJobDirect_RetryClassifiesAgentLimit(t *testing.T) { assert.Equal(1, classifyCalls, "classifier should run once on the retry error") assert.Contains(err.Error(), "claude-code") assert.Contains(err.Error(), "session limit") + // The first call left a dirty tree; the abort must still surface + // the uncommitted-changes warning so the user knows their tree + // needs attention before re-running after cooldown. + assert.Contains( + buf.String(), + "Changes were made but not committed", + "limit-abort retry path should still warn about uncommitted changes", + ) } diff --git a/internal/agent/limit_parse.go b/internal/agent/limit_parse.go index 025a5931..c7c460c7 100644 --- a/internal/agent/limit_parse.go +++ b/internal/agent/limit_parse.go @@ -19,11 +19,15 @@ const ( // values to [minCooldown, maxCooldown]. func ParseResetDuration(errMsg string) time.Duration { lower := strings.ToLower(errMsg) - idx := strings.Index(lower, "reset after ") - if idx < 0 { + _, after, ok := strings.Cut(lower, "reset after ") + if !ok { return 0 } - rest := errMsg[idx+len("reset after "):] + // Slice lower (not errMsg): a few runes (e.g. "İ" U+0130 → "i") + // shorten under strings.ToLower, so byte indices computed against + // lower can land in the middle of a UTF-8 sequence in errMsg. + // time.ParseDuration accepts the lowercase unit suffixes anyway. + rest := after token := rest if sp := strings.IndexAny(rest, " \t\n,;)"); sp > 0 { token = rest[:sp] @@ -69,7 +73,10 @@ func parseResetTimeAt(errMsg string, now time.Time) time.Time { default: return time.Time{} } - rest := errMsg[idx:] + // Slice lower (not errMsg) so a rune that shortens under ToLower + // can't shift byte offsets out of alignment with errMsg. The + // token feeds time.Parse with lowercase formats below. + rest := lower[idx:] end := len(rest) for i, r := range rest { // Stop at sentence-ending punctuation or newline. @@ -81,8 +88,8 @@ func parseResetTimeAt(errMsg string, now time.Time) time.Time { token := strings.TrimSpace(rest[:end]) formats := []string{ - "3:04 PM", "3:04 pm", - "3:04PM", "3:04pm", + "3:04 pm", + "3:04pm", "15:04", } for _, f := range formats { diff --git a/internal/agent/limit_parse_test.go b/internal/agent/limit_parse_test.go index a467cfad..45656f63 100644 --- a/internal/agent/limit_parse_test.go +++ b/internal/agent/limit_parse_test.go @@ -109,6 +109,25 @@ func TestParseResetTimeRespectsLocation(t *testing.T) { assert.Equal(t, est, got.Location()) } +// TestParseResetDurationToLowerByteSafety guards the ToLower +// byte-index pitfall: U+0130 "İ" lowercases to "i" — a 2→1 byte +// change. Slicing the original errMsg with indices computed against +// strings.ToLower(errMsg) would extract the wrong bytes (or panic +// inside a multi-byte sequence). Slicing the lowercased copy is +// safe and parses cleanly. +func TestParseResetDurationToLowerByteSafety(t *testing.T) { + got := ParseResetDuration("İ reset after 30m") + assert.Equal(t, 30*time.Minute, got) +} + +func TestParseResetTimeToLowerByteSafety(t *testing.T) { + loc := time.FixedZone("test", 0) + now := time.Date(2026, 5, 5, 12, 0, 0, 0, loc) // noon + got := parseResetTimeAt("İ resets at 5:42 PM", now) + want := time.Date(2026, 5, 5, 17, 42, 0, 0, loc) + assert.Equal(t, want, got) +} + // TestParseResetTimeAcrossDST exercises the rollover code path through // real DST transitions where Add(24*time.Hour) would land an hour off. // time.FixedZone has no DST rules, so it cannot exercise this branch —