diff --git a/.roborev.toml b/.roborev.toml index a33799c1..ce3198a9 100644 --- a/.roborev.toml +++ b/.roborev.toml @@ -78,4 +78,16 @@ findings with summary/front matter in a way that makes verdict detection less reliable, the fix should be to tighten the review prompts/templates and output contract. Do not ask for increasingly broad deterministic heuristics to parse arbitrary narrative text. + +## Compilation, imports, and build errors + +Do not flag suspected compile errors, missing imports, undeclared +identifiers, type mismatches, or other build-level issues. The local +toolchain (go build, go vet, golangci-lint) and the pre-commit hook +catch these before any commit lands; if the diff actually fails to +compile, the PR cannot merge regardless of what the review says. +Reviews see only the diff, not the rest of the package, so claims like +"function X is not defined" or "package Y is not imported" are almost +always wrong — the symbol exists elsewhere in the file or package. +Focus reviews on logic, architecture, and behavior; trust the build. """ diff --git a/CLAUDE.md b/CLAUDE.md index 1b362706..18b9409b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -210,7 +210,7 @@ The internal `lookupFieldByTag()` helper resolves these via reflection on the st - **Retries**: Up to 3 retries for transient failures (`db.RetryJob`). Resets status to queued. - **Failover**: After retries exhausted (or on quota errors), switches to backup agent via `db.FailoverJob`. Resets retry_count, sets backup agent/model. -- **Cooldown**: Quota exhaustion errors (`isQuotaError`) trigger per-agent cooldown (default 30 min, parsed from error message). Cooldowns are tracked in-memory with RWMutex. +- **Cooldown**: Quota exhaustion errors (classified by `agent.ClassifyLimit` as `LimitKindQuota`) trigger per-agent cooldown (default 30 min, parsed from the error message via `agent.ParseResetDuration`/`ParseResetTime`). Cooldowns are tracked in-memory with RWMutex. 
`LimitKindSession` follows the same cooldown path, but no production rule emits it yet — a Claude session-cap rule is pending a captured error message. ### Workflow derivation for failover diff --git a/cmd/roborev/fix.go b/cmd/roborev/fix.go index 05bc868d..6739d9f0 100644 --- a/cmd/roborev/fix.go +++ b/cmd/roborev/fix.go @@ -141,6 +141,7 @@ Examples: minSeverity: minSeverity, quiet: quiet, resume: resume, + classify: agent.ClassifyLimit, } roots, err := resolveCurrentRepoRoots() @@ -228,6 +229,81 @@ type fixOptions struct { minSeverity string quiet bool resume bool + + // classify is the rate-limit classifier. Defaults to + // agent.ClassifyLimit in the production cobra command's RunE; tests + // inject a stub to drive deterministic KindQuota / KindSession + // outcomes without depending on real agent error wording. + classify agent.LimitClassifier +} + +// agentLimitError is returned by the fix loop when the configured agent +// hits a quota or session limit. The fix command surfaces it as the +// process exit error so users see the reset time and a hint to retry. +type agentLimitError struct { + Classification agent.LimitClassification +} + +func (e *agentLimitError) Error() string { + return formatAgentLimitMessage(e.Classification, time.Now()) +} + +// formatAgentLimitMessage builds the user-facing abort message. Pulled +// out so tests can assert against it without depending on time.Now. +// The label ("quota" / "session limit" / "rate limit") is derived from +// cls.Kind so a Gemini/Codex KindQuota abort doesn't mis-report itself +// as a session-cap. +func formatAgentLimitMessage(cls agent.LimitClassification, now time.Time) string { + label := agentLimitLabel(cls.Kind) + var dur time.Duration + switch { + case !cls.ResetAt.IsZero(): + dur = cls.ResetAt.Sub(now) + case cls.CooldownFor > 0: + dur = cls.CooldownFor + } + switch { + case dur > 0 && !cls.ResetAt.IsZero(): + return fmt.Sprintf( + "agent %s hit a %s. Cooldown until %s (in %s). 
"+ + "Re-run after that, or pass --agent to switch.", + cls.Agent, + label, + cls.ResetAt.Format("3:04 PM"), + dur.Round(time.Minute), + ) + case dur > 0: + return fmt.Sprintf( + "agent %s hit a %s. Cooldown for ~%s. "+ + "Re-run after that, or pass --agent to switch.", + cls.Agent, + label, + dur.Round(time.Minute), + ) + default: + flat := strings.ReplaceAll(cls.Message, "\n", " ") + return fmt.Sprintf( + "agent %s hit a %s (unknown reset time). "+ + "Re-run later, or pass --agent to switch. "+ + "Original error: %s", + cls.Agent, + label, + truncateString(flat, 200), + ) + } +} + +func agentLimitLabel(k agent.LimitKind) string { + switch k { + case agent.LimitKindSession: + return "session limit" + case agent.LimitKindQuota: + return "quota limit" + case agent.LimitKindTransient: + return "rate limit" + default: + return "rate limit" + } } // fixJobParams configures a fixJobDirect operation. @@ -235,6 +311,9 @@ type fixJobParams struct { RepoRoot string Agent agent.Agent Output io.Writer // agent streaming output (nil = discard) + // Classify is the rate-limit classifier used for the commit-retry + // path. nil defaults to agent.ClassifyLimit. Tests inject a stub. + Classify agent.LimitClassifier } // fixJobResult contains the outcome of a fix operation. @@ -320,6 +399,28 @@ func fixJobDirect(ctx context.Context, params fixJobParams, prompt string) (*fix } } if _, retryErr := retryAgent.Review(ctx, params.RepoRoot, "HEAD", buildGenericCommitPrompt(), out); retryErr != nil { + // Classify the retry error so quota/session limits abort + // instead of being demoted to a warning — otherwise the fix + // loop keeps invoking the exhausted agent on every following + // job until the queue is empty. 
+ classify := params.Classify + if classify == nil { + classify = agent.ClassifyLimit + } + cls := classify(agent.CanonicalName(retryAgent.Name()), retryErr.Error()) + if cls.Kind == agent.LimitKindQuota || cls.Kind == agent.LimitKindSession { + // The first agent call left uncommitted changes; the + // retry that would have committed them was aborted by + // the limit. Surface the dirty-tree state so the user + // can decide whether to commit manually before the + // cooldown expires — the success path emits the same + // warning, and bare cooldown text would otherwise hide + // the regression. + if hasChanges, _ := git.HasUncommittedChanges(params.RepoRoot); hasChanges { + fmt.Fprintln(out, "Warning: Changes were made but not committed. Please review and commit manually.") + } + return nil, &agentLimitError{Classification: cls} + } fmt.Fprintf(out, "Warning: commit agent failed: %v\n", retryErr) } if sha, ok := detectNewCommit(params.RepoRoot, headBefore); ok { @@ -421,6 +522,14 @@ func runFixWithSeen(cmd *cobra.Command, jobIDs []int64, opts fixOptions, seen ma if isConnectionError(err) { return fmt.Errorf("daemon connection lost: %w", err) } + // Agent quota/session-limit aborts must propagate even in + // discovery mode — otherwise the re-query loop keeps + // invoking the exhausted agent until every queued job is + // burned through with the same error. + var lim *agentLimitError + if errors.As(err, &lim) { + return err + } // In discovery mode (seen != nil), log a warning and // continue best-effort. For explicit job IDs (seen == // nil), return the error so the CLI exits non-zero. 
@@ -828,6 +937,9 @@ func jobVerdict(job *storage.ReviewJob, review *storage.Review) string { } func fixSingleJob(cmd *cobra.Command, repoRoot string, jobID int64, opts fixOptions, tracker *fixSessionTracker) error { + if opts.classify == nil { + opts.classify = agent.ClassifyLimit + } ctx := cmd.Context() if ctx == nil { ctx = context.Background() @@ -927,6 +1039,7 @@ func fixSingleJob(cmd *cobra.Command, repoRoot string, jobID int64, opts fixOpti RepoRoot: repoRoot, Agent: currentAgent, Output: capture, + Classify: opts.classify, }, buildGenericFixPrompt(review.Output, minSev, comments)) // Flush capture FIRST so session extraction completes before reading SessionID. capture.Flush() @@ -935,6 +1048,27 @@ func fixSingleJob(cmd *cobra.Command, repoRoot string, jobID int64, opts fixOpti } if err != nil { tracker.Reset() + // fixJobDirect already returns *agentLimitError for retry-path + // quota/session aborts; preserve it instead of re-classifying + // its user-facing message string. + var lim *agentLimitError + if errors.As(err, &lim) { + return err + } + cls := opts.classify(agent.CanonicalName(currentAgent.Name()), err.Error()) + switch cls.Kind { + case agent.LimitKindQuota, agent.LimitKindSession: + return &agentLimitError{Classification: cls} + case agent.LimitKindNone: + if err.Error() != "" && !opts.quiet { + flat := strings.ReplaceAll(err.Error(), "\n", " ") + cmd.PrintErrf( + "warning: unclassified agent error from %s: %s\n", + currentAgent.Name(), + truncateString(flat, 200), + ) + } + } return err } tracker.Capture(capture.SessionID()) @@ -998,6 +1132,9 @@ type batchEntry struct { // runFixBatch discovers jobs (or uses provided IDs), splits them into batches // respecting max prompt size, and runs each batch as a single agent invocation. 
func runFixBatch(cmd *cobra.Command, jobIDs []int64, branch string, allBranches, explicitBranch, newestFirst bool, batchSize int, opts fixOptions, tracker *fixSessionTracker) error { + if opts.classify == nil { + opts.classify = agent.ClassifyLimit + } if err := ensureDaemon(); err != nil { return err } @@ -1171,6 +1308,7 @@ func runFixBatch(cmd *cobra.Command, jobIDs []int64, branch string, allBranches, RepoRoot: roots.worktreeRoot, Agent: currentAgent, Output: capture, + Classify: opts.classify, }, prompt) // Flush capture FIRST so session extraction completes before reading SessionID. capture.Flush() @@ -1179,6 +1317,26 @@ func runFixBatch(cmd *cobra.Command, jobIDs []int64, branch string, allBranches, } if err != nil { tracker.Reset() + // Preserve a retry-path agentLimitError without + // re-classifying its user-facing message string. + var lim *agentLimitError + if errors.As(err, &lim) { + return err + } + cls := opts.classify(agent.CanonicalName(currentAgent.Name()), err.Error()) + switch cls.Kind { + case agent.LimitKindQuota, agent.LimitKindSession: + return &agentLimitError{Classification: cls} + case agent.LimitKindNone: + if err.Error() != "" && !opts.quiet { + flat := strings.ReplaceAll(err.Error(), "\n", " ") + cmd.PrintErrf( + "warning: unclassified agent error from %s: %s\n", + currentAgent.Name(), + truncateString(flat, 200), + ) + } + } cmd.Printf("Warning: error in batch %d: %v\n", i+1, err) continue } diff --git a/cmd/roborev/fix_test.go b/cmd/roborev/fix_test.go index 7a3c918f..72e171fc 100644 --- a/cmd/roborev/fix_test.go +++ b/cmd/roborev/fix_test.go @@ -3601,3 +3601,344 @@ func TestFixCmd_BatchSizeAcceptsBranchFlags(t *testing.T) { }) } } + +func TestFixSingleJobAbortsOnSessionLimit(t *testing.T) { + repo := createTestRepo(t, map[string]string{"main.go": "package main\n"}) + + originalAgent, err := agent.Get("test") + require.NoError(t, err, "get test agent") + agent.Register(&agent.FakeAgent{ + NameStr: "test", + ReviewFn: func(_ 
context.Context, _, _, _ string, _ io.Writer) (string, error) { + return "", errors.New("simulated claude session cap") + }, + }) + t.Cleanup(func() { agent.Register(originalAgent) }) + + ts, _ := newMockServer(t, MockServerOpts{ + ReviewOutput: "## Issues\n- Found issue", + OnJobs: func(w http.ResponseWriter, r *http.Request) { + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{{ + ID: 99, + Status: storage.JobStatusDone, + Agent: "test", + }}, + }) + }, + }) + patchServerAddr(t, ts.URL) + + cmd, _ := newTestCmd(t) + + resetAt := time.Now().Add(2*time.Hour + 13*time.Minute) + classifyCalls := 0 + opts := fixOptions{ + agentName: "test", + reasoning: "fast", + classify: func(_, msg string) agent.LimitClassification { + classifyCalls++ + return agent.LimitClassification{ + Kind: agent.LimitKindSession, + Agent: "claude-code", + ResetAt: resetAt, + Message: msg, + } + }, + } + + base, err := resolveFixAgent(repo.Dir, opts) + require.NoError(t, err, "resolveFixAgent") + tracker := &fixSessionTracker{base: base, out: io.Discard} + err = fixSingleJob(cmd, repo.Dir, 99, opts, tracker) + require.Error(t, err, "expected abort error, got nil") + + var lim *agentLimitError + require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) + + assert := assert.New(t) + assert.Equal(agent.LimitKindSession, lim.Classification.Kind) + assert.Equal(1, classifyCalls, "classifier should be called exactly once") + assert.Contains(err.Error(), "claude-code") + assert.Contains(err.Error(), "session limit") +} + +func TestRunFixBatchAbortsOnQuotaError(t *testing.T) { + repo := createTestRepo(t, map[string]string{"main.go": "package main\n"}) + + originalAgent, err := agent.Get("test") + require.NoError(t, err, "get test agent") + agentCalls := 0 + agent.Register(&agent.FakeAgent{ + NameStr: "test", + ReviewFn: func(_ context.Context, _, _, _ string, _ io.Writer) (string, error) { + agentCalls++ + return "", errors.New("agent failed: you have exhausted your 
capacity") + }, + }) + t.Cleanup(func() { agent.Register(originalAgent) }) + + var mu sync.Mutex + var reviewedJobIDs []int64 + _ = newMockDaemonBuilder(t). + WithHandler("/api/jobs", func(w http.ResponseWriter, r *http.Request) { + id := r.URL.Query().Get("id") + switch id { + case "10", "20": + idNum := int64(10) + if id == "20" { + idNum = 20 + } + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{{ + ID: idNum, + Status: storage.JobStatusDone, + Agent: "test", + }}, + }) + default: + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{}, + "has_more": false, + }) + } + }). + WithHandler("/api/review", func(w http.ResponseWriter, r *http.Request) { + jobID := r.URL.Query().Get("job_id") + mu.Lock() + switch jobID { + case "10": + reviewedJobIDs = append(reviewedJobIDs, 10) + case "20": + reviewedJobIDs = append(reviewedJobIDs, 20) + } + mu.Unlock() + writeJSON(w, storage.Review{ + JobID: 10, + Output: "## Issues\n- Bug", + }) + }). + WithHandler("/api/enqueue", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + }). + Build() + + classifyCalls := 0 + opts := fixOptions{ + agentName: "test", + reasoning: "fast", + classify: func(_, msg string) agent.LimitClassification { + classifyCalls++ + return agent.LimitClassification{ + Kind: agent.LimitKindQuota, + Agent: "gemini", + CooldownFor: 30 * time.Minute, + Message: msg, + } + }, + } + + base, err := resolveFixAgent(repo.Dir, opts) + require.NoError(t, err, "resolveFixAgent") + + _, err = runWithOutput(t, repo.Dir, func(cmd *cobra.Command) error { + // batchSize=1 forces one job per agent call. With batchSize=0, + // runFixBatch may pack both jobs into a single batch, making + // agentCalls=1 ambiguous between "aborted before second batch" + // and "both jobs in one batch". batchSize=1 is unambiguous. 
+ return runFixBatch( + cmd, + []int64{10, 20}, + "", + false, false, false, + 1, + opts, + &fixSessionTracker{base: base, out: io.Discard}, + ) + }) + require.Error(t, err, "expected batch to abort with agentLimitError") + + var lim *agentLimitError + require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) + + assert := assert.New(t) + assert.Equal(agent.LimitKindQuota, lim.Classification.Kind) + assert.Equal(1, classifyCalls, "classifier should be called exactly once") + // agentCalls is the load-bearing assertion: with batchSize=1 and two + // jobs, two agent invocations would mean the loop continued past the + // abort. Exactly one means the abort short-circuits the second batch. + assert.Equal(1, agentCalls, "fix agent should be invoked exactly once before abort") + assert.Contains(err.Error(), "quota limit", "Gemini KindQuota label should be 'quota limit', not 'session limit'") + + // Note: do NOT assert `reviewedJobIDs` does not contain job 20 — + // runFixBatch prefetches every job's review at the start of the + // function (before any agent call), so /api/review fires for all + // queued jobs regardless of when the loop aborts. The agentCalls + // counter above is the correct measure of abort behavior. +} + +func TestRunFixWithSeenDiscoveryAbortsOnAgentLimit(t *testing.T) { + // Discovery mode (seen != nil) normally warns-and-continues on + // per-job errors, but agent quota/session-limit aborts must + // propagate so the re-query loop doesn't keep invoking the + // exhausted agent. 
+ repo := createTestRepo(t, map[string]string{"main.go": "package main\n"}) + + originalAgent, err := agent.Get("test") + require.NoError(t, err, "get test agent") + agentCalls := 0 + agent.Register(&agent.FakeAgent{ + NameStr: "test", + ReviewFn: func(_ context.Context, _, _, _ string, _ io.Writer) (string, error) { + agentCalls++ + return "", errors.New("agent failed: you have exhausted your capacity") + }, + }) + t.Cleanup(func() { agent.Register(originalAgent) }) + + _ = newMockDaemonBuilder(t). + WithHandler("/api/jobs", func(w http.ResponseWriter, r *http.Request) { + id := r.URL.Query().Get("id") + var idNum int64 + switch id { + case "10": + idNum = 10 + case "20": + idNum = 20 + default: + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{}, + "has_more": false, + }) + return + } + writeJSON(w, map[string]any{ + "jobs": []storage.ReviewJob{{ + ID: idNum, + Status: storage.JobStatusDone, + Agent: "test", + }}, + }) + }). + WithHandler("/api/review", func(w http.ResponseWriter, r *http.Request) { + writeJSON(w, storage.Review{Output: "## Issues\n- Bug"}) + }). + WithHandler("/api/enqueue", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + }). 
+ Build() + + classifyCalls := 0 + opts := fixOptions{ + agentName: "test", + reasoning: "fast", + classify: func(_, msg string) agent.LimitClassification { + classifyCalls++ + return agent.LimitClassification{ + Kind: agent.LimitKindQuota, + Agent: "gemini", + CooldownFor: 30 * time.Minute, + Message: msg, + } + }, + } + + seen := make(map[int64]bool) + _, err = runWithOutput(t, repo.Dir, func(cmd *cobra.Command) error { + base, baseErr := resolveFixAgent(repo.Dir, opts) + require.NoError(t, baseErr, "resolveFixAgent") + tracker := &fixSessionTracker{base: base, out: io.Discard} + return runFixWithSeen(cmd, []int64{10, 20}, opts, seen, tracker) + }) + require.Error(t, err, "expected discovery mode to propagate agentLimitError") + + var lim *agentLimitError + require.ErrorAs(t, err, &lim, "expected agentLimitError, got %T: %v", err, err) + + assert := assert.New(t) + assert.Equal(agent.LimitKindQuota, lim.Classification.Kind) + assert.Equal(1, classifyCalls, "classifier should be called exactly once") + // Load-bearing: only one agent invocation means the abort short- + // circuited the loop. Two would mean discovery mode demoted the + // agentLimitError to a warning and continued. + assert.Equal(1, agentCalls, "fix agent should be invoked exactly once before abort") + assert.False(seen[20], "second job must not be marked seen after abort") +} + +func TestFixJobDirect_RetryClassifiesAgentLimit(t *testing.T) { + // fixJobDirect runs the agent twice when the first call leaves + // uncommitted changes: once for the fix, once with commit + // instructions. A quota or session-limit error on that second + // call must produce *agentLimitError, not a swallowed warning. 
+ if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available") + } + + dir := t.TempDir() + for _, args := range [][]string{ + {"init"}, + {"config", "user.email", "test@test.com"}, + {"config", "user.name", "Test"}, + {"commit", "--allow-empty", "-m", "base"}, + } { + c := exec.Command("git", args...) + c.Dir = dir + require.NoError(t, c.Run(), "git %v", args) + } + + calls := 0 + ag := &agent.FakeAgent{ + NameStr: "test", + ReviewFn: func(_ context.Context, repoPath, _, _ string, _ io.Writer) (string, error) { + calls++ + if calls == 1 { + // First call: leave an uncommitted change so the retry + // path is reached. + return "applied fix", os.WriteFile( + filepath.Join(repoPath, "dirty.txt"), + []byte("uncommitted"), + 0o644, + ) + } + // Second call (commit retry): hit a session limit. + return "", errors.New("simulated claude session cap") + }, + } + + classifyCalls := 0 + classify := func(_, msg string) agent.LimitClassification { + classifyCalls++ + return agent.LimitClassification{ + Kind: agent.LimitKindSession, + Agent: "claude-code", + Message: msg, + } + } + + var buf bytes.Buffer + _, err := fixJobDirect(context.Background(), fixJobParams{ + RepoRoot: dir, + Agent: ag, + Output: &buf, + Classify: classify, + }, "fix things") + require.Error(t, err, "expected agentLimitError from retry path") + + var lim *agentLimitError + require.ErrorAs(t, err, &lim, "expected *agentLimitError, got %T: %v", err, err) + + assert := assert.New(t) + assert.Equal(agent.LimitKindSession, lim.Classification.Kind) + assert.Equal(2, calls, "agent must be called twice (fix + retry) before abort") + assert.Equal(1, classifyCalls, "classifier should run once on the retry error") + assert.Contains(err.Error(), "claude-code") + assert.Contains(err.Error(), "session limit") + // The first call left a dirty tree; the abort must still surface + // the uncommitted-changes warning so the user knows their tree + // needs attention before re-running after cooldown. 
+ assert.Contains( + buf.String(), + "Changes were made but not committed", + "limit-abort retry path should still warn about uncommitted changes", + ) +} diff --git a/internal/agent/limit.go b/internal/agent/limit.go new file mode 100644 index 00000000..fed68620 --- /dev/null +++ b/internal/agent/limit.go @@ -0,0 +1,116 @@ +package agent + +import ( + "strings" + "time" +) + +// LimitKind labels a classified agent error. +type LimitKind int + +const ( + LimitKindNone LimitKind = iota // no rate-limit signal recognized + LimitKindTransient // 429-style; retry locally, no cooldown + LimitKindQuota // hard quota exhaustion (Gemini/Codex today) + // LimitKindSession is a session-level cap (e.g. Claude 5-hour). + // Plumbing is in place — daemon cooldown/abort and CLI strict abort + // both handle it — but defaultLimitRules does not yet produce it for + // any real agent. ClassifyLimit can return LimitKindSession only via + // an injected classifier (tests). Production support is pending a + // captured Claude session-cap message; see the TODO at + // defaultLimitRules. + LimitKindSession +) + +// LimitClassification is the result of inspecting an agent error. +type LimitClassification struct { + Kind LimitKind + Agent string // canonical agent name (caller resolves aliases) + ResetAt time.Time // zero if not parseable from the message + CooldownFor time.Duration // zero if not parseable; caller applies its own fallback + Message string // raw error text (for logs / user display) +} + +// LimitClassifier is the function shape used by callers that want to inject +// a stub in tests. +type LimitClassifier func(agent, errMsg string) LimitClassification + +// limitRule is one substring → kind mapping. The Agents slice restricts +// the rule to specific canonical agent names; "*" applies to any agent. 
+type limitRule struct { + Agents []string // canonical agent names; "*" = any + Substring string // case-insensitive substring match on the error message + Kind LimitKind +} + +// defaultLimitRules is the production rule table. The nine quota +// substrings are copied from the original isQuotaError set in +// internal/daemon/worker.go so detection for Gemini and Codex is +// byte-for-byte unchanged. +// +// TODO: add a LimitKindSession rule for Claude's 5-hour cap once the +// exact error wording is captured from a real session-cap failure. +// Speculative substrings ("usage limit", "limit reached", etc.) are +// not added on purpose — they would also match policy errors, +// transient 429s, and config-validation messages, and a false positive +// would abort roborev fix and cool down the agent when retrying might +// have worked. Until that pattern lands, ClassifyLimit returns +// LimitKindSession only when an injected classifier produces it +// (i.e., in tests). +var defaultLimitRules = []limitRule{ + {Agents: []string{"*"}, Substring: "resource exhausted", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "quota exceeded", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "quota_exceeded", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "quota exhausted", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "quota_exhausted", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "insufficient_quota", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "exhausted your capacity", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "capacity exhausted", Kind: LimitKindQuota}, + {Agents: []string{"*"}, Substring: "capacity_exhausted", Kind: LimitKindQuota}, +} + +// ClassifyLimit inspects an agent error message and returns a +// LimitClassification describing whether (and how) the agent is +// rate-limited. 
The agent argument is the canonical agent name; the +// caller is responsible for resolving any aliases (e.g. "claude" → +// "claude-code") before calling. +// +// Returns Kind == LimitKindNone when no rule matches. +func ClassifyLimit(agent, errMsg string) LimitClassification { + return classifyLimitWithRules(agent, errMsg, defaultLimitRules) +} + +// classifyLimitWithRules is ClassifyLimit with an explicit rule slice. +// Unexported; used inside the package's own tests so synthetic fixtures +// (e.g. a LimitKindSession pattern) do not leak into defaultLimitRules. +func classifyLimitWithRules(agent, errMsg string, rules []limitRule) LimitClassification { + if errMsg == "" { + return LimitClassification{Kind: LimitKindNone, Agent: agent, Message: errMsg} + } + lower := strings.ToLower(errMsg) + for _, r := range rules { + if !limitRuleAppliesToAgent(r, agent) { + continue + } + if !strings.Contains(lower, r.Substring) { + continue + } + return LimitClassification{ + Kind: r.Kind, + Agent: agent, + ResetAt: ParseResetTime(errMsg), + CooldownFor: ParseResetDuration(errMsg), + Message: errMsg, + } + } + return LimitClassification{Kind: LimitKindNone, Agent: agent, Message: errMsg} +} + +func limitRuleAppliesToAgent(r limitRule, agent string) bool { + for _, a := range r.Agents { + if a == "*" || a == agent { + return true + } + } + return false +} diff --git a/internal/agent/limit_parse.go b/internal/agent/limit_parse.go new file mode 100644 index 00000000..c7c460c7 --- /dev/null +++ b/internal/agent/limit_parse.go @@ -0,0 +1,116 @@ +package agent + +import ( + "strings" + "time" +) + +// minCooldown / maxCooldown clamp parsed durations to a sane range so a +// pathological message ("reset after 1ms" or "reset after 100h") does not +// produce a useless cooldown. +const ( + minCooldown = 1 * time.Minute + maxCooldown = 24 * time.Hour +) + +// ParseResetDuration extracts a Go-format duration from a "reset after +// " substring in errMsg (case-insensitive). 
Returns 0 if no such +// substring is present or the duration is unparseable. Clamps positive +// values to [minCooldown, maxCooldown]. +func ParseResetDuration(errMsg string) time.Duration { + lower := strings.ToLower(errMsg) + _, after, ok := strings.Cut(lower, "reset after ") + if !ok { + return 0 + } + // Slice lower (not errMsg): a few runes (e.g. "İ" U+0130 → "i") + // shorten under strings.ToLower, so byte indices computed against + // lower can land in the middle of a UTF-8 sequence in errMsg. + // time.ParseDuration accepts the lowercase unit suffixes anyway. + rest := after + token := rest + if sp := strings.IndexAny(rest, " \t\n,;)"); sp > 0 { + token = rest[:sp] + } + token = strings.TrimRight(token, ".,;:)]}") + d, err := time.ParseDuration(token) + if err != nil || d <= 0 { + return 0 + } + if d < minCooldown { + return minCooldown + } + if d > maxCooldown { + return maxCooldown + } + return d +} + +// ParseResetTime extracts an absolute reset time from messages like +// "resets at 5:42 PM" or "try again at 17:42". Interprets the parsed +// clock time in the local timezone. Returns the zero time.Time if no +// recognized phrase is present or the time is unparseable. +// +// If the parsed clock time is at or before now-on-the-same-day, the +// returned time rolls forward to the same wall-clock time on the next +// local calendar day so callers that compute "time until reset" never +// get a negative duration. Rollover is DST-safe via time.Date day +// arithmetic; on a 23-hour or 25-hour day, Go normalizes the offset +// so the returned wall-clock time matches the user's local clock. +func ParseResetTime(errMsg string) time.Time { + return parseResetTimeAt(errMsg, time.Now()) +} + +// parseResetTimeAt is ParseResetTime with an injectable clock for tests. 
+func parseResetTimeAt(errMsg string, now time.Time) time.Time { + lower := strings.ToLower(errMsg) + var idx int + switch { + case strings.Contains(lower, "resets at "): + idx = strings.Index(lower, "resets at ") + len("resets at ") + case strings.Contains(lower, "try again at "): + idx = strings.Index(lower, "try again at ") + len("try again at ") + default: + return time.Time{} + } + // Slice lower (not errMsg) so a rune that shortens under ToLower + // can't shift byte offsets out of alignment with errMsg. The + // token feeds time.Parse with lowercase formats below. + rest := lower[idx:] + end := len(rest) + for i, r := range rest { + // Stop at sentence-ending punctuation or newline. + if r == '.' || r == ',' || r == ';' || r == '\n' || r == ')' { + end = i + break + } + } + token := strings.TrimSpace(rest[:end]) + + formats := []string{ + "3:04 pm", + "3:04pm", + "15:04", + } + for _, f := range formats { + if t, err := time.Parse(f, token); err == nil { + candidate := time.Date( + now.Year(), now.Month(), now.Day(), + t.Hour(), t.Minute(), 0, 0, now.Location(), + ) + // Roll forward when the parsed wall-clock time is at-or-before + // now. Using time.Date with Day()+1 (instead of Add(24h)) is + // DST-safe: Go normalizes lost/gained hours so the returned + // time always means "same wall-clock time tomorrow" in + // now.Location(). 
+ if !candidate.After(now) { + candidate = time.Date( + now.Year(), now.Month(), now.Day()+1, + t.Hour(), t.Minute(), 0, 0, now.Location(), + ) + } + return candidate + } + } + return time.Time{} +} diff --git a/internal/agent/limit_parse_test.go b/internal/agent/limit_parse_test.go new file mode 100644 index 00000000..45656f63 --- /dev/null +++ b/internal/agent/limit_parse_test.go @@ -0,0 +1,175 @@ +package agent + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestParseResetDuration(t *testing.T) { + cases := []struct { + name string + msg string + want time.Duration + }{ + {"none", "agent failed", 0}, + {"reset after seconds clamped to min", "quota will reset after 5s.", 1 * time.Minute}, + {"reset after minutes", "quota will reset after 48m20s.", 48*time.Minute + 20*time.Second}, + {"reset after hours", "reset after 2h13m.", 2*time.Hour + 13*time.Minute}, + {"reset after with trailing punct", "reset after 1h.. retrying", 1 * time.Hour}, + {"reset after invalid", "reset after notaduration", 0}, + {"reset after negative", "reset after -5m", 0}, + {"reset after huge clamped to max", "reset after 100h", 24 * time.Hour}, + {"case insensitive", "RESET AFTER 30m", 30 * time.Minute}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, ParseResetDuration(tc.msg)) + }) + } +} + +func TestParseResetTime(t *testing.T) { + // Use a fixed "now" so same-day vs next-day rollover is deterministic. 
+ loc := time.FixedZone("test", 0) + now := time.Date(2026, 5, 5, 12, 0, 0, 0, loc) // noon UTC + cases := []struct { + name string + msg string + wantErr bool + want time.Time + }{ + {"none", "agent failed", true, time.Time{}}, + { + name: "resets at later today", + msg: "limit resets at 5:42 PM", + want: time.Date(2026, 5, 5, 17, 42, 0, 0, loc), + }, + { + name: "try again at later today 24h", + msg: "try again at 17:42", + want: time.Date(2026, 5, 5, 17, 42, 0, 0, loc), + }, + { + name: "resets at earlier today rolls to next day", + msg: "limit resets at 9:00 AM", + want: time.Date(2026, 5, 6, 9, 0, 0, 0, loc), + }, + { + name: "case insensitive", + msg: "LIMIT RESETS AT 6:00 pm", + want: time.Date(2026, 5, 5, 18, 0, 0, 0, loc), + }, + {"unparseable token", "resets at moonrise", true, time.Time{}}, + { + // Trailing content after the time defeats time.Parse and + // produces zero. Documenting current conservative behavior + // — extending the parser to handle "today" / "UTC" suffixes + // is deferred until a real Claude message is captured. + name: "trailing word causes miss", + msg: "limit resets at 5:42 PM today", + wantErr: true, + want: time.Time{}, + }, + { + // 24h with a timezone abbreviation: same conservative miss. + name: "24h with trailing zone causes miss", + msg: "resets at 17:42 UTC", + wantErr: true, + want: time.Time{}, + }, + { + // Same instant as now: should roll forward, not return now. + name: "exactly now rolls to next day", + msg: "limit resets at 12:00 PM", + want: time.Date(2026, 5, 6, 12, 0, 0, 0, loc), + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := parseResetTimeAt(tc.msg, now) + if tc.wantErr { + assert.True(t, got.IsZero(), "expected zero time, got %v", got) + return + } + assert.Equal(t, tc.want, got) + }) + } +} + +func TestParseResetTimeRespectsLocation(t *testing.T) { + // Non-UTC fixture: confirms now.Location() is honored end-to-end. 
+	est := time.FixedZone("EST", -5*60*60)
+	now := time.Date(2026, 5, 5, 12, 0, 0, 0, est) // noon in the fixed EST zone
+
+	got := parseResetTimeAt("limit resets at 5:42 PM", now)
+	want := time.Date(2026, 5, 5, 17, 42, 0, 0, est)
+	assert.Equal(t, want, got)
+	assert.Equal(t, est, got.Location())
+}
+
+// TestParseResetDurationToLowerByteSafety guards the ToLower
+// byte-index pitfall: U+0130 "İ" lowercases to "i" — a 2→1 byte
+// change. Slicing the original errMsg with indices computed against
+// strings.ToLower(errMsg) would extract the wrong bytes (or panic
+// inside a multi-byte sequence). Slicing the lowercased copy is
+// safe and parses cleanly.
+func TestParseResetDurationToLowerByteSafety(t *testing.T) {
+	got := ParseResetDuration("İ reset after 30m")
+	assert.Equal(t, 30*time.Minute, got)
+}
+
+func TestParseResetTimeToLowerByteSafety(t *testing.T) {
+	loc := time.FixedZone("test", 0)
+	now := time.Date(2026, 5, 5, 12, 0, 0, 0, loc) // noon
+	got := parseResetTimeAt("İ resets at 5:42 PM", now)
+	want := time.Date(2026, 5, 5, 17, 42, 0, 0, loc)
+	assert.Equal(t, want, got)
+}
+
+// TestParseResetTimeAcrossDST exercises the rollover code path through
+// real DST transitions where Add(24*time.Hour) would land an hour off.
+// time.FixedZone has no DST rules, so it cannot exercise this branch —
+// only a real IANA zone with a transition does.
+func TestParseResetTimeAcrossDST(t *testing.T) {
+	ny, err := time.LoadLocation("America/New_York")
+	if err != nil {
+		t.Skipf("America/New_York tzdata unavailable: %v", err)
+	}
+
+	cases := []struct {
+		name string
+		now  time.Time
+		msg  string
+		want time.Time
+	}{
+		{
+			// Spring forward: 2026-03-08 02:00 EST does not exist; the
+			// clock jumps to 03:00 EDT. A reset advertised at 9:00 AM on
+			// the morning of the transition, evaluated when "now" is the
+			// previous evening, must land at 9:00 AM EDT — same wall-clock
+			// time, despite the day having only 23 hours.
+			name: "spring forward rollover preserves wall-clock",
+			now:  time.Date(2026, 3, 7, 21, 0, 0, 0, ny), // 9pm EST
+			msg:  "limit resets at 9:00 AM",
+			want: time.Date(2026, 3, 8, 9, 0, 0, 0, ny), // 9am EDT next day
+		},
+		{
+			// Fall back: 2026-11-01 has 25 hours. A reset advertised at
+			// 9:00 AM, evaluated the prior evening, must land at 9:00 AM
+			// EST — same wall-clock time, longer day notwithstanding.
+			name: "fall back rollover preserves wall-clock",
+			now:  time.Date(2026, 10, 31, 22, 0, 0, 0, ny), // 10pm EDT
+			msg:  "limit resets at 9:00 AM",
+			want: time.Date(2026, 11, 1, 9, 0, 0, 0, ny), // 9am EST next day
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := parseResetTimeAt(tc.msg, tc.now)
+			assert.Equal(t, tc.want, got,
+				"DST-safe rollover should preserve wall-clock time")
+		})
+	}
+}
diff --git a/internal/agent/limit_test.go b/internal/agent/limit_test.go
new file mode 100644
index 00000000..18f77325
--- /dev/null
+++ b/internal/agent/limit_test.go
@@ -0,0 +1,94 @@
+package agent
+
+import (
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestLimitClassificationZeroValue(t *testing.T) {
+	var c LimitClassification
+	assert := assert.New(t)
+	assert.Equal(LimitKindNone, c.Kind)
+	assert.Empty(c.Agent)
+	assert.True(c.ResetAt.IsZero())
+	assert.Equal(time.Duration(0), c.CooldownFor)
+	assert.Empty(c.Message)
+}
+
+func TestClassifyLimitProductionPatterns(t *testing.T) {
+	// All nine substrings from the original isQuotaError set must
+	// produce LimitKindQuota. This is the byte-for-byte regression
+	// test guarding current Gemini and Codex detection.
+	patterns := []string{
+		"resource exhausted",
+		"quota exceeded",
+		"quota_exceeded",
+		"quota exhausted",
+		"quota_exhausted",
+		"insufficient_quota",
+		"exhausted your capacity",
+		"capacity exhausted",
+		"capacity_exhausted",
+	}
+	for _, p := range patterns {
+		t.Run(p, func(t *testing.T) {
+			cls := ClassifyLimit("gemini", "agent failed: "+p+", retrying...")
+			assert := assert.New(t)
+			assert.Equal(LimitKindQuota, cls.Kind, "expected LimitKindQuota for %q", p)
+			assert.Equal("gemini", cls.Agent)
+			assert.Contains(cls.Message, p)
+		})
+	}
+}
+
+func TestClassifyLimitExtractsCooldownDuration(t *testing.T) {
+	cls := ClassifyLimit(
+		"gemini",
+		"You have exhausted your capacity on this model. Your quota will reset after 48m20s.",
+	)
+	assert := assert.New(t)
+	assert.Equal(LimitKindQuota, cls.Kind)
+	assert.Equal(48*time.Minute+20*time.Second, cls.CooldownFor)
+	assert.True(cls.ResetAt.IsZero(), "no absolute time in this message")
+}
+
+func TestClassifyLimitNegativeCases(t *testing.T) {
+	cases := []struct {
+		name string
+		msg  string
+	}{
+		{"empty", ""},
+		{"unrelated error", "exit status 1: file not found"},
+		{"benign mention of limit", "limit set to 100 in config"},
+		{"benign rate limit (transient, no rule produces it day 1)", "429 rate limit, retrying"},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			cls := ClassifyLimit("gemini", tc.msg)
+			assert.Equal(t, LimitKindNone, cls.Kind, "expected LimitKindNone for %q", tc.msg)
+		})
+	}
+}
+
+func TestClassifyLimitWithRulesIsolatesSyntheticPattern(t *testing.T) {
+	// Synthetic rule used only inside this test — does not pollute
+	// defaultLimitRules.
+	syntheticRules := []limitRule{
+		{Agents: []string{"*"}, Substring: "test-claude session limit", Kind: LimitKindSession},
+	}
+	cls := classifyLimitWithRules(
+		"claude-code",
+		"5-hour test-claude session limit reached",
+		syntheticRules,
+	)
+	assert := assert.New(t)
+	assert.Equal(LimitKindSession, cls.Kind)
+	assert.Equal("claude-code", cls.Agent)
+
+	// Same message via the production ClassifyLimit must not match —
+	// the synthetic rule is not in defaultLimitRules.
+	cls2 := ClassifyLimit("claude-code", "5-hour test-claude session limit reached")
+	assert.Equal(LimitKindNone, cls2.Kind, "synthetic rule must not leak into defaultLimitRules")
+}
diff --git a/internal/daemon/worker.go b/internal/daemon/worker.go
index 92209ea7..439b70ec 100644
--- a/internal/daemon/worker.go
+++ b/internal/daemon/worker.go
@@ -47,6 +47,11 @@ type WorkerPool struct {
 	agentCooldowns   map[string]time.Time // agent name -> expiry
 	agentCooldownsMu sync.RWMutex
 
+	// classify is the rate-limit/quota classifier. Defaults to
+	// agent.ClassifyLimit; tests substitute a stub by setting this
+	// field directly after construction (test-only access).
+	classify agent.LimitClassifier
+
 	// Output capture for tail command
 	outputBuffers *OutputBuffer
 
@@ -70,6 +75,7 @@ func NewWorkerPool(db *storage.DB, cfgGetter ConfigGetter, numWorkers int, broad
 		pendingCancels: make(map[int64]bool),
 		agentCooldowns: make(map[string]time.Time),
 		outputBuffers:  NewOutputBuffer(512*1024, 4*1024*1024), // 512KB/job, 4MB total
+		classify:       agent.ClassifyLimit,
 	}
 }
 
@@ -787,15 +793,39 @@ func (wp *WorkerPool) failOrRetryAgent(workerID string, job *storage.ReviewJob,
 }
 
 func (wp *WorkerPool) failOrRetryInner(workerID string, job *storage.ReviewJob, agentName string, errorMsg string, agentError bool) {
-	// Quota errors skip retries entirely — cool down the agent and
-	// attempt failover or fail with a quota-prefixed error.
-	if agentError && isQuotaError(errorMsg) {
-		dur := parseQuotaCooldown(errorMsg, defaultCooldown)
-		wp.cooldownAgent(agentName, time.Now().Add(dur))
-		log.Printf("[%s] Agent %s quota exhausted, cooldown %v",
-			workerID, agentName, dur)
-		wp.failoverOrFail(workerID, job, agentName, errorMsg)
-		return
+	// Quota and session-limit errors skip retries entirely — cool down
+	// the agent and attempt failover or fail. Behavior matches the
+	// original isQuotaError branch; classification now lives in
+	// internal/agent (ClassifyLimit) so the CLI fix loop can share it.
+	if agentError {
+		cls := wp.classify(agent.CanonicalName(agentName), errorMsg)
+		switch cls.Kind {
+		case agent.LimitKindQuota, agent.LimitKindSession:
+			dur := defaultCooldown
+			if cls.CooldownFor > 0 {
+				dur = cls.CooldownFor
+			}
+			if !cls.ResetAt.IsZero() {
+				if until := time.Until(cls.ResetAt); until > 0 {
+					dur = until
+				}
+			}
+			wp.cooldownAgent(agentName, time.Now().Add(dur))
+			log.Printf("[%s] Agent %s quota exhausted, cooldown %v",
+				workerID, agentName, dur)
+			wp.failoverOrFail(workerID, job, agentName, errorMsg)
+			return
+		case agent.LimitKindNone:
+			if errorMsg != "" {
+				preview := strings.ReplaceAll(errorMsg, "\n", " ")
+				preview = strings.ReplaceAll(preview, "\r", "")
+				log.Printf("[%s] unclassified agent error from %s: %s",
+					workerID, agentName, truncateRunes(preview, 200))
+			}
+			// fall through to context-window / retry handling
+		case agent.LimitKindTransient:
+			// transient: fall through to normal retry handling
+		}
 	}
 	if agentError && isContextWindowError(errorMsg) {
 		wp.failoverOrFailNonRetryableAgent(workerID, job, agentName, errorMsg)
@@ -965,32 +995,6 @@ func (wp *WorkerPool) broadcastFailed(job *storage.ReviewJob, agentName, errorMs
 
 // defaultCooldown is the fallback duration when the error message doesn't
 // contain a parseable "reset after" token.
 const defaultCooldown = 30 * time.Minute
-const minCooldown = 1 * time.Minute
-const maxCooldown = 24 * time.Hour
-
-// isQuotaError returns true if the error message indicates a hard API
-// quota exhaustion (case-insensitive). Transient rate-limit/429 errors
-// are excluded — those should go through normal retries, not cooldown.
-func isQuotaError(errMsg string) bool {
-	lower := strings.ToLower(errMsg)
-	patterns := []string{
-		"resource exhausted",
-		"quota exceeded",
-		"quota_exceeded",
-		"quota exhausted",
-		"quota_exhausted",
-		"insufficient_quota",
-		"exhausted your capacity",
-		"capacity_exhausted",
-		"capacity exhausted",
-	}
-	for _, p := range patterns {
-		if strings.Contains(lower, p) {
-			return true
-		}
-	}
-	return false
-}
 
 func isContextWindowError(errMsg string) bool {
 	lower := strings.ToLower(errMsg)
@@ -1010,34 +1014,6 @@ func isContextWindowError(errMsg string) bool {
 	return false
 }
 
-// parseQuotaCooldown extracts a Go-format duration from a "reset after
-// <duration>" substring. Returns fallback if not found or unparseable.
-func parseQuotaCooldown(errMsg string, fallback time.Duration) time.Duration {
-	lower := strings.ToLower(errMsg)
-	idx := strings.Index(lower, "reset after ")
-	if idx < 0 {
-		return fallback
-	}
-	rest := errMsg[idx+len("reset after "):]
-	// Take the next whitespace-delimited token as a duration
-	token := rest
-	if sp := strings.IndexAny(rest, " \t\n,;)"); sp > 0 {
-		token = rest[:sp]
-	}
-	token = strings.TrimRight(token, ".,;:)]}")
-	d, err := time.ParseDuration(token)
-	if err != nil || d <= 0 {
-		return fallback
-	}
-	if d < minCooldown {
-		return minCooldown
-	}
-	if d > maxCooldown {
-		return maxCooldown
-	}
-	return d
-}
-
 // cooldownAgent sets or extends the cooldown expiry for an agent.
 // Only extends — never shortens an existing cooldown.
 func (wp *WorkerPool) cooldownAgent(name string, until time.Time) {
diff --git a/internal/daemon/worker_test.go b/internal/daemon/worker_test.go
index 8cf18e3f..d3151c0c 100644
--- a/internal/daemon/worker_test.go
+++ b/internal/daemon/worker_test.go
@@ -1,8 +1,10 @@
 package daemon
 
 import (
+	"bytes"
 	"context"
 	"fmt"
+	"log"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -913,115 +915,6 @@ func TestWorkerPoolCancelJobFinalCheckDeadlockSafe(t *testing.T) {
 	}
 }
 
-func TestIsQuotaError(t *testing.T) {
-	tests := []struct {
-		errMsg string
-		want   bool
-	}{
-		// Hard quota exhaustion — should trigger cooldown/skip
-		{"quota exceeded for model", true},
-		{"QUOTA_EXCEEDED: limit reached", true},
-		{"quota exhausted, reset after 8h", true},
-		{"QUOTA_EXHAUSTED: try later", true},
-		{"insufficient_quota: limit reached", true},
-		{"You have exhausted your capacity", true},
-		{"RESOURCE EXHAUSTED: try later", true},
-		{"MODEL_CAPACITY_EXHAUSTED: The model is overloaded", true},
-		{"capacity_exhausted", true},
-		// Transient rate limits — should NOT trigger cooldown (use retries)
-		{"Rate limit reached", false},
-		{"rate_limit_error: too fast", false},
-		{"Too Many Requests (429)", false},
-		{"HTTP 429: slow down", false},
-		{"status 429 received from API", false},
-		// Other non-quota errors
-		{"connection reset by peer", false},
-		{"timeout after 30s", false},
-		{"agent not found", false},
-		{"disk quota full", false},
-		{"", false},
-	}
-	for _, tt := range tests {
-		t.Run(tt.errMsg, func(t *testing.T) {
-			if got := isQuotaError(tt.errMsg); got != tt.want {
-				assert.Condition(t, func() bool {
-					return false
-				}, "isQuotaError(%q) = %v, want %v",
-					tt.errMsg, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestParseQuotaCooldown(t *testing.T) {
-	tests := []struct {
-		name     string
-		errMsg   string
-		fallback time.Duration
-		want     time.Duration
-	}{
-		{
-			name:     "extracts go duration",
-			errMsg:   "quota exhausted, reset after 8h26m13s please wait",
-			fallback: 30 * time.Minute,
-			want:     8*time.Hour + 26*time.Minute + 13*time.Second,
-		},
-		{
-			name:     "no reset token falls back",
-			errMsg:   "quota exceeded for model gemini-2.5-pro",
-			fallback: 30 * time.Minute,
-			want:     30 * time.Minute,
-		},
-		{
-			name:     "unparseable duration falls back",
-			errMsg:   "reset after bogus",
-			fallback: 15 * time.Minute,
-			want:     15 * time.Minute,
-		},
-		{
-			name:     "duration at end of string",
-			errMsg:   "reset after 2h30m",
-			fallback: 30 * time.Minute,
-			want:     2*time.Hour + 30*time.Minute,
-		},
-		{
-			name:     "trailing punctuation trimmed",
-			errMsg:   "reset after 8h26m13s.",
-			fallback: 30 * time.Minute,
-			want:     8*time.Hour + 26*time.Minute + 13*time.Second,
-		},
-		{
-			name:     "trailing paren trimmed",
-			errMsg:   "reset after 1h30m)",
-			fallback: 30 * time.Minute,
-			want:     1*time.Hour + 30*time.Minute,
-		},
-		{
-			name:     "clamped to max 24h",
-			errMsg:   "reset after 99999h",
-			fallback: 30 * time.Minute,
-			want:     24 * time.Hour,
-		},
-		{
-			name:     "clamped to min 1m",
-			errMsg:   "reset after 5s",
-			fallback: 30 * time.Minute,
-			want:     1 * time.Minute,
-		},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := parseQuotaCooldown(tt.errMsg, tt.fallback)
-			if got != tt.want {
-				assert.Condition(t, func() bool {
-					return false
-				}, "parseQuotaCooldown() = %v, want %v",
-					got, tt.want)
-			}
-		})
-	}
-}
-
 func TestAgentCooldown(t *testing.T) {
 	cfg := config.DefaultConfig()
 	pool := NewWorkerPool(nil, NewStaticConfig(cfg), 1, NewBroadcaster(), nil, nil)
@@ -1299,6 +1192,63 @@ func TestFailOrRetryInner_NonQuotaStillRetries(t *testing.T) {
 	}
 }
 
+func TestFailOrRetryInner_SessionLimitCoolsDownAndSkipsRetries(t *testing.T) {
+	tc := newWorkerTestContext(t, 1)
+	sha := testutil.GetHeadSHA(t, tc.TmpDir)
+	job := tc.createAndClaimJob(t, sha, testWorkerID)
+
+	// Stub classifier: any error message containing the marker yields
+	// KindSession with a 1-hour CooldownFor. Tests the seam without
+	// depending on real Claude wording.
+	tc.Pool.classify = func(agentName, msg string) agent.LimitClassification {
+		if strings.Contains(msg, "MARKER-SESSION-LIMIT") {
+			return agent.LimitClassification{
+				Kind:        agent.LimitKindSession,
+				Agent:       agentName,
+				CooldownFor: 1 * time.Hour,
+				Message:     msg,
+			}
+		}
+		return agent.LimitClassification{Kind: agent.LimitKindNone, Agent: agentName, Message: msg}
+	}
+
+	tc.Pool.failOrRetryAgent(testWorkerID, job, "test", "boom MARKER-SESSION-LIMIT")
+
+	assert := assert.New(t)
+	assert.True(tc.Pool.isAgentCoolingDown("test"), "agent should be in cooldown")
+
+	// retry_count must NOT have advanced — quota/session errors skip
+	// retries entirely (matches the original isQuotaError semantics).
+	got, err := tc.DB.GetJobRetryCount(job.ID)
+	require.NoError(t, err)
+	assert.Equal(0, got, "session-limit error must not consume a retry slot")
+}
+
+func TestFailOrRetryInner_UnmatchedAgentErrorLogsWarn(t *testing.T) {
+	tc := newWorkerTestContext(t, 1)
+
+	// Capture log output by swapping the standard logger's writer.
+	var buf bytes.Buffer
+	origOutput := log.Writer()
+	log.SetOutput(&buf)
+	t.Cleanup(func() { log.SetOutput(origOutput) })
+
+	sha := testutil.GetHeadSHA(t, tc.TmpDir)
+	job := tc.createAndClaimJob(t, sha, testWorkerID)
+
+	tc.Pool.classify = func(agentName, msg string) agent.LimitClassification {
+		return agent.LimitClassification{Kind: agent.LimitKindNone, Agent: agentName, Message: msg}
+	}
+
+	tc.Pool.failOrRetryAgent(testWorkerID, job, "test", "some brand new error wording from a future agent")
+
+	logged := buf.String()
+	assert := assert.New(t)
+	assert.Contains(logged, "unclassified agent error", "expected WARN line for unmatched error")
+	assert.Contains(logged, "from test:", "log line should include agent name as 'from <name>:'")
+	assert.Contains(logged, "some brand new error wording", "log line should include error preview")
+}
+
 func TestFailoverOrFail_FailsOverToBackup(t *testing.T) {
 	tc := newWorkerTestContext(t, 1)
 	sha := testutil.GetHeadSHA(t, tc.TmpDir)