diff --git a/apps/code/src/main/services/cloud-task/service.test.ts b/apps/code/src/main/services/cloud-task/service.test.ts index 643f9f58f..ab9000a16 100644 --- a/apps/code/src/main/services/cloud-task/service.test.ts +++ b/apps/code/src/main/services/cloud-task/service.test.ts @@ -604,6 +604,69 @@ describe("CloudTaskService", () => { ).toBe(true); }); + it("fails the watcher after exhausting the cumulative reconnect budget on clean-EOF loops", async () => { + vi.useFakeTimers(); + + const updates: unknown[] = []; + service.on(CloudTaskEvent.Update, (payload) => updates.push(payload)); + + const makeInProgressRun = () => + createJsonResponse({ + id: "run-1", + status: "in_progress", + stage: null, + output: null, + error_message: null, + branch: "main", + updated_at: "2026-01-01T00:00:00Z", + }); + + mockNetFetch.mockImplementation((input: string | Request) => { + const url = typeof input === "string" ? input : input.url; + if (url.includes("/session_logs/")) { + return Promise.resolve( + createJsonResponse([], 200, { "X-Has-More": "false" }), + ); + } + return Promise.resolve(makeInProgressRun()); + }); + + mockStreamFetch.mockImplementation(() => + Promise.resolve(createSseResponse("")), + ); + + service.watch({ + taskId: "task-1", + runId: "run-1", + apiHost: "https://app.example.com", + teamId: 2, + }); + + await waitFor(() => mockStreamFetch.mock.calls.length === 1); + await vi.advanceTimersByTimeAsync(60 * 60_000); + + await waitFor( + () => + updates.some( + (u) => + typeof u === "object" && + u !== null && + (u as { kind?: string }).kind === "error", + ), + 10_000, + ); + + expect(updates).toContainEqual({ + taskId: "task-1", + runId: "run-1", + kind: "error", + errorTitle: "Cloud run unreachable", + errorMessage: + "Could not maintain a connection to the cloud run after many attempts. Click retry once the issue is resolved.", + retryable: true, + }); + }); + it("emits a retryable cloud error after repeated stream failures", async () => { vi.useFakeTimers(); diff --git a/apps/code/src/main/services/cloud-task/service.ts b/apps/code/src/main/services/cloud-task/service.ts index 4bafa6776..59716b068 100644 --- a/apps/code/src/main/services/cloud-task/service.ts +++ b/apps/code/src/main/services/cloud-task/service.ts @@ -19,6 +19,7 @@ import { type SseEvent, SseEventParser } from "./sse-parser"; const log = logger.scope("cloud-task"); const MAX_SSE_RECONNECT_ATTEMPTS = 5; +const MAX_CUMULATIVE_RECONNECT_ATTEMPTS = 30; const SSE_RECONNECT_BASE_DELAY_MS = 2_000; const SSE_RECONNECT_MAX_DELAY_MS = 30_000; const SSE_HEALTHY_CONNECTION_MS = 60_000; @@ -91,6 +92,7 @@ interface WatcherState { totalEntryCount: number; reconnectAttempts: number; streamErrorAttempts: number; + cumulativeReconnectAttempts: number; lastEventId: string | null; lastStatus: TaskRunStatus | null; lastStage: string | null; @@ -283,6 +285,7 @@ export class CloudTaskService extends TypedEventEmitter { watcher.reconnectAttempts = 0; watcher.streamErrorAttempts = 0; + watcher.cumulativeReconnectAttempts = 0; watcher.failed = false; watcher.pendingLogEntries = []; watcher.bufferedLogBatches = []; @@ -406,6 +409,7 @@ export class CloudTaskService extends TypedEventEmitter { totalEntryCount: 0, reconnectAttempts: 0, streamErrorAttempts: 0, + cumulativeReconnectAttempts: 0, lastEventId: null, lastStatus: null, lastStage: null, @@ -760,8 +764,9 @@ export class CloudTaskService extends TypedEventEmitter { } // A real data event proves the stream materialized; clear the backend-error - // budget too. + // and cumulative budgets too. watcher.streamErrorAttempts = 0; + watcher.cumulativeReconnectAttempts = 0; if (isTaskRunStateEvent(event.data)) { if (this.applyTaskRunState(watcher, event.data)) { @@ -1016,12 +1021,26 @@ export class CloudTaskService extends TypedEventEmitter { clearTimeout(watcher.reconnectTimeoutId); } + // Cumulative counter bounds runaway loops that clean-EOF (countAttempt=false) + // and would otherwise dodge `reconnectAttempts`. + watcher.cumulativeReconnectAttempts += 1; const countAttempt = options.countAttempt ?? true; if (countAttempt) { watcher.reconnectAttempts += 1; - } else { - watcher.reconnectAttempts = 0; } + + if ( + watcher.cumulativeReconnectAttempts > MAX_CUMULATIVE_RECONNECT_ATTEMPTS + ) { + this.failWatcher(key, { + title: "Cloud run unreachable", + message: + "Could not maintain a connection to the cloud run after many attempts. Click retry once the issue is resolved.", + retryable: true, + }); + return; + } + // The watcher fails once either budget is exhausted: transport reconnect // failures or backend stream-error frames. const attemptCount = Math.max( @@ -1112,6 +1131,9 @@ export class CloudTaskService extends TypedEventEmitter { if (!isTerminalStatus(watcher.lastStatus) && reconnectIfNonTerminal) { if (stateChanged) { + // Polled progress proves the run is alive — reset both budgets. + watcher.reconnectAttempts = 0; + watcher.cumulativeReconnectAttempts = 0; this.emit(CloudTaskEvent.Update, { taskId: watcher.taskId, runId: watcher.runId,