From bdb3677451a1ca313669bfaad5899448ff1a154c Mon Sep 17 00:00:00 2001
From: Codex <codex@openai.com>
Date: Tue, 2 Jun 2026 00:10:08 -0700
Subject: [PATCH] fix: de-flake CI test suite (5 root causes)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Electron Build matrix's "Run tests" step failed intermittently from
several independent flakes. Root-caused and fixed five:

1. Unhandled rejection crashes the whole run (production bug).
   createTerminalInvalidationHandler dispatched the refresh thunks
   fire-and-forget from a timer, but those thunks re-throw on failure, so a
   transient background-refresh failure became an unhandled promise rejection
   that fails `npm test` even though every test "passed". Contain the
   rejection in the handler (new onRefreshError dep) and at the App.tsx
   inventory dispatch site. Each contained rejection is tagged with its source
   (terminal-directory vs session-window) so the shared onRefreshError logs an
   accurate label.

2. AgentChatView reload test: mock-queue leak (test isolation).
   The "server-restart recovery" describe had no beforeEach, so a test's
   getAgentTimelinePage.mockResolvedValueOnce queue leaked into the next test
   under shuffled order. Add the same mock reset the first describe uses.

3. AgentChatView reload test: LazyMarkdown Suspense swap race (test only).
   Assistant bodies render through React.lazy + Suspense; the first render
   shows a fallback <p> that the lazy chunk later replaces, detaching the node
   `findByText` captured before toBeInTheDocument runs. Mock LazyMarkdown to
   render MarkdownRenderer synchronously, matching MessageBubble/tool-coalesce/
   agent-chat-polish-flow tests. (Production behaviour is correct.)

4. FreshAgentView redundant snapshot fetch (production bug).
   The snapshot-load effect dispatches updatePaneContent to persist its own
   resolved resumeSessionId/status, and the whole paneContent object (plus
   those output fields) was in its dependency array — so the self-update
   retriggered the effect, firing a second snapshot fetch for the same session
   on every load. Depend only on the fields that identify which snapshot to
   load; read non-identity fields live via paneContentRef.current.

5. sessions.changed background-refresh unhandled rejection (production bug).
   Same failure class as #1: the App.tsx sessions.changed websocket handler
   dispatched queueActiveSessionWindowRefresh() fire-and-forget with no
   .catch(). That thunk re-throws when it falls through to fetchSessionWindow()
   without committed window data (e.g. a sessions.changed before the sidebar
   window commits, or after a failed direct fetch retry), so a transient
   refresh failure leaked an unhandled rejection. Contain it with the same
   .catch(log.debug) guard the inventory site uses, and cover it with a
   regression test that captures unhandledRejection. (Found by independent
   review of the original four-fix commit.)

All proven with looped shuffled runs (e.g. #3: 0/120, #4: 0/40 + RED->GREEN
single-fetch assertion, #5: RED leak captured -> GREEN, 20/20 shuffled).
Full default-config unit suite green (305 files / 3526 tests). Typecheck
clean; no new lint warnings. Independently reviewed (gpt-5.5, read-only):
PASSED, one debug-log labeling nit, fixed above.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/App.tsx                                   | 16 ++++--
 src/components/fresh-agent/FreshAgentView.tsx | 10 ++--
 src/lib/terminal-invalidation-handler.ts      | 30 +++++++++--
 .../components/App.ws-bootstrap.test.tsx      | 51 ++++++++++++++++++
 .../agent-chat/AgentChatView.reload.test.tsx  | 29 ++++++++++
 .../fresh-agent/FreshAgentView.test.tsx       | 53 +++++++++++++++++++
 .../lib/terminal-invalidation-handler.test.ts | 44 +++++++++++++++
 7 files changed, 224 insertions(+), 9 deletions(-)
diff --git a/src/App.tsx b/src/App.tsx
index c31ca209d..742d99065 100644
--- a/src/App.tsx
+++ b/src/App.tsx
@@ -792,6 +792,12 @@ export default function App() {
         patchSessionRunningStateFromTerminalMeta,
         queueActiveSessionWindowRefresh: () => queueActiveSessionWindowRefresh() as any,
         fetchTerminalDirectoryWindow: (payload) => fetchTerminalDirectoryWindow(payload) as any,
+        onRefreshError: (error, source) => log.debug(
+          source === 'session-window'
+            ? 'active session window background refresh failed'
+            : 'terminal directory background refresh failed',
+          error,
+        ),
       })
 
       const unsubscribe = ws.onMessage((msg) => {
@@ -839,7 +845,9 @@ export default function App() {
           const rev = typeof msg.revision === 'number' ? msg.revision : -1
           if (rev > lastSessionsRevision) {
             lastSessionsRevision = rev
-            void appStore.dispatch(queueActiveSessionWindowRefresh() as any)
+            // Fire-and-forget refresh: the thunk re-throws on failure, so contain the
+            // rejection rather than leak an unhandled rejection (matches the inventory site).
+            void appStore.dispatch(queueActiveSessionWindowRefresh() as any).catch((error: unknown) => log.debug('active session window refresh failed', error))
           }
         }
         if (msg.type === 'settings.updated') {
@@ -911,11 +919,13 @@ export default function App() {
             upsert: terminalMeta,
             remove: removedTerminalMetaIds,
           }))
+          // Fire-and-forget refreshes: the thunks re-throw on failure, so
+          // contain the rejection rather than leak an unhandled rejection.
           void appStore.dispatch(fetchTerminalDirectoryWindow({
             surface: 'sidebar',
             priority: 'visible',
-          }) as any)
-          void appStore.dispatch(queueActiveSessionWindowRefresh() as any)
+          }) as any).catch((error: unknown) => log.debug('terminal directory background refresh failed', error))
+          void appStore.dispatch(queueActiveSessionWindowRefresh() as any).catch((error: unknown) => log.debug('active session window refresh failed', error))
         }
         if (msg.type === 'codex.activity.list.response') {
           const requestId = typeof msg.requestId === 'string' ? msg.requestId : ''
diff --git a/src/components/fresh-agent/FreshAgentView.tsx b/src/components/fresh-agent/FreshAgentView.tsx
index 18f418916..c57e47421 100644
--- a/src/components/fresh-agent/FreshAgentView.tsx
+++ b/src/components/fresh-agent/FreshAgentView.tsx
@@ -693,14 +693,18 @@ export function FreshAgentView({
         setLoadError(error instanceof Error ? error.message : 'Failed to load session')
       })
     return () => controller.abort()
+    // Depend only on what identifies *which* snapshot to load. This effect
+    // dispatches updatePaneContent to persist its own resolved resumeSessionId/
+    // status; listing the whole paneContent object (or those output fields) made
+    // that self-update retrigger the effect, firing a redundant second fetch for
+    // the same session. Current values for non-identity fields are read live via
+    // paneContentRef.current inside the effect.
   }, [
     claudeSession?.lost,
     dispatch,
-    paneContent,
     paneContent.provider,
-    paneContent.resumeSessionId,
+    paneContent.createRequestId,
     paneContent.sessionId,
-    paneContent.status,
     paneContent.sessionType,
     paneId,
     autoTitleIdentity,
diff --git a/src/lib/terminal-invalidation-handler.ts b/src/lib/terminal-invalidation-handler.ts
index 82eea7cb5..6437c2191 100644
--- a/src/lib/terminal-invalidation-handler.ts
+++ b/src/lib/terminal-invalidation-handler.ts
@@ -5,6 +5,9 @@ type DispatchLike = (action: unknown) => unknown
 
 type RefreshThunk = unknown
 
+/** Identifies which fire-and-forget background refresh produced a rejection. */
+export type RefreshSource = 'terminal-directory' | 'session-window'
+
 type TerminalInvalidationDeps = {
   dispatch: DispatchLike
   upsertTerminalMeta: (payload: TerminalMetaRecord[]) => PayloadAction<TerminalMetaRecord[]>
@@ -21,6 +24,14 @@ type TerminalInvalidationDeps = {
   setTimeout?: typeof setTimeout
   clearTimeout?: typeof clearTimeout
   refreshDelayMs?: number
+  /**
+   * Invoked when a fire-and-forget background refresh rejects. The refresh
+   * thunks re-throw on failure (callers that await them rely on this), but the
+   * handler dispatches them from a timer without awaiting, so it contains the
+   * rejection here instead of letting it surface as an unhandled rejection.
+   * `source` identifies which refresh failed so the callback can log accurately.
+   */
+  onRefreshError?: (error: unknown, source: RefreshSource) => void
 }
 
 function isTerminalMetaRecord(value: unknown): value is TerminalMetaRecord {
@@ -36,13 +47,26 @@ export function createTerminalInvalidationHandler(deps: TerminalInvalidationDeps
   const delayMs = deps.refreshDelayMs ?? 50
   let refreshTimer: ReturnType<typeof setTimeout> | undefined
 
+  const dispatchBackgroundRefresh = (thunk: RefreshThunk, source: RefreshSource) => {
+    const result = deps.dispatch(thunk) as unknown
+    // Contain rejections from the fire-and-forget refresh dispatch: the thunks
+    // re-throw on failure, and a failed background refresh must never become an
+    // unhandled promise rejection (which crashes test runs and is silently
+    // uncaught in production).
+    if (result && typeof (result as { catch?: unknown }).catch === 'function') {
+      void (result as Promise<unknown>).catch((error: unknown) => {
+        deps.onRefreshError?.(error, source)
+      })
+    }
+  }
+
   const runRefresh = () => {
     refreshTimer = undefined
-    deps.dispatch(deps.fetchTerminalDirectoryWindow({
+    dispatchBackgroundRefresh(deps.fetchTerminalDirectoryWindow({
       surface: 'sidebar',
       priority: 'visible',
-    }))
-    deps.dispatch(deps.queueActiveSessionWindowRefresh())
+    }), 'terminal-directory')
+    dispatchBackgroundRefresh(deps.queueActiveSessionWindowRefresh(), 'session-window')
   }
 
   const scheduleRefresh = () => {
diff --git a/test/unit/client/components/App.ws-bootstrap.test.tsx b/test/unit/client/components/App.ws-bootstrap.test.tsx
index d10c09b2a..572593cc5 100644
--- a/test/unit/client/components/App.ws-bootstrap.test.tsx
+++ b/test/unit/client/components/App.ws-bootstrap.test.tsx
@@ -1556,6 +1556,57 @@ describe('App WS bootstrap recovery', () => {
     })
   })
 
+  it('contains a failing queued session-window refresh from a sessions.changed broadcast instead of leaking an unhandled rejection', async () => {
+    // Regression: the sessions.changed handler dispatched queueActiveSessionWindowRefresh()
+    // fire-and-forget with no .catch(). That thunk re-throws when it falls through to
+    // fetchSessionWindow() without committed window data (e.g. a sessions.changed before the
+    // sidebar window commits, or after a failed direct fetch retry), so a transient refresh
+    // failure leaked an unhandled rejection that fails the whole test run even though every
+    // test "passed" — the same failure class the terminal.inventory site already contains.
+    const store = createStore()
+
+    // Reject every snapshot fetch. The bootstrap sidebar load fails but is contained by
+    // ensureSidebarSessionsWindow (so no window ever commits -> hasCommittedWindow stays
+    // false), and the queued refresh then takes the re-throwing fetchSessionWindow branch.
+    fetchSidebarSessionsSnapshot.mockRejectedValue(new Error('window snapshot unavailable'))
+
+    const unhandled: unknown[] = []
+    const onUnhandled = (reason: unknown) => { unhandled.push(reason) }
+    process.on('unhandledRejection', onUnhandled)
+    try {
+      render(
+        <Provider store={store}>
+          <App />
+        </Provider>
+      )
+
+      await waitFor(() => {
+        expect(messageHandler).toBeTypeOf('function')
+      })
+
+      act(() => {
+        messageHandler?.({ type: 'sessions.changed', revision: 1 })
+      })
+
+      // The queued refresh actually ran and failed (proves the path is exercised, not vacuous).
+      await waitFor(() => {
+        expect(store.getState().sessions.windows.sidebar?.error).toBe('window snapshot unavailable')
+      })
+
+      // Give Node a chance to surface any unhandled rejection from the fire-and-forget dispatch.
+      await new Promise((resolve) => setTimeout(resolve, 0))
+      await new Promise((resolve) => setTimeout(resolve, 0))
+
+      expect(
+        unhandled.filter(
+          (reason) => reason instanceof Error && reason.message.includes('window snapshot unavailable'),
+        ),
+      ).toHaveLength(0)
+    } finally {
+      process.off('unhandledRejection', onUnhandled)
+    }
+  })
+
   it('ignores legacy sessions.patch messages when bootstrapping against an already-ready socket', async () => {
     const baselineProjects = [
       {
diff --git a/test/unit/client/components/agent-chat/AgentChatView.reload.test.tsx b/test/unit/client/components/agent-chat/AgentChatView.reload.test.tsx
index 7f1be8306..6fa11aed3 100644
--- a/test/unit/client/components/agent-chat/AgentChatView.reload.test.tsx
+++ b/test/unit/client/components/agent-chat/AgentChatView.reload.test.tsx
@@ -62,6 +62,22 @@ vi.mock('@/lib/api', async () => {
   }
 })
 
+// Render markdown bodies synchronously. The real LazyMarkdown wraps MarkdownRenderer
+// in React.lazy + Suspense, so the first assistant-message render shows a fallback
+// <p> that the lazy chunk later replaces with the real node. Tests that capture the
+// fallback node via `await findByText(...)` and then assert `toBeInTheDocument()` race
+// that swap (the captured node detaches), producing an order-dependent flake. Mocking
+// LazyMarkdown to render MarkdownRenderer directly removes the swap. Matches the mock
+// used by the other agent-chat tests (MessageBubble, tool-coalesce, agent-chat-polish-flow).
+vi.mock('@/components/markdown/LazyMarkdown', async () => {
+  const { MarkdownRenderer } = await import('@/components/markdown/MarkdownRenderer')
+  return {
+    LazyMarkdown: ({ content }: { content: string }) => (
+      <MarkdownRenderer content={content} />
+    ),
+  }
+})
+
 function makeStore() {
   return configureStore({
     reducer: {
@@ -1891,6 +1907,19 @@ function getPaneContent(store: ReturnType<typeof makeStore>, tabId: string, pane
 }
 
 describe('AgentChatView server-restart recovery', () => {
+  // Reset the timeline API mocks before each test. Without this, a test's
+  // mockResolvedValueOnce queue leaks into the next test under shuffled order
+  // (e.g. one test consuming another's "Revision N body" page), producing an
+  // order-dependent flake. Mirrors the reset in the first describe's beforeEach.
+  beforeEach(() => {
+    localStorage.clear()
+    getAgentTimelinePage.mockReset()
+    getAgentTurnBody.mockReset()
+    getAgentChatCapabilities.mockReset()
+    setSessionMetadata.mockReset()
+    setSessionMetadata.mockResolvedValue(undefined)
+  })
+
   afterEach(() => {
     cleanup()
     wsSend.mockClear()
diff --git a/test/unit/client/components/fresh-agent/FreshAgentView.test.tsx b/test/unit/client/components/fresh-agent/FreshAgentView.test.tsx
index 15d36529a..c1b398c40 100644
--- a/test/unit/client/components/fresh-agent/FreshAgentView.test.tsx
+++ b/test/unit/client/components/fresh-agent/FreshAgentView.test.tsx
@@ -1153,6 +1153,59 @@ describe('FreshAgentView', () => {
     ]))
   })
 
+  it('fetches the initial snapshot once and does not refetch from its own pane update', async () => {
+    const store = createStore()
+    // First fetch returns a distinct snapshot; the default mockResolvedValue
+    // ("Codex turn") would answer any *second* fetch. The snapshot-load effect
+    // persists resumeSessionId via updatePaneContent, and if that self-update
+    // retriggers the effect, the redundant second fetch overwrites the loaded
+    // content with the default — a wasteful double network request in production
+    // and an order-dependent flake in tests.
+    apiMock.getFreshAgentThreadSnapshot.mockResolvedValueOnce({
+      status: 'idle',
+      summary: 'Codex summary',
+      capabilities: { send: true, interrupt: true, fork: true },
+      turns: [
+        { id: 'turn-user-1', role: 'user', items: [{ id: 'item-user-1', kind: 'text', text: 'Loaded user turn' }] },
+        { id: 'turn-assistant-1', role: 'assistant', items: [{ id: 'item-assistant-1', kind: 'text', text: 'Loaded assistant turn' }] },
+      ],
+    })
+    store.dispatch(initLayout({
+      tabId: 'tab-1',
+      paneId: 'pane-1',
+      content: {
+        kind: 'fresh-agent',
+        sessionType: 'freshcodex',
+        provider: 'codex',
+        createRequestId: 'req-single-fetch',
+        sessionId: 'thread-single-fetch',
+        status: 'idle',
+      },
+    }))
+
+    render(
+      <Provider store={store}>
+        <StoreBackedFreshAgentView tabId="tab-1" paneId="pane-1" />
+      </Provider>,
+    )
+
+    // Wait until the effect has persisted resumeSessionId back into pane content
+    // (the self-update that previously retriggered the effect).
+    await waitFor(() => {
+      const layout = store.getState().panes.layouts['tab-1']
+      const resumeSessionId = layout?.type === 'leaf' && layout.content.kind === 'fresh-agent'
+        ? layout.content.resumeSessionId
+        : undefined
+      expect(resumeSessionId).toBe('thread-single-fetch')
+    })
+    // Let any spurious self-triggered refetch run before asserting.
+    await act(async () => { await Promise.resolve() })
+
+    expect(apiMock.getFreshAgentThreadSnapshot).toHaveBeenCalledTimes(1)
+    // The loaded snapshot stays rendered (not overwritten by a second fetch).
+    expect(screen.getByText('Loaded assistant turn')).toBeInTheDocument()
+  })
+
   it('resets auto-title for a new conversation even if the stale prior snapshot had user turns', async () => {
     const store = createStore()
     apiMock.getFreshAgentThreadSnapshot.mockResolvedValueOnce({
diff --git a/test/unit/client/lib/terminal-invalidation-handler.test.ts b/test/unit/client/lib/terminal-invalidation-handler.test.ts
index 628976a6c..93d31c2a8 100644
--- a/test/unit/client/lib/terminal-invalidation-handler.test.ts
+++ b/test/unit/client/lib/terminal-invalidation-handler.test.ts
@@ -178,4 +178,48 @@ describe('createTerminalInvalidationHandler', () => {
     await vi.advanceTimersByTimeAsync(50)
     expect(refresh.fetchTerminalDirectoryWindow).toHaveBeenCalledTimes(1)
   })
+
+  it('contains background-refresh dispatch failures instead of leaking an unhandled rejection', async () => {
+    vi.useFakeTimers()
+    const refresh = createRefreshDoubles()
+    const refreshError = new Error('refresh failed')
+    // Mirror the real store: dispatching the refresh thunks returns a promise
+    // that rejects, because fetchTerminalDirectoryWindow / the active-window
+    // refresh re-throw on API failure. The handler dispatches them
+    // fire-and-forget from a timer, so it must contain the rejection — otherwise
+    // a transient background-refresh failure becomes an unhandled rejection that
+    // crashes the whole vitest run (and is silently uncaught in production).
+    const dispatch = vi.fn((action: unknown) => {
+      if (action === refresh.terminalDirectoryRefreshThunk || action === refresh.activeSessionWindowRefreshThunk) {
+        return Promise.reject(refreshError)
+      }
+      return undefined
+    })
+    const onRefreshError = vi.fn()
+    const handler = createTerminalInvalidationHandler({
+      dispatch,
+      upsertTerminalMeta,
+      removeTerminalMeta,
+      patchSessionRunningStateFromTerminalMeta,
+      queueActiveSessionWindowRefresh: refresh.queueActiveSessionWindowRefresh,
+      fetchTerminalDirectoryWindow: refresh.fetchTerminalDirectoryWindow,
+      setTimeout: globalThis.setTimeout,
+      clearTimeout: globalThis.clearTimeout,
+      refreshDelayMs: 50,
+      onRefreshError,
+    })
+
+    handler.handle({ type: 'terminals.changed', revision: 12 })
+    await vi.advanceTimersByTimeAsync(50)
+    // Let the contained .catch() handlers settle.
+    await Promise.resolve()
+    await Promise.resolve()
+
+    expect(onRefreshError).toHaveBeenCalledTimes(2)
+    // Each rejection is tagged with which refresh failed so a shared
+    // onRefreshError can log an accurate label instead of mislabeling a
+    // session-window failure as a terminal-directory one.
+    expect(onRefreshError).toHaveBeenCalledWith(refreshError, 'terminal-directory')
+    expect(onRefreshError).toHaveBeenCalledWith(refreshError, 'session-window')
+  })
 })