From ecaabc6d924c70e42837e92ad98dfb08055d2982 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 21 May 2026 19:38:42 -0700 Subject: [PATCH 01/10] improvement(mcp): per-server tool queries + negative cache so one slow server can't block the workspace Move MCP tool discovery off the workspace-aggregated `Promise.all` fan-out and onto per-server React Query keys, matching how Cursor and Claude Code render remote MCP. `useMcpToolsQuery` is now a `useQueries` combiner: each server has its own cache entry, its own loading state, and a slow neighbor never gates the others. Public shape stays compatible with existing consumers. Add a short-TTL negative cache: when `listTools` fails (timeout, connection error, etc.) we mark the server unhealthy for 30s so subsequent discovery calls short-circuit instead of re-paying the timeout. OAuth-required errors are exempt so re-auth retries immediately. Drop `LIST_TOOLS_TIMEOUT_MS` from 30s to 10s to bound the worst-case first failure. Invalidations are per-server where the action is per-server (OAuth popup, per-server SSE event, refresh, update, delete). Bulk operations stay workspace-broad. Adds tests for the negative-cache behavior and the OAuth exemption. Co-Authored-By: Claude Opus 4.7 --- apps/sim/hooks/mcp/use-mcp-oauth-popup.ts | 10 +- apps/sim/hooks/mcp/use-mcp-tools.ts | 4 +- apps/sim/hooks/queries/mcp.ts | 140 +++++++++++++++++++--- apps/sim/lib/mcp/service.test.ts | 68 ++++++++++- apps/sim/lib/mcp/service.ts | 95 ++++++++++++++- apps/sim/lib/mcp/utils.ts | 7 +- 6 files changed, 291 insertions(+), 33 deletions(-) diff --git a/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts b/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts index 0df50e49a72..1bbe1c08a4e 100644 --- a/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts +++ b/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts @@ -80,7 +80,15 @@ export function useMcpOauthPopup({ workspaceId }: UseMcpOauthPopupProps) { } if (data.ok) { queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(workspaceId) }) - queryClient.invalidateQueries({ queryKey: mcpKeys.toolsList(workspaceId) }) + if (data.serverId) { + // Only the freshly-authorized server needs a refetch; every other + // server's cached tools stay warm so a slow neighbor can't gate the UI. + queryClient.invalidateQueries({ + queryKey: mcpKeys.serverToolsList(workspaceId, data.serverId), + }) + } else { + queryClient.invalidateQueries({ queryKey: mcpKeys.serverTools() }) + } queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(workspaceId) }) toast.success('Server authorized') } else { diff --git a/apps/sim/hooks/mcp/use-mcp-tools.ts b/apps/sim/hooks/mcp/use-mcp-tools.ts index 631387e039d..7a7d12b28ff 100644 --- a/apps/sim/hooks/mcp/use-mcp-tools.ts +++ b/apps/sim/hooks/mcp/use-mcp-tools.ts @@ -39,7 +39,7 @@ export interface UseMcpToolsResult { export function useMcpTools(workspaceId: string): UseMcpToolsResult { const queryClient = useQueryClient() - const { data: mcpToolsData = [], isLoading, error: queryError } = useMcpToolsQuery(workspaceId) + const { data: mcpToolsData, isLoading, error: queryError } = useMcpToolsQuery(workspaceId) const mcpTools = useMemo(() => { return mcpToolsData.map((tool) => ({ @@ -65,7 +65,7 @@ export function useMcpTools(workspaceId: string): UseMcpToolsResult { logger.info('Refreshing MCP tools', { forceRefresh, workspaceId }) await queryClient.invalidateQueries({ - queryKey: mcpKeys.toolsList(workspaceId), + queryKey: mcpKeys.serverTools(), refetchType: forceRefresh ? 'active' : 'all', }) }, diff --git a/apps/sim/hooks/queries/mcp.ts b/apps/sim/hooks/queries/mcp.ts index 7652df84e75..6be131dee7e 100644 --- a/apps/sim/hooks/queries/mcp.ts +++ b/apps/sim/hooks/queries/mcp.ts @@ -1,7 +1,13 @@ -import { useEffect } from 'react' +import { useEffect, useMemo } from 'react' import { createLogger } from '@sim/logger' import { getErrorMessage } from '@sim/utils/errors' -import { keepPreviousData, useMutation, useQuery, useQueryClient } from '@tanstack/react-query' +import { + keepPreviousData, + useMutation, + useQueries, + useQuery, + useQueryClient, +} from '@tanstack/react-query' import { ApiClientError } from '@/lib/api/client/errors' import { requestJson } from '@/lib/api/client/request' import { @@ -41,6 +47,9 @@ export const mcpKeys = { serversList: (workspaceId?: string) => [...mcpKeys.servers(), workspaceId ?? ''] as const, tools: () => [...mcpKeys.all, 'tools'] as const, toolsList: (workspaceId?: string) => [...mcpKeys.tools(), workspaceId ?? ''] as const, + serverTools: () => [...mcpKeys.all, 'serverTools'] as const, + serverToolsList: (workspaceId?: string, serverId?: string) => + [...mcpKeys.serverTools(), workspaceId ?? '', serverId ?? ''] as const, storedTools: () => [...mcpKeys.all, 'storedTools'] as const, storedToolsList: (workspaceId?: string) => [...mcpKeys.storedTools(), workspaceId ?? ''] as const, allowedDomains: () => [...mcpKeys.all, 'allowedDomains'] as const, @@ -92,11 +101,16 @@ export function useMcpServers(workspaceId: string) { async function fetchMcpTools( workspaceId: string, forceRefresh = false, - signal?: AbortSignal + signal?: AbortSignal, + serverId?: string ): Promise { try { const data = await requestJson(discoverMcpToolsContract, { - query: { workspaceId, refresh: forceRefresh || undefined }, + query: { + workspaceId, + refresh: forceRefresh || undefined, + ...(serverId ? { serverId } : {}), + }, signal, }) return data.data.tools @@ -108,24 +122,93 @@ async function fetchMcpTools( } } -export function useMcpToolsQuery(workspaceId: string) { +/** + * Per-server tools query. Each server has its own React Query entry, so a slow + * or hung server can never block another server's load — same model Cursor and + * Claude Code use for remote MCP. + */ +export function useMcpServerTools(workspaceId: string, serverId?: string) { return useQuery({ - queryKey: mcpKeys.toolsList(workspaceId), - queryFn: ({ signal }) => fetchMcpTools(workspaceId, false, signal), - enabled: !!workspaceId, + queryKey: mcpKeys.serverToolsList(workspaceId, serverId), + queryFn: ({ signal }) => fetchMcpTools(workspaceId, false, signal, serverId), + enabled: !!workspaceId && !!serverId, retry: false, staleTime: 30 * 1000, - placeholderData: keepPreviousData, }) } +/** + * Workspace-level aggregate, derived from N parallel per-server queries via + * `useQueries`. Public shape stays compatible with the prior workspace-keyed + * query (flat `McpTool[]`, single `isLoading`, single error) so existing + * consumers don't change. A slow server only flips its own per-server state; + * fast servers populate immediately. + */ +export function useMcpToolsQuery(workspaceId: string) { + const { data: servers } = useMcpServers(workspaceId) + + const serverIds = useMemo(() => (servers ? servers.map((s) => s.id).sort() : []), [servers]) + + const results = useQueries({ + queries: serverIds.map((serverId) => ({ + queryKey: mcpKeys.serverToolsList(workspaceId, serverId), + queryFn: ({ signal }: { signal?: AbortSignal }) => + fetchMcpTools(workspaceId, false, signal, serverId), + enabled: !!workspaceId, + retry: false, + staleTime: 30 * 1000, + })), + }) + + return useMemo(() => { + const tools: McpTool[] = [] + let hasData = false + let isLoading = false + let firstError: Error | null = null + for (const result of results) { + if (result.data) { + tools.push(...result.data) + hasData = true + } + if (result.isLoading) isLoading = true + if (!firstError && result.error instanceof Error) firstError = result.error + } + return { + data: tools, + // Match the prior semantics: "loading" means we have nothing to show yet. + // Once any server has returned, the UI can render that and let slow + // neighbors fill in incrementally without keeping the spinner up. + isLoading: isLoading && !hasData, + isFetching: results.some((r) => r.isFetching), + error: firstError, + perServer: results, + } + }, [results]) +} + export function useForceRefreshMcpTools() { const queryClient = useQueryClient() return useMutation({ - mutationFn: (workspaceId: string) => fetchMcpTools(workspaceId, true), + mutationFn: async (workspaceId: string) => { + // Fetch each server's tools in parallel with `refresh=true`, populating + // the per-server query cache directly. A slow server only blocks its own + // card — fast servers light up as soon as they return. + const servers = queryClient.getQueryData(mcpKeys.serversList(workspaceId)) ?? [] + const results = await Promise.allSettled( + servers.map(async (server) => { + const tools = await fetchMcpTools(workspaceId, true, undefined, server.id) + queryClient.setQueryData(mcpKeys.serverToolsList(workspaceId, server.id), tools) + return tools + }) + ) + return results + .filter((r): r is PromiseFulfilledResult => r.status === 'fulfilled') + .flatMap((r) => r.value) + }, onSettled: (_data, _error, workspaceId) => { - queryClient.invalidateQueries({ queryKey: mcpKeys.toolsList(workspaceId) }) + // Per-server caches were already populated inside `mutationFn` via + // `setQueryData`, so we only need to refresh the dependent views. queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(workspaceId) }) queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(workspaceId) }) }, @@ -175,7 +258,7 @@ export function useCreateMcpServer() { }, onSettled: (_data, _error, variables) => { queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(variables.workspaceId) }) - queryClient.invalidateQueries({ queryKey: mcpKeys.toolsList(variables.workspaceId) }) + queryClient.invalidateQueries({ queryKey: mcpKeys.serverTools() }) }, }) } @@ -237,7 +320,9 @@ export function useDeleteMcpServer() { }, onSettled: (_data, _error, variables) => { queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(variables.workspaceId) }) - queryClient.invalidateQueries({ queryKey: mcpKeys.toolsList(variables.workspaceId) }) + queryClient.removeQueries({ + queryKey: mcpKeys.serverToolsList(variables.workspaceId, variables.serverId), + }) queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(variables.workspaceId) }) }, }) @@ -304,7 +389,9 @@ export function useUpdateMcpServer() { }, onSettled: (_data, _error, variables) => { queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(variables.workspaceId) }) - queryClient.invalidateQueries({ queryKey: mcpKeys.toolsList(variables.workspaceId) }) + queryClient.invalidateQueries({ + queryKey: mcpKeys.serverToolsList(variables.workspaceId, variables.serverId), + }) }, }) } @@ -334,7 +421,9 @@ export function useRefreshMcpServer() { }, onSettled: (_data, _error, variables) => { queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(variables.workspaceId) }) - queryClient.invalidateQueries({ queryKey: mcpKeys.toolsList(variables.workspaceId) }) + queryClient.invalidateQueries({ + queryKey: mcpKeys.serverToolsList(variables.workspaceId, variables.serverId), + }) queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(variables.workspaceId) }) }, }) @@ -386,8 +475,14 @@ export function useMcpToolsEvents(workspaceId: string) { useEffect(() => { if (!workspaceId) return - const invalidate = () => { - queryClient.invalidateQueries({ queryKey: mcpKeys.toolsList(workspaceId) }) + const invalidate = (serverId?: string) => { + if (serverId) { + queryClient.invalidateQueries({ + queryKey: mcpKeys.serverToolsList(workspaceId, serverId), + }) + } else { + queryClient.invalidateQueries({ queryKey: mcpKeys.serverTools() }) + } queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(workspaceId) }) queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(workspaceId) }) queryClient.invalidateQueries({ queryKey: workflowMcpServerKeys.all }) @@ -398,8 +493,15 @@ export function useMcpToolsEvents(workspaceId: string) { if (!entry) { const source = new EventSource(`/api/mcp/events?workspaceId=${workspaceId}`) - source.addEventListener('tools_changed', () => { - invalidate() + source.addEventListener('tools_changed', (e) => { + let serverId: string | undefined + try { + const parsed = JSON.parse((e as MessageEvent).data) as { serverId?: string } + serverId = parsed.serverId + } catch { + // Older event payload or non-JSON — fall back to workspace-wide invalidation. + } + invalidate(serverId) }) source.onerror = () => { diff --git a/apps/sim/lib/mcp/service.test.ts b/apps/sim/lib/mcp/service.test.ts index 42d111640d2..93a209d513a 100644 --- a/apps/sim/lib/mcp/service.test.ts +++ b/apps/sim/lib/mcp/service.test.ts @@ -124,6 +124,9 @@ function tool(name: string, serverId: string) { describe('McpService.discoverTools per-server caching', () => { beforeEach(async () => { vi.clearAllMocks() + // `clearAllMocks` does not drain `.mockResolvedValueOnce` queues; reset + // listTools so a previous test's unconsumed mock doesn't leak into the next. + mockListTools.mockReset() mockIsDomainAllowed.mockReturnValue(true) mockValidateSsrf.mockResolvedValue('1.2.3.4') mockValidateDomain.mockImplementation(() => undefined) @@ -162,11 +165,14 @@ describe('McpService.discoverTools per-server caching', () => { expect(first.map((t) => t.name)).toEqual(['a1']) mockListTools.mockClear() - mockListTools.mockResolvedValueOnce([tool('b1', 'mcp-b')]) + // a1's positive cache is intact (the failure didn't poison it). b is now + // negative-cached so it's skipped instead of re-blocking — see + // "negative-caches a failed server so the next discoverTools skips it" + // below for the full assertion. const second = await mcpService.discoverTools(USER_ID, WORKSPACE_ID) - expect(second.map((t) => t.name).sort()).toEqual(['a1', 'b1']) - expect(mockListTools).toHaveBeenCalledTimes(1) + expect(second.map((t) => t.name)).toEqual(['a1']) + expect(mockListTools).not.toHaveBeenCalled() }) it("forceRefresh bypasses every server's cache", async () => { @@ -259,4 +265,60 @@ describe('McpService.discoverTools per-server caching', () => { expect(second.map((t) => t.name)).toEqual(['a1']) expect(mockListTools).not.toHaveBeenCalled() }) + + it('negative-caches a failed server so the next discoverTools skips it', async () => { + mockGetWorkspaceServersRows.mockResolvedValue([dbRow('mcp-a', 'A'), dbRow('mcp-b', 'B')]) + mockListTools + .mockResolvedValueOnce([tool('a1', 'mcp-a')]) + .mockRejectedValueOnce(new Error('Request timed out')) + + await mcpService.discoverTools(USER_ID, WORKSPACE_ID) + expect(mockListTools).toHaveBeenCalledTimes(2) + + mockListTools.mockClear() + // Second call: a1 is success-cached, b is failure-cached. Neither should + // hit the live transport — the slow server no longer blocks the response. + const second = await mcpService.discoverTools(USER_ID, WORKSPACE_ID) + expect(second.map((t) => t.name)).toEqual(['a1']) + expect(mockListTools).not.toHaveBeenCalled() + }) + + it('successful discoverServerTools clears the negative cache', async () => { + mockGetWorkspaceServersRows.mockResolvedValue([dbRow('mcp-a', 'A')]) + mockListTools.mockRejectedValueOnce(new Error('Request timed out')) + + await expect(mcpService.discoverServerTools(USER_ID, 'mcp-a', WORKSPACE_ID)).rejects.toThrow( + 'Request timed out' + ) + + // Following the failure, the negative cache would short-circuit discoverTools. + // Reconnecting the server (e.g. after OAuth) should bring it back to live. + mockListTools.mockClear() + mockListTools.mockResolvedValueOnce([tool('a1', 'mcp-a')]) + const tools = await mcpService.discoverServerTools(USER_ID, 'mcp-a', WORKSPACE_ID) + expect(tools.map((t) => t.name)).toEqual(['a1']) + + // discoverTools now sees the cleared negative cache + primed positive cache. + mockListTools.mockClear() + const after = await mcpService.discoverTools(USER_ID, WORKSPACE_ID) + expect(after.map((t) => t.name)).toEqual(['a1']) + expect(mockListTools).not.toHaveBeenCalled() + }) + + it('does not negative-cache OAuth-required errors', async () => { + mockGetWorkspaceServersRows.mockResolvedValue([dbRow('mcp-a', 'A')]) + mockListTools.mockRejectedValueOnce(new McpOauthAuthorizationRequiredError('mcp-a', 'A')) + + await mcpService.discoverTools(USER_ID, WORKSPACE_ID) + expect(mockListTools).toHaveBeenCalledTimes(1) + + // Second call must still attempt the live transport — OAuth re-auth has + // its own pathway and a stale negative cache would make reconnects + // silently fail until the TTL expired. + mockListTools.mockClear() + mockListTools.mockResolvedValueOnce([tool('a1', 'mcp-a')]) + const after = await mcpService.discoverTools(USER_ID, WORKSPACE_ID) + expect(after.map((t) => t.name)).toEqual(['a1']) + expect(mockListTools).toHaveBeenCalledTimes(1) + }) }) diff --git a/apps/sim/lib/mcp/service.ts b/apps/sim/lib/mcp/service.ts index b024c179300..56bf2626a12 100644 --- a/apps/sim/lib/mcp/service.ts +++ b/apps/sim/lib/mcp/service.ts @@ -41,7 +41,7 @@ import { type McpToolResult, type McpTransport, } from '@/lib/mcp/types' -import { MCP_CONSTANTS } from '@/lib/mcp/utils' +import { MCP_CLIENT_CONSTANTS, MCP_CONSTANTS } from '@/lib/mcp/utils' const logger = createLogger('McpService') @@ -50,6 +50,15 @@ function serverCacheKey(workspaceId: string, serverId: string): string { return `workspace:${workspaceId}:server:${serverId}` } +// Parallel key for negative caching: when a server's listTools fails, we store +// a short-lived marker here so the next discoverTools call skips the live fetch +// instead of re-paying the listTools timeout. +function failureCacheKey(workspaceId: string, serverId: string): string { + return `workspace:${workspaceId}:server:${serverId}:failure` +} + +const FAILURE_CACHE_SENTINEL: McpTool[] = [] + type DiscoveryOutcome = | { kind: 'cached'; tools: McpTool[] } | { @@ -59,6 +68,7 @@ type DiscoveryOutcome = resolvedIP: string | null } | { kind: 'oauth-pending' } + | { kind: 'unhealthy' } | { kind: 'error'; message: string } class McpService { @@ -77,6 +87,14 @@ class McpService { .catch((err) => logger.warn(`Failed to invalidate cache for ${event.serverName} on listChanged:`, err) ) + this.cacheAdapter + .delete(failureCacheKey(event.workspaceId, event.serverId)) + .catch((err) => + logger.warn( + `Failed to invalidate failure cache for ${event.serverName} on listChanged:`, + err + ) + ) }) } } @@ -389,6 +407,48 @@ class McpService { } } + /** + * Record a server as unhealthy so subsequent discovery calls within the TTL + * short-circuit instead of re-paying the listTools timeout. OAuth-required + * errors are intentionally skipped — they have their own pathway and should + * be retried as soon as the user reconnects. + */ + private async markServerUnhealthy( + workspaceId: string, + serverId: string, + error: unknown + ): Promise { + if (error instanceof McpOauthAuthorizationRequiredError || error instanceof UnauthorizedError) { + return + } + try { + await this.cacheAdapter.set( + failureCacheKey(workspaceId, serverId), + FAILURE_CACHE_SENTINEL, + MCP_CLIENT_CONSTANTS.FAILURE_CACHE_TTL_MS + ) + } catch (err) { + logger.warn(`Failed to write failure cache for server ${serverId}:`, err) + } + } + + private async isServerUnhealthy(workspaceId: string, serverId: string): Promise { + try { + const entry = await this.cacheAdapter.get(failureCacheKey(workspaceId, serverId)) + return entry !== null + } catch { + return false + } + } + + private async clearServerFailure(workspaceId: string, serverId: string): Promise { + try { + await this.cacheAdapter.delete(failureCacheKey(workspaceId, serverId)) + } catch (err) { + logger.warn(`Failed to clear failure cache for server ${serverId}:`, err) + } + } + /** * Discover tools from all workspace servers */ @@ -423,6 +483,13 @@ class McpService { error ) } + // Short-circuit recently-failed servers so they don't block the fan-out. + if (await this.isServerUnhealthy(workspaceId, config.id)) { + logger.info( + `[${requestId}] Skipping recently-failed server ${config.name} (negative-cache hit)` + ) + return { kind: 'unhealthy' } + } } try { @@ -484,6 +551,7 @@ class McpService { logger.warn(`[${requestId}] Cache write failed for ${server.name}:`, err) ) ) + deferredSideEffects.push(this.clearServerFailure(workspaceId, server.id)) liveConnections.push({ resolvedConfig: outcome.resolvedConfig, resolvedIP: outcome.resolvedIP, @@ -509,12 +577,19 @@ class McpService { ) return } + if (outcome.kind === 'unhealthy') { + // Short-circuited via negative cache; status was already persisted on + // the original failure, no need to re-write it. + failedCount++ + return + } failedCount++ logger.warn( `[${requestId}] Failed to discover tools from server ${server.name}: ${outcome.message}` ) deferredSideEffects.push( - this.updateServerStatus(server.id, workspaceId, false, outcome.message) + this.updateServerStatus(server.id, workspaceId, false, outcome.message), + this.markServerUnhealthy(workspaceId, server.id, new Error(outcome.message)) ) }) @@ -580,15 +655,16 @@ class McpService { try { const tools = await client.listTools() logger.info(`[${requestId}] Discovered ${tools.length} tools from server ${config.name}`) - // Prime the per-server cache and reflect the successful connection on - // the row so the UI doesn't keep showing "Connect with OAuth" or stale - // disconnected/error state. + // Prime the per-server cache, clear any negative-cache marker, and + // reflect the successful connection on the row so the UI doesn't keep + // showing "Connect with OAuth" or stale disconnected/error state. await Promise.allSettled([ this.cacheAdapter .set(serverCacheKey(workspaceId, serverId), tools, this.cacheTimeout) .catch((err) => logger.warn(`[${requestId}] Cache write failed for ${config.name}:`, err) ), + this.clearServerFailure(workspaceId, serverId), this.updateServerStatus(serverId, workspaceId, true, undefined, tools.length), ]) return tools @@ -604,6 +680,10 @@ class McpService { await sleep(100) continue } + // Negative-cache the failure so the next call short-circuits instead of + // re-paying the listTools timeout. OAuth-required errors are skipped + // inside `markServerUnhealthy` so re-auth can retry immediately. + await this.markServerUnhealthy(workspaceId, serverId, error) throw error } } @@ -692,7 +772,10 @@ class McpService { .from(mcpServers) .where(eq(mcpServers.workspaceId, workspaceId)) await Promise.allSettled( - rows.map((r) => this.cacheAdapter.delete(serverCacheKey(workspaceId, r.id))) + rows.flatMap((r) => [ + this.cacheAdapter.delete(serverCacheKey(workspaceId, r.id)), + this.cacheAdapter.delete(failureCacheKey(workspaceId, r.id)), + ]) ) logger.debug(`Cleared MCP tool cache for workspace ${workspaceId} (${rows.length} servers)`) } else { diff --git a/apps/sim/lib/mcp/utils.ts b/apps/sim/lib/mcp/utils.ts index af16621567e..2c8e38e1bd7 100644 --- a/apps/sim/lib/mcp/utils.ts +++ b/apps/sim/lib/mcp/utils.ts @@ -46,8 +46,11 @@ export function sanitizeHeaders( export const MCP_CLIENT_CONSTANTS = { CLIENT_TIMEOUT: DEFAULT_EXECUTION_TIMEOUT_MS, AUTO_REFRESH_INTERVAL: 5 * 60 * 1000, - // Cap metadata calls so a slow upstream can't hang the UI for 60s+. - LIST_TOOLS_TIMEOUT_MS: 30_000, + // Cap metadata calls so a slow upstream can't hang the UI. + LIST_TOOLS_TIMEOUT_MS: 10_000, + // Negative-cache TTL for failed tool discovery: subsequent calls within this + // window short-circuit instead of re-paying the listTools timeout. + FAILURE_CACHE_TTL_MS: 30_000, } as const /** From fbcffed68305352d8a7bf2c442cc352dbfae08fe Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 21 May 2026 19:45:01 -0700 Subject: [PATCH 02/10] improvement(mcp): in-flight dedup + 2min negative TTL + no refetch on window focus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three small follow-ups on top of per-server tool queries: - Coalesce concurrent `discoverServerTools(userId, serverId, workspaceId)` calls into a single upstream `tools/list`. Races between OAuth-callback cache priming, post-OAuth UI refetch, and multi-tab loads no longer double-fetch the same server. - Bump negative-cache TTL from 30s to 2 minutes. Cleared on listChanged, OAuth completion, manual refresh, and the next successful discovery, so this floor only matters for genuinely dead servers — drops their floor traffic by 4x. - Disable `refetchOnWindowFocus` on per-server tool queries. listChanged SSE + mutation invalidations already cover real schema changes; alt-tab no longer triggers N parallel `tools/list` calls. --- apps/sim/hooks/queries/mcp.ts | 5 +++++ apps/sim/lib/mcp/service.ts | 25 +++++++++++++++++++++++++ apps/sim/lib/mcp/utils.ts | 6 ++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/apps/sim/hooks/queries/mcp.ts b/apps/sim/hooks/queries/mcp.ts index 6be131dee7e..3e56b6d539f 100644 --- a/apps/sim/hooks/queries/mcp.ts +++ b/apps/sim/hooks/queries/mcp.ts @@ -134,6 +134,10 @@ export function useMcpServerTools(workspaceId: string, serverId?: string) { enabled: !!workspaceId && !!serverId, retry: false, staleTime: 30 * 1000, + // Tool schemas rarely change between tab switches; `listChanged` SSE events + // and mutation invalidations already cover real updates. Skip the alt-tab + // refetch storm. + refetchOnWindowFocus: false, }) } @@ -157,6 +161,7 @@ export function useMcpToolsQuery(workspaceId: string) { enabled: !!workspaceId, retry: false, staleTime: 30 * 1000, + refetchOnWindowFocus: false, })), }) diff --git a/apps/sim/lib/mcp/service.ts b/apps/sim/lib/mcp/service.ts index 56bf2626a12..486d1236fec 100644 --- a/apps/sim/lib/mcp/service.ts +++ b/apps/sim/lib/mcp/service.ts @@ -75,6 +75,10 @@ class McpService { private cacheAdapter: McpCacheStorageAdapter private readonly cacheTimeout = MCP_CONSTANTS.CACHE_TIMEOUT private unsubscribeConnectionManager?: () => void + // Coalesce concurrent `discoverServerTools` calls for the same target into a + // single upstream `tools/list`. Keyed on (workspaceId, serverId, userId) + // because OAuth-scoped tokens (and therefore visible tools) can vary per user. + private inflightServerDiscovery = new Map>() constructor() { this.cacheAdapter = createMcpCacheAdapter() @@ -625,11 +629,32 @@ class McpService { /** * Discover tools from a specific server with retry logic for session errors. * Retries once on session-related errors (400, 404, session ID issues). + * + * Concurrent callers for the same `(workspaceId, serverId, userId)` share a + * single in-flight upstream request — important when an OAuth callback + * primes the cache while a UI refetch races against it, or when multiple + * tabs land on the same workspace at the same moment. */ async discoverServerTools( userId: string, serverId: string, workspaceId: string + ): Promise { + const inflightKey = `${workspaceId}:${serverId}:${userId}` + const existing = this.inflightServerDiscovery.get(inflightKey) + if (existing) return existing + + const promise = this.discoverServerToolsImpl(userId, serverId, workspaceId).finally(() => { + this.inflightServerDiscovery.delete(inflightKey) + }) + this.inflightServerDiscovery.set(inflightKey, promise) + return promise + } + + private async discoverServerToolsImpl( + userId: string, + serverId: string, + workspaceId: string ): Promise { const requestId = generateRequestId() const maxRetries = 2 diff --git a/apps/sim/lib/mcp/utils.ts b/apps/sim/lib/mcp/utils.ts index 2c8e38e1bd7..4c3e129e2de 100644 --- a/apps/sim/lib/mcp/utils.ts +++ b/apps/sim/lib/mcp/utils.ts @@ -49,8 +49,10 @@ export const MCP_CLIENT_CONSTANTS = { // Cap metadata calls so a slow upstream can't hang the UI. LIST_TOOLS_TIMEOUT_MS: 10_000, // Negative-cache TTL for failed tool discovery: subsequent calls within this - // window short-circuit instead of re-paying the listTools timeout. - FAILURE_CACHE_TTL_MS: 30_000, + // window short-circuit instead of re-paying the listTools timeout. Cleared on + // listChanged, OAuth completion, manual refresh, or the next successful + // discovery — so 2 minutes is the floor only for genuinely dead servers. + FAILURE_CACHE_TTL_MS: 120_000, } as const /** From 397dd64afca44f0eaa6b6963fda7058bd13b1965 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 21 May 2026 19:55:19 -0700 Subject: [PATCH 03/10] fix(mcp): address bugbot/greptile review on per-server tool discovery - Workspace-scoped `mcpKeys.serverToolsWorkspace(workspaceId)` prefix for bulk invalidations (create-server, refresh-all, SSE workspace fallback, OAuth fallback). The previous `mcpKeys.serverTools()` prefix was global and invalidated every workspace's tools cache. - `useMcpToolsQuery` folds `useMcpServers().isLoading` into the aggregate `isLoading` so mounting no longer flashes an "empty tools" state during the servers-list fetch. Aggregate `error` is suppressed when any per-server query already returned data so one slow server can't blank out the others. - `useForceRefreshMcpTools` invalidates the per-server query keys of servers whose force refresh failed, so stale tools don't linger. - `DiscoveryOutcome` error variant carries the original error, restoring the OAuth-exemption check that `getErrorMessage(...)` previously erased. - `discoverServerTools(userId, serverId, workspaceId, forceRefresh = false)` now consults the positive + negative cache by default. Per-server React Query refetches hit the cache instead of re-paying the listTools timeout; callers that explicitly bypass cache (refresh route, OAuth callback, bulk POST refresh) pass `forceRefresh: true`. Negative-cache hits throw a typed `McpConnectionError` so the route layer can surface a fast 503. --- apps/sim/app/api/mcp/oauth/callback/route.ts | 6 +- .../app/api/mcp/servers/[id]/refresh/route.ts | 7 ++- apps/sim/app/api/mcp/tools/discover/route.ts | 4 +- apps/sim/hooks/mcp/use-mcp-oauth-popup.ts | 4 +- apps/sim/hooks/mcp/use-mcp-tools.ts | 2 +- apps/sim/hooks/queries/mcp.ts | 55 +++++++++++++----- apps/sim/lib/mcp/service.test.ts | 13 ++++- apps/sim/lib/mcp/service.ts | 57 ++++++++++++++++--- 8 files changed, 116 insertions(+), 32 deletions(-) diff --git a/apps/sim/app/api/mcp/oauth/callback/route.ts b/apps/sim/app/api/mcp/oauth/callback/route.ts index 15b115c0b7c..2c38301249f 100644 --- a/apps/sim/app/api/mcp/oauth/callback/route.ts +++ b/apps/sim/app/api/mcp/oauth/callback/route.ts @@ -168,8 +168,10 @@ export const GET = withRouteHandler(async (request: NextRequest) => { try { // discoverServerTools writes the result to this server's cache so the UI's - // immediate refetch hits it instead of re-fetching live. - await mcpService.discoverServerTools(session.user.id, server.id, server.workspaceId) + // immediate refetch hits it instead of re-fetching live. Bypass any + // stale positive/negative cache from before re-auth so we actually + // verify the new tokens. + await mcpService.discoverServerTools(session.user.id, server.id, server.workspaceId, true) } catch (e) { logger.warn('Post-auth tools refresh failed', toError(e).message) } diff --git a/apps/sim/app/api/mcp/servers/[id]/refresh/route.ts b/apps/sim/app/api/mcp/servers/[id]/refresh/route.ts index 7bab3fade1f..9f216ebf959 100644 --- a/apps/sim/app/api/mcp/servers/[id]/refresh/route.ts +++ b/apps/sim/app/api/mcp/servers/[id]/refresh/route.ts @@ -197,7 +197,12 @@ export const POST = withRouteHandler( } try { - discoveredTools = await mcpService.discoverServerTools(userId, serverId, workspaceId) + discoveredTools = await mcpService.discoverServerTools( + userId, + serverId, + workspaceId, + true + ) connectionStatus = 'connected' toolCount = discoveredTools.length logger.info(`[${requestId}] Discovered ${toolCount} tools from server ${serverId}`) diff --git a/apps/sim/app/api/mcp/tools/discover/route.ts b/apps/sim/app/api/mcp/tools/discover/route.ts index b125fa7ff2b..612788b4875 100644 --- a/apps/sim/app/api/mcp/tools/discover/route.ts +++ b/apps/sim/app/api/mcp/tools/discover/route.ts @@ -28,7 +28,7 @@ export const GET = withRouteHandler( logger.info(`[${requestId}] Discovering MCP tools`, { serverId, workspaceId, forceRefresh }) const tools = serverId - ? await mcpService.discoverServerTools(userId, serverId, workspaceId) + ? await mcpService.discoverServerTools(userId, serverId, workspaceId, forceRefresh) : await mcpService.discoverTools(userId, workspaceId, forceRefresh) const byServer: Record = {} @@ -76,7 +76,7 @@ export const POST = withRouteHandler( const results = await Promise.allSettled( serverIds.map(async (serverId: string) => { - const tools = await mcpService.discoverServerTools(userId, serverId, workspaceId) + const tools = await mcpService.discoverServerTools(userId, serverId, workspaceId, true) return { serverId, toolCount: tools.length } }) ) diff --git a/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts b/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts index 1bbe1c08a4e..58cc53e88a0 100644 --- a/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts +++ b/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts @@ -87,7 +87,9 @@ export function useMcpOauthPopup({ workspaceId }: UseMcpOauthPopupProps) { queryKey: mcpKeys.serverToolsList(workspaceId, data.serverId), }) } else { - queryClient.invalidateQueries({ queryKey: mcpKeys.serverTools() }) + queryClient.invalidateQueries({ + queryKey: mcpKeys.serverToolsWorkspace(workspaceId), + }) } queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(workspaceId) }) toast.success('Server authorized') diff --git a/apps/sim/hooks/mcp/use-mcp-tools.ts b/apps/sim/hooks/mcp/use-mcp-tools.ts index 7a7d12b28ff..d200820e822 100644 --- a/apps/sim/hooks/mcp/use-mcp-tools.ts +++ b/apps/sim/hooks/mcp/use-mcp-tools.ts @@ -65,7 +65,7 @@ export function useMcpTools(workspaceId: string): UseMcpToolsResult { logger.info('Refreshing MCP tools', { forceRefresh, workspaceId }) await queryClient.invalidateQueries({ - queryKey: mcpKeys.serverTools(), + queryKey: mcpKeys.serverToolsWorkspace(workspaceId), refetchType: forceRefresh ? 'active' : 'all', }) }, diff --git a/apps/sim/hooks/queries/mcp.ts b/apps/sim/hooks/queries/mcp.ts index 3e56b6d539f..1dee97b9ca3 100644 --- a/apps/sim/hooks/queries/mcp.ts +++ b/apps/sim/hooks/queries/mcp.ts @@ -48,8 +48,13 @@ export const mcpKeys = { tools: () => [...mcpKeys.all, 'tools'] as const, toolsList: (workspaceId?: string) => [...mcpKeys.tools(), workspaceId ?? ''] as const, serverTools: () => [...mcpKeys.all, 'serverTools'] as const, + // Workspace-scoped prefix so bulk invalidations (refresh-all, create-server) + // only touch the current workspace's per-server entries — not every + // workspace cached in the QueryClient. + serverToolsWorkspace: (workspaceId?: string) => + [...mcpKeys.serverTools(), workspaceId ?? ''] as const, serverToolsList: (workspaceId?: string, serverId?: string) => - [...mcpKeys.serverTools(), workspaceId ?? '', serverId ?? ''] as const, + [...mcpKeys.serverToolsWorkspace(workspaceId), serverId ?? ''] as const, storedTools: () => [...mcpKeys.all, 'storedTools'] as const, storedToolsList: (workspaceId?: string) => [...mcpKeys.storedTools(), workspaceId ?? ''] as const, allowedDomains: () => [...mcpKeys.all, 'allowedDomains'] as const, @@ -149,7 +154,7 @@ export function useMcpServerTools(workspaceId: string, serverId?: string) { * fast servers populate immediately. */ export function useMcpToolsQuery(workspaceId: string) { - const { data: servers } = useMcpServers(workspaceId) + const { data: servers, isLoading: serversLoading } = useMcpServers(workspaceId) const serverIds = useMemo(() => (servers ? servers.map((s) => s.id).sort() : []), [servers]) @@ -168,27 +173,31 @@ export function useMcpToolsQuery(workspaceId: string) { return useMemo(() => { const tools: McpTool[] = [] let hasData = false - let isLoading = false + let anyServerLoading = false let firstError: Error | null = null for (const result of results) { if (result.data) { tools.push(...result.data) hasData = true } - if (result.isLoading) isLoading = true + if (result.isLoading) anyServerLoading = true if (!firstError && result.error instanceof Error) firstError = result.error } return { data: tools, - // Match the prior semantics: "loading" means we have nothing to show yet. - // Once any server has returned, the UI can render that and let slow - // neighbors fill in incrementally without keeping the spinner up. - isLoading: isLoading && !hasData, - isFetching: results.some((r) => r.isFetching), - error: firstError, + // "Loading" means we have nothing to render yet — including the gap + // between mount and the server-list arriving (otherwise consumers flash + // an empty state). Once any per-server query has returned, we drop the + // spinner and let slow neighbors fill in incrementally. + isLoading: (serversLoading || anyServerLoading) && !hasData, + isFetching: serversLoading || results.some((r) => r.isFetching), + // Only surface an aggregate error when no server returned anything — + // one slow/dead server should not blank out tools from the others. + // Per-server errors are still exposed via `perServer`. + error: hasData ? null : firstError, perServer: results, } - }, [results]) + }, [results, serversLoading]) } export function useForceRefreshMcpTools() { @@ -207,13 +216,27 @@ export function useForceRefreshMcpTools() { return tools }) ) + // Invalidate the per-server keys of any server that failed so stale tools + // don't linger after a force-refresh — React Query will refetch and the + // negative cache (set server-side) will make that retry fail fast. + results.forEach((result, index) => { + if (result.status === 'rejected') { + const failedServer = servers[index] + if (failedServer) { + queryClient.invalidateQueries({ + queryKey: mcpKeys.serverToolsList(workspaceId, failedServer.id), + }) + } + } + }) return results .filter((r): r is PromiseFulfilledResult => r.status === 'fulfilled') .flatMap((r) => r.value) }, onSettled: (_data, _error, workspaceId) => { - // Per-server caches were already populated inside `mutationFn` via - // `setQueryData`, so we only need to refresh the dependent views. + // Successful per-server caches were already populated inside `mutationFn` + // via `setQueryData`; failed ones were invalidated above. Just refresh + // the dependent views. queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(workspaceId) }) queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(workspaceId) }) }, @@ -263,7 +286,9 @@ export function useCreateMcpServer() { }, onSettled: (_data, _error, variables) => { queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(variables.workspaceId) }) - queryClient.invalidateQueries({ queryKey: mcpKeys.serverTools() }) + queryClient.invalidateQueries({ + queryKey: mcpKeys.serverToolsWorkspace(variables.workspaceId), + }) }, }) } @@ -486,7 +511,7 @@ export function useMcpToolsEvents(workspaceId: string) { queryKey: mcpKeys.serverToolsList(workspaceId, serverId), }) } else { - queryClient.invalidateQueries({ queryKey: mcpKeys.serverTools() }) + queryClient.invalidateQueries({ queryKey: mcpKeys.serverToolsWorkspace(workspaceId) }) } queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(workspaceId) }) queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(workspaceId) }) diff --git a/apps/sim/lib/mcp/service.test.ts b/apps/sim/lib/mcp/service.test.ts index 93a209d513a..11a2631d34e 100644 --- a/apps/sim/lib/mcp/service.test.ts +++ b/apps/sim/lib/mcp/service.test.ts @@ -291,11 +291,18 @@ describe('McpService.discoverTools per-server caching', () => { 'Request timed out' ) - // Following the failure, the negative cache would short-circuit discoverTools. - // Reconnecting the server (e.g. after OAuth) should bring it back to live. + // After the failure the negative cache is set, so the next default call + // short-circuits without re-paying the listTools timeout. mockListTools.mockClear() + await expect(mcpService.discoverServerTools(USER_ID, 'mcp-a', WORKSPACE_ID)).rejects.toThrow( + 'cooldown' + ) + expect(mockListTools).not.toHaveBeenCalled() + + // Reconnecting via the explicit-refresh path (refresh button / OAuth + // callback) bypasses both caches and brings the server back to live. mockListTools.mockResolvedValueOnce([tool('a1', 'mcp-a')]) - const tools = await mcpService.discoverServerTools(USER_ID, 'mcp-a', WORKSPACE_ID) + const tools = await mcpService.discoverServerTools(USER_ID, 'mcp-a', WORKSPACE_ID, true) expect(tools.map((t) => t.name)).toEqual(['a1']) // discoverTools now sees the cleared negative cache + primed positive cache. diff --git a/apps/sim/lib/mcp/service.ts b/apps/sim/lib/mcp/service.ts index 486d1236fec..32f6ddcfd0a 100644 --- a/apps/sim/lib/mcp/service.ts +++ b/apps/sim/lib/mcp/service.ts @@ -32,6 +32,7 @@ import { type McpCacheStorageAdapter, } from '@/lib/mcp/storage' import { + McpConnectionError, McpOauthAuthorizationRequiredError, type McpServerConfig, type McpServerStatusConfig, @@ -69,7 +70,9 @@ type DiscoveryOutcome = } | { kind: 'oauth-pending' } | { kind: 'unhealthy' } - | { kind: 'error'; message: string } + // Carries the original error so `markServerUnhealthy` can apply its + // `instanceof` exemption — `getErrorMessage(...)` erases type info. + | { kind: 'error'; message: string; originalError: unknown } class McpService { private cacheAdapter: McpCacheStorageAdapter @@ -519,7 +522,11 @@ class McpService { ) { return { kind: 'oauth-pending' } } - return { kind: 'error', message: getErrorMessage(error, 'Unknown error') } + return { + kind: 'error', + message: getErrorMessage(error, 'Unknown error'), + originalError: error, + } } }) ) @@ -593,7 +600,7 @@ class McpService { ) deferredSideEffects.push( this.updateServerStatus(server.id, workspaceId, false, outcome.message), - this.markServerUnhealthy(workspaceId, server.id, new Error(outcome.message)) + this.markServerUnhealthy(workspaceId, server.id, outcome.originalError) ) }) @@ -630,6 +637,12 @@ class McpService { * Discover tools from a specific server with retry logic for session errors. * Retries once on session-related errors (400, 404, session ID issues). * + * By default consults the positive + negative cache before going live, so + * a React Query refetch for a healthy server is served from cache and a + * recently-failed server short-circuits instead of re-paying the listTools + * timeout. Pass `forceRefresh: true` from explicit user-initiated paths + * (refresh button, OAuth callback) to bypass both caches. + * * Concurrent callers for the same `(workspaceId, serverId, userId)` share a * single in-flight upstream request — important when an OAuth callback * primes the cache while a UI refetch races against it, or when multiple @@ -638,13 +651,19 @@ class McpService { async discoverServerTools( userId: string, serverId: string, - workspaceId: string + workspaceId: string, + forceRefresh = false ): Promise { - const inflightKey = `${workspaceId}:${serverId}:${userId}` + const inflightKey = `${workspaceId}:${serverId}:${userId}:${forceRefresh ? 'force' : 'cache'}` const existing = this.inflightServerDiscovery.get(inflightKey) if (existing) return existing - const promise = this.discoverServerToolsImpl(userId, serverId, workspaceId).finally(() => { + const promise = this.discoverServerToolsImpl( + userId, + serverId, + workspaceId, + forceRefresh + ).finally(() => { this.inflightServerDiscovery.delete(inflightKey) }) this.inflightServerDiscovery.set(inflightKey, promise) @@ -654,11 +673,35 @@ class McpService { private async discoverServerToolsImpl( userId: string, serverId: string, - workspaceId: string + workspaceId: string, + forceRefresh: boolean ): Promise { const requestId = generateRequestId() const maxRetries = 2 + if (!forceRefresh) { + // Positive cache hit — same payload the aggregate fan-out would return, + // no upstream traffic. + try { + const cached = await this.cacheAdapter.get(serverCacheKey(workspaceId, serverId)) + if (cached) { + logger.debug(`[${requestId}] Cache hit for server ${serverId}`) + return cached.tools + } + } catch (error) { + logger.warn(`[${requestId}] Cache read failed for server ${serverId}:`, error) + } + // Negative cache hit — fail fast with a typed error the caller can map + // to a 503-style response, rather than waiting out the listTools timeout. + if (await this.isServerUnhealthy(workspaceId, serverId)) { + logger.info(`[${requestId}] Skipping recently-failed server ${serverId} (negative-cache)`) + throw new McpConnectionError( + 'Server recently failed and is in cooldown — try again shortly.', + serverId + ) + } + } + for (let attempt = 0; attempt < maxRetries; attempt++) { try { logger.info( From 8713f0da844f49537149d00b33e9ecb421d89486 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 21 May 2026 19:56:23 -0700 Subject: [PATCH 04/10] update icons --- apps/sim/components/icons.tsx | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/apps/sim/components/icons.tsx b/apps/sim/components/icons.tsx index cd326d6220a..5f79d8ad05c 100644 --- a/apps/sim/components/icons.tsx +++ b/apps/sim/components/icons.tsx @@ -524,7 +524,7 @@ export function HubspotIcon(props: SVGProps) { xmlns='http://www.w3.org/2000/svg' fill='currentColor' > - + ) } @@ -2284,23 +2284,26 @@ export function ElevenLabsIcon(props: SVGProps) { } export function FindymailIcon(props: SVGProps) { + const id = useId() + const gradient0 = `findymail_paint0_${id}` + const gradient1 = `findymail_paint1_${id}` return ( ) { ) => ( ) export const ResendIcon = (props: SVGProps) => ( - + Date: Thu, 21 May 2026 20:03:44 -0700 Subject: [PATCH 05/10] chore(mcp): remove dead query keys and trim verbose comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop unused `mcpKeys.tools()` / `mcpKeys.toolsList()` — replaced by per-server keys, no remaining callers. - Trim narrative comments to keep only the non-obvious "why" notes. --- apps/sim/app/api/mcp/oauth/callback/route.ts | 5 +- apps/sim/hooks/mcp/use-mcp-oauth-popup.ts | 2 - apps/sim/hooks/queries/mcp.ts | 43 ++++------------- apps/sim/lib/mcp/service.ts | 50 +++++--------------- apps/sim/lib/mcp/utils.ts | 5 -- 5 files changed, 20 insertions(+), 85 deletions(-) diff --git a/apps/sim/app/api/mcp/oauth/callback/route.ts b/apps/sim/app/api/mcp/oauth/callback/route.ts index 2c38301249f..08171fbc97c 100644 --- a/apps/sim/app/api/mcp/oauth/callback/route.ts +++ b/apps/sim/app/api/mcp/oauth/callback/route.ts @@ -167,10 +167,7 @@ export const GET = withRouteHandler(async (request: NextRequest) => { } try { - // discoverServerTools writes the result to this server's cache so the UI's - // immediate refetch hits it instead of re-fetching live. Bypass any - // stale positive/negative cache from before re-auth so we actually - // verify the new tokens. + // forceRefresh: skip any stale cache from before re-auth. await mcpService.discoverServerTools(session.user.id, server.id, server.workspaceId, true) } catch (e) { logger.warn('Post-auth tools refresh failed', toError(e).message) diff --git a/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts b/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts index 58cc53e88a0..27bd0286f94 100644 --- a/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts +++ b/apps/sim/hooks/mcp/use-mcp-oauth-popup.ts @@ -81,8 +81,6 @@ export function useMcpOauthPopup({ workspaceId }: UseMcpOauthPopupProps) { if (data.ok) { queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(workspaceId) }) if (data.serverId) { - // Only the freshly-authorized server needs a refetch; every other - // server's cached tools stay warm so a slow neighbor can't gate the UI. queryClient.invalidateQueries({ queryKey: mcpKeys.serverToolsList(workspaceId, data.serverId), }) diff --git a/apps/sim/hooks/queries/mcp.ts b/apps/sim/hooks/queries/mcp.ts index 1dee97b9ca3..23626f5507f 100644 --- a/apps/sim/hooks/queries/mcp.ts +++ b/apps/sim/hooks/queries/mcp.ts @@ -45,12 +45,7 @@ export const mcpKeys = { all: ['mcp'] as const, servers: () => [...mcpKeys.all, 'servers'] as const, serversList: (workspaceId?: string) => [...mcpKeys.servers(), workspaceId ?? ''] as const, - tools: () => [...mcpKeys.all, 'tools'] as const, - toolsList: (workspaceId?: string) => [...mcpKeys.tools(), workspaceId ?? ''] as const, serverTools: () => [...mcpKeys.all, 'serverTools'] as const, - // Workspace-scoped prefix so bulk invalidations (refresh-all, create-server) - // only touch the current workspace's per-server entries — not every - // workspace cached in the QueryClient. serverToolsWorkspace: (workspaceId?: string) => [...mcpKeys.serverTools(), workspaceId ?? ''] as const, serverToolsList: (workspaceId?: string, serverId?: string) => @@ -127,11 +122,6 @@ async function fetchMcpTools( } } -/** - * Per-server tools query. Each server has its own React Query entry, so a slow - * or hung server can never block another server's load — same model Cursor and - * Claude Code use for remote MCP. - */ export function useMcpServerTools(workspaceId: string, serverId?: string) { return useQuery({ queryKey: mcpKeys.serverToolsList(workspaceId, serverId), @@ -139,19 +129,13 @@ export function useMcpServerTools(workspaceId: string, serverId?: string) { enabled: !!workspaceId && !!serverId, retry: false, staleTime: 30 * 1000, - // Tool schemas rarely change between tab switches; `listChanged` SSE events - // and mutation invalidations already cover real updates. Skip the alt-tab - // refetch storm. refetchOnWindowFocus: false, }) } /** - * Workspace-level aggregate, derived from N parallel per-server queries via - * `useQueries`. Public shape stays compatible with the prior workspace-keyed - * query (flat `McpTool[]`, single `isLoading`, single error) so existing - * consumers don't change. A slow server only flips its own per-server state; - * fast servers populate immediately. + * Workspace aggregate derived from N parallel per-server queries via + * `useQueries`. One slow server cannot block the others. */ export function useMcpToolsQuery(workspaceId: string) { const { data: servers, isLoading: serversLoading } = useMcpServers(workspaceId) @@ -185,15 +169,12 @@ export function useMcpToolsQuery(workspaceId: string) { } return { data: tools, - // "Loading" means we have nothing to render yet — including the gap - // between mount and the server-list arriving (otherwise consumers flash - // an empty state). Once any per-server query has returned, we drop the - // spinner and let slow neighbors fill in incrementally. + // Stay loading until we have something to render; once any server + // returned, drop the spinner and let slow neighbors fill in. isLoading: (serversLoading || anyServerLoading) && !hasData, isFetching: serversLoading || results.some((r) => r.isFetching), - // Only surface an aggregate error when no server returned anything — - // one slow/dead server should not blank out tools from the others. - // Per-server errors are still exposed via `perServer`. + // One dead server must not blank out the workspace — only surface the + // aggregate error when nothing rendered. Per-server errors live in `perServer`. error: hasData ? null : firstError, perServer: results, } @@ -205,9 +186,6 @@ export function useForceRefreshMcpTools() { return useMutation({ mutationFn: async (workspaceId: string) => { - // Fetch each server's tools in parallel with `refresh=true`, populating - // the per-server query cache directly. A slow server only blocks its own - // card — fast servers light up as soon as they return. const servers = queryClient.getQueryData(mcpKeys.serversList(workspaceId)) ?? [] const results = await Promise.allSettled( servers.map(async (server) => { @@ -216,9 +194,7 @@ export function useForceRefreshMcpTools() { return tools }) ) - // Invalidate the per-server keys of any server that failed so stale tools - // don't linger after a force-refresh — React Query will refetch and the - // negative cache (set server-side) will make that retry fail fast. + // Failed servers: invalidate so React Query retries via the server-side negative cache. results.forEach((result, index) => { if (result.status === 'rejected') { const failedServer = servers[index] @@ -234,9 +210,6 @@ export function useForceRefreshMcpTools() { .flatMap((r) => r.value) }, onSettled: (_data, _error, workspaceId) => { - // Successful per-server caches were already populated inside `mutationFn` - // via `setQueryData`; failed ones were invalidated above. Just refresh - // the dependent views. queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(workspaceId) }) queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(workspaceId) }) }, @@ -529,7 +502,7 @@ export function useMcpToolsEvents(workspaceId: string) { const parsed = JSON.parse((e as MessageEvent).data) as { serverId?: string } serverId = parsed.serverId } catch { - // Older event payload or non-JSON — fall back to workspace-wide invalidation. + // Non-JSON payload → workspace-wide fallback. } invalidate(serverId) }) diff --git a/apps/sim/lib/mcp/service.ts b/apps/sim/lib/mcp/service.ts index 32f6ddcfd0a..adfb20c702c 100644 --- a/apps/sim/lib/mcp/service.ts +++ b/apps/sim/lib/mcp/service.ts @@ -46,14 +46,10 @@ import { MCP_CLIENT_CONSTANTS, MCP_CONSTANTS } from '@/lib/mcp/utils' const logger = createLogger('McpService') -// Per-server keys so one slow server can't invalidate another's cached tools. function serverCacheKey(workspaceId: string, serverId: string): string { return `workspace:${workspaceId}:server:${serverId}` } -// Parallel key for negative caching: when a server's listTools fails, we store -// a short-lived marker here so the next discoverTools call skips the live fetch -// instead of re-paying the listTools timeout. function failureCacheKey(workspaceId: string, serverId: string): string { return `workspace:${workspaceId}:server:${serverId}:failure` } @@ -70,17 +66,15 @@ type DiscoveryOutcome = } | { kind: 'oauth-pending' } | { kind: 'unhealthy' } - // Carries the original error so `markServerUnhealthy` can apply its - // `instanceof` exemption — `getErrorMessage(...)` erases type info. + // originalError preserves the type so markServerUnhealthy's instanceof + // exemption survives the getErrorMessage call. | { kind: 'error'; message: string; originalError: unknown } class McpService { private cacheAdapter: McpCacheStorageAdapter private readonly cacheTimeout = MCP_CONSTANTS.CACHE_TIMEOUT private unsubscribeConnectionManager?: () => void - // Coalesce concurrent `discoverServerTools` calls for the same target into a - // single upstream `tools/list`. Keyed on (workspaceId, serverId, userId) - // because OAuth-scoped tokens (and therefore visible tools) can vary per user. + // Keyed on (workspaceId, serverId, userId) — OAuth-scoped tokens vary per user. private inflightServerDiscovery = new Map>() constructor() { @@ -415,10 +409,8 @@ class McpService { } /** - * Record a server as unhealthy so subsequent discovery calls within the TTL - * short-circuit instead of re-paying the listTools timeout. OAuth-required - * errors are intentionally skipped — they have their own pathway and should - * be retried as soon as the user reconnects. + * Negative-cache a discovery failure. OAuth-required errors are exempt so + * reconnects retry immediately. */ private async markServerUnhealthy( workspaceId: string, @@ -490,7 +482,6 @@ class McpService { error ) } - // Short-circuit recently-failed servers so they don't block the fan-out. if (await this.isServerUnhealthy(workspaceId, config.id)) { logger.info( `[${requestId}] Skipping recently-failed server ${config.name} (negative-cache hit)` @@ -589,8 +580,7 @@ class McpService { return } if (outcome.kind === 'unhealthy') { - // Short-circuited via negative cache; status was already persisted on - // the original failure, no need to re-write it. + // Status was persisted on the original failure; nothing to re-write. failedCount++ return } @@ -634,19 +624,11 @@ class McpService { } /** - * Discover tools from a specific server with retry logic for session errors. - * Retries once on session-related errors (400, 404, session ID issues). - * - * By default consults the positive + negative cache before going live, so - * a React Query refetch for a healthy server is served from cache and a - * recently-failed server short-circuits instead of re-paying the listTools - * timeout. Pass `forceRefresh: true` from explicit user-initiated paths - * (refresh button, OAuth callback) to bypass both caches. - * - * Concurrent callers for the same `(workspaceId, serverId, userId)` share a - * single in-flight upstream request — important when an OAuth callback - * primes the cache while a UI refetch races against it, or when multiple - * tabs land on the same workspace at the same moment. + * Discover tools from one server. Cache-aside by default; pass + * `forceRefresh: true` from explicit-refresh paths (refresh button, OAuth + * callback) to bypass both positive and negative caches. Concurrent callers + * for the same `(workspaceId, serverId, userId, forceRefresh)` share one + * upstream request. */ async discoverServerTools( userId: string, @@ -680,8 +662,6 @@ class McpService { const maxRetries = 2 if (!forceRefresh) { - // Positive cache hit — same payload the aggregate fan-out would return, - // no upstream traffic. try { const cached = await this.cacheAdapter.get(serverCacheKey(workspaceId, serverId)) if (cached) { @@ -691,8 +671,6 @@ class McpService { } catch (error) { logger.warn(`[${requestId}] Cache read failed for server ${serverId}:`, error) } - // Negative cache hit — fail fast with a typed error the caller can map - // to a 503-style response, rather than waiting out the listTools timeout. if (await this.isServerUnhealthy(workspaceId, serverId)) { logger.info(`[${requestId}] Skipping recently-failed server ${serverId} (negative-cache)`) throw new McpConnectionError( @@ -723,9 +701,6 @@ class McpService { try { const tools = await client.listTools() logger.info(`[${requestId}] Discovered ${tools.length} tools from server ${config.name}`) - // Prime the per-server cache, clear any negative-cache marker, and - // reflect the successful connection on the row so the UI doesn't keep - // showing "Connect with OAuth" or stale disconnected/error state. await Promise.allSettled([ this.cacheAdapter .set(serverCacheKey(workspaceId, serverId), tools, this.cacheTimeout) @@ -748,9 +723,6 @@ class McpService { await sleep(100) continue } - // Negative-cache the failure so the next call short-circuits instead of - // re-paying the listTools timeout. OAuth-required errors are skipped - // inside `markServerUnhealthy` so re-auth can retry immediately. await this.markServerUnhealthy(workspaceId, serverId, error) throw error } diff --git a/apps/sim/lib/mcp/utils.ts b/apps/sim/lib/mcp/utils.ts index 4c3e129e2de..db5b2ac5a99 100644 --- a/apps/sim/lib/mcp/utils.ts +++ b/apps/sim/lib/mcp/utils.ts @@ -46,12 +46,7 @@ export function sanitizeHeaders( export const MCP_CLIENT_CONSTANTS = { CLIENT_TIMEOUT: DEFAULT_EXECUTION_TIMEOUT_MS, AUTO_REFRESH_INTERVAL: 5 * 60 * 1000, - // Cap metadata calls so a slow upstream can't hang the UI. LIST_TOOLS_TIMEOUT_MS: 10_000, - // Negative-cache TTL for failed tool discovery: subsequent calls within this - // window short-circuit instead of re-paying the listTools timeout. Cleared on - // listChanged, OAuth completion, manual refresh, or the next successful - // discovery — so 2 minutes is the floor only for genuinely dead servers. FAILURE_CACHE_TTL_MS: 120_000, } as const From fedf75eedd85f4094e6fa4706bb29d487e28d1cc Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 21 May 2026 20:26:00 -0700 Subject: [PATCH 06/10] fix(mcp): map negative-cache cooldown error to HTTP 503 `McpConnectionError` thrown when a server is in cooldown previously fell through `categorizeError` to a generic 500. Cooldown is a transient-unavailability condition, so route it to 503. --- apps/sim/lib/mcp/utils.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/sim/lib/mcp/utils.ts b/apps/sim/lib/mcp/utils.ts index db5b2ac5a99..6364cafb111 100644 --- a/apps/sim/lib/mcp/utils.ts +++ b/apps/sim/lib/mcp/utils.ts @@ -143,6 +143,10 @@ export function categorizeError(error: unknown): { message: string; status: numb return { message: 'Request timed out', status: 408 } } + if (msg.includes('cooldown')) { + return { message: 'Server temporarily unavailable', status: 503 } + } + if (msg.includes('not found') || msg.includes('not accessible')) { return { message: 'Resource not found', status: 404 } } From aa2d7562b11c740087bb1e816bf243acb90065f4 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 21 May 2026 20:46:05 -0700 Subject: [PATCH 07/10] =?UTF-8?q?test(mcp):=20cover=20cooldown=20error=20?= =?UTF-8?q?=E2=86=92=20503=20categorization?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/sim/lib/mcp/utils.test.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/apps/sim/lib/mcp/utils.test.ts b/apps/sim/lib/mcp/utils.test.ts index 3a23c5bce26..29aded1358e 100644 --- a/apps/sim/lib/mcp/utils.test.ts +++ b/apps/sim/lib/mcp/utils.test.ts @@ -284,6 +284,13 @@ describe('categorizeError', () => { expect(result.message).toBe('Invalid request parameters') }) + it.concurrent('returns 503 for cooldown errors', () => { + const error = new Error('Server recently failed and is in cooldown — try again shortly.') + const result = categorizeError(error) + expect(result.status).toBe(503) + expect(result.message).toBe('Server temporarily unavailable') + }) + it.concurrent('returns 500 for generic errors', () => { const error = new Error('Something went wrong') const result = categorizeError(error) From a91546e1457e511e4e8ed8ecefdafee65a506571 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 21 May 2026 21:56:10 -0700 Subject: [PATCH 08/10] fix(mcp): address second-round bugbot review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - useMcpToolsQuery serverIds: filter on enabled + workspaceId match. Disabled rows no longer trigger discover calls that get negative-cached, and keepPreviousData on useMcpServers no longer races a workspace switch into cross-workspace discover requests. - Aggregate skips per-server data when that server's latest refetch errored, so a broken server's last-known tools no longer linger in the workspace view while its card shows an error. - discoverServerTools failure path drops the positive cache alongside writing the negative-cache marker. A cache-respecting follow-up now fails fast via cooldown instead of returning stale tools from a now-broken server. - useMcpTools.refreshTools drops the dead forceRefresh param — the per-server queryFn always sends refresh=false, so the flag was never effective. Callers wanting cache-bypass should use useForceRefreshMcpTools. --- apps/sim/hooks/mcp/use-mcp-tools.ts | 30 ++++++++++++++--------------- apps/sim/hooks/queries/mcp.ts | 20 +++++++++++++++++-- apps/sim/lib/mcp/service.ts | 12 +++++++++++- 3 files changed, 43 insertions(+), 19 deletions(-) diff --git a/apps/sim/hooks/mcp/use-mcp-tools.ts b/apps/sim/hooks/mcp/use-mcp-tools.ts index d200820e822..bba437be9ab 100644 --- a/apps/sim/hooks/mcp/use-mcp-tools.ts +++ b/apps/sim/hooks/mcp/use-mcp-tools.ts @@ -32,7 +32,7 @@ export interface UseMcpToolsResult { mcpTools: McpToolForUI[] isLoading: boolean error: string | null - refreshTools: (forceRefresh?: boolean) => Promise + refreshTools: () => Promise getToolsByServer: (serverId: string) => McpToolForUI[] } @@ -55,22 +55,20 @@ export function useMcpTools(workspaceId: string): UseMcpToolsResult { })) }, [mcpToolsData]) - const refreshTools = useCallback( - async (forceRefresh = false) => { - if (!workspaceId) { - logger.warn('Cannot refresh tools: no workspaceId provided') - return - } + // Soft refresh: invalidate per-server query entries so React Query refetches + // them, but let the server-side cache decide whether to go live (the per-server + // queryFn always sends refresh=false). For an explicit cache-bypass refresh, + // use `useForceRefreshMcpTools` instead. + const refreshTools = useCallback(async () => { + if (!workspaceId) { + logger.warn('Cannot refresh tools: no workspaceId provided') + return + } - logger.info('Refreshing MCP tools', { forceRefresh, workspaceId }) - - await queryClient.invalidateQueries({ - queryKey: mcpKeys.serverToolsWorkspace(workspaceId), - refetchType: forceRefresh ? 'active' : 'all', - }) - }, - [workspaceId, queryClient] - ) + await queryClient.invalidateQueries({ + queryKey: mcpKeys.serverToolsWorkspace(workspaceId), + }) + }, [workspaceId, queryClient]) const getToolsByServer = useCallback( (serverId: string): McpToolForUI[] => { diff --git a/apps/sim/hooks/queries/mcp.ts b/apps/sim/hooks/queries/mcp.ts index 23626f5507f..0aad4328e79 100644 --- a/apps/sim/hooks/queries/mcp.ts +++ b/apps/sim/hooks/queries/mcp.ts @@ -140,7 +140,20 @@ export function useMcpServerTools(workspaceId: string, serverId?: string) { export function useMcpToolsQuery(workspaceId: string) { const { data: servers, isLoading: serversLoading } = useMcpServers(workspaceId) - const serverIds = useMemo(() => (servers ? servers.map((s) => s.id).sort() : []), [servers]) + // Filter on (a) enabled (disabled servers would 404 in discover and pollute the + // negative cache), and (b) workspaceId match (useMcpServers keeps the previous + // workspace's data via keepPreviousData, so during a workspace switch we'd + // otherwise spawn discover calls for the wrong workspace's IDs). + const serverIds = useMemo( + () => + servers + ? servers + .filter((s) => s.enabled && s.workspaceId === workspaceId) + .map((s) => s.id) + .sort() + : [], + [servers, workspaceId] + ) const results = useQueries({ queries: serverIds.map((serverId) => ({ @@ -160,7 +173,10 @@ export function useMcpToolsQuery(workspaceId: string) { let anyServerLoading = false let firstError: Error | null = null for (const result of results) { - if (result.data) { + // Drop stale data when the latest refetch failed — otherwise React Query's + // stale-while-revalidate behavior would surface broken-server tools in the + // aggregate while the per-server card shows an error. + if (result.data && !result.isError) { tools.push(...result.data) hasData = true } diff --git a/apps/sim/lib/mcp/service.ts b/apps/sim/lib/mcp/service.ts index adfb20c702c..cb047064df3 100644 --- a/apps/sim/lib/mcp/service.ts +++ b/apps/sim/lib/mcp/service.ts @@ -723,7 +723,17 @@ class McpService { await sleep(100) continue } - await this.markServerUnhealthy(workspaceId, serverId, error) + // Drop any stale positive cache so a follow-up cache-respecting call + // doesn't keep returning old tools from a now-broken server. The + // negative cache marker then makes that follow-up fail fast. + await Promise.allSettled([ + this.cacheAdapter + .delete(serverCacheKey(workspaceId, serverId)) + .catch((err) => + logger.warn(`[${requestId}] Cache delete failed for ${serverId}:`, err) + ), + this.markServerUnhealthy(workspaceId, serverId, error), + ]) throw error } } From 3fd031c7ebb134b9b0c52a6c5e32dd54eead7516 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 21 May 2026 22:15:00 -0700 Subject: [PATCH 09/10] chore(mcp): trim verbose comments --- apps/sim/hooks/mcp/use-mcp-tools.ts | 5 +---- apps/sim/hooks/queries/mcp.ts | 16 ++++------------ apps/sim/lib/mcp/service.ts | 4 +--- 3 files changed, 6 insertions(+), 19 deletions(-) diff --git a/apps/sim/hooks/mcp/use-mcp-tools.ts b/apps/sim/hooks/mcp/use-mcp-tools.ts index bba437be9ab..a6e816b038b 100644 --- a/apps/sim/hooks/mcp/use-mcp-tools.ts +++ b/apps/sim/hooks/mcp/use-mcp-tools.ts @@ -55,10 +55,7 @@ export function useMcpTools(workspaceId: string): UseMcpToolsResult { })) }, [mcpToolsData]) - // Soft refresh: invalidate per-server query entries so React Query refetches - // them, but let the server-side cache decide whether to go live (the per-server - // queryFn always sends refresh=false). For an explicit cache-bypass refresh, - // use `useForceRefreshMcpTools` instead. + // Soft refresh — invalidate per-server entries. For cache-bypass, use `useForceRefreshMcpTools`. const refreshTools = useCallback(async () => { if (!workspaceId) { logger.warn('Cannot refresh tools: no workspaceId provided') diff --git a/apps/sim/hooks/queries/mcp.ts b/apps/sim/hooks/queries/mcp.ts index 0aad4328e79..74e5017aec3 100644 --- a/apps/sim/hooks/queries/mcp.ts +++ b/apps/sim/hooks/queries/mcp.ts @@ -140,10 +140,8 @@ export function useMcpServerTools(workspaceId: string, serverId?: string) { export function useMcpToolsQuery(workspaceId: string) { const { data: servers, isLoading: serversLoading } = useMcpServers(workspaceId) - // Filter on (a) enabled (disabled servers would 404 in discover and pollute the - // negative cache), and (b) workspaceId match (useMcpServers keeps the previous - // workspace's data via keepPreviousData, so during a workspace switch we'd - // otherwise spawn discover calls for the wrong workspace's IDs). + // Skip disabled rows (would 404 → negative-cache) and rows from a previous + // workspace (keepPreviousData on useMcpServers). const serverIds = useMemo( () => servers @@ -173,9 +171,7 @@ export function useMcpToolsQuery(workspaceId: string) { let anyServerLoading = false let firstError: Error | null = null for (const result of results) { - // Drop stale data when the latest refetch failed — otherwise React Query's - // stale-while-revalidate behavior would surface broken-server tools in the - // aggregate while the per-server card shows an error. + // Drop stale data from servers whose latest refetch errored. if (result.data && !result.isError) { tools.push(...result.data) hasData = true @@ -185,12 +181,9 @@ export function useMcpToolsQuery(workspaceId: string) { } return { data: tools, - // Stay loading until we have something to render; once any server - // returned, drop the spinner and let slow neighbors fill in. isLoading: (serversLoading || anyServerLoading) && !hasData, isFetching: serversLoading || results.some((r) => r.isFetching), - // One dead server must not blank out the workspace — only surface the - // aggregate error when nothing rendered. Per-server errors live in `perServer`. + // Suppress when any healthy server rendered; per-server errors live in `perServer`. error: hasData ? null : firstError, perServer: results, } @@ -210,7 +203,6 @@ export function useForceRefreshMcpTools() { return tools }) ) - // Failed servers: invalidate so React Query retries via the server-side negative cache. results.forEach((result, index) => { if (result.status === 'rejected') { const failedServer = servers[index] diff --git a/apps/sim/lib/mcp/service.ts b/apps/sim/lib/mcp/service.ts index cb047064df3..72a2662beb5 100644 --- a/apps/sim/lib/mcp/service.ts +++ b/apps/sim/lib/mcp/service.ts @@ -723,9 +723,7 @@ class McpService { await sleep(100) continue } - // Drop any stale positive cache so a follow-up cache-respecting call - // doesn't keep returning old tools from a now-broken server. The - // negative cache marker then makes that follow-up fail fast. + // Drop positive cache so a follow-up doesn't return stale tools. await Promise.allSettled([ this.cacheAdapter .delete(serverCacheKey(workspaceId, serverId)) From ed1b970fcd2a49e95d68a214b2e0ede2fffa4eea Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 21 May 2026 22:17:10 -0700 Subject: [PATCH 10/10] fix(mcp): third-round bugbot review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - discoverTools failure path now drops the per-server positive cache alongside writing the negative-cache marker, matching discoverServerTools' behavior so a workspace-aggregate failure doesn't leave stale tools cached. - useForceRefreshMcpTools filters disabled and out-of-workspace rows before fan-out so disabled servers don't 404 → negative-cache themselves. - Remove unused useMcpServerTools export — the aggregate goes through useQueries directly, no external consumer exists. --- apps/sim/hooks/queries/mcp.ts | 15 +++------------ apps/sim/lib/mcp/service.ts | 7 ++++++- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/apps/sim/hooks/queries/mcp.ts b/apps/sim/hooks/queries/mcp.ts index 74e5017aec3..9f483a4fef7 100644 --- a/apps/sim/hooks/queries/mcp.ts +++ b/apps/sim/hooks/queries/mcp.ts @@ -122,17 +122,6 @@ async function fetchMcpTools( } } -export function useMcpServerTools(workspaceId: string, serverId?: string) { - return useQuery({ - queryKey: mcpKeys.serverToolsList(workspaceId, serverId), - queryFn: ({ signal }) => fetchMcpTools(workspaceId, false, signal, serverId), - enabled: !!workspaceId && !!serverId, - retry: false, - staleTime: 30 * 1000, - refetchOnWindowFocus: false, - }) -} - /** * Workspace aggregate derived from N parallel per-server queries via * `useQueries`. One slow server cannot block the others. @@ -195,7 +184,9 @@ export function useForceRefreshMcpTools() { return useMutation({ mutationFn: async (workspaceId: string) => { - const servers = queryClient.getQueryData(mcpKeys.serversList(workspaceId)) ?? [] + const allServers = + queryClient.getQueryData(mcpKeys.serversList(workspaceId)) ?? [] + const servers = allServers.filter((s) => s.enabled && s.workspaceId === workspaceId) const results = await Promise.allSettled( servers.map(async (server) => { const tools = await fetchMcpTools(workspaceId, true, undefined, server.id) diff --git a/apps/sim/lib/mcp/service.ts b/apps/sim/lib/mcp/service.ts index 72a2662beb5..113998c61aa 100644 --- a/apps/sim/lib/mcp/service.ts +++ b/apps/sim/lib/mcp/service.ts @@ -590,7 +590,12 @@ class McpService { ) deferredSideEffects.push( this.updateServerStatus(server.id, workspaceId, false, outcome.message), - this.markServerUnhealthy(workspaceId, server.id, outcome.originalError) + this.markServerUnhealthy(workspaceId, server.id, outcome.originalError), + this.cacheAdapter + .delete(serverCacheKey(workspaceId, server.id)) + .catch((err) => + logger.warn(`[${requestId}] Cache delete failed for ${server.name}:`, err) + ) ) })