diff --git a/databricks-mcp-server/databricks_mcp_server/middleware.py b/databricks-mcp-server/databricks_mcp_server/middleware.py index 44ae008a..71514694 100644 --- a/databricks-mcp-server/databricks_mcp_server/middleware.py +++ b/databricks-mcp-server/databricks_mcp_server/middleware.py @@ -4,7 +4,7 @@ Provides cross-cutting concerns like timeout and error handling for all MCP tool calls. """ -import asyncio +import anyio import json import logging import traceback @@ -44,7 +44,27 @@ async def on_call_tool( arguments = context.message.arguments try: - return await call_next(context) + result = await call_next(context) + + # Fix for FastMCP not populating structured_content automatically. + # When a tool has a return type annotation (e.g., -> Dict[str, Any]), + # FastMCP generates an outputSchema but doesn't set structured_content. + # MCP SDK then fails validation: "outputSchema defined but no structured output" + # We fix this by parsing the JSON text content and setting structured_content. + if result and not result.structured_content and result.content: + if len(result.content) == 1 and isinstance(result.content[0], TextContent): + try: + parsed = json.loads(result.content[0].text) + if isinstance(parsed, dict): + # Create new ToolResult with structured_content populated + result = ToolResult( + content=result.content, + structured_content=parsed, + ) + except (json.JSONDecodeError, TypeError): + pass # Not valid JSON, leave as-is + + return result except TimeoutError as e: # In Python 3.11+, asyncio.TimeoutError is an alias for TimeoutError, @@ -53,47 +73,33 @@ async def on_call_tool( "Tool '%s' timed out. Returning structured result.", tool_name, ) + # Don't set structured_content for errors - it would be validated against + # the tool's outputSchema and fail (error dict doesn't match expected type) return ToolResult( - content=[ - TextContent( - type="text", - text=json.dumps( - { - "error": True, - "error_type": "timeout", - "tool": tool_name, - "message": str(e) or "Operation timed out", - "action_required": ( - "Operation may still be in progress. " - "Do NOT retry the same call. " - "Use the appropriate get/status tool to check current state." - ), - } - ), - ) - ] + content=[TextContent(type="text", text=json.dumps({ + "error": True, + "error_type": "timeout", + "tool": tool_name, + "message": str(e) or "Operation timed out", + "action_required": ( + "Operation may still be in progress. " + "Do NOT retry the same call. " + "Use the appropriate get/status tool to check current state." + ), + }))] ) - except asyncio.CancelledError: + except anyio.get_cancelled_exc_class(): + # Re-raise CancelledError so MCP SDK's handler catches it and skips + # calling message.respond(). If we return a result here, the SDK will + # try to respond, but the request may already be marked as responded + # by the cancellation handler, causing an AssertionError crash. + # See: https://github.com/modelcontextprotocol/python-sdk/pull/1153 logger.warning( - "Tool '%s' was cancelled. Returning structured result.", + "Tool '%s' was cancelled. Re-raising to let MCP SDK handle cleanup.", tool_name, ) - return ToolResult( - content=[ - TextContent( - type="text", - text=json.dumps( - { - "error": True, - "error_type": "cancelled", - "tool": tool_name, - "message": "Operation was cancelled by the client", - } - ), - ) - ] - ) + raise except Exception as e: # Log the full traceback for debugging @@ -104,22 +110,14 @@ async def on_call_tool( traceback.format_exc(), ) - # Return a structured error response - error_message = str(e) - error_type = type(e).__name__ - + # Return error as text content only - don't set structured_content. + # Setting structured_content would cause MCP SDK to validate it against + # the tool's outputSchema, which fails (error dict doesn't match expected type). return ToolResult( - content=[ - TextContent( - type="text", - text=json.dumps( - { - "error": True, - "error_type": error_type, - "tool": tool_name, - "message": error_message, - } - ), - ) - ] + content=[TextContent(type="text", text=json.dumps({ + "error": True, + "error_type": type(e).__name__, + "tool": tool_name, + "message": str(e), + }))] ) diff --git a/databricks-mcp-server/databricks_mcp_server/server.py b/databricks-mcp-server/databricks_mcp_server/server.py index d1810f2a..4ee150aa 100644 --- a/databricks-mcp-server/databricks_mcp_server/server.py +++ b/databricks-mcp-server/databricks_mcp_server/server.py @@ -72,15 +72,23 @@ async def async_wrapper(**kwargs): _patch_subprocess_stdin() -def _patch_tool_decorator_for_windows(): - """Wrap sync tool functions in asyncio.to_thread() on Windows. +def _patch_tool_decorator_for_async(): + """Wrap sync tool functions in asyncio.to_thread() on all platforms. FastMCP's FunctionTool.run() calls sync functions directly on the asyncio - event loop thread, which blocks the stdio transport's I/O tasks. On Windows - with ProactorEventLoop this causes a deadlock where all MCP tools hang. + event loop thread, which blocks the stdio transport's I/O tasks. This causes: + + 1. On Windows with ProactorEventLoop: deadlock where all MCP tools hang. + + 2. On ALL platforms: cancellation race conditions. When the MCP client + cancels a request (e.g., timeout), the event loop can't propagate the + CancelledError to blocking sync code. The sync function eventually + returns, but the MCP SDK has already responded to the cancellation, + causing "Request already responded to" assertion errors and crashes. This patch intercepts @mcp.tool registration to wrap sync functions so they - run in a thread pool, yielding control back to the event loop for I/O. + run in a thread pool, yielding control back to the event loop for I/O and + enabling proper cancellation handling via anyio's task cancellation. """ original_tool = mcp.tool @@ -132,11 +140,14 @@ async def _noop_lifespan(*args, **kwargs): # Register middleware (see middleware.py for details on each) mcp.add_middleware(TimeoutHandlingMiddleware()) -# Apply async wrapper on Windows to prevent event loop deadlocks. +# Apply async wrapper on ALL platforms to: +# 1. Prevent event loop deadlocks (critical on Windows) +# 2. Enable proper cancellation handling (critical on all platforms) +# Without this, sync tools block the event loop, preventing CancelledError +# propagation and causing "Request already responded to" crashes. # TODO: FastMCP 3.x automatically wraps sync functions in asyncio.to_thread(). -# Test if this Windows-specific patch is still needed with FastMCP 3.x. -if sys.platform == "win32": - _patch_tool_decorator_for_windows() +# Test if this patch is still needed with FastMCP 3.x. +_patch_tool_decorator_for_async() # Import and register all tools (side-effect imports: each module registers @mcp.tool decorators) from .tools import ( # noqa: F401, E402