|
| 1 | +"""MCP client that fulfils server-initiated sampling via a real LLM. |
| 2 | +
|
| 3 | +Answers the questions raised in issue |
| 4 | +https://github.com/modelcontextprotocol/python-sdk/issues/1205 by wiring |
| 5 | +an actual LLM call into the ClientSession `sampling_callback` and |
| 6 | +showing how each advisory field in `CreateMessageRequestParams` should |
| 7 | +be interpreted. |
| 8 | +
|
| 9 | +The LLM backend is deliberately provider-agnostic: we speak the |
| 10 | +OpenAI-compatible `/chat/completions` schema over httpx, so the example |
| 11 | +runs against OpenAI, Groq, OpenRouter, Ollama, vLLM, or any other |
| 12 | +gateway that honours the same contract. Users swap providers by |
| 13 | +changing environment variables rather than code — maintainer feedback |
| 14 | +on earlier attempts flagged provider-specific SDKs as a no-go for the |
| 15 | +examples directory. |
| 16 | +""" |
| 17 | + |
| 18 | +from __future__ import annotations |
| 19 | + |
| 20 | +import logging |
| 21 | +import os |
| 22 | +import shlex |
| 23 | +from typing import Any |
| 24 | + |
| 25 | +import anyio |
| 26 | +import click |
| 27 | +import httpx |
| 28 | +from mcp import ClientSession, StdioServerParameters, types |
| 29 | +from mcp.client.context import ClientRequestContext |
| 30 | +from mcp.client.stdio import stdio_client |
| 31 | + |
# Module-level logger; handlers/level are configured in `main` via
# logging.basicConfig, so importing this module emits nothing by itself.
logger = logging.getLogger("mcp-simple-sampling-client")

# Defaults point at Groq because it has a generous free tier and speaks the
# OpenAI-compatible schema. Override via env vars to target any other
# provider without editing this file.
DEFAULT_BASE_URL = "https://api.groq.com/openai/v1"
DEFAULT_MODEL = "llama-3.3-70b-versatile"

# Minimal mapping from OpenAI's `finish_reason` to MCP's `stop_reason`.
# Both are advisory strings, so missing entries round-trip unchanged
# rather than raising.
_FINISH_REASON_TO_STOP_REASON: dict[str, str] = {
    "stop": "endTurn",
    "length": "maxTokens",
    "content_filter": "endTurn",
}
| 48 | + |
| 49 | + |
class LLMClient:
    """Minimal async adapter for any OpenAI-compatible /chat/completions API."""

    def __init__(self, *, api_key: str, base_url: str, default_model: str) -> None:
        # Trailing slashes are stripped so endpoint paths join cleanly.
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.default_model = default_model

    def pick_model(self, preferences: types.ModelPreferences | None) -> str:
        """Resolve the model to use from the server's advisory preferences.

        Per spec, modelPreferences are advisory: "The client MAY ignore
        them." The first usable hint acts as a soft override; otherwise
        we stay on LLM_MODEL. Numeric priorities are not used for model
        selection — that would need a provider-specific catalogue of
        available models — but they are logged so the user can see what
        the server asked for.
        """
        if preferences is None:
            return self.default_model

        priorities = (
            preferences.cost_priority,
            preferences.speed_priority,
            preferences.intelligence_priority,
        )
        if any(value is not None for value in priorities):
            logger.info(
                "Server model priorities — cost=%s speed=%s intelligence=%s",
                *priorities,
            )

        for hint in preferences.hints or []:
            if hint.name:
                return hint.name
        return self.default_model

    async def chat(
        self,
        *,
        messages: list[dict[str, Any]],
        model: str,
        system_prompt: str | None,
        max_tokens: int,
        temperature: float | None,
        stop_sequences: list[str] | None,
        metadata: dict[str, Any] | None,
    ) -> dict[str, Any]:
        """POST one /chat/completions request and return the decoded JSON."""
        # A system prompt, when present, becomes the leading system message.
        chat_messages: list[dict[str, Any]] = []
        if system_prompt:
            chat_messages.append({"role": "system", "content": system_prompt})
        chat_messages.extend(messages)

        payload: dict[str, Any] = {
            "model": model,
            "messages": chat_messages,
            "max_tokens": max_tokens,
        }
        if temperature is not None:
            payload["temperature"] = temperature
        if stop_sequences:
            payload["stop"] = stop_sequences
        if metadata:
            # OpenAI's schema accepts arbitrary metadata for provider-side
            # logging. Non-OpenAI gateways typically ignore unknown keys
            # rather than rejecting them, so a raw passthrough is safe.
            payload["metadata"] = metadata

        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
        async with httpx.AsyncClient(timeout=60.0) as http:
            response = await http.post(f"{self.base_url}/chat/completions", headers=headers, json=payload)
            response.raise_for_status()
            return response.json()
| 115 | + |
| 116 | + |
def _sampling_messages_to_openai(messages: list[types.SamplingMessage]) -> list[dict[str, Any]]:
    """Flatten MCP SamplingMessages into OpenAI-style chat messages.

    MCP allows a message `content` to be either a single block or a list
    of mixed blocks (text/image/audio). This example only forwards text;
    other block types are surfaced to the LLM as a placeholder so the
    conversation stays coherent without silently dropping content. A
    production client would either forward image URLs/base64 directly or
    refuse the request with an ErrorData response.
    """

    def _render(block: Any) -> str:
        # Text passes through verbatim; anything else becomes a marker.
        if isinstance(block, types.TextContent):
            return block.text
        return f"[{block.type} content omitted]"

    return [
        {
            "role": message.role,
            "content": "\n".join(_render(block) for block in message.content_as_list),
        }
        for message in messages
    ]
| 137 | + |
| 138 | + |
class SamplingHandler:
    """Implements the ClientSession `sampling_callback` protocol.

    Bridges a server-initiated `sampling/createMessage` request to the
    configured OpenAI-compatible LLM and maps the response back into MCP
    types. All failures — transport errors and malformed provider
    payloads alike — are returned as ErrorData rather than raised, so
    the server receives a useful JSON-RPC error instead of a
    transport-level failure.
    """

    def __init__(self, llm: LLMClient) -> None:
        # Provider adapter that performs the actual completion call.
        self.llm = llm

    async def __call__(
        self,
        context: ClientRequestContext,
        params: types.CreateMessageRequestParams,
    ) -> types.CreateMessageResult | types.ErrorData:
        # includeContext asks the client to attach context from its other
        # active sessions. A real multi-server client would query its
        # session registry here and prepend the relevant context to the
        # prompt. We only log the request so the example stays a single
        # file, but the hook point is this branch.
        if params.include_context and params.include_context != "none":
            logger.info(
                "Server requested includeContext=%s — real clients would inject session context here",
                params.include_context,
            )

        model = self.llm.pick_model(params.model_preferences)
        try:
            raw = await self.llm.chat(
                messages=_sampling_messages_to_openai(params.messages),
                model=model,
                system_prompt=params.system_prompt,
                max_tokens=params.max_tokens,
                temperature=params.temperature,
                stop_sequences=params.stop_sequences,
                metadata=params.metadata,
            )
        except httpx.HTTPError:
            # Callback contracts require returning ErrorData on failure
            # rather than raising — the session turns an exception into a
            # transport-level error, which is less useful to the server.
            logger.exception("LLM provider call failed")
            return types.ErrorData(code=types.INTERNAL_ERROR, message="LLM provider call failed")

        # Defensive parsing: a gateway can answer 200 with an empty
        # `choices` array or a null `content` (e.g. a tool-call-only
        # response). Without this guard the KeyError/IndexError/TypeError
        # would escape the callback and surface as a transport-level
        # error, contradicting the ErrorData contract above.
        try:
            choice = raw["choices"][0]
            text = choice["message"]["content"]
        except (KeyError, IndexError, TypeError):
            logger.exception("LLM provider returned an unexpected payload shape")
            return types.ErrorData(code=types.INTERNAL_ERROR, message="LLM provider returned an unexpected response")
        if not isinstance(text, str):
            # TextContent requires a string; a null/absent content must not
            # crash result validation.
            return types.ErrorData(code=types.INTERNAL_ERROR, message="LLM provider returned no text content")

        finish_reason = choice.get("finish_reason")
        return types.CreateMessageResult(
            role="assistant",
            content=types.TextContent(type="text", text=text),
            model=raw.get("model", model),
            # Unknown finish reasons round-trip unchanged; both sides treat
            # stop_reason as advisory.
            stop_reason=_FINISH_REASON_TO_STOP_REASON.get(finish_reason, finish_reason),
        )
| 187 | + |
| 188 | + |
async def _run(server_command: str, server_args: list[str], tool_arguments: dict[str, Any]) -> None:
    """Spawn the stdio server and call its first tool with *tool_arguments*.

    The sampling callback is wired into the session here, so any
    `sampling/createMessage` request the server issues while handling
    the tool call is answered by the configured LLM provider.
    """
    api_key = os.environ.get("LLM_API_KEY")
    if not api_key:
        raise click.UsageError("LLM_API_KEY is required; see README for provider setup.")

    llm = LLMClient(
        api_key=api_key,
        base_url=os.environ.get("LLM_API_BASE_URL", DEFAULT_BASE_URL),
        default_model=os.environ.get("LLM_MODEL", DEFAULT_MODEL),
    )
    handler = SamplingHandler(llm)
    server = StdioServerParameters(command=server_command, args=server_args)

    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write, sampling_callback=handler) as session:
            await session.initialize()
            listing = await session.list_tools()
            if not listing.tools:
                click.echo("Server exposes no tools; nothing to demo.")
                return
            tool = listing.tools[0]
            click.echo(f"Calling tool '{tool.name}' with {tool_arguments}")
            result = await session.call_tool(tool.name, tool_arguments)
            for block in result.content:
                if isinstance(block, types.TextContent):
                    click.echo(block.text)
| 213 | + |
@click.command()
@click.option(
    "--server-command",
    default="uv",
    show_default=True,
    help="Executable that launches the MCP server over stdio.",
)
@click.option(
    "--server-args",
    default="run mcp-simple-sampling",
    show_default=True,
    help="Arguments for server-command; split with POSIX shell rules.",
)
@click.option(
    "--topic",
    default="a lighthouse keeper",
    show_default=True,
    help="Story topic forwarded to the server's write_story tool.",
)
def main(server_command: str, server_args: str, topic: str) -> int:
    """CLI entry point: configure logging, then drive the async client."""
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
    argv = shlex.split(server_args)
    anyio.run(_run, server_command, argv, {"topic": topic})
    return 0


if __name__ == "__main__":
    main()
0 commit comments