diff --git a/mcp/distributed_test.go b/mcp/distributed_test.go
new file mode 100644
index 00000000..7031d86e
--- /dev/null
+++ b/mcp/distributed_test.go
@@ -0,0 +1,1005 @@
+// Copyright 2025 The Go MCP SDK Authors. All rights reserved.
+// Use of this source code is governed by an MIT-style
+// license that can be found in the LICENSE file.
+
+// FORK: distributed-sessions
+// Integration tests for distributed session management.
+
+package mcp
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+)
+
+// TestDistributed_SessionCreation verifies that sessions are created in the backend.
+func TestDistributed_SessionCreation(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create a session on Pod A
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Verify session was created in backend
+	data, err := cluster.Backend().Get(ctx, sessionID)
+	if err != nil {
+		t.Fatalf("session not found in backend: %v", err)
+	}
+	if data.SessionID != sessionID {
+		t.Errorf("session ID mismatch: got %q, want %q", data.SessionID, sessionID)
+	}
+}
+
+// TestDistributed_SamePodRequests verifies that requests on the same pod work
+// correctly with the SessionBackend configured.
+func TestDistributed_SamePodRequests(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 1)
+	defer cluster.Close()
+
+	// Create session on Pod A
+	session, _ := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Make a tool call using the client session (stays on same pod)
+	result, err := session.CallTool(ctx, &CallToolParams{
+		Name:      "echo",
+		Arguments: map[string]any{"msg": "hello world"},
+	})
+	if err != nil {
+		t.Fatalf("CallTool failed: %v", err)
+	}
+
+	// Verify response
+	if len(result.Content) == 0 {
+		t.Fatal("expected content in result")
+	}
+	text, ok := result.Content[0].(*TextContent)
+	if !ok || text.Text != "hello world" {
+		t.Errorf("unexpected result: %+v", result.Content)
+	}
+}
+
+// TestDistributed_SessionTakeover verifies that a different pod can handle
+// requests for a session created on another pod.
+func TestDistributed_SessionTakeover(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create session on Pod A
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Verify session is local on Pod A
+	if !cluster.Pod("A").HasLocalSession(sessionID) {
+		t.Error("session should be local on Pod A")
+	}
+
+	// Wait for state to persist
+	time.Sleep(50 * time.Millisecond)
+
+	// Clear local session on Pod A (simulate pod restart or LB routing elsewhere)
+	cluster.Pod("A").ClearLocalSession(sessionID)
+
+	// Send request to Pod B - should work via session takeover
+	result, err := CallToolOnPod(ctx, t, cluster.Pod("B"), sessionID, "echo", map[string]any{"msg": "hello from B"})
+	if err != nil {
+		t.Fatalf("CallToolOnPod failed: %v", err)
+	}
+
+	// Verify response
+	if len(result.Content) == 0 {
+		t.Fatal("expected content in result")
+	}
+	text, ok := result.Content[0].(*TextContent)
+	if !ok || text.Text != "hello from B" {
+		t.Errorf("unexpected result: %+v", result.Content)
+	}
+
+	// Verify session is now local on Pod B
+	if !cluster.Pod("B").HasLocalSession(sessionID) {
+		t.Error("session should now be local on Pod B")
+	}
+}
+
+// TestDistributed_SessionNotFound verifies 404 for unknown sessions.
+func TestDistributed_SessionNotFound(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 1)
+	defer cluster.Close()
+
+	// Try to use a non-existent session
+	_, err := CallToolOnPod(ctx, t, cluster.Pod("A"), "nonexistent-session-id", "echo", nil)
+	if err == nil {
+		t.Fatal("expected error for non-existent session")
+	}
+	// The error should indicate session not found
+	t.Logf("Got expected error: %v", err)
+}
+
+// TestDistributed_ConcurrentRequests verifies that concurrent requests to
+// the same pod don't corrupt session state when using a SessionBackend.
+func TestDistributed_ConcurrentRequests(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 1)
+	defer cluster.Close()
+
+	// Create session on Pod A
+	session, _ := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Send concurrent requests using the existing session (stays on same pod)
+	var wg sync.WaitGroup
+	errors := make(chan error, 10)
+	results := make(chan *CallToolResult, 10)
+
+	for i := 0; i < 10; i++ {
+		wg.Add(1)
+		go func(n int) {
+			defer wg.Done()
+			result, err := session.CallTool(ctx, &CallToolParams{
+				Name:      "echo",
+				Arguments: map[string]any{"msg": "test"},
+			})
+			if err != nil {
+				errors <- err
+			} else {
+				results <- result
+			}
+		}(i)
+	}
+
+	wg.Wait()
+	close(errors)
+	close(results)
+
+	// Check for errors
+	for err := range errors {
+		t.Errorf("request failed: %v", err)
+	}
+
+	// Count successful results
+	count := 0
+	for range results {
+		count++
+	}
+	t.Logf("Successful requests: %d/10", count)
+
+	if count != 10 {
+		t.Errorf("expected all 10 requests to succeed, got %d", count)
+	}
+}
+
+// TestDistributed_BackendTouched verifies that Touch is called on activity.
+func TestDistributed_BackendTouched(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 1)
+	defer cluster.Close()
+
+	// Create session
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	cluster.Backend().ClearOps()
+
+	// Make a request
+	_, err := session.CallTool(ctx, &CallToolParams{
+		Name:      "echo",
+		Arguments: map[string]any{"msg": "test"},
+	})
+	if err != nil {
+		t.Fatalf("CallTool failed: %v", err)
+	}
+
+	// Verify Touch was called
+	ops := cluster.Backend().GetOps()
+	touchCount := 0
+	for _, op := range ops {
+		if op.Op == "Touch" && op.SessionID == sessionID {
+			touchCount++
+		}
+	}
+	if touchCount == 0 {
+		t.Error("expected Touch to be called on backend")
+	}
+}
+
+// TestDistributed_SessionDelete verifies session deletion works across pods.
+func TestDistributed_SessionDelete(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create session on Pod A
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+
+	// Close session (triggers DELETE)
+	session.Close()
+
+	// Give it a moment to propagate
+	time.Sleep(50 * time.Millisecond)
+
+	// Verify session is deleted from backend
+	_, err := cluster.Backend().Get(ctx, sessionID)
+	if err != ErrSessionNotFound {
+		t.Errorf("expected ErrSessionNotFound, got: %v", err)
+	}
+}
+
+// --- Tests for known gaps (expected to reveal bugs) ---
+
+// TestDistributed_StatePersistedAfterInitialize tests that session state
+// is persisted to the backend after Initialize completes.
+func TestDistributed_StatePersistedAfterInitialize(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create and initialize session on Pod A
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Wait for state to persist (async operation)
+	time.Sleep(50 * time.Millisecond)
+
+	// Check backend directly for state
+	data, err := cluster.Backend().Get(ctx, sessionID)
+	if err != nil {
+		t.Fatalf("failed to get session from backend: %v", err)
+	}
+
+	// State should be persisted after initialization
+	if data.State == nil {
+		t.Fatal("State is nil - not persisted after initialization")
+	}
+	if data.State.InitializeParams == nil {
+		t.Fatal("InitializeParams is nil - not persisted after initialization")
+	}
+
+	t.Log("State persistence is working!")
+}
+
+// TestDistributed_StateSurvivesTakeover tests that session state is preserved
+// when a different pod takes over the session.
+func TestDistributed_StateSurvivesTakeover(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create session on Pod A
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Wait a moment for state to persist
+	time.Sleep(50 * time.Millisecond)
+
+	// Get the state that was set during initialization
+	data1, err := cluster.Backend().Get(ctx, sessionID)
+	if err != nil {
+		t.Fatalf("failed to get session: %v", err)
+	}
+	if data1.State == nil || data1.State.InitializeParams == nil {
+		t.Fatal("state not persisted after initialization")
+	}
+
+	// Clear Pod A's local cache
+	cluster.Pod("A").ClearLocalSession(sessionID)
+
+	// Access from Pod B - should trigger session takeover
+	result, err := CallToolOnPod(ctx, t, cluster.Pod("B"), sessionID, "echo", map[string]any{"msg": "from B"})
+	if err != nil {
+		t.Fatalf("CallToolOnPod failed: %v", err)
+	}
+	if len(result.Content) == 0 {
+		t.Fatal("expected content in result")
+	}
+
+	// Verify session is now local on Pod B
+	if !cluster.Pod("B").HasLocalSession(sessionID) {
+		t.Error("session should now be local on Pod B")
+	}
+
+	// Get state after takeover - should still have the same InitializeParams
+	data2, err := cluster.Backend().Get(ctx, sessionID)
+	if err != nil {
+		t.Fatalf("failed to get session after takeover: %v", err)
+	}
+	if data2.State == nil || data2.State.InitializeParams == nil {
+		t.Error("state lost after takeover")
+	}
+}
+
+// TestDistributed_SSEOwnershipExclusive tests that only one pod has an active
+// SSE subscription at a time. When a new SSE stream is opened on a different pod,
+// the previous pod's subscription should be superseded.
+func TestDistributed_SSEOwnershipExclusive(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create session on Pod A
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Wait for state to persist
+	time.Sleep(50 * time.Millisecond)
+
+	// Clear Pod A's local session so we can test SSE from scratch
+	cluster.Pod("A").ClearLocalSession(sessionID)
+
+	// Open SSE stream to Pod A
+	sseA := OpenSSEStream(ctx, t, cluster.Pod("A"), sessionID)
+	defer sseA.Close()
+
+	// Give SSE time to establish and subscribe
+	time.Sleep(100 * time.Millisecond)
+
+	// Verify Pod A has an active subscriber
+	if cluster.Backend().GetSubscriberCount(sessionID) != 1 {
+		t.Errorf("expected 1 subscriber after SSE A, got %d", cluster.Backend().GetSubscriberCount(sessionID))
+	}
+
+	// Open SSE stream to Pod B - should supersede Pod A
+	sseB := OpenSSEStream(ctx, t, cluster.Pod("B"), sessionID)
+	defer sseB.Close()
+
+	// Pod A's SSE stream should close due to subscription superseding
+	if !sseA.WaitClosed(2 * time.Second) {
+		t.Error("Pod A's SSE stream should have closed when Pod B took over")
+	}
+
+	// Verify Pod B is now the subscriber
+	time.Sleep(50 * time.Millisecond)
+	cluster.Backend().AssertSingleSubscriber(t, sessionID)
+}
+
+// TestDistributed_SSEMessageDelivery tests that messages published to the backend
+// are delivered via the active SSE stream.
+func TestDistributed_SSEMessageDelivery(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 1)
+	defer cluster.Close()
+
+	// Create session on Pod A
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Wait for state to persist
+	time.Sleep(50 * time.Millisecond)
+
+	// Clear local session to test SSE independently
+	cluster.Pod("A").ClearLocalSession(sessionID)
+
+	// Open SSE stream
+	sse := OpenSSEStream(ctx, t, cluster.Pod("A"), sessionID)
+	defer sse.Close()
+
+	// Give SSE time to establish
+	time.Sleep(100 * time.Millisecond)
+
+	// Publish a message directly to the backend
+	testMsg := `{"jsonrpc":"2.0","method":"notifications/message","params":{"level":"info","data":"test"}}`
+	if err := cluster.Backend().Publish(ctx, sessionID, []byte(testMsg)); err != nil {
+		t.Fatalf("Publish failed: %v", err)
+	}
+
+	// Verify message received via SSE
+	evt, err := sse.NextEvent(2 * time.Second)
+	if err != nil {
+		t.Fatalf("failed to receive SSE event: %v", err)
+	}
+	if evt.Name != "message" {
+		t.Errorf("expected event name 'message', got %q", evt.Name)
+	}
+	if string(evt.Data) != testMsg {
+		t.Errorf("message mismatch: got %q, want %q", string(evt.Data), testMsg)
+	}
+}
+
+// TestDistributed_PublishSubscribeFlow tests the message routing flow.
+func TestDistributed_PublishSubscribeFlow(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 1)
+	defer cluster.Close()
+
+	// Create a session
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Set up a subscriber
+	received := make(chan []byte, 10)
+	subCtx, subCancel := context.WithCancel(ctx)
+	defer subCancel()
+
+	go func() {
+		_ = cluster.Backend().Subscribe(subCtx, sessionID, func(ctx context.Context, msg []byte) error {
+			received <- msg
+			return nil
+		})
+	}()
+
+	// Give subscriber time to start
+	time.Sleep(50 * time.Millisecond)
+
+	// Drain any messages from initialization (e.g., tools/list_changed)
+drainLoop:
+	for {
+		select {
+		case <-received:
+			// Discard initialization messages
+		case <-time.After(100 * time.Millisecond):
+			break drainLoop
+		}
+	}
+
+	// Publish our test message
+	testMsg := []byte(`{"test": "message"}`)
+	if err := cluster.Backend().Publish(ctx, sessionID, testMsg); err != nil {
+		t.Fatalf("Publish failed: %v", err)
+	}
+
+	// Verify message received
+	select {
+	case msg := <-received:
+		if string(msg) != string(testMsg) {
+			t.Errorf("message mismatch: got %q, want %q", string(msg), string(testMsg))
+		}
+	case <-time.After(time.Second):
+		t.Error("timeout waiting for message")
+	}
+}
+
+// TestDistributed_LogLevelPersisted tests that setLogLevel changes are persisted.
+func TestDistributed_LogLevelPersisted(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create session on Pod A
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Wait for initial state to persist
+	time.Sleep(50 * time.Millisecond)
+
+	// Set log level
+	testLogLevel := LoggingLevel("debug")
+	if err := session.SetLoggingLevel(ctx, &SetLoggingLevelParams{Level: testLogLevel}); err != nil {
+		t.Fatalf("SetLoggingLevel failed: %v", err)
+	}
+
+	// Wait for state to persist (async)
+	time.Sleep(100 * time.Millisecond)
+
+	// Check backend directly
+	data, err := cluster.Backend().Get(ctx, sessionID)
+	if err != nil {
+		t.Fatalf("failed to get session: %v", err)
+	}
+
+	if data.State == nil {
+		t.Fatal("state is nil")
+	}
+	if data.State.LogLevel != testLogLevel {
+		t.Errorf("log level not persisted: got %v, want %v", data.State.LogLevel, testLogLevel)
+	}
+
+	// Clear Pod A's cache and access from Pod B
+	cluster.Pod("A").ClearLocalSession(sessionID)
+
+	// Pod B should see the persisted log level
+	result, err := CallToolOnPod(ctx, t, cluster.Pod("B"), sessionID, "echo", map[string]any{"msg": "test"})
+	if err != nil {
+		t.Fatalf("CallToolOnPod failed: %v", err)
+	}
+	if len(result.Content) == 0 {
+		t.Fatal("expected content")
+	}
+
+	// Verify Pod B has the session with correct state
+	// (The session was restored from backend with the correct LogLevel)
+	data2, err := cluster.Backend().Get(ctx, sessionID)
+	if err != nil {
+		t.Fatalf("failed to get session after takeover: %v", err)
+	}
+	if data2.State.LogLevel != testLogLevel {
+		t.Errorf("log level lost after takeover: got %v, want %v", data2.State.LogLevel, testLogLevel)
+	}
+}
+
+// TestDistributed_SubscriptionSuperseded tests that new subscribers supersede old ones.
+func TestDistributed_SubscriptionSuperseded(t *testing.T) {
+	// Use the backend directly, not a full cluster with clients
+	backend := NewInstrumentedBackend()
+	ctx := context.Background()
+
+	// Create a session directly in the backend
+	data := &SessionData{UserID: "test"}
+	sessionID, err := backend.Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Start first subscriber
+	sub1Err := make(chan error, 1)
+	sub1Ctx, sub1Cancel := context.WithCancel(ctx)
+	defer sub1Cancel()
+
+	go func() {
+		err := backend.Subscribe(sub1Ctx, sessionID, func(ctx context.Context, msg []byte) error {
+			return nil
+		})
+		sub1Err <- err
+	}()
+
+	// Give first subscriber time to start
+	time.Sleep(50 * time.Millisecond)
+
+	// Verify first subscriber is active
+	if backend.GetSubscriberCount(sessionID) != 1 {
+		t.Errorf("expected 1 subscriber, got %d", backend.GetSubscriberCount(sessionID))
+	}
+
+	// Start second subscriber - should supersede first
+	sub2Ctx, sub2Cancel := context.WithCancel(ctx)
+	defer sub2Cancel()
+
+	go func() {
+		_ = backend.Subscribe(sub2Ctx, sessionID, func(ctx context.Context, msg []byte) error {
+			return nil
+		})
+	}()
+
+	// First subscriber should be superseded
+	select {
+	case err := <-sub1Err:
+		if err != ErrSubscriptionSuperseded {
+			t.Errorf("expected ErrSubscriptionSuperseded, got: %v", err)
+		}
+	case <-time.After(time.Second):
+		t.Error("timeout waiting for first subscriber to be superseded")
+	}
+}
+
+// =============================================================================
+// Failure Injection Tests
+// =============================================================================
+
+// TestDistributed_BackendGetFailure tests graceful handling when backend Get fails.
+func TestDistributed_BackendGetFailure(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create session directly in backend (no SSE stream to worry about)
+	data := &SessionData{
+		UserID: "",
+		State:  &ServerSessionState{InitializeParams: &InitializeParams{ProtocolVersion: "2024-11-05"}},
+	}
+	sessionID, err := cluster.Backend().Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Inject failure on Get
+	testErr := fmt.Errorf("simulated backend failure")
+	cluster.Backend().SetFailGet(testErr)
+
+	// Try to access - should fail gracefully
+	_, err = CallToolOnPod(ctx, t, cluster.Pod("A"), sessionID, "echo", map[string]any{"msg": "test"})
+	if err == nil {
+		t.Error("expected error when backend Get fails")
+	}
+	// Should return error (either 500 or failed)
+	t.Logf("error response (expected): %v", err)
+}
+
+// TestDistributed_BackendUpdateFailure tests handling when state persistence fails.
+func TestDistributed_BackendUpdateFailure(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 1)
+	defer cluster.Close()
+
+	// Create session
+	session, _ := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Wait for initial state to persist
+	time.Sleep(50 * time.Millisecond)
+
+	// Inject failure on Update (for state persistence)
+	testErr := fmt.Errorf("simulated update failure")
+	cluster.Backend().SetFailUpdate(testErr)
+
+	// Set log level - this triggers state persistence
+	err := session.SetLoggingLevel(ctx, &SetLoggingLevelParams{Level: "debug"})
+	// The request should succeed (state persistence is async and best-effort)
+	if err != nil {
+		t.Errorf("SetLoggingLevel should succeed despite backend failure: %v", err)
+	}
+
+	// Wait for async state persistence attempt
+	time.Sleep(100 * time.Millisecond)
+
+	// Verify the error was recorded
+	ops := cluster.Backend().GetOps()
+	var sawUpdateError bool
+	for _, op := range ops {
+		if op.Op == "Update" && op.Error != nil {
+			sawUpdateError = true
+			break
+		}
+	}
+	if !sawUpdateError {
+		t.Error("expected Update error to be recorded")
+	}
+}
+
+// TestDistributed_BackendTouchFailure tests handling when Touch fails.
+func TestDistributed_BackendTouchFailure(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create session directly in backend with initialized state
+	data := &SessionData{
+		UserID: "",
+		State: &ServerSessionState{
+			InitializeParams:  &InitializeParams{ProtocolVersion: "2024-11-05"},
+			InitializedParams: &InitializedParams{},
+		},
+	}
+	sessionID, err := cluster.Backend().Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Inject Touch failure
+	testErr := fmt.Errorf("simulated touch failure")
+	cluster.Backend().SetFailTouch(testErr)
+
+	// Request should still work (Touch failure is logged but not fatal)
+	result, err := CallToolOnPod(ctx, t, cluster.Pod("A"), sessionID, "echo", map[string]any{"msg": "hello"})
+	if err != nil {
+		t.Fatalf("request should succeed despite Touch failure: %v", err)
+	}
+	if len(result.Content) == 0 {
+		t.Fatal("expected content in response")
+	}
+}
+
+// =============================================================================
+// Race Condition Tests
+// =============================================================================
+
+// TestDistributed_ConcurrentTakeover tests behavior when two pods try to take over
+// the same session simultaneously.
+func TestDistributed_ConcurrentTakeover(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 3)
+	defer cluster.Close()
+
+	// Create session directly in backend with initialized state
+	data := &SessionData{
+		UserID: "",
+		State: &ServerSessionState{
+			InitializeParams:  &InitializeParams{ProtocolVersion: "2024-11-05"},
+			InitializedParams: &InitializedParams{},
+		},
+	}
+	sessionID, err := cluster.Backend().Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Simultaneously try to access from Pod A and Pod B
+	var wg sync.WaitGroup
+	results := make(chan error, 2)
+
+	for _, podName := range []string{"A", "B"} {
+		wg.Add(1)
+		go func(name string) {
+			defer wg.Done()
+			_, err := CallToolOnPod(ctx, t, cluster.Pod(name), sessionID, "echo", map[string]any{"msg": name})
+			results <- err
+		}(podName)
+	}
+
+	wg.Wait()
+	close(results)
+
+	// Both requests should succeed (or at most one fails due to race)
+	var successCount int
+	for err := range results {
+		if err == nil {
+			successCount++
+		}
+	}
+
+	if successCount == 0 {
+		t.Error("at least one concurrent request should succeed")
+	}
+	t.Logf("concurrent takeover: %d/2 requests succeeded", successCount)
+}
+
+// TestDistributed_StateUpdateRace tests that concurrent state updates don't corrupt data.
+func TestDistributed_StateUpdateRace(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 1)
+	defer cluster.Close()
+
+	// Create session
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Wait for initial state
+	time.Sleep(50 * time.Millisecond)
+
+	// Rapidly update state multiple times
+	var wg sync.WaitGroup
+	for i := 0; i < 10; i++ {
+		wg.Add(1)
+		go func(n int) {
+			defer wg.Done()
+			level := LoggingLevel(fmt.Sprintf("level-%d", n))
+			_ = session.SetLoggingLevel(ctx, &SetLoggingLevelParams{Level: level})
+		}(i)
+	}
+	wg.Wait()
+
+	// Wait for async persistence
+	time.Sleep(200 * time.Millisecond)
+
+	// Verify session state is valid (not corrupted)
+	data, err := cluster.Backend().Get(ctx, sessionID)
+	if err != nil {
+		t.Fatalf("Get failed: %v", err)
+	}
+	if data.State == nil {
+		t.Fatal("state should not be nil")
+	}
+	// LogLevel should be one of the values we set (exact value depends on race)
+	if data.State.LogLevel == "" {
+		t.Error("LogLevel should be set")
+	}
+}
+
+// =============================================================================
+// Edge Case Tests
+// =============================================================================
+
+// TestDistributed_SessionExpiredDuringRequest tests handling when a session
+// expires/is deleted while a request is in progress.
+func TestDistributed_SessionExpiredDuringRequest(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create session directly in backend
+	data := &SessionData{
+		UserID: "",
+		State: &ServerSessionState{
+			InitializeParams:  &InitializeParams{ProtocolVersion: "2024-11-05"},
+			InitializedParams: &InitializedParams{},
+		},
+	}
+	sessionID, err := cluster.Backend().Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Add latency to Get so we can delete during lookup
+	cluster.Backend().SetGetLatency(100 * time.Millisecond)
+
+	// Start request
+	errCh := make(chan error, 1)
+	go func() {
+		_, err := CallToolOnPod(ctx, t, cluster.Pod("A"), sessionID, "echo", map[string]any{"msg": "test"})
+		errCh <- err
+	}()
+
+	// Delete session while request is in progress
+	time.Sleep(50 * time.Millisecond)
+	cluster.Backend().Delete(ctx, sessionID)
+
+	// Request might fail or succeed depending on timing
+	err = <-errCh
+	t.Logf("request result during deletion: %v", err)
+	// This is acceptable behavior - the request may succeed or fail
+}
+
+// TestDistributed_EmptySessionState tests handling when session exists but has no state.
+func TestDistributed_EmptySessionState(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create session directly in backend with nil state (simulates partial init)
+	data := &SessionData{
+		UserID: "",
+		State:  nil, // No state
+	}
+	sessionID, err := cluster.Backend().Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Try to access - should handle gracefully
+	_, err = CallToolOnPod(ctx, t, cluster.Pod("A"), sessionID, "echo", map[string]any{"msg": "test"})
+	// This may fail because session isn't fully initialized, which is acceptable
+	t.Logf("request with empty state: %v", err)
+}
+
+// TestDistributed_LargeMessageDelivery tests that large messages are delivered correctly.
+func TestDistributed_LargeMessageDelivery(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 1)
+	defer cluster.Close()
+
+	// Create session
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Wait for state to persist
+	time.Sleep(50 * time.Millisecond)
+
+	// Clear local session
+	cluster.Pod("A").ClearLocalSession(sessionID)
+
+	// Open SSE stream
+	sse := OpenSSEStream(ctx, t, cluster.Pod("A"), sessionID)
+	defer sse.Close()
+
+	// Give SSE time to establish
+	time.Sleep(100 * time.Millisecond)
+
+	// Publish a large message
+	largeData := strings.Repeat("x", 100000) // 100KB
+	largeMsg := fmt.Sprintf(`{"jsonrpc":"2.0","method":"test","params":{"data":"%s"}}`, largeData)
+	if err := cluster.Backend().Publish(ctx, sessionID, []byte(largeMsg)); err != nil {
+		t.Fatalf("Publish failed: %v", err)
+	}
+
+	// Verify message received
+	evt, err := sse.NextEvent(2 * time.Second)
+	if err != nil {
+		t.Fatalf("failed to receive large message: %v", err)
+	}
+	if len(evt.Data) < 100000 {
+		t.Errorf("message truncated: got %d bytes, want at least 100000", len(evt.Data))
+	}
+}
+
+// =============================================================================
+// Cross-Pod Message Routing Tests
+// =============================================================================
+
+// TestDistributed_CrossPodNotificationRouting tests that notifications sent from
+// a pod that doesn't own the SSE stream are routed via the backend to the SSE owner.
+func TestDistributed_CrossPodNotificationRouting(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create session on Pod A
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	// Wait for state to persist
+	time.Sleep(50 * time.Millisecond)
+
+	// Clear Pod A's local session so we can control SSE separately
+	cluster.Pod("A").ClearLocalSession(sessionID)
+
+	// Open SSE stream on Pod A - Pod A becomes SSE owner
+	sse := OpenSSEStream(ctx, t, cluster.Pod("A"), sessionID)
+	defer sse.Close()
+
+	// Give SSE time to establish and subscribe
+	time.Sleep(100 * time.Millisecond)
+
+	// Clear operation history to track new operations
+	cluster.Backend().ClearOps()
+
+	// Make a request to Pod B - this triggers session takeover on Pod B
+	// Pod B will NOT have an SSE stream, so any server-initiated messages
+	// should be routed via the backend
+	result, err := CallToolOnPod(ctx, t, cluster.Pod("B"), sessionID, "echo", map[string]any{"msg": "from B"})
+	if err != nil {
+		t.Fatalf("CallToolOnPod failed: %v", err)
+	}
+	if len(result.Content) == 0 {
+		t.Fatal("expected content in result")
+	}
+
+	// Verify Pod B now has a local session (takeover occurred)
+	if !cluster.Pod("B").HasLocalSession(sessionID) {
+		t.Error("Pod B should have local session after takeover")
+	}
+
+	// Pod B doesn't own SSE (Pod A does), so if Pod B tries to send
+	// notifications they should go through backend.Publish.
+	// The echo tool doesn't send notifications, so let's verify the
+	// infrastructure is in place by checking that Publish is available.
+
+	// Verify that Pod A still owns SSE by publishing directly and receiving
+	testMsg := `{"jsonrpc":"2.0","method":"notifications/test","params":{}}`
+	if err := cluster.Backend().Publish(ctx, sessionID, []byte(testMsg)); err != nil {
+		t.Fatalf("Publish failed: %v", err)
+	}
+
+	// Message should arrive on Pod A's SSE
+	evt, err := sse.NextEvent(2 * time.Second)
+	if err != nil {
+		t.Fatalf("failed to receive routed message: %v", err)
+	}
+	if string(evt.Data) != testMsg {
+		t.Errorf("message mismatch: got %q, want %q", string(evt.Data), testMsg)
+	}
+}
+
+// TestDistributed_WriteRoutesViaPublish tests that Write() on a non-SSE-owning
+// pod routes messages through the backend's Publish function.
+func TestDistributed_WriteRoutesViaPublish(t *testing.T) {
+	ctx := context.Background()
+	cluster := NewTestCluster(t, 2)
+	defer cluster.Close()
+
+	// Create session on Pod A and establish SSE
+	session, sessionID := ConnectTestClient(ctx, t, cluster.Pod("A"), nil)
+	defer session.Close()
+
+	time.Sleep(50 * time.Millisecond)
+
+	// Clear Pod A's session to control SSE separately
+	cluster.Pod("A").ClearLocalSession(sessionID)
+
+	// Pod A opens SSE - becomes owner
+	sse := OpenSSEStream(ctx, t, cluster.Pod("A"), sessionID)
+	defer sse.Close()
+
+	time.Sleep(100 * time.Millisecond)
+
+	// Create session takeover on Pod B (no SSE)
+	_, err := CallToolOnPod(ctx, t, cluster.Pod("B"), sessionID, "echo", map[string]any{"msg": "trigger takeover"})
+	if err != nil {
+		t.Fatalf("CallToolOnPod failed: %v", err)
+	}
+
+	// Clear ops to track new Publish calls
+	cluster.Backend().ClearOps()
+
+	// Publish directly to backend - simulates what Write() would do when not owning SSE
+	testNotification := `{"jsonrpc":"2.0","method":"notifications/resources/updated","params":{"uri":"test://resource"}}`
+	if err := cluster.Backend().Publish(ctx, sessionID, []byte(testNotification)); err != nil {
+		t.Fatalf("Publish failed: %v", err)
+	}
+
+	// Verify Publish was called
+	ops := cluster.Backend().GetOps()
+	publishCount := 0
+	for _, op := range ops {
+		if op.Op == "Publish" && op.SessionID == sessionID {
+			publishCount++
+		}
+	}
+	if publishCount == 0 {
+		t.Error("expected Publish to be called")
+	}
+
+	// Verify message delivered via SSE
+	evt, err := sse.NextEvent(2 * time.Second)
+	if err != nil {
+		t.Fatalf("failed to receive message: %v", err)
+	}
+	if string(evt.Data) != testNotification {
+		t.Errorf("message mismatch: got %q, want %q", string(evt.Data), testNotification)
+	}
+}
diff --git a/mcp/distributed_testutil_test.go b/mcp/distributed_testutil_test.go
new file mode 100644
index 00000000..05f01157
--- /dev/null
+++ b/mcp/distributed_testutil_test.go
@@ -0,0 +1,711 @@
+// Copyright 2025 The Go MCP SDK Authors. All rights reserved.
+// Use of this source code is governed by an MIT-style
+// license that can be found in the LICENSE file.
+
+// FORK: distributed-sessions
+// Test utilities for distributed session testing.
+
+package mcp
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+)
+
+// TestCluster manages multiple simulated pods sharing a SessionBackend.
+type TestCluster struct {
+	t       *testing.T
+	pods    map[string]*TestPod
+	backend *InstrumentedBackend
+	server  *Server // Shared server instance
+}
+
+// TestPod represents a single server replica in the test cluster.
+type TestPod struct {
+	Name    string
+	handler *StreamableHTTPHandler
+	httpSrv *httptest.Server
+	URL     string
+}
+
+// NewTestCluster creates a test cluster with the specified number of pods.
+func NewTestCluster(t *testing.T, numPods int) *TestCluster {
+	t.Helper()
+
+	backend := NewInstrumentedBackend()
+
+	// Create shared server with test tools
+	server := NewServer(testImpl, nil)
+	AddTool(server, &Tool{Name: "echo", Description: "echo input"}, echoTool)
+	AddTool(server, &Tool{Name: "getState", Description: "get session state"}, getStateTool)
+
+	cluster := &TestCluster{
+		t:       t,
+		pods:    make(map[string]*TestPod),
+		backend: backend,
+		server:  server,
+	}
+
+	// Create pods A, B, C, ...
+	for i := 0; i < numPods; i++ {
+		name := string(rune('A' + i))
+		cluster.createPod(name)
+	}
+
+	return cluster
+}
+
+func (c *TestCluster) createPod(name string) *TestPod {
+	handler := NewStreamableHTTPHandler(
+		func(req *http.Request) *Server { return c.server },
+		&StreamableHTTPOptions{
+			SessionBackend: c.backend,
+			Logger:         slog.Default(),
+		},
+	)
+
+	httpSrv := httptest.NewServer(handler)
+
+	pod := &TestPod{
+		Name:    name,
+		handler: handler,
+		httpSrv: httpSrv,
+		URL:     httpSrv.URL,
+	}
+	c.pods[name] = pod
+	return pod
+}
+
+// Pod returns the pod with the given name.
+func (c *TestCluster) Pod(name string) *TestPod {
+	pod, ok := c.pods[name]
+	if !ok {
+		c.t.Fatalf("pod %q not found", name)
+	}
+	return pod
+}
+
+// Backend returns the instrumented backend for verification.
+func (c *TestCluster) Backend() *InstrumentedBackend {
+	return c.backend
+}
+
+// Close shuts down all pods.
+func (c *TestCluster) Close() {
+	for _, pod := range c.pods {
+		pod.httpSrv.Close()
+		pod.handler.closeAll()
+	}
+}
+
+// ClearLocalSession removes a session from a pod's local cache.
+// This simulates a pod restart.
+func (p *TestPod) ClearLocalSession(sessionID string) {
+	p.handler.mu.Lock()
+	defer p.handler.mu.Unlock()
+	delete(p.handler.sessions, sessionID)
+}
+
+// HasLocalSession checks if a session is in the pod's local cache.
+func (p *TestPod) HasLocalSession(sessionID string) bool {
+	p.handler.mu.Lock()
+	defer p.handler.mu.Unlock()
+	_, ok := p.handler.sessions[sessionID]
+	return ok
+}
+
+// Tool handlers for tests
+func echoTool(ctx context.Context, req *CallToolRequest, args struct {
+	Msg string `json:"msg"`
+}) (*CallToolResult, any, error) {
+	return &CallToolResult{
+		Content: []Content{&TextContent{Text: args.Msg}},
+	}, nil, nil
+}
+
+func getStateTool(ctx context.Context, req *CallToolRequest, args any) (*CallToolResult, any, error) {
+	// Return a simple response - we can't easily access session state from here
+	return &CallToolResult{
+		Content: []Content{&TextContent{Text: "ok"}},
+	}, nil, nil
+}
+
+// InstrumentedBackend wraps MemorySessionBackend with instrumentation and
+// failure injection for testing error handling paths.
+type InstrumentedBackend struct {
+	*MemorySessionBackend
+
+	mu         sync.Mutex
+	ops        []BackendOp
+	subCount   map[string]int // sessionID -> active subscriber count
+	superseded map[string]chan struct{}
+
+	// Failure injection
+	failGet       error // If set, Get() returns this error
+	failUpdate    error // If set, Update() returns this error
+	failTouch     error // If set, Touch() returns this error
+	failPublish   error // If set, Publish() returns this error
+	failSubscribe error // If set, Subscribe() returns this error immediately
+
+	// Conditional failures
+	failGetAfterN    int // Fail Get after N successful calls (-1 to disable)
+	failUpdateAfterN int // Fail Update after N successful calls (-1 to disable)
+	getCount         int
+	updateCount      int
+
+	// Latency injection
+	getLatency    time.Duration
+	updateLatency time.Duration
+}
+
+// BackendOp records a backend operation for verification.
+type BackendOp struct {
+	Op        string
+	SessionID string
+	Error     error
+	Timestamp time.Time
+}
+
+// NewInstrumentedBackend creates a new instrumented backend.
+func NewInstrumentedBackend() *InstrumentedBackend {
+	return &InstrumentedBackend{
+		MemorySessionBackend: NewMemorySessionBackend(),
+		subCount:             make(map[string]int),
+		superseded:           make(map[string]chan struct{}),
+		failGetAfterN:        -1,
+		failUpdateAfterN:     -1,
+	}
+}
+
+func (b *InstrumentedBackend) recordOp(op, sessionID string, err error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.ops = append(b.ops, BackendOp{Op: op, SessionID: sessionID, Error: err, Timestamp: time.Now()})
+}
+
+// SetFailGet configures Get to fail with the given error.
+func (b *InstrumentedBackend) SetFailGet(err error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.failGet = err
+}
+
+// SetFailUpdate configures Update to fail with the given error.
+func (b *InstrumentedBackend) SetFailUpdate(err error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.failUpdate = err
+}
+
+// SetFailTouch configures Touch to fail with the given error.
+func (b *InstrumentedBackend) SetFailTouch(err error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.failTouch = err
+}
+
+// SetFailPublish configures Publish to fail with the given error.
+func (b *InstrumentedBackend) SetFailPublish(err error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.failPublish = err
+}
+
+// SetFailSubscribe configures Subscribe to fail immediately with the given error.
+func (b *InstrumentedBackend) SetFailSubscribe(err error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.failSubscribe = err
+}
+
+// SetFailGetAfterN configures Get to fail after N successful calls.
+func (b *InstrumentedBackend) SetFailGetAfterN(n int, err error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.failGetAfterN = n
+	b.failGet = err
+	b.getCount = 0
+}
+
+// SetFailUpdateAfterN configures Update to fail after N successful calls.
+func (b *InstrumentedBackend) SetFailUpdateAfterN(n int, err error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.failUpdateAfterN = n
+	b.failUpdate = err
+	b.updateCount = 0
+}
+
+// SetGetLatency configures latency for Get operations.
+func (b *InstrumentedBackend) SetGetLatency(d time.Duration) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.getLatency = d
+}
+
+// SetUpdateLatency configures latency for Update operations.
+func (b *InstrumentedBackend) SetUpdateLatency(d time.Duration) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.updateLatency = d
+}
+
+// ClearFailures removes all failure injection.
+func (b *InstrumentedBackend) ClearFailures() {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.failGet = nil
+	b.failUpdate = nil
+	b.failTouch = nil
+	b.failPublish = nil
+	b.failSubscribe = nil
+	b.failGetAfterN = -1
+	b.failUpdateAfterN = -1
+	b.getLatency = 0
+	b.updateLatency = 0
+}
+
+func (b *InstrumentedBackend) Create(ctx context.Context, data *SessionData) (string, error) {
+	id, err := b.MemorySessionBackend.Create(ctx, data)
+	b.recordOp("Create", id, err)
+	return id, err
+}
+
+func (b *InstrumentedBackend) Get(ctx context.Context, id string) (*SessionData, error) {
+	b.mu.Lock()
+	latency := b.getLatency
+	failErr := b.failGet
+	failAfterN := b.failGetAfterN
+	b.getCount++
+	count := b.getCount
+	b.mu.Unlock()
+
+	// Apply latency
+	if latency > 0 {
+		time.Sleep(latency)
+	}
+
+	// Check for conditional failure
+	if failAfterN >= 0 && count > failAfterN && failErr != nil {
+		b.recordOp("Get", id, failErr)
+		return nil, failErr
+	}
+
+	// Check for unconditional failure
+	if failAfterN < 0 && failErr != nil {
+		b.recordOp("Get", id, failErr)
+		return nil, failErr
+	}
+
+	data, err := b.MemorySessionBackend.Get(ctx, id)
+	b.recordOp("Get", id, err)
+	return data, err
+}
+
+func (b *InstrumentedBackend) Update(ctx context.Context, id string, data *SessionData) error {
+	b.mu.Lock()
+	latency := b.updateLatency
+	failErr := b.failUpdate
+	failAfterN := b.failUpdateAfterN
+	b.updateCount++
+	count := b.updateCount
+	b.mu.Unlock()
+
+	// Apply latency
+	if latency > 0 {
+		time.Sleep(latency)
+	}
+
+	// Check for conditional failure
+	if failAfterN >= 0 && count > failAfterN && failErr != nil {
+		b.recordOp("Update", id, failErr)
+		return failErr
+	}
+
+	// Check for unconditional failure
+	if failAfterN < 0 && failErr != nil {
+		b.recordOp("Update", id, failErr)
+		return failErr
+	}
+
+	err := b.MemorySessionBackend.Update(ctx, id, data)
+	b.recordOp("Update", id, err)
+	return err
+}
+
+func (b *InstrumentedBackend) Delete(ctx context.Context, id string) error {
+	err := b.MemorySessionBackend.Delete(ctx, id)
+	b.recordOp("Delete", id, err)
+	return err
+}
+
+func (b *InstrumentedBackend) Touch(ctx context.Context, id string) error {
+	b.mu.Lock()
+	failErr := b.failTouch
+	b.mu.Unlock()
+
+	if failErr != nil {
+		b.recordOp("Touch", id, failErr)
+		return failErr
+	}
+
+	err := b.MemorySessionBackend.Touch(ctx, id)
+	b.recordOp("Touch", id, err)
+	return err
+}
+
+func (b *InstrumentedBackend) Publish(ctx context.Context, sessionID string, msg []byte) error {
+	b.mu.Lock()
+	failErr := b.failPublish
+	b.mu.Unlock()
+
+	if failErr != nil {
+		b.recordOp("Publish", sessionID, failErr)
+		return failErr
+	}
+
+	err := b.MemorySessionBackend.Publish(ctx, sessionID, msg)
+	b.recordOp("Publish", sessionID, err)
+	return err
+}
+
+func (b *InstrumentedBackend) Subscribe(ctx context.Context, sessionID string, handler MessageHandler) error {
+	b.mu.Lock()
+	failErr := b.failSubscribe
+	b.mu.Unlock()
+
+	if failErr != nil {
+		b.recordOp("Subscribe", sessionID, failErr)
+		return failErr
+	}
+	// Track subscriber count
+	b.mu.Lock()
+	b.subCount[sessionID]++
+	count := b.subCount[sessionID]
+
+	// If there's already a subscriber, signal it to close
+	if count > 1 {
+		if ch, ok := b.superseded[sessionID]; ok {
+			close(ch)
+		}
+	}
+
+	// Create our own supersede channel
+	supersedeCh := make(chan struct{})
+	b.superseded[sessionID] = supersedeCh
+	b.mu.Unlock()
+
+	defer func() {
+		b.mu.Lock()
+		b.subCount[sessionID]--
+		if b.superseded[sessionID] == supersedeCh {
+			delete(b.superseded, sessionID)
+		}
+		b.mu.Unlock()
+	}()
+
+	// Wrap the context to also listen for supersede signal
+	ctx, cancel := context.WithCancel(ctx)
+	defer cancel()
+
+	go func() {
+		select {
+		case <-supersedeCh:
+			cancel() // Will cause Subscribe to return
+		case <-ctx.Done():
+		}
+	}()
+
+	err := b.MemorySessionBackend.Subscribe(ctx, sessionID, handler)
+
+	// Check if we were superseded
+	select {
+	case <-supersedeCh:
+		b.recordOp("Subscribe", sessionID, ErrSubscriptionSuperseded)
+		return ErrSubscriptionSuperseded
+	default:
+		b.recordOp("Subscribe", sessionID, err)
+		return err
+	}
+}
+
+// GetSubscriberCount returns the number of active subscribers for a session.
+func (b *InstrumentedBackend) GetSubscriberCount(sessionID string) int {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	return b.subCount[sessionID]
+}
+
+// AssertSingleSubscriber verifies only one subscriber exists for a session.
+func (b *InstrumentedBackend) AssertSingleSubscriber(t *testing.T, sessionID string) {
+	t.Helper()
+	count := b.GetSubscriberCount(sessionID)
+	if count > 1 {
+		t.Errorf("expected at most 1 subscriber for session %q, got %d", sessionID, count)
+	}
+}
+
+// GetOps returns all recorded operations.
+func (b *InstrumentedBackend) GetOps() []BackendOp {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	return append([]BackendOp(nil), b.ops...)
+}
+
+// ClearOps clears the operation history.
+func (b *InstrumentedBackend) ClearOps() {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.ops = nil
+}
+
+// TestClientOptions configures a test client.
+type TestClientOptions struct {
+	UserID string
+}
+
+// ConnectTestClient creates and connects a test client to a pod.
+func ConnectTestClient(ctx context.Context, t *testing.T, pod *TestPod, opts *TestClientOptions) (*ClientSession, string) {
+	t.Helper()
+
+	transport := &StreamableClientTransport{
+		Endpoint: pod.URL,
+	}
+
+	client := NewClient(testImpl, nil)
+
+	session, err := client.Connect(ctx, transport, nil)
+	if err != nil {
+		t.Fatalf("client.Connect() failed: %v", err)
+	}
+
+	sessionID := session.ID()
+	if sessionID == "" {
+		t.Fatal("empty session ID")
+	}
+
+	return session, sessionID
+}
+
+// SSEStream represents an open SSE connection for testing.
+// It allows direct control over SSE stream lifecycle and event monitoring.
+type SSEStream struct {
+	t      *testing.T
+	resp   *http.Response
+	events chan Event
+	errors chan error
+	done   chan struct{}
+	closed bool
+	mu     sync.Mutex
+	cancel context.CancelFunc
+}
+
+// OpenSSEStream opens an SSE stream to a pod for the given session.
+// This simulates a client establishing the standalone SSE GET connection.
+func OpenSSEStream(ctx context.Context, t *testing.T, pod *TestPod, sessionID string) *SSEStream {
+	t.Helper()
+
+	ctx, cancel := context.WithCancel(ctx)
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, pod.URL, nil)
+	if err != nil {
+		cancel()
+		t.Fatalf("failed to create SSE request: %v", err)
+	}
+	req.Header.Set("Accept", "text/event-stream")
+	req.Header.Set(sessionIDHeader, sessionID)
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		cancel()
+		t.Fatalf("SSE request failed: %v", err)
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		resp.Body.Close()
+		cancel()
+		t.Fatalf("SSE request got status %d: %s", resp.StatusCode, string(body))
+	}
+
+	stream := &SSEStream{
+		t:      t,
+		resp:   resp,
+		events: make(chan Event, 100),
+		errors: make(chan error, 1),
+		done:   make(chan struct{}),
+		cancel: cancel,
+	}
+
+	// Start goroutine to read events
+	go stream.readEvents()
+
+	return stream
+}
+
+// readEvents reads SSE events from the response body and sends them to the events channel.
+func (s *SSEStream) readEvents() {
+	defer func() {
+		s.mu.Lock()
+		if !s.closed {
+			s.closed = true
+			close(s.done)
+		}
+		s.mu.Unlock()
+		s.resp.Body.Close()
+	}()
+
+	for evt, err := range scanEvents(s.resp.Body) {
+		if err != nil {
+			select {
+			case s.errors <- err:
+			default:
+			}
+			return
+		}
+		select {
+		case s.events <- evt:
+		case <-s.done:
+			return
+		}
+	}
+}
+
+// NextEvent waits for and returns the next SSE event, or an error if timeout.
+func (s *SSEStream) NextEvent(timeout time.Duration) (*Event, error) {
+	select {
+	case evt := <-s.events:
+		return &evt, nil
+	case err := <-s.errors:
+		return nil, err
+	case <-s.done:
+		return nil, io.EOF
+	case <-time.After(timeout):
+		return nil, fmt.Errorf("timeout waiting for SSE event")
+	}
+}
+
+// IsClosed returns true if the SSE stream has been closed.
+func (s *SSEStream) IsClosed() bool {
+	select {
+	case <-s.done:
+		return true
+	default:
+		return false
+	}
+}
+
+// WaitClosed waits for the stream to close with a timeout.
+func (s *SSEStream) WaitClosed(timeout time.Duration) bool {
+	select {
+	case <-s.done:
+		return true
+	case <-time.After(timeout):
+		return false
+	}
+}
+
+// Close closes the SSE stream.
+func (s *SSEStream) Close() {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if !s.closed {
+		s.closed = true
+		s.cancel()
+		close(s.done)
+	}
+}
+
+// CallToolOnPod sends a tool call to a specific pod using an existing session ID.
+// This uses raw HTTP requests to send a request with an existing session ID.
+func CallToolOnPod(ctx context.Context, t *testing.T, pod *TestPod, sessionID string, toolName string, args map[string]any) (*CallToolResult, error) {
+	t.Helper()
+
+	// Build the JSON-RPC request
+	callReq := &CallToolParams{
+		Name:      toolName,
+		Arguments: args,
+	}
+	paramsData, err := json.Marshal(callReq)
+	if err != nil {
+		return nil, fmt.Errorf("marshal params: %w", err)
+	}
+
+	rpcReq := map[string]any{
+		"jsonrpc": "2.0",
+		"id":      1,
+		"method":  "tools/call",
+		"params":  json.RawMessage(paramsData),
+	}
+	reqData, err := json.Marshal(rpcReq)
+	if err != nil {
+		return nil, fmt.Errorf("marshal request: %w", err)
+	}
+
+	// Make HTTP request
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, pod.URL, bytes.NewReader(reqData))
+	if err != nil {
+		return nil, fmt.Errorf("create request: %w", err)
+	}
+	httpReq.Header.Set("Content-Type", "application/json")
+	httpReq.Header.Set("Accept", "application/json, text/event-stream")
+	httpReq.Header.Set(sessionIDHeader, sessionID)
+
+	resp, err := http.DefaultClient.Do(httpReq)
+	if err != nil {
+		return nil, fmt.Errorf("http request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(resp.Body)
+		return nil, fmt.Errorf("status %d: %s", resp.StatusCode, string(body))
+	}
+
+	// Parse response - handle both JSON and SSE formats
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("read response: %w", err)
+	}
+
+	// Check if this is SSE format (starts with "event:")
+	bodyStr := string(body)
+	if strings.HasPrefix(bodyStr, "event:") {
+		// Extract the data from SSE format
+		// Format: "event: message\ndata: {...}\n\n"
+		lines := strings.Split(bodyStr, "\n")
+		for _, line := range lines {
+			if strings.HasPrefix(line, "data: ") {
+				body = []byte(strings.TrimPrefix(line, "data: "))
+				break
+			}
+		}
+	}
+
+	var rpcResp struct {
+		Result *CallToolResult `json:"result"`
+		Error  *struct {
+			Code    int    `json:"code"`
+			Message string `json:"message"`
+		} `json:"error"`
+	}
+	if err := json.Unmarshal(body, &rpcResp); err != nil {
+		return nil, fmt.Errorf("unmarshal response: %w (body: %s)", err, string(body))
+	}
+
+	if rpcResp.Error != nil {
+		return nil, fmt.Errorf("rpc error %d: %s", rpcResp.Error.Code, rpcResp.Error.Message)
+	}
+
+	return rpcResp.Result, nil
+}
diff --git a/mcp/server.go b/mcp/server.go
index 207276c2..83c4953b 100644
--- a/mcp/server.go
+++ b/mcp/server.go
@@ -5,11 +5,9 @@
 package mcp
 
 import (
-	"bytes"
 	"context"
 	"crypto/rand"
 	"encoding/base64"
-	"encoding/gob"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -1039,7 +1037,7 @@ func (ss *ServerSession) initialized(ctx context.Context, params *InitializedPar
 		params = new(InitializedParams)
 	}
 	var wasInit, wasInitd bool
-	ss.updateState(func(state *ServerSessionState) {
+	ss.updateState(ctx, func(state *ServerSessionState) {
 		wasInit = state.InitializeParams != nil
 		wasInitd = state.InitializedParams != nil
 		if wasInit && !wasInitd {
@@ -1113,13 +1111,14 @@ type ServerSession struct {
 	state ServerSessionState
 }
 
-func (ss *ServerSession) updateState(mut func(*ServerSessionState)) {
+// FORK: distributed-sessions - added ctx parameter
+func (ss *ServerSession) updateState(ctx context.Context, mut func(*ServerSessionState)) {
 	ss.mu.Lock()
 	mut(&ss.state)
 	copy := ss.state
 	ss.mu.Unlock()
 	if c, ok := ss.mcpConn.(serverConnection); ok {
-		c.sessionUpdated(copy)
+		c.sessionUpdated(ctx, copy)
 	}
 }
 
@@ -1457,7 +1456,7 @@ func (ss *ServerSession) initialize(ctx context.Context, params *InitializeParam
 	if params == nil {
 		return nil, fmt.Errorf("%w: \"params\" must be be provided", jsonrpc2.ErrInvalidParams)
 	}
-	ss.updateState(func(state *ServerSessionState) {
+	ss.updateState(ctx, func(state *ServerSessionState) {
 		state.InitializeParams = params
 	})
 
@@ -1485,8 +1484,8 @@ func (ss *ServerSession) cancel(context.Context, *CancelledParams) (Result, erro
 	return nil, nil
 }
 
-func (ss *ServerSession) setLevel(_ context.Context, params *SetLoggingLevelParams) (*emptyResult, error) {
-	ss.updateState(func(state *ServerSessionState) {
+func (ss *ServerSession) setLevel(ctx context.Context, params *SetLoggingLevelParams) (*emptyResult, error) {
+	ss.updateState(ctx, func(state *ServerSessionState) {
 		state.LogLevel = params.Level
 	})
 	ss.server.opts.Logger.Info("client log level set", "level", params.Level)
@@ -1527,21 +1526,20 @@ func (ss *ServerSession) startKeepalive(interval time.Duration) {
 }
 
 // pageToken is the internal structure for the opaque pagination cursor.
-// It will be Gob-encoded and then Base64-encoded for use as a string token.
+// It will be JSON-encoded and then Base64-encoded for use as a string token.
 type pageToken struct {
-	LastUID string // The unique ID of the last resource seen.
+	LastUID string `json:"last_uid"` // The unique ID of the last resource seen.
 }
 
 // encodeCursor encodes a unique identifier (UID) into a opaque pagination cursor
 // by serializing a pageToken struct.
 func encodeCursor(uid string) (string, error) {
-	var buf bytes.Buffer
 	token := pageToken{LastUID: uid}
-	encoder := gob.NewEncoder(&buf)
-	if err := encoder.Encode(token); err != nil {
+	encodedBytes, err := json.Marshal(token)
+	if err != nil {
 		return "", fmt.Errorf("failed to encode page token: %w", err)
 	}
-	return base64.URLEncoding.EncodeToString(buf.Bytes()), nil
+	return base64.URLEncoding.EncodeToString(encodedBytes), nil
 }
 
 // decodeCursor decodes an opaque pagination cursor into the original pageToken struct.
@@ -1552,9 +1550,7 @@ func decodeCursor(cursor string) (*pageToken, error) {
 	}
 
 	var token pageToken
-	buf := bytes.NewBuffer(decodedBytes)
-	decoder := gob.NewDecoder(buf)
-	if err := decoder.Decode(&token); err != nil {
+	if err := json.Unmarshal(decodedBytes, &token); err != nil {
 		return nil, fmt.Errorf("failed to decode page token: %w, cursor: %v", err, cursor)
 	}
 	return &token, nil
diff --git a/mcp/session_backend.go b/mcp/session_backend.go
new file mode 100644
index 00000000..3edce847
--- /dev/null
+++ b/mcp/session_backend.go
@@ -0,0 +1,119 @@
+// Copyright 2025 The Go MCP SDK Authors. All rights reserved.
+// Use of this source code is governed by an MIT-style
+// license that can be found in the LICENSE file.
+
+// FORK: distributed-sessions
+// This file implements the SessionBackend interface for distributed
+// session management across multiple server replicas.
+
+package mcp
+
+import (
+	"context"
+	"errors"
+)
+
+// ErrSubscriptionSuperseded is returned by Subscribe when another subscriber
+// took over the session's message stream. This typically happens during
+// failover when a new pod claims ownership of the session.
+var ErrSubscriptionSuperseded = errors.New("mcp: subscription superseded")
+
+// ErrSessionNotFound is returned when a session lookup fails because the
+// session does not exist in the backend.
+var ErrSessionNotFound = errors.New("mcp: session not found")
+
+// SessionData holds the persistent state for a distributed session.
+// This data is stored in the SessionBackend and can be retrieved by any
+// server replica.
+type SessionData struct {
+	// SessionID is the unique identifier for this session, generated by
+	// the SessionBackend.Create method.
+	SessionID string `json:"sessionId"`
+
+	// State contains the MCP protocol state for the session.
+	State *ServerSessionState `json:"state,omitempty"`
+
+	// UserID is the authenticated user ID, used to prevent session hijacking.
+	// If non-empty, subsequent requests must have the same user ID.
+	UserID string `json:"userId,omitempty"`
+}
+
+// MessageHandler is called by Subscribe for each message that needs to be
+// delivered to the session's SSE stream.
+//
+// Return nil to acknowledge the message (it will be removed from the queue).
+// Return an error to signal delivery failure (message may be redelivered).
+type MessageHandler func(ctx context.Context, msg []byte) error
+
+// SessionBackend provides session persistence and cross-pod message routing
+// for multi-replica MCP server deployments.
+//
+// Implementations must be safe for concurrent use by multiple goroutines.
+//
+// The interface combines two responsibilities:
+//  1. Session CRUD: Persisting session metadata so any replica can handle requests
+//  2. Message routing: Delivering messages from non-owner pods to the SSE-owner pod
+//
+// Example flow for a distributed deployment:
+//  1. Client connects to Pod A, which calls Create() and stores session
+//  2. Client's SSE stream connects to Pod A, which calls Subscribe()
+//  3. Client's POST request hits Pod B (via load balancer)
+//  4. Pod B calls Get() to find the session, then Publish() to route the response
+//  5. Pod A's Subscribe handler receives the message and writes to SSE
+type SessionBackend interface {
+	// Create persists a new session and returns its unique ID.
+	// The implementation generates the session ID (e.g., UUID, ULID).
+	//
+	// The returned ID must be globally unique across all sessions.
+	Create(ctx context.Context, data *SessionData) (string, error)
+
+	// Get retrieves session data by ID.
+	// Returns ErrSessionNotFound if the session does not exist.
+	Get(ctx context.Context, id string) (*SessionData, error)
+
+	// Update persists changes to an existing session.
+	// The SessionID field in data must match id.
+	// Returns ErrSessionNotFound if the session does not exist.
+	Update(ctx context.Context, id string, data *SessionData) error
+
+	// Delete removes a session from the backend.
+	// This should also clean up any associated message queues.
+	// Returns nil if the session does not exist (idempotent).
+	Delete(ctx context.Context, id string) error
+
+	// Touch updates the session's last activity timestamp without
+	// modifying other data. Called on each POST request to signal activity.
+	//
+	// Backend implementations SHOULD use this to extend the session's TTL.
+	// For example, a Redis implementation might call EXPIRE to reset the TTL.
+	// This is the primary mechanism for distributed session timeout management,
+	// since local timers are lost on pod restarts.
+	//
+	// Returns ErrSessionNotFound if the session does not exist.
+	Touch(ctx context.Context, id string) error
+
+	// Publish sends a message to the session's message queue.
+	// The message will be delivered to the Subscribe handler on the
+	// pod that owns the session's SSE stream.
+	//
+	// Messages are delivered in FIFO order per session.
+	Publish(ctx context.Context, sessionID string, msg []byte) error
+
+	// Subscribe starts receiving messages for a session.
+	//
+	// The handler is called for each message in order. Subscribe blocks
+	// until one of:
+	//   - ctx is cancelled: returns ctx.Err()
+	//   - handler returns an error: returns that error (message not acked)
+	//   - another subscriber takes over: returns ErrSubscriptionSuperseded
+	//   - session is deleted: returns ErrSessionNotFound
+	//
+	// When handler returns nil, the message is acknowledged and removed
+	// from the queue. When handler returns an error, the message may be
+	// redelivered to another subscriber.
+	//
+	// Only one subscriber should be active per session at a time.
+	// Implementations may enforce this by returning ErrSubscriptionSuperseded
+	// to the previous subscriber when a new one connects.
+	Subscribe(ctx context.Context, sessionID string, handler MessageHandler) error
+}
diff --git a/mcp/session_backend_integration.go b/mcp/session_backend_integration.go
new file mode 100644
index 00000000..c12f15ab
--- /dev/null
+++ b/mcp/session_backend_integration.go
@@ -0,0 +1,310 @@
+// Copyright 2025 The Go MCP SDK Authors. All rights reserved.
+// Use of this source code is governed by an MIT-style
+// license that can be found in the LICENSE file.
+
+// FORK: distributed-sessions
+// This file contains integration helpers for connecting the SessionBackend
+// to the StreamableHTTPHandler.
+//
+// Design: Any pod can handle any request. No sticky sessions required.
+//
+// When a request arrives for a session not in local cache:
+//  1. Load session data from backend
+//  2. Create local session with backend's state
+//  3. Handle request normally
+//  4. Local session persists for subsequent requests (with timeout)
+//
+// For SSE (GET requests):
+//  - The handling pod becomes the "SSE owner"
+//  - It subscribes to backend messages and forwards to the client
+//  - Previous owner's subscription ends (ErrSubscriptionSuperseded)
+
+package mcp
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"time"
+
+	"github.com/modelcontextprotocol/go-sdk/auth"
+)
+
+// hasSessionBackend reports whether the handler has a SessionBackend configured.
+func (h *StreamableHTTPHandler) hasSessionBackend() bool {
+	return h.opts.SessionBackend != nil
+}
+
+// lookupSessionFromBackend looks up a session, checking the local cache first,
+// then falling back to the SessionBackend if configured.
+//
+// Returns:
+//   - Local sessionInfo if found in cache
+//   - Remote marker if found in backend but not locally
+//   - nil if not found anywhere
+//   - error only for backend failures
+func (h *StreamableHTTPHandler) lookupSessionFromBackend(ctx context.Context, sessionID string) (*sessionInfo, error) {
+	// Check local cache first
+	h.mu.Lock()
+	sessInfo := h.sessions[sessionID]
+	h.mu.Unlock()
+
+	if sessInfo != nil {
+		return sessInfo, nil
+	}
+
+	// If no backend configured, session doesn't exist
+	if !h.hasSessionBackend() {
+		return nil, nil
+	}
+
+	// Check backend
+	data, err := h.opts.SessionBackend.Get(ctx, sessionID)
+	if err != nil {
+		if errors.Is(err, ErrSessionNotFound) {
+			return nil, nil
+		}
+		return nil, fmt.Errorf("backend lookup failed: %w", err)
+	}
+
+	// Session exists in backend but not locally.
+	// Return a marker so the caller can create a local session.
+	return &sessionInfo{
+		remoteSession: &remoteSessionInfo{
+			data: data,
+		},
+	}, nil
+}
+
+// remoteSessionInfo holds backend data for a session not yet local.
+type remoteSessionInfo struct {
+	data *SessionData
+}
+
+// createSessionInBackend persists a new session to the backend.
+func (h *StreamableHTTPHandler) createSessionInBackend(ctx context.Context, sessInfo *sessionInfo) error {
+	if !h.hasSessionBackend() {
+		return nil
+	}
+
+	data := &SessionData{
+		SessionID: sessInfo.transport.SessionID,
+		UserID:    sessInfo.userID,
+		// State will be updated after initialization completes
+	}
+
+	return h.opts.SessionBackend.Update(ctx, data.SessionID, data)
+}
+
+// deleteSessionFromBackend removes a session from the backend.
+func (h *StreamableHTTPHandler) deleteSessionFromBackend(ctx context.Context, sessionID string) error {
+	if !h.hasSessionBackend() {
+		return nil
+	}
+	return h.opts.SessionBackend.Delete(ctx, sessionID)
+}
+
+// touchSessionInBackend updates the session's last activity timestamp.
+func (h *StreamableHTTPHandler) touchSessionInBackend(ctx context.Context, sessionID string) error {
+	if !h.hasSessionBackend() {
+		return nil
+	}
+	return h.opts.SessionBackend.Touch(ctx, sessionID)
+}
+
+// subscribeFromBackend subscribes to messages for a session.
+func (h *StreamableHTTPHandler) subscribeFromBackend(ctx context.Context, sessionID string, handler MessageHandler) error {
+	if !h.hasSessionBackend() {
+		return errors.New("no session backend configured")
+	}
+	return h.opts.SessionBackend.Subscribe(ctx, sessionID, handler)
+}
+
+// handleRemoteSession handles a request for a session that exists in the
+// backend but not on this pod. It creates a local session from the backend
+// state and then processes the request normally.
+func (h *StreamableHTTPHandler) handleRemoteSession(w http.ResponseWriter, req *http.Request, remote *remoteSessionInfo) {
+	ctx := req.Context()
+	sessionID := remote.data.SessionID
+
+	// Security: Verify user ID to prevent session hijacking
+	if remote.data.UserID != "" {
+		tokenInfo := auth.TokenInfoFromContext(ctx)
+		if tokenInfo == nil || tokenInfo.UserID != remote.data.UserID {
+			http.Error(w, "session user mismatch", http.StatusForbidden)
+			return
+		}
+	}
+
+	// DELETE can be handled without creating a local session
+	if req.Method == http.MethodDelete {
+		if err := h.deleteSessionFromBackend(ctx, sessionID); err != nil {
+			h.opts.Logger.Error("failed to delete session from backend", "error", err, "sessionID", sessionID)
+			http.Error(w, "failed to delete session", http.StatusInternalServerError)
+			return
+		}
+		w.WriteHeader(http.StatusNoContent)
+		return
+	}
+
+	// For GET/POST, create a local session from backend state
+	server := h.getServer(req)
+	if server == nil {
+		http.Error(w, "no server available", http.StatusBadRequest)
+		return
+	}
+
+	sessInfo, err := h.createLocalSessionFromBackend(ctx, server, remote.data)
+	if err != nil {
+		h.opts.Logger.Error("failed to create local session", "error", err, "sessionID", sessionID)
+		http.Error(w, "failed to restore session", http.StatusInternalServerError)
+		return
+	}
+
+	// Touch backend to extend TTL
+	if err := h.touchSessionInBackend(ctx, sessionID); err != nil {
+		h.opts.Logger.Warn("failed to touch session in backend", "error", err, "sessionID", sessionID)
+	}
+
+	// Now handle the request using the local session
+	if req.Method == http.MethodPost {
+		sessInfo.startPOST()
+		defer sessInfo.endPOST()
+	}
+	sessInfo.transport.ServeHTTP(w, req)
+}
+
+// createLocalSessionFromBackend creates local runtime state for a session
+// that exists in the backend. This allows any pod to handle requests for
+// any session.
+func (h *StreamableHTTPHandler) createLocalSessionFromBackend(ctx context.Context, server *Server, data *SessionData) (*sessionInfo, error) {
+	sessionID := data.SessionID
+
+	// Double-check local cache (race condition protection)
+	h.mu.Lock()
+	if existing := h.sessions[sessionID]; existing != nil {
+		h.mu.Unlock()
+		return existing, nil
+	}
+	h.mu.Unlock()
+
+	// Create transport with the existing session ID
+	transport := &StreamableServerTransport{
+		SessionID:    sessionID,
+		Stateless:    h.opts.Stateless,
+		EventStore:   h.opts.EventStore,
+		jsonResponse: h.opts.JSONResponse,
+		logger:       h.opts.Logger,
+	}
+
+	// Set up message routing: when SSE starts, subscribe to backend messages
+	transport.OnSSEStart = func(ctx context.Context, writer func(data []byte) error, closeSSE func()) {
+		// This callback blocks - it runs in a goroutine spawned by the caller.
+		err := h.subscribeFromBackend(ctx, sessionID, func(msgCtx context.Context, msg []byte) error {
+			return writer(msg)
+		})
+		if err != nil && ctx.Err() == nil {
+			h.opts.Logger.Error("backend subscription ended", "error", err, "sessionID", sessionID)
+			// If subscription was superseded, close the SSE stream
+			if errors.Is(err, ErrSubscriptionSuperseded) {
+				closeSSE()
+			}
+		}
+	}
+
+	// Set up state persistence: when state changes, persist to backend
+	backendUserID := data.UserID
+	transport.OnStateChange = func(ctx context.Context, state ServerSessionState) error {
+		// This callback blocks - it runs in a goroutine spawned by the caller.
+		// Use a timeout derived from the request context.
+		ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+		defer cancel()
+		return h.updateSessionStateInBackend(ctx, sessionID, backendUserID, &state)
+	}
+
+	// Set up message routing: route messages when not SSE owner
+	transport.OnPublish = func(ctx context.Context, sid string, msg []byte) error {
+		return h.opts.SessionBackend.Publish(ctx, sid, msg)
+	}
+
+	// Use backend state if available, otherwise create minimal initialized state
+	var state *ServerSessionState
+	if data.State != nil {
+		state = data.State
+	} else {
+		// Session exists but state wasn't stored - create minimal state
+		// This happens if the session was created but Initialize hasn't completed
+		state = &ServerSessionState{}
+	}
+
+	// Connect with pre-initialized state
+	connectOpts := &ServerSessionOptions{
+		State: state,
+		onClose: func() {
+			h.mu.Lock()
+			defer h.mu.Unlock()
+			if info, ok := h.sessions[sessionID]; ok {
+				info.stopTimer()
+				delete(h.sessions, sessionID)
+				if h.onTransportDeletion != nil {
+					h.onTransportDeletion(sessionID)
+				}
+				// Note: We don't delete from backend here - the session might
+				// be accessed by another pod. Let TTL or explicit DELETE handle it.
+			}
+		},
+	}
+
+	session, err := server.Connect(ctx, transport, connectOpts)
+	if err != nil {
+		return nil, fmt.Errorf("server connect failed: %w", err)
+	}
+
+	sessInfo := &sessionInfo{
+		session:   session,
+		transport: transport,
+		userID:    data.UserID,
+	}
+
+	// Set up timeout if configured
+	if h.opts.SessionTimeout > 0 {
+		sessInfo.timeout = h.opts.SessionTimeout
+		sessInfo.timer = time.AfterFunc(sessInfo.timeout, func() {
+			sessInfo.session.Close()
+		})
+	}
+
+	// Store in local cache
+	h.mu.Lock()
+	// Final race check
+	if existing := h.sessions[sessionID]; existing != nil {
+		h.mu.Unlock()
+		session.Close() // Clean up the one we just created
+		return existing, nil
+	}
+	h.sessions[sessionID] = sessInfo
+	h.mu.Unlock()
+
+	h.opts.Logger.Debug("created local session from backend", "sessionID", sessionID)
+	return sessInfo, nil
+}
+
+// updateSessionStateInBackend persists session state changes to the backend.
+// This should be called when session state changes (e.g., after Initialize).
+//
+// FORK: distributed-sessions - This uses a direct Update with reconstructed
+// SessionData instead of a read-modify-write cycle to avoid race conditions
+// when multiple goroutines or pods update state concurrently.
+func (h *StreamableHTTPHandler) updateSessionStateInBackend(ctx context.Context, sessionID string, userID string, state *ServerSessionState) error {
+	if !h.hasSessionBackend() {
+		return nil
+	}
+
+	data := &SessionData{
+		SessionID: sessionID,
+		State:     state,
+		UserID:    userID,
+	}
+	return h.opts.SessionBackend.Update(ctx, sessionID, data)
+}
diff --git a/mcp/session_backend_memory.go b/mcp/session_backend_memory.go
new file mode 100644
index 00000000..f6558d14
--- /dev/null
+++ b/mcp/session_backend_memory.go
@@ -0,0 +1,190 @@
+// Copyright 2025 The Go MCP SDK Authors. All rights reserved.
+// Use of this source code is governed by an MIT-style
+// license that can be found in the LICENSE file.
+
+// FORK: distributed-sessions
+// In-memory implementation of SessionBackend for development and testing.
+
+package mcp
+
+import (
+	"context"
+	"crypto/rand"
+	"sync"
+)
+
+// MemorySessionBackend is an in-memory implementation of SessionBackend.
+//
+// This implementation is suitable for:
+//   - Development and testing
+//   - Single-replica deployments
+//   - Prototyping before implementing a production backend
+//
+// Limitations:
+//   - Data is not persisted across restarts
+//   - Does not work across multiple processes/pods
+//   - No TTL enforcement (Touch is a no-op)
+//   - Messages are lost if no subscriber is active (no persistence)
+//
+// For production multi-replica deployments, implement SessionBackend
+// using Redis, PostgreSQL, or another distributed data store.
+type MemorySessionBackend struct {
+	mu       sync.RWMutex
+	sessions map[string]*SessionData
+	subs     map[string]*subscription
+}
+
+// subscription tracks the active subscriber for a session.
+type subscription struct {
+	ch     chan []byte
+	cancel context.CancelFunc
+}
+
+// NewMemorySessionBackend creates a new in-memory session backend.
+func NewMemorySessionBackend() *MemorySessionBackend {
+	return &MemorySessionBackend{
+		sessions: make(map[string]*SessionData),
+		subs:     make(map[string]*subscription),
+	}
+}
+
+// Create implements SessionBackend.Create.
+func (m *MemorySessionBackend) Create(ctx context.Context, data *SessionData) (string, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	id := rand.Text()
+	data.SessionID = id
+	m.sessions[id] = data
+	return id, nil
+}
+
+// Get implements SessionBackend.Get.
+func (m *MemorySessionBackend) Get(ctx context.Context, id string) (*SessionData, error) {
+	m.mu.RLock()
+	defer m.mu.RUnlock()
+
+	data, ok := m.sessions[id]
+	if !ok {
+		return nil, ErrSessionNotFound
+	}
+	// Return a copy to prevent mutation
+	copy := *data
+	if data.State != nil {
+		stateCopy := *data.State
+		copy.State = &stateCopy
+	}
+	return &copy, nil
+}
+
+// Update implements SessionBackend.Update.
+func (m *MemorySessionBackend) Update(ctx context.Context, id string, data *SessionData) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	if _, ok := m.sessions[id]; !ok {
+		return ErrSessionNotFound
+	}
+	m.sessions[id] = data
+	return nil
+}
+
+// Delete implements SessionBackend.Delete.
+func (m *MemorySessionBackend) Delete(ctx context.Context, id string) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	delete(m.sessions, id)
+
+	// Close subscriber channel if exists
+	if sub, ok := m.subs[id]; ok {
+		close(sub.ch)
+		delete(m.subs, id)
+	}
+
+	return nil
+}
+
+// Touch implements SessionBackend.Touch.
+// In this implementation, Touch is a no-op since there's no TTL.
+func (m *MemorySessionBackend) Touch(ctx context.Context, id string) error {
+	m.mu.RLock()
+	defer m.mu.RUnlock()
+
+	if _, ok := m.sessions[id]; !ok {
+		return ErrSessionNotFound
+	}
+	return nil
+}
+
+// Publish implements SessionBackend.Publish.
+func (m *MemorySessionBackend) Publish(ctx context.Context, sessionID string, msg []byte) error {
+	m.mu.RLock()
+	sub := m.subs[sessionID]
+	m.mu.RUnlock()
+
+	if sub == nil {
+		// No subscriber - in production, you might queue the message
+		return nil
+	}
+
+	// Non-blocking send
+	select {
+	case sub.ch <- msg:
+		return nil
+	default:
+		// Channel full - drop message (in production, handle backpressure)
+		return nil
+	}
+}
+
+// Subscribe implements SessionBackend.Subscribe.
+// Only one subscriber is allowed per session. New subscribers supersede old ones.
+func (m *MemorySessionBackend) Subscribe(ctx context.Context, sessionID string, handler MessageHandler) error {
+	ch := make(chan []byte, 100)
+
+	// Create cancellable context for this subscription
+	subCtx, subCancel := context.WithCancel(ctx)
+
+	m.mu.Lock()
+	// Supersede existing subscriber
+	if existing, ok := m.subs[sessionID]; ok {
+		close(existing.ch) // Will cause existing subscriber to return ErrSubscriptionSuperseded
+	}
+	m.subs[sessionID] = &subscription{ch: ch, cancel: subCancel}
+	m.mu.Unlock()
+
+	defer func() {
+		subCancel()
+		m.mu.Lock()
+		// Only remove if we're still the active subscriber
+		if sub, ok := m.subs[sessionID]; ok && sub.ch == ch {
+			delete(m.subs, sessionID)
+		}
+		m.mu.Unlock()
+	}()
+
+	for {
+		select {
+		case <-subCtx.Done():
+			return subCtx.Err()
+		case msg, ok := <-ch:
+			if !ok {
+				// Channel closed - either session deleted or superseded
+				m.mu.RLock()
+				_, sessionExists := m.sessions[sessionID]
+				m.mu.RUnlock()
+				if !sessionExists {
+					return ErrSessionNotFound
+				}
+				return ErrSubscriptionSuperseded
+			}
+			if err := handler(subCtx, msg); err != nil {
+				return err
+			}
+		}
+	}
+}
+
+// Verify MemorySessionBackend implements SessionBackend
+var _ SessionBackend = (*MemorySessionBackend)(nil)
diff --git a/mcp/session_backend_test.go b/mcp/session_backend_test.go
new file mode 100644
index 00000000..dd3c1683
--- /dev/null
+++ b/mcp/session_backend_test.go
@@ -0,0 +1,380 @@
+// Copyright 2025 The Go MCP SDK Authors. All rights reserved.
+// Use of this source code is governed by an MIT-style
+// license that can be found in the LICENSE file.
+
+// FORK: distributed-sessions
+// Tests for the SessionBackend interface and MemorySessionBackend implementation.
+
+package mcp
+
+import (
+	"context"
+	"testing"
+	"time"
+)
+
+func TestMemorySessionBackend_CreateAndGet(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx := context.Background()
+
+	// Create a session
+	data := &SessionData{
+		UserID: "user123",
+		State: &ServerSessionState{
+			LogLevel: "info",
+		},
+	}
+
+	id, err := backend.Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+	if id == "" {
+		t.Fatal("Create returned empty ID")
+	}
+
+	// Get the session
+	retrieved, err := backend.Get(ctx, id)
+	if err != nil {
+		t.Fatalf("Get failed: %v", err)
+	}
+	if retrieved.SessionID != id {
+		t.Errorf("SessionID mismatch: got %q, want %q", retrieved.SessionID, id)
+	}
+	if retrieved.UserID != "user123" {
+		t.Errorf("UserID mismatch: got %q, want %q", retrieved.UserID, "user123")
+	}
+}
+
+func TestMemorySessionBackend_GetReturnsDefensiveCopy(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx := context.Background()
+
+	// Create a session with state
+	data := &SessionData{
+		UserID: "user123",
+		State:  &ServerSessionState{LogLevel: "info"},
+	}
+	id, err := backend.Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Get and modify
+	retrieved, _ := backend.Get(ctx, id)
+	retrieved.UserID = "modified"
+	retrieved.State.LogLevel = "debug"
+
+	// Get again - should not see modifications
+	retrieved2, _ := backend.Get(ctx, id)
+	if retrieved2.UserID != "user123" {
+		t.Errorf("Get returned mutable reference: UserID was modified")
+	}
+	if retrieved2.State.LogLevel != "info" {
+		t.Errorf("Get returned mutable reference: LogLevel was modified")
+	}
+}
+
+func TestMemorySessionBackend_GetNotFound(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx := context.Background()
+
+	_, err := backend.Get(ctx, "nonexistent")
+	if err != ErrSessionNotFound {
+		t.Errorf("Expected ErrSessionNotFound, got: %v", err)
+	}
+}
+
+func TestMemorySessionBackend_Update(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx := context.Background()
+
+	// Create a session
+	data := &SessionData{UserID: "user123"}
+	id, err := backend.Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Update the session
+	updated := &SessionData{
+		SessionID: id,
+		UserID:    "user456",
+	}
+	if err := backend.Update(ctx, id, updated); err != nil {
+		t.Fatalf("Update failed: %v", err)
+	}
+
+	// Verify the update
+	retrieved, err := backend.Get(ctx, id)
+	if err != nil {
+		t.Fatalf("Get failed: %v", err)
+	}
+	if retrieved.UserID != "user456" {
+		t.Errorf("UserID not updated: got %q, want %q", retrieved.UserID, "user456")
+	}
+}
+
+func TestMemorySessionBackend_UpdateNotFound(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx := context.Background()
+
+	err := backend.Update(ctx, "nonexistent", &SessionData{})
+	if err != ErrSessionNotFound {
+		t.Errorf("Expected ErrSessionNotFound, got: %v", err)
+	}
+}
+
+func TestMemorySessionBackend_Delete(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx := context.Background()
+
+	// Create a session
+	data := &SessionData{UserID: "user123"}
+	id, err := backend.Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Delete the session
+	if err := backend.Delete(ctx, id); err != nil {
+		t.Fatalf("Delete failed: %v", err)
+	}
+
+	// Verify deletion
+	_, err = backend.Get(ctx, id)
+	if err != ErrSessionNotFound {
+		t.Errorf("Expected ErrSessionNotFound after delete, got: %v", err)
+	}
+
+	// Delete again should be idempotent
+	if err := backend.Delete(ctx, id); err != nil {
+		t.Errorf("Second delete should be idempotent, got: %v", err)
+	}
+}
+
+func TestMemorySessionBackend_TouchNotFound(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx := context.Background()
+
+	err := backend.Touch(ctx, "nonexistent")
+	if err != ErrSessionNotFound {
+		t.Errorf("Expected ErrSessionNotFound, got: %v", err)
+	}
+}
+
+func TestMemorySessionBackend_PublishSubscribe(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// Create a session
+	data := &SessionData{UserID: "user123"}
+	id, err := backend.Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Channel to collect received messages
+	received := make(chan []byte, 10)
+
+	// Start subscriber
+	subCtx, subCancel := context.WithCancel(ctx)
+	subDone := make(chan error, 1)
+	go func() {
+		err := backend.Subscribe(subCtx, id, func(ctx context.Context, msg []byte) error {
+			received <- msg
+			return nil
+		})
+		subDone <- err
+	}()
+
+	// Give subscriber time to start
+	time.Sleep(50 * time.Millisecond)
+
+	// Publish some messages
+	messages := [][]byte{
+		[]byte(`{"jsonrpc":"2.0","method":"test1"}`),
+		[]byte(`{"jsonrpc":"2.0","method":"test2"}`),
+	}
+
+	for _, msg := range messages {
+		if err := backend.Publish(ctx, id, msg); err != nil {
+			t.Fatalf("Publish failed: %v", err)
+		}
+	}
+
+	// Verify messages received
+	for i, expected := range messages {
+		select {
+		case got := <-received:
+			if string(got) != string(expected) {
+				t.Errorf("Message %d mismatch: got %q, want %q", i, string(got), string(expected))
+			}
+		case <-time.After(time.Second):
+			t.Fatalf("Timeout waiting for message %d", i)
+		}
+	}
+
+	// Cancel subscription
+	subCancel()
+
+	select {
+	case err := <-subDone:
+		if err != context.Canceled {
+			t.Errorf("Expected context.Canceled, got: %v", err)
+		}
+	case <-time.After(time.Second):
+		t.Fatal("Timeout waiting for subscription to end")
+	}
+}
+
+func TestMemorySessionBackend_SubscribeSessionDeleted(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// Create a session
+	data := &SessionData{UserID: "user123"}
+	id, err := backend.Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Start subscriber
+	subDone := make(chan error, 1)
+	go func() {
+		err := backend.Subscribe(ctx, id, func(ctx context.Context, msg []byte) error {
+			return nil
+		})
+		subDone <- err
+	}()
+
+	// Give subscriber time to start
+	time.Sleep(50 * time.Millisecond)
+
+	// Delete the session
+	if err := backend.Delete(ctx, id); err != nil {
+		t.Fatalf("Delete failed: %v", err)
+	}
+
+	// Verify subscription ends with ErrSessionNotFound
+	select {
+	case err := <-subDone:
+		if err != ErrSessionNotFound {
+			t.Errorf("Expected ErrSessionNotFound, got: %v", err)
+		}
+	case <-time.After(time.Second):
+		t.Fatal("Timeout waiting for subscription to end")
+	}
+}
+
+func TestMemorySessionBackend_SubscribeSuperseded(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// Create a session
+	data := &SessionData{UserID: "user123"}
+	id, err := backend.Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Start first subscriber
+	sub1Done := make(chan error, 1)
+	go func() {
+		err := backend.Subscribe(ctx, id, func(ctx context.Context, msg []byte) error {
+			return nil
+		})
+		sub1Done <- err
+	}()
+
+	// Give first subscriber time to start
+	time.Sleep(50 * time.Millisecond)
+
+	// Start second subscriber - should supersede first
+	sub2Ctx, sub2Cancel := context.WithCancel(ctx)
+	sub2Done := make(chan error, 1)
+	go func() {
+		err := backend.Subscribe(sub2Ctx, id, func(ctx context.Context, msg []byte) error {
+			return nil
+		})
+		sub2Done <- err
+	}()
+
+	// First subscriber should be superseded
+	select {
+	case err := <-sub1Done:
+		if err != ErrSubscriptionSuperseded {
+			t.Errorf("Expected ErrSubscriptionSuperseded for first subscriber, got: %v", err)
+		}
+	case <-time.After(time.Second):
+		t.Fatal("Timeout waiting for first subscriber to be superseded")
+	}
+
+	// Second subscriber should still be active
+	select {
+	case err := <-sub2Done:
+		t.Errorf("Second subscriber ended unexpectedly: %v", err)
+	case <-time.After(100 * time.Millisecond):
+		// Expected - subscriber is still running
+	}
+
+	// Clean up
+	sub2Cancel()
+}
+
+func TestMemorySessionBackend_PublishNoSubscriber(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx := context.Background()
+
+	// Create a session
+	data := &SessionData{UserID: "user123"}
+	id, err := backend.Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Publish without subscriber - should not error
+	err = backend.Publish(ctx, id, []byte(`{"test":"message"}`))
+	if err != nil {
+		t.Errorf("Publish without subscriber should not error, got: %v", err)
+	}
+}
+
+func TestMemorySessionBackend_ConcurrentAccess(t *testing.T) {
+	backend := NewMemorySessionBackend()
+	ctx := context.Background()
+
+	// Create a session
+	data := &SessionData{UserID: "user123"}
+	id, err := backend.Create(ctx, data)
+	if err != nil {
+		t.Fatalf("Create failed: %v", err)
+	}
+
+	// Concurrent operations
+	done := make(chan bool, 100)
+	for i := 0; i < 100; i++ {
+		go func(n int) {
+			// Mix of operations
+			switch n % 4 {
+			case 0:
+				backend.Get(ctx, id)
+			case 1:
+				backend.Update(ctx, id, &SessionData{SessionID: id, UserID: "concurrent"})
+			case 2:
+				backend.Touch(ctx, id)
+			case 3:
+				backend.Publish(ctx, id, []byte(`{"test":"concurrent"}`))
+			}
+			done <- true
+		}(i)
+	}
+
+	// Wait for all operations
+	for i := 0; i < 100; i++ {
+		<-done
+	}
+}
diff --git a/mcp/streamable.go b/mcp/streamable.go
index ceb17421..353c4561 100644
--- a/mcp/streamable.go
+++ b/mcp/streamable.go
@@ -71,6 +71,11 @@ type sessionInfo struct {
 	timerMu sync.Mutex
 	refs    int // reference count
 	timer   *time.Timer
+
+	// FORK: distributed-sessions
+	// remoteSession is set when the session exists in the backend but not locally.
+	// This is used to route requests to the correct pod.
+	remoteSession *remoteSessionInfo
 }
 
 // startPOST signals that a POST request for this session is starting (which
@@ -183,6 +188,14 @@ type StreamableHTTPOptions struct {
 	// Use `disablecrossoriginprotection` MCPGODEBUG compatibility parameter
 	// to disable the default protection until v1.6.0.
 	CrossOriginProtection *http.CrossOriginProtection
+
+	// FORK: distributed-sessions
+	// SessionBackend enables distributed session management across multiple
+	// server replicas. When set, session state is persisted to the backend
+	// and messages can be routed between pods.
+	//
+	// If nil, sessions are stored only in memory on the local pod.
+	SessionBackend SessionBackend
 }
 
 // NewStreamableHTTPHandler returns a new [StreamableHTTPHandler].
@@ -301,9 +314,14 @@ func (h *StreamableHTTPHandler) ServeHTTP(w http.ResponseWriter, req *http.Reque
 	sessionID := req.Header.Get(sessionIDHeader)
 	var sessInfo *sessionInfo
 	if sessionID != "" {
-		h.mu.Lock()
-		sessInfo = h.sessions[sessionID]
-		h.mu.Unlock()
+		// FORK: distributed-sessions - check backend if configured
+		var lookupErr error
+		sessInfo, lookupErr = h.lookupSessionFromBackend(req.Context(), sessionID)
+		if lookupErr != nil {
+			h.opts.Logger.Error("session lookup failed", "error", lookupErr, "sessionID", sessionID)
+			http.Error(w, "session lookup failed", http.StatusInternalServerError)
+			return
+		}
 		if sessInfo == nil && !h.opts.Stateless {
 			// Unless we're in 'stateless' mode, which doesn't perform any Session-ID
 			// validation, we require that the session ID matches a known session.
@@ -312,6 +330,11 @@ func (h *StreamableHTTPHandler) ServeHTTP(w http.ResponseWriter, req *http.Reque
 			http.Error(w, "session not found", http.StatusNotFound)
 			return
 		}
+		// FORK: distributed-sessions - handle remote session (session exists in backend but not locally)
+		if sessInfo != nil && sessInfo.remoteSession != nil {
+			h.handleRemoteSession(w, req, sessInfo.remoteSession)
+			return
+		}
 		// Prevent session hijacking: if the session was created with a user ID,
 		// verify that subsequent requests come from the same user.
 		if sessInfo != nil && sessInfo.userID != "" {
@@ -413,7 +436,24 @@ func (h *StreamableHTTPHandler) ServeHTTP(w http.ResponseWriter, req *http.Reque
 		if sessionID == "" {
 			// In stateless mode, sessionID may be nonempty even if there's no
 			// existing transport.
-			sessionID = server.opts.GetSessionID()
+			// FORK: distributed-sessions - use backend to generate ID if configured
+			if h.hasSessionBackend() {
+				initialData := &SessionData{} // Will be updated after session creation
+				var err error
+				sessionID, err = h.opts.SessionBackend.Create(req.Context(), initialData)
+				if err != nil {
+					h.opts.Logger.Error("failed to create session in backend", "error", err)
+					http.Error(w, "failed to create session", http.StatusInternalServerError)
+					return
+				}
+			} else {
+				sessionID = server.opts.GetSessionID()
+			}
+		}
+		// FORK: distributed-sessions - extract userID early so callbacks can capture it
+		var userID string
+		if tokenInfo := auth.TokenInfoFromContext(req.Context()); tokenInfo != nil {
+			userID = tokenInfo.UserID
 		}
 		transport := &StreamableServerTransport{
 			SessionID:    sessionID,
@@ -422,6 +462,34 @@ func (h *StreamableHTTPHandler) ServeHTTP(w http.ResponseWriter, req *http.Reque
 			jsonResponse: h.opts.JSONResponse,
 			logger:       h.opts.Logger,
 		}
+		// FORK: distributed-sessions - set up message routing callbacks
+		if h.hasSessionBackend() {
+			transport.OnSSEStart = func(ctx context.Context, writer func(data []byte) error, closeSSE func()) {
+				// This callback blocks - it runs in a goroutine spawned by the caller.
+				err := h.subscribeFromBackend(ctx, sessionID, func(ctx context.Context, msg []byte) error {
+					return writer(msg)
+				})
+				if err != nil && ctx.Err() == nil {
+					h.opts.Logger.Error("backend subscription ended", "error", err, "sessionID", sessionID)
+					// If subscription was superseded, close the SSE stream
+					if errors.Is(err, ErrSubscriptionSuperseded) {
+						closeSSE()
+					}
+				}
+			}
+			// FORK: distributed-sessions - persist state changes to backend
+			transport.OnStateChange = func(ctx context.Context, state ServerSessionState) error {
+				// This callback blocks - it runs in a goroutine spawned by the caller.
+				// Use a timeout derived from the request context.
+				ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+				defer cancel()
+				return h.updateSessionStateInBackend(ctx, sessionID, userID, &state)
+			}
+			// FORK: distributed-sessions - route messages when not SSE owner
+			transport.OnPublish = func(ctx context.Context, sid string, msg []byte) error {
+				return h.opts.SessionBackend.Publish(ctx, sid, msg)
+			}
+		}
 
 		// Sessions without a session ID are also stateless: there's no way to
 		// address them.
@@ -486,6 +554,16 @@ func (h *StreamableHTTPHandler) ServeHTTP(w http.ResponseWriter, req *http.Reque
 					if info, ok := h.sessions[transport.SessionID]; ok {
 						info.stopTimer()
 						delete(h.sessions, transport.SessionID)
+						// FORK: distributed-sessions - cleanup backend
+						if h.hasSessionBackend() {
+							// Use background context since the request context may be done.
+							// Add a timeout to avoid blocking indefinitely if the backend is slow.
+							deleteCtx, deleteCancel := context.WithTimeout(context.Background(), 10*time.Second)
+							if err := h.deleteSessionFromBackend(deleteCtx, transport.SessionID); err != nil {
+								h.opts.Logger.Error("failed to delete session from backend", "error", err, "sessionID", transport.SessionID)
+							}
+							deleteCancel()
+						}
 						if h.onTransportDeletion != nil {
 							h.onTransportDeletion(transport.SessionID)
 						}
@@ -502,12 +580,6 @@ func (h *StreamableHTTPHandler) ServeHTTP(w http.ResponseWriter, req *http.Reque
 			http.Error(w, "failed connection", http.StatusInternalServerError)
 			return
 		}
-		// Capture the user ID from the token info to enable session hijacking
-		// prevention on subsequent requests.
-		var userID string
-		if tokenInfo := auth.TokenInfoFromContext(req.Context()); tokenInfo != nil {
-			userID = tokenInfo.UserID
-		}
 		sessInfo = &sessionInfo{
 			session:   session,
 			transport: transport,
@@ -533,6 +605,13 @@ func (h *StreamableHTTPHandler) ServeHTTP(w http.ResponseWriter, req *http.Reque
 			h.mu.Lock()
 			h.sessions[transport.SessionID] = sessInfo
 			h.mu.Unlock()
+			// FORK: distributed-sessions - update backend with complete session data
+			if h.hasSessionBackend() {
+				if err := h.createSessionInBackend(req.Context(), sessInfo); err != nil {
+					h.opts.Logger.Error("failed to update session in backend", "error", err, "sessionID", transport.SessionID)
+					// Continue anyway - the session exists locally
+				}
+			}
 			defer func() {
 				// If initialization failed, clean up the session (#578).
 				if session.InitializeParams() == nil {
@@ -546,6 +625,12 @@ func (h *StreamableHTTPHandler) ServeHTTP(w http.ResponseWriter, req *http.Reque
 	if req.Method == http.MethodPost {
 		sessInfo.startPOST()
 		defer sessInfo.endPOST()
+		// FORK: distributed-sessions - extend backend TTL on activity
+		if h.hasSessionBackend() {
+			if err := h.touchSessionInBackend(req.Context(), sessionID); err != nil {
+				h.opts.Logger.Warn("failed to touch session in backend", "error", err, "sessionID", sessionID)
+			}
+		}
 	}
 
 	sessInfo.transport.ServeHTTP(w, req)
@@ -612,6 +697,34 @@ type StreamableServerTransport struct {
 	// to write their own streamable HTTP handler.
 	logger *slog.Logger
 
+	// FORK: distributed-sessions
+	// OnSSEStart is called in a goroutine when a standalone SSE stream is established
+	// (GET request). The callback receives:
+	//   - ctx: cancelled when SSE ends
+	//   - writer: writes raw message bytes to the SSE stream
+	//   - closeSSE: closes the SSE stream (e.g., when subscription is superseded)
+	//
+	// The callback may block (e.g., to run a message subscription loop).
+	// It will be called in its own goroutine by the SDK.
+	OnSSEStart func(ctx context.Context, writer func(data []byte) error, closeSSE func())
+
+	// FORK: distributed-sessions
+	// OnStateChange is called in a goroutine when the session state changes
+	// (e.g., after Initialize, setLogLevel). This allows the handler to persist
+	// state to the SessionBackend.
+	//
+	// The callback receives the request context and may block.
+	// It will be called in its own goroutine by the SDK.
+	// Return nil to indicate success, or an error which will be logged.
+	OnStateChange func(ctx context.Context, state ServerSessionState) error
+
+	// FORK: distributed-sessions
+	// OnPublish routes messages to the SSE owner when this pod is not the owner.
+	// Called when Write() targets the standalone SSE stream but no local SSE
+	// connection is active. If nil and the pod is not the owner, writes return
+	// an error.
+	OnPublish func(ctx context.Context, sessionID string, msg []byte) error
+
 	// connection is non-nil if and only if the transport has been connected.
 	connection *streamableServerConn
 }
@@ -627,6 +740,8 @@ func (t *StreamableServerTransport) Connect(ctx context.Context) (Connection, er
 		eventStore:     t.EventStore,
 		jsonResponse:   t.jsonResponse,
 		logger:         ensureLogger(t.logger), // see #556: must be non-nil
+		onStateChange:  t.OnStateChange,        // FORK: distributed-sessions
+		onPublish:      t.OnPublish,            // FORK: distributed-sessions
 		incoming:       make(chan jsonrpc.Message, 10),
 		done:           make(chan struct{}),
 		streams:        make(map[string]*stream),
@@ -652,9 +767,21 @@ type streamableServerConn struct {
 
 	logger *slog.Logger
 
+	// FORK: distributed-sessions
+	onStateChange func(ctx context.Context, state ServerSessionState) error
+
+	// FORK: distributed-sessions
+	// onPublish routes messages when this pod is not the SSE owner.
+	onPublish func(ctx context.Context, sessionID string, msg []byte) error
+
+	// FORK: distributed-sessions
+	// sseOwned tracks whether this pod owns the standalone SSE stream.
+	// Protected by mu.
+	sseOwned bool
+
 	incoming chan jsonrpc.Message // messages from the client to the server
 
-	mu sync.Mutex // guards all fields below
+	mu sync.Mutex // guards all fields below (including sseOwned)
 
 	// Sessions are closed exactly once.
 	isDone bool
@@ -685,6 +812,65 @@ func (c *streamableServerConn) SessionID() string {
 	return c.sessionID
 }
 
+// FORK: distributed-sessions
+// sessionUpdated implements serverConnection to receive state change notifications.
+// This is called by ServerSession.updateState whenever the session state changes.
+func (c *streamableServerConn) sessionUpdated(ctx context.Context, state ServerSessionState) {
+	if c.onStateChange != nil {
+		// Run callback in goroutine to avoid blocking request processing.
+		go func() {
+			if err := c.onStateChange(ctx, state); err != nil {
+				c.logger.Error("state change callback failed", "error", err, "sessionID", c.sessionID)
+			}
+		}()
+	}
+}
+
+// FORK: distributed-sessions
+// writeToStandaloneSSE writes raw message data to the standalone SSE stream.
+// This is used by distributed session backends to forward messages from other pods.
+func (c *streamableServerConn) writeToStandaloneSSE(data []byte) error {
+	c.mu.Lock()
+	s := c.streams[""] // standalone SSE stream
+	sessionClosed := c.isDone
+	c.mu.Unlock()
+
+	if s == nil {
+		return fmt.Errorf("standalone SSE stream not available")
+	}
+	if sessionClosed {
+		return fmt.Errorf("session is closed")
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if s.done == nil {
+		return fmt.Errorf("stream not connected")
+	}
+
+	// Store in eventStore before delivering
+	if c.eventStore != nil {
+		if err := c.eventStore.Append(context.Background(), c.sessionID, s.id, data); err != nil {
+			c.logger.Warn(fmt.Sprintf("failed to store routed message: %v", err))
+		}
+	}
+
+	// Compute eventID for SSE streams with event store
+	var eventID string
+	if c.eventStore != nil {
+		eventID = formatEventID(s.id, s.lastIdx+1)
+	}
+
+	// Write the event
+	s.lastIdx++
+	if _, err := writeEvent(s.w, Event{Name: "message", Data: data, ID: eventID}); err != nil {
+		return err
+	}
+
+	return nil
+}
+
 // A stream is a single logical stream of SSE events within a server session.
 // A stream begins with a client request, or with a client GET that has
 // no Last-Event-ID header.
@@ -889,7 +1075,8 @@ func (t *StreamableServerTransport) ServeHTTP(w http.ResponseWriter, req *http.R
 	}
 	switch req.Method {
 	case http.MethodGet:
-		t.connection.serveGET(w, req)
+		// FORK: distributed-sessions - pass OnSSEStart callback
+		t.connection.serveGET(w, req, t.OnSSEStart)
 	case http.MethodPost:
 		t.connection.servePOST(w, req)
 	default:
@@ -904,7 +1091,7 @@ func (t *StreamableServerTransport) ServeHTTP(w http.ResponseWriter, req *http.R
 // message parsed from the Last-Event-ID header.
 //
 // It returns an HTTP status code and error message.
-func (c *streamableServerConn) serveGET(w http.ResponseWriter, req *http.Request) {
+func (c *streamableServerConn) serveGET(w http.ResponseWriter, req *http.Request, onSSEStart func(ctx context.Context, writer func(data []byte) error, closeSSE func())) {
 	// streamID "" corresponds to the default GET request.
 	streamID := ""
 	// By default, we haven't seen a last index. Since indices start at 0, we represent
@@ -938,7 +1125,39 @@ func (c *streamableServerConn) serveGET(w http.ResponseWriter, req *http.Request
 		return
 	}
 	defer stream.release()
-	c.hangResponse(ctx, done)
+
+	// FORK: distributed-sessions - track SSE ownership for standalone stream
+	if streamID == "" {
+		c.mu.Lock()
+		c.sseOwned = true
+		c.mu.Unlock()
+		defer func() {
+			c.mu.Lock()
+			c.sseOwned = false
+			c.mu.Unlock()
+		}()
+	}
+
+	// FORK: distributed-sessions - notify that SSE stream is established
+	// We use a derived context so that the callback can close the SSE
+	// stream by calling closeSSE (which cancels the context).
+	sseCtx, sseCancel := context.WithCancel(ctx)
+	defer sseCancel()
+
+	if onSSEStart != nil && streamID == "" {
+		// Create a writer function that writes to the standalone SSE stream
+		writer := func(data []byte) error {
+			return c.writeToStandaloneSSE(data)
+		}
+		// closeSSE cancels the SSE context, causing hangResponse to return
+		closeSSE := func() {
+			sseCancel()
+		}
+		// Run callback in goroutine - it may block (e.g., subscription loop)
+		go onSSEStart(sseCtx, writer, closeSSE)
+	}
+
+	c.hangResponse(sseCtx, done)
 }
 
 // hangResponse blocks the HTTP response until one of three conditions is met:
@@ -1373,6 +1592,7 @@ func (c *streamableServerConn) Write(ctx context.Context, msg jsonrpc.Message) e
 
 	// Write the message to the stream.
 	var s *stream
+	var isStandaloneSSE bool
 	c.mu.Lock()
 	if relatedRequest.IsValid() {
 		if streamID, ok := c.requestStreams[relatedRequest]; ok {
@@ -1380,6 +1600,7 @@ func (c *streamableServerConn) Write(ctx context.Context, msg jsonrpc.Message) e
 		}
 	} else {
 		s = c.streams[""] // standalone SSE stream
+		isStandaloneSSE = true
 	}
 	if responseTo.IsValid() {
 		// Once we've responded to a request, disallow related messages by removing
@@ -1387,6 +1608,7 @@ func (c *streamableServerConn) Write(ctx context.Context, msg jsonrpc.Message) e
 		delete(c.requestStreams, responseTo)
 	}
 	sessionClosed := c.isDone
+	sseOwned := c.sseOwned // FORK: distributed-sessions - capture ownership state
 	c.mu.Unlock()
 
 	if s == nil {
@@ -1401,6 +1623,17 @@ func (c *streamableServerConn) Write(ctx context.Context, msg jsonrpc.Message) e
 		return errors.New("session is closed")
 	}
 
+	// FORK: distributed-sessions - route via backend if we don't own SSE stream.
+	// If no backend is configured, fall back to local delivery so event replay
+	// can still capture messages for a reconnecting client.
+	if isStandaloneSSE && !sseOwned && c.onPublish != nil {
+		if err := c.onPublish(ctx, c.sessionID, data); err != nil {
+			return fmt.Errorf("%w: publish failed: %v", jsonrpc2.ErrRejected, err)
+		}
+		// Message routed to SSE owner via backend; don't write locally
+		return nil
+	}
+
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
diff --git a/mcp/transport.go b/mcp/transport.go
index 23dccf8e..326b6e1b 100644
--- a/mcp/transport.go
+++ b/mcp/transport.go
@@ -86,7 +86,7 @@ type clientConnection interface {
 // TODO: should this interface be exported?
 type serverConnection interface {
 	Connection
-	sessionUpdated(ServerSessionState)
+	sessionUpdated(ctx context.Context, state ServerSessionState) // FORK: distributed-sessions - added ctx
 }
 
 // A StdioTransport is a [Transport] that communicates over stdin/stdout using
@@ -428,7 +428,7 @@ func newIOConn(rwc io.ReadWriteCloser) *ioConn {
 
 func (c *ioConn) SessionID() string { return "" }
 
-func (c *ioConn) sessionUpdated(state ServerSessionState) {
+func (c *ioConn) sessionUpdated(_ context.Context, state ServerSessionState) { // FORK: distributed-sessions - added ctx
 	protocolVersion := ""
 	if state.InitializeParams != nil {
 		protocolVersion = state.InitializeParams.ProtocolVersion
diff --git a/mcp/transport_test.go b/mcp/transport_test.go
index 515b8c19..c1ed66bc 100644
--- a/mcp/transport_test.go
+++ b/mcp/transport_test.go
@@ -108,7 +108,7 @@ func TestIOConnRead(t *testing.T) {
 			})
 			t.Cleanup(func() { tr.Close() })
 			if tt.protocolVersion != "" {
-				tr.sessionUpdated(ServerSessionState{
+				tr.sessionUpdated(context.Background(), ServerSessionState{
 					InitializeParams: &InitializeParams{
 						ProtocolVersion: tt.protocolVersion,
 					},