Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ Run `./awmg --help` for full CLI options. Key flags:
- Enables per-server DIFC guard assignment independent of `guard-policies`
- Example: `guard = "github"` (uses the guard named `github` from `[guards.github]`)

- **`connect_timeout`** (optional, HTTP servers only): Per-transport connection timeout in seconds for connecting to HTTP backends. The gateway tries streamable HTTP, then SSE, then plain JSON-RPC over HTTP POST in sequence; this timeout applies to each attempt. Default: `30`.
- **`connect_timeout`** (optional, HTTP servers only): Per-transport connection timeout in seconds for connecting to HTTP backends. The gateway tries streamable HTTP, then SSE, then plain JSON-RPC over HTTP POST in sequence; this timeout applies to each attempt. It does **not** set the end-to-end `tools/call` execution timeout. Default: `30`.

- **`rate_limit_threshold`** (optional, TOML/JSON file configs only): Number of consecutive rate-limit errors from this backend that will trip the circuit breaker (transition CLOSED → OPEN). When OPEN, requests are immediately rejected until the breaker is eligible to transition to HALF-OPEN again; this is normally controlled by `rate_limit_cooldown`, but if the gateway knows an upstream rate-limit reset time (for example from response headers or parsed tool error text), that reset time takes precedence. **Not available in JSON stdin format.** Default: `3`.

Expand Down Expand Up @@ -372,7 +372,7 @@ The `customSchemas` top-level field allows you to define custom server types bey
| `apiKey` | API key for authentication | (disabled) |
| `domain` | Gateway domain (`"localhost"`, `"host.docker.internal"`, or `"${VAR}"`) | (unset) |
| `startupTimeout` | Seconds to wait for backend startup | `30` |
| `toolTimeout` | Seconds to wait for tool execution | `60` |
| `toolTimeout` | Maximum seconds for a single tool call, enforced as a context deadline on all backend requests (stdio and HTTP) | `60` |
| `payloadDir` | Directory for large payload files | `/tmp/jq-payloads` |
| `payloadSizeThreshold` (JSON) / `payload_size_threshold` (TOML) | Size threshold in bytes; responses larger than this are stored to disk and returned as a `payloadPath` reference | `524288` (512 KB) |
| `trustedBots` (JSON) / `trusted_bots` (TOML) | Optional list of additional bot usernames to trust with "approved" integrity level. Additive to the built-in trusted bot list. When specified, must be a non-empty array with non-empty string entries (spec §4.1.3.4); omit the field entirely if not needed. Example: `["my-bot[bot]", "org-automation"]` | (disabled) |
Expand Down
10 changes: 4 additions & 6 deletions internal/mcp/connection.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,13 +204,11 @@ func NewHTTPConnection(ctx context.Context, serverID, url string, headers map[st
logger.LogInfo("backend", "Creating HTTP MCP connection with transport fallback, url=%s, connectTimeout=%v", url, connectTimeout)
ctx, cancel := context.WithCancel(ctx)

// Create an HTTP client with appropriate timeouts.
// Keep the existing overall request timeout, but also apply connectTimeout to
// the underlying HTTP transport so plain JSON-RPC fallback attempts honor the
// configured per-attempt connection timeout instead of waiting for the full
// client timeout.
// Create an HTTP client with connect/setup timeouts.
// Do not set http.Client.Timeout: request execution should be controlled by
// per-request context deadlines (for example the gateway tool timeout budget),
// while connectTimeout continues to bound transport establishment/fallback.
httpClient := &http.Client{
Timeout: 120 * time.Second, // Overall request timeout
Transport: &http.Transport{
DialContext: (&net.Dialer{
Timeout: connectTimeout,
Expand Down
13 changes: 7 additions & 6 deletions internal/mcp/http_connection_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -483,13 +483,13 @@ func TestNewHTTPConnection_NilHeaders(t *testing.T) {
assert.Equal(t, testServer.URL, conn.GetHTTPURL())
}

// TestNewHTTPConnection_HTTPClientTimeout tests that HTTP client timeout is properly configured
func TestNewHTTPConnection_HTTPClientTimeout(t *testing.T) {
// TestNewHTTPConnection_HTTPClientTimeoutUnset tests that overall client timeout is unset
// so tool execution budgets are controlled by request context deadlines.
func TestNewHTTPConnection_HTTPClientTimeoutUnset(t *testing.T) {
require := require.New(t)

// Create test server with delayed response (but not too long to hit default 120s timeout)
// Create test server with delayed response.
testServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Small delay to verify timeout handling works
time.Sleep(50 * time.Millisecond)

response := map[string]interface{}{
Expand All @@ -516,8 +516,9 @@ func TestNewHTTPConnection_HTTPClientTimeout(t *testing.T) {
require.NotNil(conn)
defer conn.Close()

// Verify HTTP client has proper timeout set
assert.Equal(t, 120*time.Second, conn.httpClient.Timeout, "HTTP client should have 120s timeout")
// Verify overall HTTP client timeout is unset. End-to-end request budgets
// should come from context deadlines, not a hardcoded client timeout.
assert.Zero(t, conn.httpClient.Timeout)
Comment on lines +519 to +521
Copy link

Copilot AI Apr 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test asserts the client-level timeout is zero, but it doesn't verify the replacement behavior: that request context deadlines actually cap long-running backend calls. Adding a test that issues a request with a short context timeout and confirms it aborts (for both plain JSON-RPC and an SDK transport, if feasible) would guard against regressions now that there is no transport-wide timeout.

Copilot uses AI. Check for mistakes.
}

// TestNewHTTPConnection_ConnectionRefused tests handling of connection refused errors
Expand Down
70 changes: 70 additions & 0 deletions internal/server/call_backend_tool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/http/httptest"
"strings"
"testing"
"time"

"github.com/github/gh-aw-mcpg/internal/config"
sdk "github.com/modelcontextprotocol/go-sdk/mcp"
Expand Down Expand Up @@ -484,3 +485,72 @@ func TestCallBackendTool_AllowedToolsError_MessageFormat(t *testing.T) {
assert.True(t, strings.Contains(text, `"blocked"`), "error message should include tool name: %s", text)
assert.True(t, strings.Contains(text, "allowed-tools"), "error message should mention allowed-tools: %s", text)
}

// TestCallBackendTool_ToolTimeoutEnforcedViaContext verifies that the configured
// toolTimeout is applied as a context deadline, causing slow backend calls to fail
// with a deadline-exceeded error instead of hanging indefinitely.
func TestCallBackendTool_ToolTimeoutEnforcedViaContext(t *testing.T) {
require := require.New(t)

// Create a slow backend that delays longer than our tool timeout
backend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
var req map[string]interface{}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
w.WriteHeader(http.StatusBadRequest)
return
}

method, _ := req["method"].(string)
switch method {
case "initialize":
json.NewEncoder(w).Encode(map[string]interface{}{
"jsonrpc": "2.0", "id": req["id"],
"result": map[string]interface{}{
"protocolVersion": "2024-11-05",
"capabilities": map[string]interface{}{},
"serverInfo": map[string]interface{}{"name": "slow-backend", "version": "1.0.0"},
},
})
case "tools/list":
json.NewEncoder(w).Encode(map[string]interface{}{
"jsonrpc": "2.0", "id": req["id"],
"result": map[string]interface{}{"tools": []map[string]interface{}{}},
})
case "tools/call":
// Simulate a slow tool: sleep longer than the configured toolTimeout.
// The actual tool call should return after ~1s (timeout), but the
// httptest.Server cleanup waits for this goroutine to finish.
time.Sleep(3 * time.Second)
json.NewEncoder(w).Encode(map[string]interface{}{
"jsonrpc": "2.0", "id": req["id"],
"result": map[string]interface{}{
"content": []map[string]interface{}{{"type": "text", "text": "should not reach here"}},
},
})
}
}))
defer backend.Close()

// Configure with a very short toolTimeout (1 second)
cfg := &config.Config{
Gateway: &config.GatewayConfig{
ToolTimeout: 1,
},
Servers: map[string]*config.ServerConfig{
"slow": {Type: "http", URL: backend.URL},
},
}

us, err := NewUnified(context.Background(), cfg)
require.NoError(err)
defer us.Close()

ctx := context.WithValue(context.Background(), SessionIDContextKey, "timeout-test")
result, _, callErr := us.callBackendTool(ctx, "slow", "slow_tool", map[string]interface{}{})

// The call should fail due to context deadline exceeded
require.Error(callErr, "Tool call should fail due to timeout")
require.NotNil(result, "Should return a CallToolResult even on timeout")
require.True(result.IsError, "Result should be marked as error")
t.Logf("Tool call correctly timed out: %v", callErr)
}
10 changes: 10 additions & 0 deletions internal/server/unified.go
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,16 @@ func (us *UnifiedServer) callBackendTool(ctx context.Context, serverID, toolName
// The closure captures the request and validates before calling this method
logUnified.Printf("callBackendTool: serverID=%s, toolName=%s, args=%+v", serverID, toolName, args)

// Apply the configured tool timeout as a context deadline so backend calls
// (including HTTP backends) are bounded by toolTimeout rather than hanging
// indefinitely. This is the primary enforcement point for the gateway's
// tool execution budget.
if us.cfg != nil && us.cfg.Gateway != nil && us.cfg.Gateway.ToolTimeout > 0 {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, time.Duration(us.cfg.Gateway.ToolTimeout)*time.Second)
defer cancel()
}

// Start an OTEL span for the full tool call lifecycle (spans all phases 0–6)
// Attribute names follow MCP Gateway Specification §4.1.3.6
ctx, toolSpan := us.getTracer().Start(ctx, "mcp.tool_call",
Expand Down
6 changes: 3 additions & 3 deletions test/integration/http_error_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,10 @@ func TestHTTPError_ClientError(t *testing.T) {
}

// TestHTTPError_ConnectionTimeout tests that connection timeouts are properly handled
// This test is SKIPPED by default because it takes a long time (the HTTP client has a 120s timeout).
// This test is SKIPPED by default because it takes a long time.
// The timeout behavior is tested implicitly in other tests that use context timeouts.
func TestHTTPError_ConnectionTimeout(t *testing.T) {
t.Skip("Skipping timeout test - takes too long due to HTTP client 120s timeout. Other tests verify context-based timeouts work correctly.")
t.Skip("Skipping timeout test - takes too long. Other tests verify context-based timeouts work correctly.")

if testing.Short() {
t.Skip("Skipping integration test in short mode")
Expand All @@ -95,7 +95,7 @@ func TestHTTPError_ConnectionTimeout(t *testing.T) {
if f, ok := w.(http.Flusher); ok {
f.Flush()
}
// Sleep to cause a read timeout (HTTP client has 120s timeout)
// Sleep to cause a read timeout
// But we'll use a shorter context timeout to cut it off sooner
time.Sleep(150 * time.Second)
}))
Expand Down
Loading