diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml index b7ea4ed1..1a8d3d50 100644 --- a/.github/workflows/release-please.yml +++ b/.github/workflows/release-please.yml @@ -34,7 +34,7 @@ jobs: release-version: ${{ steps.release.outputs.version }} release-tag: ${{ steps.release.outputs.tag_name }} release-sha: ${{ steps.release.outputs.sha }} - pr-number: ${{ steps.release.outputs.pr }} + pr-number: ${{ steps.release.outputs.pr != '' && fromJSON(steps.release.outputs.pr).number || '' }} steps: - name: Generate release token diff --git a/README.md b/README.md index d8cf33cf..14390d36 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ The server handles GitLab connectivity issues gracefully: - **Bounded startup** — Server starts within `GITLAB_INIT_TIMEOUT_MS` (default 5s) regardless of GitLab availability - **Disconnected mode** — When GitLab is unreachable (`disconnected`/`failed` state), only the `manage_context` tool is exposed, with local actions such as `whoami`, `switch_profile`, and `set_scope` for diagnostics. During active reconnect (`connecting` state), the full tool list remains available so MCP clients don't lose their tool catalog during brief outages. MCP clients are notified of tool availability changes via `tools/list_changed` - **Auto-reconnect** — Exponential backoff reconnection (5s → 60s) with ±10% jitter -- **Error classification** — Transient errors (network, 5xx, timeouts) trigger auto-reconnect. Auth/config errors at startup transition to `failed` state (no auto-reconnect). Runtime auth errors from tool calls are forwarded to `HealthMonitor.reportError()` via `classifyError()`; the remaining gap is token-revocation/403 detection (#370) +- **Error classification** — Transient errors (network, 5xx, timeouts) trigger auto-reconnect. Auth/config errors at startup transition to `failed` state (no auto-reconnect). Mid-session token revocation is detected via an authenticated `HEAD /api/v4/user` check that runs alongside each periodic health check (static token mode only; skipped in OAuth mode). A 401 or 403 on this check transitions the instance to `failed` state immediately. - **Instance health monitor** — Each monitored instance URL has its own XState state machine. Untracked OAuth URLs currently pass through as reachable. | Variable | Default | Description | @@ -100,6 +100,8 @@ The server handles GitLab connectivity issues gracefully: | `GITLAB_FAILURE_THRESHOLD` | `3` | Consecutive transient failures before disconnecting | | `GITLAB_TOOL_TIMEOUT_MS` | `120000` | Max time for tool/bootstrap execution before timeout | | `GITLAB_RESPONSE_WRITE_TIMEOUT_MS` | `10000` | Max time to flush a non-SSE response before destroying zombie connection (`0` to disable; SSE uses heartbeat) | +| `GITLAB_INSTANCE_CACHE_MAX` | `100` | Max number of per-URL instance states kept in memory (OAuth multi-tenant; LRU eviction when exceeded) | +| `GITLAB_INSTANCE_TTL_MS` | `3600000` | TTL for idle per-URL instance states in ms; evicted on next insert (OAuth multi-tenant) | ## Feature Flags diff --git a/README.md.in b/README.md.in index 6b144495..5b8ee1bf 100644 --- a/README.md.in +++ b/README.md.in @@ -88,7 +88,7 @@ The server handles GitLab connectivity issues gracefully: - **Bounded startup** — Server starts within `GITLAB_INIT_TIMEOUT_MS` (default 5s) regardless of GitLab availability - **Disconnected mode** — When GitLab is unreachable (`disconnected`/`failed` state), only the `manage_context` tool is exposed, with local actions such as `whoami`, `switch_profile`, and `set_scope` for diagnostics. During active reconnect (`connecting` state), the full tool list remains available so MCP clients don't lose their tool catalog during brief outages. MCP clients are notified of tool availability changes via `tools/list_changed` - **Auto-reconnect** — Exponential backoff reconnection (5s → 60s) with ±10% jitter -- **Error classification** — Transient errors (network, 5xx, timeouts) trigger auto-reconnect. Auth/config errors at startup transition to `failed` state (no auto-reconnect). Runtime auth errors from tool calls are forwarded to `HealthMonitor.reportError()` via `classifyError()`; the remaining gap is token-revocation/403 detection (#370) +- **Error classification** — Transient errors (network, 5xx, timeouts) trigger auto-reconnect. Auth/config errors at startup transition to `failed` state (no auto-reconnect). Mid-session token revocation is detected via an authenticated `HEAD /api/v4/user` check that runs alongside each periodic health check (static token mode only; skipped in OAuth mode). A 401 or 403 on this check transitions the instance to `failed` state immediately. - **Instance health monitor** — Each monitored instance URL has its own XState state machine. Untracked OAuth URLs currently pass through as reachable. | Variable | Default | Description | diff --git a/sonar-project.properties b/sonar-project.properties new file mode 100644 index 00000000..f0d7b632 --- /dev/null +++ b/sonar-project.properties @@ -0,0 +1,7 @@ +# SonarCloud configuration +# https://sonarcloud.io/documentation/project-administration/narrowing-the-focus/ + +# Exclude test files from Copy-Paste Detection (CPD). +# Test files naturally repeat assertion patterns (expect, mock setup, await) across +# test cases — this is intentional test structure, not accidental code duplication. +sonar.cpd.exclusions=tests/** diff --git a/src/services/HealthMonitor.ts b/src/services/HealthMonitor.ts index 30c4a4a9..6974dcf6 100644 --- a/src/services/HealthMonitor.ts +++ b/src/services/HealthMonitor.ts @@ -25,7 +25,7 @@ import { import { ConnectionManager } from './ConnectionManager'; import { normalizeInstanceUrl } from '../utils/url'; import { InstanceRegistry } from './InstanceRegistry'; -import { classifyError, type ErrorCategory } from '../utils/error-handler'; +import { classifyError, parseGitLabApiError, type ErrorCategory } from '../utils/error-handler'; import { enhancedFetch } from '../utils/fetch'; import { logInfo, logWarn, logError, logDebug } from '../logger'; import { @@ -35,7 +35,9 @@ import { HEALTH_CHECK_INTERVAL_MS, FAILURE_THRESHOLD, GITLAB_BASE_URL, + GITLAB_TOKEN, } from '../config'; +import { isOAuthEnabled } from '../oauth/index'; // ============================================================================ // Types @@ -150,6 +152,10 @@ const performConnect = fromPromise<{ degraded: boolean }, { instanceUrl: string // classifyError maps this to 'transient' → disconnected → auto-reconnect. throw new Error(`Health check failed for ${input.instanceUrl}`); } + // Re-validate the token on reconnect, not just during steady-state polls. + // Without this, forceReconnect() while the token is still revoked would + // bounce failed → healthy until the next health-check interval. + await authenticatedTokenCheck(input.instanceUrl, HEALTH_CHECK_PROBE_MS); return { degraded: isDegradedInstance(connectionManager, input.instanceUrl) }; } @@ -224,6 +230,13 @@ const performHealthCheck = fromPromise<{ degraded: boolean }, { instanceUrl: str throw new Error(`Health check failed for ${input.instanceUrl}`); } + // Detect mid-session token revocation in static token mode. + // Throws GitLab API 401/403 when the token is invalid or lacks required scope. + // healthCheckErrorIsAuth guard detects these by parsing the error message + // and routes to '#connection.failed' (no auto-reconnect). + // No-op in OAuth mode (no global token) and when GITLAB_TOKEN is unset. + await authenticatedTokenCheck(input.instanceUrl, HEALTH_CHECK_PROBE_MS); + const connectionManager = ConnectionManager.getInstance(); return { degraded: isDegradedInstance(connectionManager, input.instanceUrl) }; }, @@ -269,10 +282,99 @@ async function quickHealthCheck( } } +/** + * Authenticated token validity check: HEAD /api/v4/user with the static token. + * Detects mid-session token revocation that the unauthenticated reachability check + * cannot see (401 from /api/v4/version is treated as "server alive"). + * + * Only runs in static token mode — OAuth tokens are per-request context and are + * not available during background health checks. + * + * Throws a GitLab API 401 or 403 error when the token is invalid, revoked, + * expired, or lacks the required scope. The healthCheckErrorIsAuth guard detects + * these by parsing the status code and transitions to 'failed' (no auto-reconnect). + * + * AbortError (our own timeout) and transient connectivity failures are swallowed: + * reachability was already confirmed by quickHealthCheck. Unexpected errors are + * logged and re-thrown so programming bugs don't silently leave the instance healthy. + */ +async function authenticatedTokenCheck(instanceUrl: string, timeoutMs: number): Promise { + // OAuth mode: token is per-request context, unavailable during background checks + if (isOAuthEnabled()) return; + // No static token configured — nothing to validate + if (!GITLAB_TOKEN) return; + + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), timeoutMs); + + try { + const response = await enhancedFetch(`${instanceUrl}/api/v4/user`, { + method: 'HEAD', + signal: controller.signal, + retry: false, + rateLimit: false, + // skipAuth suppresses auto-injected credentials (session cookies, getAuthHeaders()). + // The explicit PRIVATE-TOKEN header ensures we validate ONLY the static token — + // a valid session cookie must not mask a revoked token and keep the probe alive. + skipAuth: true, + headers: { 'PRIVATE-TOKEN': GITLAB_TOKEN }, + }); + + if (response.status === 401 || response.status === 403) { + // Both 401 (invalid/revoked token) and 403 (insufficient scope) mean the configured + // token cannot authenticate — include the actual status for accurate log messages. + throw new Error( + `GitLab API error: ${response.status} - token invalid or lacks required scope`, + ); + } + if (!response.ok) { + // Non-auth, non-2xx response (e.g. 429 rate-limit, 5xx server error) — throw so + // the catch block can classify it as transient and swallow appropriately, rather + // than letting the probe silently succeed with a broken status code. + throw new Error(`GitLab API error: ${response.status} - authenticated health probe failed`); + } + } catch (error) { + // Re-throw auth errors from the token probe (401 = invalid, 403 = insufficient scope). + if (error instanceof Error) { + const parsed = parseGitLabApiError(error.message); + if (parsed?.status === 401 || parsed?.status === 403) throw error; + + // Swallow our own AbortController timeout and transient connectivity failures. + // Reachability was already confirmed by quickHealthCheck; failures on this + // second request are noise, not signal. + if (error.name === 'AbortError' || classifyError(error) === 'transient') return; + } + + // Unexpected error (programming bug, invalid URL, etc.) — log and rethrow so it + // doesn't silently leave the instance healthy with a broken probe. + logError('Unexpected error during authenticated token health check', { + err: error instanceof Error ? error : new Error(String(error)), + }); + throw error; + } finally { + clearTimeout(timeoutId); + } +} + // ============================================================================ // XState Machine Definition // ============================================================================ +// Shared onError handler for health-check substates (healthy.checking, degraded.checking). +// Auth errors (401/403 from the authenticated probe) → failed, no auto-reconnect. +// All other errors → idle via recordFailure (transient failures accumulate toward threshold). +const healthCheckOnError = [ + { + guard: 'healthCheckErrorIsAuth' as const, + target: '#connection.failed' as const, + actions: 'recordFailure' as const, + }, + { + target: 'idle' as const, + actions: 'recordFailure' as const, + }, +] as const; + const connectionMachine = setup({ types: { context: {} as MachineContext, @@ -296,6 +398,18 @@ const connectionMachine = setup({ const error = (event as { error?: unknown }).error; return classifyError(error) === 'transient'; }, + // Auth error during periodic health check → failed (no auto-reconnect). + // Uses parseGitLabApiError to extract the status code: both 401 (invalid token) + // and 403 (insufficient scope) from the authenticated probe are terminal failures. + // Direct message parsing is used because classifyError maps 403 → 'permanent', + // not 'auth', so we can't rely on classifyError for the 403 path. + healthCheckErrorIsAuth: ({ event }) => { + const error = (event as { error?: unknown }).error; + /* istanbul ignore if */ + if (!(error instanceof Error)) return false; + const parsed = parseGitLabApiError(error.message); + return parsed?.status === 401 || parsed?.status === 403; + }, }, actions: { recordSuccess: assign({ @@ -416,10 +530,7 @@ const connectionMachine = setup({ actions: 'recordSuccess', }, ], - onError: { - target: 'idle', - actions: 'recordFailure', - }, + onError: healthCheckOnError, }, }, }, @@ -469,10 +580,7 @@ const connectionMachine = setup({ actions: 'recordSuccess', }, ], - onError: { - target: 'idle', - actions: 'recordFailure', - }, + onError: healthCheckOnError, }, }, }, @@ -518,6 +626,10 @@ type StateChangeCallback = ( to: ConnectionState, ) => void; +/** + * Singleton service that manages per-instance GitLab connection health using XState state machines. + * Tracks connectivity state, drives automatic reconnection, and notifies listeners of state changes. + */ export class HealthMonitor { private static instance: HealthMonitor | null = null; private readonly actors = new Map(); @@ -527,6 +639,7 @@ export class HealthMonitor { private constructor() {} + /** Return the singleton instance, creating it on first call. */ public static getInstance(): HealthMonitor { HealthMonitor.instance ??= new HealthMonitor(); return HealthMonitor.instance; @@ -680,6 +793,7 @@ export class HealthMonitor { return topLevel as ConnectionState; } + /** Return the current top-level state for an actor. */ private getActorState(actor: ConnectionActor): ConnectionState { return this.extractState(actor.getSnapshot()); } diff --git a/tests/unit/services/HealthMonitor.test.ts b/tests/unit/services/HealthMonitor.test.ts index 2dfc897d..04047ae8 100644 --- a/tests/unit/services/HealthMonitor.test.ts +++ b/tests/unit/services/HealthMonitor.test.ts @@ -68,7 +68,11 @@ jest.mock('../../../src/utils/fetch', () => ({ enhancedFetch: (...args: unknown[]) => mockFetch(...args), })); -// Mock config with shorter timeouts for testing +// GITLAB_TOKEN mock value — controlled per-test to cover the !GITLAB_TOKEN early-return branch +let mockGitLabToken: string | undefined = 'test-token'; + +// Mock config with shorter timeouts for testing. +// GITLAB_TOKEN uses a getter so individual tests can clear it (mockGitLabToken = undefined). jest.mock('../../../src/config', () => ({ INIT_TIMEOUT_MS: 200, RECONNECT_BASE_DELAY_MS: 100, @@ -76,8 +80,26 @@ jest.mock('../../../src/config', () => ({ HEALTH_CHECK_INTERVAL_MS: 300, // Short for testing health check substates FAILURE_THRESHOLD: 3, GITLAB_BASE_URL: 'https://gitlab.example.com', + get GITLAB_TOKEN() { + return mockGitLabToken; + }, +})); + +// Mock isOAuthEnabled — controls authenticated health check in static vs OAuth mode +const mockIsOAuthEnabled = jest.fn(); +jest.mock('../../../src/oauth/index', () => ({ + isOAuthEnabled: () => mockIsOAuthEnabled(), })); +/** Return the given status for /api/v4/user; 200 for all other URLs. */ +function stubUserEndpointStatus(status: number): void { + mockFetch.mockImplementation((url: string) => { + if (url.includes('/api/v4/user')) + return Promise.resolve({ status, ok: status >= 200 && status < 300 }); + return Promise.resolve({ status: 200, ok: true }); + }); +} + describe('HealthMonitor', () => { beforeEach(() => { jest.useRealTimers(); @@ -90,6 +112,9 @@ describe('HealthMonitor', () => { mockGetCurrentInstanceUrl.mockReset(); mockRegistryIsInitialized.mockReset(); mockGetIntrospection.mockReset(); + // Default: static token mode (authenticated health check enabled) + mockIsOAuthEnabled.mockReturnValue(false); + mockGitLabToken = 'test-token'; HealthMonitor.resetInstance(); mockInitialize.mockResolvedValue(undefined); @@ -821,6 +846,167 @@ describe('HealthMonitor', () => { }); }); + /** + * Init monitor in healthy state (beforeEach already provides 200 fetch + v17 instance info). + * Used by token revocation tests that need to start from a known-healthy baseline. + */ + async function initHealthy(): Promise> { + const monitor = await initMonitor(); + expect(monitor.getState(TEST_URL)).toBe('healthy'); + return monitor; + } + + describe('token revocation detection', () => { + // Buffer accounts for HEALTH_CHECK_INTERVAL_MS (300ms test config) + processing + const HEALTH_CYCLE_MS = 600; + + // Alias used by several tests below + const stubUserEndpoint401 = (): void => stubUserEndpointStatus(401); + + it.each([401, 403])( + 'should transition to failed when authenticated health check returns %i', + async (authStatus) => { + // Regression test for #370: token auth failure mid-session must move to failed state. + // Previously the health monitor stayed healthy because the unauthenticated + // /api/v4/version check treats 401/403 as "server alive" (status < 500). + const monitor = await initHealthy(); + stubUserEndpointStatus(authStatus); + await new Promise((r) => setTimeout(r, HEALTH_CYCLE_MS)); + // Verify token-only probe contract: must use skipAuth + explicit PRIVATE-TOKEN header + // so ambient session cookies cannot mask a revoked static token. + expect(mockFetch).toHaveBeenCalledWith( + `${TEST_URL}/api/v4/user`, + expect.objectContaining({ + method: 'HEAD', + skipAuth: true, + headers: expect.objectContaining({ 'PRIVATE-TOKEN': 'test-token' }), + }), + ); + expect(monitor.getState(TEST_URL)).toBe('failed'); + expect(monitor.isInstanceReachable(TEST_URL)).toBe(false); + }, + ); + + it('should transition from degraded to failed on token revocation', async () => { + // Regression test for #370 in degraded path: same auth check runs from degraded state. + mockGetInstanceInfo.mockReturnValue({ version: 'unknown', tier: 'free' }); // degraded + const monitor = await initMonitor(); + expect(monitor.getState(TEST_URL)).toBe('degraded'); + + stubUserEndpoint401(); + // degradedCheckInterval = min(HEALTH_CHECK_INTERVAL_MS, 30000) = 300ms + await new Promise((r) => setTimeout(r, HEALTH_CYCLE_MS)); + // Verify token-only probe contract is enforced in the degraded path too + expect(mockFetch).toHaveBeenCalledWith( + `${TEST_URL}/api/v4/user`, + expect.objectContaining({ + method: 'HEAD', + skipAuth: true, + headers: expect.objectContaining({ 'PRIVATE-TOKEN': 'test-token' }), + }), + ); + expect(monitor.getState(TEST_URL)).toBe('failed'); + }); + + it.each([ + [ + 'no GITLAB_TOKEN', + (): void => { + mockGitLabToken = undefined; + }, + ], + [ + 'OAuth mode', + (): void => { + mockIsOAuthEnabled.mockReturnValue(true); + }, + ], + ])('should skip authenticated check in %s', async (_label, setup) => { + // Guard: /api/v4/user must never be called — if it is, the 401 would flip state to failed. + stubUserEndpoint401(); + setup(); + const monitor = await initMonitor(); + expect(monitor.getState(TEST_URL)).toBe('healthy'); + await new Promise((r) => setTimeout(r, HEALTH_CYCLE_MS)); + expect(monitor.getState(TEST_URL)).toBe('healthy'); + // Negative assertion: probe URL must not have been fetched in skip-path scenarios + expect( + mockFetch.mock.calls.some( + ([url]) => typeof url === 'string' && url.includes('/api/v4/user'), + ), + ).toBe(false); + }); + + it('should swallow network errors during authenticated check', async () => { + // Connectivity errors on the second request are noise, not signal: + // the unauthenticated check already verified reachability. + const monitor = await initHealthy(); + mockFetch.mockImplementation((url: string) => { + if (url.includes('/api/v4/user')) return Promise.reject(new Error('network error')); + return Promise.resolve({ status: 200, ok: true }); + }); + await new Promise((r) => setTimeout(r, HEALTH_CYCLE_MS)); + expect(monitor.getState(TEST_URL)).toBe('healthy'); + }); + + it.each([429, 500, 503])( + 'should stay healthy when authenticated probe returns transient %i (not an auth error)', + async (transientStatus) => { + // Regression test for #370: non-auth, non-2xx responses from /api/v4/user + // must be classified as transient and swallowed — the instance should remain + // healthy because reachability was already confirmed by quickHealthCheck. + const monitor = await initHealthy(); + stubUserEndpointStatus(transientStatus); + await new Promise((r) => setTimeout(r, HEALTH_CYCLE_MS)); + expect(monitor.getState(TEST_URL)).toBe('healthy'); + }, + ); + + it('should recover after forceReconnect when authenticated checks succeed again', async () => { + // After detecting token revocation (→ failed), a manual forceReconnect should + // recover to healthy once the authenticated check starts succeeding again. + const monitor = await initHealthy(); + stubUserEndpoint401(); + await new Promise((r) => setTimeout(r, HEALTH_CYCLE_MS)); + expect(monitor.getState(TEST_URL)).toBe('failed'); + + mockFetch.mockResolvedValue({ status: 200, ok: true }); + monitor.forceReconnect(TEST_URL); + await new Promise((r) => setTimeout(r, 400)); + expect(monitor.getState(TEST_URL)).toBe('healthy'); + }); + + it('should remain in failed state on forceReconnect when token is still revoked', async () => { + // Regression: forceReconnect uses the performConnect fast-path which must + // also call authenticatedTokenCheck. Without it, a still-revoked token could + // bounce failed → healthy until the next health-check interval. + const monitor = await initHealthy(); + stubUserEndpoint401(); + await new Promise((r) => setTimeout(r, HEALTH_CYCLE_MS)); + expect(monitor.getState(TEST_URL)).toBe('failed'); + + // Exercise the fast-path: isConnected() = true skips full re-initialization + // and goes directly to quickHealthCheck + authenticatedTokenCheck. + // The 401 from the token probe must route to failed (not healthy) immediately. + const initCallsBefore = mockInitialize.mock.calls.length; + mockIsConnected.mockReturnValue(true); + monitor.forceReconnect(TEST_URL); + await new Promise((r) => setTimeout(r, 400)); + expect(monitor.getState(TEST_URL)).toBe('failed'); + // Confirm fast-path was taken: initialize() must not have been called again + expect(mockInitialize.mock.calls.length).toBe(initCallsBefore); + // Verify token-only probe was invoked in the fast-path + expect(mockFetch).toHaveBeenCalledWith( + `${TEST_URL}/api/v4/user`, + expect.objectContaining({ + method: 'HEAD', + skipAuth: true, + headers: expect.objectContaining({ 'PRIVATE-TOKEN': 'test-token' }), + }), + ); + }); + }); + describe('periodic health checks', () => { it('should run health check and stay healthy when GitLab is reachable', async () => { mockInitialize.mockResolvedValue(undefined);