Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
9879675
feat(health): detect token revocation via authenticated health checks
polaz Apr 8, 2026
3ddfc41
fix(health): use neutral 401 message; test no-token skip path
polaz Apr 8, 2026
56cd228
test(health): extract helpers to reduce duplication in revocation tests
polaz Apr 8, 2026
4cf2486
test(health): reduce token revocation test duplication via it.each an…
polaz Apr 8, 2026
bb895f2
fix(health): route 403 from authenticated probe to failed state; add …
polaz Apr 8, 2026
e9a0ab8
chore(sonar): exclude test files from copy-paste detection
polaz Apr 8, 2026
d7d618f
refactor(health): extract shared healthCheckOnError constant to elimi…
polaz Apr 8, 2026
e7fed88
refactor(health): replace startsWith auth checks with parseGitLabApiE…
polaz Apr 8, 2026
b3f57c6
fix(health): suppress unreachable istanbul branches in auth probe guards
polaz Apr 8, 2026
f311ab5
fix(health): revalidate token on forceReconnect fast-path; add regres…
polaz Apr 8, 2026
2a1bf83
fix(health): use token-only probe to prevent session cookie masking
polaz Apr 8, 2026
60d9e40
refactor(test): move token revocation helpers to outer describe scope
polaz Apr 8, 2026
a2a31c3
fix(ci): bash syntax error in release summary when changelog has mark…
polaz Apr 8, 2026
1e3fded
test(health): exercise performConnect fast-path in token-still-revoke…
polaz Apr 8, 2026
8fa0311
test(health): assert token-only probe contract in revocation tests
polaz Apr 9, 2026
a9d4bb6
test(health): move stubUserEndpointStatus to module scope
polaz Apr 9, 2026
fa28b1c
docs(health): add missing JSDoc to HealthMonitor class and getInstanc…
polaz Apr 9, 2026
328f95b
fix(health): narrow error swallow in authenticatedTokenCheck to abort…
polaz Apr 9, 2026
119cc0a
test(health): derive ok flag from status code in stubUserEndpointStatus
polaz Apr 9, 2026
e2204b7
fix(health): throw on non-2xx responses in authenticatedTokenCheck
polaz Apr 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/release-please.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
release-version: ${{ steps.release.outputs.version }}
release-tag: ${{ steps.release.outputs.tag_name }}
release-sha: ${{ steps.release.outputs.sha }}
pr-number: ${{ steps.release.outputs.pr }}
pr-number: ${{ steps.release.outputs.pr != '' && fromJSON(steps.release.outputs.pr).number || '' }}

steps:
- name: Generate release token
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ The server handles GitLab connectivity issues gracefully:
- **Bounded startup** — Server starts within `GITLAB_INIT_TIMEOUT_MS` (default 5s) regardless of GitLab availability
- **Disconnected mode** — When GitLab is unreachable (`disconnected`/`failed` state), only the `manage_context` tool is exposed, with local actions such as `whoami`, `switch_profile`, and `set_scope` for diagnostics. During active reconnect (`connecting` state), the full tool list remains available so MCP clients don't lose their tool catalog during brief outages. MCP clients are notified of tool availability changes via `tools/list_changed`
- **Auto-reconnect** — Exponential backoff reconnection (5s → 60s) with ±10% jitter
- **Error classification** — Transient errors (network, 5xx, timeouts) trigger auto-reconnect. Auth/config errors at startup transition to `failed` state (no auto-reconnect). Runtime auth errors from tool calls are forwarded to `HealthMonitor.reportError()` via `classifyError()`; the remaining gap is token-revocation/403 detection (#370)
- **Error classification** — Transient errors (network, 5xx, timeouts) trigger auto-reconnect. Auth/config errors at startup transition to `failed` state (no auto-reconnect). Mid-session token revocation is detected via an authenticated `HEAD /api/v4/user` check that runs alongside each periodic health check (static token mode only; skipped in OAuth mode). A 401 or 403 on this check transitions the instance to `failed` state immediately.
- **Instance health monitor** — Each monitored instance URL has its own XState state machine. Untracked OAuth URLs currently pass through as reachable.

| Variable | Default | Description |
Expand All @@ -100,6 +100,8 @@ The server handles GitLab connectivity issues gracefully:
| `GITLAB_FAILURE_THRESHOLD` | `3` | Consecutive transient failures before disconnecting |
| `GITLAB_TOOL_TIMEOUT_MS` | `120000` | Max time for tool/bootstrap execution before timeout |
| `GITLAB_RESPONSE_WRITE_TIMEOUT_MS` | `10000` | Max time to flush a non-SSE response before destroying zombie connection (`0` to disable; SSE uses heartbeat) |
| `GITLAB_INSTANCE_CACHE_MAX` | `100` | Max number of per-URL instance states kept in memory (OAuth multi-tenant; LRU eviction when exceeded) |
| `GITLAB_INSTANCE_TTL_MS` | `3600000` | TTL for idle per-URL instance states in ms; evicted on next insert (OAuth multi-tenant) |

## Feature Flags

Expand Down
2 changes: 1 addition & 1 deletion README.md.in
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ The server handles GitLab connectivity issues gracefully:
- **Bounded startup** — Server starts within `GITLAB_INIT_TIMEOUT_MS` (default 5s) regardless of GitLab availability
- **Disconnected mode** — When GitLab is unreachable (`disconnected`/`failed` state), only the `manage_context` tool is exposed, with local actions such as `whoami`, `switch_profile`, and `set_scope` for diagnostics. During active reconnect (`connecting` state), the full tool list remains available so MCP clients don't lose their tool catalog during brief outages. MCP clients are notified of tool availability changes via `tools/list_changed`
- **Auto-reconnect** — Exponential backoff reconnection (5s → 60s) with ±10% jitter
- **Error classification** — Transient errors (network, 5xx, timeouts) trigger auto-reconnect. Auth/config errors at startup transition to `failed` state (no auto-reconnect). Runtime auth errors from tool calls are forwarded to `HealthMonitor.reportError()` via `classifyError()`; the remaining gap is token-revocation/403 detection (#370)
- **Error classification** — Transient errors (network, 5xx, timeouts) trigger auto-reconnect. Auth/config errors at startup transition to `failed` state (no auto-reconnect). Mid-session token revocation is detected via an authenticated `HEAD /api/v4/user` check that runs alongside each periodic health check (static token mode only; skipped in OAuth mode). A 401 or 403 on this check transitions the instance to `failed` state immediately.
- **Instance health monitor** — Each monitored instance URL has its own XState state machine. Untracked OAuth URLs currently pass through as reachable.

| Variable | Default | Description |
Expand Down
7 changes: 7 additions & 0 deletions sonar-project.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SonarCloud configuration
# https://sonarcloud.io/documentation/project-administration/narrowing-the-focus/

# Exclude test files from Copy-Paste Detection (CPD).
# Test files naturally repeat assertion patterns (expect, mock setup, await) across
# test cases — this is intentional test structure, not accidental code duplication.
sonar.cpd.exclusions=tests/**
132 changes: 123 additions & 9 deletions src/services/HealthMonitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import {
import { ConnectionManager } from './ConnectionManager';
import { normalizeInstanceUrl } from '../utils/url';
import { InstanceRegistry } from './InstanceRegistry';
import { classifyError, type ErrorCategory } from '../utils/error-handler';
import { classifyError, parseGitLabApiError, type ErrorCategory } from '../utils/error-handler';
import { enhancedFetch } from '../utils/fetch';
import { logInfo, logWarn, logError, logDebug } from '../logger';
import {
Expand All @@ -35,7 +35,9 @@ import {
HEALTH_CHECK_INTERVAL_MS,
FAILURE_THRESHOLD,
GITLAB_BASE_URL,
GITLAB_TOKEN,
} from '../config';
import { isOAuthEnabled } from '../oauth/index';

// ============================================================================
// Types
Expand Down Expand Up @@ -150,6 +152,10 @@ const performConnect = fromPromise<{ degraded: boolean }, { instanceUrl: string
// classifyError maps this to 'transient' → disconnected → auto-reconnect.
throw new Error(`Health check failed for ${input.instanceUrl}`);
}
// Re-validate the token on reconnect, not just during steady-state polls.
// Without this, forceReconnect() while the token is still revoked would
// bounce failed → healthy until the next health-check interval.
await authenticatedTokenCheck(input.instanceUrl, HEALTH_CHECK_PROBE_MS);
return { degraded: isDegradedInstance(connectionManager, input.instanceUrl) };
}

Expand Down Expand Up @@ -224,6 +230,13 @@ const performHealthCheck = fromPromise<{ degraded: boolean }, { instanceUrl: str
throw new Error(`Health check failed for ${input.instanceUrl}`);
}

// Detect mid-session token revocation in static token mode.
// Throws GitLab API 401/403 when the token is invalid or lacks required scope.
// healthCheckErrorIsAuth guard detects these by parsing the error message
// and routes to '#connection.failed' (no auto-reconnect).
// No-op in OAuth mode (no global token) and when GITLAB_TOKEN is unset.
await authenticatedTokenCheck(input.instanceUrl, HEALTH_CHECK_PROBE_MS);
Comment thread
coderabbitai[bot] marked this conversation as resolved.

Comment thread
coderabbitai[bot] marked this conversation as resolved.
const connectionManager = ConnectionManager.getInstance();
return { degraded: isDegradedInstance(connectionManager, input.instanceUrl) };
},
Expand Down Expand Up @@ -269,10 +282,99 @@ async function quickHealthCheck(
}
}

/**
 * Probe `HEAD /api/v4/user` using only the statically configured token to find
 * out whether that token is still accepted by the instance.
 *
 * The probe is a no-op when OAuth is enabled or when no static token is
 * configured, because there is no global credential to validate in either case.
 *
 * Outcomes:
 * - 401 / 403 response: an `Error` whose message starts with
 *   `GitLab API error: <status>` is thrown so callers can treat it as an
 *   authentication failure.
 * - Our own abort (timeout) or any error that `classifyError` reports as
 *   `'transient'`: swallowed, the function resolves normally.
 * - Anything else (including non-`Error` throwables): logged via `logError`
 *   and re-thrown unchanged.
 *
 * @param instanceUrl Base URL of the GitLab instance; `/api/v4/user` is appended as-is.
 * @param timeoutMs   Milliseconds after which the request is aborted.
 * @throws Error for 401/403 responses and for unexpected, non-transient failures.
 */
async function authenticatedTokenCheck(instanceUrl: string, timeoutMs: number): Promise<void> {
  // Nothing to validate: OAuth tokens live in per-request context, and without
  // a static token there is no credential to probe with.
  if (isOAuthEnabled() || !GITLAB_TOKEN) {
    return;
  }

  const abortController = new AbortController();
  const abortTimer = setTimeout(() => abortController.abort(), timeoutMs);

  try {
    const probeUrl = `${instanceUrl}/api/v4/user`;
    const response = await enhancedFetch(probeUrl, {
      method: 'HEAD',
      signal: abortController.signal,
      retry: false,
      rateLimit: false,
      // Suppress auto-injected credentials and send the static token explicitly,
      // so that only the token itself is being validated by this request.
      skipAuth: true,
      headers: { 'PRIVATE-TOKEN': GITLAB_TOKEN },
    });

    const status = response.status;
    const tokenRejected = status === 401 || status === 403;
    if (tokenRejected) {
      // Carry the real status code in the message so logs and the
      // message-parsing guard both see which of the two it was.
      throw new Error(`GitLab API error: ${status} - token invalid or lacks required scope`);
    }

    if (!response.ok) {
      // Any other non-2xx answer is raised too; the catch block below decides
      // whether it is transient (swallowed) or unexpected (logged and re-thrown).
      throw new Error(`GitLab API error: ${status} - authenticated health probe failed`);
    }
  } catch (error) {
    const knownError = error instanceof Error ? error : undefined;

    if (knownError !== undefined) {
      // Authentication failures always propagate to the caller.
      const parsedStatus = parseGitLabApiError(knownError.message)?.status;
      if (parsedStatus === 401 || parsedStatus === 403) {
        throw error;
      }

      // Timeouts from our own AbortController and transient failures are ignored.
      const abortedByTimeout = knownError.name === 'AbortError';
      if (abortedByTimeout || classifyError(knownError) === 'transient') {
        return;
      }
    }

    // Everything else is unexpected: record it and let it surface.
    logError('Unexpected error during authenticated token health check', {
      err: knownError ?? new Error(String(error)),
    });
    throw error;
  } finally {
    clearTimeout(abortTimer);
  }
}

// ============================================================================
// XState Machine Definition
// ============================================================================

/**
 * Ordered `onError` transitions shared by the health-check substates.
 *
 * 1. When the `healthCheckErrorIsAuth` guard matches, go to `#connection.failed`.
 * 2. Otherwise fall back to `idle`.
 *
 * Both branches run the `recordFailure` action. The single outer `as const`
 * keeps every string as a literal type and the whole structure readonly.
 */
const healthCheckOnError = [
  { guard: 'healthCheckErrorIsAuth', target: '#connection.failed', actions: 'recordFailure' },
  { target: 'idle', actions: 'recordFailure' },
] as const;

const connectionMachine = setup({
types: {
context: {} as MachineContext,
Expand All @@ -296,6 +398,18 @@ const connectionMachine = setup({
const error = (event as { error?: unknown }).error;
return classifyError(error) === 'transient';
},
// Guard: true when the error carried by the event is an `Error` whose message
// parses (via parseGitLabApiError) to HTTP status 401 or 403.
// The status is read from the message rather than from classifyError because
// classifyError maps 403 to 'permanent', not 'auth', so it cannot cover the 403 path.
healthCheckErrorIsAuth: ({ event }) => {
  const { error } = event as { error?: unknown };
  /* istanbul ignore if */
  if (!(error instanceof Error)) return false;
  const status = parseGitLabApiError(error.message)?.status;
  return status === 401 || status === 403;
},
},
actions: {
recordSuccess: assign({
Expand Down Expand Up @@ -416,10 +530,7 @@ const connectionMachine = setup({
actions: 'recordSuccess',
},
],
onError: {
target: 'idle',
actions: 'recordFailure',
},
onError: healthCheckOnError,
},
},
},
Expand Down Expand Up @@ -469,10 +580,7 @@ const connectionMachine = setup({
actions: 'recordSuccess',
},
],
onError: {
target: 'idle',
actions: 'recordFailure',
},
onError: healthCheckOnError,
},
},
},
Expand Down Expand Up @@ -518,6 +626,10 @@ type StateChangeCallback = (
to: ConnectionState,
) => void;

/**
* Singleton service that manages per-instance GitLab connection health using XState state machines.
* Tracks connectivity state, drives automatic reconnection, and notifies listeners of state changes.
*/
export class HealthMonitor {
private static instance: HealthMonitor | null = null;
private readonly actors = new Map<string, ConnectionActor>();
Expand All @@ -527,6 +639,7 @@ export class HealthMonitor {

/** Private so that instances cannot be created with `new` from outside the class. */
private constructor() {}

/** Return the singleton instance, creating it on first call. */
public static getInstance(): HealthMonitor {
HealthMonitor.instance ??= new HealthMonitor();
return HealthMonitor.instance;
Expand Down Expand Up @@ -680,6 +793,7 @@ export class HealthMonitor {
return topLevel as ConnectionState;
}

/**
 * Resolve the connection state an actor is currently in.
 *
 * @param actor Actor whose latest snapshot is inspected.
 * @returns The state produced by `extractState` for that snapshot.
 */
private getActorState(actor: ConnectionActor): ConnectionState {
  const snapshot = actor.getSnapshot();
  return this.extractState(snapshot);
}
Expand Down
Loading
Loading