diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index bff72aafa4..eb9fb7c58c 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -430,6 +430,8 @@ const { applyPreflightGatewayCleanup } = require("./onboard/preflight-gateway-cleanup-decision") as typeof import("./onboard/preflight-gateway-cleanup-decision"); const { verifyGatewayContainerRunning } = require("./onboard/gateway-container-running") as typeof import("./onboard/gateway-container-running"); +const { applyHealthyPortReuse } = + require("./onboard/gateway-stale-port-reuse") as typeof import("./onboard/gateway-stale-port-reuse"); const { destroyGatewayWithVolumeCleanup } = require("./onboard/gateway-destroy") as typeof import("./onboard/gateway-destroy"); const { @@ -2235,12 +2237,9 @@ async function preflight( port === GATEWAY_PORT ? dockerDriverGatewayEnv.getGatewayPortCheckOptions() : undefined; let portCheck = await checkPortAvailable(port, portCheckOptions); if (!portCheck.ok) { - if ((port === GATEWAY_PORT || port === DASHBOARD_PORT) && gatewayReuseState === "healthy") { - console.log( - ` ✓ Port ${port} already owned by healthy ${cliDisplayName()} runtime (${label})`, - ); - continue; - } + const reuse = await applyHealthyPortReuse({ port, gatewayPort: GATEWAY_PORT, dashboardPort: DASHBOARD_PORT, label, runtimeDisplayName: cliDisplayName(), gatewayName: GATEWAY_NAME, gatewayReuseState, portCheckOptions, supportsLifecycleCommands: gatewayCliSupportsLifecycleCommands(runCaptureOpenshell), destroyGateway, runOpenshell, checkPortAvailable, verifyGatewayContainerRunning }); + if (reuse === "continue") continue; + if (reuse) { ({ gatewayReuseState, portCheck } = reuse); if (portCheck.ok) continue; } if (port === GATEWAY_PORT) { const dockerGatewayPid = getDockerDriverGatewayPortListenerPid(portCheck); if (dockerGatewayPid !== null) { diff --git a/src/lib/onboard/gateway-stale-port-reuse.test.ts b/src/lib/onboard/gateway-stale-port-reuse.test.ts new file mode 100644 index 0000000000..5469488b43 --- /dev/null +++ b/src/lib/onboard/gateway-stale-port-reuse.test.ts @@ -0,0 +1,234 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { afterEach, describe, expect, it, vi } from "vitest"; + +import type { GatewayReuseState } from "../state/gateway"; +import type { GatewayContainerState } from "./gateway-container-running"; +import { applyHealthyPortReuse, classifyGatewayPortReuse } from "./gateway-stale-port-reuse"; + +const NON_HEALTHY_STATES: GatewayReuseState[] = [ + "active-unnamed", + "foreign-active", + "stale", + "missing", +]; + +describe("classifyGatewayPortReuse", () => { + it("returns stale when recorded healthy but the legacy container is missing", () => { + expect( + classifyGatewayPortReuse({ + gatewayReuseState: "healthy", + supportsLifecycleCommands: true, + containerState: "missing", + }), + ).toBe("stale"); + }); + + it("returns reuse when recorded healthy and the container is live", () => { + expect( + classifyGatewayPortReuse({ + gatewayReuseState: "healthy", + supportsLifecycleCommands: true, + containerState: "running", + }), + ).toBe("reuse"); + }); + + it("returns reuse when recorded healthy and the container probe was inconclusive", () => { + expect( + classifyGatewayPortReuse({ + gatewayReuseState: "healthy", + supportsLifecycleCommands: true, + containerState: "unknown", + }), + ).toBe("reuse"); + }); + + it("returns reuse when the CLI lacks lifecycle commands, regardless of container state", () => { + for (const containerState of ["missing", "running", "unknown"] as GatewayContainerState[]) { + expect( + classifyGatewayPortReuse({ + gatewayReuseState: "healthy", + supportsLifecycleCommands: false, + containerState, + }), + ).toBe("reuse"); + } + }); + + it("returns skip for any non-healthy recorded state", () => { + for (const gatewayReuseState of NON_HEALTHY_STATES) { + expect( + classifyGatewayPortReuse({ + gatewayReuseState, + supportsLifecycleCommands: true, + containerState: "missing", + }), + ).toBe("skip"); + } + }); +}); + +const BASE_INPUT = { + port: 8080, + gatewayPort: 8080, + dashboardPort: 18789, + label: "OpenShell gateway", + runtimeDisplayName: "NemoClaw", + gatewayName: "nemoclaw", + gatewayReuseState: "healthy" as GatewayReuseState, + portCheckOptions: undefined, + supportsLifecycleCommands: true, +}; + +describe("applyHealthyPortReuse", () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + it("returns null when recorded state is not healthy", async () => { + const result = await applyHealthyPortReuse({ + ...BASE_INPUT, + gatewayReuseState: "missing", + destroyGateway: () => true, + runOpenshell: vi.fn(), + checkPortAvailable: vi.fn(), + verifyGatewayContainerRunning: vi.fn(), + }); + expect(result).toBeNull(); + }); + + it("returns null for ports that are neither the gateway nor dashboard port", async () => { + const result = await applyHealthyPortReuse({ + ...BASE_INPUT, + port: 9999, + destroyGateway: () => true, + runOpenshell: vi.fn(), + checkPortAvailable: vi.fn(), + verifyGatewayContainerRunning: vi.fn(), + }); + expect(result).toBeNull(); + }); + + it("cleans up stale metadata and returns downgraded state when the port frees up", async () => { + const log = vi.spyOn(console, "log").mockImplementation(() => undefined); + const destroyGateway = vi.fn(() => true); + const runOpenshell = vi.fn(); + const checkPortAvailable = vi.fn().mockResolvedValue({ ok: true }); + + const result = await applyHealthyPortReuse({ + ...BASE_INPUT, + destroyGateway, + runOpenshell, + checkPortAvailable, + verifyGatewayContainerRunning: () => "missing", + }); + + expect(result).not.toBe("continue"); + expect(result).not.toBeNull(); + if (result && result !== "continue") { + // Caller must see the downgraded reuse state even when the port frees up, + // so downstream branches don't keep treating the runtime as healthy. + expect(result.gatewayReuseState).toBe("missing"); + expect(result.portCheck.ok).toBe(true); + } + expect(destroyGateway).toHaveBeenCalledTimes(1); + expect(runOpenshell).toHaveBeenCalledWith(["forward", "stop", "18789"], { ignoreError: true }); + expect(checkPortAvailable).toHaveBeenCalledWith(8080, undefined); + const messages = log.mock.calls.map((c) => c[0]); + expect(messages).toContain(" Gateway metadata is stale (container not running). Cleaning up..."); + expect(messages).toContain(" ✓ Port 8080 available (OpenShell gateway)"); + }); + + it("returns fall-through state when cleanup succeeds but the port is still busy", async () => { + vi.spyOn(console, "log").mockImplementation(() => undefined); + const checkPortAvailable = vi.fn().mockResolvedValue({ ok: false }); + + const result = await applyHealthyPortReuse({ + ...BASE_INPUT, + destroyGateway: () => true, + runOpenshell: vi.fn(), + checkPortAvailable, + verifyGatewayContainerRunning: () => "missing", + }); + + expect(result).not.toBeNull(); + expect(result).not.toBe("continue"); + if (result && result !== "continue") { + expect(result.gatewayReuseState).toBe("missing"); + expect(result.portCheck.ok).toBe(false); + } + }); + + it("logs the reuse message and continues when the container is live", async () => { + const log = vi.spyOn(console, "log").mockImplementation(() => undefined); + const destroyGateway = vi.fn(() => true); + const checkPortAvailable = vi.fn(); + + const result = await applyHealthyPortReuse({ + ...BASE_INPUT, + destroyGateway, + runOpenshell: vi.fn(), + checkPortAvailable, + verifyGatewayContainerRunning: () => "running", + }); + + expect(result).toBe("continue"); + expect(destroyGateway).not.toHaveBeenCalled(); + expect(checkPortAvailable).not.toHaveBeenCalled(); + const messages = log.mock.calls.map((c) => c[0]); + expect(messages).toContain( + " ✓ Port 8080 already owned by healthy NemoClaw runtime (OpenShell gateway)", + ); + }); + + it("skips the container probe when lifecycle commands are unsupported", async () => { + // Package-managed openshell builds intentionally have no openshell-cluster-* + // container. Probing Docker would always return "missing" and the decision + // would still come out as "reuse", so the probe is wasted work — worse, in + // environments where the docker CLI is flaky the probe can stall onboard. + const log = vi.spyOn(console, "log").mockImplementation(() => undefined); + const verifyContainer = vi.fn(); + + const result = await applyHealthyPortReuse({ + ...BASE_INPUT, + supportsLifecycleCommands: false, + destroyGateway: () => true, + runOpenshell: vi.fn(), + checkPortAvailable: vi.fn(), + verifyGatewayContainerRunning: verifyContainer, + }); + + expect(result).toBe("continue"); + expect(verifyContainer).not.toHaveBeenCalled(); + const messages = log.mock.calls.map((c) => c[0]); + expect(messages).toContain( + " ✓ Port 8080 already owned by healthy NemoClaw runtime (OpenShell gateway)", + ); + }); + + it("reuses the dashboard port without consulting the container", async () => { + const log = vi.spyOn(console, "log").mockImplementation(() => undefined); + const checkPortAvailable = vi.fn(); + const verifyContainer = vi.fn(); + + const result = await applyHealthyPortReuse({ + ...BASE_INPUT, + port: 18789, + label: "NemoClaw dashboard", + destroyGateway: () => true, + runOpenshell: vi.fn(), + checkPortAvailable, + verifyGatewayContainerRunning: verifyContainer, + }); + + expect(result).toBe("continue"); + expect(verifyContainer).not.toHaveBeenCalled(); + expect(checkPortAvailable).not.toHaveBeenCalled(); + const messages = log.mock.calls.map((c) => c[0]); + expect(messages).toContain( + " ✓ Port 18789 already owned by healthy NemoClaw runtime (NemoClaw dashboard)", + ); + }); +}); diff --git a/src/lib/onboard/gateway-stale-port-reuse.ts b/src/lib/onboard/gateway-stale-port-reuse.ts new file mode 100644 index 0000000000..4a5cc239d7 --- /dev/null +++ b/src/lib/onboard/gateway-stale-port-reuse.ts @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { GatewayReuseState } from "../state/gateway"; +import { destroyGatewayForReuse } from "./gateway-cleanup"; +import type { GatewayContainerState } from "./gateway-container-running"; +import type { CheckPortOpts, PortProbeResult } from "./preflight"; + +export type GatewayPortReuseDecision = "stale" | "reuse" | "skip"; + +export interface GatewayPortReuseInput { + gatewayReuseState: GatewayReuseState; + supportsLifecycleCommands: boolean; + containerState: GatewayContainerState; +} + +// When the preflight port-required check finds the gateway port occupied, the +// short-circuit in onboard.ts trusts `gatewayReuseState === "healthy"` and +// reuses the recorded runtime. That short-circuit can fire even after the +// openshell-cluster-* container has been removed out-of-band (Colima/Docker +// restart on macOS; manual `docker rm`), because the upstream stale-metadata +// check that would have cleared the recorded state is gated by +// `gatewayCliSupportsLifecycleCommands` and can be skipped. (#4268) +// +// This decision wraps the second-line defense: when the port is the gateway +// port and recorded state is "healthy", consult the live container state +// before reusing. Returns: +// - "stale" — recorded state is "healthy" but the legacy container is +// missing; caller should clear registry and re-check the port. +// - "reuse" — recorded state is "healthy" and the container is live (or the +// CLI lacks lifecycle commands, so the container model is +// package-managed and Docker is not the source of truth). +// - "skip" — recorded state is not "healthy"; this defensive path does not +// apply and the caller should fall through to other handling. +export function classifyGatewayPortReuse( + input: GatewayPortReuseInput, +): GatewayPortReuseDecision { + if (input.gatewayReuseState !== "healthy") return "skip"; + if (!input.supportsLifecycleCommands) return "reuse"; + if (input.containerState === "missing") return "stale"; + return "reuse"; +} + +export interface HealthyPortReuseInput { + port: number; + gatewayPort: number; + dashboardPort: number; + label: string; + runtimeDisplayName: string; + gatewayName: string; + gatewayReuseState: GatewayReuseState; + portCheckOptions: CheckPortOpts | undefined; + supportsLifecycleCommands: boolean; + destroyGateway: () => boolean; + runOpenshell: (args: string[], opts: { ignoreError: true }) => unknown; + checkPortAvailable: (port?: number, opts?: CheckPortOpts) => Promise; + verifyGatewayContainerRunning: (gatewayName: string) => GatewayContainerState; +} + +export type HealthyPortReuseOutcome = + | "continue" + | { gatewayReuseState: GatewayReuseState; portCheck: PortProbeResult }; + +// Drive the port-required short-circuit when `gatewayReuseState === "healthy"`. +// Owns the stale-container detection, registry cleanup, port re-check, and +// console messaging. Returns `null` when the recorded state isn't reusable +// (caller should fall through to its usual port-conflict handling); returns +// "continue" when the loop iteration should be skipped; otherwise returns the +// updated `gatewayReuseState` plus the re-checked port result so the caller +// can replay the iteration with fresh state. +export async function applyHealthyPortReuse( + input: HealthyPortReuseInput, +): Promise { + const { port, gatewayPort, dashboardPort, label, runtimeDisplayName, gatewayName } = input; + if (input.gatewayReuseState !== "healthy") return null; + if (port !== gatewayPort && port !== dashboardPort) return null; + // Only probe the container when lifecycle commands are advertised — for + // package-managed gateways without lifecycle commands the openshell-cluster-* + // container intentionally doesn't exist and the probe would always report + // "missing", which is then ignored by classifyGatewayPortReuse anyway. + if (port === gatewayPort && input.supportsLifecycleCommands) { + const decision = classifyGatewayPortReuse({ + gatewayReuseState: input.gatewayReuseState, + supportsLifecycleCommands: true, + containerState: input.verifyGatewayContainerRunning(gatewayName), + }); + if (decision === "stale") { + console.log(" Gateway metadata is stale (container not running). Cleaning up..."); + input.runOpenshell(["forward", "stop", String(dashboardPort)], { ignoreError: true }); + const gatewayReuseState = destroyGatewayForReuse( + input.destroyGateway, + " ✓ Stale gateway metadata cleaned up", + " ! Stale gateway metadata cleanup failed; leaving registry state intact.", + ); + const portCheck = await input.checkPortAvailable(port, input.portCheckOptions); + if (portCheck.ok) { + console.log(` ✓ Port ${port} available (${label})`); + } + // Always return the downgraded state so the caller stops treating the + // runtime as healthy. The caller uses `portCheck.ok` to decide whether + // to continue the loop or fall through to its port-conflict diagnostic. + return { gatewayReuseState, portCheck }; + } + } + console.log(` ✓ Port ${port} already owned by healthy ${runtimeDisplayName} runtime (${label})`); + return "continue"; +}