Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions src/lib/onboard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,8 @@ const { applyPreflightGatewayCleanup } =
require("./onboard/preflight-gateway-cleanup-decision") as typeof import("./onboard/preflight-gateway-cleanup-decision");
const { verifyGatewayContainerRunning } =
require("./onboard/gateway-container-running") as typeof import("./onboard/gateway-container-running");
const { applyHealthyPortReuse } =
require("./onboard/gateway-stale-port-reuse") as typeof import("./onboard/gateway-stale-port-reuse");
const { destroyGatewayWithVolumeCleanup } =
require("./onboard/gateway-destroy") as typeof import("./onboard/gateway-destroy");
const {
Expand Down Expand Up @@ -2235,12 +2237,9 @@ async function preflight(
port === GATEWAY_PORT ? dockerDriverGatewayEnv.getGatewayPortCheckOptions() : undefined;
let portCheck = await checkPortAvailable(port, portCheckOptions);
if (!portCheck.ok) {
if ((port === GATEWAY_PORT || port === DASHBOARD_PORT) && gatewayReuseState === "healthy") {
console.log(
` ✓ Port ${port} already owned by healthy ${cliDisplayName()} runtime (${label})`,
);
continue;
}
const reuse = await applyHealthyPortReuse({ port, gatewayPort: GATEWAY_PORT, dashboardPort: DASHBOARD_PORT, label, runtimeDisplayName: cliDisplayName(), gatewayName: GATEWAY_NAME, gatewayReuseState, portCheckOptions, supportsLifecycleCommands: gatewayCliSupportsLifecycleCommands(runCaptureOpenshell), destroyGateway, runOpenshell, checkPortAvailable, verifyGatewayContainerRunning });
if (reuse === "continue") continue;
if (reuse) { ({ gatewayReuseState, portCheck } = reuse); if (portCheck.ok) continue; }
if (port === GATEWAY_PORT) {
const dockerGatewayPid = getDockerDriverGatewayPortListenerPid(portCheck);
if (dockerGatewayPid !== null) {
Expand Down
234 changes: 234 additions & 0 deletions src/lib/onboard/gateway-stale-port-reuse.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

import { afterEach, describe, expect, it, vi } from "vitest";

import type { GatewayReuseState } from "../state/gateway";
import type { GatewayContainerState } from "./gateway-container-running";
import { applyHealthyPortReuse, classifyGatewayPortReuse } from "./gateway-stale-port-reuse";

const NON_HEALTHY_STATES: GatewayReuseState[] = [
"active-unnamed",
"foreign-active",
"stale",
"missing",
];

describe("classifyGatewayPortReuse", () => {
it("returns stale when recorded healthy but the legacy container is missing", () => {
expect(
classifyGatewayPortReuse({
gatewayReuseState: "healthy",
supportsLifecycleCommands: true,
containerState: "missing",
}),
).toBe("stale");
});

it("returns reuse when recorded healthy and the container is live", () => {
expect(
classifyGatewayPortReuse({
gatewayReuseState: "healthy",
supportsLifecycleCommands: true,
containerState: "running",
}),
).toBe("reuse");
});

it("returns reuse when recorded healthy and the container probe was inconclusive", () => {
expect(
classifyGatewayPortReuse({
gatewayReuseState: "healthy",
supportsLifecycleCommands: true,
containerState: "unknown",
}),
).toBe("reuse");
});

it("returns reuse when the CLI lacks lifecycle commands, regardless of container state", () => {
for (const containerState of ["missing", "running", "unknown"] as GatewayContainerState[]) {
expect(
classifyGatewayPortReuse({
gatewayReuseState: "healthy",
supportsLifecycleCommands: false,
containerState,
}),
).toBe("reuse");
}
});

it("returns skip for any non-healthy recorded state", () => {
for (const gatewayReuseState of NON_HEALTHY_STATES) {
expect(
classifyGatewayPortReuse({
gatewayReuseState,
supportsLifecycleCommands: true,
containerState: "missing",
}),
).toBe("skip");
}
});
});

const BASE_INPUT = {
port: 8080,
gatewayPort: 8080,
dashboardPort: 18789,
label: "OpenShell gateway",
runtimeDisplayName: "NemoClaw",
gatewayName: "nemoclaw",
gatewayReuseState: "healthy" as GatewayReuseState,
portCheckOptions: undefined,
supportsLifecycleCommands: true,
};

describe("applyHealthyPortReuse", () => {
afterEach(() => {
vi.restoreAllMocks();
});

it("returns null when recorded state is not healthy", async () => {
const result = await applyHealthyPortReuse({
...BASE_INPUT,
gatewayReuseState: "missing",
destroyGateway: () => true,
runOpenshell: vi.fn(),
checkPortAvailable: vi.fn(),
verifyGatewayContainerRunning: vi.fn(),
});
expect(result).toBeNull();
});

it("returns null for ports that are neither the gateway nor dashboard port", async () => {
const result = await applyHealthyPortReuse({
...BASE_INPUT,
port: 9999,
destroyGateway: () => true,
runOpenshell: vi.fn(),
checkPortAvailable: vi.fn(),
verifyGatewayContainerRunning: vi.fn(),
});
expect(result).toBeNull();
});

it("cleans up stale metadata and returns downgraded state when the port frees up", async () => {
const log = vi.spyOn(console, "log").mockImplementation(() => undefined);
const destroyGateway = vi.fn(() => true);
const runOpenshell = vi.fn();
const checkPortAvailable = vi.fn().mockResolvedValue({ ok: true });

const result = await applyHealthyPortReuse({
...BASE_INPUT,
destroyGateway,
runOpenshell,
checkPortAvailable,
verifyGatewayContainerRunning: () => "missing",
});

expect(result).not.toBe("continue");
expect(result).not.toBeNull();
if (result && result !== "continue") {
// Caller must see the downgraded reuse state even when the port frees up,
// so downstream branches don't keep treating the runtime as healthy.
expect(result.gatewayReuseState).toBe("missing");
expect(result.portCheck.ok).toBe(true);
}
expect(destroyGateway).toHaveBeenCalledTimes(1);
expect(runOpenshell).toHaveBeenCalledWith(["forward", "stop", "18789"], { ignoreError: true });
expect(checkPortAvailable).toHaveBeenCalledWith(8080, undefined);
const messages = log.mock.calls.map((c) => c[0]);
expect(messages).toContain(" Gateway metadata is stale (container not running). Cleaning up...");
expect(messages).toContain(" ✓ Port 8080 available (OpenShell gateway)");
});

it("returns fall-through state when cleanup succeeds but the port is still busy", async () => {
vi.spyOn(console, "log").mockImplementation(() => undefined);
const checkPortAvailable = vi.fn().mockResolvedValue({ ok: false });

const result = await applyHealthyPortReuse({
...BASE_INPUT,
destroyGateway: () => true,
runOpenshell: vi.fn(),
checkPortAvailable,
verifyGatewayContainerRunning: () => "missing",
});

expect(result).not.toBeNull();
expect(result).not.toBe("continue");
if (result && result !== "continue") {
expect(result.gatewayReuseState).toBe("missing");
expect(result.portCheck.ok).toBe(false);
}
});

it("logs the reuse message and continues when the container is live", async () => {
const log = vi.spyOn(console, "log").mockImplementation(() => undefined);
const destroyGateway = vi.fn(() => true);
const checkPortAvailable = vi.fn();

const result = await applyHealthyPortReuse({
...BASE_INPUT,
destroyGateway,
runOpenshell: vi.fn(),
checkPortAvailable,
verifyGatewayContainerRunning: () => "running",
});

expect(result).toBe("continue");
expect(destroyGateway).not.toHaveBeenCalled();
expect(checkPortAvailable).not.toHaveBeenCalled();
const messages = log.mock.calls.map((c) => c[0]);
expect(messages).toContain(
" ✓ Port 8080 already owned by healthy NemoClaw runtime (OpenShell gateway)",
);
});

it("skips the container probe when lifecycle commands are unsupported", async () => {
// Package-managed openshell builds intentionally have no openshell-cluster-*
// container. Probing Docker would always return "missing" and the decision
// would still come out as "reuse", so the probe is wasted work — worse, in
// environments where the docker CLI is flaky the probe can stall onboard.
const log = vi.spyOn(console, "log").mockImplementation(() => undefined);
const verifyContainer = vi.fn();

const result = await applyHealthyPortReuse({
...BASE_INPUT,
supportsLifecycleCommands: false,
destroyGateway: () => true,
runOpenshell: vi.fn(),
checkPortAvailable: vi.fn(),
verifyGatewayContainerRunning: verifyContainer,
});

expect(result).toBe("continue");
expect(verifyContainer).not.toHaveBeenCalled();
const messages = log.mock.calls.map((c) => c[0]);
expect(messages).toContain(
" ✓ Port 8080 already owned by healthy NemoClaw runtime (OpenShell gateway)",
);
});

it("reuses the dashboard port without consulting the container", async () => {
const log = vi.spyOn(console, "log").mockImplementation(() => undefined);
const checkPortAvailable = vi.fn();
const verifyContainer = vi.fn();

const result = await applyHealthyPortReuse({
...BASE_INPUT,
port: 18789,
label: "NemoClaw dashboard",
destroyGateway: () => true,
runOpenshell: vi.fn(),
checkPortAvailable,
verifyGatewayContainerRunning: verifyContainer,
});

expect(result).toBe("continue");
expect(verifyContainer).not.toHaveBeenCalled();
expect(checkPortAvailable).not.toHaveBeenCalled();
const messages = log.mock.calls.map((c) => c[0]);
expect(messages).toContain(
" ✓ Port 18789 already owned by healthy NemoClaw runtime (NemoClaw dashboard)",
);
});
});
107 changes: 107 additions & 0 deletions src/lib/onboard/gateway-stale-port-reuse.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

import type { GatewayReuseState } from "../state/gateway";
import { destroyGatewayForReuse } from "./gateway-cleanup";
import type { GatewayContainerState } from "./gateway-container-running";
import type { CheckPortOpts, PortProbeResult } from "./preflight";

export type GatewayPortReuseDecision = "stale" | "reuse" | "skip";

export interface GatewayPortReuseInput {
gatewayReuseState: GatewayReuseState;
supportsLifecycleCommands: boolean;
containerState: GatewayContainerState;
}

// When the preflight port-required check finds the gateway port occupied, the
// short-circuit in onboard.ts trusts `gatewayReuseState === "healthy"` and
// reuses the recorded runtime. That short-circuit can fire even after the
// openshell-cluster-* container has been removed out-of-band (Colima/Docker
// restart on macOS; manual `docker rm`), because the upstream stale-metadata
// check that would have cleared the recorded state is gated by
// `gatewayCliSupportsLifecycleCommands` and can be skipped. (#4268)
//
// This decision wraps the second-line defense: when the port is the gateway
// port and recorded state is "healthy", consult the live container state
// before reusing. Returns:
// - "stale" — recorded state is "healthy" but the legacy container is
// missing; caller should clear registry and re-check the port.
// - "reuse" — recorded state is "healthy" and the container is live (or the
// CLI lacks lifecycle commands, so the container model is
// package-managed and Docker is not the source of truth).
// - "skip" — recorded state is not "healthy"; this defensive path does not
// apply and the caller should fall through to other handling.
export function classifyGatewayPortReuse(
input: GatewayPortReuseInput,
): GatewayPortReuseDecision {
if (input.gatewayReuseState !== "healthy") return "skip";
if (!input.supportsLifecycleCommands) return "reuse";
if (input.containerState === "missing") return "stale";
return "reuse";
}

export interface HealthyPortReuseInput {
port: number;
gatewayPort: number;
dashboardPort: number;
label: string;
runtimeDisplayName: string;
gatewayName: string;
gatewayReuseState: GatewayReuseState;
portCheckOptions: CheckPortOpts | undefined;
supportsLifecycleCommands: boolean;
destroyGateway: () => boolean;
runOpenshell: (args: string[], opts: { ignoreError: true }) => unknown;
checkPortAvailable: (port?: number, opts?: CheckPortOpts) => Promise<PortProbeResult>;
verifyGatewayContainerRunning: (gatewayName: string) => GatewayContainerState;
}

export type HealthyPortReuseOutcome =
| "continue"
| { gatewayReuseState: GatewayReuseState; portCheck: PortProbeResult };

// Drive the port-required short-circuit when `gatewayReuseState === "healthy"`.
// Owns the stale-container detection, registry cleanup, port re-check, and
// console messaging. Returns `null` when the recorded state isn't reusable
// (caller should fall through to its usual port-conflict handling); returns
// "continue" when the loop iteration should be skipped; otherwise returns the
// updated `gatewayReuseState` plus the re-checked port result so the caller
// can replay the iteration with fresh state.
export async function applyHealthyPortReuse(
input: HealthyPortReuseInput,
): Promise<HealthyPortReuseOutcome | null> {
const { port, gatewayPort, dashboardPort, label, runtimeDisplayName, gatewayName } = input;
if (input.gatewayReuseState !== "healthy") return null;
if (port !== gatewayPort && port !== dashboardPort) return null;
// Only probe the container when lifecycle commands are advertised — for
// package-managed gateways without lifecycle commands the openshell-cluster-*
// container intentionally doesn't exist and the probe would always report
// "missing", which is then ignored by classifyGatewayPortReuse anyway.
if (port === gatewayPort && input.supportsLifecycleCommands) {
const decision = classifyGatewayPortReuse({
gatewayReuseState: input.gatewayReuseState,
supportsLifecycleCommands: true,
containerState: input.verifyGatewayContainerRunning(gatewayName),
});
Comment thread
coderabbitai[bot] marked this conversation as resolved.
if (decision === "stale") {
console.log(" Gateway metadata is stale (container not running). Cleaning up...");
input.runOpenshell(["forward", "stop", String(dashboardPort)], { ignoreError: true });
const gatewayReuseState = destroyGatewayForReuse(
input.destroyGateway,
" ✓ Stale gateway metadata cleaned up",
" ! Stale gateway metadata cleanup failed; leaving registry state intact.",
);
const portCheck = await input.checkPortAvailable(port, input.portCheckOptions);
if (portCheck.ok) {
console.log(` ✓ Port ${port} available (${label})`);
}
// Always return the downgraded state so the caller stops treating the
// runtime as healthy. The caller uses `portCheck.ok` to decide whether
// to continue the loop or fall through to its port-conflict diagnostic.
return { gatewayReuseState, portCheck };
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
}
console.log(` ✓ Port ${port} already owned by healthy ${runtimeDisplayName} runtime (${label})`);
return "continue";
}