-
Notifications
You must be signed in to change notification settings - Fork 2.8k
fix(onboard): verify gateway container before reusing healthy state on port conflict #4306
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
cv
merged 1 commit into
NVIDIA:main
from
Dongni-Yang:fix/4268-stale-gateway-not-cleaned
May 27, 2026
+346
−6
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,234 @@ | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| import { afterEach, describe, expect, it, vi } from "vitest"; | ||
|
|
||
| import type { GatewayReuseState } from "../state/gateway"; | ||
| import type { GatewayContainerState } from "./gateway-container-running"; | ||
| import { applyHealthyPortReuse, classifyGatewayPortReuse } from "./gateway-stale-port-reuse"; | ||
|
|
||
| const NON_HEALTHY_STATES: GatewayReuseState[] = [ | ||
| "active-unnamed", | ||
| "foreign-active", | ||
| "stale", | ||
| "missing", | ||
| ]; | ||
|
|
||
| describe("classifyGatewayPortReuse", () => { | ||
| it("returns stale when recorded healthy but the legacy container is missing", () => { | ||
| expect( | ||
| classifyGatewayPortReuse({ | ||
| gatewayReuseState: "healthy", | ||
| supportsLifecycleCommands: true, | ||
| containerState: "missing", | ||
| }), | ||
| ).toBe("stale"); | ||
| }); | ||
|
|
||
| it("returns reuse when recorded healthy and the container is live", () => { | ||
| expect( | ||
| classifyGatewayPortReuse({ | ||
| gatewayReuseState: "healthy", | ||
| supportsLifecycleCommands: true, | ||
| containerState: "running", | ||
| }), | ||
| ).toBe("reuse"); | ||
| }); | ||
|
|
||
| it("returns reuse when recorded healthy and the container probe was inconclusive", () => { | ||
| expect( | ||
| classifyGatewayPortReuse({ | ||
| gatewayReuseState: "healthy", | ||
| supportsLifecycleCommands: true, | ||
| containerState: "unknown", | ||
| }), | ||
| ).toBe("reuse"); | ||
| }); | ||
|
|
||
| it("returns reuse when the CLI lacks lifecycle commands, regardless of container state", () => { | ||
| for (const containerState of ["missing", "running", "unknown"] as GatewayContainerState[]) { | ||
| expect( | ||
| classifyGatewayPortReuse({ | ||
| gatewayReuseState: "healthy", | ||
| supportsLifecycleCommands: false, | ||
| containerState, | ||
| }), | ||
| ).toBe("reuse"); | ||
| } | ||
| }); | ||
|
|
||
| it("returns skip for any non-healthy recorded state", () => { | ||
| for (const gatewayReuseState of NON_HEALTHY_STATES) { | ||
| expect( | ||
| classifyGatewayPortReuse({ | ||
| gatewayReuseState, | ||
| supportsLifecycleCommands: true, | ||
| containerState: "missing", | ||
| }), | ||
| ).toBe("skip"); | ||
| } | ||
| }); | ||
| }); | ||
|
|
||
| const BASE_INPUT = { | ||
| port: 8080, | ||
| gatewayPort: 8080, | ||
| dashboardPort: 18789, | ||
| label: "OpenShell gateway", | ||
| runtimeDisplayName: "NemoClaw", | ||
| gatewayName: "nemoclaw", | ||
| gatewayReuseState: "healthy" as GatewayReuseState, | ||
| portCheckOptions: undefined, | ||
| supportsLifecycleCommands: true, | ||
| }; | ||
|
|
||
| describe("applyHealthyPortReuse", () => { | ||
| afterEach(() => { | ||
| vi.restoreAllMocks(); | ||
| }); | ||
|
|
||
| it("returns null when recorded state is not healthy", async () => { | ||
| const result = await applyHealthyPortReuse({ | ||
| ...BASE_INPUT, | ||
| gatewayReuseState: "missing", | ||
| destroyGateway: () => true, | ||
| runOpenshell: vi.fn(), | ||
| checkPortAvailable: vi.fn(), | ||
| verifyGatewayContainerRunning: vi.fn(), | ||
| }); | ||
| expect(result).toBeNull(); | ||
| }); | ||
|
|
||
| it("returns null for ports that are neither the gateway nor dashboard port", async () => { | ||
| const result = await applyHealthyPortReuse({ | ||
| ...BASE_INPUT, | ||
| port: 9999, | ||
| destroyGateway: () => true, | ||
| runOpenshell: vi.fn(), | ||
| checkPortAvailable: vi.fn(), | ||
| verifyGatewayContainerRunning: vi.fn(), | ||
| }); | ||
| expect(result).toBeNull(); | ||
| }); | ||
|
|
||
| it("cleans up stale metadata and returns downgraded state when the port frees up", async () => { | ||
| const log = vi.spyOn(console, "log").mockImplementation(() => undefined); | ||
| const destroyGateway = vi.fn(() => true); | ||
| const runOpenshell = vi.fn(); | ||
| const checkPortAvailable = vi.fn().mockResolvedValue({ ok: true }); | ||
|
|
||
| const result = await applyHealthyPortReuse({ | ||
| ...BASE_INPUT, | ||
| destroyGateway, | ||
| runOpenshell, | ||
| checkPortAvailable, | ||
| verifyGatewayContainerRunning: () => "missing", | ||
| }); | ||
|
|
||
| expect(result).not.toBe("continue"); | ||
| expect(result).not.toBeNull(); | ||
| if (result && result !== "continue") { | ||
| // Caller must see the downgraded reuse state even when the port frees up, | ||
| // so downstream branches don't keep treating the runtime as healthy. | ||
| expect(result.gatewayReuseState).toBe("missing"); | ||
| expect(result.portCheck.ok).toBe(true); | ||
| } | ||
| expect(destroyGateway).toHaveBeenCalledTimes(1); | ||
| expect(runOpenshell).toHaveBeenCalledWith(["forward", "stop", "18789"], { ignoreError: true }); | ||
| expect(checkPortAvailable).toHaveBeenCalledWith(8080, undefined); | ||
| const messages = log.mock.calls.map((c) => c[0]); | ||
| expect(messages).toContain(" Gateway metadata is stale (container not running). Cleaning up..."); | ||
| expect(messages).toContain(" ✓ Port 8080 available (OpenShell gateway)"); | ||
| }); | ||
|
|
||
| it("returns fall-through state when cleanup succeeds but the port is still busy", async () => { | ||
| vi.spyOn(console, "log").mockImplementation(() => undefined); | ||
| const checkPortAvailable = vi.fn().mockResolvedValue({ ok: false }); | ||
|
|
||
| const result = await applyHealthyPortReuse({ | ||
| ...BASE_INPUT, | ||
| destroyGateway: () => true, | ||
| runOpenshell: vi.fn(), | ||
| checkPortAvailable, | ||
| verifyGatewayContainerRunning: () => "missing", | ||
| }); | ||
|
|
||
| expect(result).not.toBeNull(); | ||
| expect(result).not.toBe("continue"); | ||
| if (result && result !== "continue") { | ||
| expect(result.gatewayReuseState).toBe("missing"); | ||
| expect(result.portCheck.ok).toBe(false); | ||
| } | ||
| }); | ||
|
|
||
| it("logs the reuse message and continues when the container is live", async () => { | ||
| const log = vi.spyOn(console, "log").mockImplementation(() => undefined); | ||
| const destroyGateway = vi.fn(() => true); | ||
| const checkPortAvailable = vi.fn(); | ||
|
|
||
| const result = await applyHealthyPortReuse({ | ||
| ...BASE_INPUT, | ||
| destroyGateway, | ||
| runOpenshell: vi.fn(), | ||
| checkPortAvailable, | ||
| verifyGatewayContainerRunning: () => "running", | ||
| }); | ||
|
|
||
| expect(result).toBe("continue"); | ||
| expect(destroyGateway).not.toHaveBeenCalled(); | ||
| expect(checkPortAvailable).not.toHaveBeenCalled(); | ||
| const messages = log.mock.calls.map((c) => c[0]); | ||
| expect(messages).toContain( | ||
| " ✓ Port 8080 already owned by healthy NemoClaw runtime (OpenShell gateway)", | ||
| ); | ||
| }); | ||
|
|
||
| it("skips the container probe when lifecycle commands are unsupported", async () => { | ||
| // Package-managed openshell builds intentionally have no openshell-cluster-* | ||
| // container. Probing Docker would always return "missing" and the decision | ||
| // would still come out as "reuse", so the probe is wasted work — worse, in | ||
| // environments where the docker CLI is flaky the probe can stall onboard. | ||
| const log = vi.spyOn(console, "log").mockImplementation(() => undefined); | ||
| const verifyContainer = vi.fn(); | ||
|
|
||
| const result = await applyHealthyPortReuse({ | ||
| ...BASE_INPUT, | ||
| supportsLifecycleCommands: false, | ||
| destroyGateway: () => true, | ||
| runOpenshell: vi.fn(), | ||
| checkPortAvailable: vi.fn(), | ||
| verifyGatewayContainerRunning: verifyContainer, | ||
| }); | ||
|
|
||
| expect(result).toBe("continue"); | ||
| expect(verifyContainer).not.toHaveBeenCalled(); | ||
| const messages = log.mock.calls.map((c) => c[0]); | ||
| expect(messages).toContain( | ||
| " ✓ Port 8080 already owned by healthy NemoClaw runtime (OpenShell gateway)", | ||
| ); | ||
| }); | ||
|
|
||
| it("reuses the dashboard port without consulting the container", async () => { | ||
| const log = vi.spyOn(console, "log").mockImplementation(() => undefined); | ||
| const checkPortAvailable = vi.fn(); | ||
| const verifyContainer = vi.fn(); | ||
|
|
||
| const result = await applyHealthyPortReuse({ | ||
| ...BASE_INPUT, | ||
| port: 18789, | ||
| label: "NemoClaw dashboard", | ||
| destroyGateway: () => true, | ||
| runOpenshell: vi.fn(), | ||
| checkPortAvailable, | ||
| verifyGatewayContainerRunning: verifyContainer, | ||
| }); | ||
|
|
||
| expect(result).toBe("continue"); | ||
| expect(verifyContainer).not.toHaveBeenCalled(); | ||
| expect(checkPortAvailable).not.toHaveBeenCalled(); | ||
| const messages = log.mock.calls.map((c) => c[0]); | ||
| expect(messages).toContain( | ||
| " ✓ Port 18789 already owned by healthy NemoClaw runtime (NemoClaw dashboard)", | ||
| ); | ||
| }); | ||
| }); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,107 @@ | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| import type { GatewayReuseState } from "../state/gateway"; | ||
| import { destroyGatewayForReuse } from "./gateway-cleanup"; | ||
| import type { GatewayContainerState } from "./gateway-container-running"; | ||
| import type { CheckPortOpts, PortProbeResult } from "./preflight"; | ||
|
|
||
| export type GatewayPortReuseDecision = "stale" | "reuse" | "skip"; | ||
|
|
||
| export interface GatewayPortReuseInput { | ||
| gatewayReuseState: GatewayReuseState; | ||
| supportsLifecycleCommands: boolean; | ||
| containerState: GatewayContainerState; | ||
| } | ||
|
|
||
| // When the preflight port-required check finds the gateway port occupied, the | ||
| // short-circuit in onboard.ts trusts `gatewayReuseState === "healthy"` and | ||
| // reuses the recorded runtime. That short-circuit can fire even after the | ||
| // openshell-cluster-* container has been removed out-of-band (Colima/Docker | ||
| // restart on macOS; manual `docker rm`), because the upstream stale-metadata | ||
| // check that would have cleared the recorded state is gated by | ||
| // `gatewayCliSupportsLifecycleCommands` and can be skipped. (#4268) | ||
| // | ||
| // This decision wraps the second-line defense: when the port is the gateway | ||
| // port and recorded state is "healthy", consult the live container state | ||
| // before reusing. Returns: | ||
| // - "stale" — recorded state is "healthy" but the legacy container is | ||
| // missing; caller should clear registry and re-check the port. | ||
| // - "reuse" — recorded state is "healthy" and the container is live (or the | ||
| // CLI lacks lifecycle commands, so the container model is | ||
| // package-managed and Docker is not the source of truth). | ||
| // - "skip" — recorded state is not "healthy"; this defensive path does not | ||
| // apply and the caller should fall through to other handling. | ||
| export function classifyGatewayPortReuse( | ||
| input: GatewayPortReuseInput, | ||
| ): GatewayPortReuseDecision { | ||
| if (input.gatewayReuseState !== "healthy") return "skip"; | ||
| if (!input.supportsLifecycleCommands) return "reuse"; | ||
| if (input.containerState === "missing") return "stale"; | ||
| return "reuse"; | ||
| } | ||
|
|
||
| export interface HealthyPortReuseInput { | ||
| port: number; | ||
| gatewayPort: number; | ||
| dashboardPort: number; | ||
| label: string; | ||
| runtimeDisplayName: string; | ||
| gatewayName: string; | ||
| gatewayReuseState: GatewayReuseState; | ||
| portCheckOptions: CheckPortOpts | undefined; | ||
| supportsLifecycleCommands: boolean; | ||
| destroyGateway: () => boolean; | ||
| runOpenshell: (args: string[], opts: { ignoreError: true }) => unknown; | ||
| checkPortAvailable: (port?: number, opts?: CheckPortOpts) => Promise<PortProbeResult>; | ||
| verifyGatewayContainerRunning: (gatewayName: string) => GatewayContainerState; | ||
| } | ||
|
|
||
| export type HealthyPortReuseOutcome = | ||
| | "continue" | ||
| | { gatewayReuseState: GatewayReuseState; portCheck: PortProbeResult }; | ||
|
|
||
| // Drive the port-required short-circuit when `gatewayReuseState === "healthy"`. | ||
| // Owns the stale-container detection, registry cleanup, port re-check, and | ||
| // console messaging. Returns `null` when the recorded state isn't reusable | ||
| // (caller should fall through to its usual port-conflict handling); returns | ||
| // "continue" when the loop iteration should be skipped; otherwise returns the | ||
| // updated `gatewayReuseState` plus the re-checked port result so the caller | ||
| // can replay the iteration with fresh state. | ||
| export async function applyHealthyPortReuse( | ||
| input: HealthyPortReuseInput, | ||
| ): Promise<HealthyPortReuseOutcome | null> { | ||
| const { port, gatewayPort, dashboardPort, label, runtimeDisplayName, gatewayName } = input; | ||
| if (input.gatewayReuseState !== "healthy") return null; | ||
| if (port !== gatewayPort && port !== dashboardPort) return null; | ||
| // Only probe the container when lifecycle commands are advertised — for | ||
| // package-managed gateways without lifecycle commands the openshell-cluster-* | ||
| // container intentionally doesn't exist and the probe would always report | ||
| // "missing", which is then ignored by classifyGatewayPortReuse anyway. | ||
| if (port === gatewayPort && input.supportsLifecycleCommands) { | ||
| const decision = classifyGatewayPortReuse({ | ||
| gatewayReuseState: input.gatewayReuseState, | ||
| supportsLifecycleCommands: true, | ||
| containerState: input.verifyGatewayContainerRunning(gatewayName), | ||
| }); | ||
| if (decision === "stale") { | ||
| console.log(" Gateway metadata is stale (container not running). Cleaning up..."); | ||
| input.runOpenshell(["forward", "stop", String(dashboardPort)], { ignoreError: true }); | ||
| const gatewayReuseState = destroyGatewayForReuse( | ||
| input.destroyGateway, | ||
| " ✓ Stale gateway metadata cleaned up", | ||
| " ! Stale gateway metadata cleanup failed; leaving registry state intact.", | ||
| ); | ||
| const portCheck = await input.checkPortAvailable(port, input.portCheckOptions); | ||
| if (portCheck.ok) { | ||
| console.log(` ✓ Port ${port} available (${label})`); | ||
| } | ||
| // Always return the downgraded state so the caller stops treating the | ||
| // runtime as healthy. The caller uses `portCheck.ok` to decide whether | ||
| // to continue the loop or fall through to its port-conflict diagnostic. | ||
| return { gatewayReuseState, portCheck }; | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| } | ||
| } | ||
| console.log(` ✓ Port ${port} already owned by healthy ${runtimeDisplayName} runtime (${label})`); | ||
| return "continue"; | ||
| } | ||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.