diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 9afcdbbf0a..c3b412d9a2 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -141,7 +141,6 @@ const crypto = require("node:crypto"); const fs = require("fs"); const os = require("os"); const path = require("path"); -const { spawnSync } = require("child_process"); const pRetry = require("p-retry"); /** Strip ANSI escape sequences before printing process output to the terminal. @@ -379,7 +378,6 @@ const { const sandboxGpuPreflight: typeof import("./onboard/sandbox-gpu-preflight") = require("./onboard/sandbox-gpu-preflight"); const { exitOnSandboxGpuConfigErrors, - formatSandboxGpuPassthroughNote, resolveSandboxGpuFlagFromOptions, validateSandboxGpuPreflight, } = sandboxGpuPreflight; @@ -459,21 +457,19 @@ const { const { handleFinalizationState, }: typeof import("./onboard/machine/handlers/finalization") = require("./onboard/machine/handlers/finalization"); -const { - handleGatewayState, -}: typeof import("./onboard/machine/handlers/gateway") = require("./onboard/machine/handlers/gateway"); const { handlePoliciesState, }: typeof import("./onboard/machine/handlers/policies") = require("./onboard/machine/handlers/policies"); -const { - handlePreflightState, -}: typeof import("./onboard/machine/handlers/preflight") = require("./onboard/machine/handlers/preflight"); const { handleProviderInferenceState, }: typeof import("./onboard/machine/handlers/provider-inference") = require("./onboard/machine/handlers/provider-inference"); const { handleSandboxState, }: typeof import("./onboard/machine/handlers/sandbox") = require("./onboard/machine/handlers/sandbox"); +const { + createInitialOnboardFlowPhases, + runInitialOnboardFlowSlice, +}: typeof import("./onboard/machine/initial-flow-phases") = require("./onboard/machine/initial-flow-phases"); const { advanceTo, }: typeof import("./onboard/machine/result") = require("./onboard/machine/result"); @@ -6415,29 +6411,56 @@ async function onboard(opts: OnboardOptions = {}): Promise { const explicitSandboxGpuFlag = resolveSandboxGpuFlagFromOptions(opts); const recordedGpuPassthroughBeforePreflight = session?.gpuPassthrough === true; - const preflightResult = await handlePreflightState({ + type InitialOnboardFlowContext = + import("./onboard/machine/initial-flow-phases").InitialOnboardFlowContext< + typeof agent, + ReturnType, + ReturnType + >; + const initialFlowContext: InitialOnboardFlowContext = { resume, + fresh, session, + agent, recordedSandboxName, requestedSandboxName, + sandboxName: recordedSandboxName || requestedSandboxName || null, + fromDockerfile, + model: session?.model || null, + provider: session?.provider || null, + endpointUrl: session?.endpointUrl || null, + credentialEnv: session?.credentialEnv || null, + hermesAuthMethod: normalizeHermesAuthMethod(session?.hermesAuthMethod), + hermesToolGateways: normalizeHermesToolGatewaySelections(session?.hermesToolGateways), + preferredInferenceApi: session?.preferredInferenceApi || null, + nimContainer: session?.nimContainer || null, + webSearchConfig: session?.webSearchConfig || null, + webSearchSupported: false, + selectedMessagingChannels, + gpu: null, + sandboxGpuConfig: null, + gpuPassthrough: false, + resumeHasResolvedGpuIntent: false, + requestedGpuPassthrough: opts.gpu === true, + }; + + const [preflightPhase, gatewayPhase] = createInitialOnboardFlowPhases({ explicitSandboxGpuFlag, sandboxGpuDevice: opts.sandboxGpuDevice ?? null, gpuRequested: opts.gpu === true, noGpu: opts.noGpu === true, env: process.env, - deps: { + recordedGpuPassthroughBeforePreflight, + ensureResumePreflightDashboardPortAvailable: () => { + if (_preflightDashboardPort === null) preflightDashboardPortRangeAvailability(); + }, + preflightDeps: { getSandbox: registry.getSandbox.bind(registry), getResumeSandboxGpuOverrides, detectGpu: nim.detectGpu, runPreflight: (preflightOptions) => preflight({ ...opts, ...preflightOptions }), assessHost, assertCdiNvidiaGpuSpecPresent, - // Resume backstops for #3508/#3630/Podman: the cached preflight - // step does not capture host Docker/DNS state, and a session - // written by an older NemoClaw may not have run the new bridge/ - // DNS fatal checks (mirrors the assertCdiNvidiaGpuSpecPresent - // resume pattern). Podman rejection runs first so users on - // unsupported runtimes don't see Docker-specific diagnostics. rejectUnsupportedContainerRuntime, assertDockerBridgeAndContainerDnsHealthy, resolveSandboxGpuConfig, @@ -6448,59 +6471,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { recordStepComplete, updateSession: onboardSession.updateSession, }, - }); - if (resume && _preflightDashboardPort === null) preflightDashboardPortRangeAvailability(); // #3953 — resume must mirror preflight()'s fail-fast - session = (await recordStateResult(preflightResult.stateResult), preflightResult.session); - const { - sandboxGpuConfig, - resumeHasResolvedGpuIntent, - requestedGpuPassthrough, - gpuPassthrough, - } = preflightResult; - const gpu = preflightResult.gpu ?? null; - if (gpuPassthrough) { - note( - formatSandboxGpuPassthroughNote({ - hostGpuPlatform: sandboxGpuConfig.hostGpuPlatform, - resumeHasResolvedGpuIntent, - recordedGpuPassthroughBeforePreflight, - requestedGpuPassthrough, - sandboxGpuMode: sandboxGpuConfig.mode, - }), - ); - } else if (gpu?.platform === "jetson") { - note(" Sandbox GPU disabled by configuration on Jetson/Tegra."); - } else if (process.platform === "linux" && !opts.noGpu) { - try { - const lspci = spawnSync("lspci", { encoding: "utf-8", timeout: 5000 }); - if (lspci.status === 0 && /nvidia/i.test(lspci.stdout || "")) { - const smi = spawnSync( - "nvidia-smi", - ["--query-gpu=name", "--format=csv,noheader,nounits"], - { encoding: "utf-8", timeout: 5000 }, - ); - note( - smi.status === 0 && smi.stdout?.trim() - ? " NVIDIA GPU detected with working drivers, but GPU passthrough was not enabled.\n If Docker GPU support is needed, install nvidia-container-toolkit and run:\n sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker" - : " NVIDIA GPU hardware detected but nvidia-smi is not available.\n Install NVIDIA drivers and the Container Toolkit for default GPU passthrough.", - ); - } - } catch { - /* lspci not available — skip hint */ - } - } - const gatewaySnapshot = selectNamedGatewayForReuseIfNeeded(getGatewayReuseSnapshot()); - const gatewayResult = await handleGatewayState({ - resume, - session, - initialGatewayReuseState: gatewaySnapshot.gatewayReuseState, - gpu, - gpuPassthrough, + getInitialGatewayReuseState: () => + selectNamedGatewayForReuseIfNeeded(getGatewayReuseSnapshot()).gatewayReuseState, gatewayName: GATEWAY_NAME, - recordedSandboxName, - requestedSandboxName, - recreateSandbox: isRecreateSandbox(), - deps: { + recreateSandbox: isRecreateSandbox, + gatewayDeps: { refreshDockerDriverGatewayReuseState, gatewayCliSupportsLifecycleCommands: () => gatewayCliSupportsLifecycleCommands(runCaptureOpenshell), @@ -6529,8 +6504,23 @@ async function onboard(opts: OnboardOptions = {}): Promise { recordStepComplete, exitProcess: (code) => process.exit(code), }, + note, }); - session = (await recordStateResult(gatewayResult.stateResult), gatewayResult.session); + const initialFlowResult = await runInitialOnboardFlowSlice({ + context: initialFlowContext, + runtime: onboardRuntimeBoundary.getRuntime(), + phases: [preflightPhase, gatewayPhase], + resume, + recordStateResult, + }); + + const initialContext = initialFlowResult.context; + if (!initialContext.sandboxGpuConfig) { + throw new Error("Preflight did not produce a sandbox GPU configuration."); + } + session = initialFlowResult.session; + const sandboxGpuConfig = initialContext.sandboxGpuConfig; + const gpu = initialContext.gpu; // #2753: prefer requestedSandboxName over an unconfirmed session name. // A pre-fix session may carry sandboxName even though sandbox creation diff --git a/src/lib/onboard/machine/initial-flow-phases.test.ts b/src/lib/onboard/machine/initial-flow-phases.test.ts new file mode 100644 index 0000000000..4aac746740 --- /dev/null +++ b/src/lib/onboard/machine/initial-flow-phases.test.ts @@ -0,0 +1,468 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { createSession, type Session } from "../../state/onboard-session"; +import { + createInitialOnboardFlowPhases, + type InitialOnboardFlowContext, + runInitialOnboardFlowSlice, +} from "./initial-flow-phases"; +import { advanceTo } from "./result"; +import type { OnboardMachineRunnerRuntime } from "./runner"; +import type { OnboardSequencePhase } from "./sequence-runner"; + +type Gpu = { type: "nvidia"; platform: "linux" | "jetson" } | null; +type SandboxGpuConfig = { + sandboxGpuEnabled: boolean; + mode: string; + hostGpuPlatform: string | null; + sandboxGpuDevice?: string | null; + errors?: string[]; +}; +type Context = InitialOnboardFlowContext; + +function context(overrides: Partial = {}): Context { + return { + resume: false, + fresh: false, + session: createSession(), + agent: null, + recordedSandboxName: null, + requestedSandboxName: null, + sandboxName: null, + fromDockerfile: null, + model: null, + provider: null, + endpointUrl: null, + credentialEnv: null, + hermesAuthMethod: null, + hermesToolGateways: [], + preferredInferenceApi: null, + nimContainer: null, + webSearchConfig: null, + webSearchSupported: false, + selectedMessagingChannels: [], + gpu: null, + sandboxGpuConfig: null, + gpuPassthrough: false, + resumeHasResolvedGpuIntent: false, + requestedGpuPassthrough: false, + ...overrides, + }; +} + +function config(gpu: Gpu): SandboxGpuConfig { + return { + sandboxGpuEnabled: Boolean(gpu), + mode: gpu ? "1" : "0", + hostGpuPlatform: gpu?.platform ?? null, + sandboxGpuDevice: null, + errors: [], + }; +} + +function runtime(session: Session = createSession()): OnboardMachineRunnerRuntime { + return { + session: async () => session, + applyResult: async () => session, + }; +} + +function completeStep(): Session["steps"][string] { + return { + status: "complete", + startedAt: "2026-06-09T00:00:00.000Z", + completedAt: "2026-06-09T00:01:00.000Z", + error: null, + }; +} + +describe("initial onboard flow phases", () => { + it("carries preflight GPU output into the gateway phase", async () => { + const notes: string[] = []; + const gpu: Gpu = { type: "nvidia", platform: "linux" }; + const phases = createInitialOnboardFlowPhases({ + explicitSandboxGpuFlag: null, + sandboxGpuDevice: null, + gpuRequested: true, + noGpu: false, + env: {}, + platform: "darwin", + recordedGpuPassthroughBeforePreflight: false, + ensureResumePreflightDashboardPortAvailable: vi.fn(), + preflightDeps: { + getSandbox: () => null, + getResumeSandboxGpuOverrides: () => ({ flag: null, device: null }), + detectGpu: () => gpu, + runPreflight: async () => gpu, + assessHost: () => ({}), + assertCdiNvidiaGpuSpecPresent: vi.fn(), + rejectUnsupportedContainerRuntime: vi.fn(), + assertDockerBridgeAndContainerDnsHealthy: vi.fn(), + resolveSandboxGpuConfig: config, + validateSandboxGpuPreflight: vi.fn(), + skippedStepMessage: vi.fn(), + recordStateSkipped: async () => createSession(), + startRecordedStep: vi.fn(), + recordStepComplete: async () => createSession(), + updateSession: (mutator) => { + const next = createSession(); + return mutator(next) ?? next; + }, + }, + getInitialGatewayReuseState: () => "healthy", + gatewayName: "nemoclaw", + recreateSandbox: () => false, + gatewayDeps: { + refreshDockerDriverGatewayReuseState: async (state) => state, + gatewayCliSupportsLifecycleCommands: () => false, + verifyGatewayContainerRunning: () => "running", + waitForGatewayHttpReady: async () => true, + recoverGatewayRuntime: async () => true, + getGatewayLocalEndpoint: () => "http://127.0.0.1:31818", + stopDashboardForward: vi.fn(), + destroyGateway: () => true, + destroyGatewayForReuse: () => "missing", + getGatewayClusterImageDrift: () => null, + stopAllDashboardForwards: vi.fn(), + reconcileGatewayGpuReuseForGpuIntent: (options) => options.gatewayReuseState, + isLinuxDockerDriverGatewayEnabled: () => false, + retireLegacyGatewayForDockerDriverUpgrade: vi.fn(), + destroyGatewayRuntimeForGpuReuse: () => true, + skippedStepMessage: vi.fn(), + recordStateSkipped: async () => createSession(), + note: (message) => notes.push(message), + startRecordedStep: vi.fn(), + startGateway: vi.fn(), + recordStepComplete: async () => createSession(), + exitProcess: (code) => { + throw new Error(`exit ${code}`); + }, + }, + note: (message) => notes.push(message), + }); + + const preflight = await phases[0].run(context()); + const gateway = await phases[1].run(preflight.context); + + expect(preflight.context.gpu).toEqual(gpu); + expect(preflight.context.sandboxGpuConfig).toEqual(config(gpu)); + expect(preflight.context.gpuPassthrough).toBe(true); + expect(gateway.result).toEqual( + advanceTo("provider_selection", { + metadata: { state: "gateway", gatewayReuseState: "healthy" }, + }), + ); + expect(notes).toContain( + " GPU passthrough requested; passing --gpu to OpenShell gateway and sandbox creation.", + ); + }); + + it("records each phase result on the resume compatibility path", async () => { + const recorded: string[] = []; + const phases: readonly OnboardSequencePhase[] = [ + { + state: "preflight", + run: (ctx) => ({ context: ctx, result: advanceTo("gateway") }), + }, + { + state: "gateway", + run: (ctx) => ({ context: ctx, result: advanceTo("provider_selection") }), + }, + ]; + + await runInitialOnboardFlowSlice({ + context: context({ resume: true }), + runtime: runtime(), + phases, + resume: true, + recordStateResult: async (result) => { + if (result.type === "transition") recorded.push(result.next); + }, + }); + + expect(recorded).toEqual(["gateway", "provider_selection"]); + }); + + it("returns the runtime session after resume compatibility state recording", async () => { + const phaseSession = createSession({ + machine: { + version: 1, + state: "preflight", + stateEnteredAt: "2026-06-09T00:00:00.000Z", + revision: 0, + }, + }); + let runtimeSession = createSession({ + machine: { + version: 1, + state: "preflight", + stateEnteredAt: "2026-06-09T00:00:00.000Z", + revision: 0, + }, + }); + const phases: readonly OnboardSequencePhase[] = [ + { + state: "preflight", + run: (ctx) => ({ + context: { ...ctx, session: phaseSession }, + result: advanceTo("gateway"), + }), + }, + ]; + + const result = await runInitialOnboardFlowSlice({ + context: context({ resume: true, session: phaseSession }), + runtime: { + session: async () => runtimeSession, + applyResult: async () => { + throw new Error("resume compatibility path should not use strict applyResult"); + }, + }, + phases, + resume: true, + recordStateResult: async (stateResult) => { + if (stateResult.type === "transition") { + runtimeSession = createSession({ + machine: { + version: 1, + state: stateResult.next, + stateEnteredAt: "2026-06-09T00:01:00.000Z", + revision: 1, + }, + }); + } + }, + }); + + expect(result.context.session).toBe(phaseSession); + expect(result.session).toBe(runtimeSession); + expect(result.session.machine.state).toBe("gateway"); + }); + + it("runs resume preflight and gateway backstops when saved machine state is already ahead", async () => { + const calls: string[] = []; + const gpu: Gpu = { type: "nvidia", platform: "linux" }; + const session = createSession({ + gpuPassthrough: true, + machine: { + version: 1, + state: "provider_selection", + stateEnteredAt: "2026-06-09T00:02:00.000Z", + revision: 7, + }, + steps: { + preflight: completeStep(), + gateway: completeStep(), + }, + }); + const ensureResumePreflightDashboardPortAvailable = vi.fn(() => { + calls.push("ensure-resume-preflight-port"); + }); + const phases = createInitialOnboardFlowPhases({ + explicitSandboxGpuFlag: null, + sandboxGpuDevice: null, + gpuRequested: false, + noGpu: false, + env: {}, + platform: "darwin", + recordedGpuPassthroughBeforePreflight: true, + ensureResumePreflightDashboardPortAvailable, + preflightDeps: { + getSandbox: vi.fn(() => { + calls.push("get-sandbox"); + return { name: "existing" }; + }), + getResumeSandboxGpuOverrides: vi.fn(() => { + calls.push("resume-gpu-overrides"); + return { flag: "enable" as const, device: null }; + }), + detectGpu: vi.fn(() => { + calls.push("detect-gpu"); + return gpu; + }), + runPreflight: vi.fn(async () => { + throw new Error("cached resume preflight should not run full preflight"); + }), + assessHost: vi.fn(() => { + calls.push("assess-host"); + return { docker: true }; + }), + assertCdiNvidiaGpuSpecPresent: vi.fn(() => { + calls.push("assert-cdi"); + }), + rejectUnsupportedContainerRuntime: vi.fn(() => { + calls.push("reject-unsupported-runtime"); + }), + assertDockerBridgeAndContainerDnsHealthy: vi.fn(() => { + calls.push("assert-bridge-dns"); + }), + resolveSandboxGpuConfig: vi.fn((detectedGpu) => { + calls.push("resolve-gpu-config"); + return config(detectedGpu); + }), + validateSandboxGpuPreflight: vi.fn(() => { + calls.push("validate-gpu-preflight"); + }), + skippedStepMessage: vi.fn(() => { + calls.push("skip-preflight"); + }), + recordStateSkipped: vi.fn(async () => { + calls.push("record-preflight-skipped"); + return session; + }), + startRecordedStep: vi.fn(async () => { + throw new Error("cached resume preflight should not start a recorded preflight step"); + }), + recordStepComplete: vi.fn(async () => session), + updateSession: vi.fn((mutator) => mutator(session) ?? session), + }, + getInitialGatewayReuseState: () => { + calls.push("initial-gateway-reuse-state"); + return "healthy"; + }, + gatewayName: "nemoclaw", + recreateSandbox: () => false, + gatewayDeps: { + refreshDockerDriverGatewayReuseState: vi.fn(async (state) => { + calls.push("refresh-gateway-reuse"); + return state; + }), + gatewayCliSupportsLifecycleCommands: vi.fn(() => { + calls.push("gateway-lifecycle-support"); + return false; + }), + verifyGatewayContainerRunning: vi.fn(() => { + throw new Error("gateway lifecycle probe should not run without lifecycle support"); + }), + waitForGatewayHttpReady: vi.fn(async () => true), + recoverGatewayRuntime: vi.fn(async () => true), + getGatewayLocalEndpoint: vi.fn(() => "http://127.0.0.1:31818"), + stopDashboardForward: vi.fn(), + destroyGateway: vi.fn(() => true), + destroyGatewayForReuse: vi.fn(() => "missing" as const), + getGatewayClusterImageDrift: vi.fn(() => null), + stopAllDashboardForwards: vi.fn(), + reconcileGatewayGpuReuseForGpuIntent: vi.fn((options) => { + calls.push("reconcile-gateway-gpu"); + return options.gatewayReuseState; + }), + isLinuxDockerDriverGatewayEnabled: vi.fn(() => false), + retireLegacyGatewayForDockerDriverUpgrade: vi.fn(), + destroyGatewayRuntimeForGpuReuse: vi.fn(() => true), + skippedStepMessage: vi.fn(() => { + calls.push("skip-gateway"); + }), + recordStateSkipped: vi.fn(async () => { + calls.push("record-gateway-skipped"); + return session; + }), + note: vi.fn(), + startRecordedStep: vi.fn(async () => { + throw new Error("healthy resume gateway should not start a recorded gateway step"); + }), + startGateway: vi.fn(async () => { + throw new Error("healthy resume gateway should not start a new gateway"); + }), + recordStepComplete: vi.fn(async () => { + calls.push("record-gateway-complete"); + return session; + }), + exitProcess: (code) => { + throw new Error(`exit ${code}`); + }, + }, + note: vi.fn(), + }); + const recorded: string[] = []; + + const result = await runInitialOnboardFlowSlice({ + context: context({ + resume: true, + session, + recordedSandboxName: "existing", + gpuPassthrough: true, + }), + runtime: runtime(session), + phases, + resume: true, + recordStateResult: async (stateResult) => { + if (stateResult.type === "transition") recorded.push(stateResult.next); + }, + }); + + expect(result.session.machine.state).toBe("provider_selection"); + expect(ensureResumePreflightDashboardPortAvailable).toHaveBeenCalledOnce(); + expect(calls).toEqual([ + "get-sandbox", + "resume-gpu-overrides", + "skip-preflight", + "record-preflight-skipped", + "detect-gpu", + "resolve-gpu-config", + "validate-gpu-preflight", + "assess-host", + "reject-unsupported-runtime", + "assert-cdi", + "assert-bridge-dns", + "resolve-gpu-config", + "ensure-resume-preflight-port", + "initial-gateway-reuse-state", + "refresh-gateway-reuse", + "gateway-lifecycle-support", + "reconcile-gateway-gpu", + "skip-gateway", + "record-gateway-skipped", + "record-gateway-complete", + ]); + expect(recorded).toEqual(["gateway", "provider_selection"]); + }); + + it("uses the strict runner for fresh init sessions", async () => { + const order: string[] = []; + const session = createSession(); + const phases: readonly OnboardSequencePhase[] = [ + { + state: "preflight", + run: (ctx) => { + order.push("preflight"); + return { context: ctx, result: advanceTo("gateway") }; + }, + }, + { + state: "gateway", + run: (ctx) => { + order.push("gateway"); + return { context: ctx, result: advanceTo("provider_selection") }; + }, + }, + ]; + + const result = await runInitialOnboardFlowSlice({ + context: context(), + runtime: { + session: async () => session, + applyResult: async (stateResult) => { + if (stateResult.type === "transition") { + session.machine = { + ...session.machine, + state: stateResult.next, + revision: session.machine.revision + 1, + }; + } + return session; + }, + }, + phases, + resume: false, + recordStateResult: async () => { + throw new Error("compatibility recorder should not run"); + }, + }); + + expect(order).toEqual(["preflight", "gateway"]); + expect(result.session.machine.state).toBe("provider_selection"); + }); +}); diff --git a/src/lib/onboard/machine/initial-flow-phases.ts b/src/lib/onboard/machine/initial-flow-phases.ts new file mode 100644 index 0000000000..30bc1f827c --- /dev/null +++ b/src/lib/onboard/machine/initial-flow-phases.ts @@ -0,0 +1,227 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { spawnSync } from "node:child_process"; +import type { GatewayReuseState } from "../../state/gateway"; +import { formatSandboxGpuPassthroughNote } from "../sandbox-gpu-notes"; +import type { OnboardFlowContext } from "./flow-context"; +import { runInitialOnboardFlowSequence } from "./flow-slices"; +import { type GatewayStateOptions, handleGatewayState } from "./handlers/gateway"; +import { + handlePreflightState, + type PreflightSandboxGpuConfig, + type PreflightSandboxGpuFlag, + type PreflightStateOptions, +} from "./handlers/preflight"; +import type { OnboardStateResult } from "./result"; +import type { + OnboardMachineRunnerResult, + OnboardMachineRunnerRuntime, + OnboardStateHandlerResult, +} from "./runner"; +import type { OnboardSequencePhase } from "./sequence-runner"; + +export type InitialOnboardFlowContext< + Agent, + Gpu, + Config extends PreflightSandboxGpuConfig, +> = OnboardFlowContext & { + resumeHasResolvedGpuIntent: boolean; + requestedGpuPassthrough: boolean; +}; + +type SpawnSync = typeof spawnSync; + +export interface InitialOnboardFlowPhaseOptions< + Context extends InitialOnboardFlowContext, + Agent, + Gpu, + SandboxEntry, + Host, + Config extends PreflightSandboxGpuConfig, +> { + explicitSandboxGpuFlag: PreflightSandboxGpuFlag; + sandboxGpuDevice?: string | null; + gpuRequested: boolean; + noGpu: boolean; + env: NodeJS.ProcessEnv; + platform?: NodeJS.Platform; + recordedGpuPassthroughBeforePreflight: boolean; + ensureResumePreflightDashboardPortAvailable(): void; + preflightDeps: PreflightStateOptions["deps"]; + getInitialGatewayReuseState(): GatewayReuseState; + gatewayName: string; + recreateSandbox(): boolean; + gatewayDeps: GatewayStateOptions["deps"]; + note(message: string): void; + spawnSync?: SpawnSync; +} + +function stateResults(result: OnboardStateHandlerResult): readonly OnboardStateResult[] { + if (Array.isArray(result)) return result as readonly OnboardStateResult[]; + return [result as OnboardStateResult]; +} + +function emitPreflightGpuNote(options: { + gpu: Gpu | null; + sandboxGpuConfig: Config; + gpuPassthrough: boolean; + resumeHasResolvedGpuIntent: boolean; + requestedGpuPassthrough: boolean; + recordedGpuPassthroughBeforePreflight: boolean; + noGpu: boolean; + platform: NodeJS.Platform; + note(message: string): void; + spawnSync: SpawnSync; +}): void { + const gpuPlatform = (options.gpu as { platform?: string | null } | null)?.platform ?? null; + if (options.gpuPassthrough) { + options.note( + formatSandboxGpuPassthroughNote({ + hostGpuPlatform: options.sandboxGpuConfig.hostGpuPlatform, + resumeHasResolvedGpuIntent: options.resumeHasResolvedGpuIntent, + recordedGpuPassthroughBeforePreflight: options.recordedGpuPassthroughBeforePreflight, + requestedGpuPassthrough: options.requestedGpuPassthrough, + sandboxGpuMode: options.sandboxGpuConfig.mode, + }), + ); + return; + } + if (gpuPlatform === "jetson") { + options.note(" Sandbox GPU disabled by configuration on Jetson/Tegra."); + return; + } + if (options.platform !== "linux" || options.noGpu) return; + try { + const lspci = options.spawnSync("lspci", { encoding: "utf-8", timeout: 5000 }); + if (lspci.status === 0 && /nvidia/i.test(lspci.stdout || "")) { + const smi = options.spawnSync( + "nvidia-smi", + ["--query-gpu=name", "--format=csv,noheader,nounits"], + { encoding: "utf-8", timeout: 5000 }, + ); + options.note( + smi.status === 0 && smi.stdout?.trim() + ? " NVIDIA GPU detected with working drivers, but GPU passthrough was not enabled.\n If Docker GPU support is needed, install nvidia-container-toolkit and run:\n sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker" + : " NVIDIA GPU hardware detected but nvidia-smi is not available.\n Install NVIDIA drivers and the Container Toolkit for default GPU passthrough.", + ); + } + } catch { + /* lspci not available - skip hint */ + } +} + +export function createInitialOnboardFlowPhases< + Context extends InitialOnboardFlowContext, + Agent, + Gpu, + SandboxEntry, + Host, + Config extends PreflightSandboxGpuConfig, +>( + options: InitialOnboardFlowPhaseOptions, +): readonly [OnboardSequencePhase, OnboardSequencePhase] { + const preflightPhase: OnboardSequencePhase = { + state: "preflight", + async run(context) { + const preflightResult = await handlePreflightState({ + resume: context.resume, + session: context.session, + recordedSandboxName: context.recordedSandboxName, + requestedSandboxName: context.requestedSandboxName, + explicitSandboxGpuFlag: options.explicitSandboxGpuFlag, + sandboxGpuDevice: options.sandboxGpuDevice ?? null, + gpuRequested: options.gpuRequested, + noGpu: options.noGpu, + env: options.env, + deps: options.preflightDeps, + }); + if (context.resume) options.ensureResumePreflightDashboardPortAvailable(); + + const preflightGpu = preflightResult.gpu ?? null; + emitPreflightGpuNote({ + gpu: preflightGpu, + sandboxGpuConfig: preflightResult.sandboxGpuConfig, + gpuPassthrough: preflightResult.gpuPassthrough, + resumeHasResolvedGpuIntent: preflightResult.resumeHasResolvedGpuIntent, + requestedGpuPassthrough: preflightResult.requestedGpuPassthrough, + recordedGpuPassthroughBeforePreflight: options.recordedGpuPassthroughBeforePreflight, + noGpu: options.noGpu, + platform: options.platform ?? process.platform, + note: options.note, + spawnSync: options.spawnSync ?? spawnSync, + }); + return { + context: { + ...context, + session: preflightResult.session, + gpu: preflightGpu, + sandboxGpuConfig: preflightResult.sandboxGpuConfig, + gpuPassthrough: preflightResult.gpuPassthrough, + resumeHasResolvedGpuIntent: preflightResult.resumeHasResolvedGpuIntent, + requestedGpuPassthrough: preflightResult.requestedGpuPassthrough, + }, + result: preflightResult.stateResult, + }; + }, + }; + + const gatewayPhase: OnboardSequencePhase = { + state: "gateway", + async run(context) { + const gatewayResult = await handleGatewayState({ + resume: context.resume, + session: context.session, + initialGatewayReuseState: options.getInitialGatewayReuseState(), + gpu: context.gpu as Gpu, + gpuPassthrough: context.gpuPassthrough, + gatewayName: options.gatewayName, + recordedSandboxName: context.recordedSandboxName, + requestedSandboxName: context.requestedSandboxName, + recreateSandbox: options.recreateSandbox(), + deps: options.gatewayDeps, + }); + return { + context: { ...context, session: gatewayResult.session }, + result: gatewayResult.stateResult, + }; + }, + }; + + return [preflightPhase, gatewayPhase]; +} + +export async function runInitialOnboardFlowSlice(options: { + context: Context; + runtime: OnboardMachineRunnerRuntime; + phases: readonly OnboardSequencePhase[]; + resume: boolean; + recordStateResult(result: OnboardStateResult): Promise; +}): Promise> { + const initialRuntimeSession = await options.runtime.session(); + // Keep resume on the compatibility path for now: resume intentionally re-runs + // preflight/gateway backstops even when the saved machine is already ahead. + // Remove this fallback only after resume repairs are modeled as strict FSM + // transitions that preserve these safety checks before later phases run. + if ( + !options.resume && + (initialRuntimeSession.machine.state === "init" || + initialRuntimeSession.machine.state === "preflight") + ) { + return runInitialOnboardFlowSequence({ + context: options.context, + runtime: options.runtime, + phases: options.phases, + }); + } + + let context = options.context; + for (const phase of options.phases) { + const phaseResult = await phase.run(context); + for (const stateResult of stateResults(phaseResult.result)) { + await options.recordStateResult(stateResult); + } + context = phaseResult.context; + } + return { context, session: await options.runtime.session() }; +} diff --git a/src/lib/onboard/sandbox-gpu-notes.ts b/src/lib/onboard/sandbox-gpu-notes.ts new file mode 100644 index 0000000000..9a8cf954a7 --- /dev/null +++ b/src/lib/onboard/sandbox-gpu-notes.ts @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +export function formatSandboxGpuPassthroughNote(options: { + hostGpuPlatform?: string | null; + resumeHasResolvedGpuIntent?: boolean; + recordedGpuPassthroughBeforePreflight?: boolean; + requestedGpuPassthrough?: boolean; + sandboxGpuMode?: string | null; +}): string { + if (options.hostGpuPlatform === "jetson") { + return " NVIDIA Jetson/Tegra GPU detected; enabling sandbox GPU through Docker NVIDIA runtime. Use --no-gpu to opt out."; + } + if (options.resumeHasResolvedGpuIntent && options.recordedGpuPassthroughBeforePreflight) { + return " [resume] Continuing GPU passthrough from the saved onboarding session."; + } + if (options.requestedGpuPassthrough || options.sandboxGpuMode === "1") { + return " GPU passthrough requested; passing --gpu to OpenShell gateway and sandbox creation."; + } + return " NVIDIA GPU detected; enabling OpenShell GPU passthrough. Use --no-gpu to opt out."; +} diff --git a/src/lib/onboard/sandbox-gpu-preflight.ts b/src/lib/onboard/sandbox-gpu-preflight.ts index e859e26ccc..9778b9ab3c 100644 --- a/src/lib/onboard/sandbox-gpu-preflight.ts +++ b/src/lib/onboard/sandbox-gpu-preflight.ts @@ -13,6 +13,8 @@ import { wslDockerDesktopGpuCompatibilityRemediationLines, } from "./wsl-docker-desktop-gpu"; +export { formatSandboxGpuPassthroughNote } from "./sandbox-gpu-notes"; + const SANDBOX_GPU_PREFLIGHT_TIMEOUT_MS = 30_000; export type SandboxGpuPreflightDeps = WslDockerDesktopDetectionDeps & { @@ -86,25 +88,6 @@ export function exitOnSandboxGpuConfigErrors(config: SandboxGpuConfig): void { } } -export function formatSandboxGpuPassthroughNote(options: { - hostGpuPlatform?: string | null; - resumeHasResolvedGpuIntent?: boolean; - recordedGpuPassthroughBeforePreflight?: boolean; - requestedGpuPassthrough?: boolean; - sandboxGpuMode?: string | null; -}): string { - if (options.hostGpuPlatform === "jetson") { - return " NVIDIA Jetson/Tegra GPU detected; enabling sandbox GPU through Docker NVIDIA runtime. Use --no-gpu to opt out."; - } - if (options.resumeHasResolvedGpuIntent && options.recordedGpuPassthroughBeforePreflight) { - return " [resume] Continuing GPU passthrough from the saved onboarding session."; - } - if (options.requestedGpuPassthrough || options.sandboxGpuMode === "1") { - return " GPU passthrough requested; passing --gpu to OpenShell gateway and sandbox creation."; - } - return " NVIDIA GPU detected; enabling OpenShell GPU passthrough. Use --no-gpu to opt out."; -} - export function parseDockerRuntimeNames(value: string | null | undefined): string[] { const raw = String(value || "").trim(); if (!raw || raw === "") return [];