diff --git a/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts b/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts index 7947b9a81b..fa7c15ef01 100644 --- a/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts +++ b/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts @@ -8,16 +8,21 @@ // generated-shell shape assertions live in // runtime-hermes-secret-boundary-shape.test.ts. +import { spawnSync } from "node:child_process"; import fs from "node:fs"; import os from "node:os"; import path from "node:path"; -import { spawnSync } from "node:child_process"; -import { describe, it, expect } from "vitest"; +import { describe, expect, it } from "vitest"; import { - HERMES_SECRET_BOUNDARY_VALIDATOR_PATH, __testing, + HERMES_SECRET_BOUNDARY_VALIDATOR_PATH, } from "../../../dist/lib/agent/hermes-recovery-boundary"; import { buildRecoveryScript } from "../../../dist/lib/agent/runtime"; +import { + createRecoveryPreloadHarnessPaths, + type RecoveryPreloadHarnessPaths, + rewriteRecoveryPreloadPaths, +} from "../../../test/helpers/runtime-recovery-preload-test-helpers"; import { hermesAgent } from "./hermes-recovery-boundary-fixtures"; function writeStub(dir: string, name: string, body: string) { @@ -193,6 +198,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = hermesLaunchMarker, gatewayLogPath, recoveryFallbackLog, + ...createRecoveryPreloadHarnessPaths(tmp), }; } @@ -204,19 +210,21 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = writeStub(stubsDir, "hermes", `: > ${JSON.stringify(hermesLaunchMarker)}\nexit 0`); } - function runRecovery(opts: { - stubsDir: string; - validatorPath: string; - envFilePath?: string; - proxyEnvPath?: string; - recoveryLogPath: string; - gatewayLogPath: string; - recoveryFallbackLog: string; - tmp: string; - }) { + function runRecovery( + opts: { + stubsDir: string; + validatorPath: string; + envFilePath?: string; + proxyEnvPath?: 
string; + recoveryLogPath: string; + gatewayLogPath: string; + recoveryFallbackLog: string; + tmp: string; + } & RecoveryPreloadHarnessPaths, + ) { const recoveryScript = buildRecoveryScript(hermesAgent, 8642); expect(recoveryScript).not.toBeNull(); - let stubbed = recoveryScript! + let stubbed = rewriteRecoveryPreloadPaths(recoveryScript!, opts) .replace(new RegExp(HERMES_SECRET_BOUNDARY_VALIDATOR_PATH, "g"), opts.validatorPath) .replace(/\/tmp\/gateway-recovery\.log/g, opts.recoveryLogPath) .replace(/\/tmp\/gateway\.log/g, opts.gatewayLogPath) @@ -384,6 +392,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = proxyEnvFile, "export NODE_OPTIONS='--require=nemoclaw-sandbox-safety-net --require=nemoclaw-ciao-network-guard'\n", ); + fs.chmodSync(proxyEnvFile, 0o444); writeStub(harness.stubsDir, "python3", `${SHARED_PYTHON_STUB_BY_MODE}\n`); stubBaselineUtilities(harness.stubsDir, harness.pkillLog, harness.hermesLaunchMarker); @@ -394,7 +403,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = (() => { const recoveryScript = buildRecoveryScript(hermesAgent, 8642); expect(recoveryScript).not.toBeNull(); - const stubbed = recoveryScript! + const stubbed = rewriteRecoveryPreloadPaths(recoveryScript!, harness) .replace( new RegExp(HERMES_SECRET_BOUNDARY_VALIDATOR_PATH, "g"), path.join(validatorRoot, "validate-hermes-env-secret-boundary.py"), @@ -462,6 +471,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = "", ].join("\n"), ); + fs.chmodSync(proxyEnvFile, 0o444); stubBaselineUtilities(harness.stubsDir, harness.pkillLog, harness.hermesLaunchMarker); try { diff --git a/src/lib/agent/runtime-recovery-preload.test.ts b/src/lib/agent/runtime-recovery-preload.test.ts new file mode 100644 index 0000000000..255b0a5553 --- /dev/null +++ b/src/lib/agent/runtime-recovery-preload.test.ts @@ -0,0 +1,382 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { spawnSync } from "node:child_process"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { describe, expect, it } from "vitest"; +import { buildOpenClawRecoveryScript, buildRecoveryScript } from "../../../dist/lib/agent/runtime"; +import { + buildGatewayGuardRecoveryLines, + GATEWAY_PRELOAD_GUARDS, +} from "../../../dist/lib/agent/runtime-recovery-preload"; +import { minimalAgent } from "./hermes-recovery-boundary-fixtures"; + +const [SAFETY_NET_GUARD, CIAO_GUARD] = GATEWAY_PRELOAD_GUARDS; + +function writeStub(dir: string, name: string, body: string) { + const stub = path.join(dir, name); + fs.writeFileSync(stub, `#!/usr/bin/env sh\n${body}\n`, { mode: 0o755 }); + return stub; +} + +function makeHarness() { + const root = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-recovery-preload-")); + const sourceDir = path.join(root, "usr-local-lib-nemoclaw-preloads"); + const workDir = path.join(root, "tmp"); + const stubsDir = path.join(root, "bin"); + fs.mkdirSync(sourceDir, { recursive: true }); + fs.mkdirSync(workDir, { recursive: true }); + fs.mkdirSync(stubsDir, { recursive: true }); + + const paths = { + root, + stubsDir, + sourceSafetyNet: path.join(sourceDir, "sandbox-safety-net.js"), + sourceCiao: path.join(sourceDir, "ciao-network-guard.js"), + tmpSafetyNet: path.join(workDir, "nemoclaw-sandbox-safety-net.js"), + tmpCiao: path.join(workDir, "nemoclaw-ciao-network-guard.js"), + proxyEnv: path.join(workDir, "nemoclaw-proxy-env.sh"), + gatewayLog: path.join(workDir, "gateway.log"), + hostileMarker: path.join(root, "hostile-proxy-env-sourced"), + }; + + fs.writeFileSync(paths.sourceSafetyNet, "module.exports = 'trusted safety net';\n"); + fs.writeFileSync(paths.sourceCiao, "module.exports = 'trusted ciao guard';\n"); + + return paths; +} + +function rewriteRuntimePaths(script: string, paths: ReturnType<typeof makeHarness>): string { + return script + 
.replaceAll(SAFETY_NET_GUARD.tmpPath, paths.tmpSafetyNet) + .replaceAll(SAFETY_NET_GUARD.sourcePath, paths.sourceSafetyNet) + .replaceAll(CIAO_GUARD.tmpPath, paths.tmpCiao) + .replaceAll(CIAO_GUARD.sourcePath, paths.sourceCiao) + .replaceAll("/tmp/nemoclaw-proxy-env.sh", paths.proxyEnv); +} + +function runGuardRecovery(opts: { + proxyEnvContent?: string; + beforeScript?: (paths: ReturnType<typeof makeHarness>) => void; + fakeRoot?: boolean; + shell?: "bash" | "sh"; +}) { + const paths = makeHarness(); + if (opts.fakeRoot) { + writeStub( + paths.stubsDir, + "id", + '[ "$1" = "-u" ] && { printf "0\\n"; exit 0; }\n/usr/bin/id "$@"', + ); + writeStub(paths.stubsDir, "chown", "exit 0"); + writeStub( + paths.stubsDir, + "stat", + '[ "$1" = "-c" ] && [ "$2" = "%u" ] && { printf "0\\n"; exit 0; }\n/usr/bin/stat "$@"', + ); + } + opts.beforeScript?.(paths); + + const proxyEnvContent = opts.proxyEnvContent + ? rewriteRuntimePaths(opts.proxyEnvContent, paths) + : undefined; + const writeProxyEnv = proxyEnvContent + ? [ + `cat > ${JSON.stringify(paths.proxyEnv)} <<'PROXYENV'`, + proxyEnvContent, + "PROXYENV", + `chmod 444 ${JSON.stringify(paths.proxyEnv)}`, + ] + : []; + + const recoveryLines = rewriteRuntimePaths(buildGatewayGuardRecoveryLines().join("\n"), paths); + const script = [ + "set -u", + `export _GATEWAY_LOG=${JSON.stringify(paths.gatewayLog)}`, + ': > "$_GATEWAY_LOG"', + ...writeProxyEnv, + recoveryLines, + 'if [ "$_GUARDS_MISSING" = "1" ]; then echo GUARDS_MISSING; exit 17; fi', + 'printf "PE_MISSING=%s\\n" "$_PE_MISSING"', + 'printf "NODE_OPTIONS=%s\\n" "$NODE_OPTIONS"', + ].join("\n"); + try { + const result = spawnSync(opts.shell ?? "sh", ["-c", script], { + encoding: "utf-8", + timeout: 10000, + env: { + PATH: `${paths.stubsDir}:${process.env.PATH ?? 
"/usr/bin:/bin"}`, + HOME: paths.root, + }, + }); + const readIfExists = (pathname: string) => { + try { + return fs.readFileSync(pathname, "utf-8"); + } catch (error) { + const code = (error as NodeJS.ErrnoException).code; + if (code === "ENOENT" || code === "EISDIR") return null; + throw error; + } + }; + const modeIfExists = (pathname: string) => { + try { + return mode(pathname); + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "ENOENT") return null; + throw error; + } + }; + const symlinkIfExists = (pathname: string) => { + try { + return fs.lstatSync(pathname).isSymbolicLink(); + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "ENOENT") return null; + throw error; + } + }; + return { + ...result, + paths, + files: { + gatewayLog: readIfExists(paths.gatewayLog) ?? "", + proxyEnv: readIfExists(paths.proxyEnv), + tmpSafetyNet: readIfExists(paths.tmpSafetyNet), + tmpCiao: readIfExists(paths.tmpCiao), + tmpSafetyNetMode: modeIfExists(paths.tmpSafetyNet), + tmpCiaoMode: modeIfExists(paths.tmpCiao), + proxyEnvMode: modeIfExists(paths.proxyEnv), + tmpSafetyNetIsSymlink: symlinkIfExists(paths.tmpSafetyNet), + proxyEnvIsSymlink: symlinkIfExists(paths.proxyEnv), + hostileProxyEnvSourced: fs.existsSync(paths.hostileMarker), + }, + }; + } finally { + fs.rmSync(paths.root, { recursive: true, force: true }); + } +} + +function mode(pathname: string): number { + return fs.statSync(pathname).mode & 0o777; +} + +describe("gateway recovery preload repair", () => { + it("runs the generated recovery helper through /bin/sh -c", () => { + const result = runGuardRecovery({ fakeRoot: true, shell: "sh" }); + expect(result.status).toBe(0); + expect(result.stdout).toContain("PE_MISSING=0"); + expect(result.stderr).not.toContain("Bad substitution"); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpCiao}`); + }); + + it("restores missing 
proxy-env.sh from trusted packaged preloads", () => { + const result = runGuardRecovery({}); + expect(result.status).toBe(0); + expect(result.stdout).toContain("PE_MISSING=0"); + expect(result.stdout).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.stdout).toContain(`--require ${result.paths.tmpCiao}`); + expect(result.files.tmpSafetyNet).toContain("trusted safety net"); + expect(result.files.tmpCiao).toContain("trusted ciao guard"); + expect(result.files.tmpSafetyNetMode).toBe(0o444); + expect(result.files.tmpCiaoMode).toBe(0o444); + expect(result.files.proxyEnvMode).toBe(0o444); + const proxyEnv = result.files.proxyEnv ?? ""; + expect(proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(proxyEnv).toContain(`--require ${result.paths.tmpCiao}`); + expect(result.files.gatewayLog).toContain("[gateway-recovery] WARNING"); + expect(result.files.gatewayLog).toContain("restoring library guards"); + }); + + it("does not accept substring matches as installed preload guards", () => { + const result = runGuardRecovery({ + proxyEnvContent: `export NODE_OPTIONS="--require /tmp/not-nemoclaw-sandbox-safety-net.js --require /tmp/not-nemoclaw-ciao-network-guard.js"\n`, + }); + expect(result.status).toBe(0); + expect(result.stdout).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.stdout).toContain(`--require ${result.paths.tmpCiao}`); + }); + + it("does not duplicate exact --require entries", () => { + const result = runGuardRecovery({ + proxyEnvContent: `export NODE_OPTIONS="--require ${SAFETY_NET_GUARD.tmpPath} --require=${CIAO_GUARD.tmpPath}"\n`, + }); + expect(result.status).toBe(0); + const nodeOptions = result.stdout.match(/^NODE_OPTIONS=(.*)$/m)?.[1] ?? 
""; + expect(nodeOptions.match(new RegExp(result.paths.tmpSafetyNet, "g"))?.length).toBe(1); + expect(nodeOptions.match(new RegExp(result.paths.tmpCiao, "g"))?.length).toBe(1); + }); + + it("persists repairs for a metadata-safe proxy-env.sh that is missing one guard", () => { + const result = runGuardRecovery({ + proxyEnvContent: `export NODE_OPTIONS="--require ${SAFETY_NET_GUARD.tmpPath}"\n`, + }); + expect(result.status).toBe(0); + expect(result.stdout).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.stdout).toContain(`--require ${result.paths.tmpCiao}`); + expect(result.files.proxyEnvMode).toBe(0o444); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpCiao}`); + expect(result.files.gatewayLog).toContain("proxy-env.sh incomplete"); + }); + + it("rebuilds an unsafe proxy-env.sh without sourcing attacker-controlled content", () => { + const result = runGuardRecovery({ + beforeScript(paths) { + fs.writeFileSync( + paths.proxyEnv, + [ + `touch ${JSON.stringify(paths.hostileMarker)}`, + "export NODE_OPTIONS='--require /tmp/attacker.js'", + "", + ].join("\n"), + ); + fs.chmodSync(paths.proxyEnv, 0o666); + }, + }); + expect(result.status).toBe(0); + expect(result.stdout).toContain("PE_MISSING=0"); + expect(result.files.hostileProxyEnvSourced).toBe(false); + expect(result.files.proxyEnvMode).toBe(0o444); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.files.proxyEnv).not.toContain("/tmp/attacker.js"); + expect(result.files.gatewayLog).toContain("unsafe mode"); + expect(result.files.gatewayLog).toContain("rebuilding from packaged preloads"); + }); + + it("replaces a symlinked proxy-env.sh instead of sourcing it", () => { + const result = runGuardRecovery({ + beforeScript(paths) { + const target = path.join(paths.root, "attacker-proxy-env.sh"); + fs.writeFileSync(target, `touch 
${JSON.stringify(paths.hostileMarker)}\n`); + fs.symlinkSync(target, paths.proxyEnv); + }, + }); + expect(result.status).toBe(0); + expect(result.files.hostileProxyEnvSourced).toBe(false); + expect(result.files.proxyEnvIsSymlink).toBe(false); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.files.gatewayLog).toContain("is a symlink"); + }); + + it("fails closed when proxy-env.sh is a directory", () => { + const result = runGuardRecovery({ + beforeScript(paths) { + fs.mkdirSync(paths.proxyEnv); + }, + }); + expect(result.status).toBe(17); + expect(result.stdout).toContain("GUARDS_MISSING"); + expect(result.files.gatewayLog).toContain("is a directory"); + expect(result.files.gatewayLog).toContain("refusing recovered proxy-env install"); + }); + + it("replaces a symlinked tmp preload with a trusted staged file", () => { + const result = runGuardRecovery({ + beforeScript(paths) { + const target = path.join(paths.root, "attacker-controlled.js"); + fs.writeFileSync(target, "module.exports = 'wrong';\n"); + fs.symlinkSync(target, paths.tmpSafetyNet); + }, + }); + expect(result.status).toBe(0); + expect(result.files.tmpSafetyNetIsSymlink).toBe(false); + expect(result.files.tmpSafetyNet).toContain("trusted safety net"); + }); + + it("refuses recovery when a trusted packaged preload source is unavailable", () => { + const result = runGuardRecovery({ + beforeScript(paths) { + fs.rmSync(paths.sourceCiao); + }, + }); + expect(result.status).toBe(17); + expect(result.stdout).toContain("GUARDS_MISSING"); + expect(result.files.gatewayLog).toContain("trusted preload source"); + expect(result.files.gatewayLog).toContain("refusing preload install"); + }); + + it("refuses recovery when a trusted packaged preload source is group writable in root mode", () => { + const result = runGuardRecovery({ + fakeRoot: true, + beforeScript(paths) { + fs.chmodSync(paths.sourceCiao, 0o664); + }, + }); + expect(result.status).toBe(17); + 
expect(result.stdout).toContain("GUARDS_MISSING"); + expect(result.files.gatewayLog).toContain("trusted preload source"); + expect(result.files.gatewayLog).toContain("unsafe mode=664"); + }); + + it("wires the repair helper into both recovery script builders", () => { + const genericScript = buildRecoveryScript(minimalAgent, 19000); + const openClawScript = buildOpenClawRecoveryScript(18789); + for (const script of [genericScript, openClawScript]) { + expect(script).toContain("restoring library guards from packaged preloads"); + expect(script).toContain("/usr/local/lib/nemoclaw/preloads/sandbox-safety-net.js"); + expect(script).toContain("/usr/local/lib/nemoclaw/preloads/ciao-network-guard.js"); + expect(script).not.toContain("gateway launching without library guards"); + } + }); + + it("validates proxy-env.sh before sourcing it in gateway recovery scripts", () => { + for (const script of [ + buildRecoveryScript(minimalAgent, 19000), + buildOpenClawRecoveryScript(18789), + ]) { + expect(script).not.toContain("[ -r /tmp/nemoclaw-proxy-env.sh ]; then ."); + const validateIdx = script!.indexOf( + "_nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh", + ); + const sourceIdx = script!.indexOf("then . /tmp/nemoclaw-proxy-env.sh"); + expect(validateIdx).toBeGreaterThanOrEqual(0); + expect(sourceIdx).toBeGreaterThanOrEqual(0); + expect(validateIdx).toBeLessThan(sourceIdx); + } + }); + + it("validates proxy-env.sh before shell init hooks can source it", () => { + for (const script of [ + buildRecoveryScript(minimalAgent, 19000), + buildOpenClawRecoveryScript(18789), + ]) { + expect(script).not.toBeNull(); + const validateIdx = script!.indexOf( + "_nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh", + ); + const bashrcIdx = script!.indexOf("[ -f ~/.bashrc ] && . 
~/.bashrc;"); + const healthIdx = script!.indexOf("_GW_CODE="); + expect(validateIdx).toBeGreaterThanOrEqual(0); + expect(bashrcIdx).toBeGreaterThanOrEqual(0); + expect(healthIdx).toBeGreaterThanOrEqual(0); + expect(validateIdx).toBeLessThan(bashrcIdx); + expect(bashrcIdx).toBeLessThan(healthIdx); + } + }); + + it("repairs guard files before health probing and the ALREADY_RUNNING fast path", () => { + for (const script of [ + buildRecoveryScript(minimalAgent, 19000), + buildOpenClawRecoveryScript(18789), + ]) { + expect(script).not.toBeNull(); + const safetyNetStageIdx = script!.indexOf( + `_nemoclaw_stage_recovery_preload ${SAFETY_NET_GUARD.tmpPath} ${SAFETY_NET_GUARD.sourcePath}`, + ); + const ciaoStageIdx = script!.indexOf( + `_nemoclaw_stage_recovery_preload ${CIAO_GUARD.tmpPath} ${CIAO_GUARD.sourcePath}`, + ); + const healthIdx = script!.indexOf("_GW_CODE="); + const alreadyRunningIdx = script!.indexOf("echo ALREADY_RUNNING; exit 0"); + expect(safetyNetStageIdx).toBeGreaterThanOrEqual(0); + expect(ciaoStageIdx).toBeGreaterThanOrEqual(0); + expect(healthIdx).toBeGreaterThanOrEqual(0); + expect(alreadyRunningIdx).toBeGreaterThanOrEqual(0); + expect(safetyNetStageIdx).toBeLessThan(healthIdx); + expect(ciaoStageIdx).toBeLessThan(healthIdx); + expect(healthIdx).toBeLessThan(alreadyRunningIdx); + } + }); +}); diff --git a/src/lib/agent/runtime-recovery-preload.ts b/src/lib/agent/runtime-recovery-preload.ts new file mode 100644 index 0000000000..068250c081 --- /dev/null +++ b/src/lib/agent/runtime-recovery-preload.ts @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Gateway recovery preload repair logic. The generated shell restores the two +// critical Node preload guards from the packaged image copies before recovery +// relaunches a gateway. 
+ +export const GATEWAY_PRELOAD_GUARDS: ReadonlyArray<{ + tmpPath: string; + sourcePath: string; +}> = [ + { + tmpPath: "/tmp/nemoclaw-sandbox-safety-net.js", + sourcePath: "/usr/local/lib/nemoclaw/preloads/sandbox-safety-net.js", + }, + { + tmpPath: "/tmp/nemoclaw-ciao-network-guard.js", + sourcePath: "/usr/local/lib/nemoclaw/preloads/ciao-network-guard.js", + }, +]; + +/** + * Build shell lines that restore and validate the required recovery preloads. + * + * The recovery script must not source /tmp/nemoclaw-proxy-env.sh before these + * lines. This helper validates the env file first, rebuilds it when it is + * missing or unsafe, then sources only the trusted result. The recovered env is + * intentionally minimal: it restores the critical Node preload guard chain that + * recovery itself requires. The normal sandbox entrypoint remains the source of + * truth for the full proxy/tool/token environment and refreshes that file on a + * regular sandbox restart. + */ +export function buildGatewayGuardRecoveryLines(): string[] { + const recoveredProxyEnvExports = GATEWAY_PRELOAD_GUARDS.map( + ({ tmpPath }) => + `printf '%s\\n' 'export NODE_OPTIONS="\${NODE_OPTIONS:+$NODE_OPTIONS }--require ${tmpPath}"';`, + ); + const stageCalls = GATEWAY_PRELOAD_GUARDS.map( + ({ tmpPath, sourcePath }) => + `_nemoclaw_stage_recovery_preload ${tmpPath} ${sourcePath} || _NEMOCLAW_CRITICAL_GUARDS_READY=0;`, + ); + const appendCalls = GATEWAY_PRELOAD_GUARDS.map( + ({ tmpPath }) => + `if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ]; then _nemoclaw_append_node_require ${tmpPath}; fi;`, + ); + const proxyEnvRewriteChecks = GATEWAY_PRELOAD_GUARDS.map( + ({ tmpPath }) => + `_nemoclaw_node_options_has_require ${tmpPath} || _PROXY_ENV_REWRITE_NEEDED=1;`, + ); + const guardChecks = GATEWAY_PRELOAD_GUARDS.map( + ({ tmpPath }) => `_nemoclaw_node_options_has_require ${tmpPath} || _GUARDS_MISSING=1;`, + ); + + const helpers = [ + "_nemoclaw_recovery_log() {", + 'local _msg="$1";', + 'echo "$_msg" 
>&2;', + 'if [ -n "${_GATEWAY_LOG:-}" ]; then echo "$_msg" >> "$_GATEWAY_LOG" 2>/dev/null || true; fi;', + "};", + "_nemoclaw_node_options_has_require() {", + 'local wanted="$1"; local token prev;', + "prev=;", + "for token in ${NODE_OPTIONS:-}; do", + 'if [ "$prev" = "--require" ] && [ "$token" = "$wanted" ]; then return 0; fi;', + 'if [ "$token" = "--require=$wanted" ]; then return 0; fi;', + 'prev="$token";', + "done;", + "return 1;", + "};", + "_nemoclaw_mode_group_or_other_writable() {", + 'local perms="$1";', + 'case "$perms" in ""|*[!0-7]*) return 0 ;; esac;', + 'while [ "${#perms}" -lt 3 ]; do perms="0$perms"; done;', + 'case "$perms" in *[2367]?|*[2367]) return 0 ;; *) return 1 ;; esac;', + "};", + "_nemoclaw_append_node_require() {", + 'local wanted="$1";', + 'if ! _nemoclaw_node_options_has_require "$wanted"; then export NODE_OPTIONS="${NODE_OPTIONS:+$NODE_OPTIONS }--require $wanted"; fi;', + "};", + "_nemoclaw_validate_recovery_preload() {", + 'local file="$1"; local perms owner _msg;', + 'if [ -L "$file" ]; then _msg="[gateway-recovery] ERROR: $file is a symlink - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ ! 
-f "$file" ]; then _msg="[gateway-recovery] ERROR: $file is not a regular file - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'perms="$(stat -c %a "$file" 2>/dev/null || stat -f %Lp "$file" 2>/dev/null || echo unknown)";', + 'if [ "$perms" != "444" ]; then _msg="[gateway-recovery] ERROR: $file has unsafe mode=$perms (expected 444) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ "$(id -u)" -eq 0 ]; then', + 'owner="$(stat -c %u "$file" 2>/dev/null || stat -f %u "$file" 2>/dev/null || echo unknown)";', + 'if [ "$owner" != "0" ]; then _msg="[gateway-recovery] ERROR: $file owner_uid=$owner (expected 0) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + "fi;", + "return 0;", + "};", + "_nemoclaw_validate_trusted_preload_source() {", + 'local src="$1"; local perms owner _msg;', + 'if [ ! -r "$src" ] || [ -L "$src" ] || [ ! -f "$src" ]; then _msg="[gateway-recovery] ERROR: trusted preload source $src unavailable - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'perms="$(stat -c %a "$src" 2>/dev/null || stat -f %Lp "$src" 2>/dev/null || echo unknown)";', + 'if [ "$(id -u)" -eq 0 ]; then', + 'owner="$(stat -c %u "$src" 2>/dev/null || stat -f %u "$src" 2>/dev/null || echo unknown)";', + 'if [ "$owner" != "0" ]; then _msg="[gateway-recovery] ERROR: trusted preload source $src owner_uid=$owner (expected 0) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if _nemoclaw_mode_group_or_other_writable "$perms"; then _msg="[gateway-recovery] ERROR: trusted preload source $src has unsafe mode=$perms (group/other writable) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + "fi;", + "return 0;", + "};", + "_nemoclaw_validate_recovery_proxy_env() {", + 'local env_file="$1"; local perms owner _msg;', + 'if [ -L "$env_file" ]; then _msg="[gateway-recovery] WARNING: $env_file is a symlink - rebuilding from 
packaged preloads"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ ! -f "$env_file" ]; then return 1; fi;', + 'perms="$(stat -c %a "$env_file" 2>/dev/null || stat -f %Lp "$env_file" 2>/dev/null || echo unknown)";', + 'if [ "$perms" != "444" ]; then _msg="[gateway-recovery] WARNING: $env_file has unsafe mode=$perms (expected 444) - rebuilding from packaged preloads"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ "$(id -u)" -eq 0 ]; then', + 'owner="$(stat -c %u "$env_file" 2>/dev/null || stat -f %u "$env_file" 2>/dev/null || echo unknown)";', + 'if [ "$owner" != "0" ]; then _msg="[gateway-recovery] WARNING: $env_file owner_uid=$owner (expected 0) - rebuilding from packaged preloads"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + "fi;", + "return 0;", + "};", + "_nemoclaw_stage_recovery_preload() {", + 'local tmp="$1"; local src="$2"; local dir base stage _msg;', + '_nemoclaw_validate_trusted_preload_source "$src" || return 1;', + 'dir="$(dirname "$tmp")"; base="$(basename "$tmp")";', + 'stage="$(mktemp "${dir}/.${base}.tmp.XXXXXX")" || { _msg="[gateway-recovery] ERROR: failed to stage $tmp"; _nemoclaw_recovery_log "$_msg"; return 1; };', + 'if ! cp "$src" "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to copy $src into recovery stage"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ "$(id -u)" -eq 0 ] && ! chown root:root "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chown recovery stage for $tmp"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if ! chmod 444 "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chmod recovery stage for $tmp"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if ! 
mv -f "$stage" "$tmp"; then rm -f "$stage"; if _nemoclaw_validate_recovery_preload "$tmp"; then return 0; fi; _msg="[gateway-recovery] ERROR: failed to install recovery preload $tmp"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + '_nemoclaw_validate_recovery_preload "$tmp";', + "};", + "_nemoclaw_write_recovered_proxy_env() {", + "local env_file=/tmp/nemoclaw-proxy-env.sh; local stage _msg;", + 'stage="$(mktemp /tmp/.nemoclaw-proxy-env.sh.tmp.XXXXXX)" || { _msg="[gateway-recovery] ERROR: failed to stage recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; };', + "{", + "printf '%s\\n' '# Recovered by NemoClaw gateway recovery; sandbox restart refreshes the full proxy env.';", + ...recoveredProxyEnvExports, + '} > "$stage" || { rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to write recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; };', + 'if [ "$(id -u)" -eq 0 ] && ! chown root:root "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chown recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if ! chmod 444 "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chmod recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ -d "$env_file" ]; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: $env_file is a directory - refusing recovered proxy-env install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if ! mv -f "$stage" "$env_file"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to install recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + '_nemoclaw_validate_recovery_proxy_env "$env_file";', + "};", + ].join(" "); + + return [ + helpers, + "if _nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh; then . 
/tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", + "_NEMOCLAW_CRITICAL_GUARDS_READY=1;", + ...stageCalls, + 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ] && [ "${_PE_MISSING:-0}" = "1" ]; then', + '_W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - restoring library guards from packaged preloads (#2478/#2701)"; _nemoclaw_recovery_log "$_W";', + "_nemoclaw_write_recovered_proxy_env || _NEMOCLAW_CRITICAL_GUARDS_READY=0;", + 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; fi;', + "fi;", + "_PROXY_ENV_REWRITE_NEEDED=0;", + ...proxyEnvRewriteChecks, + 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ] && [ "${_PE_MISSING:-0}" = "0" ] && [ "${_PROXY_ENV_REWRITE_NEEDED:-0}" = "1" ]; then', + '_W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh incomplete - persisting library guards from packaged preloads (#2478/#2701)"; _nemoclaw_recovery_log "$_W";', + "_nemoclaw_write_recovered_proxy_env || _NEMOCLAW_CRITICAL_GUARDS_READY=0;", + "fi;", + ...appendCalls, + "_GUARDS_MISSING=0;", + 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" != "1" ]; then _GUARDS_MISSING=1; fi;', + ...guardChecks, + ]; +} diff --git a/src/lib/agent/runtime.test.ts b/src/lib/agent/runtime.test.ts index 839afad4f1..5f46ed7521 100644 --- a/src/lib/agent/runtime.test.ts +++ b/src/lib/agent/runtime.test.ts @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -import { describe, it, expect } from "vitest"; +import { describe, expect, it } from "vitest"; // Import from compiled dist/ so coverage is attributed correctly. import { buildHermesDashboardProcessRecoveryScript, @@ -195,22 +195,23 @@ describe("buildRecoveryScript", () => { // Regression coverage for #2478. 
The recovery script must explicitly source // /tmp/nemoclaw-proxy-env.sh (single source of truth for NODE_OPTIONS - // library guards) and warn — not silently continue — when the file is - // missing or the safety-net preload is absent from NODE_OPTIONS. The pre-fix - // recovery path swallowed sourcing errors via `2>/dev/null`, leaving - // respawned gateways guard-less and crash-looping on the next library - // error from ciao, model-pricing, or anything else hitting a sandboxed - // syscall. + // library guards) and restore the critical safety-net/ciao preloads from + // trusted packaged sources when the file is missing. The pre-fix recovery + // path swallowed sourcing errors via `2>/dev/null`, leaving respawned + // gateways guard-less and crash-looping on the next library error from ciao, + // model-pricing, or anything else hitting a sandboxed syscall. describe("#2478 hardened library-guard preload chain", () => { it("explicitly sources the gateway env file", () => { const script = buildRecoveryScript(minimalAgent, 19000); expect(script).toContain(". 
/tmp/nemoclaw-proxy-env.sh"); }); - it("warns when the gateway env file is missing instead of silently launching", () => { + it("warns and restores guards when the gateway env file is missing", () => { const script = buildRecoveryScript(minimalAgent, 19000); expect(script).toContain("/tmp/nemoclaw-proxy-env.sh missing"); + expect(script).toContain("restoring library guards from packaged preloads"); expect(script).toContain("#2478"); + expect(script).not.toContain("gateway launching without library guards"); }); it("does not silence sourcing errors with 2>/dev/null", () => { @@ -237,22 +238,9 @@ describe("buildRecoveryScript", () => { expect(script).toContain("GATEWAY_STALE_PROCESSES"); }); - it("sources proxy-env.sh BEFORE launching the gateway binary", () => { + it("fails recovery when trusted guard restoration cannot install required guards", () => { const script = buildRecoveryScript(minimalAgent, 19000); - expect(script).not.toBeNull(); - const staleStopIdx = script!.indexOf('pkill -TERM -f "$_GATEWAY_PROC_PATTERN"'); - const sourceIdx = script!.indexOf("then . 
/tmp/nemoclaw-proxy-env.sh"); - const launchIdx = script!.indexOf("nohup"); - expect(staleStopIdx).toBeGreaterThanOrEqual(0); - expect(sourceIdx).toBeGreaterThanOrEqual(0); - expect(launchIdx).toBeGreaterThanOrEqual(0); - expect(staleStopIdx).toBeLessThan(sourceIdx); - expect(sourceIdx).toBeLessThan(launchIdx); - }); - - it("fails recovery when an existing proxy-env.sh does not install required guards", () => { - const script = buildRecoveryScript(minimalAgent, 19000); - expect(script).toContain('if [ "$_PE_MISSING" = "0" ]'); + expect(script).toContain("_NEMOCLAW_CRITICAL_GUARDS_READY"); expect(script).toContain("refusing unguarded gateway relaunch"); expect(script).toContain('echo "$_E" >> "$_GATEWAY_LOG"; exit 1'); }); @@ -263,13 +251,14 @@ describe("buildRecoveryScript", () => { // executeSandboxCommand silently discards stderr from the recovery // script, so a warning that only goes to stderr is invisible to // anyone debugging a crash-loop. (#2478) - expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"'); + expect(script).toContain('_nemoclaw_recovery_log "$_W"'); + expect(script).toContain('echo "$_msg" >> "$_GATEWAY_LOG"'); // And the warning must be deferred until AFTER gateway.log is // safely opened with O_NOFOLLOW, otherwise the redirect targets a // stale or attacker-controlled file. 
const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log || exit 1;"); const logSelectionIdx = script!.indexOf("_GATEWAY_LOG=/tmp/gateway.log"); - const warnIdx = script!.indexOf('echo "$_W" >> "$_GATEWAY_LOG"'); + const warnIdx = script!.indexOf("_W="); expect(gatewayPrepIdx).toBeGreaterThanOrEqual(0); expect(logSelectionIdx).toBeGreaterThanOrEqual(0); expect(warnIdx).toBeGreaterThanOrEqual(0); @@ -297,7 +286,7 @@ describe("buildRecoveryScript", () => { expect(script).not.toContain("rm -f /tmp/gateway.log"); expect(script).toContain("_GATEWAY_LOG=/tmp/gateway.log"); expect(script).toContain("_GATEWAY_LOG=/tmp/gateway-recovery.log"); - expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"'); + expect(script).toContain('_nemoclaw_recovery_log "$_W"'); expect(script).toContain('tail -5 "$_GATEWAY_LOG"'); expect(script).not.toContain('echo "$_W" >> /tmp/gateway.log'); expect(script).not.toContain("cat /tmp/gateway.log"); diff --git a/src/lib/agent/runtime.ts b/src/lib/agent/runtime.ts index ef79716e78..bd8e515efc 100644 --- a/src/lib/agent/runtime.ts +++ b/src/lib/agent/runtime.ts @@ -11,11 +11,12 @@ import { DASHBOARD_PORT } from "../core/ports"; import { shellQuote } from "../runner"; import * as onboardSession from "../state/onboard-session"; import * as registry from "../state/registry"; -import { loadAgent, type AgentDefinition } from "./defs"; +import { type AgentDefinition, loadAgent } from "./defs"; import { buildHermesEnvFileBoundaryGuard, buildHermesRuntimeEnvBoundaryGuard, } from "./hermes-recovery-boundary"; +import { buildGatewayGuardRecoveryLines } from "./runtime-recovery-preload"; /** * Resolve the agent for a sandbox. Checks the per-sandbox registry first @@ -193,17 +194,15 @@ export function buildHermesDashboardProcessRecoveryScript( export function buildOpenClawRecoveryScript(port: number): string { const staleGatewayPattern = "[o]penclaw([ -]gateway| gateway run|$)"; return [ - "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . 
/tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", + ...buildGatewayLogSetup(true, "gateway"), + buildGatewayLogSelection(), + ...buildGatewayGuardRecoveryLines(), + '[ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: NODE_OPTIONS missing safety-net preload or ciao preload after trusted recovery - refusing unguarded gateway relaunch (#2478/#2701)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', "[ -f ~/.bashrc ] && . ~/.bashrc;", - 'if [ "$_PE_MISSING" = "0" ]; then case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _SN_MISSING=0 ;; *) _SN_MISSING=1 ;; esac; case "${NODE_OPTIONS:-}" in *nemoclaw-ciao-network-guard*) _CIAO_MISSING=0 ;; *) _CIAO_MISSING=1 ;; esac; if [ "$_SN_MISSING" = "0" ] && [ "$_CIAO_MISSING" = "0" ]; then _GUARDS_MISSING=0; else _GUARDS_MISSING=1; fi; else _GUARDS_MISSING=0; fi;', `_GW_CODE=$(curl -so /dev/null -w '%{http_code}' --max-time 3 http://127.0.0.1:${port}/health 2>/dev/null || echo 000); case "$_GW_CODE" in 200|401) echo ALREADY_RUNNING; exit 0 ;; esac;`, "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;", - ...buildGatewayLogSetup(true, "gateway"), - buildGatewayLogSelection(), `_GATEWAY_PROC_PATTERN=${shellQuote(staleGatewayPattern)};`, 'if [ -n "$_GATEWAY_PROC_PATTERN" ]; then pkill -TERM -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; pkill -KILL -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; if pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1; then echo GATEWAY_STALE_PROCESSES; exit 1; fi; fi;', - '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', - '[ "$_PE_MISSING" = "0" ] && [ "$_GUARDS_MISSING" = "1" ] && { 
_E="[gateway-recovery] ERROR: /tmp/nemoclaw-proxy-env.sh present but NODE_OPTIONS missing safety-net preload or ciao preload - refusing unguarded gateway relaunch (#2478)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', 'OPENCLAW="$(command -v openclaw)";', 'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;', gatewayLaunchCommand('"$OPENCLAW" gateway run --port ' + port, "gateway"), @@ -257,27 +256,22 @@ export function buildRecoveryScript( `${hermesLaunchEnv}${configuredGatewayCommand}${isHermes ? "" : ` --port ${port}`}`, ); - // Source /tmp/nemoclaw-proxy-env.sh immediately before launching. That file - // is the single source of truth for NODE_OPTIONS preload guards (safety-net, - // ciao networkInterfaces, slack, http-proxy, nemotron). Recovery - // also stops stale launcher/gateway processes that may have respawned - // between the health probe and relaunch. A missing env file remains warning- - // only; a present env file that does not install required guards is a hard - // failure because launching would create an unguarded gateway. + // Validate or rebuild /tmp/nemoclaw-proxy-env.sh before shell init and the + // health fast path so a healthy gateway cannot leave a wiped guard chain + // unrepaired. Recovery also stops stale launcher/gateway processes that may + // have respawned between the health probe and relaunch. return [ - "[ -f ~/.bashrc ] && . ~/.bashrc;", hermesHome, ...(isHermes ? 
[buildHermesEnvFileBoundaryGuard()] : []), - `_GW_CODE=$(curl -so /dev/null -w '%{http_code}' --max-time 3 ${shellQuote(probeUrl)} 2>/dev/null || echo 000); case "$_GW_CODE" in 200|401) echo ALREADY_RUNNING; exit 0 ;; esac;`, ...buildGatewayLogSetup(false), buildGatewayLogSelection(), + ...buildGatewayGuardRecoveryLines(), + '[ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: NODE_OPTIONS missing safety-net preload or ciao preload after trusted recovery - refusing unguarded gateway relaunch (#2478/#2701)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', + "[ -f ~/.bashrc ] && . ~/.bashrc;", + `_GW_CODE=$(curl -so /dev/null -w '%{http_code}' --max-time 3 ${shellQuote(probeUrl)} 2>/dev/null || echo 000); case "$_GW_CODE" in 200|401) echo ALREADY_RUNNING; exit 0 ;; esac;`, `_GATEWAY_PROC_PATTERN=${shellQuote(staleGatewayPattern)};`, 'if [ -n "$_GATEWAY_PROC_PATTERN" ]; then pkill -TERM -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; pkill -KILL -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; if pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1; then echo GATEWAY_STALE_PROCESSES; exit 1; fi; fi;', ...validationSteps, - "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . 
/tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", - 'if [ "$_PE_MISSING" = "0" ]; then case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _SN_MISSING=0 ;; *) _SN_MISSING=1 ;; esac; case "${NODE_OPTIONS:-}" in *nemoclaw-ciao-network-guard*) _CIAO_MISSING=0 ;; *) _CIAO_MISSING=1 ;; esac; if [ "$_SN_MISSING" = "0" ] && [ "$_CIAO_MISSING" = "0" ]; then _GUARDS_MISSING=0; else _GUARDS_MISSING=1; fi; else _GUARDS_MISSING=0; fi;', - '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', - '[ "$_PE_MISSING" = "0" ] && [ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: /tmp/nemoclaw-proxy-env.sh present but NODE_OPTIONS missing safety-net preload or ciao preload - refusing unguarded gateway relaunch (#2478)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', ...(isHermes ? [buildHermesRuntimeEnvBoundaryGuard()] : []), launchCommand, "GPID=$!; sleep 2;", diff --git a/test/e2e-scenario/live/gateway-guard-recovery.test.ts b/test/e2e-scenario/live/gateway-guard-recovery.test.ts index b8a56a83b1..3a26756765 100644 --- a/test/e2e-scenario/live/gateway-guard-recovery.test.ts +++ b/test/e2e-scenario/live/gateway-guard-recovery.test.ts @@ -13,14 +13,33 @@ * netns blocks the syscall). The only manual recovery is a 5-min * `nemoclaw rebuild --yes`. * - * This test asserts the desired contract — recovery RESTORES the guard - * chain before launching, no WARNING line, gateway PID stable. It will fail - * on `main` (proving the bug), pass once the fix lands. + * This test asserts the desired contract — recovery logs that it is restoring + * from trusted packaged preloads, RESTORES the guard chain before launching, + * and keeps the gateway PID stable. It will fail on `main` (proving the bug), + * pass once the fix lands. 
* * The contract is platform-independent: we don't need aarch64 to assert * "guards are present after recovery." The aarch64 ciao crash is a * downstream consequence of the same broken contract. * + * #2701 acceptance scope for this PR: + * - Covered: the default OpenClaw production recovery route + * (`nemoclaw connect --probe-only` → + * checkAndRecoverSandboxProcesses() → `openshell sandbox exec -- sh -c` + * buildOpenClawRecoveryScript()) after the pod-recreate-equivalent state + * of an empty guard-chain `/tmp` plus no running gateway process. This + * proves the user no longer needs `nemoclaw rebuild --yes` for + * that recovered runtime state. + * - Deliberately out of scope for this merge gate: physical DGX Spark / + * GB10 / aarch64 hardware, provider breadth beyond `cloud-openclaw`, and + * destructive host reboot / OOM / supervisor crash / manual + * `kubectl delete pod` triggers. The current live Vitest runner exposes a + * Docker-driver OpenShell sandbox and does not provide a stable per-test + * Kubernetes pod handle that can be deleted without destabilizing shared + * gateway state. Those trigger/hardware/provider clauses need a dedicated + * platform-runtime job; this test locks down the shared recovery contract + * they all depend on. 
+ * * The corresponding legacy bash phase remains in * `test/e2e/test-issue-2478-crash-loop-recovery.sh` Phase 4 with both the * #2478 WARNING assertion (current contract) and the new #2701 guard-chain @@ -60,6 +79,19 @@ test("gateway recovery restores /tmp guard chain after pod-recreate wipe (#2701) runner: "vitest", boundary: "sandbox-lifecycle", issues: ["#2701", "#2478"], + acceptanceCoverage: { + covered: [ + "production connect --probe-only recovery route", + "openshell sandbox exec -- sh -c OpenClaw recovery script", + "pod-recreate-equivalent empty /tmp guard chain plus missing gateway process", + "no rebuild required for the recovered runtime state", + ], + intentionallyOutOfScope: [ + "DGX Spark / GB10 / aarch64 hardware matrix", + "provider breadth beyond cloud-openclaw", + "host reboot / OOM / supervisor crash / manual kubectl delete pod triggers", + ], + }, }); // ── Setup ──────────────────────────────────────────────────────── @@ -72,8 +104,10 @@ test("gateway recovery restores /tmp guard chain after pod-recreate wipe (#2701) await gateway.expectGuardChainActive(instance); // ── Disrupt ────────────────────────────────────────────────────── - // Same shape as a fresh container after pod recreate: /tmp is empty of - // the guard chain, and the openclaw process tree is gone. + // Deterministic pod-recreate-equivalent state: /tmp is empty of the guard + // chain, and the OpenClaw process tree is gone. This avoids coupling the + // merge gate to a host-specific pod/container delete primitive while still + // exercising the production sandbox-exec recovery route below. await sandbox.wipeGuardChain(instance.sandboxName); await sandbox.killGatewayTree(instance.sandboxName); @@ -101,15 +135,16 @@ test("gateway recovery restores /tmp guard chain after pod-recreate wipe (#2701) }); // ── Assert #2701 contract ──────────────────────────────────────── - // After recovery completes, the guard chain MUST be restored. 
Today - // this fails: recovery emits a WARNING but launches the gateway - // naked, leaving /tmp/nemoclaw-proxy-env.sh absent. After the fix - // lands, recovery re-emits the chain before launching. + // After recovery completes, the guard chain MUST be restored. Before the + // fix, recovery emitted a WARNING but launched the gateway naked, leaving + // /tmp/nemoclaw-proxy-env.sh absent. After the fix lands, recovery re-emits + // the chain before launching. await gateway.expectGuardChainActive(instance); - // No WARNING line should appear in the gateway log — the fix turns - // the warn-and-proceed branch into a re-emit-and-continue branch. - await gateway.expectLogDoesNotContain(instance, /\[gateway-recovery\] WARNING/); + // A missing proxy-env file is still worth surfacing, but the warning must + // describe trusted restoration instead of an unguarded launch. + await gateway.expectLogContains(instance, /restoring library guards from packaged preloads/); + await gateway.expectLogDoesNotContain(instance, /gateway launching without library guards/); // Gateway must be steady-state — no crash loop. This assertion is // the "would have caught DGX Spark" check, even on x86 runners, diff --git a/test/helpers/runtime-recovery-preload-test-helpers.ts b/test/helpers/runtime-recovery-preload-test-helpers.ts new file mode 100644 index 0000000000..7bc932088c --- /dev/null +++ b/test/helpers/runtime-recovery-preload-test-helpers.ts @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import path from "node:path"; +import { GATEWAY_PRELOAD_GUARDS } from "../../dist/lib/agent/runtime-recovery-preload"; + +const [SAFETY_NET_GUARD, CIAO_GUARD] = GATEWAY_PRELOAD_GUARDS; + +export interface RecoveryPreloadHarnessPaths { + preloadSourceSafetyNet: string; + preloadSourceCiao: string; + preloadTmpSafetyNet: string; + preloadTmpCiao: string; +} + +export function createRecoveryPreloadHarnessPaths(root: string): RecoveryPreloadHarnessPaths { + const paths = { + preloadSourceSafetyNet: path.join(root, "preload-source-safety-net.js"), + preloadSourceCiao: path.join(root, "preload-source-ciao.js"), + preloadTmpSafetyNet: path.join(root, "preload-tmp-safety-net.js"), + preloadTmpCiao: path.join(root, "preload-tmp-ciao.js"), + }; + fs.writeFileSync(paths.preloadSourceSafetyNet, "module.exports = 'trusted safety net';\n"); + fs.writeFileSync(paths.preloadSourceCiao, "module.exports = 'trusted ciao guard';\n"); + return paths; +} + +export function rewriteRecoveryPreloadPaths( + script: string, + paths: RecoveryPreloadHarnessPaths, +): string { + return script + .replaceAll(SAFETY_NET_GUARD.tmpPath, paths.preloadTmpSafetyNet) + .replaceAll(SAFETY_NET_GUARD.sourcePath, paths.preloadSourceSafetyNet) + .replaceAll(CIAO_GUARD.tmpPath, paths.preloadTmpCiao) + .replaceAll(CIAO_GUARD.sourcePath, paths.preloadSourceCiao); +}