From 34d5354f70560b7d3463405cf45792be2aba9e78 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Thu, 11 Jun 2026 23:14:19 -0700 Subject: [PATCH 1/8] fix(agent): restore gateway guard chain during recovery Signed-off-by: Carlos Villela --- ...hermes-secret-boundary-behavioural.test.ts | 25 +++ .../agent/runtime-recovery-preload.test.ts | 192 ++++++++++++++++++ src/lib/agent/runtime-recovery-preload.ts | 119 +++++++++++ src/lib/agent/runtime.test.ts | 26 +-- src/lib/agent/runtime.ts | 25 +-- 5 files changed, 361 insertions(+), 26 deletions(-) create mode 100644 src/lib/agent/runtime-recovery-preload.test.ts create mode 100644 src/lib/agent/runtime-recovery-preload.ts diff --git a/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts b/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts index 7947b9a81b..ce61f5d2ba 100644 --- a/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts +++ b/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts @@ -17,9 +17,12 @@ import { HERMES_SECRET_BOUNDARY_VALIDATOR_PATH, __testing, } from "../../../dist/lib/agent/hermes-recovery-boundary"; +import { GATEWAY_PRELOAD_GUARDS } from "../../../dist/lib/agent/runtime-recovery-preload"; import { buildRecoveryScript } from "../../../dist/lib/agent/runtime"; import { hermesAgent } from "./hermes-recovery-boundary-fixtures"; +const [SAFETY_NET_GUARD, CIAO_GUARD] = GATEWAY_PRELOAD_GUARDS; + function writeStub(dir: string, name: string, body: string) { const stub = path.join(dir, name); fs.writeFileSync(stub, `#!/usr/bin/env bash\n${body}\n`, { mode: 0o755 }); @@ -184,7 +187,13 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = const hermesLaunchMarker = path.join(tmp, "hermes-launched"); const gatewayLogPath = path.join(tmp, "gateway.log"); const recoveryFallbackLog = path.join(tmp, "gateway-recovery-fallback.log"); + const preloadSourceSafetyNet = path.join(tmp, "preload-source-safety-net.js"); + const 
preloadSourceCiao = path.join(tmp, "preload-source-ciao.js"); + const preloadTmpSafetyNet = path.join(tmp, "preload-tmp-safety-net.js"); + const preloadTmpCiao = path.join(tmp, "preload-tmp-ciao.js"); fs.mkdirSync(stubsDir, { recursive: true }); + fs.writeFileSync(preloadSourceSafetyNet, "module.exports = 'trusted safety net';\n"); + fs.writeFileSync(preloadSourceCiao, "module.exports = 'trusted ciao guard';\n"); return { tmp, stubsDir, @@ -193,6 +202,10 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = hermesLaunchMarker, gatewayLogPath, recoveryFallbackLog, + preloadSourceSafetyNet, + preloadSourceCiao, + preloadTmpSafetyNet, + preloadTmpCiao, }; } @@ -212,6 +225,10 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = recoveryLogPath: string; gatewayLogPath: string; recoveryFallbackLog: string; + preloadSourceSafetyNet: string; + preloadSourceCiao: string; + preloadTmpSafetyNet: string; + preloadTmpCiao: string; tmp: string; }) { const recoveryScript = buildRecoveryScript(hermesAgent, 8642); @@ -220,6 +237,10 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = .replace(new RegExp(HERMES_SECRET_BOUNDARY_VALIDATOR_PATH, "g"), opts.validatorPath) .replace(/\/tmp\/gateway-recovery\.log/g, opts.recoveryLogPath) .replace(/\/tmp\/gateway\.log/g, opts.gatewayLogPath) + .replaceAll(SAFETY_NET_GUARD.tmpPath, opts.preloadTmpSafetyNet) + .replaceAll(SAFETY_NET_GUARD.sourcePath, opts.preloadSourceSafetyNet) + .replaceAll(CIAO_GUARD.tmpPath, opts.preloadTmpCiao) + .replaceAll(CIAO_GUARD.sourcePath, opts.preloadSourceCiao) .replace( /_GATEWAY_LOG=\/tmp\/gateway-recovery\.log/g, `_GATEWAY_LOG=${opts.recoveryFallbackLog}`, @@ -402,6 +423,10 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = .replace(/\/tmp\/gateway-recovery\.log/g, harness.recoveryLogPath) .replace(/\/tmp\/nemoclaw-proxy-env\.sh/g, proxyEnvFile) .replace(/\/tmp\/gateway\.log/g, 
harness.gatewayLogPath) + .replaceAll(SAFETY_NET_GUARD.tmpPath, harness.preloadTmpSafetyNet) + .replaceAll(SAFETY_NET_GUARD.sourcePath, harness.preloadSourceSafetyNet) + .replaceAll(CIAO_GUARD.tmpPath, harness.preloadTmpCiao) + .replaceAll(CIAO_GUARD.sourcePath, harness.preloadSourceCiao) .replace( /_GATEWAY_LOG=\/tmp\/gateway-recovery\.log/g, `_GATEWAY_LOG=${harness.recoveryFallbackLog}`, diff --git a/src/lib/agent/runtime-recovery-preload.test.ts b/src/lib/agent/runtime-recovery-preload.test.ts new file mode 100644 index 0000000000..1c0bf72316 --- /dev/null +++ b/src/lib/agent/runtime-recovery-preload.test.ts @@ -0,0 +1,192 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { spawnSync } from "node:child_process"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { describe, expect, it } from "vitest"; +import { + GATEWAY_PRELOAD_GUARDS, + buildGatewayGuardRecoveryLines, +} from "../../../dist/lib/agent/runtime-recovery-preload"; +import { buildOpenClawRecoveryScript, buildRecoveryScript } from "../../../dist/lib/agent/runtime"; +import { minimalAgent } from "./hermes-recovery-boundary-fixtures"; + +const [SAFETY_NET_GUARD, CIAO_GUARD] = GATEWAY_PRELOAD_GUARDS; + +function makeHarness() { + const root = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-recovery-preload-")); + const sourceDir = path.join(root, "usr-local-lib-nemoclaw-preloads"); + const workDir = path.join(root, "tmp"); + fs.mkdirSync(sourceDir, { recursive: true }); + fs.mkdirSync(workDir, { recursive: true }); + + const paths = { + root, + sourceSafetyNet: path.join(sourceDir, "sandbox-safety-net.js"), + sourceCiao: path.join(sourceDir, "ciao-network-guard.js"), + tmpSafetyNet: path.join(workDir, "nemoclaw-sandbox-safety-net.js"), + tmpCiao: path.join(workDir, "nemoclaw-ciao-network-guard.js"), + proxyEnv: path.join(workDir, "nemoclaw-proxy-env.sh"), + 
gatewayLog: path.join(workDir, "gateway.log"), + }; + + fs.writeFileSync(paths.sourceSafetyNet, "module.exports = 'trusted safety net';\n"); + fs.writeFileSync(paths.sourceCiao, "module.exports = 'trusted ciao guard';\n"); + + return paths; +} + +function rewriteRuntimePaths(script: string, paths: ReturnType<typeof makeHarness>): string { + return script + .replaceAll(SAFETY_NET_GUARD.tmpPath, paths.tmpSafetyNet) + .replaceAll(SAFETY_NET_GUARD.sourcePath, paths.sourceSafetyNet) + .replaceAll(CIAO_GUARD.tmpPath, paths.tmpCiao) + .replaceAll(CIAO_GUARD.sourcePath, paths.sourceCiao) + .replaceAll("/tmp/nemoclaw-proxy-env.sh", paths.proxyEnv); +} + +function runGuardRecovery(opts: { + proxyEnvContent?: string; + beforeScript?: (paths: ReturnType<typeof makeHarness>) => void; +}) { + const paths = makeHarness(); + opts.beforeScript?.(paths); + + const sourceProxyEnv = opts.proxyEnvContent + ? [ + `cat > ${JSON.stringify(paths.proxyEnv)} <<'PROXYENV'`, + opts.proxyEnvContent, + "PROXYENV", + `chmod 444 ${JSON.stringify(paths.proxyEnv)}`, + `. ${JSON.stringify(paths.proxyEnv)}`, + "_PE_MISSING=0", + ] + : ["_PE_MISSING=1"]; + + const script = rewriteRuntimePaths( + [ + "#!/usr/bin/env bash", + "set -u", + `export _GATEWAY_LOG=${JSON.stringify(paths.gatewayLog)}`, + ': > "$_GATEWAY_LOG"', + ...sourceProxyEnv, + ...buildGatewayGuardRecoveryLines(), + 'if [ "$_GUARDS_MISSING" = "1" ]; then echo GUARDS_MISSING; exit 17; fi', + 'printf "PE_MISSING=%s\\n" "$_PE_MISSING"', + 'printf "NODE_OPTIONS=%s\\n" "$NODE_OPTIONS"', + ].join("\n"), + paths, + ); + const scriptPath = path.join(paths.root, "recovery.sh"); + fs.writeFileSync(scriptPath, script, { mode: 0o700 }); + + try { + const result = spawnSync("bash", [scriptPath], { + encoding: "utf-8", + timeout: 10000, + env: { PATH: process.env.PATH ?? "/usr/bin:/bin", HOME: paths.root }, + }); + const readIfExists = (pathname: string) => + fs.existsSync(pathname) ?
fs.readFileSync(pathname, "utf-8") : null; + const modeIfExists = (pathname: string) => (fs.existsSync(pathname) ? mode(pathname) : null); + return { + ...result, + paths, + files: { + gatewayLog: readIfExists(paths.gatewayLog) ?? "", + proxyEnv: readIfExists(paths.proxyEnv), + tmpSafetyNet: readIfExists(paths.tmpSafetyNet), + tmpCiao: readIfExists(paths.tmpCiao), + tmpSafetyNetMode: modeIfExists(paths.tmpSafetyNet), + tmpCiaoMode: modeIfExists(paths.tmpCiao), + proxyEnvMode: modeIfExists(paths.proxyEnv), + tmpSafetyNetIsSymlink: fs.existsSync(paths.tmpSafetyNet) + ? fs.lstatSync(paths.tmpSafetyNet).isSymbolicLink() + : null, + }, + }; + } finally { + fs.rmSync(paths.root, { recursive: true, force: true }); + } +} + +function mode(pathname: string): number { + return fs.statSync(pathname).mode & 0o777; +} + +describe("gateway recovery preload repair", () => { + it("restores missing proxy-env.sh from trusted packaged preloads", () => { + const result = runGuardRecovery({}); + expect(result.status).toBe(0); + expect(result.stdout).toContain("PE_MISSING=0"); + expect(result.stdout).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.stdout).toContain(`--require ${result.paths.tmpCiao}`); + expect(result.files.tmpSafetyNet).toContain("trusted safety net"); + expect(result.files.tmpCiao).toContain("trusted ciao guard"); + expect(result.files.tmpSafetyNetMode).toBe(0o444); + expect(result.files.tmpCiaoMode).toBe(0o444); + expect(result.files.proxyEnvMode).toBe(0o444); + const proxyEnv = result.files.proxyEnv ?? 
""; + expect(proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(proxyEnv).toContain(`--require ${result.paths.tmpCiao}`); + expect(result.files.gatewayLog).toContain("[gateway-recovery] WARNING"); + expect(result.files.gatewayLog).toContain("restoring library guards"); + }); + + it("does not accept substring matches as installed preload guards", () => { + const result = runGuardRecovery({ + proxyEnvContent: `export NODE_OPTIONS="--require /tmp/not-nemoclaw-sandbox-safety-net.js --require /tmp/not-nemoclaw-ciao-network-guard.js"\n`, + }); + expect(result.status).toBe(0); + expect(result.stdout).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.stdout).toContain(`--require ${result.paths.tmpCiao}`); + }); + + it("does not duplicate exact --require entries", () => { + const result = runGuardRecovery({ + proxyEnvContent: `export NODE_OPTIONS="--require ${SAFETY_NET_GUARD.tmpPath} --require=${CIAO_GUARD.tmpPath}"\n`, + }); + expect(result.status).toBe(0); + const nodeOptions = result.stdout.match(/^NODE_OPTIONS=(.*)$/m)?.[1] ?? 
""; + expect(nodeOptions.match(new RegExp(result.paths.tmpSafetyNet, "g"))?.length).toBe(1); + expect(nodeOptions.match(new RegExp(result.paths.tmpCiao, "g"))?.length).toBe(1); + }); + + it("replaces a symlinked tmp preload with a trusted staged file", () => { + const result = runGuardRecovery({ + beforeScript(paths) { + const target = path.join(paths.root, "attacker-controlled.js"); + fs.writeFileSync(target, "module.exports = 'wrong';\n"); + fs.symlinkSync(target, paths.tmpSafetyNet); + }, + }); + expect(result.status).toBe(0); + expect(result.files.tmpSafetyNetIsSymlink).toBe(false); + expect(result.files.tmpSafetyNet).toContain("trusted safety net"); + }); + + it("refuses recovery when a trusted packaged preload source is unavailable", () => { + const result = runGuardRecovery({ + beforeScript(paths) { + fs.rmSync(paths.sourceCiao); + }, + }); + expect(result.status).toBe(17); + expect(result.stdout).toContain("GUARDS_MISSING"); + expect(result.files.gatewayLog).toContain("trusted preload source"); + expect(result.files.gatewayLog).toContain("refusing preload install"); + }); + + it("wires the repair helper into both recovery script builders", () => { + const genericScript = buildRecoveryScript(minimalAgent, 19000); + const openClawScript = buildOpenClawRecoveryScript(18789); + for (const script of [genericScript, openClawScript]) { + expect(script).toContain("restoring library guards from packaged preloads"); + expect(script).toContain("/usr/local/lib/nemoclaw/preloads/sandbox-safety-net.js"); + expect(script).toContain("/usr/local/lib/nemoclaw/preloads/ciao-network-guard.js"); + expect(script).not.toContain("gateway launching without library guards"); + } + }); +}); diff --git a/src/lib/agent/runtime-recovery-preload.ts b/src/lib/agent/runtime-recovery-preload.ts new file mode 100644 index 0000000000..cc3ebdf9d9 --- /dev/null +++ b/src/lib/agent/runtime-recovery-preload.ts @@ -0,0 +1,119 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Gateway recovery preload repair logic. The generated shell restores the two +// critical Node preload guards from the packaged image copies before recovery +// relaunches a gateway. + +export const GATEWAY_PRELOAD_GUARDS: ReadonlyArray<{ + tmpPath: string; + sourcePath: string; +}> = [ + { + tmpPath: "/tmp/nemoclaw-sandbox-safety-net.js", + sourcePath: "/usr/local/lib/nemoclaw/preloads/sandbox-safety-net.js", + }, + { + tmpPath: "/tmp/nemoclaw-ciao-network-guard.js", + sourcePath: "/usr/local/lib/nemoclaw/preloads/ciao-network-guard.js", + }, +]; + +/** + * Build shell lines that restore and validate the required recovery preloads. + * + * The recovery script sources /tmp/nemoclaw-proxy-env.sh before these lines. + * If that file is missing, this helper recreates a minimal proxy-env file from + * trusted packaged preload sources, sources it, and leaves the sandbox in a + * state the E2E guard-chain checks can inspect. If the trusted sources cannot + * be staged safely, callers see _GUARDS_MISSING=1 and refuse the relaunch. 
+ */ +export function buildGatewayGuardRecoveryLines(): string[] { + const recoveredProxyEnvExports = GATEWAY_PRELOAD_GUARDS.map( + ({ tmpPath }) => + `printf '%s\\n' 'export NODE_OPTIONS="\${NODE_OPTIONS:+$NODE_OPTIONS }--require ${tmpPath}"';`, + ); + const stageCalls = GATEWAY_PRELOAD_GUARDS.map( + ({ tmpPath, sourcePath }) => + `_nemoclaw_stage_recovery_preload ${tmpPath} ${sourcePath} || _NEMOCLAW_CRITICAL_GUARDS_READY=0;`, + ); + const appendCalls = GATEWAY_PRELOAD_GUARDS.map( + ({ tmpPath }) => + `if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ]; then _nemoclaw_append_node_require ${tmpPath}; fi;`, + ); + const guardChecks = GATEWAY_PRELOAD_GUARDS.map( + ({ tmpPath }) => `_nemoclaw_node_options_has_require ${tmpPath} || _GUARDS_MISSING=1;`, + ); + + const helpers = [ + "_nemoclaw_recovery_log() {", + 'local _msg="$1";', + 'echo "$_msg" >&2;', + 'if [ -n "${_GATEWAY_LOG:-}" ]; then echo "$_msg" >> "$_GATEWAY_LOG" 2>/dev/null || true; fi;', + "};", + "_nemoclaw_node_options_has_require() {", + 'local wanted="$1"; local token prev;', + "prev=;", + "for token in ${NODE_OPTIONS:-}; do", + 'if [ "$prev" = "--require" ] && [ "$token" = "$wanted" ]; then return 0; fi;', + 'if [ "$token" = "--require=$wanted" ]; then return 0; fi;', + 'prev="$token";', + "done;", + "return 1;", + "};", + "_nemoclaw_append_node_require() {", + 'local wanted="$1";', + 'if ! _nemoclaw_node_options_has_require "$wanted"; then export NODE_OPTIONS="${NODE_OPTIONS:+$NODE_OPTIONS }--require $wanted"; fi;', + "};", + "_nemoclaw_validate_recovery_preload() {", + 'local file="$1"; local perms owner _msg;', + 'if [ -L "$file" ]; then _msg="[gateway-recovery] ERROR: $file is a symlink - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ ! 
-f "$file" ]; then _msg="[gateway-recovery] ERROR: $file is not a regular file - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'perms="$(stat -c %a "$file" 2>/dev/null || stat -f %Lp "$file" 2>/dev/null || echo unknown)";', + 'if [ "$perms" != "444" ]; then _msg="[gateway-recovery] ERROR: $file has unsafe mode=$perms (expected 444) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ "$(id -u)" -eq 0 ]; then', + 'owner="$(stat -c %U "$file" 2>/dev/null || stat -f %Su "$file" 2>/dev/null || echo unknown)";', + 'if [ "$owner" != "root" ]; then _msg="[gateway-recovery] ERROR: $file owner=$owner (expected root) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + "fi;", + "return 0;", + "};", + "_nemoclaw_stage_recovery_preload() {", + 'local tmp="$1"; local src="$2"; local dir base stage _msg;', + 'if [ ! -r "$src" ] || [ -L "$src" ] || [ ! -f "$src" ]; then _msg="[gateway-recovery] ERROR: trusted preload source $src unavailable - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'dir="$(dirname "$tmp")"; base="$(basename "$tmp")";', + 'stage="$(mktemp "${dir}/.${base}.tmp.XXXXXX")" || { _msg="[gateway-recovery] ERROR: failed to stage $tmp"; _nemoclaw_recovery_log "$_msg"; return 1; };', + 'if ! cp "$src" "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to copy $src into recovery stage"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ "$(id -u)" -eq 0 ] && ! chown root:root "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chown recovery stage for $tmp"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if ! chmod 444 "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chmod recovery stage for $tmp"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if ! 
mv -f "$stage" "$tmp"; then rm -f "$stage"; if _nemoclaw_validate_recovery_preload "$tmp"; then return 0; fi; _msg="[gateway-recovery] ERROR: failed to install recovery preload $tmp"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + '_nemoclaw_validate_recovery_preload "$tmp";', + "};", + "_nemoclaw_write_recovered_proxy_env() {", + "local env_file=/tmp/nemoclaw-proxy-env.sh; local stage _msg;", + 'stage="$(mktemp /tmp/.nemoclaw-proxy-env.sh.tmp.XXXXXX)" || { _msg="[gateway-recovery] ERROR: failed to stage recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; };', + "{", + "printf '%s\\n' '# Recovered by NemoClaw gateway recovery; sandbox restart refreshes the full proxy env.';", + ...recoveredProxyEnvExports, + '} > "$stage" || { rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to write recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; };', + 'if [ "$(id -u)" -eq 0 ] && ! chown root:root "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chown recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if ! chmod 444 "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chmod recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if ! mv -f "$stage" "$env_file"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to install recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + "return 0;", + "};", + ].join(" "); + + return [ + helpers, + "_NEMOCLAW_CRITICAL_GUARDS_READY=1;", + ...stageCalls, + 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ] && [ "${_PE_MISSING:-0}" = "1" ]; then', + '_W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - restoring library guards from packaged preloads (#2478/#2701)"; _nemoclaw_recovery_log "$_W";', + "_nemoclaw_write_recovered_proxy_env || _NEMOCLAW_CRITICAL_GUARDS_READY=0;", + 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ]; then . 
/tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; fi;', + "fi;", + ...appendCalls, + "_GUARDS_MISSING=0;", + 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" != "1" ]; then _GUARDS_MISSING=1; fi;', + ...guardChecks, + ]; +} diff --git a/src/lib/agent/runtime.test.ts b/src/lib/agent/runtime.test.ts index 839afad4f1..309660147d 100644 --- a/src/lib/agent/runtime.test.ts +++ b/src/lib/agent/runtime.test.ts @@ -195,22 +195,23 @@ describe("buildRecoveryScript", () => { // Regression coverage for #2478. The recovery script must explicitly source // /tmp/nemoclaw-proxy-env.sh (single source of truth for NODE_OPTIONS - // library guards) and warn — not silently continue — when the file is - // missing or the safety-net preload is absent from NODE_OPTIONS. The pre-fix - // recovery path swallowed sourcing errors via `2>/dev/null`, leaving - // respawned gateways guard-less and crash-looping on the next library - // error from ciao, model-pricing, or anything else hitting a sandboxed - // syscall. + // library guards) and restore the critical safety-net/ciao preloads from + // trusted packaged sources when the file is missing. The pre-fix recovery + // path swallowed sourcing errors via `2>/dev/null`, leaving respawned + // gateways guard-less and crash-looping on the next library error from ciao, + // model-pricing, or anything else hitting a sandboxed syscall. describe("#2478 hardened library-guard preload chain", () => { it("explicitly sources the gateway env file", () => { const script = buildRecoveryScript(minimalAgent, 19000); expect(script).toContain(". 
/tmp/nemoclaw-proxy-env.sh"); }); - it("warns when the gateway env file is missing instead of silently launching", () => { + it("warns and restores guards when the gateway env file is missing", () => { const script = buildRecoveryScript(minimalAgent, 19000); expect(script).toContain("/tmp/nemoclaw-proxy-env.sh missing"); + expect(script).toContain("restoring library guards from packaged preloads"); expect(script).toContain("#2478"); + expect(script).not.toContain("gateway launching without library guards"); }); it("does not silence sourcing errors with 2>/dev/null", () => { @@ -250,9 +251,9 @@ describe("buildRecoveryScript", () => { expect(sourceIdx).toBeLessThan(launchIdx); }); - it("fails recovery when an existing proxy-env.sh does not install required guards", () => { + it("fails recovery when trusted guard restoration cannot install required guards", () => { const script = buildRecoveryScript(minimalAgent, 19000); - expect(script).toContain('if [ "$_PE_MISSING" = "0" ]'); + expect(script).toContain("_NEMOCLAW_CRITICAL_GUARDS_READY"); expect(script).toContain("refusing unguarded gateway relaunch"); expect(script).toContain('echo "$_E" >> "$_GATEWAY_LOG"; exit 1'); }); @@ -263,13 +264,14 @@ describe("buildRecoveryScript", () => { // executeSandboxCommand silently discards stderr from the recovery // script, so a warning that only goes to stderr is invisible to // anyone debugging a crash-loop. (#2478) - expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"'); + expect(script).toContain('_nemoclaw_recovery_log "$_W"'); + expect(script).toContain('echo "$_msg" >> "$_GATEWAY_LOG"'); // And the warning must be deferred until AFTER gateway.log is // safely opened with O_NOFOLLOW, otherwise the redirect targets a // stale or attacker-controlled file. 
const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log || exit 1;"); const logSelectionIdx = script!.indexOf("_GATEWAY_LOG=/tmp/gateway.log"); - const warnIdx = script!.indexOf('echo "$_W" >> "$_GATEWAY_LOG"'); + const warnIdx = script!.indexOf("_W="); expect(gatewayPrepIdx).toBeGreaterThanOrEqual(0); expect(logSelectionIdx).toBeGreaterThanOrEqual(0); expect(warnIdx).toBeGreaterThanOrEqual(0); @@ -297,7 +299,7 @@ describe("buildRecoveryScript", () => { expect(script).not.toContain("rm -f /tmp/gateway.log"); expect(script).toContain("_GATEWAY_LOG=/tmp/gateway.log"); expect(script).toContain("_GATEWAY_LOG=/tmp/gateway-recovery.log"); - expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"'); + expect(script).toContain('_nemoclaw_recovery_log "$_W"'); expect(script).toContain('tail -5 "$_GATEWAY_LOG"'); expect(script).not.toContain('echo "$_W" >> /tmp/gateway.log'); expect(script).not.toContain("cat /tmp/gateway.log"); diff --git a/src/lib/agent/runtime.ts b/src/lib/agent/runtime.ts index ef79716e78..e66d312dcc 100644 --- a/src/lib/agent/runtime.ts +++ b/src/lib/agent/runtime.ts @@ -16,6 +16,7 @@ import { buildHermesEnvFileBoundaryGuard, buildHermesRuntimeEnvBoundaryGuard, } from "./hermes-recovery-boundary"; +import { buildGatewayGuardRecoveryLines } from "./runtime-recovery-preload"; /** * Resolve the agent for a sandbox. Checks the per-sandbox registry first @@ -193,17 +194,16 @@ export function buildHermesDashboardProcessRecoveryScript( export function buildOpenClawRecoveryScript(port: number): string { const staleGatewayPattern = "[o]penclaw([ -]gateway| gateway run|$)"; return [ - "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", "[ -f ~/.bashrc ] && . 
~/.bashrc;", - 'if [ "$_PE_MISSING" = "0" ]; then case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _SN_MISSING=0 ;; *) _SN_MISSING=1 ;; esac; case "${NODE_OPTIONS:-}" in *nemoclaw-ciao-network-guard*) _CIAO_MISSING=0 ;; *) _CIAO_MISSING=1 ;; esac; if [ "$_SN_MISSING" = "0" ] && [ "$_CIAO_MISSING" = "0" ]; then _GUARDS_MISSING=0; else _GUARDS_MISSING=1; fi; else _GUARDS_MISSING=0; fi;', `_GW_CODE=$(curl -so /dev/null -w '%{http_code}' --max-time 3 http://127.0.0.1:${port}/health 2>/dev/null || echo 000); case "$_GW_CODE" in 200|401) echo ALREADY_RUNNING; exit 0 ;; esac;`, "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;", ...buildGatewayLogSetup(true, "gateway"), buildGatewayLogSelection(), `_GATEWAY_PROC_PATTERN=${shellQuote(staleGatewayPattern)};`, 'if [ -n "$_GATEWAY_PROC_PATTERN" ]; then pkill -TERM -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; pkill -KILL -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; if pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1; then echo GATEWAY_STALE_PROCESSES; exit 1; fi; fi;', - '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', - '[ "$_PE_MISSING" = "0" ] && [ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: /tmp/nemoclaw-proxy-env.sh present but NODE_OPTIONS missing safety-net preload or ciao preload - refusing unguarded gateway relaunch (#2478)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', + "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . 
/tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", + ...buildGatewayGuardRecoveryLines(), + '[ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: NODE_OPTIONS missing safety-net preload or ciao preload after trusted recovery - refusing unguarded gateway relaunch (#2478/#2701)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', 'OPENCLAW="$(command -v openclaw)";', 'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;', gatewayLaunchCommand('"$OPENCLAW" gateway run --port ' + port, "gateway"), @@ -257,13 +257,11 @@ export function buildRecoveryScript( `${hermesLaunchEnv}${configuredGatewayCommand}${isHermes ? "" : ` --port ${port}`}`, ); - // Source /tmp/nemoclaw-proxy-env.sh immediately before launching. That file - // is the single source of truth for NODE_OPTIONS preload guards (safety-net, - // ciao networkInterfaces, slack, http-proxy, nemotron). Recovery - // also stops stale launcher/gateway processes that may have respawned - // between the health probe and relaunch. A missing env file remains warning- - // only; a present env file that does not install required guards is a hard - // failure because launching would create an unguarded gateway. + // Source /tmp/nemoclaw-proxy-env.sh immediately before launching. Recovery + // restores the critical safety-net and ciao preloads from packaged trusted + // sources when the env file is missing, and refuses to relaunch if those + // guards cannot be staged safely. Recovery also stops stale launcher/gateway + // processes that may have respawned between the health probe and relaunch. return [ "[ -f ~/.bashrc ] && . 
~/.bashrc;", hermesHome, @@ -275,9 +273,8 @@ export function buildRecoveryScript( 'if [ -n "$_GATEWAY_PROC_PATTERN" ]; then pkill -TERM -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; pkill -KILL -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; if pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1; then echo GATEWAY_STALE_PROCESSES; exit 1; fi; fi;', ...validationSteps, "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", - 'if [ "$_PE_MISSING" = "0" ]; then case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _SN_MISSING=0 ;; *) _SN_MISSING=1 ;; esac; case "${NODE_OPTIONS:-}" in *nemoclaw-ciao-network-guard*) _CIAO_MISSING=0 ;; *) _CIAO_MISSING=1 ;; esac; if [ "$_SN_MISSING" = "0" ] && [ "$_CIAO_MISSING" = "0" ]; then _GUARDS_MISSING=0; else _GUARDS_MISSING=1; fi; else _GUARDS_MISSING=0; fi;', - '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', - '[ "$_PE_MISSING" = "0" ] && [ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: /tmp/nemoclaw-proxy-env.sh present but NODE_OPTIONS missing safety-net preload or ciao preload - refusing unguarded gateway relaunch (#2478)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', + ...buildGatewayGuardRecoveryLines(), + '[ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: NODE_OPTIONS missing safety-net preload or ciao preload after trusted recovery - refusing unguarded gateway relaunch (#2478/#2701)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', ...(isHermes ? 
[buildHermesRuntimeEnvBoundaryGuard()] : []), launchCommand, "GPID=$!; sleep 2;", From 9058d77324351ed38704dd783855a4f40eb8d06c Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Thu, 11 Jun 2026 23:30:28 -0700 Subject: [PATCH 2/8] test(e2e): accept guard restoration warning Signed-off-by: Carlos Villela --- .../live/gateway-guard-recovery.test.ts | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/test/e2e-scenario/live/gateway-guard-recovery.test.ts b/test/e2e-scenario/live/gateway-guard-recovery.test.ts index b8a56a83b1..9a4ac8312f 100644 --- a/test/e2e-scenario/live/gateway-guard-recovery.test.ts +++ b/test/e2e-scenario/live/gateway-guard-recovery.test.ts @@ -13,9 +13,10 @@ * netns blocks the syscall). The only manual recovery is a 5-min * `nemoclaw rebuild --yes`. * - * This test asserts the desired contract — recovery RESTORES the guard - * chain before launching, no WARNING line, gateway PID stable. It will fail - * on `main` (proving the bug), pass once the fix lands. + * This test asserts the desired contract — recovery logs that it is restoring + * from trusted packaged preloads, RESTORES the guard chain before launching, + * and keeps the gateway PID stable. It will fail on `main` (proving the bug), + * pass once the fix lands. * * The contract is platform-independent: we don't need aarch64 to assert * "guards are present after recovery." The aarch64 ciao crash is a @@ -101,15 +102,16 @@ test("gateway recovery restores /tmp guard chain after pod-recreate wipe (#2701) }); // ── Assert #2701 contract ──────────────────────────────────────── - // After recovery completes, the guard chain MUST be restored. Today - // this fails: recovery emits a WARNING but launches the gateway - // naked, leaving /tmp/nemoclaw-proxy-env.sh absent. After the fix - // lands, recovery re-emits the chain before launching. + // After recovery completes, the guard chain MUST be restored. 
Before the + // fix, recovery emitted a WARNING but launched the gateway naked, leaving + // /tmp/nemoclaw-proxy-env.sh absent. After the fix lands, recovery re-emits + // the chain before launching. await gateway.expectGuardChainActive(instance); - // No WARNING line should appear in the gateway log — the fix turns - // the warn-and-proceed branch into a re-emit-and-continue branch. - await gateway.expectLogDoesNotContain(instance, /\[gateway-recovery\] WARNING/); + // A missing proxy-env file is still worth surfacing, but the warning must + // describe trusted restoration instead of an unguarded launch. + await gateway.expectLogContains(instance, /restoring library guards from packaged preloads/); + await gateway.expectLogDoesNotContain(instance, /gateway launching without library guards/); // Gateway must be steady-state — no crash loop. This assertion is // the "would have caught DGX Spark" check, even on x86 runners, From cd55950cf47678de55116a60cd107073e4385131 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Fri, 12 Jun 2026 00:39:18 -0700 Subject: [PATCH 3/8] fix(agent): harden recovery guard trust boundary --- ...hermes-secret-boundary-behavioural.test.ts | 65 ++++------ .../agent/runtime-recovery-preload.test.ts | 113 ++++++++++++++++-- src/lib/agent/runtime-recovery-preload.ts | 52 ++++++-- src/lib/agent/runtime.test.ts | 39 +++++- src/lib/agent/runtime.ts | 27 ++--- .../runtime-recovery-preload-test-helpers.ts | 38 ++++++ 6 files changed, 256 insertions(+), 78 deletions(-) create mode 100644 test/helpers/runtime-recovery-preload-test-helpers.ts diff --git a/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts b/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts index ce61f5d2ba..fa7c15ef01 100644 --- a/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts +++ b/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts @@ -8,21 +8,23 @@ // generated-shell shape assertions live in // 
runtime-hermes-secret-boundary-shape.test.ts. +import { spawnSync } from "node:child_process"; import fs from "node:fs"; import os from "node:os"; import path from "node:path"; -import { spawnSync } from "node:child_process"; -import { describe, it, expect } from "vitest"; +import { describe, expect, it } from "vitest"; import { - HERMES_SECRET_BOUNDARY_VALIDATOR_PATH, __testing, + HERMES_SECRET_BOUNDARY_VALIDATOR_PATH, } from "../../../dist/lib/agent/hermes-recovery-boundary"; -import { GATEWAY_PRELOAD_GUARDS } from "../../../dist/lib/agent/runtime-recovery-preload"; import { buildRecoveryScript } from "../../../dist/lib/agent/runtime"; +import { + createRecoveryPreloadHarnessPaths, + type RecoveryPreloadHarnessPaths, + rewriteRecoveryPreloadPaths, +} from "../../../test/helpers/runtime-recovery-preload-test-helpers"; import { hermesAgent } from "./hermes-recovery-boundary-fixtures"; -const [SAFETY_NET_GUARD, CIAO_GUARD] = GATEWAY_PRELOAD_GUARDS; - function writeStub(dir: string, name: string, body: string) { const stub = path.join(dir, name); fs.writeFileSync(stub, `#!/usr/bin/env bash\n${body}\n`, { mode: 0o755 }); @@ -187,13 +189,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = const hermesLaunchMarker = path.join(tmp, "hermes-launched"); const gatewayLogPath = path.join(tmp, "gateway.log"); const recoveryFallbackLog = path.join(tmp, "gateway-recovery-fallback.log"); - const preloadSourceSafetyNet = path.join(tmp, "preload-source-safety-net.js"); - const preloadSourceCiao = path.join(tmp, "preload-source-ciao.js"); - const preloadTmpSafetyNet = path.join(tmp, "preload-tmp-safety-net.js"); - const preloadTmpCiao = path.join(tmp, "preload-tmp-ciao.js"); fs.mkdirSync(stubsDir, { recursive: true }); - fs.writeFileSync(preloadSourceSafetyNet, "module.exports = 'trusted safety net';\n"); - fs.writeFileSync(preloadSourceCiao, "module.exports = 'trusted ciao guard';\n"); return { tmp, stubsDir, @@ -202,10 +198,7 @@ describe("Hermes 
secret-boundary guard — full recovery script behaviour", () = hermesLaunchMarker, gatewayLogPath, recoveryFallbackLog, - preloadSourceSafetyNet, - preloadSourceCiao, - preloadTmpSafetyNet, - preloadTmpCiao, + ...createRecoveryPreloadHarnessPaths(tmp), }; } @@ -217,30 +210,24 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = writeStub(stubsDir, "hermes", `: > ${JSON.stringify(hermesLaunchMarker)}\nexit 0`); } - function runRecovery(opts: { - stubsDir: string; - validatorPath: string; - envFilePath?: string; - proxyEnvPath?: string; - recoveryLogPath: string; - gatewayLogPath: string; - recoveryFallbackLog: string; - preloadSourceSafetyNet: string; - preloadSourceCiao: string; - preloadTmpSafetyNet: string; - preloadTmpCiao: string; - tmp: string; - }) { + function runRecovery( + opts: { + stubsDir: string; + validatorPath: string; + envFilePath?: string; + proxyEnvPath?: string; + recoveryLogPath: string; + gatewayLogPath: string; + recoveryFallbackLog: string; + tmp: string; + } & RecoveryPreloadHarnessPaths, + ) { const recoveryScript = buildRecoveryScript(hermesAgent, 8642); expect(recoveryScript).not.toBeNull(); - let stubbed = recoveryScript! 
+ let stubbed = rewriteRecoveryPreloadPaths(recoveryScript!, opts) .replace(new RegExp(HERMES_SECRET_BOUNDARY_VALIDATOR_PATH, "g"), opts.validatorPath) .replace(/\/tmp\/gateway-recovery\.log/g, opts.recoveryLogPath) .replace(/\/tmp\/gateway\.log/g, opts.gatewayLogPath) - .replaceAll(SAFETY_NET_GUARD.tmpPath, opts.preloadTmpSafetyNet) - .replaceAll(SAFETY_NET_GUARD.sourcePath, opts.preloadSourceSafetyNet) - .replaceAll(CIAO_GUARD.tmpPath, opts.preloadTmpCiao) - .replaceAll(CIAO_GUARD.sourcePath, opts.preloadSourceCiao) .replace( /_GATEWAY_LOG=\/tmp\/gateway-recovery\.log/g, `_GATEWAY_LOG=${opts.recoveryFallbackLog}`, @@ -405,6 +392,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = proxyEnvFile, "export NODE_OPTIONS='--require=nemoclaw-sandbox-safety-net --require=nemoclaw-ciao-network-guard'\n", ); + fs.chmodSync(proxyEnvFile, 0o444); writeStub(harness.stubsDir, "python3", `${SHARED_PYTHON_STUB_BY_MODE}\n`); stubBaselineUtilities(harness.stubsDir, harness.pkillLog, harness.hermesLaunchMarker); @@ -415,7 +403,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = (() => { const recoveryScript = buildRecoveryScript(hermesAgent, 8642); expect(recoveryScript).not.toBeNull(); - const stubbed = recoveryScript! 
+ const stubbed = rewriteRecoveryPreloadPaths(recoveryScript!, harness) .replace( new RegExp(HERMES_SECRET_BOUNDARY_VALIDATOR_PATH, "g"), path.join(validatorRoot, "validate-hermes-env-secret-boundary.py"), @@ -423,10 +411,6 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = .replace(/\/tmp\/gateway-recovery\.log/g, harness.recoveryLogPath) .replace(/\/tmp\/nemoclaw-proxy-env\.sh/g, proxyEnvFile) .replace(/\/tmp\/gateway\.log/g, harness.gatewayLogPath) - .replaceAll(SAFETY_NET_GUARD.tmpPath, harness.preloadTmpSafetyNet) - .replaceAll(SAFETY_NET_GUARD.sourcePath, harness.preloadSourceSafetyNet) - .replaceAll(CIAO_GUARD.tmpPath, harness.preloadTmpCiao) - .replaceAll(CIAO_GUARD.sourcePath, harness.preloadSourceCiao) .replace( /_GATEWAY_LOG=\/tmp\/gateway-recovery\.log/g, `_GATEWAY_LOG=${harness.recoveryFallbackLog}`, @@ -487,6 +471,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = "", ].join("\n"), ); + fs.chmodSync(proxyEnvFile, 0o444); stubBaselineUtilities(harness.stubsDir, harness.pkillLog, harness.hermesLaunchMarker); try { diff --git a/src/lib/agent/runtime-recovery-preload.test.ts b/src/lib/agent/runtime-recovery-preload.test.ts index 1c0bf72316..7f4ad08aba 100644 --- a/src/lib/agent/runtime-recovery-preload.test.ts +++ b/src/lib/agent/runtime-recovery-preload.test.ts @@ -6,30 +6,40 @@ import fs from "node:fs"; import os from "node:os"; import path from "node:path"; import { describe, expect, it } from "vitest"; +import { buildOpenClawRecoveryScript, buildRecoveryScript } from "../../../dist/lib/agent/runtime"; import { - GATEWAY_PRELOAD_GUARDS, buildGatewayGuardRecoveryLines, + GATEWAY_PRELOAD_GUARDS, } from "../../../dist/lib/agent/runtime-recovery-preload"; -import { buildOpenClawRecoveryScript, buildRecoveryScript } from "../../../dist/lib/agent/runtime"; import { minimalAgent } from "./hermes-recovery-boundary-fixtures"; const [SAFETY_NET_GUARD, CIAO_GUARD] = GATEWAY_PRELOAD_GUARDS; +function 
writeStub(dir: string, name: string, body: string) { + const stub = path.join(dir, name); + fs.writeFileSync(stub, `#!/usr/bin/env bash\n${body}\n`, { mode: 0o755 }); + return stub; +} + function makeHarness() { const root = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-recovery-preload-")); const sourceDir = path.join(root, "usr-local-lib-nemoclaw-preloads"); const workDir = path.join(root, "tmp"); + const stubsDir = path.join(root, "bin"); fs.mkdirSync(sourceDir, { recursive: true }); fs.mkdirSync(workDir, { recursive: true }); + fs.mkdirSync(stubsDir, { recursive: true }); const paths = { root, + stubsDir, sourceSafetyNet: path.join(sourceDir, "sandbox-safety-net.js"), sourceCiao: path.join(sourceDir, "ciao-network-guard.js"), tmpSafetyNet: path.join(workDir, "nemoclaw-sandbox-safety-net.js"), tmpCiao: path.join(workDir, "nemoclaw-ciao-network-guard.js"), proxyEnv: path.join(workDir, "nemoclaw-proxy-env.sh"), gatewayLog: path.join(workDir, "gateway.log"), + hostileMarker: path.join(root, "hostile-proxy-env-sourced"), }; fs.writeFileSync(paths.sourceSafetyNet, "module.exports = 'trusted safety net';\n"); @@ -50,20 +60,32 @@ function rewriteRuntimePaths(script: string, paths: ReturnType) => void; + fakeRoot?: boolean; }) { const paths = makeHarness(); + if (opts.fakeRoot) { + writeStub( + paths.stubsDir, + "id", + '[ "$1" = "-u" ] && { printf "0\\n"; exit 0; }\n/usr/bin/id "$@"', + ); + writeStub(paths.stubsDir, "chown", "exit 0"); + writeStub( + paths.stubsDir, + "stat", + '[ "$1" = "-c" ] && [ "$2" = "%u" ] && { printf "0\\n"; exit 0; }\n/usr/bin/stat "$@"', + ); + } opts.beforeScript?.(paths); - const sourceProxyEnv = opts.proxyEnvContent + const writeProxyEnv = opts.proxyEnvContent ? [ `cat > ${JSON.stringify(paths.proxyEnv)} <<'PROXYENV'`, opts.proxyEnvContent, "PROXYENV", `chmod 444 ${JSON.stringify(paths.proxyEnv)}`, - `. 
${JSON.stringify(paths.proxyEnv)}`, - "_PE_MISSING=0", ] - : ["_PE_MISSING=1"]; + : []; const script = rewriteRuntimePaths( [ @@ -71,7 +93,7 @@ function runGuardRecovery(opts: { "set -u", `export _GATEWAY_LOG=${JSON.stringify(paths.gatewayLog)}`, ': > "$_GATEWAY_LOG"', - ...sourceProxyEnv, + ...writeProxyEnv, ...buildGatewayGuardRecoveryLines(), 'if [ "$_GUARDS_MISSING" = "1" ]; then echo GUARDS_MISSING; exit 17; fi', 'printf "PE_MISSING=%s\\n" "$_PE_MISSING"', @@ -86,10 +108,15 @@ function runGuardRecovery(opts: { const result = spawnSync("bash", [scriptPath], { encoding: "utf-8", timeout: 10000, - env: { PATH: process.env.PATH ?? "/usr/bin:/bin", HOME: paths.root }, + env: { + PATH: `${paths.stubsDir}:${process.env.PATH ?? "/usr/bin:/bin"}`, + HOME: paths.root, + }, }); const readIfExists = (pathname: string) => - fs.existsSync(pathname) ? fs.readFileSync(pathname, "utf-8") : null; + fs.existsSync(pathname) && fs.statSync(pathname).isFile() + ? fs.readFileSync(pathname, "utf-8") + : null; const modeIfExists = (pathname: string) => (fs.existsSync(pathname) ? mode(pathname) : null); return { ...result, @@ -105,6 +132,10 @@ function runGuardRecovery(opts: { tmpSafetyNetIsSymlink: fs.existsSync(paths.tmpSafetyNet) ? fs.lstatSync(paths.tmpSafetyNet).isSymbolicLink() : null, + proxyEnvIsSymlink: fs.existsSync(paths.proxyEnv) + ? 
fs.lstatSync(paths.proxyEnv).isSymbolicLink() + : null, + hostileProxyEnvSourced: fs.existsSync(paths.hostileMarker), }, }; } finally { @@ -154,6 +185,57 @@ describe("gateway recovery preload repair", () => { expect(nodeOptions.match(new RegExp(result.paths.tmpCiao, "g"))?.length).toBe(1); }); + it("rebuilds an unsafe proxy-env.sh without sourcing attacker-controlled content", () => { + const result = runGuardRecovery({ + beforeScript(paths) { + fs.writeFileSync( + paths.proxyEnv, + [ + `touch ${JSON.stringify(paths.hostileMarker)}`, + "export NODE_OPTIONS='--require /tmp/attacker.js'", + "", + ].join("\n"), + ); + fs.chmodSync(paths.proxyEnv, 0o666); + }, + }); + expect(result.status).toBe(0); + expect(result.stdout).toContain("PE_MISSING=0"); + expect(result.files.hostileProxyEnvSourced).toBe(false); + expect(result.files.proxyEnvMode).toBe(0o444); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.files.proxyEnv).not.toContain("/tmp/attacker.js"); + expect(result.files.gatewayLog).toContain("unsafe mode"); + expect(result.files.gatewayLog).toContain("rebuilding from packaged preloads"); + }); + + it("replaces a symlinked proxy-env.sh instead of sourcing it", () => { + const result = runGuardRecovery({ + beforeScript(paths) { + const target = path.join(paths.root, "attacker-proxy-env.sh"); + fs.writeFileSync(target, `touch ${JSON.stringify(paths.hostileMarker)}\n`); + fs.symlinkSync(target, paths.proxyEnv); + }, + }); + expect(result.status).toBe(0); + expect(result.files.hostileProxyEnvSourced).toBe(false); + expect(result.files.proxyEnvIsSymlink).toBe(false); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.files.gatewayLog).toContain("is a symlink"); + }); + + it("fails closed when proxy-env.sh is a directory", () => { + const result = runGuardRecovery({ + beforeScript(paths) { + fs.mkdirSync(paths.proxyEnv); + }, + }); + expect(result.status).toBe(17); + 
expect(result.stdout).toContain("GUARDS_MISSING"); + expect(result.files.gatewayLog).toContain("is a directory"); + expect(result.files.gatewayLog).toContain("refusing recovered proxy-env install"); + }); + it("replaces a symlinked tmp preload with a trusted staged file", () => { const result = runGuardRecovery({ beforeScript(paths) { @@ -179,6 +261,19 @@ describe("gateway recovery preload repair", () => { expect(result.files.gatewayLog).toContain("refusing preload install"); }); + it("refuses recovery when a trusted packaged preload source is group writable in root mode", () => { + const result = runGuardRecovery({ + fakeRoot: true, + beforeScript(paths) { + fs.chmodSync(paths.sourceCiao, 0o664); + }, + }); + expect(result.status).toBe(17); + expect(result.stdout).toContain("GUARDS_MISSING"); + expect(result.files.gatewayLog).toContain("trusted preload source"); + expect(result.files.gatewayLog).toContain("unsafe mode=664"); + }); + it("wires the repair helper into both recovery script builders", () => { const genericScript = buildRecoveryScript(minimalAgent, 19000); const openClawScript = buildOpenClawRecoveryScript(18789); diff --git a/src/lib/agent/runtime-recovery-preload.ts b/src/lib/agent/runtime-recovery-preload.ts index cc3ebdf9d9..0c4e71b767 100644 --- a/src/lib/agent/runtime-recovery-preload.ts +++ b/src/lib/agent/runtime-recovery-preload.ts @@ -22,11 +22,13 @@ export const GATEWAY_PRELOAD_GUARDS: ReadonlyArray<{ /** * Build shell lines that restore and validate the required recovery preloads. * - * The recovery script sources /tmp/nemoclaw-proxy-env.sh before these lines. - * If that file is missing, this helper recreates a minimal proxy-env file from - * trusted packaged preload sources, sources it, and leaves the sandbox in a - * state the E2E guard-chain checks can inspect. If the trusted sources cannot - * be staged safely, callers see _GUARDS_MISSING=1 and refuse the relaunch. 
+ * The recovery script must not source /tmp/nemoclaw-proxy-env.sh before these + * lines. This helper validates the env file first, rebuilds it when it is + * missing or unsafe, then sources only the trusted result. The recovered env is + * intentionally minimal: it restores the critical Node preload guard chain that + * recovery itself requires. The normal sandbox entrypoint remains the source of + * truth for the full proxy/tool/token environment and refreshes that file on a + * regular sandbox restart. */ export function buildGatewayGuardRecoveryLines(): string[] { const recoveredProxyEnvExports = GATEWAY_PRELOAD_GUARDS.map( @@ -61,6 +63,13 @@ export function buildGatewayGuardRecoveryLines(): string[] { "done;", "return 1;", "};", + "_nemoclaw_mode_group_or_other_writable() {", + 'local perms="$1"; local go;', + 'case "$perms" in ""|*[!0-7]*) return 0 ;; esac;', + 'while [ "${#perms}" -lt 3 ]; do perms="0$perms"; done;', + 'go="${perms: -2}";', + 'case "$go" in [2367]*|?[2367]) return 0 ;; *) return 1 ;; esac;', + "};", "_nemoclaw_append_node_require() {", 'local wanted="$1";', 'if ! 
_nemoclaw_node_options_has_require "$wanted"; then export NODE_OPTIONS="${NODE_OPTIONS:+$NODE_OPTIONS }--require $wanted"; fi;', @@ -72,14 +81,37 @@ export function buildGatewayGuardRecoveryLines(): string[] { 'perms="$(stat -c %a "$file" 2>/dev/null || stat -f %Lp "$file" 2>/dev/null || echo unknown)";', 'if [ "$perms" != "444" ]; then _msg="[gateway-recovery] ERROR: $file has unsafe mode=$perms (expected 444) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', 'if [ "$(id -u)" -eq 0 ]; then', - 'owner="$(stat -c %U "$file" 2>/dev/null || stat -f %Su "$file" 2>/dev/null || echo unknown)";', - 'if [ "$owner" != "root" ]; then _msg="[gateway-recovery] ERROR: $file owner=$owner (expected root) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'owner="$(stat -c %u "$file" 2>/dev/null || stat -f %u "$file" 2>/dev/null || echo unknown)";', + 'if [ "$owner" != "0" ]; then _msg="[gateway-recovery] ERROR: $file owner_uid=$owner (expected 0) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + "fi;", + "return 0;", + "};", + "_nemoclaw_validate_trusted_preload_source() {", + 'local src="$1"; local perms owner _msg;', + 'if [ ! -r "$src" ] || [ -L "$src" ] || [ ! 
-f "$src" ]; then _msg="[gateway-recovery] ERROR: trusted preload source $src unavailable - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'perms="$(stat -c %a "$src" 2>/dev/null || stat -f %Lp "$src" 2>/dev/null || echo unknown)";', + 'if [ "$(id -u)" -eq 0 ]; then', + 'owner="$(stat -c %u "$src" 2>/dev/null || stat -f %u "$src" 2>/dev/null || echo unknown)";', + 'if [ "$owner" != "0" ]; then _msg="[gateway-recovery] ERROR: trusted preload source $src owner_uid=$owner (expected 0) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if _nemoclaw_mode_group_or_other_writable "$perms"; then _msg="[gateway-recovery] ERROR: trusted preload source $src has unsafe mode=$perms (group/other writable) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + "fi;", + "return 0;", + "};", + "_nemoclaw_validate_recovery_proxy_env() {", + 'local env_file="$1"; local perms owner _msg;', + 'if [ -L "$env_file" ]; then _msg="[gateway-recovery] WARNING: $env_file is a symlink - rebuilding from packaged preloads"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ ! -f "$env_file" ]; then return 1; fi;', + 'perms="$(stat -c %a "$env_file" 2>/dev/null || stat -f %Lp "$env_file" 2>/dev/null || echo unknown)";', + 'if [ "$perms" != "444" ]; then _msg="[gateway-recovery] WARNING: $env_file has unsafe mode=$perms (expected 444) - rebuilding from packaged preloads"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ "$(id -u)" -eq 0 ]; then', + 'owner="$(stat -c %u "$env_file" 2>/dev/null || stat -f %u "$env_file" 2>/dev/null || echo unknown)";', + 'if [ "$owner" != "0" ]; then _msg="[gateway-recovery] WARNING: $env_file owner_uid=$owner (expected 0) - rebuilding from packaged preloads"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', "fi;", "return 0;", "};", "_nemoclaw_stage_recovery_preload() {", 'local tmp="$1"; local src="$2"; local dir base stage _msg;', - 'if [ ! 
-r "$src" ] || [ -L "$src" ] || [ ! -f "$src" ]; then _msg="[gateway-recovery] ERROR: trusted preload source $src unavailable - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + '_nemoclaw_validate_trusted_preload_source "$src" || return 1;', 'dir="$(dirname "$tmp")"; base="$(basename "$tmp")";', 'stage="$(mktemp "${dir}/.${base}.tmp.XXXXXX")" || { _msg="[gateway-recovery] ERROR: failed to stage $tmp"; _nemoclaw_recovery_log "$_msg"; return 1; };', 'if ! cp "$src" "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to copy $src into recovery stage"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', @@ -97,13 +129,15 @@ export function buildGatewayGuardRecoveryLines(): string[] { '} > "$stage" || { rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to write recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; };', 'if [ "$(id -u)" -eq 0 ] && ! chown root:root "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chown recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', 'if ! chmod 444 "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chmod recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', + 'if [ -d "$env_file" ]; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: $env_file is a directory - refusing recovered proxy-env install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', 'if ! mv -f "$stage" "$env_file"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to install recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', - "return 0;", + '_nemoclaw_validate_recovery_proxy_env "$env_file";', "};", ].join(" "); return [ helpers, + "if _nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh; then . 
/tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", "_NEMOCLAW_CRITICAL_GUARDS_READY=1;", ...stageCalls, 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ] && [ "${_PE_MISSING:-0}" = "1" ]; then', diff --git a/src/lib/agent/runtime.test.ts b/src/lib/agent/runtime.test.ts index 309660147d..bea6908502 100644 --- a/src/lib/agent/runtime.test.ts +++ b/src/lib/agent/runtime.test.ts @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -import { describe, it, expect } from "vitest"; +import { describe, expect, it } from "vitest"; // Import from compiled dist/ so coverage is attributed correctly. import { buildHermesDashboardProcessRecoveryScript, @@ -228,6 +228,35 @@ describe("buildRecoveryScript", () => { expect(script).toContain("or ciao preload"); }); + it("validates proxy-env.sh before sourcing it", () => { + const script = buildRecoveryScript(minimalAgent, 19000); + expect(script).not.toContain("[ -r /tmp/nemoclaw-proxy-env.sh ]; then ."); + const validateIdx = script!.indexOf( + "_nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh", + ); + const sourceIdx = script!.indexOf("then . 
/tmp/nemoclaw-proxy-env.sh"); + expect(validateIdx).toBeGreaterThanOrEqual(0); + expect(sourceIdx).toBeGreaterThanOrEqual(0); + expect(validateIdx).toBeLessThan(sourceIdx); + }); + + it("repairs guard files before the ALREADY_RUNNING fast path", () => { + for (const script of [ + buildRecoveryScript(minimalAgent, 19000), + buildOpenClawRecoveryScript(18789), + ]) { + expect(script).not.toBeNull(); + const repairIdx = script!.indexOf("_NEMOCLAW_CRITICAL_GUARDS_READY=1"); + const healthIdx = script!.indexOf("_GW_CODE="); + const alreadyRunningIdx = script!.indexOf("echo ALREADY_RUNNING; exit 0"); + expect(repairIdx).toBeGreaterThanOrEqual(0); + expect(healthIdx).toBeGreaterThanOrEqual(0); + expect(alreadyRunningIdx).toBeGreaterThanOrEqual(0); + expect(repairIdx).toBeLessThan(healthIdx); + expect(healthIdx).toBeLessThan(alreadyRunningIdx); + } + }); + it("stops stale launcher and gateway processes before relaunch", () => { const script = buildRecoveryScript(minimalAgent, 19000); expect(script).toContain( @@ -238,16 +267,16 @@ describe("buildRecoveryScript", () => { expect(script).toContain("GATEWAY_STALE_PROCESSES"); }); - it("sources proxy-env.sh BEFORE launching the gateway binary", () => { + it("sources proxy-env.sh before health probing and launching the gateway binary", () => { const script = buildRecoveryScript(minimalAgent, 19000); expect(script).not.toBeNull(); - const staleStopIdx = script!.indexOf('pkill -TERM -f "$_GATEWAY_PROC_PATTERN"'); const sourceIdx = script!.indexOf("then . 
/tmp/nemoclaw-proxy-env.sh"); + const healthIdx = script!.indexOf("_GW_CODE="); const launchIdx = script!.indexOf("nohup"); - expect(staleStopIdx).toBeGreaterThanOrEqual(0); expect(sourceIdx).toBeGreaterThanOrEqual(0); + expect(healthIdx).toBeGreaterThanOrEqual(0); expect(launchIdx).toBeGreaterThanOrEqual(0); - expect(staleStopIdx).toBeLessThan(sourceIdx); + expect(sourceIdx).toBeLessThan(healthIdx); expect(sourceIdx).toBeLessThan(launchIdx); }); diff --git a/src/lib/agent/runtime.ts b/src/lib/agent/runtime.ts index e66d312dcc..8426029d30 100644 --- a/src/lib/agent/runtime.ts +++ b/src/lib/agent/runtime.ts @@ -11,7 +11,7 @@ import { DASHBOARD_PORT } from "../core/ports"; import { shellQuote } from "../runner"; import * as onboardSession from "../state/onboard-session"; import * as registry from "../state/registry"; -import { loadAgent, type AgentDefinition } from "./defs"; +import { type AgentDefinition, loadAgent } from "./defs"; import { buildHermesEnvFileBoundaryGuard, buildHermesRuntimeEnvBoundaryGuard, @@ -195,15 +195,14 @@ export function buildOpenClawRecoveryScript(port: number): string { const staleGatewayPattern = "[o]penclaw([ -]gateway| gateway run|$)"; return [ "[ -f ~/.bashrc ] && . 
~/.bashrc;", - `_GW_CODE=$(curl -so /dev/null -w '%{http_code}' --max-time 3 http://127.0.0.1:${port}/health 2>/dev/null || echo 000); case "$_GW_CODE" in 200|401) echo ALREADY_RUNNING; exit 0 ;; esac;`, - "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;", ...buildGatewayLogSetup(true, "gateway"), buildGatewayLogSelection(), - `_GATEWAY_PROC_PATTERN=${shellQuote(staleGatewayPattern)};`, - 'if [ -n "$_GATEWAY_PROC_PATTERN" ]; then pkill -TERM -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; pkill -KILL -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; if pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1; then echo GATEWAY_STALE_PROCESSES; exit 1; fi; fi;', - "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", ...buildGatewayGuardRecoveryLines(), '[ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: NODE_OPTIONS missing safety-net preload or ciao preload after trusted recovery - refusing unguarded gateway relaunch (#2478/#2701)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', + `_GW_CODE=$(curl -so /dev/null -w '%{http_code}' --max-time 3 http://127.0.0.1:${port}/health 2>/dev/null || echo 000); case "$_GW_CODE" in 200|401) echo ALREADY_RUNNING; exit 0 ;; esac;`, + "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;", + `_GATEWAY_PROC_PATTERN=${shellQuote(staleGatewayPattern)};`, + 'if [ -n "$_GATEWAY_PROC_PATTERN" ]; then pkill -TERM -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; pkill -KILL -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; if pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 
2>&1; then echo GATEWAY_STALE_PROCESSES; exit 1; fi; fi;', 'OPENCLAW="$(command -v openclaw)";', 'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;', gatewayLaunchCommand('"$OPENCLAW" gateway run --port ' + port, "gateway"), @@ -257,24 +256,22 @@ export function buildRecoveryScript( `${hermesLaunchEnv}${configuredGatewayCommand}${isHermes ? "" : ` --port ${port}`}`, ); - // Source /tmp/nemoclaw-proxy-env.sh immediately before launching. Recovery - // restores the critical safety-net and ciao preloads from packaged trusted - // sources when the env file is missing, and refuses to relaunch if those - // guards cannot be staged safely. Recovery also stops stale launcher/gateway - // processes that may have respawned between the health probe and relaunch. + // Validate or rebuild /tmp/nemoclaw-proxy-env.sh before the health fast path so + // a healthy gateway cannot leave a wiped guard chain unrepaired. Recovery also + // stops stale launcher/gateway processes that may have respawned between the + // health probe and relaunch. return [ "[ -f ~/.bashrc ] && . ~/.bashrc;", hermesHome, ...(isHermes ? 
[buildHermesEnvFileBoundaryGuard()] : []), - `_GW_CODE=$(curl -so /dev/null -w '%{http_code}' --max-time 3 ${shellQuote(probeUrl)} 2>/dev/null || echo 000); case "$_GW_CODE" in 200|401) echo ALREADY_RUNNING; exit 0 ;; esac;`, ...buildGatewayLogSetup(false), buildGatewayLogSelection(), + ...buildGatewayGuardRecoveryLines(), + '[ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: NODE_OPTIONS missing safety-net preload or ciao preload after trusted recovery - refusing unguarded gateway relaunch (#2478/#2701)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', + `_GW_CODE=$(curl -so /dev/null -w '%{http_code}' --max-time 3 ${shellQuote(probeUrl)} 2>/dev/null || echo 000); case "$_GW_CODE" in 200|401) echo ALREADY_RUNNING; exit 0 ;; esac;`, `_GATEWAY_PROC_PATTERN=${shellQuote(staleGatewayPattern)};`, 'if [ -n "$_GATEWAY_PROC_PATTERN" ]; then pkill -TERM -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; pkill -KILL -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; if pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1; then echo GATEWAY_STALE_PROCESSES; exit 1; fi; fi;', ...validationSteps, - "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", - ...buildGatewayGuardRecoveryLines(), - '[ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: NODE_OPTIONS missing safety-net preload or ciao preload after trusted recovery - refusing unguarded gateway relaunch (#2478/#2701)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', ...(isHermes ? 
[buildHermesRuntimeEnvBoundaryGuard()] : []), launchCommand, "GPID=$!; sleep 2;", diff --git a/test/helpers/runtime-recovery-preload-test-helpers.ts b/test/helpers/runtime-recovery-preload-test-helpers.ts new file mode 100644 index 0000000000..7bc932088c --- /dev/null +++ b/test/helpers/runtime-recovery-preload-test-helpers.ts @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import path from "node:path"; +import { GATEWAY_PRELOAD_GUARDS } from "../../dist/lib/agent/runtime-recovery-preload"; + +const [SAFETY_NET_GUARD, CIAO_GUARD] = GATEWAY_PRELOAD_GUARDS; + +export interface RecoveryPreloadHarnessPaths { + preloadSourceSafetyNet: string; + preloadSourceCiao: string; + preloadTmpSafetyNet: string; + preloadTmpCiao: string; +} + +export function createRecoveryPreloadHarnessPaths(root: string): RecoveryPreloadHarnessPaths { + const paths = { + preloadSourceSafetyNet: path.join(root, "preload-source-safety-net.js"), + preloadSourceCiao: path.join(root, "preload-source-ciao.js"), + preloadTmpSafetyNet: path.join(root, "preload-tmp-safety-net.js"), + preloadTmpCiao: path.join(root, "preload-tmp-ciao.js"), + }; + fs.writeFileSync(paths.preloadSourceSafetyNet, "module.exports = 'trusted safety net';\n"); + fs.writeFileSync(paths.preloadSourceCiao, "module.exports = 'trusted ciao guard';\n"); + return paths; +} + +export function rewriteRecoveryPreloadPaths( + script: string, + paths: RecoveryPreloadHarnessPaths, +): string { + return script + .replaceAll(SAFETY_NET_GUARD.tmpPath, paths.preloadTmpSafetyNet) + .replaceAll(SAFETY_NET_GUARD.sourcePath, paths.preloadSourceSafetyNet) + .replaceAll(CIAO_GUARD.tmpPath, paths.preloadTmpCiao) + .replaceAll(CIAO_GUARD.sourcePath, paths.preloadSourceCiao); +} From ea191c375b48d6260d0a3132b222fe9f4efd75d3 Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Fri, 12 Jun 2026 00:50:41 -0700 Subject: 
[PATCH 4/8] test(agent): avoid recovery harness file race --- .../agent/runtime-recovery-preload.test.ts | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/lib/agent/runtime-recovery-preload.test.ts b/src/lib/agent/runtime-recovery-preload.test.ts index 7f4ad08aba..03a10bee6c 100644 --- a/src/lib/agent/runtime-recovery-preload.test.ts +++ b/src/lib/agent/runtime-recovery-preload.test.ts @@ -113,11 +113,31 @@ function runGuardRecovery(opts: { HOME: paths.root, }, }); - const readIfExists = (pathname: string) => - fs.existsSync(pathname) && fs.statSync(pathname).isFile() - ? fs.readFileSync(pathname, "utf-8") - : null; - const modeIfExists = (pathname: string) => (fs.existsSync(pathname) ? mode(pathname) : null); + const readIfExists = (pathname: string) => { + try { + return fs.readFileSync(pathname, "utf-8"); + } catch (error) { + const code = (error as NodeJS.ErrnoException).code; + if (code === "ENOENT" || code === "EISDIR") return null; + throw error; + } + }; + const modeIfExists = (pathname: string) => { + try { + return mode(pathname); + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "ENOENT") return null; + throw error; + } + }; + const symlinkIfExists = (pathname: string) => { + try { + return fs.lstatSync(pathname).isSymbolicLink(); + } catch (error) { + if ((error as NodeJS.ErrnoException).code === "ENOENT") return null; + throw error; + } + }; return { ...result, paths, @@ -129,12 +149,8 @@ function runGuardRecovery(opts: { tmpSafetyNetMode: modeIfExists(paths.tmpSafetyNet), tmpCiaoMode: modeIfExists(paths.tmpCiao), proxyEnvMode: modeIfExists(paths.proxyEnv), - tmpSafetyNetIsSymlink: fs.existsSync(paths.tmpSafetyNet) - ? fs.lstatSync(paths.tmpSafetyNet).isSymbolicLink() - : null, - proxyEnvIsSymlink: fs.existsSync(paths.proxyEnv) - ? 
fs.lstatSync(paths.proxyEnv).isSymbolicLink() - : null, + tmpSafetyNetIsSymlink: symlinkIfExists(paths.tmpSafetyNet), + proxyEnvIsSymlink: symlinkIfExists(paths.proxyEnv), hostileProxyEnvSourced: fs.existsSync(paths.hostileMarker), }, }; From f0418cf1e0a7b17470c695b7ba71f0ac4ad0d30d Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Fri, 12 Jun 2026 01:10:08 -0700 Subject: [PATCH 5/8] test(agent): cover recovery helper under sh --- .../agent/runtime-recovery-preload.test.ts | 55 +++++++++++++++++-- src/lib/agent/runtime-recovery-preload.ts | 5 +- src/lib/agent/runtime.test.ts | 42 -------------- 3 files changed, 51 insertions(+), 51 deletions(-) diff --git a/src/lib/agent/runtime-recovery-preload.test.ts b/src/lib/agent/runtime-recovery-preload.test.ts index 03a10bee6c..0119c50dd0 100644 --- a/src/lib/agent/runtime-recovery-preload.test.ts +++ b/src/lib/agent/runtime-recovery-preload.test.ts @@ -17,7 +17,7 @@ const [SAFETY_NET_GUARD, CIAO_GUARD] = GATEWAY_PRELOAD_GUARDS; function writeStub(dir: string, name: string, body: string) { const stub = path.join(dir, name); - fs.writeFileSync(stub, `#!/usr/bin/env bash\n${body}\n`, { mode: 0o755 }); + fs.writeFileSync(stub, `#!/usr/bin/env sh\n${body}\n`, { mode: 0o755 }); return stub; } @@ -61,6 +61,7 @@ function runGuardRecovery(opts: { proxyEnvContent?: string; beforeScript?: (paths: ReturnType) => void; fakeRoot?: boolean; + shell?: "bash" | "sh"; }) { const paths = makeHarness(); if (opts.fakeRoot) { @@ -89,7 +90,6 @@ function runGuardRecovery(opts: { const script = rewriteRuntimePaths( [ - "#!/usr/bin/env bash", "set -u", `export _GATEWAY_LOG=${JSON.stringify(paths.gatewayLog)}`, ': > "$_GATEWAY_LOG"', @@ -101,11 +101,8 @@ function runGuardRecovery(opts: { ].join("\n"), paths, ); - const scriptPath = path.join(paths.root, "recovery.sh"); - fs.writeFileSync(scriptPath, script, { mode: 0o700 }); - try { - const result = spawnSync("bash", [scriptPath], { + const result = spawnSync(opts.shell ?? 
"sh", ["-c", script], { encoding: "utf-8", timeout: 10000, env: { @@ -164,6 +161,15 @@ function mode(pathname: string): number { } describe("gateway recovery preload repair", () => { + it("runs the generated recovery helper through /bin/sh -c", () => { + const result = runGuardRecovery({ fakeRoot: true, shell: "sh" }); + expect(result.status).toBe(0); + expect(result.stdout).toContain("PE_MISSING=0"); + expect(result.stderr).not.toContain("Bad substitution"); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpCiao}`); + }); + it("restores missing proxy-env.sh from trusted packaged preloads", () => { const result = runGuardRecovery({}); expect(result.status).toBe(0); @@ -300,4 +306,41 @@ describe("gateway recovery preload repair", () => { expect(script).not.toContain("gateway launching without library guards"); } }); + + it("validates proxy-env.sh before sourcing it in gateway recovery scripts", () => { + for (const script of [ + buildRecoveryScript(minimalAgent, 19000), + buildOpenClawRecoveryScript(18789), + ]) { + expect(script).not.toContain("[ -r /tmp/nemoclaw-proxy-env.sh ]; then ."); + const validateIdx = script!.indexOf( + "_nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh", + ); + const sourceIdx = script!.indexOf("then . /tmp/nemoclaw-proxy-env.sh"); + expect(validateIdx).toBeGreaterThanOrEqual(0); + expect(sourceIdx).toBeGreaterThanOrEqual(0); + expect(validateIdx).toBeLessThan(sourceIdx); + } + }); + + it("repairs guard files before health probing and the ALREADY_RUNNING fast path", () => { + for (const script of [ + buildRecoveryScript(minimalAgent, 19000), + buildOpenClawRecoveryScript(18789), + ]) { + expect(script).not.toBeNull(); + const repairIdx = script!.indexOf("_NEMOCLAW_CRITICAL_GUARDS_READY=1"); + const sourceIdx = script!.indexOf("then . 
/tmp/nemoclaw-proxy-env.sh"); + const healthIdx = script!.indexOf("_GW_CODE="); + const alreadyRunningIdx = script!.indexOf("echo ALREADY_RUNNING; exit 0"); + expect(repairIdx).toBeGreaterThanOrEqual(0); + expect(sourceIdx).toBeGreaterThanOrEqual(0); + expect(healthIdx).toBeGreaterThanOrEqual(0); + expect(alreadyRunningIdx).toBeGreaterThanOrEqual(0); + expect(sourceIdx).toBeLessThan(repairIdx); + expect(repairIdx).toBeLessThan(healthIdx); + expect(sourceIdx).toBeLessThan(healthIdx); + expect(healthIdx).toBeLessThan(alreadyRunningIdx); + } + }); }); diff --git a/src/lib/agent/runtime-recovery-preload.ts b/src/lib/agent/runtime-recovery-preload.ts index 0c4e71b767..918cff5bb2 100644 --- a/src/lib/agent/runtime-recovery-preload.ts +++ b/src/lib/agent/runtime-recovery-preload.ts @@ -64,11 +64,10 @@ export function buildGatewayGuardRecoveryLines(): string[] { "return 1;", "};", "_nemoclaw_mode_group_or_other_writable() {", - 'local perms="$1"; local go;', + 'local perms="$1";', 'case "$perms" in ""|*[!0-7]*) return 0 ;; esac;', 'while [ "${#perms}" -lt 3 ]; do perms="0$perms"; done;', - 'go="${perms: -2}";', - 'case "$go" in [2367]*|?[2367]) return 0 ;; *) return 1 ;; esac;', + 'case "$perms" in *[2367]?|*[2367]) return 0 ;; *) return 1 ;; esac;', "};", "_nemoclaw_append_node_require() {", 'local wanted="$1";', diff --git a/src/lib/agent/runtime.test.ts b/src/lib/agent/runtime.test.ts index bea6908502..5f46ed7521 100644 --- a/src/lib/agent/runtime.test.ts +++ b/src/lib/agent/runtime.test.ts @@ -228,35 +228,6 @@ describe("buildRecoveryScript", () => { expect(script).toContain("or ciao preload"); }); - it("validates proxy-env.sh before sourcing it", () => { - const script = buildRecoveryScript(minimalAgent, 19000); - expect(script).not.toContain("[ -r /tmp/nemoclaw-proxy-env.sh ]; then ."); - const validateIdx = script!.indexOf( - "_nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh", - ); - const sourceIdx = script!.indexOf("then . 
/tmp/nemoclaw-proxy-env.sh"); - expect(validateIdx).toBeGreaterThanOrEqual(0); - expect(sourceIdx).toBeGreaterThanOrEqual(0); - expect(validateIdx).toBeLessThan(sourceIdx); - }); - - it("repairs guard files before the ALREADY_RUNNING fast path", () => { - for (const script of [ - buildRecoveryScript(minimalAgent, 19000), - buildOpenClawRecoveryScript(18789), - ]) { - expect(script).not.toBeNull(); - const repairIdx = script!.indexOf("_NEMOCLAW_CRITICAL_GUARDS_READY=1"); - const healthIdx = script!.indexOf("_GW_CODE="); - const alreadyRunningIdx = script!.indexOf("echo ALREADY_RUNNING; exit 0"); - expect(repairIdx).toBeGreaterThanOrEqual(0); - expect(healthIdx).toBeGreaterThanOrEqual(0); - expect(alreadyRunningIdx).toBeGreaterThanOrEqual(0); - expect(repairIdx).toBeLessThan(healthIdx); - expect(healthIdx).toBeLessThan(alreadyRunningIdx); - } - }); - it("stops stale launcher and gateway processes before relaunch", () => { const script = buildRecoveryScript(minimalAgent, 19000); expect(script).toContain( @@ -267,19 +238,6 @@ describe("buildRecoveryScript", () => { expect(script).toContain("GATEWAY_STALE_PROCESSES"); }); - it("sources proxy-env.sh before health probing and launching the gateway binary", () => { - const script = buildRecoveryScript(minimalAgent, 19000); - expect(script).not.toBeNull(); - const sourceIdx = script!.indexOf("then . 
/tmp/nemoclaw-proxy-env.sh"); - const healthIdx = script!.indexOf("_GW_CODE="); - const launchIdx = script!.indexOf("nohup"); - expect(sourceIdx).toBeGreaterThanOrEqual(0); - expect(healthIdx).toBeGreaterThanOrEqual(0); - expect(launchIdx).toBeGreaterThanOrEqual(0); - expect(sourceIdx).toBeLessThan(healthIdx); - expect(sourceIdx).toBeLessThan(launchIdx); - }); - it("fails recovery when trusted guard restoration cannot install required guards", () => { const script = buildRecoveryScript(minimalAgent, 19000); expect(script).toContain("_NEMOCLAW_CRITICAL_GUARDS_READY"); From 35c188c949fac64667f35781dd406acbdc5692bc Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Fri, 12 Jun 2026 01:24:04 -0700 Subject: [PATCH 6/8] test(e2e): document guard recovery coverage scope --- .../live/gateway-guard-recovery.test.ts | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/test/e2e-scenario/live/gateway-guard-recovery.test.ts b/test/e2e-scenario/live/gateway-guard-recovery.test.ts index 9a4ac8312f..3a26756765 100644 --- a/test/e2e-scenario/live/gateway-guard-recovery.test.ts +++ b/test/e2e-scenario/live/gateway-guard-recovery.test.ts @@ -22,6 +22,24 @@ * "guards are present after recovery." The aarch64 ciao crash is a * downstream consequence of the same broken contract. * + * #2701 acceptance scope for this PR: + * - Covered: the default OpenClaw production recovery route + * (`nemoclaw connect --probe-only` → + * checkAndRecoverSandboxProcesses() → `openshell sandbox exec -- sh -c` + * buildOpenClawRecoveryScript()) after the pod-recreate-equivalent state + * of an empty guard-chain `/tmp` plus no running gateway process. This + * proves the user no longer needs `nemoclaw rebuild --yes` for + * that recovered runtime state. 
+ * - Deliberately out of scope for this merge gate: physical DGX Spark / + * GB10 / aarch64 hardware, provider breadth beyond `cloud-openclaw`, and + * destructive host reboot / OOM / supervisor crash / manual + * `kubectl delete pod` triggers. The current live Vitest runner exposes a + * Docker-driver OpenShell sandbox and does not provide a stable per-test + * Kubernetes pod handle that can be deleted without destabilizing shared + * gateway state. Those trigger/hardware/provider clauses need a dedicated + * platform-runtime job; this test locks down the shared recovery contract + * they all depend on. + * * The corresponding legacy bash phase remains in * `test/e2e/test-issue-2478-crash-loop-recovery.sh` Phase 4 with both the * #2478 WARNING assertion (current contract) and the new #2701 guard-chain @@ -61,6 +79,19 @@ test("gateway recovery restores /tmp guard chain after pod-recreate wipe (#2701) runner: "vitest", boundary: "sandbox-lifecycle", issues: ["#2701", "#2478"], + acceptanceCoverage: { + covered: [ + "production connect --probe-only recovery route", + "openshell sandbox exec -- sh -c OpenClaw recovery script", + "pod-recreate-equivalent empty /tmp guard chain plus missing gateway process", + "no rebuild required for the recovered runtime state", + ], + intentionallyOutOfScope: [ + "DGX Spark / GB10 / aarch64 hardware matrix", + "provider breadth beyond cloud-openclaw", + "host reboot / OOM / supervisor crash / manual kubectl delete pod triggers", + ], + }, }); // ── Setup ──────────────────────────────────────────────────────── @@ -73,8 +104,10 @@ test("gateway recovery restores /tmp guard chain after pod-recreate wipe (#2701) await gateway.expectGuardChainActive(instance); // ── Disrupt ────────────────────────────────────────────────────── - // Same shape as a fresh container after pod recreate: /tmp is empty of - // the guard chain, and the openclaw process tree is gone. 
+ // Deterministic pod-recreate-equivalent state: /tmp is empty of the guard + // chain, and the OpenClaw process tree is gone. This avoids coupling the + // merge gate to a host-specific pod/container delete primitive while still + // exercising the production sandbox-exec recovery route below. await sandbox.wipeGuardChain(instance.sandboxName); await sandbox.killGatewayTree(instance.sandboxName); From 9e5653de5ede6e7fd715276ea202d1759772387f Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Fri, 12 Jun 2026 08:36:26 -0700 Subject: [PATCH 7/8] fix(agent): persist incomplete recovery proxy env --- .../agent/runtime-recovery-preload.test.ts | 61 ++++++++++++------- src/lib/agent/runtime-recovery-preload.ts | 10 +++ 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/src/lib/agent/runtime-recovery-preload.test.ts b/src/lib/agent/runtime-recovery-preload.test.ts index 0119c50dd0..3559f4fff0 100644 --- a/src/lib/agent/runtime-recovery-preload.test.ts +++ b/src/lib/agent/runtime-recovery-preload.test.ts @@ -79,28 +79,29 @@ function runGuardRecovery(opts: { } opts.beforeScript?.(paths); - const writeProxyEnv = opts.proxyEnvContent + const proxyEnvContent = opts.proxyEnvContent + ? rewriteRuntimePaths(opts.proxyEnvContent, paths) + : undefined; + const writeProxyEnv = proxyEnvContent ? 
[ `cat > ${JSON.stringify(paths.proxyEnv)} <<'PROXYENV'`, - opts.proxyEnvContent, + proxyEnvContent, "PROXYENV", `chmod 444 ${JSON.stringify(paths.proxyEnv)}`, ] : []; - const script = rewriteRuntimePaths( - [ - "set -u", - `export _GATEWAY_LOG=${JSON.stringify(paths.gatewayLog)}`, - ': > "$_GATEWAY_LOG"', - ...writeProxyEnv, - ...buildGatewayGuardRecoveryLines(), - 'if [ "$_GUARDS_MISSING" = "1" ]; then echo GUARDS_MISSING; exit 17; fi', - 'printf "PE_MISSING=%s\\n" "$_PE_MISSING"', - 'printf "NODE_OPTIONS=%s\\n" "$NODE_OPTIONS"', - ].join("\n"), - paths, - ); + const recoveryLines = rewriteRuntimePaths(buildGatewayGuardRecoveryLines().join("\n"), paths); + const script = [ + "set -u", + `export _GATEWAY_LOG=${JSON.stringify(paths.gatewayLog)}`, + ': > "$_GATEWAY_LOG"', + ...writeProxyEnv, + recoveryLines, + 'if [ "$_GUARDS_MISSING" = "1" ]; then echo GUARDS_MISSING; exit 17; fi', + 'printf "PE_MISSING=%s\\n" "$_PE_MISSING"', + 'printf "NODE_OPTIONS=%s\\n" "$NODE_OPTIONS"', + ].join("\n"); try { const result = spawnSync(opts.shell ?? 
"sh", ["-c", script], { encoding: "utf-8", @@ -207,6 +208,19 @@ describe("gateway recovery preload repair", () => { expect(nodeOptions.match(new RegExp(result.paths.tmpCiao, "g"))?.length).toBe(1); }); + it("persists repairs for a metadata-safe proxy-env.sh that is missing one guard", () => { + const result = runGuardRecovery({ + proxyEnvContent: `export NODE_OPTIONS="--require ${SAFETY_NET_GUARD.tmpPath}"\n`, + }); + expect(result.status).toBe(0); + expect(result.stdout).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.stdout).toContain(`--require ${result.paths.tmpCiao}`); + expect(result.files.proxyEnvMode).toBe(0o444); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpCiao}`); + expect(result.files.gatewayLog).toContain("proxy-env.sh incomplete"); + }); + it("rebuilds an unsafe proxy-env.sh without sourcing attacker-controlled content", () => { const result = runGuardRecovery({ beforeScript(paths) { @@ -329,17 +343,20 @@ describe("gateway recovery preload repair", () => { buildOpenClawRecoveryScript(18789), ]) { expect(script).not.toBeNull(); - const repairIdx = script!.indexOf("_NEMOCLAW_CRITICAL_GUARDS_READY=1"); - const sourceIdx = script!.indexOf("then . 
/tmp/nemoclaw-proxy-env.sh"); + const safetyNetStageIdx = script!.indexOf( + `_nemoclaw_stage_recovery_preload ${SAFETY_NET_GUARD.tmpPath} ${SAFETY_NET_GUARD.sourcePath}`, + ); + const ciaoStageIdx = script!.indexOf( + `_nemoclaw_stage_recovery_preload ${CIAO_GUARD.tmpPath} ${CIAO_GUARD.sourcePath}`, + ); const healthIdx = script!.indexOf("_GW_CODE="); const alreadyRunningIdx = script!.indexOf("echo ALREADY_RUNNING; exit 0"); - expect(repairIdx).toBeGreaterThanOrEqual(0); - expect(sourceIdx).toBeGreaterThanOrEqual(0); + expect(safetyNetStageIdx).toBeGreaterThanOrEqual(0); + expect(ciaoStageIdx).toBeGreaterThanOrEqual(0); expect(healthIdx).toBeGreaterThanOrEqual(0); expect(alreadyRunningIdx).toBeGreaterThanOrEqual(0); - expect(sourceIdx).toBeLessThan(repairIdx); - expect(repairIdx).toBeLessThan(healthIdx); - expect(sourceIdx).toBeLessThan(healthIdx); + expect(safetyNetStageIdx).toBeLessThan(healthIdx); + expect(ciaoStageIdx).toBeLessThan(healthIdx); expect(healthIdx).toBeLessThan(alreadyRunningIdx); } }); diff --git a/src/lib/agent/runtime-recovery-preload.ts b/src/lib/agent/runtime-recovery-preload.ts index 918cff5bb2..068250c081 100644 --- a/src/lib/agent/runtime-recovery-preload.ts +++ b/src/lib/agent/runtime-recovery-preload.ts @@ -43,6 +43,10 @@ export function buildGatewayGuardRecoveryLines(): string[] { ({ tmpPath }) => `if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ]; then _nemoclaw_append_node_require ${tmpPath}; fi;`, ); + const proxyEnvRewriteChecks = GATEWAY_PRELOAD_GUARDS.map( + ({ tmpPath }) => + `_nemoclaw_node_options_has_require ${tmpPath} || _PROXY_ENV_REWRITE_NEEDED=1;`, + ); const guardChecks = GATEWAY_PRELOAD_GUARDS.map( ({ tmpPath }) => `_nemoclaw_node_options_has_require ${tmpPath} || _GUARDS_MISSING=1;`, ); @@ -144,6 +148,12 @@ export function buildGatewayGuardRecoveryLines(): string[] { "_nemoclaw_write_recovered_proxy_env || _NEMOCLAW_CRITICAL_GUARDS_READY=0;", 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ]; then . 
/tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; fi;', "fi;", + "_PROXY_ENV_REWRITE_NEEDED=0;", + ...proxyEnvRewriteChecks, + 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ] && [ "${_PE_MISSING:-0}" = "0" ] && [ "${_PROXY_ENV_REWRITE_NEEDED:-0}" = "1" ]; then', + '_W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh incomplete - persisting library guards from packaged preloads (#2478/#2701)"; _nemoclaw_recovery_log "$_W";', + "_nemoclaw_write_recovered_proxy_env || _NEMOCLAW_CRITICAL_GUARDS_READY=0;", + "fi;", ...appendCalls, "_GUARDS_MISSING=0;", 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" != "1" ]; then _GUARDS_MISSING=1; fi;', From 03a3168a9970204eb2dc2645c06ef697a469766e Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Fri, 12 Jun 2026 08:49:17 -0700 Subject: [PATCH 8/8] fix(agent): validate guard env before shell init --- .../agent/runtime-recovery-preload.test.ts | 19 +++++++++++++++++++ src/lib/agent/runtime.ts | 12 ++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/lib/agent/runtime-recovery-preload.test.ts b/src/lib/agent/runtime-recovery-preload.test.ts index 3559f4fff0..255b0a5553 100644 --- a/src/lib/agent/runtime-recovery-preload.test.ts +++ b/src/lib/agent/runtime-recovery-preload.test.ts @@ -337,6 +337,25 @@ describe("gateway recovery preload repair", () => { } }); + it("validates proxy-env.sh before shell init hooks can source it", () => { + for (const script of [ + buildRecoveryScript(minimalAgent, 19000), + buildOpenClawRecoveryScript(18789), + ]) { + expect(script).not.toBeNull(); + const validateIdx = script!.indexOf( + "_nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh", + ); + const bashrcIdx = script!.indexOf("[ -f ~/.bashrc ] && . 
~/.bashrc;"); + const healthIdx = script!.indexOf("_GW_CODE="); + expect(validateIdx).toBeGreaterThanOrEqual(0); + expect(bashrcIdx).toBeGreaterThanOrEqual(0); + expect(healthIdx).toBeGreaterThanOrEqual(0); + expect(validateIdx).toBeLessThan(bashrcIdx); + expect(bashrcIdx).toBeLessThan(healthIdx); + } + }); + it("repairs guard files before health probing and the ALREADY_RUNNING fast path", () => { for (const script of [ buildRecoveryScript(minimalAgent, 19000), diff --git a/src/lib/agent/runtime.ts b/src/lib/agent/runtime.ts index 8426029d30..bd8e515efc 100644 --- a/src/lib/agent/runtime.ts +++ b/src/lib/agent/runtime.ts @@ -194,11 +194,11 @@ export function buildHermesDashboardProcessRecoveryScript( export function buildOpenClawRecoveryScript(port: number): string { const staleGatewayPattern = "[o]penclaw([ -]gateway| gateway run|$)"; return [ - "[ -f ~/.bashrc ] && . ~/.bashrc;", ...buildGatewayLogSetup(true, "gateway"), buildGatewayLogSelection(), ...buildGatewayGuardRecoveryLines(), '[ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: NODE_OPTIONS missing safety-net preload or ciao preload after trusted recovery - refusing unguarded gateway relaunch (#2478/#2701)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', + "[ -f ~/.bashrc ] && . ~/.bashrc;", `_GW_CODE=$(curl -so /dev/null -w '%{http_code}' --max-time 3 http://127.0.0.1:${port}/health 2>/dev/null || echo 000); case "$_GW_CODE" in 200|401) echo ALREADY_RUNNING; exit 0 ;; esac;`, "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;", `_GATEWAY_PROC_PATTERN=${shellQuote(staleGatewayPattern)};`, @@ -256,18 +256,18 @@ export function buildRecoveryScript( `${hermesLaunchEnv}${configuredGatewayCommand}${isHermes ? "" : ` --port ${port}`}`, ); - // Validate or rebuild /tmp/nemoclaw-proxy-env.sh before the health fast path so - // a healthy gateway cannot leave a wiped guard chain unrepaired. 
Recovery also - // stops stale launcher/gateway processes that may have respawned between the - // health probe and relaunch. + // Validate or rebuild /tmp/nemoclaw-proxy-env.sh before shell init and the + // health fast path so a healthy gateway cannot leave a wiped guard chain + // unrepaired. Recovery also stops stale launcher/gateway processes that may + // have respawned between the health probe and relaunch. return [ - "[ -f ~/.bashrc ] && . ~/.bashrc;", hermesHome, ...(isHermes ? [buildHermesEnvFileBoundaryGuard()] : []), ...buildGatewayLogSetup(false), buildGatewayLogSelection(), ...buildGatewayGuardRecoveryLines(), '[ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: NODE_OPTIONS missing safety-net preload or ciao preload after trusted recovery - refusing unguarded gateway relaunch (#2478/#2701)"; echo "$_E" >&2; echo "$_E" >> "$_GATEWAY_LOG"; exit 1; };', + "[ -f ~/.bashrc ] && . ~/.bashrc;", `_GW_CODE=$(curl -so /dev/null -w '%{http_code}' --max-time 3 ${shellQuote(probeUrl)} 2>/dev/null || echo 000); case "$_GW_CODE" in 200|401) echo ALREADY_RUNNING; exit 0 ;; esac;`, `_GATEWAY_PROC_PATTERN=${shellQuote(staleGatewayPattern)};`, 'if [ -n "$_GATEWAY_PROC_PATTERN" ]; then pkill -TERM -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; pkill -KILL -f "$_GATEWAY_PROC_PATTERN" 2>/dev/null || true; for _i in 1 2 3 4 5; do pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1 || break; sleep 1; done; if pgrep -f "$_GATEWAY_PROC_PATTERN" >/dev/null 2>&1; then echo GATEWAY_STALE_PROCESSES; exit 1; fi; fi;',