diff --git a/.github/workflows/nightly-e2e.yaml b/.github/workflows/nightly-e2e.yaml index d186ea2c88..14cc4c5497 100644 --- a/.github/workflows/nightly-e2e.yaml +++ b/.github/workflows/nightly-e2e.yaml @@ -1004,8 +1004,7 @@ jobs: timeout_minutes: 30 artifact_name: "issue-2478-crash-loop-recovery-install-log" artifact_path: "/tmp/nemoclaw-e2e-install.log" - env_json: '{"NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE":"1","NEMOCLAW_NON_INTERACTIVE":"1","NEMOCLAW_SANDBOX_NAME":"e2e-2478"}' - nvidia_api_key: true + env_json: '{"NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE":"1","NEMOCLAW_E2E_USE_COMPAT_MOCK":"1","NEMOCLAW_NON_INTERACTIVE":"1","NEMOCLAW_SANDBOX_NAME":"e2e-2478"}' github_token: true secrets: *nightly-e2e-default-secrets hermes-e2e: diff --git a/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts b/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts index fa7c15ef01..22138034f3 100644 --- a/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts +++ b/src/lib/agent/runtime-hermes-secret-boundary-behavioural.test.ts @@ -31,6 +31,20 @@ function writeStub(dir: string, name: string, body: string) { return stub; } +function removeTempDir(dir: string) { + fs.rmSync(dir, { recursive: true, force: true, maxRetries: 5, retryDelay: 100 }); +} + +function waitForPath(filePath: string, timeoutMs = 1000) { + const sleepView = new Int32Array(new SharedArrayBuffer(4)); + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + if (fs.existsSync(filePath)) return true; + Atomics.wait(sleepView, 0, 0, 10); + } + return fs.existsSync(filePath); +} + const SHARED_PYTHON_STUB_BY_MODE = [ 'if [ "$1" = "-c" ]; then', " exit 0", @@ -114,7 +128,7 @@ describe("Hermes secret-boundary guard — guard snippet behaviour", () => { : "", }; } finally { - fs.rmSync(tmp, { recursive: true, force: true }); + removeTempDir(tmp); } } @@ -207,7 +221,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = writeStub(stubsDir, "pgrep", "exit 1"); writeStub(stubsDir, "sleep", "exit 0"); writeStub(stubsDir, "curl", 'printf "000"\nexit 0'); - writeStub(stubsDir, "hermes", `: > ${JSON.stringify(hermesLaunchMarker)}\nexit 0`); + writeStub(stubsDir, "hermes", `: > ${JSON.stringify(hermesLaunchMarker)}\n/bin/sleep 5`); } function runRecovery( @@ -289,7 +303,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = expect(log).toContain("[SECURITY] Refusing Hermes startup"); expect(log).toContain("TELEGRAM_BOT_TOKEN (line 2)"); } finally { - fs.rmSync(harness.tmp, { recursive: true, force: true }); + removeTempDir(harness.tmp); } }); @@ -326,7 +340,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = expect(log).toContain("(line 2)"); expect(log).not.toContain("1234567890:AAExample-RawSecretValueHere"); } finally { - fs.rmSync(harness.tmp, { recursive: true, force: true }); + removeTempDir(harness.tmp); } }); @@ -375,7 +389,7 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = expect(log).toContain("TELEGRAM_BOT_TOKEN"); expect(log).not.toContain("1234567890:AAExample-RawSecretValueHere"); } finally { - fs.rmSync(harness.tmp, { recursive: true, force: true }); + removeTempDir(harness.tmp); } }); @@ -443,11 +457,11 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = expect(log).toContain("[SECURITY] Refusing Hermes startup because the process environment"); expect(log).toContain("TELEGRAM_BOT_TOKEN"); } finally { - fs.rmSync(harness.tmp, { recursive: true, force: true }); + removeTempDir(harness.tmp); } }); - it("refuses on runtime-env violation using the real validator against a proxy-env that exports a raw secret", () => { + it("does not import a raw secret from a metadata-safe proxy-env during runtime validation", () => { const harness = prepareRecoveryHarness("runtime-env-real"); const envFile = path.join(harness.tmp, "hermes-dot-env"); const proxyEnvFile = path.join(harness.tmp, "nemoclaw-proxy-env.sh"); @@ -460,8 +474,10 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = "hermes", "validate-env-secret-boundary.py", ); - // Clean .env so env-file passes, hostile proxy-env contributes the raw - // runtime-env secret that runtime-env validation must catch after sourcing. + // Clean .env so env-file passes. The hostile proxy-env used to contribute a + // raw runtime-env secret; recovery now rewrites that volatile shell file + // before sourcing it, so the runtime-env validator should never see the raw + // value. fs.writeFileSync(envFile, "API_SERVER_PORT=18642\n"); fs.writeFileSync( proxyEnvFile, @@ -481,15 +497,20 @@ describe("Hermes secret-boundary guard — full recovery script behaviour", () = envFilePath: envFile, proxyEnvPath: proxyEnvFile, }); - expect(result.status).toBe(1); - expect(result.stdout).toContain("SECRET_BOUNDARY_REFUSED"); - expect(fs.existsSync(harness.hermesLaunchMarker)).toBe(false); + expect(result.status).toBe(0); + expect(waitForPath(harness.hermesLaunchMarker)).toBe(true); + expect(result.stdout).not.toContain("SECRET_BOUNDARY_REFUSED"); + expect(result.stderr).not.toContain("TELEGRAM_BOT_TOKEN"); + const proxyEnv = fs.readFileSync(proxyEnvFile, "utf-8"); + expect(proxyEnv).not.toContain("TELEGRAM_BOT_TOKEN"); + expect(proxyEnv).toContain(harness.preloadTmpSafetyNet); + expect(proxyEnv).toContain(harness.preloadTmpCiao); const log = fs.readFileSync(harness.recoveryLogPath, "utf-8"); - expect(log).toContain("[SECURITY] Refusing Hermes startup because the process environment"); - expect(log).toContain("TELEGRAM_BOT_TOKEN"); + expect(log).not.toContain("[SECURITY] Refusing Hermes startup"); + expect(log).not.toContain("TELEGRAM_BOT_TOKEN"); expect(log).not.toContain("1234567890:AAExample-RawSecretValueHere"); } finally { - fs.rmSync(harness.tmp, { recursive: true, force: true }); + removeTempDir(harness.tmp); } }); }); diff --git a/src/lib/agent/runtime-hermes-secret-boundary-shape.test.ts b/src/lib/agent/runtime-hermes-secret-boundary-shape.test.ts index 6c4e6dd350..7eadcfc3a2 100644 --- a/src/lib/agent/runtime-hermes-secret-boundary-shape.test.ts +++ b/src/lib/agent/runtime-hermes-secret-boundary-shape.test.ts @@ -7,7 +7,7 @@ // tests that actually execute the synthesised script live in // runtime-hermes-secret-boundary-behavioural.test.ts. -import { describe, it, expect } from "vitest"; +import { describe, expect, it } from "vitest"; import { HERMES_SECRET_BOUNDARY_VALIDATOR_PATH } from "../../../dist/lib/agent/hermes-recovery-boundary"; import { buildHermesDashboardProcessRecoveryScript, @@ -26,10 +26,10 @@ describe("Hermes secret-boundary guard — generated shell shape", () => { expect(script).toContain(`python3 '${VALIDATOR_PATH}' env-file /sandbox/.hermes/.env`); }); - it("invokes the runtime-env validator after sourcing /tmp/nemoclaw-proxy-env.sh", () => { + it("invokes the runtime-env validator after sourcing the generated recovery env", () => { const script = buildRecoveryScript(hermesAgent, 8642); expect(script).not.toBeNull(); - const proxyEnvIdx = script!.indexOf(". /tmp/nemoclaw-proxy-env.sh"); + const proxyEnvIdx = script!.indexOf('. "$_NEMOCLAW_RECOVERY_SOURCE_ENV"'); const runtimeGuardIdx = script!.indexOf(`python3 '${VALIDATOR_PATH}' runtime-env`); const launchIdx = script!.indexOf("nohup"); expect(proxyEnvIdx).toBeGreaterThanOrEqual(0); @@ -77,23 +77,31 @@ describe("Hermes secret-boundary guard — generated shell shape", () => { expect(script).not.toContain("SECRET_BOUNDARY_REFUSED"); }); - it("guards the dashboard-only recovery path with env-file before sourcing and runtime-env after", () => { + it("guards the dashboard-only recovery path with env-file before generated sourcing and runtime-env after", () => { const script = buildHermesDashboardProcessRecoveryScript({ publicPort: 9119, internalPort: 19119, tuiEnabled: false, }); const envFileIdx = script.indexOf(`python3 '${VALIDATOR_PATH}' env-file /sandbox/.hermes/.env`); - const proxyEnvIdx = script.indexOf(". /tmp/nemoclaw-proxy-env.sh"); + const guardRecoveryIdx = script.indexOf("_nemoclaw_validate_recovery_proxy_env"); + const proxyEnvIdx = script.indexOf('. "$_NEMOCLAW_RECOVERY_SOURCE_ENV"'); + const bashrcIdx = script.indexOf("[ -f ~/.bashrc ] && . ~/.bashrc;"); const runtimeIdx = script.indexOf(`python3 '${VALIDATOR_PATH}' runtime-env`); const launchIdx = script.indexOf('"$AGENT_BIN" dashboard'); expect(envFileIdx).toBeGreaterThanOrEqual(0); + expect(guardRecoveryIdx).toBeGreaterThanOrEqual(0); expect(proxyEnvIdx).toBeGreaterThanOrEqual(0); + expect(bashrcIdx).toBeGreaterThanOrEqual(0); expect(runtimeIdx).toBeGreaterThanOrEqual(0); expect(launchIdx).toBeGreaterThanOrEqual(0); expect(envFileIdx).toBeLessThan(proxyEnvIdx); + expect(guardRecoveryIdx).toBeLessThan(proxyEnvIdx); expect(proxyEnvIdx).toBeLessThan(runtimeIdx); + expect(proxyEnvIdx).toBeLessThan(bashrcIdx); + expect(bashrcIdx).toBeLessThan(runtimeIdx); expect(runtimeIdx).toBeLessThan(launchIdx); + expect(script).not.toContain("if [ -r /tmp/nemoclaw-proxy-env.sh ]; then ."); expect(script).toContain("SECRET_BOUNDARY_REFUSED"); }); diff --git a/src/lib/agent/runtime-recovery-preload.test.ts b/src/lib/agent/runtime-recovery-preload.test.ts index 255b0a5553..cb51797ff7 100644 --- a/src/lib/agent/runtime-recovery-preload.test.ts +++ b/src/lib/agent/runtime-recovery-preload.test.ts @@ -38,6 +38,7 @@ function makeHarness() { tmpSafetyNet: path.join(workDir, "nemoclaw-sandbox-safety-net.js"), tmpCiao: path.join(workDir, "nemoclaw-ciao-network-guard.js"), proxyEnv: path.join(workDir, "nemoclaw-proxy-env.sh"), + recoverySourceEnv: path.join(workDir, "nemoclaw-recovered-proxy-env.sh"), gatewayLog: path.join(workDir, "gateway.log"), hostileMarker: path.join(root, "hostile-proxy-env-sourced"), }; @@ -54,11 +55,12 @@ function rewriteRuntimePaths(script: string, paths: ReturnType) => string); beforeScript?: (paths: ReturnType) => void; fakeRoot?: boolean; shell?: "bash" | "sh"; @@ -79,8 +81,10 @@ function runGuardRecovery(opts: { } opts.beforeScript?.(paths); - const proxyEnvContent = opts.proxyEnvContent - ? rewriteRuntimePaths(opts.proxyEnvContent, paths) + const rawProxyEnvContent = + typeof opts.proxyEnvContent === "function" ? opts.proxyEnvContent(paths) : opts.proxyEnvContent; + const proxyEnvContent = rawProxyEnvContent + ? rewriteRuntimePaths(rawProxyEnvContent, paths) : undefined; const writeProxyEnv = proxyEnvContent ? [ @@ -142,6 +146,7 @@ function runGuardRecovery(opts: { files: { gatewayLog: readIfExists(paths.gatewayLog) ?? "", proxyEnv: readIfExists(paths.proxyEnv), + recoverySourceEnv: readIfExists(paths.recoverySourceEnv), tmpSafetyNet: readIfExists(paths.tmpSafetyNet), tmpCiao: readIfExists(paths.tmpCiao), tmpSafetyNetMode: modeIfExists(paths.tmpSafetyNet), @@ -208,7 +213,81 @@ describe("gateway recovery preload repair", () => { expect(nodeOptions.match(new RegExp(result.paths.tmpCiao, "g"))?.length).toBe(1); }); - it("persists repairs for a metadata-safe proxy-env.sh that is missing one guard", () => { + it("rebuilds a metadata-safe proxy-env.sh without sourcing shell content", () => { + const result = runGuardRecovery({ + proxyEnvContent: (paths) => + [ + `touch ${JSON.stringify(paths.hostileMarker)}`, + `export NODE_OPTIONS="--require ${SAFETY_NET_GUARD.tmpPath} --require=${CIAO_GUARD.tmpPath}"`, + "", + ].join("\n"), + }); + expect(result.status).toBe(0); + expect(result.files.hostileProxyEnvSourced).toBe(false); + expect(result.files.proxyEnv).not.toContain("touch"); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpCiao}`); + }); + + it("preserves a trusted full proxy-env.sh while sourcing only a generated recovery copy", () => { + const result = runGuardRecovery({ + fakeRoot: true, + proxyEnvContent: [ + "# Proxy configuration (overrides narrow OpenShell defaults on connect)", + 'export OPENCLAW_GATEWAY_URL="ws://127.0.0.1:18789"', + "export OPENCLAW_GATEWAY_TOKEN='trusted-token'", + "# nemoclaw-configure-guard begin", + "openclaw() {", + ' command openclaw "$@"', + "}", + "# nemoclaw-configure-guard end", + `export NODE_OPTIONS="\${NODE_OPTIONS:+$NODE_OPTIONS }--require ${SAFETY_NET_GUARD.tmpPath}"`, + 'export NODE_OPTIONS="${NODE_OPTIONS:+$NODE_OPTIONS }--require /tmp/nemoclaw-http-proxy-fix.js"', + `export NODE_OPTIONS="\${NODE_OPTIONS:+$NODE_OPTIONS }--require ${CIAO_GUARD.tmpPath}"`, + "# Tool cache redirects — keep transient tool state under /tmp", + "export npm_config_cache=/tmp/.npm-cache", + "", + ].join("\n"), + }); + expect(result.status).toBe(0); + expect(result.files.proxyEnv).toContain("OPENCLAW_GATEWAY_TOKEN='trusted-token'"); + expect(result.files.proxyEnv).toContain("openclaw() {"); + expect(result.files.proxyEnv).toContain("/tmp/nemoclaw-http-proxy-fix.js"); + expect(result.files.recoverySourceEnv).toContain("OPENCLAW_GATEWAY_TOKEN='trusted-token'"); + expect(result.files.recoverySourceEnv).toContain("openclaw() {"); + expect(result.files.recoverySourceEnv).toContain("/tmp/nemoclaw-http-proxy-fix.js"); + const nodeOptions = result.stdout.match(/^NODE_OPTIONS=(.*)$/m)?.[1] ?? ""; + expect(nodeOptions.match(new RegExp(result.paths.tmpSafetyNet, "g"))?.length).toBe(1); + expect(nodeOptions.match(new RegExp(result.paths.tmpCiao, "g"))?.length).toBe(1); + }); + + it("repairs an incomplete trusted proxy-env.sh without dropping full runtime entries", () => { + const result = runGuardRecovery({ + fakeRoot: true, + proxyEnvContent: [ + "# Proxy configuration (overrides narrow OpenShell defaults on connect)", + 'export OPENCLAW_GATEWAY_URL="ws://127.0.0.1:18789"', + "export OPENCLAW_GATEWAY_TOKEN='trusted-token'", + "# nemoclaw-configure-guard begin", + "openclaw() {", + ' command openclaw "$@"', + "}", + "# nemoclaw-configure-guard end", + `export NODE_OPTIONS="\${NODE_OPTIONS:+$NODE_OPTIONS }--require ${SAFETY_NET_GUARD.tmpPath}"`, + 'export NODE_OPTIONS="${NODE_OPTIONS:+$NODE_OPTIONS }--require /tmp/nemoclaw-http-proxy-fix.js"', + "export npm_config_cache=/tmp/.npm-cache", + "", + ].join("\n"), + }); + expect(result.status).toBe(0); + expect(result.files.proxyEnv).toContain("OPENCLAW_GATEWAY_TOKEN='trusted-token'"); + expect(result.files.proxyEnv).toContain("openclaw() {"); + expect(result.files.proxyEnv).toContain("/tmp/nemoclaw-http-proxy-fix.js"); + expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpCiao}`); + expect(result.files.gatewayLog).toContain("proxy-env.sh incomplete"); + }); + + it("rewrites a non-root metadata-safe proxy-env.sh that is missing one guard", () => { const result = runGuardRecovery({ proxyEnvContent: `export NODE_OPTIONS="--require ${SAFETY_NET_GUARD.tmpPath}"\n`, }); @@ -218,7 +297,7 @@ describe("gateway recovery preload repair", () => { expect(result.files.proxyEnvMode).toBe(0o444); expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpSafetyNet}`); expect(result.files.proxyEnv).toContain(`--require ${result.paths.tmpCiao}`); - expect(result.files.gatewayLog).toContain("proxy-env.sh incomplete"); + expect(result.files.gatewayLog).toContain("proxy-env.sh missing or unsafe"); }); it("rebuilds an unsafe proxy-env.sh without sourcing attacker-controlled content", () => { @@ -297,6 +376,18 @@ describe("gateway recovery preload repair", () => { expect(result.files.gatewayLog).toContain("refusing preload install"); }); + it("refuses recovery when a trusted packaged preload source is group writable", () => { + const result = runGuardRecovery({ + beforeScript(paths) { + fs.chmodSync(paths.sourceCiao, 0o664); + }, + }); + expect(result.status).toBe(17); + expect(result.stdout).toContain("GUARDS_MISSING"); + expect(result.files.gatewayLog).toContain("trusted preload source"); + expect(result.files.gatewayLog).toContain("unsafe mode=664"); + }); + it("refuses recovery when a trusted packaged preload source is group writable in root mode", () => { const result = runGuardRecovery({ fakeRoot: true, @@ -330,7 +421,7 @@ describe("gateway recovery preload repair", () => { const validateIdx = script!.indexOf( "_nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh", ); - const sourceIdx = script!.indexOf("then . /tmp/nemoclaw-proxy-env.sh"); + const sourceIdx = script!.indexOf('then . "$_NEMOCLAW_RECOVERY_SOURCE_ENV"'); expect(validateIdx).toBeGreaterThanOrEqual(0); expect(sourceIdx).toBeGreaterThanOrEqual(0); expect(validateIdx).toBeLessThan(sourceIdx); diff --git a/src/lib/agent/runtime-recovery-preload.ts b/src/lib/agent/runtime-recovery-preload.ts index 068250c081..6c796d002d 100644 --- a/src/lib/agent/runtime-recovery-preload.ts +++ b/src/lib/agent/runtime-recovery-preload.ts @@ -23,17 +23,15 @@ export const GATEWAY_PRELOAD_GUARDS: ReadonlyArray<{ * Build shell lines that restore and validate the required recovery preloads. * * The recovery script must not source /tmp/nemoclaw-proxy-env.sh before these - * lines. This helper validates the env file first, rebuilds it when it is - * missing or unsafe, then sources only the trusted result. The recovered env is - * intentionally minimal: it restores the critical Node preload guard chain that - * recovery itself requires. The normal sandbox entrypoint remains the source of - * truth for the full proxy/tool/token environment and refreshes that file on a - * regular sandbox restart. + * lines. This helper inspects any existing env file only for diagnostics, then + * stages trusted preloads, writes a fresh generated recovery source, and sources + * only that generated result. A trusted root-owned proxy env keeps its startup + * contract intact; unsafe or incomplete files are regenerated from trusted + * preloads. */ export function buildGatewayGuardRecoveryLines(): string[] { - const recoveredProxyEnvExports = GATEWAY_PRELOAD_GUARDS.map( - ({ tmpPath }) => - `printf '%s\\n' 'export NODE_OPTIONS="\${NODE_OPTIONS:+$NODE_OPTIONS }--require ${tmpPath}"';`, + const emitRecoveredProxyEnvRequires = GATEWAY_PRELOAD_GUARDS.map( + ({ tmpPath }) => `_nemoclaw_emit_recovery_node_require "$preserve_file" ${tmpPath};`, ); const stageCalls = GATEWAY_PRELOAD_GUARDS.map( ({ tmpPath, sourcePath }) => @@ -45,7 +43,7 @@ export function buildGatewayGuardRecoveryLines(): string[] { ); const proxyEnvRewriteChecks = GATEWAY_PRELOAD_GUARDS.map( ({ tmpPath }) => - `_nemoclaw_node_options_has_require ${tmpPath} || _PROXY_ENV_REWRITE_NEEDED=1;`, + `_nemoclaw_proxy_env_file_has_require /tmp/nemoclaw-proxy-env.sh ${tmpPath} || _PROXY_ENV_REWRITE_NEEDED=1;`, ); const guardChecks = GATEWAY_PRELOAD_GUARDS.map( ({ tmpPath }) => `_nemoclaw_node_options_has_require ${tmpPath} || _GUARDS_MISSING=1;`, @@ -77,6 +75,18 @@ export function buildGatewayGuardRecoveryLines(): string[] { 'local wanted="$1";', 'if ! _nemoclaw_node_options_has_require "$wanted"; then export NODE_OPTIONS="${NODE_OPTIONS:+$NODE_OPTIONS }--require $wanted"; fi;', "};", + "_nemoclaw_proxy_env_file_has_require() {", + 'local env_file="$1"; local wanted="$2";', + '[ -r "$env_file" ] || return 1;', + 'grep -F -- "--require $wanted" "$env_file" >/dev/null 2>&1 && return 0;', + 'grep -F -- "--require=$wanted" "$env_file" >/dev/null 2>&1 && return 0;', + "return 1;", + "};", + "_nemoclaw_emit_recovery_node_require() {", + 'local preserve_file="$1"; local wanted="$2";', + 'if [ -n "$preserve_file" ] && _nemoclaw_proxy_env_file_has_require "$preserve_file" "$wanted"; then return 0; fi;', + 'printf \'%s\\n\' "export NODE_OPTIONS=\\"\\${NODE_OPTIONS:+\\$NODE_OPTIONS }--require $wanted\\"";', + "};", "_nemoclaw_validate_recovery_preload() {", 'local file="$1"; local perms owner _msg;', 'if [ -L "$file" ]; then _msg="[gateway-recovery] ERROR: $file is a symlink - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', @@ -93,10 +103,10 @@ export function buildGatewayGuardRecoveryLines(): string[] { 'local src="$1"; local perms owner _msg;', 'if [ ! -r "$src" ] || [ -L "$src" ] || [ ! -f "$src" ]; then _msg="[gateway-recovery] ERROR: trusted preload source $src unavailable - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', 'perms="$(stat -c %a "$src" 2>/dev/null || stat -f %Lp "$src" 2>/dev/null || echo unknown)";', + 'if _nemoclaw_mode_group_or_other_writable "$perms"; then _msg="[gateway-recovery] ERROR: trusted preload source $src has unsafe mode=$perms (group/other writable) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', 'if [ "$(id -u)" -eq 0 ]; then', 'owner="$(stat -c %u "$src" 2>/dev/null || stat -f %u "$src" 2>/dev/null || echo unknown)";', 'if [ "$owner" != "0" ]; then _msg="[gateway-recovery] ERROR: trusted preload source $src owner_uid=$owner (expected 0) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', - 'if _nemoclaw_mode_group_or_other_writable "$perms"; then _msg="[gateway-recovery] ERROR: trusted preload source $src has unsafe mode=$perms (group/other writable) - refusing preload install"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', "fi;", "return 0;", "};", @@ -123,12 +133,13 @@ export function buildGatewayGuardRecoveryLines(): string[] { 'if ! mv -f "$stage" "$tmp"; then rm -f "$stage"; if _nemoclaw_validate_recovery_preload "$tmp"; then return 0; fi; _msg="[gateway-recovery] ERROR: failed to install recovery preload $tmp"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', '_nemoclaw_validate_recovery_preload "$tmp";', "};", - "_nemoclaw_write_recovered_proxy_env() {", - "local env_file=/tmp/nemoclaw-proxy-env.sh; local stage _msg;", + "_nemoclaw_write_generated_proxy_env() {", + 'local env_file="$1"; local preserve_file="${2:-}"; local stage _msg;', 'stage="$(mktemp /tmp/.nemoclaw-proxy-env.sh.tmp.XXXXXX)" || { _msg="[gateway-recovery] ERROR: failed to stage recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; };', "{", - "printf '%s\\n' '# Recovered by NemoClaw gateway recovery; sandbox restart refreshes the full proxy env.';", - ...recoveredProxyEnvExports, + "printf '%s\\n' '# Generated by NemoClaw gateway recovery; do not edit.';", + 'if [ -n "$preserve_file" ]; then cat "$preserve_file"; fi;', + ...emitRecoveredProxyEnvRequires, '} > "$stage" || { rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to write recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; };', 'if [ "$(id -u)" -eq 0 ] && ! chown root:root "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chown recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', 'if ! chmod 444 "$stage"; then rm -f "$stage"; _msg="[gateway-recovery] ERROR: failed to chmod recovered proxy-env.sh"; _nemoclaw_recovery_log "$_msg"; return 1; fi;', @@ -140,20 +151,23 @@ export function buildGatewayGuardRecoveryLines(): string[] { return [ helpers, - "if _nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", + "_PE_MISSING=0; _PROXY_ENV_REWRITE_NEEDED=0; _PROXY_ENV_PRESERVE_FILE=;", + "if _nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh; then :; else _PE_MISSING=1; _PROXY_ENV_REWRITE_NEEDED=1; fi;", + 'if [ "${_PE_MISSING:-0}" = "0" ]; then if [ "$(id -u)" = "0" ]; then _PROXY_ENV_PRESERVE_FILE=/tmp/nemoclaw-proxy-env.sh; else _PE_MISSING=1; _PROXY_ENV_REWRITE_NEEDED=1; fi; fi;', + ...proxyEnvRewriteChecks, "_NEMOCLAW_CRITICAL_GUARDS_READY=1;", ...stageCalls, 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ] && [ "${_PE_MISSING:-0}" = "1" ]; then', - '_W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - restoring library guards from packaged preloads (#2478/#2701)"; _nemoclaw_recovery_log "$_W";', - "_nemoclaw_write_recovered_proxy_env || _NEMOCLAW_CRITICAL_GUARDS_READY=0;", - 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; fi;', + '_W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing or unsafe - restoring library guards from packaged preloads (#2478/#2701)"; _nemoclaw_recovery_log "$_W";', "fi;", - "_PROXY_ENV_REWRITE_NEEDED=0;", - ...proxyEnvRewriteChecks, 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ] && [ "${_PE_MISSING:-0}" = "0" ] && [ "${_PROXY_ENV_REWRITE_NEEDED:-0}" = "1" ]; then', - '_W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh incomplete - persisting library guards from packaged preloads (#2478/#2701)"; _nemoclaw_recovery_log "$_W";', - "_nemoclaw_write_recovered_proxy_env || _NEMOCLAW_CRITICAL_GUARDS_READY=0;", + '_W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh incomplete - rewriting library guards from packaged preloads (#2478/#2701)"; _nemoclaw_recovery_log "$_W";', + "fi;", + 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ]; then', + 'if [ "${_PE_MISSING:-0}" = "1" ] || [ "${_PROXY_ENV_REWRITE_NEEDED:-0}" = "1" ]; then _nemoclaw_write_generated_proxy_env /tmp/nemoclaw-proxy-env.sh "$_PROXY_ENV_PRESERVE_FILE" || _NEMOCLAW_CRITICAL_GUARDS_READY=0; _NEMOCLAW_RECOVERY_SOURCE_ENV=/tmp/nemoclaw-proxy-env.sh;', + 'else _NEMOCLAW_RECOVERY_SOURCE_ENV=/tmp/nemoclaw-recovered-proxy-env.sh; _nemoclaw_write_generated_proxy_env "$_NEMOCLAW_RECOVERY_SOURCE_ENV" "$_PROXY_ENV_PRESERVE_FILE" || _NEMOCLAW_CRITICAL_GUARDS_READY=0; fi;', "fi;", + 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" = "1" ]; then . "$_NEMOCLAW_RECOVERY_SOURCE_ENV"; _PE_MISSING=0; fi;', ...appendCalls, "_GUARDS_MISSING=0;", 'if [ "$_NEMOCLAW_CRITICAL_GUARDS_READY" != "1" ]; then _GUARDS_MISSING=1; fi;', diff --git a/src/lib/agent/runtime.test.ts b/src/lib/agent/runtime.test.ts index 5f46ed7521..8d1fc7830d 100644 --- a/src/lib/agent/runtime.test.ts +++ b/src/lib/agent/runtime.test.ts @@ -127,7 +127,13 @@ describe("buildRecoveryScript", () => { internalPort: 19119, tuiEnabled: false, }); - expect(script).toContain(". /tmp/nemoclaw-proxy-env.sh"); + const validationIndex = script.indexOf( + "_nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh", + ); + const sourceIndex = script.indexOf('. "$_NEMOCLAW_RECOVERY_SOURCE_ENV"'); + expect(validationIndex).toBeGreaterThanOrEqual(0); + expect(sourceIndex).toBeGreaterThan(validationIndex); + expect(script).toContain('. "$_NEMOCLAW_RECOVERY_SOURCE_ENV"'); expect(script).toContain("/usr/local/bin/hermes"); expect(script).toContain( '"$AGENT_BIN" dashboard --host 127.0.0.1 --port 19119 --skip-build --no-open', @@ -193,17 +199,17 @@ describe("buildRecoveryScript", () => { expect(script).not.toContain("hermes gateway run --profile recovery --port 8642"); }); - // Regression coverage for #2478. The recovery script must explicitly source - // /tmp/nemoclaw-proxy-env.sh (single source of truth for NODE_OPTIONS - // library guards) and restore the critical safety-net/ciao preloads from - // trusted packaged sources when the file is missing. The pre-fix recovery - // path swallowed sourcing errors via `2>/dev/null`, leaving respawned - // gateways guard-less and crash-looping on the next library error from ciao, + // Regression coverage for #2478. The recovery script must validate + // /tmp/nemoclaw-proxy-env.sh, then source a generated recovery env carrying + // the critical NODE_OPTIONS library guards. The pre-fix recovery path + // swallowed sourcing errors via `2>/dev/null`, leaving respawned gateways + // guard-less and crash-looping on the next library error from ciao, // model-pricing, or anything else hitting a sandboxed syscall. describe("#2478 hardened library-guard preload chain", () => { - it("explicitly sources the gateway env file", () => { + it("sources the generated recovery env after validating the gateway env file", () => { const script = buildRecoveryScript(minimalAgent, 19000); - expect(script).toContain(". /tmp/nemoclaw-proxy-env.sh"); + expect(script).toContain("_nemoclaw_validate_recovery_proxy_env /tmp/nemoclaw-proxy-env.sh"); + expect(script).toContain('. "$_NEMOCLAW_RECOVERY_SOURCE_ENV"'); }); it("warns and restores guards when the gateway env file is missing", () => { diff --git a/src/lib/agent/runtime.ts b/src/lib/agent/runtime.ts index bd8e515efc..f0c3d6446a 100644 --- a/src/lib/agent/runtime.ts +++ b/src/lib/agent/runtime.ts @@ -177,10 +177,11 @@ export function buildHermesDashboardProcessRecoveryScript( config: HermesDashboardRecoveryConfig, ): string { return [ - "[ -f ~/.bashrc ] && . ~/.bashrc;", "export HERMES_HOME=/sandbox/.hermes;", buildHermesEnvFileBoundaryGuard(), - "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; fi;", + ...buildGatewayGuardRecoveryLines(), + '[ "$_GUARDS_MISSING" = "1" ] && { _E="[gateway-recovery] ERROR: NODE_OPTIONS missing safety-net preload or ciao preload after trusted recovery - refusing unguarded dashboard relaunch (#2478/#2701)"; echo "$_E" >&2; exit 1; };', + "[ -f ~/.bashrc ] && . ~/.bashrc;", buildHermesRuntimeEnvBoundaryGuard(), 'AGENT_BIN=/usr/local/bin/hermes; if [ ! -x "$AGENT_BIN" ]; then AGENT_BIN="$(command -v hermes)"; fi;', 'if [ -z "$AGENT_BIN" ]; then echo AGENT_MISSING; exit 1; fi;', diff --git a/test/e2e/test-issue-2478-crash-loop-recovery.sh b/test/e2e/test-issue-2478-crash-loop-recovery.sh index 1b39f8e5a5..9cce18f453 100755 --- a/test/e2e/test-issue-2478-crash-loop-recovery.sh +++ b/test/e2e/test-issue-2478-crash-loop-recovery.sh @@ -44,24 +44,29 @@ # # Prerequisites: # - Docker running -# - NVIDIA_API_KEY set (real key, starts with nvapi-) -# - Network access to integrate.api.nvidia.com +# - python3 + curl for the local OpenAI-compatible mock endpoint # # Environment variables: # NEMOCLAW_NON_INTERACTIVE=1 — required # NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE=1 — required -# NVIDIA_API_KEY — required for onboard +# NEMOCLAW_E2E_USE_COMPAT_MOCK — use local mock provider (default: 1) +# NEMOCLAW_COMPAT_MOCK_PORT — local mock endpoint port (default: 18191) +# NEMOCLAW_COMPAT_MODEL — local mock model id (default: test-model) +# NEMOCLAW_COMPAT_MOCK_LOG — local mock server log path # NEMOCLAW_SANDBOX_NAME — sandbox name (default: e2e-2478) +# NEMOCLAW_E2E_INSTALL_LOG — install log path for CI artifacts # NEMOCLAW_E2E_TIMEOUT_SECONDS — overall timeout (default: 1500) # NEMOCLAW_E2E_CRASH_CYCLES — crash-recover cycles (default: 5) # NEMOCLAW_E2E_SOAK_SECONDS — idle soak window (default: 300) +# NVIDIA_API_KEY — required only with NEMOCLAW_E2E_USE_COMPAT_MOCK=0 # # Usage: # NEMOCLAW_NON_INTERACTIVE=1 \ # NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE=1 \ -# NVIDIA_API_KEY=nvapi-... \ # bash test/e2e/test-issue-2478-crash-loop-recovery.sh +# ShellCheck cannot see EXIT trap invocations of cleanup helpers in this E2E script. +# shellcheck disable=SC2317,SC2329 set -uo pipefail export NEMOCLAW_E2E_DEFAULT_TIMEOUT=1500 @@ -94,9 +99,169 @@ CRASH_CYCLES="${NEMOCLAW_E2E_CRASH_CYCLES:-5}" SOAK_SECONDS="${NEMOCLAW_E2E_SOAK_SECONDS:-300}" DASHBOARD_PORT="${NEMOCLAW_DASHBOARD_PORT:-18789}" REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")/../.." && pwd)" +USE_COMPAT_MOCK="${NEMOCLAW_E2E_USE_COMPAT_MOCK:-1}" +COMPAT_MOCK_PORT="${NEMOCLAW_COMPAT_MOCK_PORT:-18191}" +COMPAT_MODEL="${NEMOCLAW_COMPAT_MODEL:-test-model}" +COMPATIBLE_KEY="${COMPATIBLE_API_KEY:-nemoclaw-e2e-compatible-key}" +COMPAT_MOCK_LOG="${NEMOCLAW_COMPAT_MOCK_LOG:-/tmp/nemoclaw-2478-compatible-endpoint.log}" +COMPAT_MOCK_PID="" # ── Helpers ────────────────────────────────────────────────────── +host_ip_for_sandbox() { + local ip_addr + if command -v ip >/dev/null 2>&1; then + ip_addr="$(ip route get 1 2>/dev/null | awk '{for (i=1;i<=NF;i++) if ($i=="src") {print $(i+1); exit}}')" + if [ -n "$ip_addr" ]; then + echo "$ip_addr" + return + fi + fi + if command -v hostname >/dev/null 2>&1; then + for ip_addr in $(hostname -I 2>/dev/null); do + case "$ip_addr" in + 127.* | ::1) ;; + *) + echo "$ip_addr" + return + ;; + esac + done + fi + echo "127.0.0.1" +} + +stop_compat_mock() { + if [ -n "${COMPAT_MOCK_PID:-}" ] && kill -0 "$COMPAT_MOCK_PID" 2>/dev/null; then + kill "$COMPAT_MOCK_PID" 2>/dev/null || true + wait "$COMPAT_MOCK_PID" 2>/dev/null || true + fi + COMPAT_MOCK_PID="" +} +trap stop_compat_mock EXIT + +start_compat_mock() { + : >"$COMPAT_MOCK_LOG" + python3 - "$COMPAT_MOCK_PORT" "$COMPAT_MODEL" "$COMPATIBLE_KEY" >"$COMPAT_MOCK_LOG" 2>&1 <<'PY' & +import json +import sys +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer + +port = int(sys.argv[1]) +model = sys.argv[2] +api_key = sys.argv[3] + + +class Handler(BaseHTTPRequestHandler): + def log_message(self, fmt, *args): + return + + def _path(self): + return self.path.split("?", 1)[0] + + def _send(self, status, payload): + body = json.dumps(payload).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def _send_chat_sse(self, content): + chunk = json.dumps({ + "id": "chatcmpl-2478-mock", + "object": "chat.completion.chunk", + "choices": [{"index": 0, "delta": {"role": "assistant", "content": content}, "finish_reason": None}], + }) + done_chunk = json.dumps({ + "id": "chatcmpl-2478-mock", + "object": "chat.completion.chunk", + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], + }) + body = ("data: %s\n\ndata: %s\n\ndata: [DONE]\n\n" % (chunk, done_chunk)).encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/event-stream") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def _auth_ok(self): + return self.headers.get("Authorization", "") == "Bearer " + api_key + + def do_GET(self): + if self._path() in ("/v1/models", "/models"): + print("GET %s" % self._path(), flush=True) + self._send(200, {"object": "list", "data": [{"id": model, "object": "model"}]}) + return + self._send(404, {"error": {"message": "not found"}}) + + def do_POST(self): + length = int(self.headers.get("Content-Length", "0") or "0") + raw = self.rfile.read(length) if length else b"" + try: + payload = json.loads(raw.decode("utf-8") or "{}") + except Exception: + payload = {} + + if self._path() in ("/v1/chat/completions", "/chat/completions"): + print( + "POST %s auth=%s model=%s stream=%s" + % (self._path(), "ok" if self._auth_ok() else "missing", payload.get("model"), payload.get("stream")), + flush=True, + ) + if not self._auth_ok(): + self._send(401, {"error": {"message": "missing bearer credential"}}) + return + if payload.get("stream"): + self._send_chat_sse("PONG from issue-2478 mock") + return + self._send(200, { + "id": "chatcmpl-2478-mock", + "object": "chat.completion", + "choices": [{ + "index": 0, + "message": {"role": "assistant", "content": "PONG from issue-2478 mock"}, + "finish_reason": "stop", + }], + }) + return + + if self._path() in ("/v1/responses", "/responses"): + print( + "POST %s auth=%s stream=%s" + % (self._path(), "ok" if self._auth_ok() else "missing", payload.get("stream")), + flush=True, + ) + if not self._auth_ok(): + self._send(401, {"error": {"message": "missing bearer credential"}}) + return + self._send(200, { + "id": "resp-2478-mock", + "object": "response", + "output": [{ + "type": "message", + "role": "assistant", + "content": [{"type": "output_text", "text": "PONG from issue-2478 mock"}], + }], + }) + return + + self._send(404, {"error": {"message": "not found"}}) + + +ThreadingHTTPServer(("0.0.0.0", port), Handler).serve_forever() +PY + COMPAT_MOCK_PID=$! + + for _ in $(seq 1 30); do + if curl -sf "http://127.0.0.1:${COMPAT_MOCK_PORT}/v1/models" >/dev/null 2>&1; then + return 0 + fi + sleep 1 + done + return 1 +} + # Run a command inside the sandbox via openshell sandbox exec. Returns # stdout; non-zero exit prints stderr but does not abort the test. sandbox_exec() { @@ -121,6 +286,47 @@ proxy_env_contents() { sandbox_exec sh -c "cat /tmp/nemoclaw-proxy-env.sh 2>/dev/null" } +gateway_log_line_count() { + sandbox_exec sh -c "wc -l < /tmp/gateway.log 2>/dev/null || printf '0\n'" \ + | awk '/^[0-9]+$/ { print; found=1; exit } END { if (!found) print 0 }' +} + +gateway_log_marker() { + local label="$1" + local marker + marker="NEMOCLAW_E2E_LOG_MARKER_${label}_$(date +%s)_$$" + if sandbox_exec sh -c "printf '%s\n' '$marker' >> /tmp/gateway.log 2>/dev/null && grep -Fq '$marker' /tmp/gateway.log 2>/dev/null" >/dev/null; then + printf '%s\n' "$marker" + return + fi + gateway_log_line_count +} + +gateway_log_after_boundary() { + local boundary="${1:-0}" + if [[ "$boundary" =~ ^[0-9]+$ ]]; then + local current_lines + current_lines="$(gateway_log_line_count)" + if [ "$current_lines" -lt "$boundary" ]; then + boundary=0 + fi + sandbox_exec sh -c "cat /tmp/gateway.log 2>/dev/null" | tail -n +"$((boundary + 1))" + return + fi + + local out status + out="$(sandbox_exec sh -c "awk -v marker='$boundary' 'found { print } index(\$0, marker) { found=1; next } END { if (!found) exit 42 }' /tmp/gateway.log 2>/dev/null")" + status=$? + if [ "$status" -eq 0 ]; then + printf '%s\n' "$out" + return + fi + + # If the gateway rewrote or rotated /tmp/gateway.log, the marker is gone and + # the whole current file is fresh relative to the boundary. + sandbox_exec sh -c "cat /tmp/gateway.log 2>/dev/null" +} + # Returns 0 if the gateway has the library guard chain active, 1 otherwise. # /proc//environ is unreadable across non-ancestor process trees due # to kernel.yama.ptrace_scope=1, so we verify the guards by their effects: @@ -137,6 +343,7 @@ proxy_env_contents() { gateway_guards_active() { local pid="$1" local timeout="${2:-30}" + local log_boundary="${3:-0}" local elapsed=0 if [ -z "$pid" ]; then @@ -155,8 +362,8 @@ gateway_guards_active() { fi while [ "$elapsed" -lt "$timeout" ]; do - if sandbox_exec sh -c "grep -Eq '\\[sandbox-safety-net\\] loaded \\((openclaw-gateway|launcher)\\)' /tmp/gateway.log 2>/dev/null" \ - && sandbox_exec sh -c "grep -Eq '\\[guard\\] ciao-network-guard loaded \\((openclaw-gateway|launcher)\\)' /tmp/gateway.log 2>/dev/null"; then + if gateway_log_after_boundary "$log_boundary" | grep -Eq '\[sandbox-safety-net\] loaded \((openclaw-gateway|launcher)\)' \ + && gateway_log_after_boundary "$log_boundary" | grep -Eq '\[guard\] ciao-network-guard loaded \((openclaw-gateway|launcher)\)'; then # Confirm gateway is still alive after guard activations. if [ -n "$(gateway_pid)" ]; then return 0 @@ -166,7 +373,7 @@ gateway_guards_active() { fi # Backward-compatible proof for older images: this line is emitted by # the ciao preload only when ciao calls os.networkInterfaces(). - if sandbox_exec sh -c "grep -Fq '[guard] os.networkInterfaces() failed:' /tmp/gateway.log 2>/dev/null"; then + if gateway_log_after_boundary "$log_boundary" | grep -Fq '[guard] os.networkInterfaces() failed:'; then if [ -n "$(gateway_pid)" ]; then return 0 fi @@ -177,7 +384,7 @@ gateway_guards_active() { elapsed=$((elapsed + 3)) done - echo " [guards] no gateway-process guard activation signatures in gateway.log within ${timeout}s" + echo " [guards] no fresh gateway-process guard activation signatures in gateway.log within ${timeout}s" return 1 } @@ -315,11 +522,23 @@ if ! docker info >/dev/null 2>&1; then fi pass "Docker running" -if [ -z "${NVIDIA_API_KEY:-}" ] || [[ "${NVIDIA_API_KEY}" != nvapi-* ]]; then - fail "NVIDIA_API_KEY not set or invalid" - exit 1 +if [ "$USE_COMPAT_MOCK" = "1" ]; then + if ! command -v python3 >/dev/null 2>&1; then + fail "python3 is required for the compatible endpoint mock" + exit 1 + fi + if ! command -v curl >/dev/null 2>&1; then + fail "curl is required for the compatible endpoint mock readiness probe" + exit 1 + fi + pass "Compatible endpoint mock prerequisites available" +else + if [ -z "${NVIDIA_API_KEY:-}" ] || [[ "${NVIDIA_API_KEY}" != nvapi-* ]]; then + fail "NVIDIA_API_KEY not set or invalid" + exit 1 + fi + pass "NVIDIA_API_KEY set" fi -pass "NVIDIA_API_KEY set" if [ "${NEMOCLAW_NON_INTERACTIVE:-}" != "1" ] || [ "${NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE:-}" != "1" ]; then fail "NEMOCLAW_NON_INTERACTIVE=1 and NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE=1 are required" @@ -344,22 +563,46 @@ cd "$REPO_ROOT" || { exit 1 } -INSTALL_LOG="$(mktemp)" -env \ - NEMOCLAW_NON_INTERACTIVE=1 \ - NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE=1 \ - NEMOCLAW_SANDBOX_NAME="$SANDBOX_NAME" \ - NEMOCLAW_RECREATE_SANDBOX=1 \ - bash install.sh --non-interactive >"$INSTALL_LOG" 2>&1 +install_env=( + env + NEMOCLAW_NON_INTERACTIVE=1 + NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE=1 + NEMOCLAW_SANDBOX_NAME="$SANDBOX_NAME" + NEMOCLAW_RECREATE_SANDBOX=1 +) + +if [ "$USE_COMPAT_MOCK" = "1" ]; then + COMPAT_HOST="$(host_ip_for_sandbox)" + COMPAT_ENDPOINT_URL="http://${COMPAT_HOST}:${COMPAT_MOCK_PORT}/v1" + if start_compat_mock; then + pass "Compatible endpoint mock listening at ${COMPAT_ENDPOINT_URL}" + else + fail "Compatible endpoint mock did not become ready; see ${COMPAT_MOCK_LOG}" + cat "$COMPAT_MOCK_LOG" + exit 1 + fi + install_env+=( + COMPATIBLE_API_KEY="$COMPATIBLE_KEY" + NEMOCLAW_PROVIDER=custom + NEMOCLAW_ENDPOINT_URL="$COMPAT_ENDPOINT_URL" + NEMOCLAW_MODEL="$COMPAT_MODEL" + ) +fi + +INSTALL_LOG="${NEMOCLAW_E2E_INSTALL_LOG:-/tmp/nemoclaw-e2e-install.log}" +: >"$INSTALL_LOG" || { + fail "Cannot write install log at $INSTALL_LOG" + exit 1 +} +"${install_env[@]}" bash install.sh --non-interactive >"$INSTALL_LOG" 2>&1 install_exit=$? if [ $install_exit -ne 0 ]; then fail "install.sh failed (exit $install_exit). Last 30 lines:" tail -30 "$INSTALL_LOG" - rm -f "$INSTALL_LOG" + info "Full install log retained at $INSTALL_LOG" exit 1 fi -rm -f "$INSTALL_LOG" pass "install.sh + onboard completed" # Pick up PATH changes @@ -411,6 +654,7 @@ section "Phase 3: Crash-recovery loop ($CRASH_CYCLES cycles)" prev_pid="$INIT_PID" for cycle in $(seq 1 "$CRASH_CYCLES"); do info "Cycle $cycle/$CRASH_CYCLES — killing gateway pid=$prev_pid" + guard_log_start="$(gateway_log_marker "cycle_${cycle}")" sandbox_exec sh -c "kill -9 $prev_pid 2>/dev/null; sleep 1" >/dev/null # Trigger recovery via the actual operator probe path: @@ -438,7 +682,7 @@ for cycle in $(seq 1 "$CRASH_CYCLES"); do fi pass "Cycle $cycle: gateway respawned (pid $prev_pid → $new_pid)" - if gateway_guards_active "$new_pid" 30; then + if gateway_guards_active "$new_pid" 30 "$guard_log_start"; then pass "Cycle $cycle: respawned gateway retains guard chain (proxy-env + gateway preloads loaded)" else fail "Cycle $cycle: respawned gateway LOST guard chain — recovery hardening regressed" @@ -482,6 +726,7 @@ info "Snapshotted proxy-env.sh ($SNAPSHOT_SIZE bytes, ${#SNAPSHOT_B64}-char base # pkill -9 -f '[o]penclaw' takes them all out so the launcher's watchdog # can't silently respawn the gateway before nemoclaw status runs the # recovery script (which is the only path that emits the warning). +negative_guard_log_start="$(gateway_log_marker "negative")" sandbox_exec sh -c 'rm -f /tmp/nemoclaw-proxy-env.sh' >/dev/null sandbox_exec sh -c "pkill -9 -f '[o]penclaw' 2>/dev/null; sleep 2; pgrep -af '[o]penclaw' || echo ALL_DEAD" >/dev/null run_probe_only_or_fail "Negative case after proxy-env removal" @@ -520,7 +765,7 @@ info "Negative-case recovery respawned gateway pid=$NEGATIVE_PID" # # Once the #2701 fix lands, recovery re-emits the chain before launching # and this assertion flips green. Will fail on origin/main as of 2026-06-09. -if gateway_guards_active "$NEGATIVE_PID"; then +if gateway_guards_active "$NEGATIVE_PID" 30 "$negative_guard_log_start"; then pass "#2701: recovery restored guard chain (proxy-env.sh + safety-net + ciao)" else fail "#2701: recovery did NOT restore guard chain — gateway respawned naked (DGX Spark crash-loop scenario)" @@ -534,18 +779,28 @@ fi # sandbox exec` does not pipe stdin from the caller through to the subshell, # so a `printf | sandbox_exec sh -c 'cat > file'` would leave an empty file. # Encoding into the command argv sidesteps the stdin gap entirely. +# +# After the #2701 fix, recovery may already have restored a minimal hardened +# proxy-env from trusted preload sources. In that case the sandbox user cannot +# necessarily overwrite the root-owned 0444 file with the original snapshot; +# accept the recovered file if it still proves the guard chain is active. This +# keeps the test focused on the user-visible contract instead of byte-identical +# cleanup of an implementation detail. sandbox_exec sh -c "echo '$SNAPSHOT_B64' | base64 -d > /tmp/nemoclaw-proxy-env.sh && chmod 444 /tmp/nemoclaw-proxy-env.sh" >/dev/null -# Verify restore is byte-identical to the snapshot. restored_size="$(sandbox_exec sh -c 'wc -c < /tmp/nemoclaw-proxy-env.sh' | tr -d '[:space:]')" -if [ "$restored_size" != "$SNAPSHOT_SIZE" ]; then - fail "proxy-env.sh restore failed: expected $SNAPSHOT_SIZE bytes, got '${restored_size}'" +if [ "$restored_size" = "$SNAPSHOT_SIZE" ]; then + info "proxy-env.sh restored from snapshot (${restored_size} bytes verified)" +elif gateway_guards_active "$NEGATIVE_PID" 5 "$negative_guard_log_start"; then + info "proxy-env.sh retained recovered guard env (${restored_size} bytes; original snapshot was ${SNAPSHOT_SIZE} bytes)" +else + fail "proxy-env.sh restore failed and recovered guard env is not active: expected $SNAPSHOT_SIZE bytes, got '${restored_size}'" exit 1 fi -info "proxy-env.sh restored (${restored_size} bytes verified)" -# Kill the guardless negative-case gateway, then trigger recovery to bring the -# gateway back with guards intact from the restored env file. +# Kill the current negative-case gateway, then trigger recovery to bring the +# gateway back with guards intact from either the snapshot or recovered env file. +restore_guard_log_start="$(gateway_log_marker "restore")" sandbox_exec sh -c "pkill -9 -f '[o]penclaw' 2>/dev/null; sleep 2; pgrep -af '[o]penclaw' || echo ALL_DEAD" >/dev/null run_probe_only_or_fail "Guard restore recovery" SOAK_START_PID="$(wait_for_gateway_up 30)" @@ -556,7 +811,7 @@ if [ -z "$SOAK_START_PID" ]; then fi # Confirm the restored gateway has guards back in place — otherwise the # soak measures a crash-looping gateway, not steady-state recovery. -if ! gateway_guards_active "$SOAK_START_PID" 30; then +if ! gateway_guards_active "$SOAK_START_PID" 30 "$restore_guard_log_start"; then fail "Gateway up but guards not active entering soak — restore did not take" gateway_diagnostics "$SOAK_START_PID" exit 1