diff --git a/.github/workflows/e2e-vitest-scenarios.yaml b/.github/workflows/e2e-vitest-scenarios.yaml index 159b28426b..c3c3afa3e1 100644 --- a/.github/workflows/e2e-vitest-scenarios.yaml +++ b/.github/workflows/e2e-vitest-scenarios.yaml @@ -841,7 +841,6 @@ jobs: if-no-files-found: ignore retention-days: 14 - credential-sanitization-vitest: needs: generate-matrix if: ${{ (inputs.jobs == '' && inputs.scenarios == '') || contains(format(',{0},', inputs.jobs), ',credential-sanitization-vitest,') || contains(format(',{0},', inputs.scenarios), ',credential-sanitization,') }} @@ -927,7 +926,6 @@ jobs: run: | docker logout docker.io >/dev/null 2>&1 || true - credential-migration-vitest: needs: generate-matrix if: ${{ (inputs.jobs == '' && inputs.scenarios == '') || contains(format(',{0},', inputs.jobs), ',credential-migration-vitest,') }} @@ -2844,6 +2842,109 @@ jobs: docker logout docker.io || true rm -rf "${DOCKER_CONFIG}" + issue-2478-crash-loop-recovery-vitest: + needs: generate-matrix + if: ${{ (inputs.jobs == '' && inputs.scenarios == '') || contains(format(',{0},', inputs.jobs), ',issue-2478-crash-loop-recovery-vitest,') || contains(format(',{0},', inputs.scenarios), ',issue-2478-crash-loop-recovery,') }} + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + FREE_STANDING_VITEST_JOB: "1" + FREE_STANDING_SCENARIO_ID: "issue-2478-crash-loop-recovery" + E2E_ARTIFACT_DIR: ${{ github.workspace }}/e2e-artifacts/vitest/issue-2478-crash-loop-recovery + NEMOCLAW_CLI_BIN: ${{ github.workspace }}/bin/nemoclaw.js + NEMOCLAW_RUN_E2E_SCENARIOS: "1" + NEMOCLAW_NON_INTERACTIVE: "1" + NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1" + NEMOCLAW_SANDBOX_NAME: "e2e-2478" + OPENSHELL_GATEWAY: "nemoclaw" + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false + + - name: Configure isolated Docker auth directory + run: echo "DOCKER_CONFIG=${RUNNER_TEMP}/docker-config-issue-2478-crash-loop-recovery" >> "$GITHUB_ENV" + + - name: Authenticate to Docker Hub + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + shell: bash + run: | + set -euo pipefail + if [[ -z "${DOCKERHUB_USERNAME}" || -z "${DOCKERHUB_TOKEN}" ]]; then + echo "::notice::Docker Hub credentials not configured; continuing with anonymous pulls." + exit 0 + fi + mkdir -p "${DOCKER_CONFIG}" + chmod 700 "${DOCKER_CONFIG}" + login_succeeded=0 + for attempt in 1 2 3; do + if echo "${DOCKERHUB_TOKEN}" | timeout 30s docker login docker.io --username "${DOCKERHUB_USERNAME}" --password-stdin; then + login_succeeded=1 + break + fi + if [[ "$attempt" -lt 3 ]]; then + echo "::warning::Docker Hub login attempt ${attempt} failed; retrying." + sleep 5 + fi + done + if [[ "$login_succeeded" -ne 1 ]]; then + echo "::warning::Docker Hub login failed after 3 attempts; continuing with anonymous pulls." + fi + + - name: Set up Node + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.0.0 + with: + node-version: 22 + cache: npm + + - name: Install root dependencies + run: npm ci --ignore-scripts + + - name: Build CLI + run: npm run build:cli + + - name: Install OpenShell CLI + run: bash scripts/install-openshell.sh + + - name: "Run issue #2478 crash-loop recovery live Vitest test" + run: | + set -euo pipefail + export PATH="$HOME/.local/bin:$HOME/.npm-global/bin:$PATH" + if command -v openshell >/dev/null 2>&1; then + OPENSHELL_BIN="$(command -v openshell)" + elif [ -x "$HOME/.local/bin/openshell" ]; then + OPENSHELL_BIN="$HOME/.local/bin/openshell" + else + echo "::error::OpenShell CLI not found after install" + ls -la /usr/local/bin/openshell "$HOME/.local/bin/openshell" 2>&1 || true + exit 1 + fi + export OPENSHELL_BIN + echo "Using OPENSHELL_BIN=$OPENSHELL_BIN" + "$OPENSHELL_BIN" --version + npx vitest run --project e2e-scenarios-live \ + test/e2e-scenario/live/issue-2478-crash-loop-recovery.test.ts \ + --silent=false --reporter=default + + - name: "Upload issue #2478 crash-loop recovery artifacts" + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: e2e-vitest-scenarios-issue-2478-crash-loop-recovery + path: e2e-artifacts/vitest/issue-2478-crash-loop-recovery/ + include-hidden-files: false + if-no-files-found: ignore + retention-days: 14 + + - name: Clean up Docker auth + if: always() + run: | + set -euo pipefail + docker logout docker.io || true + rm -rf "${DOCKER_CONFIG}" + # ── PR result comment (mirrors nightly-e2e.yaml's report-to-pr) ─────────── # Posts a results table on the open PR for the dispatching branch (or the # PR identified by `inputs.pr_number`). `if: always()` so the comment lands @@ -2887,6 +2988,7 @@ jobs: issue-4434-tui-unreachable-inference-vitest, openclaw-inference-switch-vitest, bedrock-runtime-compatible-anthropic-vitest, + issue-2478-crash-loop-recovery-vitest, gateway-health-honest-vitest, channels-add-remove-vitest, ] diff --git a/test/e2e-scenario/live/issue-2478-crash-loop-recovery.test.ts b/test/e2e-scenario/live/issue-2478-crash-loop-recovery.test.ts new file mode 100644 index 0000000000..2f943971b2 --- /dev/null +++ b/test/e2e-scenario/live/issue-2478-crash-loop-recovery.test.ts @@ -0,0 +1,579 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Live Vitest replacement for test/e2e/test-issue-2478-crash-loop-recovery.sh. + * + * Preserves the legacy contract with real Docker/OpenShell/NemoClaw boundaries: + * onboard an OpenClaw sandbox, kill and recover the gateway via the production + * `connect --probe-only` path, verify the guard-chain preloads remain present, + * prove inference.local keeps serving models, exercise the missing proxy-env + * warning path, restore the env file, and soak for crash-loop churn. + */ + +import http from "node:http"; +import type { AddressInfo } from "node:net"; + +import type { ArtifactSink } from "../fixtures/artifacts.ts"; +import { buildAvailabilityProbeEnv } from "../fixtures/availability-env.ts"; +import type { HostCliClient } from "../fixtures/clients/index.ts"; +import type { CleanupRegistry } from "../fixtures/cleanup.ts"; +import { expect, test } from "../fixtures/e2e-test.ts"; +import type { NemoClawInstance } from "../fixtures/phases/onboarding.ts"; +import { ubuntuRepoDocker } from "../scenarios/matrix.ts"; + +const ENVIRONMENT = ubuntuRepoDocker("cloud-openclaw"); +const SANDBOX_NAME = process.env.NEMOCLAW_SANDBOX_NAME ?? "e2e-2478"; +const CRASH_CYCLES = positiveInteger(process.env.NEMOCLAW_E2E_CRASH_CYCLES, 5); +const SOAK_SECONDS = positiveInteger(process.env.NEMOCLAW_E2E_SOAK_SECONDS, 300); +const COMPATIBLE_MODEL = process.env.NEMOCLAW_COMPAT_MODEL ?? "test-model"; +const COMPATIBLE_AUTH_VALUE = ["nemoclaw", "e2e", "compatible", "mock"].join("-"); +const ONBOARD_ARGS = [ + "onboard", + "--non-interactive", + "--yes", + "--yes-i-accept-third-party-software", +]; + +function positiveInteger(raw: string | undefined, fallback: number): number { + const parsed = raw ? Number(raw) : fallback; + return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback; +} + +function probeEnv(): NodeJS.ProcessEnv { + return { + ...buildAvailabilityProbeEnv(), + OPENSHELL_GATEWAY: process.env.OPENSHELL_GATEWAY ?? "nemoclaw", + }; +} + +interface FakeOpenAiEndpoint { + baseUrl: string; + close: () => Promise; + requests: () => readonly string[]; +} + +function jsonResponse(response: http.ServerResponse, status: number, body: unknown): void { + const payload = JSON.stringify(body); + response.writeHead(status, { + "content-type": "application/json", + "content-length": Buffer.byteLength(payload), + }); + response.end(payload); +} + +async function startCompatibleEndpointMock(artifacts: ArtifactSink): Promise { + const requests: string[] = []; + const server = http.createServer((request, response) => { + const chunks: Buffer[] = []; + request.on("data", (chunk) => chunks.push(Buffer.from(chunk))); + request.on("end", () => { + const requestPath = request.url?.split("?", 1)[0] ?? "/"; + const rawBody = Buffer.concat(chunks).toString("utf8"); + requests.push(`${request.method ?? "GET"} ${requestPath} ${rawBody}`.slice(0, 1_000)); + + if (request.method === "GET" && ["/v1/models", "/models"].includes(requestPath)) { + jsonResponse(response, 200, { + object: "list", + data: [{ id: COMPATIBLE_MODEL, object: "model" }], + }); + return; + } + + if ( + request.method === "POST" && + ["/v1/chat/completions", "/chat/completions"].includes(requestPath) + ) { + jsonResponse(response, 200, { + id: "chatcmpl-2478-mock", + object: "chat.completion", + choices: [ + { + index: 0, + message: { role: "assistant", content: "OK" }, + finish_reason: "stop", + }, + ], + }); + return; + } + + if (request.method === "POST" && ["/v1/responses", "/responses"].includes(requestPath)) { + jsonResponse(response, 200, { + id: "resp-2478-mock", + object: "response", + output: [ + { + type: "message", + role: "assistant", + content: [{ type: "output_text", text: "OK" }], + }, + ], + }); + return; + } + + jsonResponse(response, 404, { error: { message: "not found" } }); + }); + }); + + await new Promise((resolve, reject) => { + server.once("error", reject); + server.listen(0, "0.0.0.0", () => { + server.off("error", reject); + resolve(); + }); + }); + + const address = server.address(); + if (!address || typeof address === "string") { + throw new Error("issue-2478 compatible endpoint mock did not bind to a TCP port"); + } + const port = (address as AddressInfo).port; + const baseUrl = `http://host.openshell.internal:${port}/v1`; + await artifacts.writeJson("compatible-endpoint-mock.json", { + baseUrl, + model: COMPATIBLE_MODEL, + }); + + return { + baseUrl, + requests: () => requests, + close: () => + new Promise((resolve, reject) => { + server.close((error) => (error ? reject(error) : resolve())); + }), + }; +} + +async function cleanupSandbox(host: HostCliClient, sandboxName: string): Promise { + const result = await host.nemoclaw([sandboxName, "destroy", "--yes"], { + artifactName: `cleanup-destroy-${sandboxName}`, + env: probeEnv(), + timeoutMs: 15 * 60_000, + }); + if (result.exitCode === 0) return; + const text = [result.stdout, result.stderr].filter(Boolean).join("\n"); + if ( + /Sandbox '.+' does not exist|Run 'nemoclaw onboard' to create one|sandbox .* not found|no such sandbox/i.test( + text, + ) + ) { + return; + } + expect(result.exitCode, `cleanup destroy sandbox ${sandboxName}\n${text}`).toBe(0); +} + +async function onboardWithCompatibleEndpoint( + host: HostCliClient, + cleanup: CleanupRegistry, + sandboxName: string, + endpoint: FakeOpenAiEndpoint, +): Promise { + await cleanupSandbox(host, sandboxName); + const result = await host.nemoclaw(ONBOARD_ARGS, { + artifactName: "onboard-compatible-openclaw", + env: { + ...probeEnv(), + COMPATIBLE_API_KEY: COMPATIBLE_AUTH_VALUE, + NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1", + NEMOCLAW_AGENT: "openclaw", + NEMOCLAW_ENDPOINT_URL: endpoint.baseUrl, + NEMOCLAW_MODEL: COMPATIBLE_MODEL, + NEMOCLAW_NON_INTERACTIVE: "1", + NEMOCLAW_PROVIDER: "custom", + NEMOCLAW_SANDBOX_NAME: sandboxName, + }, + redactionValues: [COMPATIBLE_AUTH_VALUE], + timeoutMs: 15 * 60_000, + }); + expect( + result.exitCode, + `compatible OpenClaw onboard failed\nstdout:\n${result.stdout}\nstderr:\n${result.stderr}`, + ).toBe(0); + cleanup.add(`destroy NemoClaw sandbox ${sandboxName}`, () => cleanupSandbox(host, sandboxName)); + + return { + onboarding: "cloud-openclaw", + sandboxName, + agent: "openclaw", + provider: "nvidia", + providerEnv: "cloud", + gatewayUrl: "http://127.0.0.1:18789", + result, + }; +} + +async function waitForGatewayPid( + gateway: { + resolveGatewayPid(instance: NemoClawInstance): Promise; + }, + instance: NemoClawInstance, + timeoutMs: number, +): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + const pid = await gateway.resolveGatewayPid(instance); + if (pid !== null) return pid; + await sleep(2_000); + } + return null; +} + +async function runProbeOnly( + host: { + nemoclaw( + args?: string[], + options?: Record, + ): Promise<{ exitCode: number | null; stdout: string; stderr: string }>; + }, + sandboxName: string, + artifactName: string, +): Promise { + const result = await host.nemoclaw([sandboxName, "connect", "--probe-only"], { + artifactName, + env: probeEnv(), + timeoutMs: 90_000, + }); + expect( + result.exitCode, + `${artifactName} failed\nstdout:\n${result.stdout}\nstderr:\n${result.stderr}`, + ).toBe(0); +} + +async function killGatewayPid( + sandbox: { + exec( + name: string, + command: string[], + options?: Record, + ): Promise<{ exitCode: number | null; stdout: string; stderr: string }>; + }, + sandboxName: string, + pid: number, + artifactName: string, +): Promise { + const result = await sandbox.exec( + sandboxName, + ["sh", "-c", `kill -9 ${pid} 2>/dev/null || true; sleep 1`], + { + artifactName, + env: probeEnv(), + timeoutMs: 30_000, + }, + ); + expect( + result.exitCode, + `${artifactName}\nstdout:\n${result.stdout}\nstderr:\n${result.stderr}`, + ).toBe(0); +} + +async function killOpenclawTreeForRecovery( + sandbox: { + exec( + name: string, + command: string[], + options?: Record, + ): Promise<{ exitCode: number | null; stdout: string; stderr: string }>; + }, + sandboxName: string, + artifactName: string, +): Promise { + const result = await sandbox.exec( + sandboxName, + [ + "sh", + "-c", + "pkill -9 -f '[o]penclaw' 2>/dev/null || true; sleep 2; pgrep -af '[o]penclaw' || echo ALL_DEAD", + ], + { artifactName, env: probeEnv(), timeoutMs: 30_000 }, + ); + expect(result.exitCode, result.stderr).toBe(0); +} + +async function snapshotProxyEnv( + sandbox: { + exec( + name: string, + command: string[], + options?: Record, + ): Promise<{ exitCode: number | null; stdout: string; stderr: string }>; + }, + sandboxName: string, +): Promise<{ b64: string; size: number }> { + const result = await sandbox.exec( + sandboxName, + [ + "sh", + "-c", + "base64 < /tmp/nemoclaw-proxy-env.sh && printf '\\nSIZE=' && wc -c < /tmp/nemoclaw-proxy-env.sh", + ], + { artifactName: "snapshot-proxy-env", env: probeEnv(), timeoutMs: 30_000 }, + ); + expect(result.exitCode, result.stderr).toBe(0); + const match = result.stdout.match(/([A-Za-z0-9+/=\n]+)\nSIZE=(\d+)/); + expect(match, `unexpected proxy-env snapshot output: ${result.stdout}`).not.toBeNull(); + const b64 = match?.[1]?.replace(/\s+/g, "") ?? ""; + const size = Number(match?.[2] ?? 0); + expect(b64.length, "proxy-env snapshot must not be empty").toBeGreaterThan(0); + expect(size, "proxy-env snapshot size must be positive").toBeGreaterThan(0); + return { b64, size }; +} + +async function removeProxyEnv( + sandbox: { + exec( + name: string, + command: string[], + options?: Record, + ): Promise<{ exitCode: number | null; stdout: string; stderr: string }>; + }, + sandboxName: string, +): Promise { + const result = await sandbox.exec(sandboxName, ["rm", "-f", "/tmp/nemoclaw-proxy-env.sh"], { + artifactName: "remove-proxy-env", + env: probeEnv(), + timeoutMs: 30_000, + }); + expect(result.exitCode, result.stderr).toBe(0); +} + +async function proxyEnvHasGuardMarkers( + sandbox: { + exec( + name: string, + command: string[], + options?: Record, + ): Promise<{ exitCode: number | null; stdout: string; stderr: string }>; + }, + sandboxName: string, + artifactName: string, +): Promise { + const result = await sandbox.exec( + sandboxName, + ["sh", "-c", "cat /tmp/nemoclaw-proxy-env.sh 2>/dev/null || true"], + { artifactName, env: probeEnv(), timeoutMs: 30_000 }, + ); + return ( + result.stdout.includes("nemoclaw-sandbox-safety-net") && + result.stdout.includes("nemoclaw-ciao-network-guard") + ); +} + +async function restoreProxyEnv( + sandbox: { + exec( + name: string, + command: string[], + options?: Record, + ): Promise<{ exitCode: number | null; stdout: string; stderr: string }>; + }, + sandboxName: string, + snapshot: { b64: string; size: number }, +): Promise { + const result = await sandbox.exec( + sandboxName, + [ + "sh", + "-c", + `rm -f /tmp/nemoclaw-proxy-env.sh 2>/dev/null || true; (printf '%s' '${snapshot.b64}' | base64 -d > /tmp/nemoclaw-proxy-env.sh 2>/dev/null && chmod 444 /tmp/nemoclaw-proxy-env.sh) || true; wc -c < /tmp/nemoclaw-proxy-env.sh 2>/dev/null || true`, + ], + { artifactName: "restore-proxy-env", env: probeEnv(), timeoutMs: 30_000 }, + ); + expect(result.exitCode, result.stderr).toBe(0); + + const restoredSize = Number(result.stdout.trim() || 0); + if (restoredSize === snapshot.size) return; + if (await proxyEnvHasGuardMarkers(sandbox, sandboxName, "restore-proxy-env-guard-markers")) + return; + + expect(restoredSize, "restored proxy-env byte size or recovered guard markers").toBe( + snapshot.size, + ); +} + +async function waitForRecoveryWarning( + gateway: { + expectLogContains( + instance: NemoClawInstance, + pattern: RegExp, + options?: Record, + ): Promise; + expectLogDoesNotContain( + instance: NemoClawInstance, + pattern: RegExp, + options?: Record, + ): Promise; + }, + instance: NemoClawInstance, +): Promise { + let lastError: unknown; + for (let attempt = 1; attempt <= 5; attempt += 1) { + try { + await gateway.expectLogContains( + instance, + /\[gateway-recovery\] WARNING: .*restoring library guards from packaged preloads/, + { lines: 200 }, + ); + await gateway.expectLogDoesNotContain(instance, /gateway launching without library guards/, { + lines: 200, + }); + return; + } catch (error) { + lastError = error; + await sleep(3_000); + } + } + throw lastError; +} + +async function sampleGatewayStability( + gateway: { + resolveGatewayPid(instance: NemoClawInstance): Promise; + }, + runtime: { + expectInferenceLocalModels( + instance: NemoClawInstance, + options?: Record, + ): Promise; + }, + instance: NemoClawInstance, + soakSeconds: number, +): Promise<{ + samples: Array; + inferenceFailures: number; + inferenceProbes: number; +}> { + const samples: Array = []; + let inferenceFailures = 0; + let inferenceProbes = 0; + const intervalSeconds = 15; + + for (let elapsed = 0; elapsed < soakSeconds; elapsed += intervalSeconds) { + samples.push(await gateway.resolveGatewayPid(instance)); + if (elapsed % 60 === 0) { + inferenceProbes += 1; + try { + await runtime.expectInferenceLocalModels(instance, { + artifactName: `soak-inference-local-models-${elapsed}s`, + curlMaxTimeSeconds: 5, + timeoutMs: 15_000, + }); + } catch { + inferenceFailures += 1; + } + } + await sleep(intervalSeconds * 1_000); + } + + return { samples, inferenceFailures, inferenceProbes }; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +test("issue-2478: gateway recovery preserves guard chain and avoids crash loop", async ({ + artifacts, + cleanup, + environment, + gateway, + host, + runtime, + sandbox, +}) => { + await artifacts.writeJson("scenario.json", { + id: "issue-2478-crash-loop-recovery", + legacyScript: "test/e2e/test-issue-2478-crash-loop-recovery.sh", + issues: ["#2478", "#2701"], + crashCycles: CRASH_CYCLES, + soakSeconds: SOAK_SECONDS, + compatibleEndpointModel: COMPATIBLE_MODEL, + }); + + const compatibleEndpoint = await startCompatibleEndpointMock(artifacts); + cleanup.add("stop issue-2478 compatible endpoint mock", async () => { + await artifacts.writeJson("compatible-endpoint-mock-requests.json", [ + ...compatibleEndpoint.requests(), + ]); + await compatibleEndpoint.close(); + }); + + await environment.assertReady(ENVIRONMENT); + const instance = await onboardWithCompatibleEndpoint( + host, + cleanup, + SANDBOX_NAME, + compatibleEndpoint, + ); + cleanup.add(`final guard-chain diagnostics ${instance.sandboxName}`, async () => { + const pid = await gateway.resolveGatewayPid(instance); + await artifacts.writeJson("final-gateway-pid.json", { pid }); + }); + + const initialPid = await waitForGatewayPid(gateway, instance, 60_000); + expect(initialPid, "gateway should be running after onboard").not.toBeNull(); + await gateway.expectGuardChainActive(instance); + await runtime.expectInferenceLocalModels(instance, { + artifactName: "initial-inference-local-models", + timeoutMs: 60_000, + }); + + let previousPid = initialPid!; + for (let cycle = 1; cycle <= CRASH_CYCLES; cycle += 1) { + await killGatewayPid(sandbox, instance.sandboxName, previousPid, `cycle-${cycle}-kill-gateway`); + await runProbeOnly(host, instance.sandboxName, `cycle-${cycle}-connect-probe-only`); + const nextPid = await waitForGatewayPid(gateway, instance, 45_000); + expect(nextPid, `cycle ${cycle}: gateway should respawn`).not.toBeNull(); + expect(nextPid, `cycle ${cycle}: kill should force a new PID`).not.toBe(previousPid); + await gateway.expectGuardChainActive(instance); + await runtime.expectInferenceLocalModels(instance, { + artifactName: `cycle-${cycle}-inference-local-models`, + timeoutMs: 60_000, + }); + previousPid = nextPid!; + } + + const snapshot = await snapshotProxyEnv(sandbox, instance.sandboxName); + await removeProxyEnv(sandbox, instance.sandboxName); + await killOpenclawTreeForRecovery( + sandbox, + instance.sandboxName, + "missing-proxy-env-kill-gateway-tree", + ); + await runProbeOnly(host, instance.sandboxName, "missing-proxy-env-connect-probe-only"); + await waitForRecoveryWarning(gateway, instance); + const negativePid = await waitForGatewayPid(gateway, instance, 45_000); + expect(negativePid, "missing proxy-env warning path should still respawn gateway").not.toBeNull(); + await gateway.expectGuardChainActive(instance); + + await restoreProxyEnv(sandbox, instance.sandboxName, snapshot); + await killOpenclawTreeForRecovery( + sandbox, + instance.sandboxName, + "restored-proxy-env-kill-gateway-tree", + ); + await runProbeOnly(host, instance.sandboxName, "restored-proxy-env-connect-probe-only"); + const soakStartPid = await waitForGatewayPid(gateway, instance, 45_000); + expect(soakStartPid, "gateway should be up before soak").not.toBeNull(); + await gateway.expectGuardChainActive(instance); + await runtime.expectInferenceLocalModels(instance, { + artifactName: "pre-soak-inference-local-models", + timeoutMs: 60_000, + }); + + const soak = await sampleGatewayStability(gateway, runtime, instance, SOAK_SECONDS); + await artifacts.writeJson("soak-summary.json", soak); + const distinctPids = new Set(soak.samples.filter((pid): pid is number => pid !== null)); + const emptySamples = soak.samples.filter((pid) => pid === null).length; + + expect( + distinctPids.size, + `crash-loop signature: ${distinctPids.size} distinct PIDs in samples ${soak.samples.join(",")}`, + ).toBeLessThanOrEqual(2); + expect( + emptySamples, + `gateway should not disappear repeatedly during soak: ${soak.samples.join(",")}`, + ).toBeLessThanOrEqual(1); + expect(soak.inferenceFailures, "inference.local should stay available during soak").toBe(0); +}); diff --git a/test/e2e-scenario/support-tests/e2e-scenarios-workflow.test.ts b/test/e2e-scenario/support-tests/e2e-scenarios-workflow.test.ts index 9f44e4108c..41195ebb07 100644 --- a/test/e2e-scenario/support-tests/e2e-scenarios-workflow.test.ts +++ b/test/e2e-scenario/support-tests/e2e-scenarios-workflow.test.ts @@ -77,7 +77,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { it("evaluates high-risk dispatch selector behavior before secret-bearing jobs run", () => { expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "network-policy,../escape" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "network-policy,../escape", + }), ).toMatchObject({ valid: false, liveScenariosRuns: false, @@ -94,7 +96,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { selectedFreeStandingJobs: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "network-policy" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "network-policy", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -112,7 +116,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: ["ubuntu-repo-cloud-openclaw"], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "openshell-version-pin" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "openshell-version-pin", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -126,7 +132,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "skill-agent-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "skill-agent-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -134,7 +142,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "openclaw-skill-cli" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "openclaw-skill-cli", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -142,7 +152,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "openclaw-skill-cli-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "openclaw-skill-cli-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -150,7 +162,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "credential-sanitization" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "credential-sanitization", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -158,7 +172,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "credential-sanitization-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "credential-sanitization-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -166,7 +182,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "sessions-agents-cli" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "sessions-agents-cli", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -174,7 +192,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "sessions-agents-cli-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "sessions-agents-cli-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -182,7 +202,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "runtime-overrides-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "runtime-overrides-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -190,7 +212,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "runtime-overrides" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "runtime-overrides", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -218,7 +242,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "inference-routing" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "inference-routing", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -226,7 +252,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "inference-routing-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "inference-routing-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -234,7 +262,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "cloud-inference" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "cloud-inference", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -242,7 +272,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "cloud-inference-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "cloud-inference-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -256,7 +288,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "hermes-root-entrypoint-smoke" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "hermes-root-entrypoint-smoke", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -264,7 +298,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "hermes-root-entrypoint-smoke-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "hermes-root-entrypoint-smoke-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -272,7 +308,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "common-egress-agent" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "common-egress-agent", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -280,7 +318,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "common-egress-agent-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "common-egress-agent-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -288,7 +328,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "shields-config" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "shields-config", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -296,7 +338,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "shields-config-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "shields-config-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -304,7 +348,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "rebuild-openclaw" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "rebuild-openclaw", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -312,7 +358,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "rebuild-openclaw-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "rebuild-openclaw-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -320,7 +368,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "state-backup-restore" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "state-backup-restore", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -328,7 +378,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "state-backup-restore-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "state-backup-restore-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -356,7 +408,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "gateway-drift-preflight" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "gateway-drift-preflight", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -364,7 +418,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "gateway-drift-preflight-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "gateway-drift-preflight-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -372,7 +428,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "openclaw-inference-switch" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "openclaw-inference-switch", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -380,7 +438,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "openclaw-inference-switch-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "openclaw-inference-switch-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, @@ -407,6 +467,26 @@ describe("e2e-vitest-scenarios workflow boundary", () => { selectedFreeStandingJobs: ["bedrock-runtime-compatible-anthropic-vitest"], registryScenarios: [], }); + expect( + evaluateE2eVitestWorkflowDispatchSelectors({ + scenarios: "issue-2478-crash-loop-recovery", + }), + ).toMatchObject({ + valid: true, + liveScenariosRuns: false, + selectedFreeStandingJobs: ["issue-2478-crash-loop-recovery-vitest"], + registryScenarios: [], + }); + expect( + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "issue-2478-crash-loop-recovery-vitest", + }), + ).toMatchObject({ + valid: true, + liveScenariosRuns: false, + selectedFreeStandingJobs: ["issue-2478-crash-loop-recovery-vitest"], + registryScenarios: [], + }); expect( evaluateE2eVitestWorkflowDispatchSelectors({ scenarios: "gateway-health-honest" }), ).toMatchObject({ @@ -432,7 +512,9 @@ describe("e2e-vitest-scenarios workflow boundary", () => { registryScenarios: [], }); expect( - evaluateE2eVitestWorkflowDispatchSelectors({ jobs: "channels-add-remove-vitest" }), + evaluateE2eVitestWorkflowDispatchSelectors({ + jobs: "channels-add-remove-vitest", + }), ).toMatchObject({ valid: true, liveScenariosRuns: false, diff --git a/tools/e2e-scenarios/free-standing-workflow-inventory.mts b/tools/e2e-scenarios/free-standing-workflow-inventory.mts index a618c42575..070a9b02bc 100644 --- a/tools/e2e-scenarios/free-standing-workflow-inventory.mts +++ b/tools/e2e-scenarios/free-standing-workflow-inventory.mts @@ -14,7 +14,10 @@ function usage(): string { ].join("\n"); } -function parseArgs(argv: readonly string[]): { shell: boolean; workflowPath?: string } { +function parseArgs(argv: readonly string[]): { + shell: boolean; + workflowPath?: string; +} { const parsed: { shell: boolean; workflowPath?: string } = { shell: false }; for (let index = 0; index < argv.length; index += 1) { const arg = argv[index]; diff --git a/tools/e2e-scenarios/workflow-boundary.mts b/tools/e2e-scenarios/workflow-boundary.mts index 8e3cbe95e8..1afd0be5d2 100644 --- a/tools/e2e-scenarios/workflow-boundary.mts +++ b/tools/e2e-scenarios/workflow-boundary.mts @@ -1630,7 +1630,9 @@ function validateMessagingCompatibleEndpointVitestJob( ); } if (!stringValue(jobEnv.NEMOCLAW_CLI_BIN).includes("bin/nemoclaw.js")) { - errors.push("messaging-compatible-endpoint-vitest job must point NEMOCLAW_CLI_BIN at the repo CLI"); + errors.push( + "messaging-compatible-endpoint-vitest job must point NEMOCLAW_CLI_BIN at the repo CLI", + ); } if (jobEnv.NEMOCLAW_RUN_E2E_SCENARIOS !== "1") { errors.push("messaging-compatible-endpoint-vitest job must set NEMOCLAW_RUN_E2E_SCENARIOS=1"); @@ -1700,7 +1702,9 @@ function validateMessagingCompatibleEndpointVitestJob( if (!checkout) errors.push("messaging-compatible-endpoint-vitest job missing checkout step"); requireFullShaAction(errors, checkout, "messaging-compatible-endpoint-vitest checkout"); if (asRecord(checkout?.with)["persist-credentials"] !== false) { - errors.push("messaging-compatible-endpoint-vitest checkout step must set persist-credentials=false"); + errors.push( + "messaging-compatible-endpoint-vitest checkout step must set persist-credentials=false", + ); } const setupNode = namedStep(steps, "Set up Node"); @@ -1732,7 +1736,9 @@ function validateMessagingCompatibleEndpointVitestJob( "NVIDIA_API_KEY", ); if (runVitestEnv.NEMOCLAW_COMPAT_MOCK_API_KEY !== "fake-compatible-key-e2e") { - errors.push("messaging-compatible-endpoint-vitest step must set a fake compatible endpoint key"); + errors.push( + "messaging-compatible-endpoint-vitest step must set a fake compatible endpoint key", + ); } if (runVitestEnv.TELEGRAM_BOT_TOKEN !== "test-fake-telegram-token-e2e") { errors.push("messaging-compatible-endpoint-vitest step must set a fake Telegram token"); @@ -1759,9 +1765,15 @@ function validateMessagingCompatibleEndpointVitestJob( errors.push("messaging-compatible-endpoint-vitest artifact upload name must be stable"); } const uploadPath = stringValue(uploadWith.path); - requireUploadPathContains(errors, uploadPath, "e2e-artifacts/vitest/messaging-compatible-endpoint/"); + requireUploadPathContains( + errors, + uploadPath, + "e2e-artifacts/vitest/messaging-compatible-endpoint/", + ); if (uploadWith["include-hidden-files"] !== false) { - errors.push("messaging-compatible-endpoint-vitest artifact upload must set include-hidden-files: false"); + errors.push( + "messaging-compatible-endpoint-vitest artifact upload must set include-hidden-files: false", + ); } if (uploadWith["if-no-files-found"] !== "ignore") { errors.push( @@ -2652,6 +2664,191 @@ function validateModelRouterProviderRoutedInferenceVitestJob( requireRunContains(errors, cleanup, 'rm -rf "${DOCKER_CONFIG}"'); } +function validateIssue2478CrashLoopRecoveryVitestJob(errors: string[], jobs: WorkflowRecord): void { + const jobName = "issue-2478-crash-loop-recovery-vitest"; + const scenarioName = "issue-2478-crash-loop-recovery"; + const job = asRecord(jobs[jobName]); + if (Object.keys(job).length === 0) { + errors.push("workflow missing issue-2478-crash-loop-recovery-vitest job"); + return; + } + + if (job["runs-on"] !== "ubuntu-latest") { + errors.push("issue-2478-crash-loop-recovery-vitest job must run on ubuntu-latest"); + } + if (job["timeout-minutes"] !== 30) { + errors.push("issue-2478-crash-loop-recovery-vitest job must keep the 30 minute timeout"); + } + validateFreeStandingJobSelector(errors, jobs, jobName, scenarioName); + + const jobEnv = asRecord(job.env); + if ("DOCKER_CONFIG" in jobEnv) { + errors.push( + "issue-2478-crash-loop-recovery-vitest job must not set DOCKER_CONFIG at job level", + ); + } + const expectedEnv: Record = { + FREE_STANDING_VITEST_JOB: "1", + FREE_STANDING_SCENARIO_ID: scenarioName, + E2E_ARTIFACT_DIR: "${{ github.workspace }}/e2e-artifacts/vitest/issue-2478-crash-loop-recovery", + NEMOCLAW_CLI_BIN: "${{ github.workspace }}/bin/nemoclaw.js", + NEMOCLAW_RUN_E2E_SCENARIOS: "1", + NEMOCLAW_NON_INTERACTIVE: "1", + NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1", + NEMOCLAW_SANDBOX_NAME: "e2e-2478", + OPENSHELL_GATEWAY: "nemoclaw", + }; + for (const [key, value] of Object.entries(expectedEnv)) { + if (jobEnv[key] !== value) { + errors.push(`issue-2478-crash-loop-recovery-vitest job env ${key} must be ${value}`); + } + } + for (const secret of ["NVIDIA_INFERENCE_API_KEY", ...COMMON_SECRET_ENV_NAMES]) { + requireEnvDoesNotExposeSecret( + errors, + "issue-2478-crash-loop-recovery-vitest job", + jobEnv, + secret, + ); + } + + const steps = asSteps(job.steps); + requireNoDispatchInputInterpolation(errors, steps); + for (const step of steps) { + const stepName = `issue-2478-crash-loop-recovery-vitest step '${step.name ?? step.uses ?? ""}'`; + const stepEnv = asRecord(step.env); + requireEnvDoesNotExposeSecret(errors, stepName, stepEnv, "NVIDIA_INFERENCE_API_KEY"); + requireEnvDoesNotExposeSecret(errors, stepName, stepEnv, "NVIDIA_API_KEY"); + if (step.name !== "Authenticate to Docker Hub") { + requireEnvDoesNotExposeSecret(errors, stepName, stepEnv, "DOCKERHUB_USERNAME"); + requireEnvDoesNotExposeSecret(errors, stepName, stepEnv, "DOCKERHUB_TOKEN"); + requireNoDockerHubAuthInRun(errors, stepName, stringValue(step.run)); + } + requireEnvDoesNotExposeSecret(errors, stepName, stepEnv, "GITHUB_TOKEN"); + } + + const checkout = steps.find((step) => stringValue(step.uses).startsWith("actions/checkout@")); + if (!checkout) { + errors.push("issue-2478-crash-loop-recovery-vitest job missing checkout step"); + } + requireFullShaAction(errors, checkout, "issue-2478-crash-loop-recovery-vitest checkout"); + if (asRecord(checkout?.with)["persist-credentials"] !== false) { + errors.push( + "issue-2478-crash-loop-recovery-vitest checkout step must set persist-credentials=false", + ); + } + + const configureDockerAuth = requireJobStep( + errors, + jobName, + steps, + "Configure isolated Docker auth directory", + ); + requireRunContains( + errors, + configureDockerAuth, + 'echo "DOCKER_CONFIG=${RUNNER_TEMP}/docker-config-issue-2478-crash-loop-recovery" >> "$GITHUB_ENV"', + ); + requireRunDoesNotContain(errors, configureDockerAuth, "${{ runner.temp }}"); + requireRunDoesNotContain(errors, configureDockerAuth, "${{ github.workspace }}"); + + const dockerLogin = requireJobStep(errors, jobName, steps, "Authenticate to Docker Hub"); + const dockerLoginEnv = asRecord(dockerLogin?.env); + if (dockerLoginEnv.DOCKERHUB_USERNAME !== "${{ secrets.DOCKERHUB_USERNAME }}") { + errors.push( + "issue-2478-crash-loop-recovery-vitest Docker Hub auth must receive DOCKERHUB_USERNAME from secrets", + ); + } + if (dockerLoginEnv.DOCKERHUB_TOKEN !== "${{ secrets.DOCKERHUB_TOKEN }}") { + errors.push( + "issue-2478-crash-loop-recovery-vitest Docker Hub auth must receive DOCKERHUB_TOKEN from secrets", + ); + } + requireRunContains(errors, dockerLogin, 'mkdir -p "${DOCKER_CONFIG}"'); + requireRunContains(errors, dockerLogin, 'chmod 700 "${DOCKER_CONFIG}"'); + requireRunContains(errors, dockerLogin, "docker login docker.io"); + requireRunContains(errors, dockerLogin, "--password-stdin"); + requireRunContains(errors, dockerLogin, "continuing with anonymous pulls"); + + const setupNode = namedStep(steps, "Set up Node"); + if (!setupNode) { + errors.push("issue-2478-crash-loop-recovery-vitest job missing step: Set up Node"); + } + requireFullShaAction(errors, setupNode, "issue-2478-crash-loop-recovery-vitest setup-node"); + + const installRootDependencies = requireJobStep( + errors, + jobName, + steps, + "Install root dependencies", + ); + requireRunContains(errors, installRootDependencies, "npm ci --ignore-scripts"); + + const buildCli = requireJobStep(errors, jobName, steps, "Build CLI"); + requireRunContains(errors, buildCli, "npm run build:cli"); + + const installOpenShell = requireJobStep(errors, jobName, steps, "Install OpenShell CLI"); + requireRunContains(errors, installOpenShell, "bash scripts/install-openshell.sh"); + + const runVitest = requireJobStep( + errors, + jobName, + steps, + "Run issue #2478 crash-loop recovery live Vitest test", + ); + const runVitestEnv = asRecord(runVitest?.env); + requireEnvDoesNotExposeSecret( + errors, + "issue-2478-crash-loop-recovery-vitest Vitest step", + runVitestEnv, + "NVIDIA_INFERENCE_API_KEY", + ); + requireRunContains(errors, runVitest, "npx vitest run --project e2e-scenarios-live"); + requireRunContains( + errors, + runVitest, + "test/e2e-scenario/live/issue-2478-crash-loop-recovery.test.ts", + ); + + const upload = requireJobStep( + errors, + jobName, + steps, + "Upload issue #2478 crash-loop recovery artifacts", + ); + requireFullShaAction(errors, upload, "issue-2478-crash-loop-recovery-vitest upload-artifact"); + const uploadWith = asRecord(upload?.with); + if (uploadWith.name !== "e2e-vitest-scenarios-issue-2478-crash-loop-recovery") { + errors.push("issue-2478-crash-loop-recovery-vitest artifact upload name must be stable"); + } + const uploadPath = stringValue(uploadWith.path); + requireUploadPathContains( + errors, + uploadPath, + "e2e-artifacts/vitest/issue-2478-crash-loop-recovery/", + ); + if (uploadWith["include-hidden-files"] !== false) { + errors.push( + "issue-2478-crash-loop-recovery-vitest artifact upload must set include-hidden-files: false", + ); + } + if (uploadWith["if-no-files-found"] !== "ignore") { + errors.push( + "issue-2478-crash-loop-recovery-vitest artifact upload must ignore missing fixture artifacts", + ); + } + if (uploadWith["retention-days"] !== 14) { + errors.push("issue-2478-crash-loop-recovery-vitest artifact upload retention-days must be 14"); + } + + const cleanup = requireJobStep(errors, jobName, steps, "Clean up Docker auth"); + if (cleanup?.if !== "always()") { + errors.push("issue-2478-crash-loop-recovery-vitest Docker auth cleanup must always run"); + } + requireRunContains(errors, cleanup, "docker logout docker.io"); + requireRunContains(errors, cleanup, 'rm -rf "${DOCKER_CONFIG}"'); +} + function validateChannelsAddRemoveVitestJob(errors: string[], jobs: WorkflowRecord): void { const jobName = "channels-add-remove-vitest"; const scenarioName = "channels-add-remove"; @@ -3331,7 +3528,14 @@ export function validateE2eVitestScenariosWorkflowBoundary( validateBedrockRuntimeCompatibleAnthropicVitestJob(errors, jobs); - validateFreeStandingJobSelector(errors, jobs, "gateway-health-honest-vitest", "gateway-health-honest"); + validateIssue2478CrashLoopRecoveryVitestJob(errors, jobs); + + validateFreeStandingJobSelector( + errors, + jobs, + "gateway-health-honest-vitest", + "gateway-health-honest", + ); validateChannelsAddRemoveVitestJob(errors, jobs);