Draft
11 changes: 11 additions & 0 deletions bin/lib/onboard.js
@@ -148,6 +148,17 @@ async function startGateway(gpu) {
// Give DNS a moment to propagate
require("child_process").spawnSync("sleep", ["5"]);

// WSL2 GPU fix — CDI mode + libdxcore.so + node label
if (gpu && gpu.nimCapable && fs.existsSync("/dev/dxg")) {
console.log(" WSL2 detected — applying GPU CDI fixes...");
const fixScript = path.join(ROOT, "wsl2-gpu-fix.sh");
if (fs.existsSync(fixScript)) {
run(`bash "${fixScript}" nemoclaw`, { ignoreError: true });
Comment on lines +153 to +156 (Contributor)
⚠️ Potential issue | 🟠 Major

Surface failures from the WSL2 fix helper.

The legacy setup path fails fast on this helper, but onboarding ignores a non-zero exit here and keeps going. That makes later WSL2 GPU failures look unrelated to the actual root cause.

Suggested fix
     if (fs.existsSync(fixScript)) {
-      run(`bash "${fixScript}" nemoclaw`, { ignoreError: true });
+      try {
+        run(`bash "${fixScript}" nemoclaw`, { ignoreError: false });
+      } catch {
+        console.log("  Warning: WSL2 GPU fix failed; GPU sandbox creation may fail on WSL2.");
+      }
     } else {
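For readers porting the same guard to a shell setup path, here is a minimal, hypothetical sketch of the warn-but-continue pattern the suggestion implements (the `wsl2_fix` stub stands in for the real helper call and is not part of the PR):

```shell
# Hypothetical stand-in for `bash wsl2-gpu-fix.sh nemoclaw`; it always fails
# here so the warning branch is exercised.
wsl2_fix() { return 1; }

# Surface the failure as a visible warning instead of swallowing it.
if wsl2_fix; then
    msg="WSL2 GPU fixes applied"
else
    msg="Warning: WSL2 GPU fix failed; GPU sandbox creation may fail on WSL2."
fi
echo "$msg"
```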
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@bin/lib/onboard.js` around lines 153 - 156, the WSL2 GPU fix helper is run
with `ignoreError: true`, so failures are swallowed. Change the
run(`bash "${fixScript}" nemoclaw`, { ignoreError: true }) call to propagate
errors instead of ignoring them: remove ignoreError or set it to false, and
make the caller surface a non-zero exit (throw or return an error) so
onboarding fails fast when the wsl2-gpu-fix.sh (fixScript) step fails.

} else {
console.log(" Warning: wsl2-gpu-fix.sh not found at " + fixScript);
console.log(" GPU sandbox creation may fail on WSL2. See: https://github.com/NVIDIA/OpenShell/issues/404");
}
}
}

// ── Step 3: Sandbox ──────────────────────────────────────────────
38 changes: 27 additions & 11 deletions scripts/setup.sh
@@ -88,6 +88,17 @@ for i in 1 2 3 4 5; do
done
info "Gateway is healthy"

# 1b. WSL2 GPU fix — CDI mode + libdxcore.so + node label
if [ -c /dev/dxg ] && command -v nvidia-smi > /dev/null 2>&1; then
info "WSL2 detected — applying GPU CDI fixes..."
WSL2_FIX="${REPO_DIR}/wsl2-gpu-fix.sh"
if [ -x "$WSL2_FIX" ]; then
bash "$WSL2_FIX" nemoclaw
else
warn "wsl2-gpu-fix.sh not found at $WSL2_FIX — GPU sandbox may fail on WSL2"
fi
Comment on lines +94 to +99 (Contributor)
⚠️ Potential issue | 🟠 Major

Don't require the helper to be executable here.

This block already runs the file with bash, so -x is stricter than needed. On WSL2 checkouts from /mnt/c, execute bits are often not preserved, which would skip the workaround on the exact platform this PR targets.

Suggested fix
-  if [ -x "$WSL2_FIX" ]; then
+  if [ -f "$WSL2_FIX" ]; then
     bash "$WSL2_FIX" nemoclaw
   else
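A quick self-contained demonstration of why `-f` is the safer guard (temp file only, no real checkout involved): a file with no execute bit fails `-x` yet runs fine under `bash`, which is exactly what happens on `/mnt/c` checkouts where mode bits are lost.

```shell
tmp=$(mktemp)
printf 'echo ran-anyway\n' > "$tmp"
chmod 644 "$tmp"               # strip execute bits, as /mnt/c mounts often do

# -x would skip the file; -f finds it, and bash runs it regardless of mode bits.
if [ -x "$tmp" ]; then x_guard=runs; else x_guard=skipped; fi
if [ -f "$tmp" ]; then f_guard=$(bash "$tmp"); else f_guard=skipped; fi
echo "-x guard: $x_guard"
echo "-f guard: $f_guard"
rm -f "$tmp"
```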
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/setup.sh` around lines 94 - 99, the script guards the helper with
[ -x "$WSL2_FIX" ], which skips files that lack execute bits even though they
are run via bash. Change the guard to an existence check (e.g., [ -f
"$WSL2_FIX" ] or [ -e "$WSL2_FIX" ]) so the block calls bash "$WSL2_FIX"
nemoclaw whenever the file is present, and otherwise emits the same warn
message.

fi

# 2. CoreDNS fix (Colima only)
if [ -S "$HOME/.colima/default/docker.sock" ]; then
info "Patching CoreDNS for Colima..."
@@ -113,19 +124,24 @@ if curl -s http://localhost:8000/v1/models > /dev/null 2>&1 || python3 -c "impor
"OPENAI_BASE_URL=http://host.openshell.internal:8000/v1"
fi

# 4a. Ollama (macOS local inference)
if [ "$(uname -s)" = "Darwin" ]; then
if ! command -v ollama > /dev/null 2>&1; then
info "Installing Ollama..."
brew install ollama 2>/dev/null || warn "Ollama install failed (brew required). Install manually: https://ollama.com"
# 4a. Ollama (local inference — macOS or Linux)
if command -v ollama > /dev/null 2>&1 || curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
if ! curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
info "Starting Ollama service..."
OLLAMA_HOST=0.0.0.0:11434 ollama serve > /dev/null 2>&1 &
sleep 2
fi
upsert_provider \
"ollama-local" \
"openai" \
"OPENAI_API_KEY=ollama" \
"OPENAI_BASE_URL=http://host.openshell.internal:11434/v1"
elif [ "$(uname -s)" = "Darwin" ] && ! command -v ollama > /dev/null 2>&1; then
info "Installing Ollama..."
brew install ollama 2>/dev/null || warn "Ollama install failed. Install manually: https://ollama.com"
if command -v ollama > /dev/null 2>&1; then
# Start Ollama service if not running
if ! curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
info "Starting Ollama service..."
OLLAMA_HOST=0.0.0.0:11434 ollama serve > /dev/null 2>&1 &
sleep 2
fi
OLLAMA_HOST=0.0.0.0:11434 ollama serve > /dev/null 2>&1 &
sleep 2
upsert_provider \
"ollama-local" \
"openai" \
145 changes: 145 additions & 0 deletions wsl2-gpu-fix.sh
@@ -0,0 +1,145 @@
#!/bin/bash
# wsl2-gpu-fix.sh — Apply WSL2 GPU fixes to an OpenShell gateway
# Run after: openshell gateway start --gpu
# Usage: ./wsl2-gpu-fix.sh [gateway-name]
#
# This script applies the same fixes as the PR (NVIDIA/OpenShell#411)
# at runtime, until the upstream image ships with WSL2 support.

set -euo pipefail

GATEWAY="${1:-nemoclaw}"
echo "Applying WSL2 GPU fixes to gateway '$GATEWAY'..."

# Check gateway is up
if ! openshell status 2>&1 | grep -q "Connected"; then
echo "Error: gateway not connected. Start it first: openshell gateway start --gpu --name $GATEWAY"
exit 1
fi

# Check we're on WSL2
if [ ! -c /dev/dxg ] 2>/dev/null; then
echo "Not WSL2 (/dev/dxg absent) — no fixes needed"
exit 0
fi

echo "[1/4] Generating CDI spec with GPU UUIDs and libdxcore.so..."
openshell doctor exec -- sh -c '
mkdir -p /var/run/cdi

# Gather info
GPU_UUID=$(nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader 2>/dev/null | tr -d " " | head -1)
DXCORE_PATH=$(find /usr/lib -name "libdxcore.so" 2>/dev/null | head -1)
DXCORE_DIR=$(dirname "$DXCORE_PATH" 2>/dev/null || echo "/usr/lib/x86_64-linux-gnu")
DRIVER_DIR=$(ls -d /usr/lib/wsl/drivers/nv*.inf_amd64_* 2>/dev/null | head -1)
Comment on lines +31 to +34 (Contributor)
⚠️ Potential issue | 🔴 Critical

Fail fast when libdxcore.so is not discovered.

DXCORE_PATH is written into the CDI mounts without any validation. If discovery returns empty, the generated spec contains blank mount paths and the script still flips the runtime to cdi, which can leave the gateway in a worse state than before.

Suggested fix
 GPU_UUID=$(nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader 2>/dev/null | tr -d " " | head -1)
 DXCORE_PATH=$(find /usr/lib -name "libdxcore.so" 2>/dev/null | head -1)
-DXCORE_DIR=$(dirname "$DXCORE_PATH" 2>/dev/null || echo "/usr/lib/x86_64-linux-gnu")
+if [ -z "$DXCORE_PATH" ]; then
+    echo "Error: libdxcore.so not found inside gateway"
+    exit 1
+fi
+DXCORE_DIR=$(dirname "$DXCORE_PATH")
 DRIVER_DIR=$(ls -d /usr/lib/wsl/drivers/nv*.inf_amd64_* 2>/dev/null | head -1)

Also applies to: 83-86
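The fail-fast shape being asked for can be sketched in isolation (the search directory below is deliberately nonexistent so the empty-result branch fires; it is not the real discovery path):

```shell
# Discovery that finds nothing yields an empty string, not an error.
DXCORE_PATH=$(find /nonexistent-lib-dir -name 'libdxcore.so' 2>/dev/null | head -1)

# Validate the result before it is ever templated into the CDI spec.
if [ -z "$DXCORE_PATH" ]; then
    status="aborted: libdxcore.so not found"
else
    status="ok: $DXCORE_PATH"
fi
echo "$status"
```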

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@wsl2-gpu-fix.sh` around lines 31 - 34, DXCORE_PATH (and the derived
DXCORE_DIR) is used without validation, which can produce blank CDI mounts and
still switch the runtime. Right after discovery, check that DXCORE_PATH is
non-empty (and readable); if it is not, print a clear error naming what was
searched for and exit non-zero before any CDI mount generation or runtime
change runs (the code that later references DXCORE_DIR/DRIVER_DIR and flips
to cdi must not run). Apply the same validation and early-exit pattern to the
later discovery block around lines 83-86 so the script fails fast instead of
producing invalid mounts.


if [ -z "$DRIVER_DIR" ]; then
echo "Error: no NVIDIA WSL driver store found"
exit 1
fi

# Write complete CDI spec from scratch (avoids fragile sed patching)
cat > /var/run/cdi/nvidia.yaml << CDIEOF
---
cdiVersion: "0.5.0"
kind: nvidia.com/gpu
devices:
- name: all
containerEdits:
deviceNodes:
- path: /dev/dxg
- name: "${GPU_UUID}"
containerEdits:
deviceNodes:
- path: /dev/dxg
- name: "0"
containerEdits:
deviceNodes:
- path: /dev/dxg
containerEdits:
env:
- NVIDIA_VISIBLE_DEVICES=void
hooks:
- hookName: createContainer
path: /usr/bin/nvidia-cdi-hook
args:
- nvidia-cdi-hook
- create-symlinks
- --link
- ${DRIVER_DIR}/nvidia-smi::/usr/bin/nvidia-smi
env:
- NVIDIA_CTK_DEBUG=false
- hookName: createContainer
path: /usr/bin/nvidia-cdi-hook
args:
- nvidia-cdi-hook
- update-ldcache
- --folder
- ${DRIVER_DIR}
- --folder
- ${DXCORE_DIR}
env:
- NVIDIA_CTK_DEBUG=false
mounts:
- hostPath: ${DXCORE_PATH}
containerPath: ${DXCORE_PATH}
options: [ro, nosuid, nodev, rbind, rprivate]
- hostPath: ${DRIVER_DIR}/libcuda.so.1.1
containerPath: ${DRIVER_DIR}/libcuda.so.1.1
options: [ro, nosuid, nodev, rbind, rprivate]
- hostPath: ${DRIVER_DIR}/libcuda_loader.so
containerPath: ${DRIVER_DIR}/libcuda_loader.so
options: [ro, nosuid, nodev, rbind, rprivate]
- hostPath: ${DRIVER_DIR}/libnvdxgdmal.so.1
containerPath: ${DRIVER_DIR}/libnvdxgdmal.so.1
options: [ro, nosuid, nodev, rbind, rprivate]
- hostPath: ${DRIVER_DIR}/libnvidia-ml.so.1
containerPath: ${DRIVER_DIR}/libnvidia-ml.so.1
options: [ro, nosuid, nodev, rbind, rprivate]
- hostPath: ${DRIVER_DIR}/libnvidia-ml_loader.so
containerPath: ${DRIVER_DIR}/libnvidia-ml_loader.so
options: [ro, nosuid, nodev, rbind, rprivate]
- hostPath: ${DRIVER_DIR}/libnvidia-ptxjitcompiler.so.1
containerPath: ${DRIVER_DIR}/libnvidia-ptxjitcompiler.so.1
options: [ro, nosuid, nodev, rbind, rprivate]
- hostPath: ${DRIVER_DIR}/nvcubins.bin
containerPath: ${DRIVER_DIR}/nvcubins.bin
options: [ro, nosuid, nodev, rbind, rprivate]
- hostPath: ${DRIVER_DIR}/nvidia-smi
containerPath: ${DRIVER_DIR}/nvidia-smi
options: [ro, nosuid, nodev, rbind, rprivate]
CDIEOF

nvidia-ctk cdi list 2>&1
'

echo "[2/4] Switching nvidia runtime to CDI mode..."
openshell doctor exec -- sed -i 's/mode = "auto"/mode = "cdi"/' /etc/nvidia-container-runtime/config.toml

echo "[3/4] Labeling node with NVIDIA PCI vendor..."
openshell doctor exec -- sh -c '
NODE=$(kubectl get nodes -o jsonpath="{.items[0].metadata.name}")
kubectl label node $NODE feature.node.kubernetes.io/pci-10de.present=true --overwrite
' 2>&1

echo "[4/4] Waiting for nvidia-device-plugin..."
for i in $(seq 1 60); do
GPU=$(openshell doctor exec -- kubectl get nodes -o jsonpath='{.items[0].status.allocatable.nvidia\.com/gpu}' 2>/dev/null || true)
if [ "$GPU" = "1" ]; then
echo "GPU ready: nvidia.com/gpu=$GPU"
break
fi
[ "$((i % 10))" = "0" ] && echo " still waiting ($i/60)..."
sleep 2
done

if [ "$GPU" != "1" ]; then
echo "Warning: GPU resource not yet advertised after 120s"
echo "Checking device plugin pods..."
openshell doctor exec -- kubectl -n nvidia-device-plugin get pods 2>&1
exit 1
fi
Comment on lines +125 to +141 (Contributor)
⚠️ Potential issue | 🟠 Major

Treat any positive GPU count as ready.

The success check is hard-coded to "1". On WSL2 hosts that expose 2+ GPUs, this loop will hit the timeout and fail even though nvidia.com/gpu is already advertised.

Suggested fix
-    if [ "$GPU" = "1" ]; then
+    if [[ "$GPU" =~ ^[1-9][0-9]*$ ]]; then
         echo "GPU ready: nvidia.com/gpu=$GPU"
         break
     fi
@@
-if [ "$GPU" != "1" ]; then
+if ! [[ "$GPU" =~ ^[1-9][0-9]*$ ]]; then
     echo "Warning: GPU resource not yet advertised after 120s"
     echo "Checking device plugin pods..."
     openshell doctor exec -- kubectl -n nvidia-device-plugin get pods 2>&1
     exit 1
 fi
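The suggested regex can be checked on its own; this sketch accepts any positive decimal count and rejects empty, zero, and non-numeric values (the sample inputs are illustrative, not real kubectl output):

```shell
# Ready iff the allocatable count is a positive integer (no leading zeros).
is_gpu_ready() { [[ "$1" =~ ^[1-9][0-9]*$ ]]; }

for v in "" 0 1 2 16 "garbage"; do
    if is_gpu_ready "$v"; then echo "ready: '$v'"; else echo "not ready: '$v'"; fi
done
```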
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@wsl2-gpu-fix.sh` around lines 125 - 141, the readiness check treats only
the exact string "1" as ready. Update the loop that assigns GPU (from the
openshell doctor exec -- kubectl command) to consider any positive integer as
ready by using a numeric greater-than-zero comparison instead of string
equality with "1"; keep the existing success path that echoes "GPU ready:
nvidia.com/gpu=$GPU" and breaks, and leave the failure branch unchanged.


echo ""
echo "WSL2 GPU fixes applied successfully."
echo "Sandbox creation with --gpu should now work."