From 3322c66620bab9d93be5bab703b067e5d9fb2e4a Mon Sep 17 00:00:00 2001
From: tyeth <tyethgundry@googlemail.com>
Date: Tue, 17 Mar 2026 23:53:13 +0000
Subject: [PATCH] fix(gpu): add WSL2 GPU support and ollama provider on Linux

WSL2 GPU support:
- Add wsl2-gpu-fix.sh that applies CDI mode, libdxcore.so injection,
  and node labeling after gateway start (workaround until OpenShell
  ships native WSL2 support via NVIDIA/OpenShell#411)
- Hook it into both onboard.js (interactive wizard) and setup.sh
  (legacy script) so it runs automatically after gateway creation
- Writes a complete CDI spec from scratch instead of fragile sed
  patching of the nvidia-ctk generated spec

Ollama on Linux:
- setup.sh only created the ollama-local provider on macOS (Darwin)
- Now detects ollama on any platform (Linux/WSL2 included)
- Enables local GPU inference via ollama for WSL2 users

Closes NVIDIA/NemoClaw#TBD
See also: NVIDIA/OpenShell#404, NVIDIA/OpenShell#411
---
 bin/lib/onboard.js |  11 ++++
 scripts/setup.sh   |  38 ++++++++----
 wsl2-gpu-fix.sh    | 145 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 183 insertions(+), 11 deletions(-)
 create mode 100755 wsl2-gpu-fix.sh

diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js
index 8e0f0396f..08b10ca96 100644
--- a/bin/lib/onboard.js
+++ b/bin/lib/onboard.js
@@ -148,6 +148,17 @@ async function startGateway(gpu) {
   // Give DNS a moment to propagate
   require("child_process").spawnSync("sleep", ["5"]);
 
+  // WSL2 GPU fix — CDI mode + libdxcore.so + node label
+  if (gpu && gpu.nimCapable && fs.existsSync("/dev/dxg")) {
+    console.log("  WSL2 detected — applying GPU CDI fixes...");
+    const fixScript = path.join(ROOT, "wsl2-gpu-fix.sh");
+    if (fs.existsSync(fixScript)) {
+      run(`bash "${fixScript}" nemoclaw`, { ignoreError: true });
+    } else {
+      console.log("  Warning: wsl2-gpu-fix.sh not found at " + fixScript);
+      console.log("  GPU sandbox creation may fail on WSL2. See: https://github.com/NVIDIA/OpenShell/issues/404");
+    }
+  }
 }
 
 // ── Step 3: Sandbox ──────────────────────────────────────────────
diff --git a/scripts/setup.sh b/scripts/setup.sh
index 77d24a77a..d507f4b07 100755
--- a/scripts/setup.sh
+++ b/scripts/setup.sh
@@ -88,6 +88,17 @@ for i in 1 2 3 4 5; do
 done
 info "Gateway is healthy"
 
+# 1b. WSL2 GPU fix — CDI mode + libdxcore.so + node label (best-effort, as in onboard.js)
+if [ -c /dev/dxg ] && command -v nvidia-smi > /dev/null 2>&1; then
+  info "WSL2 detected — applying GPU CDI fixes..."
+  WSL2_FIX="${REPO_DIR}/wsl2-gpu-fix.sh"
+  if [ -f "$WSL2_FIX" ]; then  # run through bash below, so the exec bit is not required
+    bash "$WSL2_FIX" nemoclaw || warn "wsl2-gpu-fix.sh failed — GPU sandbox may fail on WSL2"
+  else
+    warn "wsl2-gpu-fix.sh not found at $WSL2_FIX — GPU sandbox may fail on WSL2"
+  fi
+fi
+
 # 2. CoreDNS fix (Colima only)
 if [ -S "$HOME/.colima/default/docker.sock" ]; then
   info "Patching CoreDNS for Colima..."
@@ -113,19 +124,24 @@ if curl -s http://localhost:8000/v1/models > /dev/null 2>&1 || python3 -c "impor
     "OPENAI_BASE_URL=http://host.openshell.internal:8000/v1"
 fi
 
-# 4a. Ollama (macOS local inference)
-if [ "$(uname -s)" = "Darwin" ]; then
-  if ! command -v ollama > /dev/null 2>&1; then
-    info "Installing Ollama..."
-    brew install ollama 2>/dev/null || warn "Ollama install failed (brew required). Install manually: https://ollama.com"
+# 4a. Ollama (local inference — macOS or Linux)
+if command -v ollama > /dev/null 2>&1 || curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
+  if ! curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
+    info "Starting Ollama service..."
+    OLLAMA_HOST=0.0.0.0:11434 ollama serve > /dev/null 2>&1 &
+    sleep 2
   fi
+  upsert_provider \
+    "ollama-local" \
+    "openai" \
+    "OPENAI_API_KEY=ollama" \
+    "OPENAI_BASE_URL=http://host.openshell.internal:11434/v1"
+elif [ "$(uname -s)" = "Darwin" ] && ! command -v ollama > /dev/null 2>&1; then
+  info "Installing Ollama..."
+  brew install ollama 2>/dev/null || warn "Ollama install failed. Install manually: https://ollama.com"
   if command -v ollama > /dev/null 2>&1; then
-    # Start Ollama service if not running
-    if ! curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
-      info "Starting Ollama service..."
-      OLLAMA_HOST=0.0.0.0:11434 ollama serve > /dev/null 2>&1 &
-      sleep 2
-    fi
+    OLLAMA_HOST=0.0.0.0:11434 ollama serve > /dev/null 2>&1 &
+    sleep 2
     upsert_provider \
       "ollama-local" \
       "openai" \
diff --git a/wsl2-gpu-fix.sh b/wsl2-gpu-fix.sh
new file mode 100755
index 000000000..ffb523482
--- /dev/null
+++ b/wsl2-gpu-fix.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+# wsl2-gpu-fix.sh — Apply WSL2 GPU fixes to an OpenShell gateway
+# Run after: openshell gateway start --gpu
+# Usage: ./wsl2-gpu-fix.sh [gateway-name]
+#
+# This script applies the same fixes as the PR (NVIDIA/OpenShell#411)
+# at runtime, until the upstream image ships with WSL2 support.
+
+set -euo pipefail
+
+GATEWAY="${1:-nemoclaw}"
+echo "Applying WSL2 GPU fixes to gateway '$GATEWAY'..."
+
+# Check we're on WSL2 first, so other hosts exit 0 even when no gateway is running
+if [ ! -c /dev/dxg ]; then
+    echo "Not WSL2 (/dev/dxg absent) — no fixes needed"
+    exit 0
+fi
+
+# Check gateway is up (capture output first: `| grep -q` can SIGPIPE the producer, which pipefail reports as failure)
+if ! STATUS=$(openshell status 2>&1) || ! grep -q "Connected" <<< "$STATUS"; then
+    echo "Error: gateway not connected. Start it first: openshell gateway start --gpu --name $GATEWAY" >&2
+    exit 1
+fi
+
+echo "[1/4] Generating CDI spec with GPU UUIDs and libdxcore.so..."
+openshell doctor exec -- sh -c '
+mkdir -p /var/run/cdi
+
+# Gather info — each value is interpolated into the spec below; an empty one yields an invalid spec
+GPU_UUID=$(nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader 2>/dev/null | tr -d " " | head -1)
+DXCORE_PATH=$(find /usr/lib -name "libdxcore.so" 2>/dev/null | head -1)
+DXCORE_DIR=$(dirname "$DXCORE_PATH")  # dirname "" prints "." and succeeds, so emptiness is checked below
+DRIVER_DIR=$(ls -d /usr/lib/wsl/drivers/nv*.inf_amd64_* 2>/dev/null | head -1)
+
+if [ -z "$GPU_UUID" ] || [ -z "$DXCORE_PATH" ] || [ -z "$DRIVER_DIR" ]; then
+    echo "Error: GPU UUID, libdxcore.so or NVIDIA WSL driver store not found (uuid=${GPU_UUID:-none} dxcore=${DXCORE_PATH:-none} drivers=${DRIVER_DIR:-none})" >&2
+    exit 1
+fi
+
+# Write complete CDI spec from scratch (avoids fragile sed patching)
+cat > /var/run/cdi/nvidia.yaml << CDIEOF
+---
+cdiVersion: "0.5.0"
+kind: nvidia.com/gpu
+devices:
+    - name: all
+      containerEdits:
+        deviceNodes:
+            - path: /dev/dxg
+    - name: "${GPU_UUID}"
+      containerEdits:
+        deviceNodes:
+            - path: /dev/dxg
+    - name: "0"
+      containerEdits:
+        deviceNodes:
+            - path: /dev/dxg
+containerEdits:
+    env:
+        - NVIDIA_VISIBLE_DEVICES=void
+    hooks:
+        - hookName: createContainer
+          path: /usr/bin/nvidia-cdi-hook
+          args:
+            - nvidia-cdi-hook
+            - create-symlinks
+            - --link
+            - ${DRIVER_DIR}/nvidia-smi::/usr/bin/nvidia-smi
+          env:
+            - NVIDIA_CTK_DEBUG=false
+        - hookName: createContainer
+          path: /usr/bin/nvidia-cdi-hook
+          args:
+            - nvidia-cdi-hook
+            - update-ldcache
+            - --folder
+            - ${DRIVER_DIR}
+            - --folder
+            - ${DXCORE_DIR}
+          env:
+            - NVIDIA_CTK_DEBUG=false
+    mounts:
+        - hostPath: ${DXCORE_PATH}
+          containerPath: ${DXCORE_PATH}
+          options: [ro, nosuid, nodev, rbind, rprivate]
+        - hostPath: ${DRIVER_DIR}/libcuda.so.1.1
+          containerPath: ${DRIVER_DIR}/libcuda.so.1.1
+          options: [ro, nosuid, nodev, rbind, rprivate]
+        - hostPath: ${DRIVER_DIR}/libcuda_loader.so
+          containerPath: ${DRIVER_DIR}/libcuda_loader.so
+          options: [ro, nosuid, nodev, rbind, rprivate]
+        - hostPath: ${DRIVER_DIR}/libnvdxgdmal.so.1
+          containerPath: ${DRIVER_DIR}/libnvdxgdmal.so.1
+          options: [ro, nosuid, nodev, rbind, rprivate]
+        - hostPath: ${DRIVER_DIR}/libnvidia-ml.so.1
+          containerPath: ${DRIVER_DIR}/libnvidia-ml.so.1
+          options: [ro, nosuid, nodev, rbind, rprivate]
+        - hostPath: ${DRIVER_DIR}/libnvidia-ml_loader.so
+          containerPath: ${DRIVER_DIR}/libnvidia-ml_loader.so
+          options: [ro, nosuid, nodev, rbind, rprivate]
+        - hostPath: ${DRIVER_DIR}/libnvidia-ptxjitcompiler.so.1
+          containerPath: ${DRIVER_DIR}/libnvidia-ptxjitcompiler.so.1
+          options: [ro, nosuid, nodev, rbind, rprivate]
+        - hostPath: ${DRIVER_DIR}/nvcubins.bin
+          containerPath: ${DRIVER_DIR}/nvcubins.bin
+          options: [ro, nosuid, nodev, rbind, rprivate]
+        - hostPath: ${DRIVER_DIR}/nvidia-smi
+          containerPath: ${DRIVER_DIR}/nvidia-smi
+          options: [ro, nosuid, nodev, rbind, rprivate]
+CDIEOF
+
+nvidia-ctk cdi list 2>&1
+'
+
+echo "[2/4] Switching nvidia runtime to CDI mode..."
+openshell doctor exec -- sed -i 's/mode = "auto"/mode = "cdi"/' /etc/nvidia-container-runtime/config.toml
+
+echo "[3/4] Labeling node with NVIDIA PCI vendor..."
+openshell doctor exec -- sh -c '
+NODE=$(kubectl get nodes -o jsonpath="{.items[0].metadata.name}")
+[ -n "$NODE" ] || { echo "Error: no Kubernetes node found to label" >&2; exit 1; }  # fail clearly instead of passing kubectl an empty name
+kubectl label node "$NODE" feature.node.kubernetes.io/pci-10de.present=true --overwrite' 2>&1
+
+echo "[4/4] Waiting for nvidia-device-plugin..."
+for i in {1..60}; do
+    GPU=$(openshell doctor exec -- kubectl get nodes -o jsonpath='{.items[0].status.allocatable.nvidia\.com/gpu}' 2>/dev/null || true)
+    if [[ "$GPU" =~ ^[1-9][0-9]*$ ]]; then  # any positive count is ready; do not assume exactly one GPU
+        echo "GPU ready: nvidia.com/gpu=$GPU"
+        break
+    fi
+    [ "$((i % 10))" = "0" ] && echo "  still waiting ($i/60)..."
+    sleep 2
+done
+
+if [[ ! "$GPU" =~ ^[1-9][0-9]*$ ]]; then  # loop ran out without seeing a positive allocatable count
+    echo "Warning: GPU resource not yet advertised after 120s" >&2
+    echo "Checking device plugin pods..."
+    openshell doctor exec -- kubectl -n nvidia-device-plugin get pods 2>&1
+    exit 1
+fi
+
+echo ""
+echo "WSL2 GPU fixes applied successfully."
+echo "Sandbox creation with --gpu should now work."