From 3322c66620bab9d93be5bab703b067e5d9fb2e4a Mon Sep 17 00:00:00 2001
From: tyeth
Date: Tue, 17 Mar 2026 23:53:13 +0000
Subject: [PATCH] fix(gpu): add WSL2 GPU support and ollama provider on Linux

WSL2 GPU support:
- Add wsl2-gpu-fix.sh that applies CDI mode, libdxcore.so injection, and
  node labeling after gateway start (workaround until OpenShell ships
  native WSL2 support via NVIDIA/OpenShell#411)
- Hook it into both onboard.js (interactive wizard) and setup.sh (legacy
  script) so it runs automatically after gateway creation
- Writes a complete CDI spec from scratch instead of fragile sed patching
  of the nvidia-ctk generated spec

Ollama on Linux:
- setup.sh only created the ollama-local provider on macOS (Darwin)
- Now detects ollama on any platform (Linux/WSL2 included)
- Enables local GPU inference via ollama for WSL2 users

Closes NVIDIA/NemoClaw#TBD
See also: NVIDIA/OpenShell#404, NVIDIA/OpenShell#411
---
 bin/lib/onboard.js |  11 +++
 scripts/setup.sh   |  38 +++++++---
 wsl2-gpu-fix.sh    | 168 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 206 insertions(+), 11 deletions(-)
 create mode 100755 wsl2-gpu-fix.sh

diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js
index 8e0f0396f..08b10ca96 100644
--- a/bin/lib/onboard.js
+++ b/bin/lib/onboard.js
@@ -148,6 +148,17 @@ async function startGateway(gpu) {
 
   // Give DNS a moment to propagate
   require("child_process").spawnSync("sleep", ["5"]);
+  // WSL2 GPU fix — CDI mode + libdxcore.so + node label
+  if (gpu && gpu.nimCapable && fs.existsSync("/dev/dxg")) {
+    console.log(" WSL2 detected — applying GPU CDI fixes...");
+    const fixScript = path.join(ROOT, "wsl2-gpu-fix.sh");
+    if (fs.existsSync(fixScript)) {
+      run(`bash "${fixScript}" nemoclaw`, { ignoreError: true });
+    } else {
+      console.log(" Warning: wsl2-gpu-fix.sh not found at " + fixScript);
+      console.log(" GPU sandbox creation may fail on WSL2. See: https://github.com/NVIDIA/OpenShell/issues/404");
+    }
+  }
 }
 
 // ── Step 3: Sandbox ──────────────────────────────────────────────
diff --git a/scripts/setup.sh b/scripts/setup.sh
index 77d24a77a..d507f4b07 100755
--- a/scripts/setup.sh
+++ b/scripts/setup.sh
@@ -88,6 +88,17 @@ for i in 1 2 3 4 5; do
 done
 info "Gateway is healthy"
 
+# 1b. WSL2 GPU fix — CDI mode + libdxcore.so + node label (best-effort, like onboard.js)
+if [ -c /dev/dxg ] && command -v nvidia-smi > /dev/null 2>&1; then
+  info "WSL2 detected — applying GPU CDI fixes..."
+  WSL2_FIX="${REPO_DIR}/wsl2-gpu-fix.sh"
+  if [ -f "$WSL2_FIX" ]; then
+    bash "$WSL2_FIX" nemoclaw || warn "WSL2 GPU fix failed — GPU sandbox may fail on WSL2"
+  else
+    warn "wsl2-gpu-fix.sh not found at $WSL2_FIX — GPU sandbox may fail on WSL2"
+  fi
+fi
+
 # 2. CoreDNS fix (Colima only)
 if [ -S "$HOME/.colima/default/docker.sock" ]; then
   info "Patching CoreDNS for Colima..."
@@ -113,19 +124,24 @@ if curl -s http://localhost:8000/v1/models > /dev/null 2>&1 || python3 -c "impor
     "OPENAI_BASE_URL=http://host.openshell.internal:8000/v1"
 fi
 
-# 4a. Ollama (macOS local inference)
-if [ "$(uname -s)" = "Darwin" ]; then
-  if ! command -v ollama > /dev/null 2>&1; then
-    info "Installing Ollama..."
-    brew install ollama 2>/dev/null || warn "Ollama install failed (brew required). Install manually: https://ollama.com"
+# 4a. Ollama (local inference — macOS or Linux)
+if command -v ollama > /dev/null 2>&1 || curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
+  if ! curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
+    info "Starting Ollama service..."
+    OLLAMA_HOST=0.0.0.0:11434 ollama serve > /dev/null 2>&1 &
+    sleep 2
   fi
+  upsert_provider \
+    "ollama-local" \
+    "openai" \
+    "OPENAI_API_KEY=ollama" \
+    "OPENAI_BASE_URL=http://host.openshell.internal:11434/v1"
+elif [ "$(uname -s)" = "Darwin" ] && ! command -v ollama > /dev/null 2>&1; then
+  info "Installing Ollama..."
+  brew install ollama 2>/dev/null || warn "Ollama install failed. Install manually: https://ollama.com"
   if command -v ollama > /dev/null 2>&1; then
-    # Start Ollama service if not running
-    if ! curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
-      info "Starting Ollama service..."
-      OLLAMA_HOST=0.0.0.0:11434 ollama serve > /dev/null 2>&1 &
-      sleep 2
-    fi
+    OLLAMA_HOST=0.0.0.0:11434 ollama serve > /dev/null 2>&1 &
+    sleep 2
     upsert_provider \
       "ollama-local" \
       "openai" \
diff --git a/wsl2-gpu-fix.sh b/wsl2-gpu-fix.sh
new file mode 100755
index 000000000..ffb523482
--- /dev/null
+++ b/wsl2-gpu-fix.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+# wsl2-gpu-fix.sh — Apply WSL2 GPU fixes to an OpenShell gateway
+# Run after: openshell gateway start --gpu
+# Usage: ./wsl2-gpu-fix.sh [gateway-name]
+#
+# This script applies the same fixes as the PR (NVIDIA/OpenShell#411)
+# at runtime, until the upstream image ships with WSL2 support.
+
+set -euo pipefail
+
+GATEWAY="${1:-nemoclaw}"
+
+# Check we're on WSL2 first, so non-WSL2 hosts exit cleanly without a gateway
+if [ ! -c /dev/dxg ] 2>/dev/null; then
+  echo "Not WSL2 (/dev/dxg absent) — no fixes needed"
+  exit 0
+fi
+
+echo "Applying WSL2 GPU fixes to gateway '$GATEWAY'..."
+
+# Check gateway is up. grep reads all of its input (no -q) so that
+# "openshell status" cannot die of SIGPIPE and trip pipefail on a match.
+if ! openshell status 2>&1 | grep "Connected" > /dev/null; then
+  echo "Error: gateway not connected. Start it first: openshell gateway start --gpu --name $GATEWAY"
+  exit 1
+fi
+
+echo "[1/4] Generating CDI spec with GPU UUIDs and libdxcore.so..."
+openshell doctor exec -- sh -c '
+mkdir -p /var/run/cdi
+
+# Gather info
+GPU_UUID=$(nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader 2>/dev/null | tr -d " " | head -1)
+DXCORE_PATH=$(find /usr/lib -name "libdxcore.so" 2>/dev/null | head -1)
+DRIVER_DIR=$(ls -d /usr/lib/wsl/drivers/nv*.inf_amd64_* 2>/dev/null | head -1)
+
+if [ -z "$DRIVER_DIR" ]; then
+  echo "Error: no NVIDIA WSL driver store found"
+  exit 1
+fi
+
+# dirname "" prints "." and succeeds, so a missing library must be caught here
+if [ -z "$DXCORE_PATH" ]; then
+  echo "Error: libdxcore.so not found under /usr/lib"
+  exit 1
+fi
+DXCORE_DIR=$(dirname "$DXCORE_PATH")
+
+# An empty device name would make the generated CDI spec invalid
+if [ -z "$GPU_UUID" ]; then
+  echo "Error: could not determine GPU UUID from nvidia-smi"
+  exit 1
+fi
+
+# Write complete CDI spec from scratch (avoids fragile sed patching)
+cat > /var/run/cdi/nvidia.yaml << CDIEOF
+---
+cdiVersion: "0.5.0"
+kind: nvidia.com/gpu
+devices:
+  - name: all
+    containerEdits:
+      deviceNodes:
+        - path: /dev/dxg
+  - name: "${GPU_UUID}"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/dxg
+  - name: "0"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/dxg
+containerEdits:
+  env:
+    - NVIDIA_VISIBLE_DEVICES=void
+  hooks:
+    - hookName: createContainer
+      path: /usr/bin/nvidia-cdi-hook
+      args:
+        - nvidia-cdi-hook
+        - create-symlinks
+        - --link
+        - ${DRIVER_DIR}/nvidia-smi::/usr/bin/nvidia-smi
+      env:
+        - NVIDIA_CTK_DEBUG=false
+    - hookName: createContainer
+      path: /usr/bin/nvidia-cdi-hook
+      args:
+        - nvidia-cdi-hook
+        - update-ldcache
+        - --folder
+        - ${DRIVER_DIR}
+        - --folder
+        - ${DXCORE_DIR}
+      env:
+        - NVIDIA_CTK_DEBUG=false
+  mounts:
+    - hostPath: ${DXCORE_PATH}
+      containerPath: ${DXCORE_PATH}
+      options: [ro, nosuid, nodev, rbind, rprivate]
+    - hostPath: ${DRIVER_DIR}/libcuda.so.1.1
+      containerPath: ${DRIVER_DIR}/libcuda.so.1.1
+      options: [ro, nosuid, nodev, rbind, rprivate]
+    - hostPath: ${DRIVER_DIR}/libcuda_loader.so
+      containerPath: ${DRIVER_DIR}/libcuda_loader.so
+      options: [ro, nosuid, nodev, rbind, rprivate]
+    - hostPath: ${DRIVER_DIR}/libnvdxgdmal.so.1
+      containerPath: ${DRIVER_DIR}/libnvdxgdmal.so.1
+      options: [ro, nosuid, nodev, rbind, rprivate]
+    - hostPath: ${DRIVER_DIR}/libnvidia-ml.so.1
+      containerPath: ${DRIVER_DIR}/libnvidia-ml.so.1
+      options: [ro, nosuid, nodev, rbind, rprivate]
+    - hostPath: ${DRIVER_DIR}/libnvidia-ml_loader.so
+      containerPath: ${DRIVER_DIR}/libnvidia-ml_loader.so
+      options: [ro, nosuid, nodev, rbind, rprivate]
+    - hostPath: ${DRIVER_DIR}/libnvidia-ptxjitcompiler.so.1
+      containerPath: ${DRIVER_DIR}/libnvidia-ptxjitcompiler.so.1
+      options: [ro, nosuid, nodev, rbind, rprivate]
+    - hostPath: ${DRIVER_DIR}/nvcubins.bin
+      containerPath: ${DRIVER_DIR}/nvcubins.bin
+      options: [ro, nosuid, nodev, rbind, rprivate]
+    - hostPath: ${DRIVER_DIR}/nvidia-smi
+      containerPath: ${DRIVER_DIR}/nvidia-smi
+      options: [ro, nosuid, nodev, rbind, rprivate]
+CDIEOF
+
+nvidia-ctk cdi list 2>&1
+'
+
+echo "[2/4] Switching nvidia runtime to CDI mode..."
+openshell doctor exec -- sed -i 's/mode = "auto"/mode = "cdi"/' /etc/nvidia-container-runtime/config.toml
+
+echo "[3/4] Labeling node with NVIDIA PCI vendor..."
+openshell doctor exec -- sh -c '
+NODE=$(kubectl get nodes -o jsonpath="{.items[0].metadata.name}")
+kubectl label node "$NODE" feature.node.kubernetes.io/pci-10de.present=true --overwrite
+' 2>&1
+
+# True when $1 is a positive integer (node advertises at least one GPU)
+gpu_ready() {
+  case "$1" in
+    ''|*[!0-9]*) return 1 ;;
+  esac
+  [ "$1" -gt 0 ]
+}
+
+echo "[4/4] Waiting for nvidia-device-plugin..."
+GPU=""
+for i in $(seq 1 60); do
+  GPU=$(openshell doctor exec -- kubectl get nodes -o jsonpath='{.items[0].status.allocatable.nvidia\.com/gpu}' 2>/dev/null || true)
+  if gpu_ready "$GPU"; then
+    echo "GPU ready: nvidia.com/gpu=$GPU"
+    break
+  fi
+  [ "$((i % 10))" = "0" ] && echo " still waiting ($i/60)..."
+  sleep 2
+done
+
+if ! gpu_ready "$GPU"; then
+  echo "Warning: GPU resource not yet advertised after 120s"
+  echo "Checking device plugin pods..."
+  openshell doctor exec -- kubectl -n nvidia-device-plugin get pods 2>&1
+  exit 1
+fi
+
+echo ""
+echo "WSL2 GPU fixes applied successfully."
+echo "Sandbox creation with --gpu should now work."