@@ -413,76 +413,6 @@ jobs:
413413 with :
414414 is-release : ${{ github.ref_type == 'tag' }}
415415
416- # Fast-feedback probe for changes to ci/tools/install_gpu_driver.sh.
417- # Allocates one L4 GPU + container, runs the driver swap to a
418- # hard-coded version, then drives nvmlDeviceSetPersistenceMode via
419- # raw ctypes -- the *exact* NVML call that cuda.core's
420- # test_persistence_mode_enabled exercises. Total runtime is ~5 min
421- # vs. ~30 min for a full test matrix; runs on every PR push so we
422- # can iterate on `ci/tools/install_gpu_driver.sh` quickly.
423- probe-driver-swap :
424- name : Probe custom-DRIVER install
425- if : ${{ github.repository_owner == 'nvidia' && !cancelled() }}
426- runs-on : " linux-amd64-gpu-l4-latest-1"
427- timeout-minutes : 15
428- defaults :
429- run :
430- shell : bash --noprofile --norc -xeuo pipefail {0}
431- container :
432- options : -u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host
433- image : ubuntu:22.04
434- env :
435- NVIDIA_VISIBLE_DEVICES : ${{ env.NVIDIA_VISIBLE_DEVICES }}
436- steps :
437- - name : Checkout
438- uses : actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
439-
440- - name : Install host deps
441- run : |
442- apt-get update -qq
443- apt-get -y install --no-install-recommends util-linux python3
444-
445- - name : Install GPU driver
446- env :
447- DRIVER : ' 610.43.02'
448- GPU_TYPE : ' l4'
449- run : ./ci/tools/install_gpu_driver.sh
450-
451- - name : Show post-install host + container state
452- run : |
453- nvidia-smi --query-gpu=name,driver_version,persistence_mode --format=csv
454- echo
455- echo "=== /run/nvidia-persistenced ==="
456- ls -la /run/nvidia-persistenced/ 2>&1 || echo "MISSING"
457- echo
458- echo "=== nvidia-persistenced process ==="
459- pgrep -laf nvidia-persistenced || echo "(none)"
460-
461- - name : Drive nvmlDeviceSetPersistenceMode via ctypes
462- run : |
463- python3 <<'PYEOF'
464- import ctypes, sys
465- NVML_SUCCESS, NVML_ERROR_NO_PERMISSION, NVML_ERROR_UNKNOWN = 0, 4, 999
466- nvml = ctypes.CDLL("libnvidia-ml.so.1")
467- assert nvml.nvmlInit_v2() == 0, "nvmlInit_v2 failed"
468- h = ctypes.c_void_p()
469- assert nvml.nvmlDeviceGetHandleByIndex_v2(0, ctypes.byref(h)) == 0
470- m = ctypes.c_uint(99)
471- nvml.nvmlDeviceGetPersistenceMode(h, ctypes.byref(m))
472- print(f"current persistence_mode = {m.value} (1=ENABLED, 0=DISABLED)")
473- ret = nvml.nvmlDeviceSetPersistenceMode(h, 0)
474- print(f"SET DISABLED -> {ret} # 0=SUCCESS, 4=NO_PERMISSION, 999=UNKNOWN")
475- if ret == NVML_ERROR_UNKNOWN:
476- print("FAIL: NVML_ERROR_UNKNOWN -- daemon-down failure mode reproduced", file=sys.stderr)
477- sys.exit(1)
478- if ret != NVML_SUCCESS:
479- print(f"FAIL: unexpected return code {ret}", file=sys.stderr)
480- sys.exit(1)
481- # restore
482- nvml.nvmlDeviceSetPersistenceMode(h, m.value)
483- print("OK")
484- PYEOF
485-
486416 checks :
487417 name : Check job status
488418 if : always()
0 commit comments