Skip to content

Commit d2c25eb

Browse files
committed
Revert: remove the probe-driver-swap fast-feedback job
Added in a3f1573 for fast iteration on install_gpu_driver.sh; no longer needed now that the script has stabilized.
1 parent 8d8a9ef commit d2c25eb

1 file changed

Lines changed: 0 additions & 70 deletions

File tree

.github/workflows/ci.yml

Lines changed: 0 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -413,76 +413,6 @@ jobs:
413 413
with:
414 414
is-release: ${{ github.ref_type == 'tag' }}
415 415

416-
# Fast-feedback probe for changes to ci/tools/install_gpu_driver.sh.
417-
# Allocates one L4 GPU + container, runs the driver swap to a
418-
# hard-coded version, then drives nvmlDeviceSetPersistenceMode via
419-
# raw ctypes -- the *exact* NVML call that cuda.core's
420-
# test_persistence_mode_enabled exercises. Total runtime is ~5 min
421-
# vs. ~30 min for a full test matrix; runs on every PR push so we
422-
# can iterate on `ci/tools/install_gpu_driver.sh` quickly.
423-
probe-driver-swap:
424-
name: Probe custom-DRIVER install
425-
if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
426-
runs-on: "linux-amd64-gpu-l4-latest-1"
427-
timeout-minutes: 15
428-
defaults:
429-
run:
430-
shell: bash --noprofile --norc -xeuo pipefail {0}
431-
container:
432-
options: -u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host
433-
image: ubuntu:22.04
434-
env:
435-
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
436-
steps:
437-
- name: Checkout
438-
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
439-
440-
- name: Install host deps
441-
run: |
442-
apt-get update -qq
443-
apt-get -y install --no-install-recommends util-linux python3
444-
445-
- name: Install GPU driver
446-
env:
447-
DRIVER: '610.43.02'
448-
GPU_TYPE: 'l4'
449-
run: ./ci/tools/install_gpu_driver.sh
450-
451-
- name: Show post-install host + container state
452-
run: |
453-
nvidia-smi --query-gpu=name,driver_version,persistence_mode --format=csv
454-
echo
455-
echo "=== /run/nvidia-persistenced ==="
456-
ls -la /run/nvidia-persistenced/ 2>&1 || echo "MISSING"
457-
echo
458-
echo "=== nvidia-persistenced process ==="
459-
pgrep -laf nvidia-persistenced || echo "(none)"
460-
461-
- name: Drive nvmlDeviceSetPersistenceMode via ctypes
462-
run: |
463-
python3 <<'PYEOF'
464-
import ctypes, sys
465-
NVML_SUCCESS, NVML_ERROR_NO_PERMISSION, NVML_ERROR_UNKNOWN = 0, 4, 999
466-
nvml = ctypes.CDLL("libnvidia-ml.so.1")
467-
assert nvml.nvmlInit_v2() == 0, "nvmlInit_v2 failed"
468-
h = ctypes.c_void_p()
469-
assert nvml.nvmlDeviceGetHandleByIndex_v2(0, ctypes.byref(h)) == 0
470-
m = ctypes.c_uint(99)
471-
nvml.nvmlDeviceGetPersistenceMode(h, ctypes.byref(m))
472-
print(f"current persistence_mode = {m.value} (1=ENABLED, 0=DISABLED)")
473-
ret = nvml.nvmlDeviceSetPersistenceMode(h, 0)
474-
print(f"SET DISABLED -> {ret} # 0=SUCCESS, 4=NO_PERMISSION, 999=UNKNOWN")
475-
if ret == NVML_ERROR_UNKNOWN:
476-
print("FAIL: NVML_ERROR_UNKNOWN -- daemon-down failure mode reproduced", file=sys.stderr)
477-
sys.exit(1)
478-
if ret != NVML_SUCCESS:
479-
print(f"FAIL: unexpected return code {ret}", file=sys.stderr)
480-
sys.exit(1)
481-
# restore
482-
nvml.nvmlDeviceSetPersistenceMode(h, m.value)
483-
print("OK")
484-
PYEOF
485-
486 416
checks:
487 417
name: Check job status
488 418
if: always()

0 commit comments

Comments
 (0)