From ed447b803591383b8aff2f800e7c85a57da0eda2 Mon Sep 17 00:00:00 2001 From: AnilAltinay Date: Thu, 7 May 2026 10:48:26 -0700 Subject: [PATCH] Add debugging output when only one GPU is detected in CUDA tests. When cudaGetDeviceCount returns 1, print environment variables (CUDA_VISIBLE_DEVICES, NVIDIA_VISIBLE_DEVICES), list /dev/nvidia*, and run nvidia-smi -L. Hopefully this helps diagnose issues where not all expected GPUs are visible to the CUDA application. Right now I cannot reproduce the failure locally. PiperOrigin-RevId: 912043151 --- images/gpu/cuda-tests/list_features.cu | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/images/gpu/cuda-tests/list_features.cu b/images/gpu/cuda-tests/list_features.cu index 6f95cf7f1a..e57de32375 100644 --- a/images/gpu/cuda-tests/list_features.cu +++ b/images/gpu/cuda-tests/list_features.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include "cuda_test_util.h" // NOLINT(build/include) @@ -53,6 +54,22 @@ int main(int argc, char *argv[]) { int gpuCount = -1; CHECK_CUDA(cudaGetDeviceCount(&gpuCount)); printf("// Number of GPUs: %d\n", gpuCount); + if (gpuCount == 1) { + printf("// Warning: Only 1 GPU detected by cudaGetDeviceCount.\n"); + printf("// Debugging info:\n"); + const char* cuda_visible_devices = getenv("CUDA_VISIBLE_DEVICES"); + printf("// CUDA_VISIBLE_DEVICES: %s\n", + cuda_visible_devices ? cuda_visible_devices : "unset"); + const char* nvidia_visible_devices = getenv("NVIDIA_VISIBLE_DEVICES"); + printf("// NVIDIA_VISIBLE_DEVICES: %s\n", + nvidia_visible_devices ? nvidia_visible_devices : "unset"); + printf("// Device nodes:\n"); + fflush(stdout); + system("ls -l /dev/nvidia* 2>&1 | sed 's/^/\\/\\/ /'"); + printf("// nvidia-smi -L output:\n"); + fflush(stdout); + system("nvidia-smi -L 2>&1 | sed 's/^/\\/\\/ /'"); + } if (gpuCount >= 2) { int canAccessAToB = -1; CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessAToB, 0, 1));