From 68af1feeacc9364921b1b0817b763110ffc7f779 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Thu, 16 Apr 2026 16:36:45 -0700 Subject: [PATCH 1/2] Expand cudacompat hook to always inspect libcuda.so ELF header if available This commit reverts https://github.com/NVIDIA/nvidia-container-toolkit/commit/08bf5839aeeb0e940fd6dcb53a570d808406efbc which made it so that the libcuda.so ELF header was only ever inspected on certain Tegra systems, like Orin. We now leverage the libcuda.so ELF header in all cases (if available) to determine if the CUDA compat libraries bundled in the container should be used in favor of the host driver libraries. This allows us to support CUDA minor version compatibility. Signed-off-by: Christopher Desiniotis --- .../cudacompat/cuda-elf-header.go | 26 +++- .../cudacompat/cuda-elf-header_test.go | 121 +++++++++++++++++- cmd/nvidia-cdi-hook/cudacompat/cudacompat.go | 21 +-- .../cudacompat/cudacompat_test.go | 93 ++++++++++++++ .../{ => 575.57.08}/libcuda.so.575.57.08 | Bin .../{ => 590.44.01}/libcuda.so.590.44.01 | Bin 6 files changed, 245 insertions(+), 16 deletions(-) rename testdata/compat/{ => 575.57.08}/libcuda.so.575.57.08 (100%) rename testdata/compat/{ => 590.44.01}/libcuda.so.590.44.01 (100%) diff --git a/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header.go b/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header.go index d94652de8..b71c0c78b 100644 --- a/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header.go +++ b/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header.go @@ -24,6 +24,7 @@ import ( "encoding/json" "fmt" "os" + "slices" "strings" "golang.org/x/mod/semver" @@ -118,14 +119,31 @@ func getCUDAFwdCompatibilitySection(lib *elf.File) *elf.Section { // UseCompat checks whether the CUDA compat libraries with the specified elf // header should be used given the specified host versions. -// This is done by comparing the host CUDA version with the CUDA version -// specified in the ELF header. 
-func (h *compatElfHeader) UseCompat(hostCUDAVersion string) bool { +// If the host driver version is specified, we check if the driver version +// is supported in the ELF header. If no host driver version is provided, we +// fall back to checking the CUDA version specified in the ELF header. +func (h *compatElfHeader) UseCompat(compatDriverVersion string, hostDriverVersion string, hostCUDAVersion string) bool { if h == nil { return false } - return h.CUDAVersion.UseCompat(hostCUDAVersion) + if compatDriverVersion == "" || hostDriverVersion == "" { + if hostCUDAVersion != "" { + return h.CUDAVersion.UseCompat(hostCUDAVersion) + } + return false + } + + hostDriverMajor, err := extractMajorVersion(hostDriverVersion) + if err != nil { + return false + } + + if !slices.Contains(h.Driver, hostDriverMajor) { + return false + } + + return semver.Compare(normalizeVersion(compatDriverVersion), normalizeVersion(hostDriverVersion)) > 0 } type cudaVersion string diff --git a/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header_test.go b/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header_test.go index c1c69e201..a2f081774 100644 --- a/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header_test.go +++ b/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header_test.go @@ -38,8 +38,8 @@ func TestGetCUDACompatElfHeader(t *testing.T) { expected *compatElfHeader }{ { - description: "wip", - filename: "libcuda.so.575.57.08", + description: "575.57.08", + filename: "575.57.08/libcuda.so.575.57.08", expected: &compatElfHeader{ Format: 1, CUDAVersion: "12.9", @@ -48,8 +48,8 @@ func TestGetCUDACompatElfHeader(t *testing.T) { }, }, { - description: "wip", - filename: "libcuda.so.590.44.01", + description: "590.44.01", + filename: "590.44.01/libcuda.so.590.44.01", expected: &compatElfHeader{ Format: 1, CUDAVersion: "13.1", @@ -70,3 +70,116 @@ func TestGetCUDACompatElfHeader(t *testing.T) { }) } } + +func TestUseCompat(t *testing.T) { + testCases := []struct { + description string + elfHeader *compatElfHeader + 
compatDriverVersion string + hostDriverVersion string + hostCudaVersion string + expected bool + }{ + { + description: "container cuda version greater than host cuda version", + elfHeader: &compatElfHeader{ + Format: 1, + CUDAVersion: "12.9", + Driver: []int{535, 550, 560, 565, 570, 575}, + Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14}, + }, + hostCudaVersion: "12.8", + expected: true, + }, + { + description: "container cuda version same as host cuda version", + elfHeader: &compatElfHeader{ + Format: 1, + CUDAVersion: "12.9", + Driver: []int{535, 550, 560, 565, 570, 575}, + Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14}, + }, + hostCudaVersion: "12.9", + expected: false, + }, + { + description: "container cuda version less than host cuda version", + elfHeader: &compatElfHeader{ + Format: 1, + CUDAVersion: "12.9", + Driver: []int{535, 550, 560, 565, 570, 575}, + Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14}, + }, + hostCudaVersion: "12.10", + expected: false, + }, + { + description: "host driver branch not supported in compat elf header", + elfHeader: &compatElfHeader{ + Format: 1, + CUDAVersion: "12.9", + Driver: []int{535, 550, 560, 565, 570, 575}, + Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14}, + }, + compatDriverVersion: "575.57.08", + hostDriverVersion: "590.44.01", + expected: false, + }, + { + description: "host driver branch supported in compat elf header, host driver branch < compat driver branch", + elfHeader: &compatElfHeader{ + Format: 1, + CUDAVersion: "12.9", + Driver: []int{535, 550, 560, 565, 570, 575}, + Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14}, + }, + compatDriverVersion: "575.57.08", + hostDriverVersion: "570.211.01", + expected: true, + }, + { + description: "host driver branch same as compat driver branch, compat driver > host driver", + elfHeader: &compatElfHeader{ + Format: 1, + CUDAVersion: "12.9", + Driver: []int{535, 550, 560, 565, 570, 575}, + Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14}, + }, + compatDriverVersion: 
"575.57.08", + hostDriverVersion: "575.10.10", + expected: true, + }, + { + description: "host driver branch same as compat driver branch, compat driver = host driver", + elfHeader: &compatElfHeader{ + Format: 1, + CUDAVersion: "12.9", + Driver: []int{535, 550, 560, 565, 570, 575}, + Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14}, + }, + compatDriverVersion: "575.57.08", + hostDriverVersion: "575.57.08", + expected: false, + }, + { + description: "host driver branch same as compat driver branch, compat driver < host driver", + elfHeader: &compatElfHeader{ + Format: 1, + CUDAVersion: "12.9", + Driver: []int{535, 550, 560, 565, 570, 575}, + Device: []int{1, 2, 7, 8, 9, 10, 11, 12, 13, 14}, + }, + compatDriverVersion: "575.57.08", + hostDriverVersion: "575.99.99", + expected: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + useCompat := tc.elfHeader.UseCompat(tc.compatDriverVersion, tc.hostDriverVersion, tc.hostCudaVersion) + + require.EqualValues(t, tc.expected, useCompat) + }) + } +} diff --git a/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go b/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go index deadbdb69..9ec404236 100644 --- a/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go +++ b/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go @@ -177,16 +177,19 @@ func (m command) getContainerForwardCompatDir(containerRoot containerRoot, o *op } func (m command) useCompatLibraries(libcudaCompatPath string, hostDriverVersion string, hostCUDAVersion string) (bool, error) { + // First check the ELF header of the libcuda.so included in the compat directory. + // If this is present, we use the ELF header to determine whether the CUDA compat + // libraries in the container should be used over the host driver libraries. 
+ compatDriverVersion := strings.TrimPrefix(filepath.Base(libcudaCompatPath), "libcuda.so.") + cudaCompatHeader, _ := GetCUDACompatElfHeader(libcudaCompatPath) + if cudaCompatHeader != nil { + return cudaCompatHeader.UseCompat(compatDriverVersion, hostDriverVersion, hostCUDAVersion), nil + } + // If the host CUDA version is specified, we need to inspect the ELF header // of the compat libraries in the container to determine whether these - // should be used. + // should be used. Return early if we cannot read the ELF header. if hostCUDAVersion != "" { - cudaCompatHeader, _ := GetCUDACompatElfHeader(libcudaCompatPath) - if cudaCompatHeader != nil { - return cudaCompatHeader.UseCompat(hostCUDAVersion), nil - } - // If we were unable to read the CUDA header, we do not use the compat - // libraries. return false, nil } @@ -196,12 +199,14 @@ func (m command) useCompatLibraries(libcudaCompatPath string, hostDriverVersion return false, nil } + // If we reach this point, it means we could not read the ELF header but + // the host driver version is specified. We fall back to comparing the major + // versions of the host driver and compat driver. 
driverMajor, err := extractMajorVersion(hostDriverVersion) if err != nil { return false, fmt.Errorf("failed to extract major version from %q: %v", hostDriverVersion, err) } - compatDriverVersion := strings.TrimPrefix(filepath.Base(libcudaCompatPath), "libcuda.so.") compatMajor, err := extractMajorVersion(compatDriverVersion) if err != nil { return false, fmt.Errorf("failed to extract major version from %q: %v", compatDriverVersion, err) diff --git a/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go b/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go index 5dbc6a253..22f8f6bf5 100644 --- a/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go +++ b/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go @@ -24,6 +24,8 @@ import ( testlog "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/require" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/test" ) func TestCompatLibs(t *testing.T) { @@ -172,6 +174,97 @@ func TestCompatLibs(t *testing.T) { } } +func TestCompatLibsWithElfHeader(t *testing.T) { + logger, _ := testlog.NewNullLogger() + moduleRoot, err := test.GetModuleRoot() + require.NoError(t, err) + + dataRoot := filepath.Join(moduleRoot, "testdata") + + testCases := []struct { + description string + options options + expectedContainerForwardCompatDir string + }{ + { + description: "container cuda version greater than host cuda version", + options: options{ + cudaCompatContainerRoot: "compat/575.57.08", + hostCudaVersion: "12.8", + }, + expectedContainerForwardCompatDir: "/compat/575.57.08", + }, + { + description: "container cuda version same as host cuda version", + options: options{ + cudaCompatContainerRoot: "compat/575.57.08", + hostCudaVersion: "12.9", + }, + expectedContainerForwardCompatDir: "", + }, + { + description: "container cuda version less than host cuda version", + options: options{ + cudaCompatContainerRoot: "compat/575.57.08", + hostCudaVersion: "12.10", + }, + expectedContainerForwardCompatDir: "", + }, + { + description: 
"host driver branch not supported in compat elf header", + options: options{ + cudaCompatContainerRoot: "compat/575.57.08", + hostDriverVersion: "590.44.01", + }, + expectedContainerForwardCompatDir: "", + }, + { + description: "host driver branch supported in compat elf header, host driver branch < compat driver branch", + options: options{ + cudaCompatContainerRoot: "compat/575.57.08", + hostDriverVersion: "570.211.01", + }, + expectedContainerForwardCompatDir: "/compat/575.57.08", + }, + { + description: "host driver branch same as compat driver branch, compat driver > host driver", + options: options{ + cudaCompatContainerRoot: "compat/575.57.08", + hostDriverVersion: "575.10.10", + }, + expectedContainerForwardCompatDir: "/compat/575.57.08", + }, + { + description: "host driver branch same as compat driver branch, compat driver = host driver", + options: options{ + cudaCompatContainerRoot: "compat/575.57.08", + hostDriverVersion: "575.57.08", + }, + expectedContainerForwardCompatDir: "", + }, + { + description: "host driver branch same as compat driver branch, compat driver < host driver", + options: options{ + cudaCompatContainerRoot: "compat/575.57.08", + hostDriverVersion: "575.99.99", + }, + expectedContainerForwardCompatDir: "", + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + containerRootDir := dataRoot + c := command{ + logger: logger, + } + containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), &tc.options) + require.NoError(t, err) + require.EqualValues(t, tc.expectedContainerForwardCompatDir, containerForwardCompatDir) + }) + } +} + func TestUpdateLdconfig(t *testing.T) { logger, _ := testlog.NewNullLogger() testCases := []struct { diff --git a/testdata/compat/libcuda.so.575.57.08 b/testdata/compat/575.57.08/libcuda.so.575.57.08 similarity index 100% rename from testdata/compat/libcuda.so.575.57.08 rename to testdata/compat/575.57.08/libcuda.so.575.57.08 diff --git 
a/testdata/compat/libcuda.so.590.44.01 b/testdata/compat/590.44.01/libcuda.so.590.44.01 similarity index 100% rename from testdata/compat/libcuda.so.590.44.01 rename to testdata/compat/590.44.01/libcuda.so.590.44.01 From d8b823d9b7a1dd9fbf9a0f3eb2427b97b19d5b56 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Tue, 21 Apr 2026 08:42:49 -0700 Subject: [PATCH 2/2] [cudacompat] update version comparison to account for leading zeros in version strings Any version string passed as an argument to semver.Compare() must be a valid semantic version. It is common for NVIDIA driver versions to have leading zeros in the MINOR or PATCH portion of a version string, e.g. 575.57.08. As a result, a call to semver.Compare("v575.57.08", "v575.10.10") would incorrectly return -1 because the first argument is not a valid semantic version. And from https://pkg.go.dev/golang.org/x/mod/semver#Compare: ''' An invalid semantic version string is considered less than a valid one. All invalid semantic version strings compare equal to each other. 
''' Signed-off-by: Christopher Desiniotis --- .../cudacompat/cuda-elf-header.go | 25 +++++- .../cudacompat/cuda-elf-header_test.go | 90 +++++++++++++++++++ 2 files changed, 112 insertions(+), 3 deletions(-) diff --git a/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header.go b/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header.go index b71c0c78b..eb416da6f 100644 --- a/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header.go +++ b/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header.go @@ -143,7 +143,7 @@ func (h *compatElfHeader) UseCompat(compatDriverVersion string, hostDriverVersio return false } - return semver.Compare(normalizeVersion(compatDriverVersion), normalizeVersion(hostDriverVersion)) > 0 + return compareVersions(compatDriverVersion, hostDriverVersion) > 0 } type cudaVersion string @@ -155,9 +155,28 @@ func (containerVersion cudaVersion) UseCompat(hostVersion string) bool { return false } - return semver.Compare(normalizeVersion(containerVersion), normalizeVersion(hostVersion)) > 0 + return compareVersions(containerVersion, hostVersion) > 0 } +func compareVersions[T string | cudaVersion, O string | cudaVersion](this T, other O) int { + return semver.Compare(normalizeVersion(this), normalizeVersion(other)) +} + +// normalizeVersion converts the given version into a valid semantic version. +// This function will always return a string in the format of vMAJOR.MINOR.PATCH +// It accounts for version strings that have leading zeros, which is common +// in NVIDIA driver version strings. 
For example, 570.211.01 will be converted to +// v570.211.1 func normalizeVersion[T string | cudaVersion](v T) string { - return "v" + strings.TrimPrefix(string(v), "v") + majorMinorPatch := []string{"0", "0", "0"} + versionParts := strings.SplitN(strings.TrimPrefix(string(v), "v"), ".", 3) + for i, versionPart := range versionParts { + trimmed := strings.TrimLeft(versionPart, "0") + if trimmed == "" { + trimmed = "0" + } + majorMinorPatch[i] = trimmed + } + + return "v" + strings.Join(majorMinorPatch, ".") } diff --git a/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header_test.go b/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header_test.go index a2f081774..ee2ee2fa0 100644 --- a/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header_test.go +++ b/cmd/nvidia-cdi-hook/cudacompat/cuda-elf-header_test.go @@ -183,3 +183,93 @@ func TestUseCompat(t *testing.T) { }) } } + +func TestCompareVersions(t *testing.T) { + testCases := []struct { + description string + a string + b string + expected int + }{ + { + description: "empty", + expected: 0, + }, + { + description: "less than", + a: "1.2.3", + b: "2.4.5", + expected: -1, + }, + { + description: "equal", + a: "1.1.1", + b: "1.1.1", + expected: 0, + }, + { + description: "equal with leading zeros in version string", + a: "1.1.1", + b: "1.01.1", + expected: 0, + }, + { + description: "greater than", + a: "2.4.5", + b: "2.4.4", + expected: 1, + }, + } + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + require.EqualValues(t, tc.expected, compareVersions(tc.a, tc.b)) + }) + } + +} + +func TestNormalizeVersion(t *testing.T) { + testCases := []struct { + description string + input string + expected string + }{ + { + description: "empty", + input: "", + expected: "v0.0.0", + }, + { + description: "major is 0", + input: "v0.1.2", + expected: "v0.1.2", + }, + { + description: "major only", + input: "1", + expected: "v1.0.0", + }, + { + description: "major and minor only", + input: "1.1", + expected: "v1.1.0", + }, + { + 
description: "zero-padded version", + input: "01.02.03", + expected: "v1.2.3", + }, + { + description: "valid semantic version", + input: "v1.2.3-4+567", + expected: "v1.2.3-4+567", + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + output := normalizeVersion(tc.input) + require.EqualValues(t, tc.expected, output) + }) + } +}