diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 5b9535dba..42aaf4782 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -1658,20 +1658,20 @@ type VGPUDevicesConfigSpec struct { // CDIConfigSpec defines how the Container Device Interface is used in the cluster. type CDIConfigSpec struct { - // Enabled indicates whether CDI can be used to make GPUs accessible to containers. + // Enabled indicates whether the Container Device Interface (CDI) should be used as the mechanism for making GPUs accessible to containers. // +kubebuilder:validation:Optional - // +kubebuilder:default=false + // +kubebuilder:default=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true - // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable CDI as a mechanism for making GPUs accessible to containers" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable CDI as the mechanism for making GPUs accessible to containers" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" Enabled *bool `json:"enabled,omitempty"` - // Default indicates whether to use CDI as the default mechanism for providing GPU access to containers. + // Deprecated: This field is no longer used. Setting cdi.enabled=true will configure CDI as the default mechanism for making GPUs accessible to containers. // +kubebuilder:validation:Optional // +kubebuilder:default=false // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true - // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Configure CDI as the default mechanism for making GPUs accessible to containers" - // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Deprecated: This field is no longer used" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch,urn:alm:descriptor:com.tectonic.ui:hidden" Default *bool `json:"default,omitempty"` } @@ -2070,20 +2070,11 @@ func (l *DriverLicensingConfigSpec) IsNLSEnabled() bool { // providing GPU access to containers func (c *CDIConfigSpec) IsEnabled() bool { if c.Enabled == nil { - return false + return true } return *c.Enabled } -// IsDefault returns true if CDI is enabled as the default -// mechanism for providing GPU access to containers -func (c *CDIConfigSpec) IsDefault() bool { - if c.Default == nil { - return false - } - return *c.Default -} - // IsEnabled returns true if Kata Manager is enabled func (k *KataManagerSpec) IsEnabled() bool { if k.Enabled == nil { diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index cd0603f6a..238756fca 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -34,6 +34,9 @@ metadata: "initContainer": { } }, + "cdi": { + "enabled": true + }, "sandboxWorkloads": { "enabled": false, "defaultWorkload": "container" @@ -531,6 +534,20 @@ spec: path: toolkit.imagePullPolicy x-descriptors: - 'urn:alm:descriptor:com.tectonic.ui:imagePullPolicy' + - displayName: CDI + description: Container Device Interface (CDI) Configuration + path: cdi + - displayName: Enabled + description: 'Enabled indicates whether CDI should be used as the mechanism for making GPUs accessible to containers.' + path: cdi.enabled + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:booleanSwitch' + - displayName: Default + description: 'Deprecated: This field is no longer used. Setting cdi.enabled=true will configure CDI as the default mechanism for making GPUs accessible to containers.' + path: cdi.default + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:hidden' + - 'urn:alm:descriptor:com.tectonic.ui:booleanSwitch' - displayName: NVIDIA DCGM config description: NVIDIA DCGM config path: dcgm diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index c032907c6..0ebfa1b58 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -136,13 +136,15 @@ spec: properties: default: default: false - description: Default indicates whether to use CDI as the default - mechanism for providing GPU access to containers. + description: 'Deprecated: This field is no longer used. Setting + cdi.enabled=true will configure CDI as the default mechanism + for making GPUs accessible to containers.' type: boolean enabled: - default: false - description: Enabled indicates whether CDI can be used to make - GPUs accessible to containers. + default: true + description: Enabled indicates whether the Container Device Interface + (CDI) should be used as the mechanism for making GPUs accessible + to containers. type: boolean type: object daemonsets: diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index c032907c6..0ebfa1b58 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -136,13 +136,15 @@ spec: properties: default: default: false - description: Default indicates whether to use CDI as the default - mechanism for providing GPU access to containers. + description: 'Deprecated: This field is no longer used. Setting + cdi.enabled=true will configure CDI as the default mechanism + for making GPUs accessible to containers.' type: boolean enabled: - default: false - description: Enabled indicates whether CDI can be used to make - GPUs accessible to containers. + default: true + description: Enabled indicates whether the Container Device Interface + (CDI) should be used as the mechanism for making GPUs accessible + to containers. type: boolean type: object daemonsets: diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0e10e3817..41e6abac4 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -147,8 +147,8 @@ const ( CDIEnabledEnvName = "CDI_ENABLED" // NvidiaCDIHookPathEnvName is the name of the envvar specifying the path to the 'nvidia-cdi-hook' binary NvidiaCDIHookPathEnvName = "NVIDIA_CDI_HOOK_PATH" - // CrioConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration - CrioConfigModeEnvName = "CRIO_CONFIG_MODE" + // CRIOConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration + CRIOConfigModeEnvName = "CRIO_CONFIG_MODE" // DeviceListStrategyEnvName is the name of the envvar for configuring the device-list-strategy in the device-plugin DeviceListStrategyEnvName = "DEVICE_LIST_STRATEGY" // CDIAnnotationPrefixEnvName is the name of the device-plugin envvar for configuring the CDI annotation prefix @@ -178,6 +178,8 @@ const ( // DriverInstallDirCtrPathEnvName is the name of the envvar used by the driver-validator to represent the path // of the driver install dir mounted in the container DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH" + // NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime + NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT" ) // ContainerProbe defines container probe types @@ -952,8 +954,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol return err } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // update env required for MIG support applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) @@ -1223,8 +1224,35 @@ func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar { return envVars } +func transformToolkitCtrForCDI(container *corev1.Container) { + // When CDI is enabled in GPU Operator, we leverage native CDI support in containerd / cri-o + // to inject GPUs into workloads. We do not configure 'nvidia' as the default runtime. The + // 'nvidia' runtime will be set as the runtime class for our management containers so that + // they get access to all GPUs. + // + // Note: one could override this and continue to configure 'nvidia' as the default runtime + // by directly setting the 'NVIDIA_RUNTIME_SET_AS_DEFAULT' environment variable to 'true' in + // the toolkit container. One can leverage the 'toolkit.env' field in ClusterPolicy to + // directly configure environment variables for the toolkit container. + setContainerEnv(container, CDIEnabledEnvName, "true") + setContainerEnv(container, NvidiaRuntimeSetAsDefaultEnvName, "false") + setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi") +} + // TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { + var mainContainer *corev1.Container + mainContainerName := "nvidia-container-toolkit-ctr" + for i, ctr := range obj.Spec.Template.Spec.Containers { + if ctr.Name == mainContainerName { + mainContainer = &obj.Spec.Template.Spec.Containers[i] + break + } + } + if mainContainer == nil { + return fmt.Errorf("failed to find main container %q", mainContainerName) + } + // update validation container err := transformValidationInitContainer(obj, config) if err != nil { @@ -1235,10 +1263,10 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n if err != nil { return err } - obj.Spec.Template.Spec.Containers[0].Image = image + mainContainer.Image = image // update image pull policy - obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Toolkit.ImagePullPolicy) + mainContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.Toolkit.ImagePullPolicy) // set image pull secrets if len(config.Toolkit.ImagePullSecrets) > 0 { @@ -1256,17 +1284,12 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n // update env required for CDI support if config.CDI.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CrioConfigModeEnvName, "config") - if config.CDI.IsDefault() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi") - } + transformToolkitCtrForCDI(mainContainer) } // set install directory for the toolkit if config.Toolkit.InstallDir != "" && config.Toolkit.InstallDir != DefaultToolkitInstallDir { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), ToolkitInstallDirEnvName, config.Toolkit.InstallDir) + setContainerEnv(mainContainer, ToolkitInstallDirEnvName, config.Toolkit.InstallDir) for i, volume := range obj.Spec.Template.Spec.Volumes { if volume.Name == "toolkit-install-dir" { @@ -1275,9 +1298,9 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n } } - for i, volumeMount := range obj.Spec.Template.Spec.Containers[0].VolumeMounts { + for i, volumeMount := range mainContainer.VolumeMounts { if volumeMount.Name == "toolkit-install-dir" { - obj.Spec.Template.Spec.Containers[0].VolumeMounts[i].MountPath = config.Toolkit.InstallDir + mainContainer.VolumeMounts[i].MountPath = config.Toolkit.InstallDir break } } @@ -1294,13 +1317,13 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n if len(config.Toolkit.Env) > 0 { for _, env := range config.Toolkit.Env { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value) + setContainerEnv(mainContainer, env.Name, env.Value) } } // configure runtime runtime := n.runtime.String() - err = transformForRuntime(obj, config, runtime, "nvidia-container-toolkit-ctr") + err = transformForRuntime(obj, config, runtime, mainContainerName) if err != nil { return fmt.Errorf("error transforming toolkit daemonset : %w", err) } @@ -1324,7 +1347,12 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, if runtime == gpuv1.Containerd.String() { // Set the runtime class name that is to be configured for containerd - setContainerEnv(mainContainer, "CONTAINERD_RUNTIME_CLASS", getRuntimeClass(config)) + setContainerEnv(mainContainer, "CONTAINERD_RUNTIME_CLASS", getRuntimeClassName(config)) + } + + if runtime == gpuv1.CRIO.String() { + // We add the nvidia runtime to the cri-o config by default instead of installing the OCI prestart hook + setContainerEnv(mainContainer, CRIOConfigModeEnvName, "config") } // setup mounts for runtime config file @@ -1381,8 +1409,30 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, return nil } +func transformDevicePluginCtrForCDI(container *corev1.Container, config *gpuv1.ClusterPolicySpec) { + setContainerEnv(container, CDIEnabledEnvName, "true") + setContainerEnv(container, DeviceListStrategyEnvName, "cdi-annotations,cdi-cri") + setContainerEnv(container, CDIAnnotationPrefixEnvName, "cdi.k8s.io/") + + if config.Toolkit.IsEnabled() { + setContainerEnv(container, NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook")) + } +} + // TransformDevicePlugin transforms k8s-device-plugin daemonset with required config as per ClusterPolicy func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { + var mainContainer *corev1.Container + mainContainerName := "nvidia-device-plugin" + for i, ctr := range obj.Spec.Template.Spec.Containers { + if ctr.Name == mainContainerName { + mainContainer = &obj.Spec.Template.Spec.Containers[i] + break + } + } + if mainContainer == nil { + return fmt.Errorf("failed to find main container %q", mainContainerName) + } + // update validation container err := transformValidationInitContainer(obj, config) if err != nil { @@ -1394,10 +1444,10 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe if err != nil { return err } - obj.Spec.Template.Spec.Containers[0].Image = image + mainContainer.Image = image // update image pull policy - obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy) + mainContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy) // set image pull secrets if len(config.DevicePlugin.ImagePullSecrets) > 0 { @@ -1414,13 +1464,13 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } // set arguments if specified for device-plugin container if len(config.DevicePlugin.Args) > 0 { - obj.Spec.Template.Spec.Containers[0].Args = config.DevicePlugin.Args + mainContainer.Args = config.DevicePlugin.Args } // add env to allow injection of /dev/nvidia-fs and /dev/infiniband devices for GDS if config.GPUDirectStorage != nil && config.GPUDirectStorage.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), GDSEnabledEnvName, "true") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), MOFEDEnabledEnvName, "true") + setContainerEnv(mainContainer, GDSEnabledEnvName, "true") + setContainerEnv(mainContainer, MOFEDEnabledEnvName, "true") } // apply plugin configuration through ConfigMap if one is provided @@ -1429,20 +1479,14 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe return err } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // update env required for MIG support - applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) + applyMIGConfiguration(mainContainer, config.MIG.Strategy) // update env required for CDI support if config.CDI.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "envvar,cdi-annotations") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, "nvidia.cdi.k8s.io/") - if config.Toolkit.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook")) - } + transformDevicePluginCtrForCDI(mainContainer, config) } // update MPS volumes and set MPS_ROOT env var if a custom MPS root is configured @@ -1456,12 +1500,12 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe obj.Spec.Template.Spec.Volumes[i].HostPath.Path = filepath.Join(config.DevicePlugin.MPS.Root, "shm") } } - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), MPSRootEnvName, config.DevicePlugin.MPS.Root) + setContainerEnv(mainContainer, MPSRootEnvName, config.DevicePlugin.MPS.Root) } if len(config.DevicePlugin.Env) > 0 { for _, env := range config.DevicePlugin.Env { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value) + setContainerEnv(mainContainer, env.Name, env.Value) } } @@ -1524,8 +1568,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic return err } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // update env required for MIG support applyMIGConfiguration(mainContainer, config.MIG.Strategy) @@ -1633,8 +1676,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // mount configmap for custom metrics if provided by user if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" { @@ -1756,8 +1798,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu } } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) return nil } @@ -1799,8 +1840,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, obj.Spec.Template.Spec.Containers[0].Args = config.MIGManager.Args } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // set ConfigMap name for "mig-parted-config" Volume for i, vol := range obj.Spec.Template.Spec.Volumes { @@ -2093,8 +2133,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, return fmt.Errorf("%v", err) } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) var validatorErr error // apply changes for individual component validators(initContainers) @@ -2427,20 +2466,16 @@ func setContainerEnv(c *corev1.Container, key, value string) { c.Env = append(c.Env, corev1.EnvVar{Name: key, Value: value}) } -func getRuntimeClass(config *gpuv1.ClusterPolicySpec) string { +func getRuntimeClassName(config *gpuv1.ClusterPolicySpec) string { if config.Operator.RuntimeClass != "" { return config.Operator.RuntimeClass } return DefaultRuntimeClass } -func setRuntimeClass(podSpec *corev1.PodSpec, runtime gpuv1.Runtime, runtimeClass string) { - if runtime == gpuv1.Containerd { - if runtimeClass == "" { - runtimeClass = DefaultRuntimeClass - } - podSpec.RuntimeClassName = &runtimeClass - } +func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec) { + runtimeClassName := getRuntimeClassName(config) + podSpec.RuntimeClassName = &runtimeClassName } func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) { @@ -4737,7 +4772,7 @@ func transformRuntimeClassLegacy(n ClusterPolicyController, spec nodev1.RuntimeC // apply runtime class name as per ClusterPolicy if obj.Name == "FILLED_BY_OPERATOR" { - runtimeClassName := getRuntimeClass(&n.singleton.Spec) + runtimeClassName := getRuntimeClassName(&n.singleton.Spec) obj.Name = runtimeClassName obj.Handler = runtimeClassName } @@ -4784,7 +4819,7 @@ func transformRuntimeClass(n ClusterPolicyController, spec nodev1.RuntimeClass) // apply runtime class name as per ClusterPolicy if obj.Name == "FILLED_BY_OPERATOR" { - runtimeClassName := getRuntimeClass(&n.singleton.Spec) + runtimeClassName := getRuntimeClassName(&n.singleton.Spec) obj.Name = runtimeClassName obj.Handler = runtimeClassName } diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 4b75e62a9..0afebe039 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -353,6 +353,7 @@ func TestTransformForRuntime(t *testing.T) { Name: "test-ctr", Env: []corev1.EnvVar{ {Name: "RUNTIME", Value: gpuv1.CRIO.String()}, + {Name: CRIOConfigModeEnvName, Value: "config"}, {Name: "RUNTIME_CONFIG", Value: filepath.Join(DefaultRuntimeConfigTargetDir, filepath.Base(DefaultCRIOConfigFile))}, {Name: "CRIO_CONFIG", Value: filepath.Join(DefaultRuntimeConfigTargetDir, filepath.Base(DefaultCRIOConfigFile))}, }, @@ -642,6 +643,9 @@ func TestTransformToolkit(t *testing.T) { }, }, Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + {Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"}, {Name: "foo", Value: "bar"}, {Name: "RUNTIME", Value: "containerd"}, {Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"}, @@ -712,6 +716,9 @@ func TestTransformToolkit(t *testing.T) { }, }, Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + {Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"}, {Name: "CONTAINERD_CONFIG", Value: "/runtime/config-dir/config.toml"}, {Name: "CONTAINERD_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, {Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"}, @@ -755,7 +762,7 @@ func TestTransformDevicePlugin(t *testing.T) { { description: "transform device plugin", ds: NewDaemonset(). - WithContainer(corev1.Container{Name: "nvidia-device-plugin-ctr"}). + WithContainer(corev1.Container{Name: "nvidia-device-plugin"}). WithContainer(corev1.Container{Name: "dummy"}), cpSpec: &gpuv1.ClusterPolicySpec{ DevicePlugin: gpuv1.DevicePluginSpec{ @@ -769,14 +776,22 @@ func TestTransformDevicePlugin(t *testing.T) { {Name: "foo", Value: "bar"}, }, }, + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, }, expectedDs: NewDaemonset().WithContainer(corev1.Container{ - Name: "nvidia-device-plugin-ctr", + Name: "nvidia-device-plugin", Image: "nvcr.io/nvidia/cloud-native/nvidia-device-plugin:v1.0.0", ImagePullPolicy: corev1.PullIfNotPresent, Args: []string{"--fail-on-init-error=false"}, Env: []corev1.EnvVar{ {Name: "NVIDIA_MIG_MONITOR_DEVICES", Value: "all"}, + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: DeviceListStrategyEnvName, Value: "cdi-annotations,cdi-cri"}, + {Name: CDIAnnotationPrefixEnvName, Value: "cdi.k8s.io/"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, {Name: "foo", Value: "bar"}, }, }).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"), @@ -866,6 +881,10 @@ func TestTransformMigManager(t *testing.T) { {Name: "foo", Value: "bar"}, }, }, + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, }, expectedDs: NewDaemonset().WithContainer(corev1.Container{ Name: "mig-manager", @@ -873,6 +892,8 @@ func TestTransformMigManager(t *testing.T) { ImagePullPolicy: corev1.PullIfNotPresent, Args: []string{"--test-flag"}, Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, {Name: "foo", Value: "bar"}, }, }).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"), @@ -1821,3 +1842,95 @@ func TestTransformDriver(t *testing.T) { }) } } + +func TestTransformToolkitCtrForCDI(t *testing.T) { + testCases := []struct { + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + expectedDs Daemonset + }{ + { + description: "cdi enabled", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + CDI: gpuv1.CDIConfigSpec{ + Enabled: newBoolPtr(true), + }, + }, + expectedDs: NewDaemonset().WithContainer( + corev1.Container{ + Name: "main-ctr", + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + {Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"}, + }, + }), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + mainContainer := &tc.ds.Spec.Template.Spec.Containers[0] + transformToolkitCtrForCDI(mainContainer) + require.EqualValues(t, tc.expectedDs, tc.ds) + }) + } +} + +func TestTransformDevicePluginCtrForCDI(t *testing.T) { + testCases := []struct { + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + expectedDs Daemonset + }{ + { + description: "toolkit disabled", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(false), + }, + }, + expectedDs: NewDaemonset().WithContainer( + corev1.Container{ + Name: "main-ctr", + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: DeviceListStrategyEnvName, Value: "cdi-annotations,cdi-cri"}, + {Name: CDIAnnotationPrefixEnvName, Value: "cdi.k8s.io/"}, + }, + }), + }, + { + description: "toolkit enabled", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, + }, + expectedDs: NewDaemonset().WithContainer( + corev1.Container{ + Name: "main-ctr", + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: DeviceListStrategyEnvName, Value: "cdi-annotations,cdi-cri"}, + {Name: CDIAnnotationPrefixEnvName, Value: "cdi.k8s.io/"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, + }, + }), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + mainContainer := &tc.ds.Spec.Template.Spec.Containers[0] + transformDevicePluginCtrForCDI(mainContainer, tc.cpSpec) + require.EqualValues(t, tc.expectedDs, tc.ds) + }) + } +} diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index c032907c6..0ebfa1b58 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -136,13 +136,15 @@ spec: properties: default: default: false - description: Default indicates whether to use CDI as the default - mechanism for providing GPU access to containers. + description: 'Deprecated: This field is no longer used. Setting + cdi.enabled=true will configure CDI as the default mechanism + for making GPUs accessible to containers.' type: boolean enabled: - default: false - description: Enabled indicates whether CDI can be used to make - GPUs accessible to containers. + default: true + description: Enabled indicates whether the Container Device Interface + (CDI) should be used as the mechanism for making GPUs accessible + to containers. type: boolean type: object daemonsets: diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 4f457dc42..f9c01b1fd 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -13,8 +13,7 @@ psa: enabled: false cdi: - enabled: false - default: false + enabled: true sandboxWorkloads: enabled: false diff --git a/docker/Dockerfile b/docker/Dockerfile index 47805845e..476f771f9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -121,6 +121,13 @@ COPY --from=sample-builder /build/vectorAdd /usr/bin/vectorAdd # Once new sample images are published that contain the compat libs, we can update the below. COPY --from=builder /usr/local/cuda/compat /usr/local/cuda/compat +# The distroless image does not contain the ld.so.cache file. The update-ldcache +# createContainer hook will skip updating the container's ldcache if this file +# does not exist. This can lead to issues running the CUDA vectorAdd sample +# if the NVIDIA libraries are not present in the default dynamic linker search +# path(s). +COPY --from=sample-builder /etc/ld.so.cache /etc/ + COPY assets /opt/gpu-operator/ COPY manifests /opt/gpu-operator/manifests COPY validator/manifests /opt/validator/manifests diff --git a/tests/holodeck.yaml b/tests/holodeck.yaml index e30131457..ce4d96568 100644 --- a/tests/holodeck.yaml +++ b/tests/holodeck.yaml @@ -27,6 +27,7 @@ spec: containerRuntime: install: true name: containerd + version: 1.7.27 kubernetes: install: true installer: kubeadm