Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ type CommandLineFlags struct {
MpsRoot *string `json:"mpsRoot,omitempty" yaml:"mpsRoot,omitempty"`
NvidiaDriverRoot *string `json:"nvidiaDriverRoot,omitempty" yaml:"nvidiaDriverRoot,omitempty"`
NvidiaDevRoot *string `json:"nvidiaDevRoot,omitempty" yaml:"nvidiaDevRoot,omitempty"`
GDRCopyEnabled *bool `json:"gdrcopyEnabled" yaml:"gdrcopyEnabled"`
GDSEnabled *bool `json:"gdsEnabled" yaml:"gdsEnabled"`
MOFEDEnabled *bool `json:"mofedEnabled" yaml:"mofedEnabled"`
UseNodeFeatureAPI *bool `json:"useNodeFeatureAPI" yaml:"useNodeFeatureAPI"`
Expand Down Expand Up @@ -126,6 +127,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.NvidiaDriverRoot, c, n)
case "dev-root", "nvidia-dev-root":
updateFromCLIFlag(&f.NvidiaDevRoot, c, n)
case "gdrcopy-enabled":
updateFromCLIFlag(&f.GDRCopyEnabled, c, n)
case "gds-enabled":
updateFromCLIFlag(&f.GDSEnabled, c, n)
case "mofed-enabled":
Expand Down
3 changes: 3 additions & 0 deletions api/config/v1/flags_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ func TestMarshalFlags(t *testing.T) {
output: `{
"migStrategy": null,
"failOnInitError": null,
"gdrcopyEnabled": null,
"gdsEnabled": null,
"mofedEnabled": null,
"useNodeFeatureAPI": null,
Expand All @@ -177,6 +178,7 @@ func TestMarshalFlags(t *testing.T) {
output: `{
"migStrategy": null,
"failOnInitError": null,
"gdrcopyEnabled": null,
"gdsEnabled": null,
"mofedEnabled": null,
"useNodeFeatureAPI": null,
Expand All @@ -201,6 +203,7 @@ func TestMarshalFlags(t *testing.T) {
output: `{
"migStrategy": null,
"failOnInitError": null,
"gdrcopyEnabled": null,
"gdsEnabled": null,
"mofedEnabled": null,
"useNodeFeatureAPI": null,
Expand Down
9 changes: 7 additions & 2 deletions cmd/nvidia-device-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,19 @@ func main() {
Usage: "the desired strategy for passing device IDs to the underlying runtime:\n\t\t[uuid | index]",
EnvVars: []string{"DEVICE_ID_STRATEGY"},
},
&cli.BoolFlag{
Name: "gdrcopy-enabled",
Usage: "ensure that containers that request NVIDIA GPU resources are started with GDRCopy support",
EnvVars: []string{"GDRCOPY_ENABLED"},
},
&cli.BoolFlag{
Name: "gds-enabled",
Usage: "ensure that containers are started with NVIDIA_GDS=enabled",
Usage: "ensure that containers that request NVIDIA GPU resources are started with GPUDirect Storage support",
EnvVars: []string{"GDS_ENABLED"},
},
&cli.BoolFlag{
Name: "mofed-enabled",
Usage: "ensure that containers are started with NVIDIA_MOFED=enabled",
Usage: "ensure that containers that request NVIDIA GPU resources are started with MOFED support",
EnvVars: []string{"MOFED_ENABLED"},
},
&cli.StringFlag{
Expand Down
1 change: 1 addition & 0 deletions cmd/nvidia-device-plugin/plugin-manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ func GetPlugins(ctx context.Context, infolib info.Interface, nvmllib nvml.Interf
cdi.WithNvidiaCTKPath(*config.Flags.Plugin.NvidiaCTKPath),
cdi.WithDeviceIDStrategy(*config.Flags.Plugin.DeviceIDStrategy),
cdi.WithVendor("k8s.device-plugin.nvidia.com"),
cdi.WithGdrcopyEnabled(*config.Flags.GDRCopyEnabled),
cdi.WithGdsEnabled(*config.Flags.GDSEnabled),
cdi.WithMofedEnabled(*config.Flags.MOFEDEnabled),
cdi.WithImexChannels(imexChannels),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,10 @@ spec:
- name: NVIDIA_CDI_HOOK_PATH
value: {{ .Values.cdi.nvidiaHookPath }}
{{- end }}
{{- if typeIs "bool" .Values.gdrcopyEnabled }}
- name: GDRCOPY_ENABLED
value: {{ .Values.gdrcopyEnabled | quote }}
{{- end }}
{{- if typeIs "bool" .Values.gdsEnabled }}
- name: GDS_ENABLED
value: {{ .Values.gdsEnabled | quote }}
Expand Down
1 change: 1 addition & 0 deletions deployments/helm/nvidia-device-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ failOnInitError: null
deviceListStrategy: null
deviceIDStrategy: null
nvidiaDriverRoot: null
gdrcopyEnabled: null
gdsEnabled: null
mofedEnabled: null
deviceDiscoveryStrategy: null
Expand Down
12 changes: 6 additions & 6 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ module github.com/NVIDIA/k8s-device-plugin

go 1.24.0

toolchain go1.24.1
toolchain go1.24.5

require (
github.com/NVIDIA/go-gpuallocator v0.6.0
github.com/NVIDIA/go-nvlib v0.8.0
github.com/NVIDIA/go-nvlib v0.8.1
github.com/NVIDIA/go-nvml v0.13.0-1
github.com/NVIDIA/nvidia-container-toolkit v1.17.8
github.com/NVIDIA/nvidia-container-toolkit v1.18.0-rc.5
github.com/fsnotify/fsnotify v1.9.0
github.com/google/renameio v1.0.1
github.com/google/uuid v1.6.0
Expand All @@ -28,8 +28,8 @@ require (
sigs.k8s.io/node-feature-discovery v0.17.3
sigs.k8s.io/node-feature-discovery/api/nfd v0.17.3
sigs.k8s.io/yaml v1.4.0
tags.cncf.io/container-device-interface v0.8.1
tags.cncf.io/container-device-interface/specs-go v0.8.0
tags.cncf.io/container-device-interface v1.0.1
tags.cncf.io/container-device-interface/specs-go v1.0.0
)

require (
Expand Down Expand Up @@ -65,7 +65,7 @@ require (
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
golang.org/x/net v0.38.0 // indirect
golang.org/x/oauth2 v0.27.0 // indirect
golang.org/x/sys v0.33.0 // indirect
golang.org/x/sys v0.36.0 // indirect
golang.org/x/term v0.30.0 // indirect
golang.org/x/text v0.23.0 // indirect
golang.org/x/time v0.8.0 // indirect
Expand Down
20 changes: 10 additions & 10 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
github.com/NVIDIA/go-gpuallocator v0.6.0 h1:2PA2swx59gJYREPkZNTGtyCP6Pnz3WEgnYsXlRkyvkk=
github.com/NVIDIA/go-gpuallocator v0.6.0/go.mod h1:c+Yspg+/QxWOmoSQeuI48Z/7nS+mMPtxyj1NYUTwewY=
github.com/NVIDIA/go-nvlib v0.8.0 h1:vorMvnsJYvZaxiluSXFd+fIFeQFPWSiSjNPiJyvDs0c=
github.com/NVIDIA/go-nvlib v0.8.0/go.mod h1:bV+OEgjJCbFXf5T8c082mVPFuiF+gKwf9CMT7DWGUBI=
github.com/NVIDIA/go-nvlib v0.8.1 h1:OPEHVvn3zcV5OXB68A7WRpeCnYMRSPl7LdeJH/d3gZI=
github.com/NVIDIA/go-nvlib v0.8.1/go.mod h1:7mzx9FSdO9fXWP9NKuZmWkCwhkEcSWQFe2tmFwtLb9c=
github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw=
github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
github.com/NVIDIA/nvidia-container-toolkit v1.17.8 h1:ndE23TKvQBicsZT88mzZudygn6JNOe6+UsIgqk6gGvw=
github.com/NVIDIA/nvidia-container-toolkit v1.17.8/go.mod h1:khOgMW80+g8eX/1zPlO4demLShHht9I0YEm8ngcPgwk=
github.com/NVIDIA/nvidia-container-toolkit v1.18.0-rc.5 h1:ft4S4nyT2jI1tV6CAFRMMZdrtd8HAfmuE9X9ieoDz+Y=
github.com/NVIDIA/nvidia-container-toolkit v1.18.0-rc.5/go.mod h1:t/awbHrDkz8ec0vecKo82Cn/11YkuD2ngE5RT9wuAgU=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
Expand Down Expand Up @@ -169,8 +169,8 @@ golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k=
golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
Expand Down Expand Up @@ -232,7 +232,7 @@ sigs.k8s.io/structured-merge-diff/v4 v4.4.2 h1:MdmvkGuXi/8io6ixD5wud3vOLwc1rj0aN
sigs.k8s.io/structured-merge-diff/v4 v4.4.2/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4=
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=
tags.cncf.io/container-device-interface v0.8.1 h1:c0jN4Mt6781jD67NdPajmZlD1qrqQyov/Xfoab37lj0=
tags.cncf.io/container-device-interface v0.8.1/go.mod h1:Apb7N4VdILW0EVdEMRYXIDVRZfNJZ+kmEUss2kRRQ6Y=
tags.cncf.io/container-device-interface/specs-go v0.8.0 h1:QYGFzGxvYK/ZLMrjhvY0RjpUavIn4KcmRmVP/JjdBTA=
tags.cncf.io/container-device-interface/specs-go v0.8.0/go.mod h1:BhJIkjjPh4qpys+qm4DAYtUyryaTDg9zris+AczXyws=
tags.cncf.io/container-device-interface v1.0.1 h1:KqQDr4vIlxwfYh0Ed/uJGVgX+CHAkahrgabg6Q8GYxc=
tags.cncf.io/container-device-interface v1.0.1/go.mod h1:JojJIOeW3hNbcnOH2q0NrWNha/JuHoDZcmYxAZwb2i0=
tags.cncf.io/container-device-interface/specs-go v1.0.0 h1:8gLw29hH1ZQP9K1YtAzpvkHCjjyIxHZYzBAvlQ+0vD8=
tags.cncf.io/container-device-interface/specs-go v1.0.0/go.mod h1:u86hoFWqnh3hWz3esofRFKbI261bUlvUfLKGrDhJkgQ=
9 changes: 2 additions & 7 deletions internal/cdi/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,11 @@

package cdi

import "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"

// Interface provides the API to the 'cdi' package
//
//go:generate moq -stub -out api_mock.go . Interface
//go:generate moq -rm -fmt=goimports -stub -out api_mock.go . Interface
type Interface interface {
CreateSpecFile() error
QualifiedName(string, string) string
}

type cdiSpecGenerator interface {
GetSpec() (spec.Interface, error)
AdditionalDevices() []string
}
44 changes: 42 additions & 2 deletions internal/cdi/api_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 40 additions & 8 deletions internal/cdi/cdi.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,14 @@ type cdiHandler struct {

deviceListStrategies spec.DeviceListStrategies

gdsEnabled bool
mofedEnabled bool
gdsEnabled bool
mofedEnabled bool
gdrcopyEnabled bool

imexChannels imex.Channels

cdilibs map[string]cdiSpecGenerator
cdilibs map[string]nvcdi.SpecGenerator
additionalModes []string
}

var _ Interface = &cdiHandler{}
Expand Down Expand Up @@ -111,7 +113,7 @@ func New(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interf
return nil, err
}

c.cdilibs = make(map[string]cdiSpecGenerator)
c.cdilibs = make(map[string]nvcdi.SpecGenerator)

c.cdilibs["gpu"], err = nvcdi.New(
nvcdi.WithInfoLib(c.infolib),
Expand All @@ -133,15 +135,17 @@ func New(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interf
c.cdilibs["imex-channel"] = c.newImexChannelSpecGenerator()
}

var additionalModes []string
if c.gdrcopyEnabled {
c.additionalModes = append(c.additionalModes, "gdrcopy")
}
if c.gdsEnabled {
additionalModes = append(additionalModes, "gds")
c.additionalModes = append(c.additionalModes, "gds")
}
if c.mofedEnabled {
additionalModes = append(additionalModes, "mofed")
c.additionalModes = append(c.additionalModes, "mofed")
}

for _, mode := range additionalModes {
for _, mode := range c.additionalModes {
lib, err := nvcdi.New(
nvcdi.WithInfoLib(c.infolib),
nvcdi.WithLogger(c.logger),
Expand All @@ -162,6 +166,7 @@ func New(infolib info.Interface, nvmllib nvml.Interface, devicelib device.Interf

// CreateSpecFile creates a CDI spec file for the specified devices.
func (cdi *cdiHandler) CreateSpecFile() error {
var emptySpecs []string
for class, cdilib := range cdi.cdilibs {
cdi.logger.Infof("Generating CDI spec for resource: %s/%s", cdi.vendor, class)

Expand Down Expand Up @@ -193,10 +198,22 @@ func (cdi *cdiHandler) CreateSpecFile() error {

err = spec.Save(filepath.Join(cdiRoot, specName+".json"))
if err != nil {
// TODO: This is a brittle check since it relies on exact string matches.
// We should pull this functionality into the CDI tooling instead.
if strings.Contains(err.Error(), "invalid device, empty device edits") {
klog.ErrorS(err, "Ignoring empty CDI specs", "vendor", cdi.vendor, "class", class)
emptySpecs = append(emptySpecs, class)
continue
}
return fmt.Errorf("failed to save CDI spec: %v", err)
}
}

// Remove the classes with empty specs from the supported types.
for _, emptySpec := range emptySpecs {
delete(cdi.cdilibs, emptySpec)
}

return nil
}

Expand Down Expand Up @@ -229,3 +246,18 @@ func (cdi *cdiHandler) getRootTransformer() transform.Transformer {
func (cdi *cdiHandler) QualifiedName(class string, id string) string {
return cdiparser.QualifiedName(cdi.vendor, class, id)
}

// AdditionalDevices lists the fully-qualified CDI device names for the
// optional modes requested through the device plugin configuration.
// A requested mode is only reported when an entry for it is still present in
// cdilibs; modes without an associated CDI library are left out.
func (cdi *cdiHandler) AdditionalDevices() []string {
	var qualified []string
	for _, requested := range cdi.additionalModes {
		// A missing map entry yields a nil interface value, so this single
		// check covers both "never registered" and "removed" modes.
		if lib := cdi.cdilibs[requested]; lib != nil {
			qualified = append(qualified, cdi.QualifiedName(requested, "all"))
		}
	}
	return qualified
}
5 changes: 3 additions & 2 deletions internal/cdi/imex.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package cdi
import (
"tags.cncf.io/container-device-interface/specs-go"

"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"

"github.com/NVIDIA/k8s-device-plugin/internal/imex"
Expand All @@ -29,7 +30,7 @@ type imexChannelCDILib struct {
imexChannels imex.Channels
}

func (cdi *cdiHandler) newImexChannelSpecGenerator() cdiSpecGenerator {
func (cdi *cdiHandler) newImexChannelSpecGenerator() nvcdi.SpecGenerator {
lib := &imexChannelCDILib{
vendor: cdi.vendor,
imexChannels: cdi.imexChannels,
Expand All @@ -39,7 +40,7 @@ func (cdi *cdiHandler) newImexChannelSpecGenerator() cdiSpecGenerator {
}

// GetSpec returns the CDI specs for IMEX channels.
func (l *imexChannelCDILib) GetSpec() (spec.Interface, error) {
func (l *imexChannelCDILib) GetSpec(...string) (spec.Interface, error) {
var deviceSpecs []specs.Device
for _, channel := range l.imexChannels {
deviceSpec := specs.Device{
Expand Down
4 changes: 4 additions & 0 deletions internal/cdi/null.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ func NewNullHandler() Interface {
return &null{}
}

// AdditionalDevices always returns nil for the null handler.
func (n *null) AdditionalDevices() []string {
return nil
}

// CreateSpecFile is a no-op for the null handler.
func (n *null) CreateSpecFile() error {
return nil
Expand Down
Loading