173 changes: 167 additions & 6 deletions cmd/nerdctl/container/container_run_linux_test.go
@@ -674,7 +674,16 @@ func TestRunDeviceCDI(t *testing.T) {
	// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
	testutil.DockerIncompatible(t)
	cdiSpecDir := filepath.Join(t.TempDir(), "cdi")
	writeTestCDISpec(t, cdiSpecDir)
	const testCDIVendor1 = `
cdiVersion: "0.3.0"
kind: "vendor1.com/device"
devices:
- name: foo
  containerEdits:
    env:
    - FOO=injected
`
	writeTestCDISpec(t, testCDIVendor1, "vendor1.yaml", cdiSpecDir)

	base := testutil.NewBase(t)
	base.Cmd("--cdi-spec-dirs", cdiSpecDir, "run",
@@ -689,7 +698,16 @@ func TestRunDeviceCDIWithNerdctlConfig(t *testing.T) {
	// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
	testutil.DockerIncompatible(t)
	cdiSpecDir := filepath.Join(t.TempDir(), "cdi")
	writeTestCDISpec(t, cdiSpecDir)
	const testCDIVendor1 = `
cdiVersion: "0.3.0"
kind: "vendor1.com/device"
devices:
- name: foo
  containerEdits:
    env:
    - FOO=injected
`
	writeTestCDISpec(t, testCDIVendor1, "vendor1.yaml", cdiSpecDir)

	tomlPath := filepath.Join(t.TempDir(), "nerdctl.toml")
	err := os.WriteFile(tomlPath, []byte(fmt.Sprintf(`
@@ -706,8 +724,128 @@ cdi_spec_dirs = ["%s"]
	).AssertOutContains("FOO=injected")
}

func writeTestCDISpec(t *testing.T, cdiSpecDir string) {
	const testCDIVendor1 = `
// TestRunGPU tests GPU injection using the --gpus flag.
func TestRunGPU(t *testing.T) {
	t.Parallel()
	// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
	testutil.DockerIncompatible(t)
	const nvidiaSpec = `
cdiVersion: "0.5.0"
kind: "nvidia.com/gpu"
devices:
- name: "0"
  containerEdits:
    env:
    - NVIDIA_GPU_0=injected
- name: "1"
  containerEdits:
    env:
    - NVIDIA_GPU_1=injected
`
	const amdSpec = `
cdiVersion: "0.5.0"
kind: "amd.com/gpu"
devices:
- name: "0"
  containerEdits:
    env:
    - AMD_GPU_0=injected
- name: "1"
  containerEdits:
    env:
    - AMD_GPU_1=injected
`
	const unknownSpec = `
cdiVersion: "0.5.0"
kind: "unknown.com/gpu"
devices:
- name: "0"
  containerEdits:
    env:
    - UNKNOWN_GPU_0=injected
`

	testCases := []struct {
		name         string
		specs        map[string]string
		gpuFlags     []string
		expectedEnvs []string
		expectFail   bool
	}{
		{
			name:         "nvidia device injection",
			specs:        map[string]string{"nvidia.yaml": nvidiaSpec},
			gpuFlags:     []string{"--gpus", "2"},
			expectedEnvs: []string{"NVIDIA_GPU_0=injected", "NVIDIA_GPU_1=injected"},
		},
		{
			name:         "amd device injection",
			specs:        map[string]string{"amd.yaml": amdSpec},
			gpuFlags:     []string{"--gpus", "2"},
			expectedEnvs: []string{"AMD_GPU_0=injected", "AMD_GPU_1=injected"},
		},
		{
			name:         "multiple vendors",
			specs:        map[string]string{"nvidia.yaml": nvidiaSpec, "amd.yaml": amdSpec},
			gpuFlags:     []string{"--gpus", "1"},
			expectedEnvs: []string{"NVIDIA_GPU_0=injected"},
		},
		{
			name:       "unknown vendor fails",
			specs:      map[string]string{"unknown.yaml": unknownSpec},
			gpuFlags:   []string{"--gpus", "1"},
			expectFail: true,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			tmpDir := t.TempDir()
			for fileName, spec := range tc.specs {
				writeTestCDISpec(t, spec, fileName, tmpDir)
			}

			base := testutil.NewBase(t)
			args := []string{"--cdi-spec-dirs", tmpDir, "run", "--rm"}
			args = append(args, tc.gpuFlags...)
			args = append(args, testutil.AlpineImage, "env")

			if tc.expectFail {
				base.Cmd(args...).AssertFail()
			} else {
				base.Cmd(args...).AssertOutWithFunc(func(stdout string) error {
					for _, expectedEnv := range tc.expectedEnvs {
						if !strings.Contains(stdout, expectedEnv) {
							return fmt.Errorf("%s not found", expectedEnv)
						}
					}
					return nil
				})
			}
		})
	}
}

// TestRunGPUWithOtherCDIDevices tests GPU CDI injection along with other CDI devices.
func TestRunGPUWithOtherCDIDevices(t *testing.T) {
	t.Parallel()
	// Although CDI injection is supported by Docker, specifying the --cdi-spec-dirs on the command line is not.
	testutil.DockerIncompatible(t)
	const amdSpec = `
cdiVersion: "0.5.0"
kind: "amd.com/gpu"
devices:
- name: "0"
  containerEdits:
    env:
    - AMD_GPU_0=injected
- name: "1"
  containerEdits:
    env:
    - AMD_GPU_1=injected
`
	const vendor1Spec = `
cdiVersion: "0.3.0"
kind: "vendor1.com/device"
devices:
@@ -716,10 +854,33 @@ devices:
    env:
    - FOO=injected
`
	tmpDir := t.TempDir()
	writeTestCDISpec(t, amdSpec, "amd.yaml", tmpDir)
	writeTestCDISpec(t, vendor1Spec, "vendor1.yaml", tmpDir)

	base := testutil.NewBase(t)
	base.Cmd("--cdi-spec-dirs", tmpDir, "run", "--rm",
		"--gpus", "2",
		"--device", "vendor1.com/device=foo",
		testutil.AlpineImage, "env",
	).AssertOutWithFunc(func(stdout string) error {
		if !strings.Contains(stdout, "AMD_GPU_0=injected") {
			return errors.New("AMD_GPU_0=injected not found")
		}
		if !strings.Contains(stdout, "AMD_GPU_1=injected") {
			return errors.New("AMD_GPU_1=injected not found")
		}
		if !strings.Contains(stdout, "FOO=injected") {
			return errors.New("FOO=injected not found")
		}
		return nil
	})
}

func writeTestCDISpec(t *testing.T, spec string, fileName string, cdiSpecDir string) {
	err := os.MkdirAll(cdiSpecDir, 0700)
	assert.NilError(t, err)
	cdiSpecPath := filepath.Join(cdiSpecDir, "vendor1.yaml")
	err = os.WriteFile(cdiSpecPath, []byte(testCDIVendor1), 0400)
	cdiSpecPath := filepath.Join(cdiSpecDir, fileName)
	err = os.WriteFile(cdiSpecPath, []byte(spec), 0400)
	assert.NilError(t, err)
}
34 changes: 22 additions & 12 deletions docs/gpu.md
@@ -7,32 +7,40 @@
> The description in this section applies to nerdctl v2.3 or later.
> Users of prior releases of nerdctl should refer to <https://github.com/containerd/nerdctl/blob/v2.2.0/docs/gpu.md>

nerdctl provides docker-compatible NVIDIA GPU support.
nerdctl provides docker-compatible NVIDIA and AMD GPU support.

## Prerequisites

- NVIDIA Drivers
  - Same requirement as when you use GPUs on Docker. For details, please refer to [the doc by NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#pre-requisites).
- The NVIDIA Container Toolkit
  - containerd relies on the NVIDIA Container Toolkit to make GPUs usable inside a container. You can install the NVIDIA Container Toolkit by following the [official installation instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
- GPU Drivers
  - Same requirement as when you use GPUs on Docker. For details, please refer to these docs by [NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#pre-requisites) and [AMD](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/quick-start-guide.html#step-2-install-the-amdgpu-driver).
- Container Toolkit
  - containerd relies on the vendor Container Toolkits to make GPUs available to containers. You can install those by following the official installation instructions from [NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) and [AMD](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/quick-start-guide.html).
- CDI Specification
  - A Container Device Interface (CDI) specification for the GPU devices is required for GPU support to work. Follow the official documentation from [NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) and [AMD](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/cdi-guide.html) to ensure that the required CDI specifications are present on the system; typical generation commands are sketched below.

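As a rough sketch, the specs can typically be generated with the vendor CLIs. The exact subcommands and flags here (such as `--output` and `--path`) are assumptions that vary by toolkit version, so consult the linked documentation:

```
# NVIDIA Container Toolkit
sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml

# AMD Container Toolkit
sudo amd-ctk cdi generate --path=/etc/cdi/amd.yaml
```
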
## Options for `nerdctl run --gpus`

`nerdctl run --gpus` is compatible with [`docker run --gpus`](https://docs.docker.com/engine/reference/commandline/run/#access-an-nvidia-gpu).

You can specify the number of GPUs to use via the `--gpus` option.
The following example exposes all available GPUs.
The following examples expose all available GPUs to the container.

```
nerdctl run -it --rm --gpus all nvidia/cuda:12.3.1-base-ubuntu20.04 nvidia-smi
```

or

```
nerdctl run -it --rm --gpus=all rocm/rocm-terminal rocm-smi
```

You can also pass a detailed configuration to the `--gpus` option as a list of key-value pairs. The following options are provided.

- `count`: number of GPUs to use. `all` exposes all available GPUs.
- `device`: IDs of GPUs to use. UUID or numbers of GPUs can be specified.
- `device`: IDs of GPUs to use. GPU UUIDs or indexes can be specified. This only works for NVIDIA GPUs.

The following example exposes a specific GPU to the container.
The following example exposes a specific NVIDIA GPU to the container.

```
nerdctl run -it --rm --gpus 'device=GPU-3a23c669-1f69-c64e-cf85-44e9b07e7a2a' nvidia/cuda:12.3.1-base-ubuntu20.04 nvidia-smi
Expand Down Expand Up @@ -72,17 +80,17 @@ services:

### `nerdctl run --gpus` fails due to an unresolvable CDI device

If the required CDI specifications for NVIDIA devices are not available on the
If the required CDI specifications for your GPU devices are not available on the
system, the `nerdctl run` command will fail with an error similar to: `CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all` (the
exact error message will depend on the device(s) requested).
exact error message will depend on the vendor and the device(s) requested).

This should be the same error message that is reported when the `--device` flag
is used to request a CDI device:
```
nerdctl run --device=nvidia.com/gpu=all
```

Ensure that the NVIDIA Container Toolkit (>= v1.18.0 is recommended) is installed and the requested CDI devices are present in the output of `nvidia-ctk cdi list`:
Ensure that the NVIDIA (or AMD) Container Toolkit is installed and the requested CDI devices are present in the output of `nvidia-ctk cdi list` (or `amd-ctk cdi list` for AMD GPUs):

```
$ nvidia-ctk cdi list
@@ -92,7 +100,9 @@ nvidia.com/gpu=GPU-3eb87630-93d5-b2b6-b8ff-9b359caf4ee2
nvidia.com/gpu=all
```

See the NVIDIA Container Toolkit [CDI documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) for more information.
For the NVIDIA Container Toolkit, version >= v1.18.0 is recommended. See the NVIDIA Container Toolkit [CDI documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) for more information.

For the AMD Container Toolkit, version >= v1.2.0 is recommended. See the AMD Container Toolkit [CDI documentation](https://instinct.docs.amd.com/projects/container-toolkit/en/latest/container-runtime/cdi-guide.html) for more information.


### `nerdctl run --gpus` fails when using the Nvidia gpu-operator
9 changes: 8 additions & 1 deletion pkg/cmd/container/create.go
@@ -129,7 +129,14 @@ func Create(ctx context.Context, client *containerd.Client, args []string, netMa
	}
	opts = append(opts, platformOpts...)

	opts = append(opts, withCDIDevices(options.GOptions.CDISpecDirs, options.CDIDevices...))
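	// Initialize the static CDI registry only when CDI devices or GPUs were requested.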
	if len(options.CDIDevices) > 0 || len(options.GPUs) > 0 {
		opts = append(opts, withStaticCDIRegistry(options.GOptions.CDISpecDirs))
	}

	opts = append(opts,
		withGPUs(options.GPUs...),
		withCDIDevices(options.CDIDevices...),
	)

	if _, err := referenceutil.Parse(args[0]); errors.Is(err, referenceutil.ErrLoadOCIArchiveRequired) {
		imageRef := args[0]
63 changes: 54 additions & 9 deletions pkg/cmd/container/run_cdi.go
Contributor Author:
Can we do that in a follow up?

Once this is merged, we might organise our docs better and put the latest links here to help users install the AMD CDI specs through amd-ctk.

Contributor Author:
I have updated the documentation in this PR itself. PTAL whenever you get time.

cc @elezar

@@ -24,23 +24,68 @@ import (
	"github.com/containerd/containerd/v2/core/containers"
	cdispec "github.com/containerd/containerd/v2/pkg/cdi"
	"github.com/containerd/containerd/v2/pkg/oci"
	"github.com/containerd/log"
)

// withCDIDevices creates the OCI runtime spec options for injecting CDI devices.
// Two options are returned: The first ensures that the CDI registry is initialized with
// refresh disabled, and the second injects the devices into the container.
func withCDIDevices(cdiSpecDirs []string, devices ...string) oci.SpecOpts {
	return func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
		if len(devices) == 0 {
			return nil
		}
		// We configure the CDI registry with the configured spec dirs and disable refresh.
		cdi.Configure(

// detectGPUVendorFromCDI detects the first available GPU vendor from CDI cache.
// Returns empty string if no known vendor is found.
func detectGPUVendorFromCDI() string {
	cache := cdi.GetDefaultCache()
	availableVendors := cache.ListVendors()
	knownGPUVendors := []string{"nvidia.com", "amd.com"}
	for _, known := range knownGPUVendors {
		for _, available := range availableVendors {
			if known == available {
				return known
			}
		}
	}
	return ""
}

// withStaticCDIRegistry inits the CDI registry with given spec dirs
// and disables auto-refresh.
func withStaticCDIRegistry(cdiSpecDirs []string) oci.SpecOpts {
	return func(ctx context.Context, _ oci.Client, _ *containers.Container, _ *oci.Spec) error {
		_ = cdi.Configure(
			cdi.WithSpecDirs(cdiSpecDirs...),
			cdi.WithAutoRefresh(false),
		)
		if err := cdi.Refresh(); err != nil {
			// We don't consider registry refresh failure a fatal error.
			// For instance, a dynamically generated invalid CDI Spec file for
			// any particular vendor shouldn't prevent injection of devices of
			// different vendors. CDI itself knows better and it will fail the
			// injection if necessary.
			log.L.Warnf("CDI cache refresh failed: %v", err)
		}
		return nil
	}
}

// withCDIDevices creates the OCI runtime spec options for injecting CDI devices.
func withCDIDevices(devices ...string) oci.SpecOpts {
	return func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
		if len(devices) == 0 {
			return nil
		}
		return cdispec.WithCDIDevices(devices...)(ctx, client, c, s)
	}
}

// withGPUs creates the OCI runtime spec options for injecting GPUs via CDI.
// It parses the given GPU options and converts them to CDI device IDs.
// withCDIDevices is then used to perform the actual injection.
func withGPUs(gpuOpts ...string) oci.SpecOpts {
	return func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
		if len(gpuOpts) == 0 {
			return nil
		}
		cdiDevices, err := parseGPUOpts(gpuOpts)
		if err != nil {
			return err
		}
		return withCDIDevices(cdiDevices...)(ctx, client, c, s)
	}
}
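
// NOTE: parseGPUOpts is referenced above but defined outside this hunk.
// The following is a hypothetical sketch (not the actual nerdctl
// implementation) of how GPU options could map to CDI device IDs, matching
// the behavior exercised by TestRunGPU above: a numeric --gpus value selects
// that many device indexes from the first vendor found by
// detectGPUVendorFromCDI, "all" maps to <vendor>/gpu=all, and no detected
// vendor is an error. Assumes "errors", "fmt", and "strconv" are imported.
func parseGPUOpts(gpuOpts []string) ([]string, error) {
	vendor := detectGPUVendorFromCDI()
	if vendor == "" {
		return nil, errors.New("no supported GPU vendor (nvidia.com, amd.com) found in CDI specs")
	}
	var cdiDevices []string
	for _, opt := range gpuOpts {
		if opt == "all" {
			// Request every device advertised by the detected vendor.
			cdiDevices = append(cdiDevices, vendor+"/gpu=all")
			continue
		}
		count, err := strconv.Atoi(opt)
		if err != nil {
			return nil, fmt.Errorf("unsupported --gpus value %q: %w", opt, err)
		}
		// Request the first `count` device indexes of the detected vendor.
		for i := 0; i < count; i++ {
			cdiDevices = append(cdiDevices, fmt.Sprintf("%s/gpu=%d", vendor, i))
		}
	}
	return cdiDevices, nil
}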