Skip to content

Commit bbdc514

Browse files
committed
fix(gpu): prefer single CDI devices for local runtimes
Prefer a single CDI-qualified device when Docker or Podman resolves the default GPU request to one GPU. Allow nvidia.com/gpu=all only as a WSL2 all-only compatibility fallback, using Docker daemon info and Podman's /dev/dxg probe to identify that case. Update driver docs, architecture notes, and GPU e2e coverage for the default selection behavior. Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent 97986d9 commit bbdc514

10 files changed

Lines changed: 945 additions & 85 deletions

File tree

architecture/compute-runtimes.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ through the driver configuration. The Helm chart defaults sandbox agents to
4545
`Unconfined` so runtime/default AppArmor profiles do not block supervisor
4646
network namespace setup on AppArmor-enabled nodes.
4747

48+
GPU requests enter the driver layer through `SandboxSpec.gpu` and
49+
`SandboxSpec.gpu_device`. Docker and Podman map default GPU requests to one
50+
concrete NVIDIA CDI device when individual CDI devices are available, use
51+
`nvidia.com/gpu=all` only for WSL2/all-only compatibility, and pass explicit
52+
driver-native device IDs through.
53+
4854
VM runtime state paths are derived only from driver-validated sandbox IDs
4955
matching `[A-Za-z0-9._-]{1,128}`. The gateway-owned VM driver socket uses a
5056
private `run/` directory plus Unix peer UID/PID checks. Standalone

crates/openshell-core/src/gpu.rs

Lines changed: 296 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,183 @@
33

44
//! Shared GPU request helpers.
55
6+
use std::fmt;
7+
use std::sync::atomic::{AtomicUsize, Ordering};
8+
69
use crate::config::CDI_GPU_DEVICE_ALL;
710

8-
/// Resolve the existing GPU request fields into CDI device identifiers.
9-
///
10-
/// `None` means no GPU was requested. A GPU request with no explicit device
11-
/// ID uses the CDI all-GPU request; otherwise the driver-native ID passes
12-
/// through unchanged.
13-
#[must_use]
14-
pub fn cdi_gpu_device_ids(gpu: bool, gpu_device: &str) -> Option<Vec<String>> {
15-
gpu.then(|| {
16-
if gpu_device.is_empty() {
17-
vec![CDI_GPU_DEVICE_ALL.to_string()]
11+
/// Prefix that every NVIDIA GPU CDI device ID handled in this module starts with.
const CDI_NVIDIA_GPU_PREFIX: &str = "nvidia.com/gpu=";
12+
/// Device-name suffix of the aggregate ID; IDs carrying it are excluded from the named device family.
const CDI_NVIDIA_GPU_ALL_SUFFIX: &str = "all";
13+
14+
/// Normalized CDI GPU inventory used by local container drivers.
15+
#[derive(Debug, Clone, Default, PartialEq, Eq)]
16+
pub struct CdiGpuInventory {
17+
device_ids: Vec<String>,
18+
}
19+
20+
impl CdiGpuInventory {
21+
/// Build a normalized inventory from runtime-reported CDI device IDs.
22+
#[must_use]
23+
pub fn new(device_ids: impl IntoIterator<Item = impl AsRef<str>>) -> Self {
24+
let mut device_ids = device_ids
25+
.into_iter()
26+
.filter_map(|id| {
27+
let id = id.as_ref().trim();
28+
id.starts_with(CDI_NVIDIA_GPU_PREFIX)
29+
.then(|| id.to_string())
30+
})
31+
.collect::<Vec<_>>();
32+
device_ids.sort();
33+
device_ids.dedup();
34+
Self { device_ids }
35+
}
36+
37+
#[must_use]
38+
pub fn as_slice(&self) -> &[String] {
39+
&self.device_ids
40+
}
41+
42+
#[must_use]
43+
pub fn is_empty(&self) -> bool {
44+
self.device_ids.is_empty()
45+
}
46+
47+
fn default_device_family(
48+
&self,
49+
allow_all_devices: bool,
50+
) -> Result<Vec<String>, CdiGpuSelectionError> {
51+
let mut indexed = self
52+
.device_ids
53+
.iter()
54+
.filter_map(|id| {
55+
let suffix = cdi_nvidia_gpu_suffix(id)?;
56+
let index = suffix.parse::<u64>().ok()?;
57+
Some((index, id.clone()))
58+
})
59+
.collect::<Vec<_>>();
60+
if !indexed.is_empty() {
61+
indexed.sort_by(|left, right| left.0.cmp(&right.0).then_with(|| left.1.cmp(&right.1)));
62+
return Ok(indexed.into_iter().map(|(_, id)| id).collect());
63+
}
64+
65+
let mut named = self
66+
.device_ids
67+
.iter()
68+
.filter_map(|id| {
69+
let suffix = cdi_nvidia_gpu_suffix(id)?;
70+
(suffix != CDI_NVIDIA_GPU_ALL_SUFFIX).then(|| id.clone())
71+
})
72+
.collect::<Vec<_>>();
73+
if !named.is_empty() {
74+
named.sort();
75+
return Ok(named);
76+
}
77+
78+
if self.device_ids.iter().any(|id| id == CDI_GPU_DEVICE_ALL) {
79+
if !allow_all_devices {
80+
return Err(CdiGpuSelectionError::AllDevicesDefaultUnsupported);
81+
}
82+
return Ok(vec![CDI_GPU_DEVICE_ALL.to_string()]);
83+
}
84+
85+
Err(CdiGpuSelectionError::NoAvailableDevices)
86+
}
87+
}
88+
89+
/// Concurrency-safe round-robin cursor for default CDI GPU selection.
90+
#[derive(Debug, Default)]
91+
pub struct CdiGpuRoundRobin {
92+
next: AtomicUsize,
93+
}
94+
95+
impl CdiGpuRoundRobin {
96+
#[must_use]
97+
pub const fn new() -> Self {
98+
Self {
99+
next: AtomicUsize::new(0),
100+
}
101+
}
102+
103+
/// Return the next default device ID and advance the cursor.
104+
pub fn next_default_device_id(
105+
&self,
106+
inventory: &CdiGpuInventory,
107+
allow_all_devices: bool,
108+
) -> Result<String, CdiGpuSelectionError> {
109+
self.selected_default_device_id(inventory, true, allow_all_devices)
110+
}
111+
112+
/// Return the current default device ID without advancing the cursor.
113+
pub fn peek_default_device_id(
114+
&self,
115+
inventory: &CdiGpuInventory,
116+
allow_all_devices: bool,
117+
) -> Result<String, CdiGpuSelectionError> {
118+
self.selected_default_device_id(inventory, false, allow_all_devices)
119+
}
120+
121+
fn selected_default_device_id(
122+
&self,
123+
inventory: &CdiGpuInventory,
124+
consume: bool,
125+
allow_all_devices: bool,
126+
) -> Result<String, CdiGpuSelectionError> {
127+
let devices = inventory.default_device_family(allow_all_devices)?;
128+
let base = if consume {
129+
self.next.fetch_add(1, Ordering::Relaxed)
18130
} else {
19-
vec![gpu_device.to_string()]
131+
self.next.load(Ordering::Relaxed)
132+
};
133+
Ok(devices[base % devices.len()].clone())
134+
}
135+
}
136+
137+
/// CDI GPU selection failed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CdiGpuSelectionError {
    /// No NVIDIA CDI GPU devices were discovered.
    NoAvailableDevices,
    /// A default GPU request was made without a selected default CDI device.
    MissingDefaultDevice,
    /// The default request resolved only to the all-GPU ID, which is not allowed here.
    AllDevicesDefaultUnsupported,
}

impl fmt::Display for CdiGpuSelectionError {
    /// Write the user-facing description of the failure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match *self {
            Self::NoAvailableDevices => "no NVIDIA CDI GPU devices were discovered",
            Self::MissingDefaultDevice => {
                "GPU request requires a selected default CDI GPU device"
            }
            Self::AllDevicesDefaultUnsupported => {
                "default GPU request resolved only to nvidia.com/gpu=all, which is not allowed on this platform; pass --gpu-device nvidia.com/gpu=all explicitly to request all GPUs"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for CdiGpuSelectionError {}
160+
161+
/// Resolve the existing GPU request fields into CDI device identifiers.
162+
///
163+
/// `None` means no GPU was requested. A GPU request with an explicit device ID
164+
/// passes through unchanged. A default GPU request uses the driver-selected
165+
/// default CDI ID.
166+
pub fn cdi_gpu_device_ids(
167+
gpu: bool,
168+
gpu_device: &str,
169+
selected_default_device: Option<&str>,
170+
) -> Result<Option<Vec<String>>, CdiGpuSelectionError> {
171+
if !gpu {
172+
return Ok(None);
173+
}
174+
if !gpu_device.is_empty() {
175+
return Ok(Some(vec![gpu_device.to_string()]));
176+
}
177+
let device = selected_default_device.ok_or(CdiGpuSelectionError::MissingDefaultDevice)?;
178+
Ok(Some(vec![device.to_string()]))
179+
}
180+
181+
/// Return the part of `id` that follows the `nvidia.com/gpu=` prefix, or
/// `None` when `id` does not start with that prefix.
fn cdi_nvidia_gpu_suffix(id: &str) -> Option<&str> {
182+
id.strip_prefix(CDI_NVIDIA_GPU_PREFIX)
22183
}
23184

24185
#[cfg(test)]
@@ -27,22 +188,139 @@ mod tests {
27188

28189
#[test]
29190
fn cdi_gpu_device_ids_returns_none_when_absent() {
30-
assert_eq!(cdi_gpu_device_ids(false, ""), None);
191+
assert_eq!(cdi_gpu_device_ids(false, "", None), Ok(None));
31192
}
32193

33194
#[test]
34-
fn cdi_gpu_device_ids_defaults_empty_request_to_all_gpus() {
195+
fn cdi_gpu_device_ids_uses_selected_default_device() {
35196
assert_eq!(
36-
cdi_gpu_device_ids(true, ""),
37-
Some(vec![CDI_GPU_DEVICE_ALL.to_string()])
197+
cdi_gpu_device_ids(true, "", Some("nvidia.com/gpu=0")),
198+
Ok(Some(vec!["nvidia.com/gpu=0".to_string()]))
199+
);
200+
}
201+
202+
#[test]
203+
fn cdi_gpu_device_ids_rejects_missing_default_device() {
204+
assert_eq!(
205+
cdi_gpu_device_ids(true, "", None),
206+
Err(CdiGpuSelectionError::MissingDefaultDevice)
38207
);
39208
}
40209

41210
#[test]
42211
fn cdi_gpu_device_ids_passes_explicit_device_id_through() {
43212
assert_eq!(
44-
cdi_gpu_device_ids(true, "nvidia.com/gpu=0"),
45-
Some(vec!["nvidia.com/gpu=0".to_string()])
213+
cdi_gpu_device_ids(true, "nvidia.com/gpu=0", None),
214+
Ok(Some(vec!["nvidia.com/gpu=0".to_string()]))
215+
);
216+
}
217+
218+
#[test]
219+
fn inventory_filters_and_deduplicates_nvidia_gpu_ids() {
220+
let inventory = CdiGpuInventory::new([
221+
"nvidia.com/gpu=1",
222+
"vendor.example/device=0",
223+
"nvidia.com/gpu=1",
224+
" nvidia.com/gpu=0 ",
225+
]);
226+
227+
assert_eq!(
228+
inventory.as_slice(),
229+
&vec![
230+
"nvidia.com/gpu=0".to_string(),
231+
"nvidia.com/gpu=1".to_string()
232+
]
233+
);
234+
}
235+
236+
#[test]
237+
fn round_robin_prefers_indexed_family_and_sorts_numerically() {
238+
let inventory = CdiGpuInventory::new([
239+
"nvidia.com/gpu=10",
240+
"nvidia.com/gpu=UUID-b",
241+
"nvidia.com/gpu=2",
242+
"nvidia.com/gpu=all",
243+
]);
244+
let selector = CdiGpuRoundRobin::new();
245+
246+
assert_eq!(
247+
selector.next_default_device_id(&inventory, false),
248+
Ok("nvidia.com/gpu=2".to_string())
249+
);
250+
assert_eq!(
251+
selector.next_default_device_id(&inventory, false),
252+
Ok("nvidia.com/gpu=10".to_string())
253+
);
254+
assert_eq!(
255+
selector.next_default_device_id(&inventory, false),
256+
Ok("nvidia.com/gpu=2".to_string())
257+
);
258+
}
259+
260+
#[test]
261+
fn round_robin_uses_named_family_when_no_indexed_ids_exist() {
262+
let inventory = CdiGpuInventory::new(["nvidia.com/gpu=UUID-b", "nvidia.com/gpu=UUID-a"]);
263+
let selector = CdiGpuRoundRobin::new();
264+
265+
assert_eq!(
266+
selector.next_default_device_id(&inventory, false),
267+
Ok("nvidia.com/gpu=UUID-a".to_string())
268+
);
269+
}
270+
271+
#[test]
272+
fn round_robin_uses_all_only_inventory_when_allowed() {
273+
let inventory = CdiGpuInventory::new([CDI_GPU_DEVICE_ALL]);
274+
let selector = CdiGpuRoundRobin::new();
275+
276+
assert_eq!(
277+
selector.next_default_device_id(&inventory, true),
278+
Ok(CDI_GPU_DEVICE_ALL.to_string())
279+
);
280+
}
281+
282+
#[test]
283+
fn round_robin_rejects_all_only_inventory_when_not_allowed() {
284+
let inventory = CdiGpuInventory::new([CDI_GPU_DEVICE_ALL]);
285+
let selector = CdiGpuRoundRobin::new();
286+
287+
assert_eq!(
288+
selector.next_default_device_id(&inventory, false),
289+
Err(CdiGpuSelectionError::AllDevicesDefaultUnsupported)
290+
);
291+
}
292+
293+
#[test]
294+
fn round_robin_rejects_empty_inventory() {
295+
let inventory = CdiGpuInventory::new(["vendor.example/device=0"]);
296+
let selector = CdiGpuRoundRobin::new();
297+
298+
assert_eq!(
299+
selector.next_default_device_id(&inventory, false),
300+
Err(CdiGpuSelectionError::NoAvailableDevices)
301+
);
302+
}
303+
304+
#[test]
305+
fn peek_does_not_advance_round_robin_cursor() {
306+
let inventory = CdiGpuInventory::new(["nvidia.com/gpu=0", "nvidia.com/gpu=1"]);
307+
let selector = CdiGpuRoundRobin::new();
308+
309+
assert_eq!(
310+
selector.peek_default_device_id(&inventory, false),
311+
Ok("nvidia.com/gpu=0".to_string())
312+
);
313+
assert_eq!(
314+
selector.peek_default_device_id(&inventory, false),
315+
Ok("nvidia.com/gpu=0".to_string())
316+
);
317+
assert_eq!(
318+
selector.next_default_device_id(&inventory, false),
319+
Ok("nvidia.com/gpu=0".to_string())
320+
);
321+
assert_eq!(
322+
selector.next_default_device_id(&inventory, false),
323+
Ok("nvidia.com/gpu=1".to_string())
46324
);
47325
}
48326
}

crates/openshell-driver-docker/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ contract:
3232
| `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. |
3333
| `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. |
3434
| `PidsLimit` | Enforces the sandbox PID budget at the Docker cgroup layer. Set `[openshell.drivers.docker].sandbox_pids_limit = 0` to inherit the Docker/runtime default. |
35-
| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |
35+
| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise selects one concrete NVIDIA CDI GPU when the sandbox spec asks for GPU support and daemon CDI support is detected. When Docker daemon `/info` identifies a WSL2 host that exposes only `nvidia.com/gpu=all`, that all-GPU ID is permitted as a compatibility fallback. |
3636

3737
The agent child process does not retain these supervisor privileges.
3838

0 commit comments

Comments
 (0)