33
44//! Shared GPU request helpers.
55
6+ use std:: fmt;
7+ use std:: sync:: atomic:: { AtomicUsize , Ordering } ;
8+
69use crate :: config:: CDI_GPU_DEVICE_ALL ;
710
8- /// Resolve the existing GPU request fields into CDI device identifiers.
9- ///
10- /// `None` means no GPU was requested. A GPU request with no explicit device
11- /// ID uses the CDI all-GPU request; otherwise the driver-native ID passes
12- /// through unchanged.
13- #[ must_use]
14- pub fn cdi_gpu_device_ids ( gpu : bool , gpu_device : & str ) -> Option < Vec < String > > {
15- gpu. then ( || {
16- if gpu_device. is_empty ( ) {
17- vec ! [ CDI_GPU_DEVICE_ALL . to_string( ) ]
/// Prefix shared by the NVIDIA GPU CDI device IDs handled in this module
/// (`nvidia.com/gpu=<name>`).
const CDI_NVIDIA_GPU_PREFIX: &str = "nvidia.com/gpu=";
/// Device-name suffix that, appended to the prefix, forms the all-GPUs ID
/// (`nvidia.com/gpu=all`).
const CDI_NVIDIA_GPU_ALL_SUFFIX: &str = "all";
13+
14+ /// Normalized CDI GPU inventory used by local container drivers.
15+ #[ derive( Debug , Clone , Default , PartialEq , Eq ) ]
16+ pub struct CdiGpuInventory {
17+ device_ids : Vec < String > ,
18+ }
19+
20+ impl CdiGpuInventory {
21+ /// Build a normalized inventory from runtime-reported CDI device IDs.
22+ #[ must_use]
23+ pub fn new ( device_ids : impl IntoIterator < Item = impl AsRef < str > > ) -> Self {
24+ let mut device_ids = device_ids
25+ . into_iter ( )
26+ . filter_map ( |id| {
27+ let id = id. as_ref ( ) . trim ( ) ;
28+ id. starts_with ( CDI_NVIDIA_GPU_PREFIX )
29+ . then ( || id. to_string ( ) )
30+ } )
31+ . collect :: < Vec < _ > > ( ) ;
32+ device_ids. sort ( ) ;
33+ device_ids. dedup ( ) ;
34+ Self { device_ids }
35+ }
36+
37+ #[ must_use]
38+ pub fn as_slice ( & self ) -> & [ String ] {
39+ & self . device_ids
40+ }
41+
42+ #[ must_use]
43+ pub fn is_empty ( & self ) -> bool {
44+ self . device_ids . is_empty ( )
45+ }
46+
47+ fn default_device_family (
48+ & self ,
49+ allow_all_devices : bool ,
50+ ) -> Result < Vec < String > , CdiGpuSelectionError > {
51+ let mut indexed = self
52+ . device_ids
53+ . iter ( )
54+ . filter_map ( |id| {
55+ let suffix = cdi_nvidia_gpu_suffix ( id) ?;
56+ let index = suffix. parse :: < u64 > ( ) . ok ( ) ?;
57+ Some ( ( index, id. clone ( ) ) )
58+ } )
59+ . collect :: < Vec < _ > > ( ) ;
60+ if !indexed. is_empty ( ) {
61+ indexed. sort_by ( |left, right| left. 0 . cmp ( & right. 0 ) . then_with ( || left. 1 . cmp ( & right. 1 ) ) ) ;
62+ return Ok ( indexed. into_iter ( ) . map ( |( _, id) | id) . collect ( ) ) ;
63+ }
64+
65+ let mut named = self
66+ . device_ids
67+ . iter ( )
68+ . filter_map ( |id| {
69+ let suffix = cdi_nvidia_gpu_suffix ( id) ?;
70+ ( suffix != CDI_NVIDIA_GPU_ALL_SUFFIX ) . then ( || id. clone ( ) )
71+ } )
72+ . collect :: < Vec < _ > > ( ) ;
73+ if !named. is_empty ( ) {
74+ named. sort ( ) ;
75+ return Ok ( named) ;
76+ }
77+
78+ if self . device_ids . iter ( ) . any ( |id| id == CDI_GPU_DEVICE_ALL ) {
79+ if !allow_all_devices {
80+ return Err ( CdiGpuSelectionError :: AllDevicesDefaultUnsupported ) ;
81+ }
82+ return Ok ( vec ! [ CDI_GPU_DEVICE_ALL . to_string( ) ] ) ;
83+ }
84+
85+ Err ( CdiGpuSelectionError :: NoAvailableDevices )
86+ }
87+ }
88+
89+ /// Concurrency-safe round-robin cursor for default CDI GPU selection.
90+ #[ derive( Debug , Default ) ]
91+ pub struct CdiGpuRoundRobin {
92+ next : AtomicUsize ,
93+ }
94+
95+ impl CdiGpuRoundRobin {
96+ #[ must_use]
97+ pub const fn new ( ) -> Self {
98+ Self {
99+ next : AtomicUsize :: new ( 0 ) ,
100+ }
101+ }
102+
103+ /// Return the next default device ID and advance the cursor.
104+ pub fn next_default_device_id (
105+ & self ,
106+ inventory : & CdiGpuInventory ,
107+ allow_all_devices : bool ,
108+ ) -> Result < String , CdiGpuSelectionError > {
109+ self . selected_default_device_id ( inventory, true , allow_all_devices)
110+ }
111+
112+ /// Return the current default device ID without advancing the cursor.
113+ pub fn peek_default_device_id (
114+ & self ,
115+ inventory : & CdiGpuInventory ,
116+ allow_all_devices : bool ,
117+ ) -> Result < String , CdiGpuSelectionError > {
118+ self . selected_default_device_id ( inventory, false , allow_all_devices)
119+ }
120+
121+ fn selected_default_device_id (
122+ & self ,
123+ inventory : & CdiGpuInventory ,
124+ consume : bool ,
125+ allow_all_devices : bool ,
126+ ) -> Result < String , CdiGpuSelectionError > {
127+ let devices = inventory. default_device_family ( allow_all_devices) ?;
128+ let base = if consume {
129+ self . next . fetch_add ( 1 , Ordering :: Relaxed )
18130 } else {
19- vec ! [ gpu_device. to_string( ) ]
131+ self . next . load ( Ordering :: Relaxed )
132+ } ;
133+ Ok ( devices[ base % devices. len ( ) ] . clone ( ) )
134+ }
135+ }
136+
/// CDI GPU selection failed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CdiGpuSelectionError {
    /// No NVIDIA CDI GPU device was discovered.
    NoAvailableDevices,
    /// A default GPU request was made without a selected default device.
    MissingDefaultDevice,
    /// The default request resolved only to `nvidia.com/gpu=all`, which is
    /// not allowed as a default on this platform.
    AllDevicesDefaultUnsupported,
}

impl fmt::Display for CdiGpuSelectionError {
    /// Write the user-facing description of the failure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match *self {
            Self::NoAvailableDevices => "no NVIDIA CDI GPU devices were discovered",
            Self::MissingDefaultDevice => {
                "GPU request requires a selected default CDI GPU device"
            }
            Self::AllDevicesDefaultUnsupported => {
                "default GPU request resolved only to nvidia.com/gpu=all, which is not allowed on this platform; pass --gpu-device nvidia.com/gpu=all explicitly to request all GPUs"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for CdiGpuSelectionError {}
160+
161+ /// Resolve the existing GPU request fields into CDI device identifiers.
162+ ///
163+ /// `None` means no GPU was requested. A GPU request with an explicit device ID
164+ /// passes through unchanged. A default GPU request uses the driver-selected
165+ /// default CDI ID.
166+ pub fn cdi_gpu_device_ids (
167+ gpu : bool ,
168+ gpu_device : & str ,
169+ selected_default_device : Option < & str > ,
170+ ) -> Result < Option < Vec < String > > , CdiGpuSelectionError > {
171+ if !gpu {
172+ return Ok ( None ) ;
173+ }
174+ if !gpu_device. is_empty ( ) {
175+ return Ok ( Some ( vec ! [ gpu_device. to_string( ) ] ) ) ;
176+ }
177+ let device = selected_default_device. ok_or ( CdiGpuSelectionError :: MissingDefaultDevice ) ?;
178+ Ok ( Some ( vec ! [ device. to_string( ) ] ) )
179+ }
180+
181+ fn cdi_nvidia_gpu_suffix ( id : & str ) -> Option < & str > {
182+ id. strip_prefix ( CDI_NVIDIA_GPU_PREFIX )
22183}
23184
24185#[ cfg( test) ]
@@ -27,22 +188,139 @@ mod tests {
27188
28189 #[ test]
29190 fn cdi_gpu_device_ids_returns_none_when_absent ( ) {
30- assert_eq ! ( cdi_gpu_device_ids( false , "" ) , None ) ;
191+ assert_eq ! ( cdi_gpu_device_ids( false , "" , None ) , Ok ( None ) ) ;
31192 }
32193
33194 #[ test]
34- fn cdi_gpu_device_ids_defaults_empty_request_to_all_gpus ( ) {
195+ fn cdi_gpu_device_ids_uses_selected_default_device ( ) {
35196 assert_eq ! (
36- cdi_gpu_device_ids( true , "" ) ,
37- Some ( vec![ CDI_GPU_DEVICE_ALL . to_string( ) ] )
197+ cdi_gpu_device_ids( true , "" , Some ( "nvidia.com/gpu=0" ) ) ,
198+ Ok ( Some ( vec![ "nvidia.com/gpu=0" . to_string( ) ] ) )
199+ ) ;
200+ }
201+
202+ #[ test]
203+ fn cdi_gpu_device_ids_rejects_missing_default_device ( ) {
204+ assert_eq ! (
205+ cdi_gpu_device_ids( true , "" , None ) ,
206+ Err ( CdiGpuSelectionError :: MissingDefaultDevice )
38207 ) ;
39208 }
40209
41210 #[ test]
42211 fn cdi_gpu_device_ids_passes_explicit_device_id_through ( ) {
43212 assert_eq ! (
44- cdi_gpu_device_ids( true , "nvidia.com/gpu=0" ) ,
45- Some ( vec![ "nvidia.com/gpu=0" . to_string( ) ] )
213+ cdi_gpu_device_ids( true , "nvidia.com/gpu=0" , None ) ,
214+ Ok ( Some ( vec![ "nvidia.com/gpu=0" . to_string( ) ] ) )
215+ ) ;
216+ }
217+
218+ #[ test]
219+ fn inventory_filters_and_deduplicates_nvidia_gpu_ids ( ) {
220+ let inventory = CdiGpuInventory :: new ( [
221+ "nvidia.com/gpu=1" ,
222+ "vendor.example/device=0" ,
223+ "nvidia.com/gpu=1" ,
224+ " nvidia.com/gpu=0 " ,
225+ ] ) ;
226+
227+ assert_eq ! (
228+ inventory. as_slice( ) ,
229+ & vec![
230+ "nvidia.com/gpu=0" . to_string( ) ,
231+ "nvidia.com/gpu=1" . to_string( )
232+ ]
233+ ) ;
234+ }
235+
236+ #[ test]
237+ fn round_robin_prefers_indexed_family_and_sorts_numerically ( ) {
238+ let inventory = CdiGpuInventory :: new ( [
239+ "nvidia.com/gpu=10" ,
240+ "nvidia.com/gpu=UUID-b" ,
241+ "nvidia.com/gpu=2" ,
242+ "nvidia.com/gpu=all" ,
243+ ] ) ;
244+ let selector = CdiGpuRoundRobin :: new ( ) ;
245+
246+ assert_eq ! (
247+ selector. next_default_device_id( & inventory, false ) ,
248+ Ok ( "nvidia.com/gpu=2" . to_string( ) )
249+ ) ;
250+ assert_eq ! (
251+ selector. next_default_device_id( & inventory, false ) ,
252+ Ok ( "nvidia.com/gpu=10" . to_string( ) )
253+ ) ;
254+ assert_eq ! (
255+ selector. next_default_device_id( & inventory, false ) ,
256+ Ok ( "nvidia.com/gpu=2" . to_string( ) )
257+ ) ;
258+ }
259+
260+ #[ test]
261+ fn round_robin_uses_named_family_when_no_indexed_ids_exist ( ) {
262+ let inventory = CdiGpuInventory :: new ( [ "nvidia.com/gpu=UUID-b" , "nvidia.com/gpu=UUID-a" ] ) ;
263+ let selector = CdiGpuRoundRobin :: new ( ) ;
264+
265+ assert_eq ! (
266+ selector. next_default_device_id( & inventory, false ) ,
267+ Ok ( "nvidia.com/gpu=UUID-a" . to_string( ) )
268+ ) ;
269+ }
270+
271+ #[ test]
272+ fn round_robin_uses_all_only_inventory_when_allowed ( ) {
273+ let inventory = CdiGpuInventory :: new ( [ CDI_GPU_DEVICE_ALL ] ) ;
274+ let selector = CdiGpuRoundRobin :: new ( ) ;
275+
276+ assert_eq ! (
277+ selector. next_default_device_id( & inventory, true ) ,
278+ Ok ( CDI_GPU_DEVICE_ALL . to_string( ) )
279+ ) ;
280+ }
281+
282+ #[ test]
283+ fn round_robin_rejects_all_only_inventory_when_not_allowed ( ) {
284+ let inventory = CdiGpuInventory :: new ( [ CDI_GPU_DEVICE_ALL ] ) ;
285+ let selector = CdiGpuRoundRobin :: new ( ) ;
286+
287+ assert_eq ! (
288+ selector. next_default_device_id( & inventory, false ) ,
289+ Err ( CdiGpuSelectionError :: AllDevicesDefaultUnsupported )
290+ ) ;
291+ }
292+
293+ #[ test]
294+ fn round_robin_rejects_empty_inventory ( ) {
295+ let inventory = CdiGpuInventory :: new ( [ "vendor.example/device=0" ] ) ;
296+ let selector = CdiGpuRoundRobin :: new ( ) ;
297+
298+ assert_eq ! (
299+ selector. next_default_device_id( & inventory, false ) ,
300+ Err ( CdiGpuSelectionError :: NoAvailableDevices )
301+ ) ;
302+ }
303+
304+ #[ test]
305+ fn peek_does_not_advance_round_robin_cursor ( ) {
306+ let inventory = CdiGpuInventory :: new ( [ "nvidia.com/gpu=0" , "nvidia.com/gpu=1" ] ) ;
307+ let selector = CdiGpuRoundRobin :: new ( ) ;
308+
309+ assert_eq ! (
310+ selector. peek_default_device_id( & inventory, false ) ,
311+ Ok ( "nvidia.com/gpu=0" . to_string( ) )
312+ ) ;
313+ assert_eq ! (
314+ selector. peek_default_device_id( & inventory, false ) ,
315+ Ok ( "nvidia.com/gpu=0" . to_string( ) )
316+ ) ;
317+ assert_eq ! (
318+ selector. next_default_device_id( & inventory, false ) ,
319+ Ok ( "nvidia.com/gpu=0" . to_string( ) )
320+ ) ;
321+ assert_eq ! (
322+ selector. next_default_device_id( & inventory, false ) ,
323+ Ok ( "nvidia.com/gpu=1" . to_string( ) )
46324 ) ;
47325 }
48326}
0 commit comments