|
3 | 3 |
|
4 | 4 | //! GPU workload validation e2e tests. |
5 | 5 |
|
| 6 | +use std::fs; |
| 7 | +use std::path::{Path, PathBuf}; |
| 8 | + |
6 | 9 | use openshell_e2e::harness::output::strip_ansi; |
7 | 10 | use openshell_e2e::harness::sandbox::SandboxGuard; |
| 11 | +use serde::Deserialize; |
8 | 12 |
|
/// Environment variable that, when set to a non-blank value, overrides the
/// default workload manifest location.
const WORKLOAD_MANIFEST_ENV: &str = "OPENSHELL_E2E_WORKLOAD_MANIFEST";
/// Marker looked for in sandbox output when a workload is expected to pass.
const GPU_WORKLOAD_SUCCESS_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_SUCCESS";
/// Marker looked for in the sandbox-create error when a workload is expected to fail.
const GPU_WORKLOAD_FAILURE_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_FAILURE";
| 16 | + |
| 17 | +#[derive(Debug, Deserialize)] |
| 18 | +struct WorkloadManifest { |
| 19 | + workloads: Vec<WorkloadDefinition>, |
| 20 | +} |
| 21 | + |
| 22 | +#[derive(Clone, Debug, Deserialize)] |
| 23 | +struct WorkloadDefinition { |
| 24 | + name: String, |
| 25 | + image: String, |
| 26 | + command: Vec<String>, |
| 27 | + expect: WorkloadExpectation, |
| 28 | + #[serde(default)] |
| 29 | + requirements: WorkloadRequirements, |
| 30 | +} |
12 | 31 |
|
13 | | -fn cuda_workload_image() -> Option<String> { |
14 | | - std::env::var(CUDA_WORKLOAD_IMAGE_ENV) |
| 32 | +#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)] |
| 33 | +#[serde(rename_all = "lowercase")] |
| 34 | +enum WorkloadExpectation { |
| 35 | + Pass, |
| 36 | + Fail, |
| 37 | +} |
| 38 | + |
| 39 | +#[derive(Clone, Debug, Default, Deserialize)] |
| 40 | +struct WorkloadRequirements { |
| 41 | + #[serde(default)] |
| 42 | + gpu: bool, |
| 43 | +} |
| 44 | + |
| 45 | +fn default_workload_manifest_path() -> PathBuf { |
| 46 | + Path::new(env!("CARGO_MANIFEST_DIR")).join("../gpu/images/.build/workloads.yaml") |
| 47 | +} |
| 48 | + |
| 49 | +fn workload_manifest_path() -> PathBuf { |
| 50 | + std::env::var(WORKLOAD_MANIFEST_ENV) |
15 | 51 | .ok() |
16 | 52 | .map(|value| value.trim().to_string()) |
17 | 53 | .filter(|value| !value.is_empty()) |
| 54 | + .map(PathBuf::from) |
| 55 | + .unwrap_or_else(default_workload_manifest_path) |
18 | 56 | } |
19 | 57 |
|
20 | | -#[tokio::test] |
21 | | -async fn cuda_gpu_workload_validation_runs_explicit_workload_binary() { |
22 | | - let Some(image) = cuda_workload_image() else { |
23 | | - eprintln!("skipping CUDA GPU workload validation: {CUDA_WORKLOAD_IMAGE_ENV} is not set"); |
24 | | - return; |
| 58 | +fn load_workload_manifest() -> Option<WorkloadManifest> { |
| 59 | + let path = workload_manifest_path(); |
| 60 | + let explicit_override = std::env::var(WORKLOAD_MANIFEST_ENV) |
| 61 | + .ok() |
| 62 | + .map(|value| !value.trim().is_empty()) |
| 63 | + .unwrap_or(false); |
| 64 | + |
| 65 | + let contents = match fs::read_to_string(&path) { |
| 66 | + Ok(contents) => contents, |
| 67 | + Err(err) if !explicit_override && err.kind() == std::io::ErrorKind::NotFound => { |
| 68 | + eprintln!( |
| 69 | + "skipping GPU workload validation: no workload manifest at {}. \ |
| 70 | + Run `mise run e2e:workloads:build` to create the local manifest \ |
| 71 | + or set {WORKLOAD_MANIFEST_ENV} to an external manifest.", |
| 72 | + path.display() |
| 73 | + ); |
| 74 | + return None; |
| 75 | + } |
| 76 | + Err(err) => panic!("failed to read workload manifest {}: {err}", path.display()), |
25 | 77 | }; |
26 | 78 |
|
27 | | - let mut guard = SandboxGuard::create(&[ |
28 | | - "--gpu", |
29 | | - "--from", |
30 | | - image.as_str(), |
31 | | - "--", |
32 | | - GPU_WORKLOAD_BINARY, |
33 | | - ]) |
34 | | - .await |
35 | | - .unwrap_or_else(|err| { |
| 79 | + let manifest: WorkloadManifest = serde_yaml::from_str(&contents).unwrap_or_else(|err| { |
36 | 80 | panic!( |
37 | | - "CUDA GPU workload sandbox create failed for image {image} with binary {GPU_WORKLOAD_BINARY}:\n{err}" |
| 81 | + "failed to parse workload manifest {}: {err}", |
| 82 | + path.display() |
| 83 | + ) |
| 84 | + }); |
| 85 | + assert!( |
| 86 | + !manifest.workloads.is_empty(), |
| 87 | + "workload manifest {} contains no workloads", |
| 88 | + path.display() |
| 89 | + ); |
| 90 | + Some(manifest) |
| 91 | +} |
| 92 | + |
| 93 | +async fn assert_expected_pass(workload: &WorkloadDefinition) { |
| 94 | + let mut args = vec![ |
| 95 | + "--gpu".to_string(), |
| 96 | + "--from".to_string(), |
| 97 | + workload.image.clone(), |
| 98 | + "--".to_string(), |
| 99 | + ]; |
| 100 | + args.extend(workload.command.clone()); |
| 101 | + let arg_refs = args.iter().map(String::as_str).collect::<Vec<_>>(); |
| 102 | + |
| 103 | + let mut guard = SandboxGuard::create(&arg_refs).await.unwrap_or_else(|err| { |
| 104 | + panic!( |
| 105 | + "GPU workload '{}' expected success but sandbox create failed:\n{err}", |
| 106 | + workload.name |
38 | 107 | ) |
39 | 108 | }); |
40 | 109 |
|
41 | 110 | let clean_output = strip_ansi(&guard.create_output); |
42 | 111 | assert!( |
43 | 112 | clean_output.contains(GPU_WORKLOAD_SUCCESS_MARKER), |
44 | | - "expected success marker {GPU_WORKLOAD_SUCCESS_MARKER} for image {image} in sandbox output:\n{clean_output}" |
| 113 | + "expected success marker {GPU_WORKLOAD_SUCCESS_MARKER} for workload '{}' image {} in sandbox output:\n{clean_output}", |
| 114 | + workload.name, |
| 115 | + workload.image, |
45 | 116 | ); |
46 | 117 |
|
47 | 118 | guard.cleanup().await; |
48 | 119 | } |
| 120 | + |
| 121 | +async fn assert_expected_fail(workload: &WorkloadDefinition) { |
| 122 | + let mut args = vec![ |
| 123 | + "--gpu".to_string(), |
| 124 | + "--from".to_string(), |
| 125 | + workload.image.clone(), |
| 126 | + "--".to_string(), |
| 127 | + ]; |
| 128 | + args.extend(workload.command.clone()); |
| 129 | + let arg_refs = args.iter().map(String::as_str).collect::<Vec<_>>(); |
| 130 | + |
| 131 | + match SandboxGuard::create(&arg_refs).await { |
| 132 | + Ok(mut guard) => { |
| 133 | + let clean_output = strip_ansi(&guard.create_output); |
| 134 | + guard.cleanup().await; |
| 135 | + panic!( |
| 136 | + "GPU workload '{}' unexpectedly succeeded. Output:\n{clean_output}", |
| 137 | + workload.name |
| 138 | + ); |
| 139 | + } |
| 140 | + Err(err) => { |
| 141 | + let clean_output = strip_ansi(&err); |
| 142 | + assert!( |
| 143 | + clean_output.contains(GPU_WORKLOAD_FAILURE_MARKER), |
| 144 | + "expected failure marker {GPU_WORKLOAD_FAILURE_MARKER} for workload '{}' image {} in failure output:\n{clean_output}", |
| 145 | + workload.name, |
| 146 | + workload.image, |
| 147 | + ); |
| 148 | + } |
| 149 | + } |
| 150 | +} |
| 151 | + |
| 152 | +#[tokio::test] |
| 153 | +async fn gpu_workload_manifest_runs_expected_workloads() { |
| 154 | + let Some(manifest) = load_workload_manifest() else { |
| 155 | + return; |
| 156 | + }; |
| 157 | + |
| 158 | + let gpu_workloads = manifest |
| 159 | + .workloads |
| 160 | + .into_iter() |
| 161 | + .filter(|workload| workload.requirements.gpu) |
| 162 | + .collect::<Vec<_>>(); |
| 163 | + |
| 164 | + assert!( |
| 165 | + !gpu_workloads.is_empty(), |
| 166 | + "workload manifest contains no GPU-tagged workloads" |
| 167 | + ); |
| 168 | + |
| 169 | + for workload in gpu_workloads { |
| 170 | + assert!( |
| 171 | + !workload.command.is_empty(), |
| 172 | + "workload '{}' must declare a non-empty command", |
| 173 | + workload.name |
| 174 | + ); |
| 175 | + |
| 176 | + match workload.expect { |
| 177 | + WorkloadExpectation::Pass => assert_expected_pass(&workload).await, |
| 178 | + WorkloadExpectation::Fail => assert_expected_fail(&workload).await, |
| 179 | + } |
| 180 | + } |
| 181 | +} |
0 commit comments