Skip to content

Commit 1c8f7b7

Browse files
committed
test(e2e): run gpu workloads from manifest
Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent 045bc2e commit 1c8f7b7

4 files changed

Lines changed: 186 additions & 22 deletions

File tree

e2e/rust/Cargo.lock

Lines changed: 28 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

e2e/rust/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,9 @@ sha1 = "0.10"
109109
sha2 = "0.10"
110110
hex = "0.4"
111111
rand = "0.9"
112+
serde = { version = "1", features = ["derive"] }
112113
serde_json = "1"
114+
serde_yaml = "0.9"
113115

114116
[lints.rust]
115117
unsafe_code = "warn"

e2e/rust/e2e-docker.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@ set -euo pipefail
1111
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
1212
E2E_TEST="${OPENSHELL_E2E_DOCKER_TEST:-smoke}"
1313
E2E_FEATURES="${OPENSHELL_E2E_DOCKER_FEATURES:-e2e,e2e-docker}"
14+
DEFAULT_WORKLOAD_MANIFEST="${ROOT}/e2e/gpu/images/.build/workloads.yaml"
1415

1516
cargo build -p openshell-cli --features openshell-core/dev-settings
1617

17-
if [ "${E2E_TEST}" = "gpu" ] && [ -z "${OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE:-}" ]; then
18-
echo "note: running GPU e2e without OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE; CUDA workload validation will log an explicit skip"
18+
if [ "${E2E_TEST}" = "gpu" ] && [ -z "${OPENSHELL_E2E_WORKLOAD_MANIFEST:-}" ] && [ ! -f "${DEFAULT_WORKLOAD_MANIFEST}" ]; then
19+
echo "note: running GPU e2e without a workload manifest; workload validation will log an explicit skip. Build one with 'mise run e2e:workloads:build' or set OPENSHELL_E2E_WORKLOAD_MANIFEST."
1920
fi
2021

2122
exec "${ROOT}/e2e/with-docker-gateway.sh" \

e2e/rust/tests/gpu/workloads.rs

Lines changed: 153 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,46 +3,179 @@
33

44
//! GPU workload validation e2e tests.
55
6+
use std::fs;
7+
use std::path::{Path, PathBuf};
8+
69
use openshell_e2e::harness::output::strip_ansi;
710
use openshell_e2e::harness::sandbox::SandboxGuard;
11+
use serde::Deserialize;
812

9-
const CUDA_WORKLOAD_IMAGE_ENV: &str = "OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE";
13+
const WORKLOAD_MANIFEST_ENV: &str = "OPENSHELL_E2E_WORKLOAD_MANIFEST";
1014
const GPU_WORKLOAD_SUCCESS_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_SUCCESS";
11-
const GPU_WORKLOAD_BINARY: &str = "/usr/local/bin/openshell-gpu-workload";
15+
const GPU_WORKLOAD_FAILURE_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_FAILURE";
16+
17+
#[derive(Debug, Deserialize)]
18+
struct WorkloadManifest {
19+
workloads: Vec<WorkloadDefinition>,
20+
}
21+
22+
#[derive(Clone, Debug, Deserialize)]
23+
struct WorkloadDefinition {
24+
name: String,
25+
image: String,
26+
command: Vec<String>,
27+
expect: WorkloadExpectation,
28+
#[serde(default)]
29+
requirements: WorkloadRequirements,
30+
}
1231

13-
fn cuda_workload_image() -> Option<String> {
14-
std::env::var(CUDA_WORKLOAD_IMAGE_ENV)
32+
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
33+
#[serde(rename_all = "lowercase")]
34+
enum WorkloadExpectation {
35+
Pass,
36+
Fail,
37+
}
38+
39+
#[derive(Clone, Debug, Default, Deserialize)]
40+
struct WorkloadRequirements {
41+
#[serde(default)]
42+
gpu: bool,
43+
}
44+
45+
fn default_workload_manifest_path() -> PathBuf {
46+
Path::new(env!("CARGO_MANIFEST_DIR")).join("../gpu/images/.build/workloads.yaml")
47+
}
48+
49+
fn workload_manifest_path() -> PathBuf {
50+
std::env::var(WORKLOAD_MANIFEST_ENV)
1551
.ok()
1652
.map(|value| value.trim().to_string())
1753
.filter(|value| !value.is_empty())
54+
.map(PathBuf::from)
55+
.unwrap_or_else(default_workload_manifest_path)
1856
}
1957

20-
#[tokio::test]
21-
async fn cuda_gpu_workload_validation_runs_explicit_workload_binary() {
22-
let Some(image) = cuda_workload_image() else {
23-
eprintln!("skipping CUDA GPU workload validation: {CUDA_WORKLOAD_IMAGE_ENV} is not set");
24-
return;
58+
fn load_workload_manifest() -> Option<WorkloadManifest> {
59+
let path = workload_manifest_path();
60+
let explicit_override = std::env::var(WORKLOAD_MANIFEST_ENV)
61+
.ok()
62+
.map(|value| !value.trim().is_empty())
63+
.unwrap_or(false);
64+
65+
let contents = match fs::read_to_string(&path) {
66+
Ok(contents) => contents,
67+
Err(err) if !explicit_override && err.kind() == std::io::ErrorKind::NotFound => {
68+
eprintln!(
69+
"skipping GPU workload validation: no workload manifest at {}. \
70+
Run `mise run e2e:workloads:build` to create the local manifest \
71+
or set {WORKLOAD_MANIFEST_ENV} to an external manifest.",
72+
path.display()
73+
);
74+
return None;
75+
}
76+
Err(err) => panic!("failed to read workload manifest {}: {err}", path.display()),
2577
};
2678

27-
let mut guard = SandboxGuard::create(&[
28-
"--gpu",
29-
"--from",
30-
image.as_str(),
31-
"--",
32-
GPU_WORKLOAD_BINARY,
33-
])
34-
.await
35-
.unwrap_or_else(|err| {
79+
let manifest: WorkloadManifest = serde_yaml::from_str(&contents).unwrap_or_else(|err| {
3680
panic!(
37-
"CUDA GPU workload sandbox create failed for image {image} with binary {GPU_WORKLOAD_BINARY}:\n{err}"
81+
"failed to parse workload manifest {}: {err}",
82+
path.display()
83+
)
84+
});
85+
assert!(
86+
!manifest.workloads.is_empty(),
87+
"workload manifest {} contains no workloads",
88+
path.display()
89+
);
90+
Some(manifest)
91+
}
92+
93+
async fn assert_expected_pass(workload: &WorkloadDefinition) {
94+
let mut args = vec![
95+
"--gpu".to_string(),
96+
"--from".to_string(),
97+
workload.image.clone(),
98+
"--".to_string(),
99+
];
100+
args.extend(workload.command.clone());
101+
let arg_refs = args.iter().map(String::as_str).collect::<Vec<_>>();
102+
103+
let mut guard = SandboxGuard::create(&arg_refs).await.unwrap_or_else(|err| {
104+
panic!(
105+
"GPU workload '{}' expected success but sandbox create failed:\n{err}",
106+
workload.name
38107
)
39108
});
40109

41110
let clean_output = strip_ansi(&guard.create_output);
42111
assert!(
43112
clean_output.contains(GPU_WORKLOAD_SUCCESS_MARKER),
44-
"expected success marker {GPU_WORKLOAD_SUCCESS_MARKER} for image {image} in sandbox output:\n{clean_output}"
113+
"expected success marker {GPU_WORKLOAD_SUCCESS_MARKER} for workload '{}' image {} in sandbox output:\n{clean_output}",
114+
workload.name,
115+
workload.image,
45116
);
46117

47118
guard.cleanup().await;
48119
}
120+
121+
async fn assert_expected_fail(workload: &WorkloadDefinition) {
122+
let mut args = vec![
123+
"--gpu".to_string(),
124+
"--from".to_string(),
125+
workload.image.clone(),
126+
"--".to_string(),
127+
];
128+
args.extend(workload.command.clone());
129+
let arg_refs = args.iter().map(String::as_str).collect::<Vec<_>>();
130+
131+
match SandboxGuard::create(&arg_refs).await {
132+
Ok(mut guard) => {
133+
let clean_output = strip_ansi(&guard.create_output);
134+
guard.cleanup().await;
135+
panic!(
136+
"GPU workload '{}' unexpectedly succeeded. Output:\n{clean_output}",
137+
workload.name
138+
);
139+
}
140+
Err(err) => {
141+
let clean_output = strip_ansi(&err);
142+
assert!(
143+
clean_output.contains(GPU_WORKLOAD_FAILURE_MARKER),
144+
"expected failure marker {GPU_WORKLOAD_FAILURE_MARKER} for workload '{}' image {} in failure output:\n{clean_output}",
145+
workload.name,
146+
workload.image,
147+
);
148+
}
149+
}
150+
}
151+
152+
#[tokio::test]
153+
async fn gpu_workload_manifest_runs_expected_workloads() {
154+
let Some(manifest) = load_workload_manifest() else {
155+
return;
156+
};
157+
158+
let gpu_workloads = manifest
159+
.workloads
160+
.into_iter()
161+
.filter(|workload| workload.requirements.gpu)
162+
.collect::<Vec<_>>();
163+
164+
assert!(
165+
!gpu_workloads.is_empty(),
166+
"workload manifest contains no GPU-tagged workloads"
167+
);
168+
169+
for workload in gpu_workloads {
170+
assert!(
171+
!workload.command.is_empty(),
172+
"workload '{}' must declare a non-empty command",
173+
workload.name
174+
);
175+
176+
match workload.expect {
177+
WorkloadExpectation::Pass => assert_expected_pass(&workload).await,
178+
WorkloadExpectation::Fail => assert_expected_fail(&workload).await,
179+
}
180+
}
181+
}

0 commit comments

Comments
 (0)