Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/e2e-gpu-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,8 @@ jobs:
docker info --format '{{json .CDISpecDirs}}'
docker run --rm --device nvidia.com/gpu=all "${OPENSHELL_E2E_GPU_PROBE_IMAGE}" nvidia-smi -L

- name: Build GPU workload images
run: mise run --no-deps --skip-deps e2e:workloads:build

- name: Run tests
run: mise run --no-deps --skip-deps e2e:docker:gpu
123 changes: 123 additions & 0 deletions crates/openshell-cli/src/ssh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@ use tokio::process::Command as TokioCommand;
use tokio_stream::wrappers::ReceiverStream;

const FOREGROUND_FORWARD_STARTUP_GRACE_PERIOD: Duration = Duration::from_secs(2);
/// Names of linker / library-search-path environment variables that are
/// handled specially when launching host tools.
///
/// `sanitize_host_tool_environment` removes every entry from the spawned
/// command's environment, and `proxy_command_with_preserved_environment`
/// re-applies whichever entries are currently set by prefixing the proxy
/// command with `env NAME=value ...`.
const HOST_TOOL_LINKER_ENV: &[&str] = &[
"DYLD_FALLBACK_LIBRARY_PATH",
"DYLD_INSERT_LIBRARIES",
"DYLD_LIBRARY_PATH",
"LD_AUDIT",
"LD_LIBRARY_PATH",
"LD_PRELOAD",
"LIBRARY_PATH",
"NIX_LD_LIBRARY_PATH",
];

#[derive(Clone, Copy, Debug)]
pub enum Editor {
Expand Down Expand Up @@ -121,6 +131,7 @@ async fn ssh_session_config(
&session.token,
gateway_name,
);
let proxy_command = proxy_command_with_preserved_environment(proxy_command);

Ok(SshSessionConfig {
proxy_command,
Expand All @@ -137,6 +148,7 @@ fn ssh_base_command(proxy_command: &str) -> Command {
std::env::var("OPENSHELL_SSH_LOG_LEVEL").unwrap_or_else(|_| "ERROR".to_string());

let mut command = Command::new("ssh");
sanitize_host_tool_environment(&mut command);
command
.arg("-o")
.arg(format!("ProxyCommand={proxy_command}"))
Expand All @@ -159,6 +171,30 @@ fn ssh_base_command(proxy_command: &str) -> Command {
command
}

/// Strips every variable listed in `HOST_TOOL_LINKER_ENV` from the environment
/// that `command` will be spawned with.
fn sanitize_host_tool_environment(command: &mut Command) {
    HOST_TOOL_LINKER_ENV.iter().for_each(|name| {
        // `env_remove` returns `&mut Command`; the chained handle is not needed.
        command.env_remove(name);
    });
}

/// Prefixes `proxy_command` with `env NAME=value ...` for every variable in
/// `HOST_TOOL_LINKER_ENV` that is set in the current process.
///
/// `ssh_base_command` removes those variables from the `ssh` process it
/// spawns, so they are re-applied here for the proxy command itself. When none
/// of the variables are set, `proxy_command` is returned unchanged.
///
/// Values are converted with `to_string_lossy`, so any non-UTF-8 bytes are
/// replaced rather than preserved.
fn proxy_command_with_preserved_environment(proxy_command: String) -> String {
    let assignments = HOST_TOOL_LINKER_ENV
        .iter()
        .filter_map(|key| {
            std::env::var_os(key).map(|value| {
                let value = value.to_string_lossy();
                // The result is embedded in ssh's `ProxyCommand`, which ssh
                // percent-expands (`%h`, `%p`, `%%`, ...) before handing the
                // string to the shell. Double any literal `%` so it survives
                // that expansion; this must be the outermost escaping layer.
                format!("{key}={}", shell_escape(&value)).replace('%', "%%")
            })
        })
        .collect::<Vec<_>>();

    if assignments.is_empty() {
        proxy_command
    } else {
        format!("env {} {proxy_command}", assignments.join(" "))
    }
}

#[cfg(unix)]
const TRANSIENT_TTY_SIGNALS: &[Signal] = &[Signal::SIGINT, Signal::SIGQUIT, Signal::SIGTERM];

Expand Down Expand Up @@ -1508,6 +1544,93 @@ mod tests {
use super::*;
use crate::TEST_ENV_LOCK;

#[test]
fn ssh_base_command_removes_host_linker_environment() {
    let command = ssh_base_command("openshell ssh-proxy");

    // `get_envs` reports an explicit removal as a key paired with `None`.
    let removed_keys: Vec<String> = command
        .get_envs()
        .filter_map(|(name, value)| match value {
            None => Some(name.to_string_lossy().into_owned()),
            Some(_) => None,
        })
        .collect();

    for key in HOST_TOOL_LINKER_ENV {
        let was_removed = removed_keys.iter().any(|removed| removed == key);
        assert!(was_removed, "expected ssh command to remove {key}");
    }
}

#[test]
#[allow(unsafe_code)] // Test-only: env vars require unsafe in Rust 2024.
fn proxy_command_preserves_linker_environment_for_proxy_child() {
    let _guard = TEST_ENV_LOCK
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    // Snapshot the current values so they can be put back before asserting.
    let mut saved = Vec::with_capacity(HOST_TOOL_LINKER_ENV.len());
    for name in HOST_TOOL_LINKER_ENV {
        saved.push((*name, std::env::var_os(name)));
    }

    // SAFETY: `_guard` holds `TEST_ENV_LOCK` for the rest of this test.
    // NOTE(review): assumes every env-mutating test takes the same lock.
    unsafe {
        for name in HOST_TOOL_LINKER_ENV {
            std::env::remove_var(name);
        }
        std::env::set_var("LD_LIBRARY_PATH", "/nix/store/z3 lib:/opt/lib");
    }

    // Evaluate every expectation up front so the environment is restored
    // before any assertion gets a chance to panic.
    let rendered = proxy_command_with_preserved_environment(String::from("openshell ssh-proxy"));
    let carries_assignment = rendered.contains("LD_LIBRARY_PATH='/nix/store/z3 lib:/opt/lib'");
    let starts_with_env = rendered.starts_with("env ");
    let ends_with_command = rendered.ends_with(" openshell ssh-proxy");

    // SAFETY: same lock as above is still held.
    unsafe {
        for (name, previous) in saved {
            if let Some(previous) = previous {
                std::env::set_var(name, previous);
            } else {
                std::env::remove_var(name);
            }
        }
    }

    assert!(carries_assignment, "unexpected proxy command: {rendered}");
    assert!(starts_with_env, "unexpected proxy command: {rendered}");
    assert!(ends_with_command, "unexpected proxy command: {rendered}");
}

#[test]
#[allow(unsafe_code)] // Test-only: env vars require unsafe in Rust 2024.
fn proxy_command_is_unchanged_without_linker_environment() {
    let _guard = TEST_ENV_LOCK
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    // Snapshot the current values so they can be put back before asserting.
    let mut saved = Vec::with_capacity(HOST_TOOL_LINKER_ENV.len());
    for name in HOST_TOOL_LINKER_ENV {
        saved.push((*name, std::env::var_os(name)));
    }

    // SAFETY: `_guard` holds `TEST_ENV_LOCK` for the rest of this test.
    // NOTE(review): assumes every env-mutating test takes the same lock.
    unsafe {
        for name in HOST_TOOL_LINKER_ENV {
            std::env::remove_var(name);
        }
    }

    let rendered = proxy_command_with_preserved_environment(String::from("openshell ssh-proxy"));

    // SAFETY: same lock as above is still held.
    unsafe {
        for (name, previous) in saved {
            if let Some(previous) = previous {
                std::env::set_var(name, previous);
            } else {
                std::env::remove_var(name);
            }
        }
    }

    assert_eq!(rendered, "openshell ssh-proxy");
}

#[test]
fn upsert_host_block_appends_when_missing() {
let input = "Host existing\n HostName example.com\n";
Expand Down
9 changes: 5 additions & 4 deletions crates/openshell-driver-docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,11 @@ The Docker driver bind-mounts a host-side Linux `openshell-sandbox` binary into
each sandbox container. Resolution order is:

1. `supervisor_bin` in `[openshell.drivers.docker]`.
2. A sibling `openshell-sandbox` next to the running `openshell-gateway` binary.
3. A local Linux cargo target build for the Docker daemon architecture.
4. `supervisor_image` in `[openshell.drivers.docker]`, or the
release-matched default supervisor image, extracting `/openshell-sandbox`.
2. `supervisor_image` in `[openshell.drivers.docker]`, extracting
`/openshell-sandbox` from that image.
3. A sibling `openshell-sandbox` next to the running `openshell-gateway` binary.
4. A local Linux cargo target build for the Docker daemon architecture.
5. The release-matched default supervisor image, extracting `/openshell-sandbox`.

Release and Docker-image gateway builds bake the matching supervisor image tag
into the binary at compile time. The default Docker supervisor image is not
Expand Down
28 changes: 16 additions & 12 deletions crates/openshell-driver-docker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ const DOCKER_NETWORK_DRIVER: &str = "bridge";

/// Default image holding the Linux `openshell-sandbox` binary. The gateway
/// pulls this image and extracts the binary to a host-side cache when no
/// explicit `supervisor_bin` override or local build is available.
/// explicit `supervisor_bin`, configured `supervisor_image`, sibling binary,
/// or local build is available.
const DEFAULT_DOCKER_SUPERVISOR_IMAGE_REPO: &str = "ghcr.io/nvidia/openshell/supervisor";

/// Return the default `ghcr.io/nvidia/openshell/supervisor:<tag>` reference
Expand Down Expand Up @@ -2960,7 +2961,14 @@ pub(crate) async fn resolve_supervisor_bin(
return Ok(path);
}

// Tier 2: sibling `openshell-sandbox` next to the running gateway
// Tier 2: explicit supervisor_image in [openshell.drivers.docker].
// A configured image should be the source of truth even when a local
// developer build is present under target/.
if let Some(image) = docker_config.supervisor_image.clone() {
return extract_supervisor_bin_from_image(docker, &image).await;
}

// Tier 3: sibling `openshell-sandbox` next to the running gateway
// (release artifact layout). Linux-only because the sibling must be a
// Linux ELF to bind-mount into a Linux container.
if cfg!(target_os = "linux") {
Expand All @@ -2977,9 +2985,9 @@ pub(crate) async fn resolve_supervisor_bin(
}
}

// Tier 3: local cargo target build (developer workflow). Preferred
// over a registry pull when available because it matches whatever the
// developer just built.
// Tier 4: local cargo target build (developer workflow). Preferred
// over the default registry image when available because it matches
// whatever the developer just built.
let target_candidates = linux_supervisor_candidates(daemon_arch);
for candidate in &target_candidates {
if candidate.is_file() {
Expand All @@ -2990,13 +2998,9 @@ pub(crate) async fn resolve_supervisor_bin(
}
}

// Tier 4: pull the supervisor image from a registry and extract the
// binary to a host-side cache keyed by image content digest. This is
// the default path for released gateway binaries.
let image = docker_config
.supervisor_image
.clone()
.unwrap_or_else(default_docker_supervisor_image);
// Tier 5: pull the release-matched default supervisor image and extract
// the binary to a host-side cache keyed by image content digest.
let image = default_docker_supervisor_image();
extract_supervisor_bin_from_image(docker, &image).await
}

Expand Down
1 change: 1 addition & 0 deletions docs/reference/gateway-config.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ sandbox_namespace = "docker-dev"
grpc_endpoint = "https://host.openshell.internal:17670"
# Skip the image-pull-and-extract step by pointing at a locally built binary.
supervisor_bin = "/usr/local/libexec/openshell/openshell-sandbox"
# When supervisor_bin is omitted, the Docker driver extracts /openshell-sandbox from this image.
supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest"
guest_tls_ca = "/etc/openshell/certs/ca.pem"
guest_tls_cert = "/etc/openshell/certs/client.pem"
Expand Down
77 changes: 59 additions & 18 deletions e2e/gpu/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

# GPU workload images

This directory defines workload test images for OpenShell GPU validation.
This directory defines workload test images currently used by the OpenShell GPU
e2e suite.

## Contract

Expand All @@ -22,11 +23,10 @@ Each workload image must:
command explicitly.

OpenShell sandbox creation replaces the image entrypoint with the supervisor and
does not run the OCI image `CMD`. When these images are used through OpenShell,
the workload command from each manifest entry must be passed explicitly.
does not run the OCI image `CMD`. E2e tests that use these images through
OpenShell run the command from each manifest entry explicitly.

The image build task writes a local workload manifest. Each workload entry
carries:
The test harness is manifest-driven. Each workload entry carries:

- `name`
- `image`
Expand Down Expand Up @@ -61,18 +61,17 @@ The build task uses `tasks/scripts/container-engine.sh`. Set
`CONTAINER_ENGINE=docker` or `CONTAINER_ENGINE=podman` to choose an engine
explicitly. When unset, the helper uses its existing auto-detection behavior.

Local tags use the current commit short SHA plus a short fingerprint of the
external build inputs. Dirty local trees append `-dirty`. Set
`OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=<tag>` to override the tag.
Local tags use a short SHA-256 fingerprint of the selected workload contexts
and external build inputs. Set `OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=<tag>` to
override the tag.

The task writes the latest build refs to:

```text
e2e/gpu/images/.build/latest.env
```

The task also writes a local workload manifest for downstream tooling and
future workload-runner integration:
The task also writes the local workload manifest used by the Rust e2e runner:

```text
e2e/gpu/images/.build/workloads.yaml
Expand All @@ -90,8 +89,7 @@ source e2e/gpu/images/.build/latest.env
```

That env file exports `OPENSHELL_E2E_WORKLOAD_MANIFEST` pointing at the local
manifest. The current checked-in Rust GPU e2e target does not consume this
manifest yet. The per-image refs remain available as a convenience for direct
manifest. The per-image refs remain available as a convenience for direct
container-engine validation.

## Direct Validation
Expand Down Expand Up @@ -124,14 +122,57 @@ where Podman CDI is configured.
Direct container-engine validation catches image, CDI, CUDA, and host GPU setup
issues before OpenShell sandbox behavior is involved.

## OpenShell GPU E2E
## Manifest-Driven Validation

The current Rust GPU validation target is:
The Rust GPU validation target is:

```shell
mise run e2e:gpu
cargo test --manifest-path e2e/rust/Cargo.toml --features e2e-docker-gpu --test gpu -- --nocapture
```

That target runs `gpu_device_selection`. It validates GPU request and device
selection behavior against a Docker-backed gateway. It does not run the
workload manifest generated by `mise run e2e:workloads:build`.
The workload validation path reads:

```text
OPENSHELL_E2E_WORKLOAD_MANIFEST
```

When that variable is unset, the runner uses the default local manifest path:

```text
e2e/gpu/images/.build/workloads.yaml
```

If neither path exists, the workload validation test prints a clear skip
message telling you to run:

```shell
mise run e2e:workloads:build
```

or to set `OPENSHELL_E2E_WORKLOAD_MANIFEST` to an external manifest.

Each manifest entry supplies the sandbox image and command. OpenShell runs that
command through `openshell sandbox create --gpu --from <image> -- <command>`.
The test runner iterates over all GPU-tagged workload entries and enforces each
entry's declared expectation:

- `expect: pass` requires `OPENSHELL_GPU_WORKLOAD_SUCCESS`
- `expect: fail` requires `OPENSHELL_GPU_WORKLOAD_FAILURE`

The current local manifest includes three workloads:

- `smoke-pass` expected to pass
- `smoke-fail` expected to fail
- `cuda-basic` expected to pass

## External Manifests

External workload catalogs can use the same schema. Point the runner at one
with:

```shell
export OPENSHELL_E2E_WORKLOAD_MANIFEST=/abs/path/to/workloads.yaml
```

That lets alternate workload manifests use the same test runner without
introducing per-workload env vars.
Loading
Loading