diff --git a/crates/openshell-bootstrap/src/constants.rs b/crates/openshell-bootstrap/src/constants.rs index ff283b3e..74e381fd 100644 --- a/crates/openshell-bootstrap/src/constants.rs +++ b/crates/openshell-bootstrap/src/constants.rs @@ -11,6 +11,8 @@ pub const SERVER_TLS_SECRET_NAME: &str = "openshell-server-tls"; pub const SERVER_CLIENT_CA_SECRET_NAME: &str = "openshell-server-client-ca"; /// K8s secret holding the client TLS certificate, key, and CA cert (shared by CLI and sandboxes). pub const CLIENT_TLS_SECRET_NAME: &str = "openshell-client-tls"; +/// K8s secret holding the SSH handshake HMAC secret (shared by gateway and sandbox pods). +pub const SSH_HANDSHAKE_SECRET_NAME: &str = "openshell-ssh-handshake"; pub fn container_name(name: &str) -> String { format!("openshell-cluster-{name}") diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index 9c365bfe..ae88c49b 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -8,8 +8,9 @@ use bollard::API_DEFAULT_VERSION; use bollard::Docker; use bollard::errors::Error as BollardError; use bollard::models::{ - ContainerCreateBody, DeviceRequest, HostConfig, HostConfigCgroupnsModeEnum, - NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, VolumeCreateRequest, + ContainerCreateBody, DeviceRequest, EndpointSettings, HostConfig, HostConfigCgroupnsModeEnum, + NetworkConnectRequest, NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, + RestartPolicy, RestartPolicyNameEnum, VolumeCreateRequest, }; use bollard::query_parameters::{ CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions, @@ -482,6 +483,17 @@ pub async fn ensure_container( }; if image_matches { + // The container exists with the correct image, but its network + // attachment may be stale. When the gateway is resumed after a + // container kill, `ensure_network` destroys and recreates the + // Docker network (giving it a new ID). The stopped container + // still references the old network ID, so `docker start` would + // fail with "network not found". + // + // Fix: disconnect from any existing networks and reconnect to + // the current (just-created) network before returning. + let expected_net = network_name(name); + reconcile_container_network(docker, &container_name, &expected_net).await?; return Ok(()); } @@ -532,6 +544,12 @@ pub async fn ensure_container( port_bindings: Some(port_bindings), binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]), network_mode: Some(network_name(name)), + // Automatically restart the container when Docker restarts, unless the + // user explicitly stopped it with `gateway stop`. + restart_policy: Some(RestartPolicy { + name: Some(RestartPolicyNameEnum::UNLESS_STOPPED), + maximum_retry_count: None, + }), // Add host gateway aliases for DNS resolution. // This allows both the entrypoint script and the running gateway // process to reach services on the Docker host. @@ -919,6 +937,48 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<() Ok(()) } +/// Clean up the gateway container and network, preserving the persistent volume. +/// +/// Used when a resume attempt fails — we want to remove the container we may +/// have just created but keep the volume so the user can retry without losing +/// their k3s/etcd state and sandbox data. +pub async fn cleanup_gateway_container(docker: &Docker, name: &str) -> Result<()> { + let container_name = container_name(name); + let net_name = network_name(name); + + // Disconnect container from network + let _ = docker + .disconnect_network( + &net_name, + NetworkDisconnectRequest { + container: container_name.clone(), + force: Some(true), + }, + ) + .await; + + let _ = stop_container(docker, &container_name).await; + + let remove_container = docker + .remove_container( + &container_name, + Some(RemoveContainerOptions { + force: true, + ..Default::default() + }), + ) + .await; + if let Err(err) = remove_container + && !is_not_found(&err) + { + return Err(err).into_diagnostic(); + } + + force_remove_network(docker, &net_name).await?; + + Ok(()) +} + /// Forcefully remove a Docker network, disconnecting any remaining /// containers first. This ensures that stale Docker network endpoints /// cannot prevent port bindings from being released. @@ -956,6 +1016,71 @@ async fn force_remove_network(docker: &Docker, net_name: &str) -> Result<()> { } } +/// Ensure a stopped container is connected to the expected Docker network. +/// +/// When a gateway is resumed after the container was killed (but not removed), +/// `ensure_network` destroys and recreates the network with a new ID. The +/// stopped container still holds a reference to the old network ID in its +/// config, so `docker start` would fail with a 404 "network not found" error. +/// +/// This function disconnects the container from any networks that no longer +/// match the expected network name and connects it to the correct one. +async fn reconcile_container_network( + docker: &Docker, + container_name: &str, + expected_network: &str, +) -> Result<()> { + let info = docker + .inspect_container(container_name, None::) + .await + .into_diagnostic() + .wrap_err("failed to inspect container for network reconciliation")?; + + // Check the container's current network attachments via NetworkSettings. + let attached_networks: Vec = info + .network_settings + .as_ref() + .and_then(|ns| ns.networks.as_ref()) + .map(|nets| nets.keys().cloned().collect()) + .unwrap_or_default(); + + // If the container is already attached to the expected network (by name), + // Docker will resolve the name to the current network ID on start. + // However, when the network was destroyed and recreated, the container's + // stored endpoint references the old ID. Disconnect and reconnect to + // pick up the new network ID. + for net_name in &attached_networks { + let _ = docker + .disconnect_network( + net_name, + NetworkDisconnectRequest { + container: container_name.to_string(), + force: Some(true), + }, + ) + .await; + } + + // Connect to the (freshly created) expected network. + docker + .connect_network( + expected_network, + NetworkConnectRequest { + container: container_name.to_string(), + endpoint_config: Some(EndpointSettings::default()), + }, + ) + .await + .into_diagnostic() + .wrap_err("failed to connect container to gateway network")?; + + tracing::debug!( + "Reconciled network for container {container_name}: disconnected from {attached_networks:?}, connected to {expected_network}" + ); + + Ok(()) +} + fn is_not_found(err: &BollardError) -> bool { matches!( err, diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 9098fd4a..22e417fa 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -26,12 +26,13 @@ use miette::{IntoDiagnostic, Result}; use std::sync::{Arc, Mutex}; use crate::constants::{ - CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME, network_name, - volume_name, + CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME, + SSH_HANDSHAKE_SECRET_NAME, network_name, volume_name, }; use crate::docker::{ - check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container, - ensure_image, ensure_network, ensure_volume, start_container, stop_container, + check_existing_gateway, check_port_conflicts, cleanup_gateway_container, + destroy_gateway_resources, ensure_container, ensure_image, ensure_network, ensure_volume, + start_container, stop_container, }; use crate::metadata::{ create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host, @@ -288,19 +289,22 @@ where (preflight.docker, None) }; - // If an existing gateway is found, either tear it down (when recreate is - // requested) or bail out so the caller can prompt the user / reuse it. + // If an existing gateway is found, decide how to proceed: + // - recreate: destroy everything and start fresh + // - otherwise: auto-resume from existing state (the ensure_* calls are + // idempotent and will reuse the volume, create a container if needed, + // and start it) + let mut resume = false; if let Some(existing) = check_existing_gateway(&target_docker, &name).await? { if recreate { log("[status] Removing existing gateway".to_string()); destroy_gateway_resources(&target_docker, &name).await?; + } else if existing.container_running { + log("[status] Gateway is already running".to_string()); + resume = true; } else { - return Err(miette::miette!( - "Gateway '{name}' already exists (container_running={}).\n\ - Use --recreate to destroy and redeploy, or destroy it first with:\n\n \ - openshell gateway destroy --name {name}", - existing.container_running, - )); + log("[status] Resuming gateway from existing state".to_string()); + resume = true; } } @@ -455,6 +459,11 @@ where store_pki_bundle(&name, &pki_bundle)?; + // Reconcile SSH handshake secret: reuse existing K8s secret if present, + // generate and persist a new one otherwise. This secret is stored in etcd + // (on the persistent volume) so it survives container restarts. + reconcile_ssh_handshake_secret(&target_docker, &name, &log).await?; + // Push locally-built component images into the k3s containerd runtime. // This is the "push" path for local development — images are exported from // the local Docker daemon and streamed into the cluster's containerd so @@ -524,15 +533,30 @@ where docker: target_docker, }), Err(deploy_err) => { - // Automatically clean up Docker resources (volume, container, network, - // image) so the environment is left in a retryable state. - tracing::info!("deploy failed, cleaning up gateway resources for '{name}'"); - if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await { - tracing::warn!( - "automatic cleanup after failed deploy also failed: {cleanup_err}. \ - Manual cleanup may be required: \ - openshell gateway destroy --name {name}" + if resume { + // When resuming, preserve the volume so the user can retry. + // Only clean up the container and network that we may have created. + tracing::info!( + "resume failed, cleaning up container for '{name}' (preserving volume)" ); + if let Err(cleanup_err) = cleanup_gateway_container(&target_docker, &name).await { + tracing::warn!( + "automatic cleanup after failed resume also failed: {cleanup_err}. \ + Manual cleanup may be required: \ + openshell gateway destroy --name {name}" + ); + } + } else { + // Automatically clean up Docker resources (volume, container, network, + // image) so the environment is left in a retryable state. + tracing::info!("deploy failed, cleaning up gateway resources for '{name}'"); + if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await { + tracing::warn!( + "automatic cleanup after failed deploy also failed: {cleanup_err}. \ + Manual cleanup may be required: \ + openshell gateway destroy --name {name}" + ); + } } Err(deploy_err) } @@ -809,6 +833,14 @@ where let cname = container_name(name); let kubeconfig = constants::KUBECONFIG_PATH; + // Wait for the k3s API server and openshell namespace before attempting + // to read secrets. Without this, kubectl fails transiently on resume + // (k3s hasn't booted yet), the code assumes secrets are gone, and + // regenerates PKI unnecessarily — triggering a server rollout restart + // and TLS errors for in-flight connections. + log("[progress] Waiting for openshell namespace".to_string()); + wait_for_namespace(docker, &cname, kubeconfig, "openshell").await?; + // Try to load existing secrets. match load_existing_pki_bundle(docker, &cname, kubeconfig).await { Ok(bundle) => { @@ -823,10 +855,6 @@ where } // Generate fresh PKI and apply to cluster. - // Namespace may still be creating on first bootstrap, so wait here only - // when rotation is actually needed. - log("[progress] Waiting for openshell namespace".to_string()); - wait_for_namespace(docker, &cname, kubeconfig, "openshell").await?; log("[progress] Generating TLS certificates".to_string()); let bundle = generate_pki(extra_sans)?; log("[progress] Applying TLS secrets to gateway".to_string()); @@ -837,6 +865,72 @@ where Ok((bundle, true)) } +/// Reconcile the SSH handshake HMAC secret as a Kubernetes Secret. +/// +/// If the secret already exists in the cluster, this is a no-op. Otherwise a +/// fresh 32-byte hex secret is generated and applied. Because the secret lives +/// in etcd (backed by the persistent Docker volume), it survives container +/// restarts without regeneration — existing sandbox SSH sessions remain valid. +async fn reconcile_ssh_handshake_secret(docker: &Docker, name: &str, log: &F) -> Result<()> +where + F: Fn(String) + Sync, +{ + use miette::WrapErr; + + let cname = container_name(name); + let kubeconfig = constants::KUBECONFIG_PATH; + + // Check if the secret already exists. + let (output, exit_code) = exec_capture_with_exit( + docker, + &cname, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "KUBECONFIG={kubeconfig} kubectl -n openshell get secret {SSH_HANDSHAKE_SECRET_NAME} -o jsonpath='{{.data.secret}}' 2>/dev/null" + ), + ], + ) + .await?; + + if exit_code == 0 && !output.trim().is_empty() { + tracing::debug!( + "existing SSH handshake secret found ({} bytes encoded)", + output.trim().len() + ); + log("[progress] Reusing existing SSH handshake secret".to_string()); + return Ok(()); + } + + // Generate a new 32-byte hex secret and create the K8s secret. + log("[progress] Generating SSH handshake secret".to_string()); + let (output, exit_code) = exec_capture_with_exit( + docker, + &cname, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "SECRET=$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \\n') && \ + KUBECONFIG={kubeconfig} kubectl -n openshell create secret generic {SSH_HANDSHAKE_SECRET_NAME} \ + --from-literal=secret=$SECRET --dry-run=client -o yaml | \ + KUBECONFIG={kubeconfig} kubectl apply -f -" + ), + ], + ) + .await?; + + if exit_code != 0 { + return Err(miette::miette!( + "failed to create SSH handshake secret (exit {exit_code}): {output}" + )) + .wrap_err("failed to apply SSH handshake secret"); + } + + Ok(()) +} + /// Load existing TLS secrets from the cluster and reconstruct a [`PkiBundle`]. /// /// Returns an error string describing why secrets couldn't be loaded (for logging). diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index e976061f..6a6e4d25 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -144,43 +144,58 @@ pub async fn run_bootstrap( ); eprintln!(); - // Auto-bootstrap always recreates if stale Docker resources are found - // (e.g. metadata was deleted but container/volume still exist). - let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name).with_recreate(true); - if let Some(dest) = remote { - let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest); - if let Some(key) = ssh_key { - remote_opts = remote_opts.with_ssh_key(key); + // Build deploy options. The deploy flow auto-resumes from existing state + // (preserving sandboxes and secrets) when it finds an existing gateway. + // If the initial attempt fails, fall back to a full recreate. + let build_options = |recreate: bool| { + let mut opts = openshell_bootstrap::DeployOptions::new(&gateway_name) + .with_recreate(recreate) + .with_gpu(gpu); + if let Some(dest) = remote { + let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest); + if let Some(key) = ssh_key { + remote_opts = remote_opts.with_ssh_key(key); + } + opts = opts.with_remote(remote_opts); } - options = options.with_remote(remote_opts); - } - // Read registry credentials from environment for the auto-bootstrap path. - // The explicit `--registry-username` / `--registry-token` flags are only - // on `gateway start`; when bootstrapping via `sandbox create`, the env - // vars are the mechanism. - if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME") - && !username.trim().is_empty() - { - options = options.with_registry_username(username); - } - if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN") - && !token.trim().is_empty() - { - options = options.with_registry_token(token); - } - // Read gateway host override from environment. Needed whenever the - // client cannot reach the Docker host at 127.0.0.1 — CI containers, - // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag - // is only on `gateway start`; this env var covers the auto-bootstrap - // path triggered by `sandbox create`. - if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST") - && !host.trim().is_empty() - { - options = options.with_gateway_host(host); - } - options = options.with_gpu(gpu); + // Read registry credentials from environment for the auto-bootstrap path. + // The explicit `--registry-username` / `--registry-token` flags are only + // on `gateway start`; when bootstrapping via `sandbox create`, the env + // vars are the mechanism. + if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME") + && !username.trim().is_empty() + { + opts = opts.with_registry_username(username); + } + if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN") + && !token.trim().is_empty() + { + opts = opts.with_registry_token(token); + } + // Read gateway host override from environment. Needed whenever the + // client cannot reach the Docker host at 127.0.0.1 — CI containers, + // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag + // is only on `gateway start`; this env var covers the auto-bootstrap + // path triggered by `sandbox create`. + if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST") + && !host.trim().is_empty() + { + opts = opts.with_gateway_host(host); + } + opts + }; - let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?; + // Deploy the gateway. The deploy flow auto-resumes from existing state + // when it finds one. If that fails, fall back to a full recreate. + let handle = match deploy_gateway_with_panel(build_options(false), &gateway_name, location) + .await + { + Ok(handle) => handle, + Err(resume_err) => { + tracing::warn!("auto-bootstrap resume failed, falling back to recreate: {resume_err}"); + deploy_gateway_with_panel(build_options(true), &gateway_name, location).await? + } + }; let server = handle.gateway_endpoint().to_string(); print_deploy_summary(&gateway_name, &handle); @@ -207,7 +222,7 @@ pub async fn run_bootstrap( /// Retry connecting to the gateway gRPC endpoint until it succeeds or a /// timeout is reached. Uses exponential backoff starting at 500 ms, doubling /// up to 4 s, with a total deadline of 30 s. -async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> { +pub(crate) async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> { const MAX_WAIT: Duration = Duration::from_secs(30); const INITIAL_BACKOFF: Duration = Duration::from_millis(500); diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 2f9dd2f7..366cb691 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1367,47 +1367,16 @@ pub async fn gateway_admin_deploy( opts }); - // Check whether a gateway already exists. If so, prompt the user (unless - // --recreate was passed or we're in non-interactive mode). - let mut should_recreate = recreate; - if let Some(existing) = - openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? - { - if !should_recreate { - let interactive = std::io::stdin().is_terminal() && std::io::stderr().is_terminal(); - if interactive { - let status = if existing.container_running { - "running" - } else if existing.container_exists { - "stopped" - } else { - "volume only" - }; - eprintln!(); + // If the gateway is already running and we're not recreating, short-circuit. + if !recreate { + if let Some(existing) = + openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? + { + if existing.container_running { eprintln!( - "{} Gateway '{name}' already exists ({status}).", - "!".yellow().bold() + "{} Gateway '{name}' is already running.", + "✓".green().bold() ); - if let Some(image) = &existing.container_image { - eprintln!(" {} {}", "Image:".dimmed(), image); - } - eprintln!(); - eprint!("Destroy and recreate? [y/N] "); - std::io::stderr().flush().ok(); - let mut input = String::new(); - std::io::stdin() - .read_line(&mut input) - .into_diagnostic() - .wrap_err("failed to read user input")?; - let choice = input.trim().to_lowercase(); - should_recreate = choice == "y" || choice == "yes"; - if !should_recreate { - eprintln!("Keeping existing gateway."); - return Ok(()); - } - } else { - // Non-interactive mode: reuse existing gateway silently. - eprintln!("Gateway '{name}' already exists, reusing."); return Ok(()); } } @@ -1418,7 +1387,7 @@ pub async fn gateway_admin_deploy( .with_disable_tls(disable_tls) .with_disable_gateway_auth(disable_gateway_auth) .with_gpu(gpu) - .with_recreate(should_recreate); + .with_recreate(recreate); if let Some(opts) = remote_opts { options = options.with_remote(opts); } @@ -1434,6 +1403,15 @@ pub async fn gateway_admin_deploy( let handle = deploy_gateway_with_panel(options, name, location).await?; + // Wait for the gRPC endpoint to actually accept connections before + // declaring the gateway ready. The Docker health check may pass before + // the gRPC listener inside the pod is fully bound. + let server = handle.gateway_endpoint().to_string(); + let tls = TlsOptions::default() + .with_gateway_name(name) + .with_default_paths(&server); + crate::bootstrap::wait_for_grpc_ready(&server, &tls).await?; + print_deploy_summary(name, &handle); // Auto-activate: set this gateway as the active gateway. diff --git a/crates/openshell-cli/src/ssh.rs b/crates/openshell-cli/src/ssh.rs index 4b284bff..44b48da7 100644 --- a/crates/openshell-cli/src/ssh.rs +++ b/crates/openshell-cli/src/ssh.rs @@ -693,27 +693,50 @@ pub async fn sandbox_ssh_proxy( .ok_or_else(|| miette::miette!("gateway URL missing port"))?; let connect_path = url.path(); - let mut stream: Box = - connect_gateway(scheme, gateway_host, gateway_port, tls).await?; - let request = format!( "CONNECT {connect_path} HTTP/1.1\r\nHost: {gateway_host}\r\nX-Sandbox-Id: {sandbox_id}\r\nX-Sandbox-Token: {token}\r\n\r\n" ); - stream - .write_all(request.as_bytes()) - .await - .into_diagnostic()?; - // Wrap in a BufReader **before** reading the HTTP response. The gateway - // may send the 200 OK response and the first SSH protocol bytes in the - // same TCP segment / WebSocket frame. A plain `read()` would consume - // those SSH bytes into our buffer and discard them, causing SSH to see a - // truncated protocol banner and exit with code 255. BufReader ensures - // any bytes read past the `\r\n\r\n` header boundary stay buffered and - // are returned by subsequent reads during the bidirectional copy phase. - let mut buf_stream = BufReader::new(stream); - let status = read_connect_status(&mut buf_stream).await?; - if status != 200 { + // The gateway returns 412 (Precondition Failed) when the sandbox pod + // exists but hasn't reached Ready phase yet. This is a transient state + // after sandbox allocation — retry with backoff instead of failing + // immediately. + const MAX_CONNECT_WAIT: Duration = Duration::from_secs(60); + const INITIAL_BACKOFF: Duration = Duration::from_secs(1); + + let start = std::time::Instant::now(); + let mut backoff = INITIAL_BACKOFF; + let mut buf_stream; + + loop { + let mut stream: Box = + connect_gateway(scheme, gateway_host, gateway_port, tls).await?; + stream + .write_all(request.as_bytes()) + .await + .into_diagnostic()?; + + // Wrap in a BufReader **before** reading the HTTP response. The gateway + // may send the 200 OK response and the first SSH protocol bytes in the + // same TCP segment / WebSocket frame. A plain `read()` would consume + // those SSH bytes into our buffer and discard them, causing SSH to see a + // truncated protocol banner and exit with code 255. BufReader ensures + // any bytes read past the `\r\n\r\n` header boundary stay buffered and + // are returned by subsequent reads during the bidirectional copy phase. + buf_stream = BufReader::new(stream); + let status = read_connect_status(&mut buf_stream).await?; + if status == 200 { + break; + } + if status == 412 && start.elapsed() < MAX_CONNECT_WAIT { + tracing::debug!( + elapsed = ?start.elapsed(), + "sandbox not yet ready (HTTP 412), retrying in {backoff:?}" + ); + tokio::time::sleep(backoff).await; + backoff = (backoff * 2).min(Duration::from_secs(8)); + continue; + } return Err(miette::miette!( "gateway CONNECT failed with status {status}" )); diff --git a/crates/openshell-sandbox/src/secrets.rs b/crates/openshell-sandbox/src/secrets.rs index 4ee1ee84..002eab55 100644 --- a/crates/openshell-sandbox/src/secrets.rs +++ b/crates/openshell-sandbox/src/secrets.rs @@ -6,7 +6,7 @@ use std::collections::HashMap; const PLACEHOLDER_PREFIX: &str = "openshell:resolve:env:"; #[derive(Debug, Clone, Default)] -pub(crate) struct SecretResolver { +pub struct SecretResolver { by_placeholder: HashMap, } diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs index bb943c12..754a9d95 100644 --- a/crates/openshell-server/src/sandbox/mod.rs +++ b/crates/openshell-server/src/sandbox/mod.rs @@ -1011,7 +1011,7 @@ fn inject_pod_template( mut pod_template: serde_json::Value, template: &SandboxTemplate, gpu: bool, - default_image: &str, + _default_image: &str, image_pull_policy: &str, sandbox_id: &str, sandbox_name: &str, diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 84b8cf9a..3b069b7b 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -402,10 +402,10 @@ if [ -n "${IMAGE_PULL_POLICY:-}" ] && [ -f "$HELMCHART" ]; then sed -i "s|pullPolicy: Always|pullPolicy: ${IMAGE_PULL_POLICY}|" "$HELMCHART" fi -# Generate a random SSH handshake secret for the NSSH1 HMAC handshake between -# the gateway and sandbox SSH servers. This is required — the server will refuse -# to start without it. -SSH_HANDSHAKE_SECRET="${SSH_HANDSHAKE_SECRET:-$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \n')}" +# SSH handshake secret: previously generated here and injected via sed into the +# HelmChart CR. Now persisted as a Kubernetes Secret (openshell-ssh-handshake) +# created by the bootstrap process after k3s starts. This ensures the secret +# survives container restarts without regeneration. # Inject SSH gateway host/port into the HelmChart manifest so the openshell # server returns the correct address to CLI clients for SSH proxy CONNECT. @@ -424,9 +424,6 @@ if [ -f "$HELMCHART" ]; then # Clear the placeholder so the default (8080) is used sed -i "s|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g" "$HELMCHART" fi - echo "Setting SSH handshake secret" - sed -i "s|__SSH_HANDSHAKE_SECRET__|${SSH_HANDSHAKE_SECRET}|g" "$HELMCHART" - # Disable gateway auth: when set, the server accepts connections without # client certificates (for reverse-proxy / Cloudflare Tunnel deployments). if [ "${DISABLE_GATEWAY_AUTH:-}" = "true" ]; then diff --git a/deploy/docker/cluster-healthcheck.sh b/deploy/docker/cluster-healthcheck.sh index 68210b45..1bf76f71 100644 --- a/deploy/docker/cluster-healthcheck.sh +++ b/deploy/docker/cluster-healthcheck.sh @@ -68,3 +68,6 @@ if [ "${DISABLE_TLS:-}" != "true" ]; then kubectl -n openshell get secret openshell-server-tls >/dev/null 2>&1 || exit 1 kubectl -n openshell get secret openshell-client-tls >/dev/null 2>&1 || exit 1 fi + +# Verify SSH handshake secret exists (created by openshell-bootstrap alongside TLS secrets) +kubectl -n openshell get secret openshell-ssh-handshake >/dev/null 2>&1 || exit 1 diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 1be8f14a..ed503a78 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -77,7 +77,10 @@ spec: value: {{ .Values.server.hostGatewayIP | quote }} {{- end }} - name: OPENSHELL_SSH_HANDSHAKE_SECRET - value: {{ required "server.sshHandshakeSecret is required" .Values.server.sshHandshakeSecret | quote }} + valueFrom: + secretKeyRef: + name: {{ .Values.server.sshHandshakeSecretName | quote }} + key: secret {{- if .Values.server.disableTls }} - name: OPENSHELL_DISABLE_TLS value: "true" diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index ccc8d1ff..d698e812 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -85,10 +85,10 @@ server: sshGatewayPort: 0 # TLS configuration for the server. The server always terminates mTLS # directly and requires client certificates. - # HMAC secret used for the NSSH1 handshake between gateway and sandbox SSH. - # Required — the server will refuse to start if empty. For cluster deployments - # this is auto-generated by the entrypoint script. - sshHandshakeSecret: "" + # Name of the Kubernetes Secret holding the NSSH1 HMAC handshake key. + # The secret must contain a `secret` key with the hex-encoded HMAC key. + # For cluster deployments this is auto-created by the bootstrap process. + sshHandshakeSecretName: "openshell-ssh-handshake" # Host gateway IP for sandbox pod hostAliases. When set, sandbox pods get # hostAliases entries mapping host.docker.internal and host.openshell.internal # to this IP, allowing them to reach services running on the Docker host. diff --git a/deploy/kube/manifests/openshell-helmchart.yaml b/deploy/kube/manifests/openshell-helmchart.yaml index 2245c72e..ae22ddc6 100644 --- a/deploy/kube/manifests/openshell-helmchart.yaml +++ b/deploy/kube/manifests/openshell-helmchart.yaml @@ -32,7 +32,6 @@ spec: sandboxImage: ghcr.io/nvidia/openshell-community/sandboxes/base:latest sshGatewayHost: __SSH_GATEWAY_HOST__ sshGatewayPort: __SSH_GATEWAY_PORT__ - sshHandshakeSecret: __SSH_HANDSHAKE_SECRET__ grpcEndpoint: "https://openshell.openshell.svc.cluster.local:8080" hostGatewayIP: __HOST_GATEWAY_IP__ disableGatewayAuth: __DISABLE_GATEWAY_AUTH__ diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs new file mode 100644 index 00000000..67d28d19 --- /dev/null +++ b/e2e/rust/tests/gateway_resume.rs @@ -0,0 +1,434 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e")] + +//! E2E tests for gateway resume from existing state. +//! +//! These tests verify that `openshell gateway start` resumes from existing +//! Docker volume state (after stop or container removal) and that the SSH +//! handshake secret persists across container restarts. +//! +//! **Requires a running gateway** — the `e2e:rust` mise task bootstraps one. + +use std::process::{Command, Stdio}; +use std::time::Duration; + +use openshell_e2e::harness::binary::openshell_cmd; +use openshell_e2e::harness::output::strip_ansi; +use tokio::time::sleep; + +/// Default gateway name used by the e2e cluster. +const GATEWAY_NAME: &str = "openshell"; + +/// Docker container name for the default gateway. +fn container_name() -> String { + format!("openshell-cluster-{GATEWAY_NAME}") +} + +/// Run `openshell ` and return (combined output, exit code). +async fn run_cli(args: &[&str]) -> (String, i32) { + let mut cmd = openshell_cmd(); + cmd.args(args) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + let output = cmd.output().await.expect("spawn openshell"); + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let combined = format!("{stdout}{stderr}"); + let code = output.status.code().unwrap_or(-1); + (combined, code) +} + +/// Run `docker ` synchronously and return (stdout, exit code). +fn docker_cmd(args: &[&str]) -> (String, i32) { + let output = Command::new("docker") + .args(args) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .expect("spawn docker"); + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let code = output.status.code().unwrap_or(-1); + (stdout, code) +} + +/// Wait for the gateway to become healthy by polling `openshell status`. +async fn wait_for_healthy(timeout: Duration) { + let start = std::time::Instant::now(); + loop { + let (output, code) = run_cli(&["status"]).await; + let clean = strip_ansi(&output).to_lowercase(); + if code == 0 && (clean.contains("healthy") || clean.contains("running") || clean.contains("connected") || clean.contains("✓")) { + return; + } + if start.elapsed() > timeout { + panic!( + "gateway did not become healthy within {}s. Last output:\n{}", + timeout.as_secs(), + strip_ansi(&output) + ); + } + sleep(Duration::from_secs(3)).await; + } +} + +/// Read the SSH handshake secret from the K8s secret inside the cluster. +fn read_ssh_handshake_secret() -> Option { + let cname = container_name(); + let (output, code) = docker_cmd(&[ + "exec", + &cname, + "sh", + "-c", + "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null", + ]); + if code == 0 && !output.trim().is_empty() { + Some(output.trim().to_string()) + } else { + None + } +} + +// ------------------------------------------------------------------- +// Test: `gateway start` on an already-running gateway succeeds +// ------------------------------------------------------------------- + +/// When the gateway is already running, `openshell gateway start` should +/// return immediately with exit code 0 and indicate it's already running. +#[tokio::test] +async fn gateway_start_on_running_gateway_succeeds() { + // Precondition: gateway is running (e2e cluster is up). + wait_for_healthy(Duration::from_secs(30)).await; + + let (output, code) = run_cli(&["gateway", "start"]).await; + let clean = strip_ansi(&output); + + assert_eq!( + code, 0, + "gateway start on running gateway should exit 0:\n{clean}" + ); + assert!( + clean.to_lowercase().contains("already running"), + "output should indicate gateway is already running:\n{clean}" + ); +} + +// ------------------------------------------------------------------- +// Test: gateway stop → start resumes, sandbox survives +// ------------------------------------------------------------------- + +/// After `gateway stop` then `gateway start`, the gateway should resume +/// from existing state. A sandbox created before the stop should still +/// appear in the sandbox list after restart. +#[tokio::test] +async fn gateway_stop_start_resumes_with_sandbox() { + // Precondition: gateway is healthy. + wait_for_healthy(Duration::from_secs(30)).await; + + // Create a sandbox that we'll check for after restart. + let (create_output, create_code) = + run_cli(&["sandbox", "create", "--", "echo", "resume-test"]).await; + let clean_create = strip_ansi(&create_output); + assert_eq!( + create_code, 0, + "sandbox create should succeed:\n{clean_create}" + ); + + // Extract sandbox name from output. + let sandbox_name = clean_create + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from create output"); + + // Stop the gateway. + let (stop_output, stop_code) = run_cli(&["gateway", "stop"]).await; + assert_eq!( + stop_code, 0, + "gateway stop should succeed:\n{}", + strip_ansi(&stop_output) + ); + + // Wait a moment for the container to fully stop. + sleep(Duration::from_secs(3)).await; + + // Verify container is stopped. + let (inspect_out, _) = docker_cmd(&[ + "inspect", + "-f", + "{{.State.Running}}", + &container_name(), + ]); + assert_eq!( + inspect_out.trim(), + "false", + "container should be stopped after gateway stop" + ); + + // Start the gateway again — should resume from existing state. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + let clean_start = strip_ansi(&start_output); + assert_eq!( + start_code, 0, + "gateway start after stop should succeed:\n{clean_start}" + ); + + // Wait for the gateway to become healthy again. + wait_for_healthy(Duration::from_secs(180)).await; + + // Verify the sandbox still exists. + let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean_list = strip_ansi(&list_output); + assert_eq!( + list_code, 0, + "sandbox list should succeed after resume:\n{clean_list}" + ); + assert!( + clean_list.contains(&sandbox_name), + "sandbox '{sandbox_name}' should survive gateway stop/start.\nList output:\n{clean_list}" + ); + + // Cleanup: delete the test sandbox. + let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; +} + +// ------------------------------------------------------------------- +// Test: container removed → gateway start resumes +// ------------------------------------------------------------------- + +/// After the Docker container is force-removed (simulating Docker restart), +/// `openshell gateway start` should resume from the existing volume. +#[tokio::test] +async fn gateway_start_resumes_after_container_removal() { + // Precondition: gateway is healthy. + wait_for_healthy(Duration::from_secs(30)).await; + + // Create a sandbox to verify state persistence. + let (create_output, create_code) = + run_cli(&["sandbox", "create", "--", "echo", "container-rm-test"]).await; + let clean_create = strip_ansi(&create_output); + assert_eq!( + create_code, 0, + "sandbox create should succeed:\n{clean_create}" + ); + + let sandbox_name = clean_create + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from create output"); + + // Force-remove the container (simulates Docker restart / OOM kill). + let (_, rm_code) = docker_cmd(&["rm", "-f", &container_name()]); + assert_eq!(rm_code, 0, "docker rm -f should succeed"); + + // Verify the volume still exists. + let (vol_out, vol_code) = docker_cmd(&[ + "volume", + "inspect", + &format!("openshell-cluster-{GATEWAY_NAME}"), + ]); + assert_eq!( + vol_code, 0, + "volume should still exist after container removal:\n{vol_out}" + ); + + // Start the gateway — should resume from the volume. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + let clean_start = strip_ansi(&start_output); + assert_eq!( + start_code, 0, + "gateway start after container removal should succeed:\n{clean_start}" + ); + + // Wait for healthy. + wait_for_healthy(Duration::from_secs(180)).await; + + // Verify sandbox survived. + let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean_list = strip_ansi(&list_output); + assert_eq!( + list_code, 0, + "sandbox list should succeed after resume:\n{clean_list}" + ); + assert!( + clean_list.contains(&sandbox_name), + "sandbox '{sandbox_name}' should survive container removal + resume.\nList output:\n{clean_list}" + ); + + // Cleanup. + let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; +} + +// ------------------------------------------------------------------- +// Test: container killed → gateway start resumes, sandboxes survive, +// new sandbox create works +// ------------------------------------------------------------------- + +/// When a container is killed (stopped but NOT removed), `gateway start` +/// should resume from existing state. This validates three things: +/// +/// 1. The stale Docker network reference is reconciled (ensure_network +/// destroys and recreates the network with a new ID). +/// 2. Existing sandboxes created before the kill survive the restart. +/// 3. New `sandbox create` works after resume — the TLS certificates +/// are reused (not needlessly regenerated), so the CLI's mTLS certs +/// still match the server. +#[tokio::test] +async fn gateway_start_resumes_after_container_kill() { + // Precondition: gateway is healthy. + wait_for_healthy(Duration::from_secs(30)).await; + + let cname = container_name(); + let net_name = format!("openshell-cluster-{GATEWAY_NAME}"); + + // Create a sandbox before the kill to verify state persistence. + let (create_output, create_code) = + run_cli(&["sandbox", "create", "--", "echo", "kill-resume-test"]).await; + let clean_create = strip_ansi(&create_output); + assert_eq!( + create_code, 0, + "sandbox create should succeed:\n{clean_create}" + ); + + let sandbox_before = clean_create + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from create output"); + + // Kill the container (it remains as a stopped container, unlike `docker rm`). + let (_, kill_code) = docker_cmd(&["kill", &cname]); + assert_eq!(kill_code, 0, "docker kill should succeed"); + + sleep(Duration::from_secs(3)).await; + + // Remove the Docker network to simulate a stale network reference. + // The bootstrap `ensure_network` always destroys and recreates, so + // after this the container's stored network ID will be invalid. + let _ = docker_cmd(&["network", "disconnect", "-f", &net_name, &cname]); + let (_, net_rm_code) = docker_cmd(&["network", "rm", &net_name]); + assert_eq!( + net_rm_code, 0, + "docker network rm should succeed (or network already gone)" + ); + + // Start the gateway — must handle stale network + reuse existing PKI. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + let clean_start = strip_ansi(&start_output); + assert_eq!( + start_code, 0, + "gateway start after kill should succeed:\n{clean_start}" + ); + + // Wait for the gateway to become healthy again. + wait_for_healthy(Duration::from_secs(180)).await; + + // Verify the pre-existing sandbox survived. + let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean_list = strip_ansi(&list_output); + assert_eq!( + list_code, 0, + "sandbox list should succeed after resume:\n{clean_list}" + ); + assert!( + clean_list.contains(&sandbox_before), + "sandbox '{sandbox_before}' should survive container kill + resume.\nList output:\n{clean_list}" + ); + + // Create a new sandbox to verify TLS is working end-to-end. + let (new_create_output, new_create_code) = + run_cli(&["sandbox", "create", "--", "echo", "post-resume-test"]).await; + let clean_new = strip_ansi(&new_create_output); + assert_eq!( + new_create_code, 0, + "sandbox create after resume should succeed (TLS must work):\n{clean_new}" + ); + + let sandbox_after = clean_new + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from post-resume create output"); + + // Cleanup. + let _ = run_cli(&["sandbox", "delete", &sandbox_before]).await; + let _ = run_cli(&["sandbox", "delete", &sandbox_after]).await; +} + +// ------------------------------------------------------------------- +// Test: SSH handshake secret persists across container restart +// ------------------------------------------------------------------- + +/// The SSH handshake K8s secret should persist across gateway stop/start +/// cycles — the same base64-encoded value should be returned before and +/// after the restart. +#[tokio::test] +async fn ssh_handshake_secret_persists_across_restart() { + // Precondition: gateway is healthy. + wait_for_healthy(Duration::from_secs(30)).await; + + // Read the SSH handshake secret before restart. + let secret_before = read_ssh_handshake_secret() + .expect("SSH handshake secret should exist before restart"); + assert!( + !secret_before.is_empty(), + "SSH handshake secret should not be empty" + ); + + // Stop the gateway. + let (_, stop_code) = run_cli(&["gateway", "stop"]).await; + assert_eq!(stop_code, 0, "gateway stop should succeed"); + + sleep(Duration::from_secs(3)).await; + + // Start the gateway. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + assert_eq!( + start_code, 0, + "gateway start should succeed:\n{}", + strip_ansi(&start_output) + ); + + // Wait for healthy. + wait_for_healthy(Duration::from_secs(180)).await; + + // Read the secret after restart. + let secret_after = read_ssh_handshake_secret() + .expect("SSH handshake secret should exist after restart"); + + assert_eq!( + secret_before, secret_after, + "SSH handshake secret should be identical before and after restart" + ); +} diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index 600bdd6c..307e7623 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -408,12 +408,13 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then # terminates mTLS (there is no server.tls.enabled toggle). Without this, # a prior Helm override or chart default change could silently regress # sandbox callbacks to plaintext. - # Retrieve the existing handshake secret from the running release, or generate - # a new one if this is the first deploy with the mandatory secret. - EXISTING_SECRET=$(cluster_exec "helm get values openshell -n openshell -o json 2>/dev/null \ - | grep -o '\"sshHandshakeSecret\":\"[^\"]*\"' \ - | cut -d'\"' -f4") || true - SSH_HANDSHAKE_SECRET="${EXISTING_SECRET:-$(openssl rand -hex 32)}" + # Ensure the SSH handshake K8s secret exists. The bootstrap process normally + # creates it, but fast-deploy may run before bootstrap on a fresh cluster. + EXISTING_SECRET=$(cluster_exec "kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null | base64 -d" 2>/dev/null) || true + if [ -z "${EXISTING_SECRET}" ]; then + SSH_HANDSHAKE_SECRET="$(openssl rand -hex 32)" + cluster_exec "kubectl -n openshell create secret generic openshell-ssh-handshake --from-literal=secret='${SSH_HANDSHAKE_SECRET}' --dry-run=client -o yaml | kubectl apply -f -" + fi # Retrieve the host gateway IP from the entrypoint-rendered HelmChart CR so # that hostAliases for host.openshell.internal are preserved across fast deploys. @@ -433,7 +434,6 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then --set server.tls.certSecretName=openshell-server-tls \ --set server.tls.clientCaSecretName=openshell-server-client-ca \ --set server.tls.clientTlsSecretName=openshell-client-tls \ - --set server.sshHandshakeSecret=${SSH_HANDSHAKE_SECRET} \ ${HOST_GATEWAY_ARGS} \ ${helm_wait_args}" helm_end=$(date +%s)