From 9b0af9c09b91fe0b23009bd0d17e89993476f439 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 19 Mar 2026 14:59:07 -0700 Subject: [PATCH 1/7] feat(bootstrap): resume gateway from existing state and persist SSH handshake secret Add a resume code path to gateway start so existing Docker volume state (k3s, etcd, sandboxes, secrets) is reused instead of requiring a full destroy/recreate cycle. When the container is gone but the volume remains (e.g. Docker restart), the CLI automatically creates a new container with the existing volume and reconciles PKI and secrets. Move the SSH handshake HMAC secret from ephemeral generation in the cluster entrypoint (regenerated on every container start) to a Kubernetes Secret that persists in etcd on the Docker volume. This ensures sandbox SSH sessions survive container restarts. Key changes: - Add DeployOptions.resume flag with resume branch in deploy flow - Add cleanup_gateway_container for volume-preserving failure cleanup - Auto-resume in gateway_admin_deploy (stopped/volume-only states) - Auto-bootstrap tries resume first, falls back to recreate - Add unless-stopped Docker restart policy to gateway container - Reconcile SSH handshake secret as K8s Secret alongside TLS PKI - Update Helm chart to read secret via secretKeyRef - Add SSH handshake secret to cluster health check Closes #487 --- crates/openshell-bootstrap/src/constants.rs | 2 + crates/openshell-bootstrap/src/docker.rs | 51 +++++- crates/openshell-bootstrap/src/lib.rs | 145 ++++++++++++++++-- crates/openshell-cli/src/bootstrap.rs | 89 ++++++----- crates/openshell-cli/src/run.rs | 61 +++----- deploy/docker/cluster-entrypoint.sh | 11 +- deploy/docker/cluster-healthcheck.sh | 3 + .../helm/openshell/templates/statefulset.yaml | 5 +- deploy/helm/openshell/values.yaml | 8 +- .../kube/manifests/openshell-helmchart.yaml | 1 - tasks/scripts/cluster-deploy-fast.sh | 14 +- 11 files changed, 279 insertions(+), 111 deletions(-) diff --git 
a/crates/openshell-bootstrap/src/constants.rs b/crates/openshell-bootstrap/src/constants.rs index ff283b3e..74e381fd 100644 --- a/crates/openshell-bootstrap/src/constants.rs +++ b/crates/openshell-bootstrap/src/constants.rs @@ -11,6 +11,8 @@ pub const SERVER_TLS_SECRET_NAME: &str = "openshell-server-tls"; pub const SERVER_CLIENT_CA_SECRET_NAME: &str = "openshell-server-client-ca"; /// K8s secret holding the client TLS certificate, key, and CA cert (shared by CLI and sandboxes). pub const CLIENT_TLS_SECRET_NAME: &str = "openshell-client-tls"; +/// K8s secret holding the SSH handshake HMAC secret (shared by gateway and sandbox pods). +pub const SSH_HANDSHAKE_SECRET_NAME: &str = "openshell-ssh-handshake"; pub fn container_name(name: &str) -> String { format!("openshell-cluster-{name}") diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index 9c365bfe..72b9542d 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -9,7 +9,8 @@ use bollard::Docker; use bollard::errors::Error as BollardError; use bollard::models::{ ContainerCreateBody, DeviceRequest, HostConfig, HostConfigCgroupnsModeEnum, - NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, VolumeCreateRequest, + NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, RestartPolicy, + RestartPolicyNameEnum, VolumeCreateRequest, }; use bollard::query_parameters::{ CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions, @@ -532,6 +533,12 @@ pub async fn ensure_container( port_bindings: Some(port_bindings), binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]), network_mode: Some(network_name(name)), + // Automatically restart the container when Docker restarts, unless the + // user explicitly stopped it with `gateway stop`. 
+ restart_policy: Some(RestartPolicy { + name: Some(RestartPolicyNameEnum::UNLESS_STOPPED), + maximum_retry_count: None, + }), // Add host gateway aliases for DNS resolution. // This allows both the entrypoint script and the running gateway // process to reach services on the Docker host. @@ -919,6 +926,48 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<() Ok(()) } +/// Clean up the gateway container and network, preserving the persistent volume. +/// +/// Used when a resume attempt fails — we want to remove the container we may +/// have just created but keep the volume so the user can retry without losing +/// their k3s/etcd state and sandbox data. +pub async fn cleanup_gateway_container(docker: &Docker, name: &str) -> Result<()> { + let container_name = container_name(name); + let net_name = network_name(name); + + // Disconnect container from network + let _ = docker + .disconnect_network( + &net_name, + NetworkDisconnectRequest { + container: container_name.clone(), + force: Some(true), + }, + ) + .await; + + let _ = stop_container(docker, &container_name).await; + + let remove_container = docker + .remove_container( + &container_name, + Some(RemoveContainerOptions { + force: true, + ..Default::default() + }), + ) + .await; + if let Err(err) = remove_container + && !is_not_found(&err) + { + return Err(err).into_diagnostic(); + } + + force_remove_network(docker, &net_name).await?; + + Ok(()) +} + /// Forcefully remove a Docker network, disconnecting any remaining /// containers first. This ensures that stale Docker network endpoints /// cannot prevent port bindings from being released. 
diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 9098fd4a..5cee9cee 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -26,12 +26,13 @@ use miette::{IntoDiagnostic, Result}; use std::sync::{Arc, Mutex}; use crate::constants::{ - CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME, network_name, - volume_name, + CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME, + SSH_HANDSHAKE_SECRET_NAME, network_name, volume_name, }; use crate::docker::{ - check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container, - ensure_image, ensure_network, ensure_volume, start_container, stop_container, + check_existing_gateway, check_port_conflicts, cleanup_gateway_container, + destroy_gateway_resources, ensure_container, ensure_image, ensure_network, ensure_volume, + start_container, stop_container, }; use crate::metadata::{ create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host, @@ -119,6 +120,11 @@ pub struct DeployOptions { /// When false, an existing gateway is left as-is and deployment is /// skipped (the caller is responsible for prompting the user first). pub recreate: bool, + /// When true, resume from existing state (volume, stopped container) + /// instead of erroring out. The deploy flow reuses the existing volume + /// and creates a new container if needed, preserving k3s/etcd state, + /// sandbox pods, and secrets. + pub resume: bool, } impl DeployOptions { @@ -135,6 +141,7 @@ impl DeployOptions { registry_token: None, gpu: false, recreate: false, + resume: false, } } @@ -200,6 +207,13 @@ impl DeployOptions { self.recreate = recreate; self } + + /// Set whether to resume from existing state (volume, stopped container). 
+ #[must_use] + pub fn with_resume(mut self, resume: bool) -> Self { + self.resume = resume; + self + } } #[derive(Debug, Clone)] @@ -264,6 +278,7 @@ where let registry_token = options.registry_token; let gpu = options.gpu; let recreate = options.recreate; + let resume = options.resume; // Wrap on_log in Arc<Mutex<F>> so we can share it with pull_remote_image // which needs a 'static callback for the bollard streaming pull. @@ -288,12 +303,28 @@ (preflight.docker, None) }; - // If an existing gateway is found, either tear it down (when recreate is - // requested) or bail out so the caller can prompt the user / reuse it. + // Guard: recreate takes precedence if both are set (shouldn't happen in practice). + if resume && recreate { + tracing::warn!("both resume and recreate set; recreate takes precedence"); + } + + // If an existing gateway is found, decide how to proceed: + // - recreate: destroy everything and start fresh + // - resume: keep existing state and create/start the container + // - neither: error out (caller should prompt the user) if let Some(existing) = check_existing_gateway(&target_docker, &name).await? { if recreate { log("[status] Removing existing gateway".to_string()); destroy_gateway_resources(&target_docker, &name).await?; + } else if resume { + if existing.container_running { + log("[status] Gateway is already running".to_string()); + } else { + log("[status] Resuming gateway from existing state".to_string()); + } + // Fall through to ensure_* calls — they are idempotent and will + // reuse the existing volume, create a container if needed, and + // start it. } else { return Err(miette::miette!( "Gateway '{name}' already exists (container_running={}).\n\ @@ -455,6 +486,11 @@ where store_pki_bundle(&name, &pki_bundle)?; + // Reconcile SSH handshake secret: reuse existing K8s secret if present, + // generate and persist a new one otherwise. This secret is stored in etcd + // (on the persistent volume) so it survives container restarts. 
+ reconcile_ssh_handshake_secret(&target_docker, &name, &log).await?; + // Push locally-built component images into the k3s containerd runtime. // This is the "push" path for local development — images are exported from // the local Docker daemon and streamed into the cluster's containerd so @@ -524,15 +560,30 @@ where docker: target_docker, }), Err(deploy_err) => { - // Automatically clean up Docker resources (volume, container, network, - // image) so the environment is left in a retryable state. - tracing::info!("deploy failed, cleaning up gateway resources for '{name}'"); - if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await { - tracing::warn!( - "automatic cleanup after failed deploy also failed: {cleanup_err}. \ - Manual cleanup may be required: \ - openshell gateway destroy --name {name}" + if resume { + // When resuming, preserve the volume so the user can retry. + // Only clean up the container and network that we may have created. + tracing::info!( + "resume failed, cleaning up container for '{name}' (preserving volume)" ); + if let Err(cleanup_err) = cleanup_gateway_container(&target_docker, &name).await { + tracing::warn!( + "automatic cleanup after failed resume also failed: {cleanup_err}. \ + Manual cleanup may be required: \ + openshell gateway destroy --name {name}" + ); + } + } else { + // Automatically clean up Docker resources (volume, container, network, + // image) so the environment is left in a retryable state. + tracing::info!("deploy failed, cleaning up gateway resources for '{name}'"); + if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await { + tracing::warn!( + "automatic cleanup after failed deploy also failed: {cleanup_err}. \ + Manual cleanup may be required: \ + openshell gateway destroy --name {name}" + ); + } } Err(deploy_err) } @@ -837,6 +888,72 @@ where Ok((bundle, true)) } +/// Reconcile the SSH handshake HMAC secret as a Kubernetes Secret. 
+/// +/// If the secret already exists in the cluster, this is a no-op. Otherwise a +/// fresh 32-byte hex secret is generated and applied. Because the secret lives +/// in etcd (backed by the persistent Docker volume), it survives container +/// restarts without regeneration — existing sandbox SSH sessions remain valid. +async fn reconcile_ssh_handshake_secret<F>(docker: &Docker, name: &str, log: &F) -> Result<()> +where + F: Fn(String) + Sync, +{ + use miette::WrapErr; + + let cname = container_name(name); + let kubeconfig = constants::KUBECONFIG_PATH; + + // Check if the secret already exists. + let (output, exit_code) = exec_capture_with_exit( + docker, + &cname, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "KUBECONFIG={kubeconfig} kubectl -n openshell get secret {SSH_HANDSHAKE_SECRET_NAME} -o jsonpath='{{.data.secret}}' 2>/dev/null" + ), + ], + ) + .await?; + + if exit_code == 0 && !output.trim().is_empty() { + tracing::debug!( + "existing SSH handshake secret found ({} bytes encoded)", + output.trim().len() + ); + log("[progress] Reusing existing SSH handshake secret".to_string()); + return Ok(()); + } + + // Generate a new 32-byte hex secret and create the K8s secret. + log("[progress] Generating SSH handshake secret".to_string()); + let (output, exit_code) = exec_capture_with_exit( + docker, + &cname, + vec![ + "sh".to_string(), + "-c".to_string(), + format!( + "SECRET=$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \\n') && \ + KUBECONFIG={kubeconfig} kubectl -n openshell create secret generic {SSH_HANDSHAKE_SECRET_NAME} \ + --from-literal=secret=$SECRET --dry-run=client -o yaml | \ + KUBECONFIG={kubeconfig} kubectl apply -f -" + ), + ], + ) + .await?; + + if exit_code != 0 { + return Err(miette::miette!( + "failed to create SSH handshake secret (exit {exit_code}): {output}" + )) + .wrap_err("failed to apply SSH handshake secret"); + } + + Ok(()) +} + /// Load existing TLS secrets from the cluster and reconstruct a [`PkiBundle`]. 
/// /// Returns an error string describing why secrets couldn't be loaded (for logging). diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index e976061f..9cbe8332 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -144,43 +144,62 @@ pub async fn run_bootstrap( ); eprintln!(); - // Auto-bootstrap always recreates if stale Docker resources are found - // (e.g. metadata was deleted but container/volume still exist). - let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name).with_recreate(true); - if let Some(dest) = remote { - let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest); - if let Some(key) = ssh_key { - remote_opts = remote_opts.with_ssh_key(key); + // Build base deploy options. The auto-bootstrap path tries to resume from + // existing state first (preserving sandboxes and secrets), falling back to + // a full recreate if the resume fails. + let build_options = |resume: bool, recreate: bool| { + let mut opts = openshell_bootstrap::DeployOptions::new(&gateway_name) + .with_resume(resume) + .with_recreate(recreate) + .with_gpu(gpu); + if let Some(dest) = remote { + let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest); + if let Some(key) = ssh_key { + remote_opts = remote_opts.with_ssh_key(key); + } + opts = opts.with_remote(remote_opts); } - options = options.with_remote(remote_opts); - } - // Read registry credentials from environment for the auto-bootstrap path. - // The explicit `--registry-username` / `--registry-token` flags are only - // on `gateway start`; when bootstrapping via `sandbox create`, the env - // vars are the mechanism. 
- if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME") - && !username.trim().is_empty() - { - options = options.with_registry_username(username); - } - if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN") - && !token.trim().is_empty() - { - options = options.with_registry_token(token); - } - // Read gateway host override from environment. Needed whenever the - // client cannot reach the Docker host at 127.0.0.1 — CI containers, - // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag - // is only on `gateway start`; this env var covers the auto-bootstrap - // path triggered by `sandbox create`. - if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST") - && !host.trim().is_empty() + // Read registry credentials from environment for the auto-bootstrap path. + // The explicit `--registry-username` / `--registry-token` flags are only + // on `gateway start`; when bootstrapping via `sandbox create`, the env + // vars are the mechanism. + if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME") + && !username.trim().is_empty() + { + opts = opts.with_registry_username(username); + } + if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN") + && !token.trim().is_empty() + { + opts = opts.with_registry_token(token); + } + // Read gateway host override from environment. Needed whenever the + // client cannot reach the Docker host at 127.0.0.1 — CI containers, + // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag + // is only on `gateway start`; this env var covers the auto-bootstrap + // path triggered by `sandbox create`. + if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST") + && !host.trim().is_empty() + { + opts = opts.with_gateway_host(host); + } + opts + }; + + // Try resume first to preserve existing sandboxes and secrets. 
+ let handle = match deploy_gateway_with_panel( + build_options(true, false), + &gateway_name, + location, + ) + .await { - options = options.with_gateway_host(host); - } - options = options.with_gpu(gpu); - - let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?; + Ok(handle) => handle, + Err(resume_err) => { + tracing::warn!("auto-bootstrap resume failed, falling back to recreate: {resume_err}"); + deploy_gateway_with_panel(build_options(false, true), &gateway_name, location).await? + } + }; let server = handle.gateway_endpoint().to_string(); print_deploy_summary(&gateway_name, &handle); diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 2f9dd2f7..6fb4395a 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1367,49 +1367,27 @@ pub async fn gateway_admin_deploy( opts }); - // Check whether a gateway already exists. If so, prompt the user (unless - // --recreate was passed or we're in non-interactive mode). - let mut should_recreate = recreate; + // Check whether a gateway already exists and decide how to proceed: + // - --recreate: always destroy and start fresh + // - existing state (stopped container / volume only): auto-resume + // - already running: return immediately (nothing to do) + let should_recreate = recreate; + let mut should_resume = false; if let Some(existing) = openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? 
{ - if !should_recreate { - let interactive = std::io::stdin().is_terminal() && std::io::stderr().is_terminal(); - if interactive { - let status = if existing.container_running { - "running" - } else if existing.container_exists { - "stopped" - } else { - "volume only" - }; - eprintln!(); - eprintln!( - "{} Gateway '{name}' already exists ({status}).", - "!".yellow().bold() - ); - if let Some(image) = &existing.container_image { - eprintln!(" {} {}", "Image:".dimmed(), image); - } - eprintln!(); - eprint!("Destroy and recreate? [y/N] "); - std::io::stderr().flush().ok(); - let mut input = String::new(); - std::io::stdin() - .read_line(&mut input) - .into_diagnostic() - .wrap_err("failed to read user input")?; - let choice = input.trim().to_lowercase(); - should_recreate = choice == "y" || choice == "yes"; - if !should_recreate { - eprintln!("Keeping existing gateway."); - return Ok(()); - } - } else { - // Non-interactive mode: reuse existing gateway silently. - eprintln!("Gateway '{name}' already exists, reusing."); - return Ok(()); - } + if should_recreate { + // --recreate flag: fall through to destroy and redeploy. + } else if existing.container_running { + // Already running — nothing to do. + eprintln!( + "{} Gateway '{name}' is already running.", + "✓".green().bold() + ); + return Ok(()); + } else { + // Stopped container or volume-only: auto-resume from existing state. 
+ should_resume = true; } } @@ -1418,7 +1396,8 @@ pub async fn gateway_admin_deploy( .with_disable_tls(disable_tls) .with_disable_gateway_auth(disable_gateway_auth) .with_gpu(gpu) - .with_recreate(should_recreate); + .with_recreate(should_recreate) + .with_resume(should_resume); if let Some(opts) = remote_opts { options = options.with_remote(opts); } diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 84b8cf9a..3b069b7b 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -402,10 +402,10 @@ if [ -n "${IMAGE_PULL_POLICY:-}" ] && [ -f "$HELMCHART" ]; then sed -i "s|pullPolicy: Always|pullPolicy: ${IMAGE_PULL_POLICY}|" "$HELMCHART" fi -# Generate a random SSH handshake secret for the NSSH1 HMAC handshake between -# the gateway and sandbox SSH servers. This is required — the server will refuse -# to start without it. -SSH_HANDSHAKE_SECRET="${SSH_HANDSHAKE_SECRET:-$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \n')}" +# SSH handshake secret: previously generated here and injected via sed into the +# HelmChart CR. Now persisted as a Kubernetes Secret (openshell-ssh-handshake) +# created by the bootstrap process after k3s starts. This ensures the secret +# survives container restarts without regeneration. # Inject SSH gateway host/port into the HelmChart manifest so the openshell # server returns the correct address to CLI clients for SSH proxy CONNECT. @@ -424,9 +424,6 @@ if [ -f "$HELMCHART" ]; then # Clear the placeholder so the default (8080) is used sed -i "s|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g" "$HELMCHART" fi - echo "Setting SSH handshake secret" - sed -i "s|__SSH_HANDSHAKE_SECRET__|${SSH_HANDSHAKE_SECRET}|g" "$HELMCHART" - # Disable gateway auth: when set, the server accepts connections without # client certificates (for reverse-proxy / Cloudflare Tunnel deployments). 
if [ "${DISABLE_GATEWAY_AUTH:-}" = "true" ]; then diff --git a/deploy/docker/cluster-healthcheck.sh b/deploy/docker/cluster-healthcheck.sh index 68210b45..1bf76f71 100644 --- a/deploy/docker/cluster-healthcheck.sh +++ b/deploy/docker/cluster-healthcheck.sh @@ -68,3 +68,6 @@ if [ "${DISABLE_TLS:-}" != "true" ]; then kubectl -n openshell get secret openshell-server-tls >/dev/null 2>&1 || exit 1 kubectl -n openshell get secret openshell-client-tls >/dev/null 2>&1 || exit 1 fi + +# Verify SSH handshake secret exists (created by openshell-bootstrap alongside TLS secrets) +kubectl -n openshell get secret openshell-ssh-handshake >/dev/null 2>&1 || exit 1 diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 1be8f14a..ed503a78 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -77,7 +77,10 @@ spec: value: {{ .Values.server.hostGatewayIP | quote }} {{- end }} - name: OPENSHELL_SSH_HANDSHAKE_SECRET - value: {{ required "server.sshHandshakeSecret is required" .Values.server.sshHandshakeSecret | quote }} + valueFrom: + secretKeyRef: + name: {{ .Values.server.sshHandshakeSecretName | quote }} + key: secret {{- if .Values.server.disableTls }} - name: OPENSHELL_DISABLE_TLS value: "true" diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index ccc8d1ff..d698e812 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -85,10 +85,10 @@ server: sshGatewayPort: 0 # TLS configuration for the server. The server always terminates mTLS # directly and requires client certificates. - # HMAC secret used for the NSSH1 handshake between gateway and sandbox SSH. - # Required — the server will refuse to start if empty. For cluster deployments - # this is auto-generated by the entrypoint script. - sshHandshakeSecret: "" + # Name of the Kubernetes Secret holding the NSSH1 HMAC handshake key. 
+ # The secret must contain a `secret` key with the hex-encoded HMAC key. + # For cluster deployments this is auto-created by the bootstrap process. + sshHandshakeSecretName: "openshell-ssh-handshake" # Host gateway IP for sandbox pod hostAliases. When set, sandbox pods get # hostAliases entries mapping host.docker.internal and host.openshell.internal # to this IP, allowing them to reach services running on the Docker host. diff --git a/deploy/kube/manifests/openshell-helmchart.yaml b/deploy/kube/manifests/openshell-helmchart.yaml index 2245c72e..ae22ddc6 100644 --- a/deploy/kube/manifests/openshell-helmchart.yaml +++ b/deploy/kube/manifests/openshell-helmchart.yaml @@ -32,7 +32,6 @@ spec: sandboxImage: ghcr.io/nvidia/openshell-community/sandboxes/base:latest sshGatewayHost: __SSH_GATEWAY_HOST__ sshGatewayPort: __SSH_GATEWAY_PORT__ - sshHandshakeSecret: __SSH_HANDSHAKE_SECRET__ grpcEndpoint: "https://openshell.openshell.svc.cluster.local:8080" hostGatewayIP: __HOST_GATEWAY_IP__ disableGatewayAuth: __DISABLE_GATEWAY_AUTH__ diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index 600bdd6c..307e7623 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -408,12 +408,13 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then # terminates mTLS (there is no server.tls.enabled toggle). Without this, # a prior Helm override or chart default change could silently regress # sandbox callbacks to plaintext. - # Retrieve the existing handshake secret from the running release, or generate - # a new one if this is the first deploy with the mandatory secret. - EXISTING_SECRET=$(cluster_exec "helm get values openshell -n openshell -o json 2>/dev/null \ - | grep -o '\"sshHandshakeSecret\":\"[^\"]*\"' \ - | cut -d'\"' -f4") || true - SSH_HANDSHAKE_SECRET="${EXISTING_SECRET:-$(openssl rand -hex 32)}" + # Ensure the SSH handshake K8s secret exists. 
The bootstrap process normally + # creates it, but fast-deploy may run before bootstrap on a fresh cluster. + EXISTING_SECRET=$(cluster_exec "kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null | base64 -d" 2>/dev/null) || true + if [ -z "${EXISTING_SECRET}" ]; then + SSH_HANDSHAKE_SECRET="$(openssl rand -hex 32)" + cluster_exec "kubectl -n openshell create secret generic openshell-ssh-handshake --from-literal=secret='${SSH_HANDSHAKE_SECRET}' --dry-run=client -o yaml | kubectl apply -f -" + fi # Retrieve the host gateway IP from the entrypoint-rendered HelmChart CR so # that hostAliases for host.openshell.internal are preserved across fast deploys. @@ -433,7 +434,6 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then --set server.tls.certSecretName=openshell-server-tls \ --set server.tls.clientCaSecretName=openshell-server-client-ca \ --set server.tls.clientTlsSecretName=openshell-client-tls \ - --set server.sshHandshakeSecret=${SSH_HANDSHAKE_SECRET} \ ${HOST_GATEWAY_ARGS} \ ${helm_wait_args}" helm_end=$(date +%s) From b91783552c7a5400809d28d0d63379bc0d7abc9c Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 19 Mar 2026 21:43:22 -0700 Subject: [PATCH 2/7] add e2e tests --- e2e/rust/tests/gateway_resume.rs | 324 +++++++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 e2e/rust/tests/gateway_resume.rs diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs new file mode 100644 index 00000000..b9e9b971 --- /dev/null +++ b/e2e/rust/tests/gateway_resume.rs @@ -0,0 +1,324 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e")] + +//! E2E tests for gateway resume from existing state. +//! +//! These tests verify that `openshell gateway start` resumes from existing +//! Docker volume state (after stop or container removal) and that the SSH +//! 
handshake secret persists across container restarts. +//! +//! **Requires a running gateway** — the `e2e:rust` mise task bootstraps one. + +use std::process::{Command, Stdio}; +use std::time::Duration; + +use openshell_e2e::harness::binary::openshell_cmd; +use openshell_e2e::harness::output::strip_ansi; +use tokio::time::sleep; + +/// Default gateway name used by the e2e cluster. +const GATEWAY_NAME: &str = "openshell"; + +/// Docker container name for the default gateway. +fn container_name() -> String { + format!("openshell-cluster-{GATEWAY_NAME}") +} + +/// Run `openshell <args>` and return (combined output, exit code). +async fn run_cli(args: &[&str]) -> (String, i32) { + let mut cmd = openshell_cmd(); + cmd.args(args) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + let output = cmd.output().await.expect("spawn openshell"); + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let combined = format!("{stdout}{stderr}"); + let code = output.status.code().unwrap_or(-1); + (combined, code) +} + +/// Run `docker <args>` synchronously and return (stdout, exit code). +fn docker_cmd(args: &[&str]) -> (String, i32) { + let output = Command::new("docker") + .args(args) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .expect("spawn docker"); + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let code = output.status.code().unwrap_or(-1); + (stdout, code) +} + +/// Wait for the gateway to become healthy by polling `openshell status`. +async fn wait_for_healthy(timeout: Duration) { + let start = std::time::Instant::now(); + loop { + let (output, code) = run_cli(&["status"]).await; + let clean = strip_ansi(&output).to_lowercase(); + if code == 0 && (clean.contains("healthy") || clean.contains("running") || clean.contains("✓")) { + return; + } + if start.elapsed() > timeout { + panic!( + "gateway did not become healthy within {}s. 
Last output:\n{}", + timeout.as_secs(), + strip_ansi(&output) + ); + } + sleep(Duration::from_secs(3)).await; + } +} + +/// Read the SSH handshake secret from the K8s secret inside the cluster. +fn read_ssh_handshake_secret() -> Option<String> { + let cname = container_name(); + let (output, code) = docker_cmd(&[ + "exec", + &cname, + "sh", + "-c", + "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null", + ]); + if code == 0 && !output.trim().is_empty() { + Some(output.trim().to_string()) + } else { + None + } +} + +// ------------------------------------------------------------------- +// Test: `gateway start` on an already-running gateway succeeds +// ------------------------------------------------------------------- + +/// When the gateway is already running, `openshell gateway start` should +/// return immediately with exit code 0 and indicate it's already running. +#[tokio::test] +async fn gateway_start_on_running_gateway_succeeds() { + // Precondition: gateway is running (e2e cluster is up). + wait_for_healthy(Duration::from_secs(30)).await; + + let (output, code) = run_cli(&["gateway", "start"]).await; + let clean = strip_ansi(&output); + + assert_eq!( + code, 0, + "gateway start on running gateway should exit 0:\n{clean}" + ); + assert!( + clean.to_lowercase().contains("already running"), + "output should indicate gateway is already running:\n{clean}" + ); +} + +// ------------------------------------------------------------------- +// Test: gateway stop → start resumes, sandbox survives +// ------------------------------------------------------------------- + +/// After `gateway stop` then `gateway start`, the gateway should resume +/// from existing state. A sandbox created before the stop should still +/// appear in the sandbox list after restart. +#[tokio::test] +async fn gateway_stop_start_resumes_with_sandbox() { + // Precondition: gateway is healthy. 
+ wait_for_healthy(Duration::from_secs(30)).await; + + // Create a sandbox that we'll check for after restart. + let (create_output, create_code) = + run_cli(&["sandbox", "create", "--", "echo", "resume-test"]).await; + let clean_create = strip_ansi(&create_output); + assert_eq!( + create_code, 0, + "sandbox create should succeed:\n{clean_create}" + ); + + // Extract sandbox name from output. + let sandbox_name = clean_create + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from create output"); + + // Stop the gateway. + let (stop_output, stop_code) = run_cli(&["gateway", "stop"]).await; + assert_eq!( + stop_code, 0, + "gateway stop should succeed:\n{}", + strip_ansi(&stop_output) + ); + + // Wait a moment for the container to fully stop. + sleep(Duration::from_secs(3)).await; + + // Verify container is stopped. + let (inspect_out, _) = docker_cmd(&[ + "inspect", + "-f", + "{{.State.Running}}", + &container_name(), + ]); + assert_eq!( + inspect_out.trim(), + "false", + "container should be stopped after gateway stop" + ); + + // Start the gateway again — should resume from existing state. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + let clean_start = strip_ansi(&start_output); + assert_eq!( + start_code, 0, + "gateway start after stop should succeed:\n{clean_start}" + ); + + // Wait for the gateway to become healthy again. + wait_for_healthy(Duration::from_secs(180)).await; + + // Verify the sandbox still exists. 
+ let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean_list = strip_ansi(&list_output); + assert_eq!( + list_code, 0, + "sandbox list should succeed after resume:\n{clean_list}" + ); + assert!( + clean_list.contains(&sandbox_name), + "sandbox '{sandbox_name}' should survive gateway stop/start.\nList output:\n{clean_list}" + ); + + // Cleanup: delete the test sandbox. + let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; +} + +// ------------------------------------------------------------------- +// Test: container removed → gateway start resumes +// ------------------------------------------------------------------- + +/// After the Docker container is force-removed (simulating Docker restart), +/// `openshell gateway start` should resume from the existing volume. +#[tokio::test] +async fn gateway_start_resumes_after_container_removal() { + // Precondition: gateway is healthy. + wait_for_healthy(Duration::from_secs(30)).await; + + // Create a sandbox to verify state persistence. + let (create_output, create_code) = + run_cli(&["sandbox", "create", "--", "echo", "container-rm-test"]).await; + let clean_create = strip_ansi(&create_output); + assert_eq!( + create_code, 0, + "sandbox create should succeed:\n{clean_create}" + ); + + let sandbox_name = clean_create + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from create output"); + + // Force-remove the container (simulates Docker restart / OOM kill). + let (_, rm_code) = docker_cmd(&["rm", "-f", &container_name()]); + assert_eq!(rm_code, 0, "docker rm -f should succeed"); + + // Verify the volume still exists. 
+ let (vol_out, vol_code) = docker_cmd(&[ + "volume", + "inspect", + &format!("openshell-cluster-{GATEWAY_NAME}"), + ]); + assert_eq!( + vol_code, 0, + "volume should still exist after container removal:\n{vol_out}" + ); + + // Start the gateway — should resume from the volume. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + let clean_start = strip_ansi(&start_output); + assert_eq!( + start_code, 0, + "gateway start after container removal should succeed:\n{clean_start}" + ); + + // Wait for healthy. + wait_for_healthy(Duration::from_secs(180)).await; + + // Verify sandbox survived. + let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean_list = strip_ansi(&list_output); + assert_eq!( + list_code, 0, + "sandbox list should succeed after resume:\n{clean_list}" + ); + assert!( + clean_list.contains(&sandbox_name), + "sandbox '{sandbox_name}' should survive container removal + resume.\nList output:\n{clean_list}" + ); + + // Cleanup. + let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; +} + +// ------------------------------------------------------------------- +// Test: SSH handshake secret persists across container restart +// ------------------------------------------------------------------- + +/// The SSH handshake K8s secret should persist across gateway stop/start +/// cycles — the same base64-encoded value should be returned before and +/// after the restart. +#[tokio::test] +async fn ssh_handshake_secret_persists_across_restart() { + // Precondition: gateway is healthy. + wait_for_healthy(Duration::from_secs(30)).await; + + // Read the SSH handshake secret before restart. + let secret_before = read_ssh_handshake_secret() + .expect("SSH handshake secret should exist before restart"); + assert!( + !secret_before.is_empty(), + "SSH handshake secret should not be empty" + ); + + // Stop the gateway. 
+ let (_, stop_code) = run_cli(&["gateway", "stop"]).await; + assert_eq!(stop_code, 0, "gateway stop should succeed"); + + sleep(Duration::from_secs(3)).await; + + // Start the gateway. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + assert_eq!( + start_code, 0, + "gateway start should succeed:\n{}", + strip_ansi(&start_output) + ); + + // Wait for healthy. + wait_for_healthy(Duration::from_secs(180)).await; + + // Read the secret after restart. + let secret_after = read_ssh_handshake_secret() + .expect("SSH handshake secret should exist after restart"); + + assert_eq!( + secret_before, secret_after, + "SSH handshake secret should be identical before and after restart" + ); +} From bec5c406634e05b2718812d8ae8b17452f7663bd Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 19 Mar 2026 22:24:38 -0700 Subject: [PATCH 3/7] fix(bootstrap): handle stale network and PKI on gateway resume On resume after container kill, ensure_network destroys and recreates the Docker network with a new ID. The stopped container still referenced the old network ID, causing 'network not found' on start. Fix by reconciling the container's network attachment in ensure_container. Also, reconcile_pki was attempting to load K8s secrets before k3s had booted, failing transiently, and regenerating PKI unnecessarily. This triggered a server rollout restart causing TLS errors. Fix by waiting for the openshell namespace before attempting to read existing secrets. Add gRPC readiness check to gateway_admin_deploy so the CLI waits for the server to accept connections before declaring the gateway ready. Add e2e test covering container kill, stale network, sandbox persistence, and sandbox create after resume. 
--- crates/openshell-bootstrap/src/docker.rs | 82 ++++++++++++++++- crates/openshell-bootstrap/src/lib.rs | 12 ++- crates/openshell-cli/src/bootstrap.rs | 2 +- crates/openshell-cli/src/run.rs | 9 ++ e2e/rust/tests/gateway_resume.rs | 110 +++++++++++++++++++++++ 5 files changed, 207 insertions(+), 8 deletions(-) diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index 72b9542d..ae88c49b 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -8,9 +8,9 @@ use bollard::API_DEFAULT_VERSION; use bollard::Docker; use bollard::errors::Error as BollardError; use bollard::models::{ - ContainerCreateBody, DeviceRequest, HostConfig, HostConfigCgroupnsModeEnum, - NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, RestartPolicy, - RestartPolicyNameEnum, VolumeCreateRequest, + ContainerCreateBody, DeviceRequest, EndpointSettings, HostConfig, HostConfigCgroupnsModeEnum, + NetworkConnectRequest, NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, + RestartPolicy, RestartPolicyNameEnum, VolumeCreateRequest, }; use bollard::query_parameters::{ CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions, @@ -483,6 +483,17 @@ pub async fn ensure_container( }; if image_matches { + // The container exists with the correct image, but its network + // attachment may be stale. When the gateway is resumed after a + // container kill, `ensure_network` destroys and recreates the + // Docker network (giving it a new ID). The stopped container + // still references the old network ID, so `docker start` would + // fail with "network not found". + // + // Fix: disconnect from any existing networks and reconnect to + // the current (just-created) network before returning. 
+            let expected_net = network_name(name);
+            reconcile_container_network(docker, &container_name, &expected_net).await?;
             return Ok(());
         }
 
@@ -1005,6 +1016,71 @@ async fn force_remove_network(docker: &Docker, net_name: &str) -> Result<()> {
     }
 }
 
+/// Ensure a stopped container is connected to the expected Docker network.
+///
+/// When a gateway is resumed after the container was killed (but not removed),
+/// `ensure_network` destroys and recreates the network with a new ID. The
+/// stopped container still holds a reference to the old network ID in its
+/// config, so `docker start` would fail with a 404 "network not found" error.
+///
+/// This function disconnects the container from any networks that no longer
+/// match the expected network name and connects it to the correct one.
+async fn reconcile_container_network(
+    docker: &Docker,
+    container_name: &str,
+    expected_network: &str,
+) -> Result<()> {
+    let info = docker
+        .inspect_container(container_name, None::<InspectContainerOptions>)
+        .await
+        .into_diagnostic()
+        .wrap_err("failed to inspect container for network reconciliation")?;
+
+    // Check the container's current network attachments via NetworkSettings.
+    let attached_networks: Vec<String> = info
+        .network_settings
+        .as_ref()
+        .and_then(|ns| ns.networks.as_ref())
+        .map(|nets| nets.keys().cloned().collect())
+        .unwrap_or_default();
+
+    // If the container is already attached to the expected network (by name),
+    // Docker will resolve the name to the current network ID on start.
+    // However, when the network was destroyed and recreated, the container's
+    // stored endpoint references the old ID. Disconnect and reconnect to
+    // pick up the new network ID.
+    for net_name in &attached_networks {
+        let _ = docker
+            .disconnect_network(
+                net_name,
+                NetworkDisconnectRequest {
+                    container: container_name.to_string(),
+                    force: Some(true),
+                },
+            )
+            .await;
+    }
+
+    // Connect to the (freshly created) expected network.
+ docker + .connect_network( + expected_network, + NetworkConnectRequest { + container: container_name.to_string(), + endpoint_config: Some(EndpointSettings::default()), + }, + ) + .await + .into_diagnostic() + .wrap_err("failed to connect container to gateway network")?; + + tracing::debug!( + "Reconciled network for container {container_name}: disconnected from {attached_networks:?}, connected to {expected_network}" + ); + + Ok(()) +} + fn is_not_found(err: &BollardError) -> bool { matches!( err, diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 5cee9cee..eca973ec 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -860,6 +860,14 @@ where let cname = container_name(name); let kubeconfig = constants::KUBECONFIG_PATH; + // Wait for the k3s API server and openshell namespace before attempting + // to read secrets. Without this, kubectl fails transiently on resume + // (k3s hasn't booted yet), the code assumes secrets are gone, and + // regenerates PKI unnecessarily — triggering a server rollout restart + // and TLS errors for in-flight connections. + log("[progress] Waiting for openshell namespace".to_string()); + wait_for_namespace(docker, &cname, kubeconfig, "openshell").await?; + // Try to load existing secrets. match load_existing_pki_bundle(docker, &cname, kubeconfig).await { Ok(bundle) => { @@ -874,10 +882,6 @@ where } // Generate fresh PKI and apply to cluster. - // Namespace may still be creating on first bootstrap, so wait here only - // when rotation is actually needed. 
- log("[progress] Waiting for openshell namespace".to_string()); - wait_for_namespace(docker, &cname, kubeconfig, "openshell").await?; log("[progress] Generating TLS certificates".to_string()); let bundle = generate_pki(extra_sans)?; log("[progress] Applying TLS secrets to gateway".to_string()); diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index 9cbe8332..0928991e 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -226,7 +226,7 @@ pub async fn run_bootstrap( /// Retry connecting to the gateway gRPC endpoint until it succeeds or a /// timeout is reached. Uses exponential backoff starting at 500 ms, doubling /// up to 4 s, with a total deadline of 30 s. -async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> { +pub(crate) async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> { const MAX_WAIT: Duration = Duration::from_secs(30); const INITIAL_BACKOFF: Duration = Duration::from_millis(500); diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 6fb4395a..2abd45a1 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1413,6 +1413,15 @@ pub async fn gateway_admin_deploy( let handle = deploy_gateway_with_panel(options, name, location).await?; + // Wait for the gRPC endpoint to actually accept connections before + // declaring the gateway ready. The Docker health check may pass before + // the gRPC listener inside the pod is fully bound. + let server = handle.gateway_endpoint().to_string(); + let tls = TlsOptions::default() + .with_gateway_name(name) + .with_default_paths(&server); + crate::bootstrap::wait_for_grpc_ready(&server, &tls).await?; + print_deploy_summary(name, &handle); // Auto-activate: set this gateway as the active gateway. 
diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs index b9e9b971..dbe091cc 100644 --- a/e2e/rust/tests/gateway_resume.rs +++ b/e2e/rust/tests/gateway_resume.rs @@ -276,6 +276,116 @@ async fn gateway_start_resumes_after_container_removal() { let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; } +// ------------------------------------------------------------------- +// Test: container killed → gateway start resumes, sandboxes survive, +// new sandbox create works +// ------------------------------------------------------------------- + +/// When a container is killed (stopped but NOT removed), `gateway start` +/// should resume from existing state. This validates three things: +/// +/// 1. The stale Docker network reference is reconciled (ensure_network +/// destroys and recreates the network with a new ID). +/// 2. Existing sandboxes created before the kill survive the restart. +/// 3. New `sandbox create` works after resume — the TLS certificates +/// are reused (not needlessly regenerated), so the CLI's mTLS certs +/// still match the server. +#[tokio::test] +async fn gateway_start_resumes_after_container_kill() { + // Precondition: gateway is healthy. + wait_for_healthy(Duration::from_secs(30)).await; + + let cname = container_name(); + let net_name = format!("openshell-cluster-{GATEWAY_NAME}"); + + // Create a sandbox before the kill to verify state persistence. 
+ let (create_output, create_code) = + run_cli(&["sandbox", "create", "--", "echo", "kill-resume-test"]).await; + let clean_create = strip_ansi(&create_output); + assert_eq!( + create_code, 0, + "sandbox create should succeed:\n{clean_create}" + ); + + let sandbox_before = clean_create + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from create output"); + + // Kill the container (it remains as a stopped container, unlike `docker rm`). + let (_, kill_code) = docker_cmd(&["kill", &cname]); + assert_eq!(kill_code, 0, "docker kill should succeed"); + + sleep(Duration::from_secs(3)).await; + + // Remove the Docker network to simulate a stale network reference. + // The bootstrap `ensure_network` always destroys and recreates, so + // after this the container's stored network ID will be invalid. + let _ = docker_cmd(&["network", "disconnect", "-f", &net_name, &cname]); + let (_, net_rm_code) = docker_cmd(&["network", "rm", &net_name]); + assert_eq!( + net_rm_code, 0, + "docker network rm should succeed (or network already gone)" + ); + + // Start the gateway — must handle stale network + reuse existing PKI. + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; + let clean_start = strip_ansi(&start_output); + assert_eq!( + start_code, 0, + "gateway start after kill should succeed:\n{clean_start}" + ); + + // Wait for the gateway to become healthy again. + wait_for_healthy(Duration::from_secs(180)).await; + + // Verify the pre-existing sandbox survived. 
+ let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean_list = strip_ansi(&list_output); + assert_eq!( + list_code, 0, + "sandbox list should succeed after resume:\n{clean_list}" + ); + assert!( + clean_list.contains(&sandbox_before), + "sandbox '{sandbox_before}' should survive container kill + resume.\nList output:\n{clean_list}" + ); + + // Create a new sandbox to verify TLS is working end-to-end. + let (new_create_output, new_create_code) = + run_cli(&["sandbox", "create", "--", "echo", "post-resume-test"]).await; + let clean_new = strip_ansi(&new_create_output); + assert_eq!( + new_create_code, 0, + "sandbox create after resume should succeed (TLS must work):\n{clean_new}" + ); + + let sandbox_after = clean_new + .lines() + .find_map(|line| { + if let Some((_, rest)) = line.split_once("Created sandbox:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else if let Some((_, rest)) = line.split_once("Name:") { + rest.split_whitespace().next().map(ToOwned::to_owned) + } else { + None + } + }) + .expect("should extract sandbox name from post-resume create output"); + + // Cleanup. + let _ = run_cli(&["sandbox", "delete", &sandbox_before]).await; + let _ = run_cli(&["sandbox", "delete", &sandbox_after]).await; +} + // ------------------------------------------------------------------- // Test: SSH handshake secret persists across container restart // ------------------------------------------------------------------- From d8e22752af79cde52b4e78e4f28abe087437e0d3 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 19 Mar 2026 23:05:31 -0700 Subject: [PATCH 4/7] fix(e2e): match 'connected' status in gateway resume health check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The wait_for_healthy helper checked for 'healthy', 'running', or '✓' but openshell status outputs 'Connected'. All five gateway_resume tests were failing because the health check never matched. 
--- e2e/rust/tests/gateway_resume.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs index dbe091cc..67d28d19 100644 --- a/e2e/rust/tests/gateway_resume.rs +++ b/e2e/rust/tests/gateway_resume.rs @@ -60,7 +60,7 @@ async fn wait_for_healthy(timeout: Duration) { loop { let (output, code) = run_cli(&["status"]).await; let clean = strip_ansi(&output).to_lowercase(); - if code == 0 && (clean.contains("healthy") || clean.contains("running") || clean.contains("✓")) { + if code == 0 && (clean.contains("healthy") || clean.contains("running") || clean.contains("connected") || clean.contains("✓")) { return; } if start.elapsed() > timeout { From 494010eafb9eb6ee921f3db42df00c2334a9364f Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 19 Mar 2026 23:13:24 -0700 Subject: [PATCH 5/7] refactor(bootstrap): remove resume from DeployOptions, auto-detect internally The deploy flow now auto-detects whether to resume by checking for existing gateway state inside deploy_gateway_with_logs. Callers no longer need to compute and pass a resume flag. The explicit gateway start path still short-circuits for already-running gateways to avoid redundant work. --- crates/openshell-bootstrap/src/lib.rs | 45 ++++++--------------------- crates/openshell-cli/src/bootstrap.rs | 22 ++++++------- crates/openshell-cli/src/run.rs | 36 ++++++++------------- 3 files changed, 31 insertions(+), 72 deletions(-) diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index eca973ec..22e417fa 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -120,11 +120,6 @@ pub struct DeployOptions { /// When false, an existing gateway is left as-is and deployment is /// skipped (the caller is responsible for prompting the user first). pub recreate: bool, - /// When true, resume from existing state (volume, stopped container) - /// instead of erroring out. 
The deploy flow reuses the existing volume - /// and creates a new container if needed, preserving k3s/etcd state, - /// sandbox pods, and secrets. - pub resume: bool, } impl DeployOptions { @@ -141,7 +136,6 @@ impl DeployOptions { registry_token: None, gpu: false, recreate: false, - resume: false, } } @@ -207,13 +201,6 @@ impl DeployOptions { self.recreate = recreate; self } - - /// Set whether to resume from existing state (volume, stopped container). - #[must_use] - pub fn with_resume(mut self, resume: bool) -> Self { - self.resume = resume; - self - } } #[derive(Debug, Clone)] @@ -278,7 +265,6 @@ where let registry_token = options.registry_token; let gpu = options.gpu; let recreate = options.recreate; - let resume = options.resume; // Wrap on_log in Arc> so we can share it with pull_remote_image // which needs a 'static callback for the bollard streaming pull. @@ -303,35 +289,22 @@ where (preflight.docker, None) }; - // Guard: recreate takes precedence if both are set (shouldn't happen in practice). - if resume && recreate { - tracing::warn!("both resume and recreate set; recreate takes precedence"); - } - // If an existing gateway is found, decide how to proceed: // - recreate: destroy everything and start fresh - // - resume: keep existing state and create/start the container - // - neither: error out (caller should prompt the user) + // - otherwise: auto-resume from existing state (the ensure_* calls are + // idempotent and will reuse the volume, create a container if needed, + // and start it) + let mut resume = false; if let Some(existing) = check_existing_gateway(&target_docker, &name).await? 
{ if recreate { log("[status] Removing existing gateway".to_string()); destroy_gateway_resources(&target_docker, &name).await?; - } else if resume { - if existing.container_running { - log("[status] Gateway is already running".to_string()); - } else { - log("[status] Resuming gateway from existing state".to_string()); - } - // Fall through to ensure_* calls — they are idempotent and will - // reuse the existing volume, create a container if needed, and - // start it. + } else if existing.container_running { + log("[status] Gateway is already running".to_string()); + resume = true; } else { - return Err(miette::miette!( - "Gateway '{name}' already exists (container_running={}).\n\ - Use --recreate to destroy and redeploy, or destroy it first with:\n\n \ - openshell gateway destroy --name {name}", - existing.container_running, - )); + log("[status] Resuming gateway from existing state".to_string()); + resume = true; } } diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index 0928991e..6a6e4d25 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -144,12 +144,11 @@ pub async fn run_bootstrap( ); eprintln!(); - // Build base deploy options. The auto-bootstrap path tries to resume from - // existing state first (preserving sandboxes and secrets), falling back to - // a full recreate if the resume fails. - let build_options = |resume: bool, recreate: bool| { + // Build deploy options. The deploy flow auto-resumes from existing state + // (preserving sandboxes and secrets) when it finds an existing gateway. + // If the initial attempt fails, fall back to a full recreate. 
+ let build_options = |recreate: bool| { let mut opts = openshell_bootstrap::DeployOptions::new(&gateway_name) - .with_resume(resume) .with_recreate(recreate) .with_gpu(gpu); if let Some(dest) = remote { @@ -186,18 +185,15 @@ pub async fn run_bootstrap( opts }; - // Try resume first to preserve existing sandboxes and secrets. - let handle = match deploy_gateway_with_panel( - build_options(true, false), - &gateway_name, - location, - ) - .await + // Deploy the gateway. The deploy flow auto-resumes from existing state + // when it finds one. If that fails, fall back to a full recreate. + let handle = match deploy_gateway_with_panel(build_options(false), &gateway_name, location) + .await { Ok(handle) => handle, Err(resume_err) => { tracing::warn!("auto-bootstrap resume failed, falling back to recreate: {resume_err}"); - deploy_gateway_with_panel(build_options(false, true), &gateway_name, location).await? + deploy_gateway_with_panel(build_options(true), &gateway_name, location).await? } }; let server = handle.gateway_endpoint().to_string(); diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 2abd45a1..366cb691 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1367,27 +1367,18 @@ pub async fn gateway_admin_deploy( opts }); - // Check whether a gateway already exists and decide how to proceed: - // - --recreate: always destroy and start fresh - // - existing state (stopped container / volume only): auto-resume - // - already running: return immediately (nothing to do) - let should_recreate = recreate; - let mut should_resume = false; - if let Some(existing) = - openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? - { - if should_recreate { - // --recreate flag: fall through to destroy and redeploy. - } else if existing.container_running { - // Already running — nothing to do. 
- eprintln!( - "{} Gateway '{name}' is already running.", - "✓".green().bold() - ); - return Ok(()); - } else { - // Stopped container or volume-only: auto-resume from existing state. - should_resume = true; + // If the gateway is already running and we're not recreating, short-circuit. + if !recreate { + if let Some(existing) = + openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? + { + if existing.container_running { + eprintln!( + "{} Gateway '{name}' is already running.", + "✓".green().bold() + ); + return Ok(()); + } } } @@ -1396,8 +1387,7 @@ pub async fn gateway_admin_deploy( .with_disable_tls(disable_tls) .with_disable_gateway_auth(disable_gateway_auth) .with_gpu(gpu) - .with_recreate(should_recreate) - .with_resume(should_resume); + .with_recreate(recreate); if let Some(opts) = remote_opts { options = options.with_remote(opts); } From b68357d30ed326eb4df3000607199f78f09388eb Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Fri, 20 Mar 2026 07:01:39 -0700 Subject: [PATCH 6/7] fix(ssh): retry CONNECT on 412 when sandbox not yet ready The gateway returns HTTP 412 (Precondition Failed) when the sandbox pod exists but hasn't reached Ready phase yet. This is a transient state after allocation. Instead of failing immediately, retry with exponential backoff (1s to 8s) for up to 60 seconds. 
--- crates/openshell-cli/src/ssh.rs | 57 +++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/crates/openshell-cli/src/ssh.rs b/crates/openshell-cli/src/ssh.rs index 4b284bff..369f03ac 100644 --- a/crates/openshell-cli/src/ssh.rs +++ b/crates/openshell-cli/src/ssh.rs @@ -693,27 +693,50 @@ pub async fn sandbox_ssh_proxy( .ok_or_else(|| miette::miette!("gateway URL missing port"))?; let connect_path = url.path(); - let mut stream: Box = - connect_gateway(scheme, gateway_host, gateway_port, tls).await?; - let request = format!( "CONNECT {connect_path} HTTP/1.1\r\nHost: {gateway_host}\r\nX-Sandbox-Id: {sandbox_id}\r\nX-Sandbox-Token: {token}\r\n\r\n" ); - stream - .write_all(request.as_bytes()) - .await - .into_diagnostic()?; - // Wrap in a BufReader **before** reading the HTTP response. The gateway - // may send the 200 OK response and the first SSH protocol bytes in the - // same TCP segment / WebSocket frame. A plain `read()` would consume - // those SSH bytes into our buffer and discard them, causing SSH to see a - // truncated protocol banner and exit with code 255. BufReader ensures - // any bytes read past the `\r\n\r\n` header boundary stay buffered and - // are returned by subsequent reads during the bidirectional copy phase. - let mut buf_stream = BufReader::new(stream); - let status = read_connect_status(&mut buf_stream).await?; - if status != 200 { + // The gateway returns 412 (Precondition Failed) when the sandbox pod + // exists but hasn't reached Ready phase yet. This is a transient state + // after sandbox allocation — retry with backoff instead of failing + // immediately. 
+ const MAX_CONNECT_WAIT: std::time::Duration = std::time::Duration::from_secs(60); + const INITIAL_BACKOFF: std::time::Duration = std::time::Duration::from_secs(1); + + let start = std::time::Instant::now(); + let mut backoff = INITIAL_BACKOFF; + let mut buf_stream; + + loop { + let mut stream: Box = + connect_gateway(scheme, gateway_host, gateway_port, tls).await?; + stream + .write_all(request.as_bytes()) + .await + .into_diagnostic()?; + + // Wrap in a BufReader **before** reading the HTTP response. The gateway + // may send the 200 OK response and the first SSH protocol bytes in the + // same TCP segment / WebSocket frame. A plain `read()` would consume + // those SSH bytes into our buffer and discard them, causing SSH to see a + // truncated protocol banner and exit with code 255. BufReader ensures + // any bytes read past the `\r\n\r\n` header boundary stay buffered and + // are returned by subsequent reads during the bidirectional copy phase. + buf_stream = BufReader::new(stream); + let status = read_connect_status(&mut buf_stream).await?; + if status == 200 { + break; + } + if status == 412 && start.elapsed() < MAX_CONNECT_WAIT { + tracing::debug!( + elapsed = ?start.elapsed(), + "sandbox not yet ready (HTTP 412), retrying in {backoff:?}" + ); + tokio::time::sleep(backoff).await; + backoff = (backoff * 2).min(std::time::Duration::from_secs(8)); + continue; + } return Err(miette::miette!( "gateway CONNECT failed with status {status}" )); From 46a1bcf57f1f135a467c0052b17a8217d1be6c8b Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Fri, 20 Mar 2026 07:11:00 -0700 Subject: [PATCH 7/7] fix: resolve compile warnings across workspace - Remove duplicate Duration import and use unqualified Duration in ssh.rs - Prefix unused default_image parameter with underscore in sandbox/mod.rs - Make SecretResolver pub to match its use in pub function signature --- crates/openshell-cli/src/ssh.rs | 6 +++--- crates/openshell-sandbox/src/secrets.rs | 2 +- 
crates/openshell-server/src/sandbox/mod.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/openshell-cli/src/ssh.rs b/crates/openshell-cli/src/ssh.rs index 369f03ac..44b48da7 100644 --- a/crates/openshell-cli/src/ssh.rs +++ b/crates/openshell-cli/src/ssh.rs @@ -701,8 +701,8 @@ pub async fn sandbox_ssh_proxy( // exists but hasn't reached Ready phase yet. This is a transient state // after sandbox allocation — retry with backoff instead of failing // immediately. - const MAX_CONNECT_WAIT: std::time::Duration = std::time::Duration::from_secs(60); - const INITIAL_BACKOFF: std::time::Duration = std::time::Duration::from_secs(1); + const MAX_CONNECT_WAIT: Duration = Duration::from_secs(60); + const INITIAL_BACKOFF: Duration = Duration::from_secs(1); let start = std::time::Instant::now(); let mut backoff = INITIAL_BACKOFF; @@ -734,7 +734,7 @@ pub async fn sandbox_ssh_proxy( "sandbox not yet ready (HTTP 412), retrying in {backoff:?}" ); tokio::time::sleep(backoff).await; - backoff = (backoff * 2).min(std::time::Duration::from_secs(8)); + backoff = (backoff * 2).min(Duration::from_secs(8)); continue; } return Err(miette::miette!( diff --git a/crates/openshell-sandbox/src/secrets.rs b/crates/openshell-sandbox/src/secrets.rs index 4ee1ee84..002eab55 100644 --- a/crates/openshell-sandbox/src/secrets.rs +++ b/crates/openshell-sandbox/src/secrets.rs @@ -6,7 +6,7 @@ use std::collections::HashMap; const PLACEHOLDER_PREFIX: &str = "openshell:resolve:env:"; #[derive(Debug, Clone, Default)] -pub(crate) struct SecretResolver { +pub struct SecretResolver { by_placeholder: HashMap, } diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs index bb943c12..754a9d95 100644 --- a/crates/openshell-server/src/sandbox/mod.rs +++ b/crates/openshell-server/src/sandbox/mod.rs @@ -1011,7 +1011,7 @@ fn inject_pod_template( mut pod_template: serde_json::Value, template: &SandboxTemplate, gpu: bool, - default_image: &str, + 
_default_image: &str, image_pull_policy: &str, sandbox_id: &str, sandbox_name: &str,