diff --git a/crates/openshell-bootstrap/src/constants.rs b/crates/openshell-bootstrap/src/constants.rs
index ff283b3e..74e381fd 100644
--- a/crates/openshell-bootstrap/src/constants.rs
+++ b/crates/openshell-bootstrap/src/constants.rs
@@ -11,6 +11,8 @@ pub const SERVER_TLS_SECRET_NAME: &str = "openshell-server-tls";
 pub const SERVER_CLIENT_CA_SECRET_NAME: &str = "openshell-server-client-ca";
 /// K8s secret holding the client TLS certificate, key, and CA cert (shared by CLI and sandboxes).
 pub const CLIENT_TLS_SECRET_NAME: &str = "openshell-client-tls";
+/// K8s secret holding the SSH handshake HMAC secret (shared by gateway and sandbox pods).
+pub const SSH_HANDSHAKE_SECRET_NAME: &str = "openshell-ssh-handshake";
 
 pub fn container_name(name: &str) -> String {
     format!("openshell-cluster-{name}")
diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs
index 9c365bfe..ae88c49b 100644
--- a/crates/openshell-bootstrap/src/docker.rs
+++ b/crates/openshell-bootstrap/src/docker.rs
@@ -8,8 +8,9 @@ use bollard::API_DEFAULT_VERSION;
 use bollard::Docker;
 use bollard::errors::Error as BollardError;
 use bollard::models::{
-    ContainerCreateBody, DeviceRequest, HostConfig, HostConfigCgroupnsModeEnum,
-    NetworkCreateRequest, NetworkDisconnectRequest, PortBinding, VolumeCreateRequest,
+    ContainerCreateBody, DeviceRequest, EndpointSettings, HostConfig, HostConfigCgroupnsModeEnum,
+    NetworkConnectRequest, NetworkCreateRequest, NetworkDisconnectRequest, PortBinding,
+    RestartPolicy, RestartPolicyNameEnum, VolumeCreateRequest,
 };
 use bollard::query_parameters::{
     CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions,
@@ -482,6 +483,17 @@ pub async fn ensure_container(
             };
 
             if image_matches {
+                // The container exists with the correct image, but its network
+                // attachment may be stale. When the gateway is resumed after a
+                // container kill, `ensure_network` destroys and recreates the
+                // Docker network (giving it a new ID). The stopped container
+                // still references the old network ID, so `docker start` would
+                // fail with "network <old-id> not found".
+                //
+                // Fix: disconnect from any existing networks and reconnect to
+                // the current (just-created) network before returning.
+                let expected_net = network_name(name);
+                reconcile_container_network(docker, &container_name, &expected_net).await?;
                 return Ok(());
             }
 
@@ -532,6 +544,12 @@ pub async fn ensure_container(
         port_bindings: Some(port_bindings),
         binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]),
         network_mode: Some(network_name(name)),
+        // Automatically restart the container when Docker restarts, unless the
+        // user explicitly stopped it with `gateway stop`.
+        restart_policy: Some(RestartPolicy {
+            name: Some(RestartPolicyNameEnum::UNLESS_STOPPED),
+            maximum_retry_count: None,
+        }),
         // Add host gateway aliases for DNS resolution.
         // This allows both the entrypoint script and the running gateway
         // process to reach services on the Docker host.
@@ -919,6 +937,48 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<()
     Ok(())
 }
 
+/// Clean up the gateway container and network, preserving the persistent volume.
+///
+/// Used when a resume attempt fails — we want to remove the container we may
+/// have just created but keep the volume so the user can retry without losing
+/// their k3s/etcd state and sandbox data.
+pub async fn cleanup_gateway_container(docker: &Docker, name: &str) -> Result<()> {
+    let container_name = container_name(name);
+    let net_name = network_name(name);
+
+    // Disconnect container from network
+    let _ = docker
+        .disconnect_network(
+            &net_name,
+            NetworkDisconnectRequest {
+                container: container_name.clone(),
+                force: Some(true),
+            },
+        )
+        .await;
+
+    let _ = stop_container(docker, &container_name).await;
+
+    let remove_container = docker
+        .remove_container(
+            &container_name,
+            Some(RemoveContainerOptions {
+                force: true,
+                ..Default::default()
+            }),
+        )
+        .await;
+    if let Err(err) = remove_container
+        && !is_not_found(&err)
+    {
+        return Err(err).into_diagnostic();
+    }
+
+    force_remove_network(docker, &net_name).await?;
+
+    Ok(())
+}
+
 /// Forcefully remove a Docker network, disconnecting any remaining
 /// containers first. This ensures that stale Docker network endpoints
 /// cannot prevent port bindings from being released.
@@ -956,6 +1016,71 @@ async fn force_remove_network(docker: &Docker, net_name: &str) -> Result<()> {
     }
 }
 
+/// Ensure a stopped container is connected to the expected Docker network.
+///
+/// When a gateway is resumed after the container was killed (but not removed),
+/// `ensure_network` destroys and recreates the network with a new ID. The
+/// stopped container still holds a reference to the old network ID in its
+/// config, so `docker start` would fail with a 404 "network not found" error.
+///
+/// This function disconnects the container from any networks that no longer
+/// match the expected network name and connects it to the correct one.
+async fn reconcile_container_network(
+    docker: &Docker,
+    container_name: &str,
+    expected_network: &str,
+) -> Result<()> {
+    let info = docker
+        .inspect_container(container_name, None::<InspectContainerOptions>)
+        .await
+        .into_diagnostic()
+        .wrap_err("failed to inspect container for network reconciliation")?;
+
+    // Check the container's current network attachments via NetworkSettings.
+    let attached_networks: Vec<String> = info
+        .network_settings
+        .as_ref()
+        .and_then(|ns| ns.networks.as_ref())
+        .map(|nets| nets.keys().cloned().collect())
+        .unwrap_or_default();
+
+    // If the container is already attached to the expected network (by name),
+    // Docker will resolve the name to the current network ID on start.
+    // However, when the network was destroyed and recreated, the container's
+    // stored endpoint references the old ID. Disconnect and reconnect to
+    // pick up the new network ID.
+    for net_name in &attached_networks {
+        let _ = docker
+            .disconnect_network(
+                net_name,
+                NetworkDisconnectRequest {
+                    container: container_name.to_string(),
+                    force: Some(true),
+                },
+            )
+            .await;
+    }
+
+    // Connect to the (freshly created) expected network.
+    docker
+        .connect_network(
+            expected_network,
+            NetworkConnectRequest {
+                container: container_name.to_string(),
+                endpoint_config: Some(EndpointSettings::default()),
+            },
+        )
+        .await
+        .into_diagnostic()
+        .wrap_err("failed to connect container to gateway network")?;
+
+    tracing::debug!(
+        "Reconciled network for container {container_name}: disconnected from {attached_networks:?}, connected to {expected_network}"
+    );
+
+    Ok(())
+}
+
 fn is_not_found(err: &BollardError) -> bool {
     matches!(
         err,
diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs
index 9098fd4a..22e417fa 100644
--- a/crates/openshell-bootstrap/src/lib.rs
+++ b/crates/openshell-bootstrap/src/lib.rs
@@ -26,12 +26,13 @@ use miette::{IntoDiagnostic, Result};
 use std::sync::{Arc, Mutex};
 
 use crate::constants::{
-    CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME, network_name,
-    volume_name,
+    CLIENT_TLS_SECRET_NAME, SERVER_CLIENT_CA_SECRET_NAME, SERVER_TLS_SECRET_NAME,
+    SSH_HANDSHAKE_SECRET_NAME, network_name, volume_name,
 };
 use crate::docker::{
-    check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container,
-    ensure_image, ensure_network, ensure_volume, start_container, stop_container,
+    check_existing_gateway, check_port_conflicts, cleanup_gateway_container,
+    destroy_gateway_resources, ensure_container, ensure_image, ensure_network, ensure_volume,
+    start_container, stop_container,
 };
 use crate::metadata::{
     create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host,
@@ -288,19 +289,22 @@ where
         (preflight.docker, None)
     };
 
-    // If an existing gateway is found, either tear it down (when recreate is
-    // requested) or bail out so the caller can prompt the user / reuse it.
+    // If an existing gateway is found, decide how to proceed:
+    // - recreate: destroy everything and start fresh
+    // - otherwise: auto-resume from existing state (the ensure_* calls are
+    //   idempotent and will reuse the volume, create a container if needed,
+    //   and start it)
+    let mut resume = false;
     if let Some(existing) = check_existing_gateway(&target_docker, &name).await? {
         if recreate {
             log("[status] Removing existing gateway".to_string());
             destroy_gateway_resources(&target_docker, &name).await?;
+        } else if existing.container_running {
+            log("[status] Gateway is already running".to_string());
+            resume = true;
         } else {
-            return Err(miette::miette!(
-                "Gateway '{name}' already exists (container_running={}).\n\
-                 Use --recreate to destroy and redeploy, or destroy it first with:\n\n    \
-                 openshell gateway destroy --name {name}",
-                existing.container_running,
-            ));
+            log("[status] Resuming gateway from existing state".to_string());
+            resume = true;
         }
     }
 
@@ -455,6 +459,11 @@ where
 
         store_pki_bundle(&name, &pki_bundle)?;
 
+        // Reconcile SSH handshake secret: reuse existing K8s secret if present,
+        // generate and persist a new one otherwise. This secret is stored in etcd
+        // (on the persistent volume) so it survives container restarts.
+        reconcile_ssh_handshake_secret(&target_docker, &name, &log).await?;
+
         // Push locally-built component images into the k3s containerd runtime.
         // This is the "push" path for local development — images are exported from
         // the local Docker daemon and streamed into the cluster's containerd so
@@ -524,15 +533,30 @@ where
             docker: target_docker,
         }),
         Err(deploy_err) => {
-            // Automatically clean up Docker resources (volume, container, network,
-            // image) so the environment is left in a retryable state.
-            tracing::info!("deploy failed, cleaning up gateway resources for '{name}'");
-            if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await {
-                tracing::warn!(
-                    "automatic cleanup after failed deploy also failed: {cleanup_err}. \
-                     Manual cleanup may be required: \
-                     openshell gateway destroy --name {name}"
+            if resume {
+                // When resuming, preserve the volume so the user can retry.
+                // Only clean up the container and network that we may have created.
+                tracing::info!(
+                    "resume failed, cleaning up container for '{name}' (preserving volume)"
                 );
+                if let Err(cleanup_err) = cleanup_gateway_container(&target_docker, &name).await {
+                    tracing::warn!(
+                        "automatic cleanup after failed resume also failed: {cleanup_err}. \
+                         Manual cleanup may be required: \
+                         openshell gateway destroy --name {name}"
+                    );
+                }
+            } else {
+                // Automatically clean up Docker resources (volume, container, network,
+                // image) so the environment is left in a retryable state.
+                tracing::info!("deploy failed, cleaning up gateway resources for '{name}'");
+                if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await {
+                    tracing::warn!(
+                        "automatic cleanup after failed deploy also failed: {cleanup_err}. \
+                         Manual cleanup may be required: \
+                         openshell gateway destroy --name {name}"
+                    );
+                }
             }
             Err(deploy_err)
         }
@@ -809,6 +833,14 @@ where
     let cname = container_name(name);
     let kubeconfig = constants::KUBECONFIG_PATH;
 
+    // Wait for the k3s API server and openshell namespace before attempting
+    // to read secrets. Without this, kubectl fails transiently on resume
+    // (k3s hasn't booted yet), the code assumes secrets are gone, and
+    // regenerates PKI unnecessarily — triggering a server rollout restart
+    // and TLS errors for in-flight connections.
+    log("[progress] Waiting for openshell namespace".to_string());
+    wait_for_namespace(docker, &cname, kubeconfig, "openshell").await?;
+
     // Try to load existing secrets.
     match load_existing_pki_bundle(docker, &cname, kubeconfig).await {
         Ok(bundle) => {
@@ -823,10 +855,6 @@ where
     }
 
     // Generate fresh PKI and apply to cluster.
-    // Namespace may still be creating on first bootstrap, so wait here only
-    // when rotation is actually needed.
-    log("[progress] Waiting for openshell namespace".to_string());
-    wait_for_namespace(docker, &cname, kubeconfig, "openshell").await?;
     log("[progress] Generating TLS certificates".to_string());
     let bundle = generate_pki(extra_sans)?;
     log("[progress] Applying TLS secrets to gateway".to_string());
@@ -837,6 +865,72 @@ where
     Ok((bundle, true))
 }
 
+/// Reconcile the SSH handshake HMAC secret as a Kubernetes Secret.
+///
+/// If the secret already exists in the cluster, this is a no-op. Otherwise a
+/// fresh 32-byte hex secret is generated and applied. Because the secret lives
+/// in etcd (backed by the persistent Docker volume), it survives container
+/// restarts without regeneration — existing sandbox SSH sessions remain valid.
+async fn reconcile_ssh_handshake_secret<F>(docker: &Docker, name: &str, log: &F) -> Result<()>
+where
+    F: Fn(String) + Sync,
+{
+    use miette::WrapErr;
+
+    let cname = container_name(name);
+    let kubeconfig = constants::KUBECONFIG_PATH;
+
+    // Check if the secret already exists.
+    let (output, exit_code) = exec_capture_with_exit(
+        docker,
+        &cname,
+        vec![
+            "sh".to_string(),
+            "-c".to_string(),
+            format!(
+                "KUBECONFIG={kubeconfig} kubectl -n openshell get secret {SSH_HANDSHAKE_SECRET_NAME} -o jsonpath='{{.data.secret}}' 2>/dev/null"
+            ),
+        ],
+    )
+    .await?;
+
+    if exit_code == 0 && !output.trim().is_empty() {
+        tracing::debug!(
+            "existing SSH handshake secret found ({} bytes encoded)",
+            output.trim().len()
+        );
+        log("[progress] Reusing existing SSH handshake secret".to_string());
+        return Ok(());
+    }
+
+    // Generate a new 32-byte hex secret and create the K8s secret.
+    log("[progress] Generating SSH handshake secret".to_string());
+    let (output, exit_code) = exec_capture_with_exit(
+        docker,
+        &cname,
+        vec![
+            "sh".to_string(),
+            "-c".to_string(),
+            format!(
+                "SECRET=$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \\n') && \
+                 KUBECONFIG={kubeconfig} kubectl -n openshell create secret generic {SSH_HANDSHAKE_SECRET_NAME} \
+                 --from-literal=secret=$SECRET --dry-run=client -o yaml | \
+                 KUBECONFIG={kubeconfig} kubectl apply -f -"
+            ),
+        ],
+    )
+    .await?;
+
+    if exit_code != 0 {
+        return Err(miette::miette!(
+            "failed to create SSH handshake secret (exit {exit_code}): {output}"
+        ))
+        .wrap_err("failed to apply SSH handshake secret");
+    }
+
+    Ok(())
+}
+
 /// Load existing TLS secrets from the cluster and reconstruct a [`PkiBundle`].
 ///
 /// Returns an error string describing why secrets couldn't be loaded (for logging).
diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs
index e976061f..6a6e4d25 100644
--- a/crates/openshell-cli/src/bootstrap.rs
+++ b/crates/openshell-cli/src/bootstrap.rs
@@ -144,43 +144,58 @@ pub async fn run_bootstrap(
     );
     eprintln!();
 
-    // Auto-bootstrap always recreates if stale Docker resources are found
-    // (e.g. metadata was deleted but container/volume still exist).
-    let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name).with_recreate(true);
-    if let Some(dest) = remote {
-        let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest);
-        if let Some(key) = ssh_key {
-            remote_opts = remote_opts.with_ssh_key(key);
+    // Build deploy options. The deploy flow auto-resumes from existing state
+    // (preserving sandboxes and secrets) when it finds an existing gateway.
+    // If the initial attempt fails, fall back to a full recreate.
+    let build_options = |recreate: bool| {
+        let mut opts = openshell_bootstrap::DeployOptions::new(&gateway_name)
+            .with_recreate(recreate)
+            .with_gpu(gpu);
+        if let Some(dest) = remote {
+            let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest);
+            if let Some(key) = ssh_key {
+                remote_opts = remote_opts.with_ssh_key(key);
+            }
+            opts = opts.with_remote(remote_opts);
         }
-        options = options.with_remote(remote_opts);
-    }
-    // Read registry credentials from environment for the auto-bootstrap path.
-    // The explicit `--registry-username` / `--registry-token` flags are only
-    // on `gateway start`; when bootstrapping via `sandbox create`, the env
-    // vars are the mechanism.
-    if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME")
-        && !username.trim().is_empty()
-    {
-        options = options.with_registry_username(username);
-    }
-    if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN")
-        && !token.trim().is_empty()
-    {
-        options = options.with_registry_token(token);
-    }
-    // Read gateway host override from environment. Needed whenever the
-    // client cannot reach the Docker host at 127.0.0.1 — CI containers,
-    // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag
-    // is only on `gateway start`; this env var covers the auto-bootstrap
-    // path triggered by `sandbox create`.
-    if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST")
-        && !host.trim().is_empty()
-    {
-        options = options.with_gateway_host(host);
-    }
-    options = options.with_gpu(gpu);
+        // Read registry credentials from environment for the auto-bootstrap path.
+        // The explicit `--registry-username` / `--registry-token` flags are only
+        // on `gateway start`; when bootstrapping via `sandbox create`, the env
+        // vars are the mechanism.
+        if let Ok(username) = std::env::var("OPENSHELL_REGISTRY_USERNAME")
+            && !username.trim().is_empty()
+        {
+            opts = opts.with_registry_username(username);
+        }
+        if let Ok(token) = std::env::var("OPENSHELL_REGISTRY_TOKEN")
+            && !token.trim().is_empty()
+        {
+            opts = opts.with_registry_token(token);
+        }
+        // Read gateway host override from environment. Needed whenever the
+        // client cannot reach the Docker host at 127.0.0.1 — CI containers,
+        // WSL, remote Docker hosts, etc. The explicit `--gateway-host` flag
+        // is only on `gateway start`; this env var covers the auto-bootstrap
+        // path triggered by `sandbox create`.
+        if let Ok(host) = std::env::var("OPENSHELL_GATEWAY_HOST")
+            && !host.trim().is_empty()
+        {
+            opts = opts.with_gateway_host(host);
+        }
+        opts
+    };
 
-    let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?;
+    // Deploy the gateway. The deploy flow auto-resumes from existing state
+    // when it finds one. If that fails, fall back to a full recreate.
+    let handle = match deploy_gateway_with_panel(build_options(false), &gateway_name, location)
+        .await
+    {
+        Ok(handle) => handle,
+        Err(resume_err) => {
+            tracing::warn!("auto-bootstrap resume failed, falling back to recreate: {resume_err}");
+            deploy_gateway_with_panel(build_options(true), &gateway_name, location).await?
+        }
+    };
     let server = handle.gateway_endpoint().to_string();
 
     print_deploy_summary(&gateway_name, &handle);
@@ -207,7 +222,7 @@ pub async fn run_bootstrap(
 /// Retry connecting to the gateway gRPC endpoint until it succeeds or a
 /// timeout is reached. Uses exponential backoff starting at 500 ms, doubling
 /// up to 4 s, with a total deadline of 30 s.
-async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> {
+pub(crate) async fn wait_for_grpc_ready(server: &str, tls: &TlsOptions) -> Result<()> {
     const MAX_WAIT: Duration = Duration::from_secs(30);
     const INITIAL_BACKOFF: Duration = Duration::from_millis(500);
 
diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs
index 2f9dd2f7..366cb691 100644
--- a/crates/openshell-cli/src/run.rs
+++ b/crates/openshell-cli/src/run.rs
@@ -1367,47 +1367,16 @@ pub async fn gateway_admin_deploy(
         opts
     });
 
-    // Check whether a gateway already exists. If so, prompt the user (unless
-    // --recreate was passed or we're in non-interactive mode).
-    let mut should_recreate = recreate;
-    if let Some(existing) =
-        openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await?
-    {
-        if !should_recreate {
-            let interactive = std::io::stdin().is_terminal() && std::io::stderr().is_terminal();
-            if interactive {
-                let status = if existing.container_running {
-                    "running"
-                } else if existing.container_exists {
-                    "stopped"
-                } else {
-                    "volume only"
-                };
-                eprintln!();
+    // If the gateway is already running and we're not recreating, short-circuit.
+    if !recreate {
+        if let Some(existing) =
+            openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await?
+        {
+            if existing.container_running {
                 eprintln!(
-                    "{} Gateway '{name}' already exists ({status}).",
-                    "!".yellow().bold()
+                    "{} Gateway '{name}' is already running.",
+                    "✓".green().bold()
                 );
-                if let Some(image) = &existing.container_image {
-                    eprintln!("  {} {}", "Image:".dimmed(), image);
-                }
-                eprintln!();
-                eprint!("Destroy and recreate? [y/N] ");
-                std::io::stderr().flush().ok();
-                let mut input = String::new();
-                std::io::stdin()
-                    .read_line(&mut input)
-                    .into_diagnostic()
-                    .wrap_err("failed to read user input")?;
-                let choice = input.trim().to_lowercase();
-                should_recreate = choice == "y" || choice == "yes";
-                if !should_recreate {
-                    eprintln!("Keeping existing gateway.");
-                    return Ok(());
-                }
-            } else {
-                // Non-interactive mode: reuse existing gateway silently.
-                eprintln!("Gateway '{name}' already exists, reusing.");
                 return Ok(());
             }
         }
@@ -1418,7 +1387,7 @@ pub async fn gateway_admin_deploy(
         .with_disable_tls(disable_tls)
         .with_disable_gateway_auth(disable_gateway_auth)
         .with_gpu(gpu)
-        .with_recreate(should_recreate);
+        .with_recreate(recreate);
     if let Some(opts) = remote_opts {
         options = options.with_remote(opts);
     }
@@ -1434,6 +1403,15 @@ pub async fn gateway_admin_deploy(
 
     let handle = deploy_gateway_with_panel(options, name, location).await?;
 
+    // Wait for the gRPC endpoint to actually accept connections before
+    // declaring the gateway ready. The Docker health check may pass before
+    // the gRPC listener inside the pod is fully bound.
+    let server = handle.gateway_endpoint().to_string();
+    let tls = TlsOptions::default()
+        .with_gateway_name(name)
+        .with_default_paths(&server);
+    crate::bootstrap::wait_for_grpc_ready(&server, &tls).await?;
+
     print_deploy_summary(name, &handle);
 
     // Auto-activate: set this gateway as the active gateway.
diff --git a/crates/openshell-cli/src/ssh.rs b/crates/openshell-cli/src/ssh.rs
index 4b284bff..44b48da7 100644
--- a/crates/openshell-cli/src/ssh.rs
+++ b/crates/openshell-cli/src/ssh.rs
@@ -693,27 +693,50 @@ pub async fn sandbox_ssh_proxy(
         .ok_or_else(|| miette::miette!("gateway URL missing port"))?;
     let connect_path = url.path();
 
-    let mut stream: Box<dyn ProxyStream> =
-        connect_gateway(scheme, gateway_host, gateway_port, tls).await?;
-
     let request = format!(
         "CONNECT {connect_path} HTTP/1.1\r\nHost: {gateway_host}\r\nX-Sandbox-Id: {sandbox_id}\r\nX-Sandbox-Token: {token}\r\n\r\n"
     );
-    stream
-        .write_all(request.as_bytes())
-        .await
-        .into_diagnostic()?;
 
-    // Wrap in a BufReader **before** reading the HTTP response.  The gateway
-    // may send the 200 OK response and the first SSH protocol bytes in the
-    // same TCP segment / WebSocket frame.  A plain `read()` would consume
-    // those SSH bytes into our buffer and discard them, causing SSH to see a
-    // truncated protocol banner and exit with code 255.  BufReader ensures
-    // any bytes read past the `\r\n\r\n` header boundary stay buffered and
-    // are returned by subsequent reads during the bidirectional copy phase.
-    let mut buf_stream = BufReader::new(stream);
-    let status = read_connect_status(&mut buf_stream).await?;
-    if status != 200 {
+    // The gateway returns 412 (Precondition Failed) when the sandbox pod
+    // exists but hasn't reached Ready phase yet. This is a transient state
+    // after sandbox allocation — retry with backoff instead of failing
+    // immediately.
+    const MAX_CONNECT_WAIT: Duration = Duration::from_secs(60);
+    const INITIAL_BACKOFF: Duration = Duration::from_secs(1);
+
+    let start = std::time::Instant::now();
+    let mut backoff = INITIAL_BACKOFF;
+    let mut buf_stream;
+
+    loop {
+        let mut stream: Box<dyn ProxyStream> =
+            connect_gateway(scheme, gateway_host, gateway_port, tls).await?;
+        stream
+            .write_all(request.as_bytes())
+            .await
+            .into_diagnostic()?;
+
+        // Wrap in a BufReader **before** reading the HTTP response.  The gateway
+        // may send the 200 OK response and the first SSH protocol bytes in the
+        // same TCP segment / WebSocket frame.  A plain `read()` would consume
+        // those SSH bytes into our buffer and discard them, causing SSH to see a
+        // truncated protocol banner and exit with code 255.  BufReader ensures
+        // any bytes read past the `\r\n\r\n` header boundary stay buffered and
+        // are returned by subsequent reads during the bidirectional copy phase.
+        buf_stream = BufReader::new(stream);
+        let status = read_connect_status(&mut buf_stream).await?;
+        if status == 200 {
+            break;
+        }
+        if status == 412 && start.elapsed() < MAX_CONNECT_WAIT {
+            tracing::debug!(
+                elapsed = ?start.elapsed(),
+                "sandbox not yet ready (HTTP 412), retrying in {backoff:?}"
+            );
+            tokio::time::sleep(backoff).await;
+            backoff = (backoff * 2).min(Duration::from_secs(8));
+            continue;
+        }
         return Err(miette::miette!(
             "gateway CONNECT failed with status {status}"
         ));
diff --git a/crates/openshell-sandbox/src/secrets.rs b/crates/openshell-sandbox/src/secrets.rs
index 4ee1ee84..002eab55 100644
--- a/crates/openshell-sandbox/src/secrets.rs
+++ b/crates/openshell-sandbox/src/secrets.rs
@@ -6,7 +6,7 @@ use std::collections::HashMap;
 const PLACEHOLDER_PREFIX: &str = "openshell:resolve:env:";
 
 #[derive(Debug, Clone, Default)]
-pub(crate) struct SecretResolver {
+pub struct SecretResolver {
     by_placeholder: HashMap<String, String>,
 }
 
diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs
index bb943c12..754a9d95 100644
--- a/crates/openshell-server/src/sandbox/mod.rs
+++ b/crates/openshell-server/src/sandbox/mod.rs
@@ -1011,7 +1011,7 @@ fn inject_pod_template(
     mut pod_template: serde_json::Value,
     template: &SandboxTemplate,
     gpu: bool,
-    default_image: &str,
+    _default_image: &str,
     image_pull_policy: &str,
     sandbox_id: &str,
     sandbox_name: &str,
diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh
index 84b8cf9a..3b069b7b 100644
--- a/deploy/docker/cluster-entrypoint.sh
+++ b/deploy/docker/cluster-entrypoint.sh
@@ -402,10 +402,10 @@ if [ -n "${IMAGE_PULL_POLICY:-}" ] && [ -f "$HELMCHART" ]; then
     sed -i "s|pullPolicy: Always|pullPolicy: ${IMAGE_PULL_POLICY}|" "$HELMCHART"
 fi
 
-# Generate a random SSH handshake secret for the NSSH1 HMAC handshake between
-# the gateway and sandbox SSH servers. This is required — the server will refuse
-# to start without it.
-SSH_HANDSHAKE_SECRET="${SSH_HANDSHAKE_SECRET:-$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \n')}"
+# SSH handshake secret: previously generated here and injected via sed into the
+# HelmChart CR. Now persisted as a Kubernetes Secret (openshell-ssh-handshake)
+# created by the bootstrap process after k3s starts. This ensures the secret
+# survives container restarts without regeneration.
 
 # Inject SSH gateway host/port into the HelmChart manifest so the openshell
 # server returns the correct address to CLI clients for SSH proxy CONNECT.
@@ -424,9 +424,6 @@ if [ -f "$HELMCHART" ]; then
         # Clear the placeholder so the default (8080) is used
         sed -i "s|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g" "$HELMCHART"
     fi
-    echo "Setting SSH handshake secret"
-    sed -i "s|__SSH_HANDSHAKE_SECRET__|${SSH_HANDSHAKE_SECRET}|g" "$HELMCHART"
-
     # Disable gateway auth: when set, the server accepts connections without
     # client certificates (for reverse-proxy / Cloudflare Tunnel deployments).
     if [ "${DISABLE_GATEWAY_AUTH:-}" = "true" ]; then
diff --git a/deploy/docker/cluster-healthcheck.sh b/deploy/docker/cluster-healthcheck.sh
index 68210b45..1bf76f71 100644
--- a/deploy/docker/cluster-healthcheck.sh
+++ b/deploy/docker/cluster-healthcheck.sh
@@ -68,3 +68,6 @@ if [ "${DISABLE_TLS:-}" != "true" ]; then
     kubectl -n openshell get secret openshell-server-tls >/dev/null 2>&1 || exit 1
     kubectl -n openshell get secret openshell-client-tls >/dev/null 2>&1 || exit 1
 fi
+
+# Verify SSH handshake secret exists (created by openshell-bootstrap alongside TLS secrets)
+kubectl -n openshell get secret openshell-ssh-handshake >/dev/null 2>&1 || exit 1
diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml
index 1be8f14a..ed503a78 100644
--- a/deploy/helm/openshell/templates/statefulset.yaml
+++ b/deploy/helm/openshell/templates/statefulset.yaml
@@ -77,7 +77,10 @@ spec:
               value: {{ .Values.server.hostGatewayIP | quote }}
             {{- end }}
             - name: OPENSHELL_SSH_HANDSHAKE_SECRET
-              value: {{ required "server.sshHandshakeSecret is required" .Values.server.sshHandshakeSecret | quote }}
+              valueFrom:
+                secretKeyRef:
+                  name: {{ .Values.server.sshHandshakeSecretName | quote }}
+                  key: secret
             {{- if .Values.server.disableTls }}
             - name: OPENSHELL_DISABLE_TLS
               value: "true"
diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml
index ccc8d1ff..d698e812 100644
--- a/deploy/helm/openshell/values.yaml
+++ b/deploy/helm/openshell/values.yaml
@@ -85,10 +85,10 @@ server:
   sshGatewayPort: 0
   # TLS configuration for the server. The server always terminates mTLS
   # directly and requires client certificates.
-  # HMAC secret used for the NSSH1 handshake between gateway and sandbox SSH.
-  # Required — the server will refuse to start if empty. For cluster deployments
-  # this is auto-generated by the entrypoint script.
-  sshHandshakeSecret: ""
+  # Name of the Kubernetes Secret holding the NSSH1 HMAC handshake key.
+  # The secret must contain a `secret` key with the hex-encoded HMAC key.
+  # For cluster deployments this is auto-created by the bootstrap process.
+  sshHandshakeSecretName: "openshell-ssh-handshake"
   # Host gateway IP for sandbox pod hostAliases. When set, sandbox pods get
   # hostAliases entries mapping host.docker.internal and host.openshell.internal
   # to this IP, allowing them to reach services running on the Docker host.
diff --git a/deploy/kube/manifests/openshell-helmchart.yaml b/deploy/kube/manifests/openshell-helmchart.yaml
index 2245c72e..ae22ddc6 100644
--- a/deploy/kube/manifests/openshell-helmchart.yaml
+++ b/deploy/kube/manifests/openshell-helmchart.yaml
@@ -32,7 +32,6 @@ spec:
       sandboxImage: ghcr.io/nvidia/openshell-community/sandboxes/base:latest
       sshGatewayHost: __SSH_GATEWAY_HOST__
       sshGatewayPort: __SSH_GATEWAY_PORT__
-      sshHandshakeSecret: __SSH_HANDSHAKE_SECRET__
       grpcEndpoint: "https://openshell.openshell.svc.cluster.local:8080"
       hostGatewayIP: __HOST_GATEWAY_IP__
       disableGatewayAuth: __DISABLE_GATEWAY_AUTH__
diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs
new file mode 100644
index 00000000..67d28d19
--- /dev/null
+++ b/e2e/rust/tests/gateway_resume.rs
@@ -0,0 +1,434 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#![cfg(feature = "e2e")]
+
+//! E2E tests for gateway resume from existing state.
+//!
+//! These tests verify that `openshell gateway start` resumes from existing
+//! Docker volume state (after stop or container removal) and that the SSH
+//! handshake secret persists across container restarts.
+//!
+//! **Requires a running gateway** — the `e2e:rust` mise task bootstraps one.
+
+use std::process::{Command, Stdio};
+use std::time::Duration;
+
+use openshell_e2e::harness::binary::openshell_cmd;
+use openshell_e2e::harness::output::strip_ansi;
+use tokio::time::sleep;
+
+/// Default gateway name used by the e2e cluster.
+const GATEWAY_NAME: &str = "openshell";
+
+/// Docker container name for the default gateway.
+fn container_name() -> String {
+    format!("openshell-cluster-{GATEWAY_NAME}")
+}
+
+/// Run `openshell <args>` and return (combined output, exit code).
+async fn run_cli(args: &[&str]) -> (String, i32) {
+    let mut cmd = openshell_cmd();
+    cmd.args(args)
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped());
+
+    let output = cmd.output().await.expect("spawn openshell");
+    let stdout = String::from_utf8_lossy(&output.stdout).to_string();
+    let stderr = String::from_utf8_lossy(&output.stderr).to_string();
+    let combined = format!("{stdout}{stderr}");
+    let code = output.status.code().unwrap_or(-1);
+    (combined, code)
+}
+
+/// Run `docker <args>` synchronously and return (stdout, exit code).
+fn docker_cmd(args: &[&str]) -> (String, i32) {
+    let output = Command::new("docker")
+        .args(args)
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        .output()
+        .expect("spawn docker");
+    let stdout = String::from_utf8_lossy(&output.stdout).to_string();
+    let code = output.status.code().unwrap_or(-1);
+    (stdout, code)
+}
+
+/// Wait for the gateway to become healthy by polling `openshell status`.
+async fn wait_for_healthy(timeout: Duration) {
+    let start = std::time::Instant::now();
+    loop {
+        let (output, code) = run_cli(&["status"]).await;
+        let clean = strip_ansi(&output).to_lowercase();
+        if code == 0 && (clean.contains("healthy") || clean.contains("running") || clean.contains("connected") || clean.contains("✓")) {
+            return;
+        }
+        if start.elapsed() > timeout {
+            panic!(
+                "gateway did not become healthy within {}s. Last output:\n{}",
+                timeout.as_secs(),
+                strip_ansi(&output)
+            );
+        }
+        sleep(Duration::from_secs(3)).await;
+    }
+}
+
+/// Read the SSH handshake secret from the K8s secret inside the cluster.
+fn read_ssh_handshake_secret() -> Option<String> {
+    let cname = container_name();
+    let (output, code) = docker_cmd(&[
+        "exec",
+        &cname,
+        "sh",
+        "-c",
+        "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null",
+    ]);
+    if code == 0 && !output.trim().is_empty() {
+        Some(output.trim().to_string())
+    } else {
+        None
+    }
+}
+
+// -------------------------------------------------------------------
+// Test: `gateway start` on an already-running gateway succeeds
+// -------------------------------------------------------------------
+
+/// When the gateway is already running, `openshell gateway start` should
+/// return immediately with exit code 0 and indicate it's already running.
+#[tokio::test]
+async fn gateway_start_on_running_gateway_succeeds() {
+    // Precondition: gateway is running (e2e cluster is up).
+    wait_for_healthy(Duration::from_secs(30)).await;
+
+    let (output, code) = run_cli(&["gateway", "start"]).await;
+    let clean = strip_ansi(&output);
+
+    assert_eq!(
+        code, 0,
+        "gateway start on running gateway should exit 0:\n{clean}"
+    );
+    assert!(
+        clean.to_lowercase().contains("already running"),
+        "output should indicate gateway is already running:\n{clean}"
+    );
+}
+
+// -------------------------------------------------------------------
+// Test: gateway stop → start resumes, sandbox survives
+// -------------------------------------------------------------------
+
+/// After `gateway stop` then `gateway start`, the gateway should resume
+/// from existing state. A sandbox created before the stop should still
+/// appear in the sandbox list after restart.
+#[tokio::test]
+async fn gateway_stop_start_resumes_with_sandbox() {
+    // Precondition: gateway is healthy.
+    wait_for_healthy(Duration::from_secs(30)).await;
+
+    // Create a sandbox that we'll check for after restart.
+    let (create_output, create_code) =
+        run_cli(&["sandbox", "create", "--", "echo", "resume-test"]).await;
+    let clean_create = strip_ansi(&create_output);
+    assert_eq!(
+        create_code, 0,
+        "sandbox create should succeed:\n{clean_create}"
+    );
+
+    // Extract sandbox name from output.
+    let sandbox_name = clean_create
+        .lines()
+        .find_map(|line| {
+            if let Some((_, rest)) = line.split_once("Created sandbox:") {
+                rest.split_whitespace().next().map(ToOwned::to_owned)
+            } else if let Some((_, rest)) = line.split_once("Name:") {
+                rest.split_whitespace().next().map(ToOwned::to_owned)
+            } else {
+                None
+            }
+        })
+        .expect("should extract sandbox name from create output");
+
+    // Stop the gateway.
+    let (stop_output, stop_code) = run_cli(&["gateway", "stop"]).await;
+    assert_eq!(
+        stop_code, 0,
+        "gateway stop should succeed:\n{}",
+        strip_ansi(&stop_output)
+    );
+
+    // Wait a moment for the container to fully stop.
+    sleep(Duration::from_secs(3)).await;
+
+    // Verify container is stopped.
+    let (inspect_out, _) = docker_cmd(&[
+        "inspect",
+        "-f",
+        "{{.State.Running}}",
+        &container_name(),
+    ]);
+    assert_eq!(
+        inspect_out.trim(),
+        "false",
+        "container should be stopped after gateway stop"
+    );
+
+    // Start the gateway again — should resume from existing state.
+    let (start_output, start_code) = run_cli(&["gateway", "start"]).await;
+    let clean_start = strip_ansi(&start_output);
+    assert_eq!(
+        start_code, 0,
+        "gateway start after stop should succeed:\n{clean_start}"
+    );
+
+    // Wait for the gateway to become healthy again.
+    wait_for_healthy(Duration::from_secs(180)).await;
+
+    // Verify the sandbox still exists.
+    let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await;
+    let clean_list = strip_ansi(&list_output);
+    assert_eq!(
+        list_code, 0,
+        "sandbox list should succeed after resume:\n{clean_list}"
+    );
+    assert!(
+        clean_list.contains(&sandbox_name),
+        "sandbox '{sandbox_name}' should survive gateway stop/start.\nList output:\n{clean_list}"
+    );
+
+    // Cleanup: delete the test sandbox.
+    let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await;
+}
+
+// -------------------------------------------------------------------
+// Test: container removed → gateway start resumes
+// -------------------------------------------------------------------
+
+/// After the Docker container is force-removed (simulating Docker restart),
+/// `openshell gateway start` should resume from the existing volume.
+#[tokio::test]
+async fn gateway_start_resumes_after_container_removal() {
+    // Precondition: gateway is healthy.
+    wait_for_healthy(Duration::from_secs(30)).await;
+
+    // Create a sandbox to verify state persistence.
+    let (create_output, create_code) =
+        run_cli(&["sandbox", "create", "--", "echo", "container-rm-test"]).await;
+    let clean_create = strip_ansi(&create_output);
+    assert_eq!(
+        create_code, 0,
+        "sandbox create should succeed:\n{clean_create}"
+    );
+
+    let sandbox_name = clean_create
+        .lines()
+        .find_map(|line| {
+            if let Some((_, rest)) = line.split_once("Created sandbox:") {
+                rest.split_whitespace().next().map(ToOwned::to_owned)
+            } else if let Some((_, rest)) = line.split_once("Name:") {
+                rest.split_whitespace().next().map(ToOwned::to_owned)
+            } else {
+                None
+            }
+        })
+        .expect("should extract sandbox name from create output");
+
+    // Force-remove the container (simulates Docker restart / OOM kill).
+    let (_, rm_code) = docker_cmd(&["rm", "-f", &container_name()]);
+    assert_eq!(rm_code, 0, "docker rm -f should succeed");
+
+    // Verify the volume still exists.
+    let (vol_out, vol_code) = docker_cmd(&[
+        "volume",
+        "inspect",
+        &format!("openshell-cluster-{GATEWAY_NAME}"),
+    ]);
+    assert_eq!(
+        vol_code, 0,
+        "volume should still exist after container removal:\n{vol_out}"
+    );
+
+    // Start the gateway — should resume from the volume.
+    let (start_output, start_code) = run_cli(&["gateway", "start"]).await;
+    let clean_start = strip_ansi(&start_output);
+    assert_eq!(
+        start_code, 0,
+        "gateway start after container removal should succeed:\n{clean_start}"
+    );
+
+    // Wait for healthy.
+    wait_for_healthy(Duration::from_secs(180)).await;
+
+    // Verify sandbox survived.
+    let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await;
+    let clean_list = strip_ansi(&list_output);
+    assert_eq!(
+        list_code, 0,
+        "sandbox list should succeed after resume:\n{clean_list}"
+    );
+    assert!(
+        clean_list.contains(&sandbox_name),
+        "sandbox '{sandbox_name}' should survive container removal + resume.\nList output:\n{clean_list}"
+    );
+
+    // Cleanup.
+    let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await;
+}
+
+// -------------------------------------------------------------------
+// Test: container killed → gateway start resumes, sandboxes survive,
+//       new sandbox create works
+// -------------------------------------------------------------------
+
+/// When a container is killed (stopped but NOT removed), `gateway start`
+/// should resume from existing state. This validates three things:
+///
+/// 1. The stale Docker network reference is reconciled (ensure_network
+///    destroys and recreates the network with a new ID).
+/// 2. Existing sandboxes created before the kill survive the restart.
+/// 3. New `sandbox create` works after resume — the TLS certificates
+///    are reused (not needlessly regenerated), so the CLI's mTLS certs
+///    still match the server.
+#[tokio::test]
+async fn gateway_start_resumes_after_container_kill() {
+    // Precondition: gateway is healthy.
+    wait_for_healthy(Duration::from_secs(30)).await;
+
+    let cname = container_name();
+    let net_name = format!("openshell-cluster-{GATEWAY_NAME}");
+
+    // Create a sandbox before the kill to verify state persistence.
+    let (create_output, create_code) =
+        run_cli(&["sandbox", "create", "--", "echo", "kill-resume-test"]).await;
+    let clean_create = strip_ansi(&create_output);
+    assert_eq!(
+        create_code, 0,
+        "sandbox create should succeed:\n{clean_create}"
+    );
+
+    let sandbox_before = clean_create
+        .lines()
+        .find_map(|line| {
+            if let Some((_, rest)) = line.split_once("Created sandbox:") {
+                rest.split_whitespace().next().map(ToOwned::to_owned)
+            } else if let Some((_, rest)) = line.split_once("Name:") {
+                rest.split_whitespace().next().map(ToOwned::to_owned)
+            } else {
+                None
+            }
+        })
+        .expect("should extract sandbox name from create output");
+
+    // Kill the container (it remains as a stopped container, unlike `docker rm`).
+    let (_, kill_code) = docker_cmd(&["kill", &cname]);
+    assert_eq!(kill_code, 0, "docker kill should succeed");
+
+    sleep(Duration::from_secs(3)).await;
+
+    // Remove the Docker network to simulate a stale network reference.
+    // The bootstrap `ensure_network` always destroys and recreates, so
+    // after this the container's stored network ID will be invalid.
+    let _ = docker_cmd(&["network", "disconnect", "-f", &net_name, &cname]);
+    let (_, net_rm_code) = docker_cmd(&["network", "rm", &net_name]);
+    assert_eq!(
+        net_rm_code, 0,
+        "docker network rm should succeed (or network already gone)"
+    );
+
+    // Start the gateway — must handle stale network + reuse existing PKI.
+    let (start_output, start_code) = run_cli(&["gateway", "start"]).await;
+    let clean_start = strip_ansi(&start_output);
+    assert_eq!(
+        start_code, 0,
+        "gateway start after kill should succeed:\n{clean_start}"
+    );
+
+    // Wait for the gateway to become healthy again.
+    wait_for_healthy(Duration::from_secs(180)).await;
+
+    // Verify the pre-existing sandbox survived.
+    let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await;
+    let clean_list = strip_ansi(&list_output);
+    assert_eq!(
+        list_code, 0,
+        "sandbox list should succeed after resume:\n{clean_list}"
+    );
+    assert!(
+        clean_list.contains(&sandbox_before),
+        "sandbox '{sandbox_before}' should survive container kill + resume.\nList output:\n{clean_list}"
+    );
+
+    // Create a new sandbox to verify TLS is working end-to-end.
+    let (new_create_output, new_create_code) =
+        run_cli(&["sandbox", "create", "--", "echo", "post-resume-test"]).await;
+    let clean_new = strip_ansi(&new_create_output);
+    assert_eq!(
+        new_create_code, 0,
+        "sandbox create after resume should succeed (TLS must work):\n{clean_new}"
+    );
+
+    let sandbox_after = clean_new
+        .lines()
+        .find_map(|line| {
+            if let Some((_, rest)) = line.split_once("Created sandbox:") {
+                rest.split_whitespace().next().map(ToOwned::to_owned)
+            } else if let Some((_, rest)) = line.split_once("Name:") {
+                rest.split_whitespace().next().map(ToOwned::to_owned)
+            } else {
+                None
+            }
+        })
+        .expect("should extract sandbox name from post-resume create output");
+
+    // Cleanup.
+    let _ = run_cli(&["sandbox", "delete", &sandbox_before]).await;
+    let _ = run_cli(&["sandbox", "delete", &sandbox_after]).await;
+}
+
+// -------------------------------------------------------------------
+// Test: SSH handshake secret persists across container restart
+// -------------------------------------------------------------------
+
+/// The SSH handshake K8s secret should persist across gateway stop/start
+/// cycles — the same base64-encoded value should be returned before and
+/// after the restart.
+#[tokio::test]
+async fn ssh_handshake_secret_persists_across_restart() {
+    // Precondition: gateway is healthy.
+    wait_for_healthy(Duration::from_secs(30)).await;
+
+    // Read the SSH handshake secret before restart.
+    let secret_before = read_ssh_handshake_secret()
+        .expect("SSH handshake secret should exist before restart");
+    assert!(
+        !secret_before.is_empty(),
+        "SSH handshake secret should not be empty"
+    );
+
+    // Stop the gateway.
+    let (_, stop_code) = run_cli(&["gateway", "stop"]).await;
+    assert_eq!(stop_code, 0, "gateway stop should succeed");
+
+    sleep(Duration::from_secs(3)).await;
+
+    // Start the gateway.
+    let (start_output, start_code) = run_cli(&["gateway", "start"]).await;
+    assert_eq!(
+        start_code, 0,
+        "gateway start should succeed:\n{}",
+        strip_ansi(&start_output)
+    );
+
+    // Wait for healthy.
+    wait_for_healthy(Duration::from_secs(180)).await;
+
+    // Read the secret after restart.
+    let secret_after = read_ssh_handshake_secret()
+        .expect("SSH handshake secret should exist after restart");
+
+    assert_eq!(
+        secret_before, secret_after,
+        "SSH handshake secret should be identical before and after restart"
+    );
+}
diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh
index 600bdd6c..307e7623 100755
--- a/tasks/scripts/cluster-deploy-fast.sh
+++ b/tasks/scripts/cluster-deploy-fast.sh
@@ -408,12 +408,13 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then
   # terminates mTLS (there is no server.tls.enabled toggle). Without this,
   # a prior Helm override or chart default change could silently regress
   # sandbox callbacks to plaintext.
-  # Retrieve the existing handshake secret from the running release, or generate
-  # a new one if this is the first deploy with the mandatory secret.
-  EXISTING_SECRET=$(cluster_exec "helm get values openshell -n openshell -o json 2>/dev/null \
-    | grep -o '\"sshHandshakeSecret\":\"[^\"]*\"' \
-    | cut -d'\"' -f4") || true
-  SSH_HANDSHAKE_SECRET="${EXISTING_SECRET:-$(openssl rand -hex 32)}"
+  # Ensure the SSH handshake K8s secret exists. The bootstrap process normally
+  # creates it, but fast-deploy may run before bootstrap on a fresh cluster.
+  EXISTING_SECRET=$(cluster_exec "kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null | base64 -d" 2>/dev/null) || true
+  if [ -z "${EXISTING_SECRET}" ]; then
+    SSH_HANDSHAKE_SECRET="$(openssl rand -hex 32)"
+    cluster_exec "kubectl -n openshell create secret generic openshell-ssh-handshake --from-literal=secret='${SSH_HANDSHAKE_SECRET}' --dry-run=client -o yaml | kubectl apply -f -"
+  fi
 
   # Retrieve the host gateway IP from the entrypoint-rendered HelmChart CR so
   # that hostAliases for host.openshell.internal are preserved across fast deploys.
@@ -433,7 +434,6 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then
     --set server.tls.certSecretName=openshell-server-tls \
     --set server.tls.clientCaSecretName=openshell-server-client-ca \
     --set server.tls.clientTlsSecretName=openshell-client-tls \
-    --set server.sshHandshakeSecret=${SSH_HANDSHAKE_SECRET} \
     ${HOST_GATEWAY_ARGS} \
     ${helm_wait_args}"
   helm_end=$(date +%s)