From 287865356e543c72c5e398ac23f68a38a5cae23c Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 18 Mar 2026 16:34:06 -0700 Subject: [PATCH 1/2] fix(bootstrap): auto-cleanup Docker resources on failed gateway deploy When deploy_gateway_with_logs() fails after creating Docker resources (volume, container, network), the orphaned volume blocks subsequent retries with 'Corrupted cluster state'. Wrap the post-resource-creation steps in an async block and call destroy_gateway_resources() on failure to leave the environment in a clean, retryable state. Also update the corrupted state diagnosis to reflect that cleanup is now automatic and mark the error as retryable. Fixes #463 --- crates/openshell-bootstrap/src/errors.rs | 12 +- crates/openshell-bootstrap/src/lib.rs | 224 +++++++++++++---------- 2 files changed, 129 insertions(+), 107 deletions(-) diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 1511b362..23ed324a 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -175,21 +175,19 @@ fn diagnose_corrupted_state(gateway_name: &str) -> GatewayFailureDiagnosis { GatewayFailureDiagnosis { summary: "Corrupted cluster state".to_string(), explanation: "The gateway cluster has corrupted internal state, likely from a previous \ - interrupted startup or unclean shutdown." + interrupted startup or unclean shutdown. Resources from the failed deploy have been \ + automatically cleaned up." 
.to_string(), recovery_steps: vec![ + RecoveryStep::new("Retry the gateway start (cleanup was automatic)"), RecoveryStep::with_command( - "Destroy and recreate the gateway", + "If the retry fails, manually destroy and recreate", format!( "openshell gateway destroy --name {gateway_name} && openshell gateway start" ), ), - RecoveryStep::with_command( - "If that fails, remove the volume for a clean slate", - format!("docker volume rm openshell-cluster-{gateway_name}"), - ), ], - retryable: false, + retryable: true, } } diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index b6423aae..aa0f181e 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -400,119 +400,143 @@ where )); } - ensure_container( - &target_docker, - &name, - &image_ref, - &extra_sans, - ssh_gateway_host.as_deref(), - port, - disable_tls, - disable_gateway_auth, - registry_username.as_deref(), - registry_token.as_deref(), - gpu, - ) - .await?; - start_container(&target_docker, &name).await?; - - // Clean up stale k3s nodes left over from previous container instances that - // used the same persistent volume. Without this, pods remain scheduled on - // NotReady ghost nodes and the health check will time out. - match clean_stale_nodes(&target_docker, &name).await { - Ok(0) => {} - Ok(n) => tracing::debug!("removed {n} stale node(s)"), - Err(err) => { - tracing::debug!("stale node cleanup failed (non-fatal): {err}"); + // From this point on, Docker resources (container, volume, network) are + // being created. If any subsequent step fails, we must clean up to avoid + // leaving an orphaned volume in a corrupted state that blocks retries. 
+ // See: https://github.com/NVIDIA/OpenShell/issues/463 + let deploy_result: Result<GatewayMetadata> = async { + ensure_container( + &target_docker, + &name, + &image_ref, + &extra_sans, + ssh_gateway_host.as_deref(), + port, + disable_tls, + disable_gateway_auth, + registry_username.as_deref(), + registry_token.as_deref(), + gpu, + ) + .await?; + start_container(&target_docker, &name).await?; + + // Clean up stale k3s nodes left over from previous container instances that + // used the same persistent volume. Without this, pods remain scheduled on + // NotReady ghost nodes and the health check will time out. + match clean_stale_nodes(&target_docker, &name).await { + Ok(0) => {} + Ok(n) => tracing::debug!("removed {n} stale node(s)"), + Err(err) => { + tracing::debug!("stale node cleanup failed (non-fatal): {err}"); + } } - } - // Reconcile PKI: reuse existing cluster TLS secrets if they are complete and - // valid; only generate fresh PKI when secrets are missing, incomplete, - // malformed, or expiring within MIN_REMAINING_VALIDITY_DAYS. - // - // Ordering is: reconcile secrets -> (if rotated and workload exists: - // rollout restart and wait) -> persist CLI-side bundle. - // - // We check workload presence before reconciliation. On a fresh/recreated - // cluster, secrets are always newly generated and a restart is unnecessary. - // Restarting only when workload pre-existed avoids extra rollout latency. - let workload_existed_before_pki = openshell_workload_exists(&target_docker, &name).await?; - let (pki_bundle, rotated) = reconcile_pki(&target_docker, &name, &extra_sans, &log).await?; - - if rotated && workload_existed_before_pki { - // If an openshell workload is already running, it must be restarted so - // it picks up the new TLS secrets before we write CLI-side certs. - // A failed rollout is a hard error — CLI certs must not be persisted - // if the server cannot come up with the new PKI. 
- restart_openshell_deployment(&target_docker, &name).await?; - } + // Reconcile PKI: reuse existing cluster TLS secrets if they are complete and + // valid; only generate fresh PKI when secrets are missing, incomplete, + // malformed, or expiring within MIN_REMAINING_VALIDITY_DAYS. + // + // Ordering is: reconcile secrets -> (if rotated and workload exists: + // rollout restart and wait) -> persist CLI-side bundle. + // + // We check workload presence before reconciliation. On a fresh/recreated + // cluster, secrets are always newly generated and a restart is unnecessary. + // Restarting only when workload pre-existed avoids extra rollout latency. + let workload_existed_before_pki = openshell_workload_exists(&target_docker, &name).await?; + let (pki_bundle, rotated) = reconcile_pki(&target_docker, &name, &extra_sans, &log).await?; + + if rotated && workload_existed_before_pki { + // If an openshell workload is already running, it must be restarted so + // it picks up the new TLS secrets before we write CLI-side certs. + // A failed rollout is a hard error — CLI certs must not be persisted + // if the server cannot come up with the new PKI. + restart_openshell_deployment(&target_docker, &name).await?; + } - store_pki_bundle(&name, &pki_bundle)?; + store_pki_bundle(&name, &pki_bundle)?; + + // Push locally-built component images into the k3s containerd runtime. + // This is the "push" path for local development — images are exported from + // the local Docker daemon and streamed into the cluster's containerd so + // k3s can resolve them without pulling from the remote registry. 
+ if remote_opts.is_none() + && let Ok(push_images_str) = std::env::var("OPENSHELL_PUSH_IMAGES") + { + let images: Vec<&str> = push_images_str + .split(',') + .map(str::trim) + .filter(|s| !s.is_empty()) + .collect(); + if !images.is_empty() { + log("[status] Deploying components".to_string()); + let local_docker = Docker::connect_with_local_defaults().into_diagnostic()?; + let container = container_name(&name); + let on_log_ref = Arc::clone(&on_log); + let mut push_log = move |msg: String| { + if let Ok(mut f) = on_log_ref.lock() { + f(msg); + } + }; + push::push_local_images( + &local_docker, + &target_docker, + &container, + &images, + &mut push_log, + ) + .await?; - // Push locally-built component images into the k3s containerd runtime. - // This is the "push" path for local development — images are exported from - // the local Docker daemon and streamed into the cluster's containerd so - // k3s can resolve them without pulling from the remote registry. - if remote_opts.is_none() - && let Ok(push_images_str) = std::env::var("OPENSHELL_PUSH_IMAGES") - { - let images: Vec<&str> = push_images_str - .split(',') - .map(str::trim) - .filter(|s| !s.is_empty()) - .collect(); - if !images.is_empty() { - log("[status] Deploying components".to_string()); - let local_docker = Docker::connect_with_local_defaults().into_diagnostic()?; - let container = container_name(&name); + restart_openshell_deployment(&target_docker, &name).await?; + } + } + + log("[status] Starting gateway".to_string()); + { + // Create a short-lived closure that locks on each call rather than holding + // the MutexGuard across await points. 
let on_log_ref = Arc::clone(&on_log); - let mut push_log = move |msg: String| { + let mut gateway_log = move |msg: String| { if let Ok(mut f) = on_log_ref.lock() { f(msg); } }; - push::push_local_images( - &local_docker, - &target_docker, - &container, - &images, - &mut push_log, - ) - .await?; - - restart_openshell_deployment(&target_docker, &name).await?; + wait_for_gateway_ready(&target_docker, &name, &mut gateway_log).await?; } - } - log("[status] Starting gateway".to_string()); - { - // Create a short-lived closure that locks on each call rather than holding - // the MutexGuard across await points. - let on_log_ref = Arc::clone(&on_log); - let mut gateway_log = move |msg: String| { - if let Ok(mut f) = on_log_ref.lock() { - f(msg); + // Create and store gateway metadata. + let metadata = create_gateway_metadata_with_host( + &name, + remote_opts.as_ref(), + port, + ssh_gateway_host.as_deref(), + disable_tls, + ); + store_gateway_metadata(&name, &metadata)?; + + Ok(metadata) + } + .await; + + match deploy_result { + Ok(metadata) => Ok(GatewayHandle { + name, + metadata, + docker: target_docker, + }), + Err(deploy_err) => { + // Automatically clean up Docker resources (volume, container, network, + // image) so the environment is left in a retryable state. + tracing::info!("deploy failed, cleaning up gateway resources for '{name}'"); + if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await { + tracing::warn!( + "automatic cleanup after failed deploy also failed: {cleanup_err}. \ + Manual cleanup may be required: \ + openshell gateway destroy --name {name}" + ); } - }; - wait_for_gateway_ready(&target_docker, &name, &mut gateway_log).await?; + Err(deploy_err) + } } - - // Create and store gateway metadata. 
- let metadata = create_gateway_metadata_with_host( - &name, - remote_opts.as_ref(), - port, - ssh_gateway_host.as_deref(), - disable_tls, - ); - store_gateway_metadata(&name, &metadata)?; - - Ok(GatewayHandle { - name, - metadata, - docker: target_docker, - }) } /// Get a handle to an existing gateway. From f9765fecf4df84c1f2a431408ce13a55ae7fa62a Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 18 Mar 2026 16:37:38 -0700 Subject: [PATCH 2/2] test(bootstrap): add tests for auto-cleanup diagnosis behavior Verify that the corrupted state diagnosis correctly reflects automatic cleanup: retryable flag is true, explanation mentions automatic cleanup, recovery steps no longer include manual docker volume rm, and the fallback command interpolates the gateway name. Refs #463 --- crates/openshell-bootstrap/src/errors.rs | 81 ++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 23ed324a..a71d113a 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -481,6 +481,87 @@ mod tests { assert!(d.summary.contains("Corrupted")); } + #[test] + fn test_diagnose_corrupted_state_is_retryable_after_auto_cleanup() { + // After the auto-cleanup fix (#463), corrupted state errors should be + // marked retryable because deploy_gateway_with_logs now automatically + // cleans up Docker resources on failure. 
+ let d = diagnose_failure( + "mygw", + "K8s namespace not ready", + Some("configmaps \"extension-apiserver-authentication\" is forbidden"), + ) + .expect("should match corrupted state pattern"); + assert!( + d.retryable, + "corrupted state should be retryable after auto-cleanup" + ); + assert!( + d.explanation.contains("automatically cleaned up"), + "explanation should mention automatic cleanup, got: {}", + d.explanation + ); + } + + #[test] + fn test_diagnose_corrupted_state_recovery_no_manual_volume_rm() { + // The recovery steps should no longer include a manual docker volume rm + // command, since cleanup is now automatic. The first step should tell + // the user to simply retry. + let d = diagnose_failure("mygw", "cannot get resource \"namespaces\"", None) + .expect("should match corrupted state pattern"); + + let all_commands: Vec<String> = d + .recovery_steps + .iter() + .filter_map(|s| s.command.clone()) + .collect(); + let all_commands_joined = all_commands.join(" "); + + assert!( + !all_commands_joined.contains("docker volume rm"), + "recovery steps should not include manual docker volume rm, got: {all_commands_joined}" + ); + + // First step should be a description-only step (no command) about retrying + assert!( + d.recovery_steps[0].command.is_none(), + "first recovery step should be description-only (automatic cleanup)" + ); + assert!( + d.recovery_steps[0] + .description + .contains("cleanup was automatic"), + "first recovery step should mention automatic cleanup" + ); + } + + #[test] + fn test_diagnose_corrupted_state_fallback_step_includes_gateway_name() { + // The fallback recovery step should interpolate the gateway name so + // users can copy-paste the command. 
+ let d = diagnose_failure("my-gateway", "is forbidden", None) + .expect("should match corrupted state pattern"); + + assert!( + d.recovery_steps.len() >= 2, + "should have at least 2 recovery steps" + ); + let fallback = &d.recovery_steps[1]; + let cmd = fallback + .command + .as_deref() + .expect("fallback step should have a command"); + assert!( + cmd.contains("my-gateway"), + "fallback command should contain gateway name, got: {cmd}" + ); + assert!( + cmd.contains("openshell gateway destroy"), + "fallback command should include gateway destroy, got: {cmd}" + ); + } + #[test] fn test_diagnose_no_default_route() { let diagnosis = diagnose_failure(