From 287865356e543c72c5e398ac23f68a38a5cae23c Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 18 Mar 2026 16:34:06 -0700 Subject: [PATCH 1/2] fix(bootstrap): auto-cleanup Docker resources on failed gateway deploy When deploy_gateway_with_logs() fails after creating Docker resources (volume, container, network), the orphaned volume blocks subsequent retries with 'Corrupted cluster state'. Wrap the post-resource-creation steps in an async block and call destroy_gateway_resources() on failure to leave the environment in a clean, retryable state. Also update the corrupted state diagnosis to reflect that cleanup is now automatic and mark the error as retryable. Fixes #463 --- crates/openshell-bootstrap/src/errors.rs | 12 +- crates/openshell-bootstrap/src/lib.rs | 224 +++++++++++++---------- 2 files changed, 129 insertions(+), 107 deletions(-) diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 1511b362..23ed324a 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -175,21 +175,19 @@ fn diagnose_corrupted_state(gateway_name: &str) -> GatewayFailureDiagnosis { GatewayFailureDiagnosis { summary: "Corrupted cluster state".to_string(), explanation: "The gateway cluster has corrupted internal state, likely from a previous \ - interrupted startup or unclean shutdown." + interrupted startup or unclean shutdown. Resources from the failed deploy have been \ + automatically cleaned up." 
.to_string(), recovery_steps: vec![ + RecoveryStep::new("Retry the gateway start (cleanup was automatic)"), RecoveryStep::with_command( - "Destroy and recreate the gateway", + "If the retry fails, manually destroy and recreate", format!( "openshell gateway destroy --name {gateway_name} && openshell gateway start" ), ), - RecoveryStep::with_command( - "If that fails, remove the volume for a clean slate", - format!("docker volume rm openshell-cluster-{gateway_name}"), - ), ], - retryable: false, + retryable: true, } } diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index b6423aae..aa0f181e 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -400,119 +400,143 @@ where )); } - ensure_container( - &target_docker, - &name, - &image_ref, - &extra_sans, - ssh_gateway_host.as_deref(), - port, - disable_tls, - disable_gateway_auth, - registry_username.as_deref(), - registry_token.as_deref(), - gpu, - ) - .await?; - start_container(&target_docker, &name).await?; - - // Clean up stale k3s nodes left over from previous container instances that - // used the same persistent volume. Without this, pods remain scheduled on - // NotReady ghost nodes and the health check will time out. - match clean_stale_nodes(&target_docker, &name).await { - Ok(0) => {} - Ok(n) => tracing::debug!("removed {n} stale node(s)"), - Err(err) => { - tracing::debug!("stale node cleanup failed (non-fatal): {err}"); + // From this point on, Docker resources (container, volume, network) are + // being created. If any subsequent step fails, we must clean up to avoid + // leaving an orphaned volume in a corrupted state that blocks retries. 
+ // See: https://github.com/NVIDIA/OpenShell/issues/463 + let deploy_result: Result<GatewayMetadata> = async { + ensure_container( + &target_docker, + &name, + &image_ref, + &extra_sans, + ssh_gateway_host.as_deref(), + port, + disable_tls, + disable_gateway_auth, + registry_username.as_deref(), + registry_token.as_deref(), + gpu, + ) + .await?; + start_container(&target_docker, &name).await?; + + // Clean up stale k3s nodes left over from previous container instances that + // used the same persistent volume. Without this, pods remain scheduled on + // NotReady ghost nodes and the health check will time out. + match clean_stale_nodes(&target_docker, &name).await { + Ok(0) => {} + Ok(n) => tracing::debug!("removed {n} stale node(s)"), + Err(err) => { + tracing::debug!("stale node cleanup failed (non-fatal): {err}"); + } } - } - // Reconcile PKI: reuse existing cluster TLS secrets if they are complete and - // valid; only generate fresh PKI when secrets are missing, incomplete, - // malformed, or expiring within MIN_REMAINING_VALIDITY_DAYS. - // - // Ordering is: reconcile secrets -> (if rotated and workload exists: - // rollout restart and wait) -> persist CLI-side bundle. - // - // We check workload presence before reconciliation. On a fresh/recreated - // cluster, secrets are always newly generated and a restart is unnecessary. - // Restarting only when workload pre-existed avoids extra rollout latency. - let workload_existed_before_pki = openshell_workload_exists(&target_docker, &name).await?; - let (pki_bundle, rotated) = reconcile_pki(&target_docker, &name, &extra_sans, &log).await?; - - if rotated && workload_existed_before_pki { - // If an openshell workload is already running, it must be restarted so - // it picks up the new TLS secrets before we write CLI-side certs. - // A failed rollout is a hard error — CLI certs must not be persisted - // if the server cannot come up with the new PKI. 
- restart_openshell_deployment(&target_docker, &name).await?; - } + // Reconcile PKI: reuse existing cluster TLS secrets if they are complete and + // valid; only generate fresh PKI when secrets are missing, incomplete, + // malformed, or expiring within MIN_REMAINING_VALIDITY_DAYS. + // + // Ordering is: reconcile secrets -> (if rotated and workload exists: + // rollout restart and wait) -> persist CLI-side bundle. + // + // We check workload presence before reconciliation. On a fresh/recreated + // cluster, secrets are always newly generated and a restart is unnecessary. + // Restarting only when workload pre-existed avoids extra rollout latency. + let workload_existed_before_pki = openshell_workload_exists(&target_docker, &name).await?; + let (pki_bundle, rotated) = reconcile_pki(&target_docker, &name, &extra_sans, &log).await?; + + if rotated && workload_existed_before_pki { + // If an openshell workload is already running, it must be restarted so + // it picks up the new TLS secrets before we write CLI-side certs. + // A failed rollout is a hard error — CLI certs must not be persisted + // if the server cannot come up with the new PKI. + restart_openshell_deployment(&target_docker, &name).await?; + } - store_pki_bundle(&name, &pki_bundle)?; + store_pki_bundle(&name, &pki_bundle)?; + + // Push locally-built component images into the k3s containerd runtime. + // This is the "push" path for local development — images are exported from + // the local Docker daemon and streamed into the cluster's containerd so + // k3s can resolve them without pulling from the remote registry. 
+ if remote_opts.is_none() + && let Ok(push_images_str) = std::env::var("OPENSHELL_PUSH_IMAGES") + { + let images: Vec<&str> = push_images_str + .split(',') + .map(str::trim) + .filter(|s| !s.is_empty()) + .collect(); + if !images.is_empty() { + log("[status] Deploying components".to_string()); + let local_docker = Docker::connect_with_local_defaults().into_diagnostic()?; + let container = container_name(&name); + let on_log_ref = Arc::clone(&on_log); + let mut push_log = move |msg: String| { + if let Ok(mut f) = on_log_ref.lock() { + f(msg); + } + }; + push::push_local_images( + &local_docker, + &target_docker, + &container, + &images, + &mut push_log, + ) + .await?; - // Push locally-built component images into the k3s containerd runtime. - // This is the "push" path for local development — images are exported from - // the local Docker daemon and streamed into the cluster's containerd so - // k3s can resolve them without pulling from the remote registry. - if remote_opts.is_none() - && let Ok(push_images_str) = std::env::var("OPENSHELL_PUSH_IMAGES") - { - let images: Vec<&str> = push_images_str - .split(',') - .map(str::trim) - .filter(|s| !s.is_empty()) - .collect(); - if !images.is_empty() { - log("[status] Deploying components".to_string()); - let local_docker = Docker::connect_with_local_defaults().into_diagnostic()?; - let container = container_name(&name); + restart_openshell_deployment(&target_docker, &name).await?; + } + } + + log("[status] Starting gateway".to_string()); + { + // Create a short-lived closure that locks on each call rather than holding + // the MutexGuard across await points. 
let on_log_ref = Arc::clone(&on_log); - let mut push_log = move |msg: String| { + let mut gateway_log = move |msg: String| { if let Ok(mut f) = on_log_ref.lock() { f(msg); } }; - push::push_local_images( - &local_docker, - &target_docker, - &container, - &images, - &mut push_log, - ) - .await?; - - restart_openshell_deployment(&target_docker, &name).await?; + wait_for_gateway_ready(&target_docker, &name, &mut gateway_log).await?; } - } - log("[status] Starting gateway".to_string()); - { - // Create a short-lived closure that locks on each call rather than holding - // the MutexGuard across await points. - let on_log_ref = Arc::clone(&on_log); - let mut gateway_log = move |msg: String| { - if let Ok(mut f) = on_log_ref.lock() { - f(msg); + // Create and store gateway metadata. + let metadata = create_gateway_metadata_with_host( + &name, + remote_opts.as_ref(), + port, + ssh_gateway_host.as_deref(), + disable_tls, + ); + store_gateway_metadata(&name, &metadata)?; + + Ok(metadata) + } + .await; + + match deploy_result { + Ok(metadata) => Ok(GatewayHandle { + name, + metadata, + docker: target_docker, + }), + Err(deploy_err) => { + // Automatically clean up Docker resources (volume, container, network, + // image) so the environment is left in a retryable state. + tracing::info!("deploy failed, cleaning up gateway resources for '{name}'"); + if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await { + tracing::warn!( + "automatic cleanup after failed deploy also failed: {cleanup_err}. \ + Manual cleanup may be required: \ + openshell gateway destroy --name {name}" + ); } - }; - wait_for_gateway_ready(&target_docker, &name, &mut gateway_log).await?; + Err(deploy_err) + } } - - // Create and store gateway metadata. 
- let metadata = create_gateway_metadata_with_host( - &name, - remote_opts.as_ref(), - port, - ssh_gateway_host.as_deref(), - disable_tls, - ); - store_gateway_metadata(&name, &metadata)?; - - Ok(GatewayHandle { - name, - metadata, - docker: target_docker, - }) } /// Get a handle to an existing gateway. From f9765fecf4df84c1f2a431408ce13a55ae7fa62a Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Wed, 18 Mar 2026 16:37:38 -0700 Subject: [PATCH 2/2] test(bootstrap): add tests for auto-cleanup diagnosis behavior Verify that the corrupted state diagnosis correctly reflects automatic cleanup: retryable flag is true, explanation mentions automatic cleanup, recovery steps no longer include manual docker volume rm, and the fallback command interpolates the gateway name. Refs #463 --- crates/openshell-bootstrap/src/errors.rs | 81 ++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 23ed324a..a71d113a 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -481,6 +481,87 @@ mod tests { assert!(d.summary.contains("Corrupted")); } + #[test] + fn test_diagnose_corrupted_state_is_retryable_after_auto_cleanup() { + // After the auto-cleanup fix (#463), corrupted state errors should be + // marked retryable because deploy_gateway_with_logs now automatically + // cleans up Docker resources on failure. 
+ let d = diagnose_failure( + "mygw", + "K8s namespace not ready", + Some("configmaps \"extension-apiserver-authentication\" is forbidden"), + ) + .expect("should match corrupted state pattern"); + assert!( + d.retryable, + "corrupted state should be retryable after auto-cleanup" + ); + assert!( + d.explanation.contains("automatically cleaned up"), + "explanation should mention automatic cleanup, got: {}", + d.explanation + ); + } + + #[test] + fn test_diagnose_corrupted_state_recovery_no_manual_volume_rm() { + // The recovery steps should no longer include a manual docker volume rm + // command, since cleanup is now automatic. The first step should tell + // the user to simply retry. + let d = diagnose_failure("mygw", "cannot get resource \"namespaces\"", None) + .expect("should match corrupted state pattern"); + + let all_commands: Vec<String> = d + .recovery_steps + .iter() + .filter_map(|s| s.command.clone()) + .collect(); + let all_commands_joined = all_commands.join(" "); + + assert!( + !all_commands_joined.contains("docker volume rm"), + "recovery steps should not include manual docker volume rm, got: {all_commands_joined}" + ); + + // First step should be a description-only step (no command) about retrying + assert!( + d.recovery_steps[0].command.is_none(), + "first recovery step should be description-only (automatic cleanup)" + ); + assert!( + d.recovery_steps[0] + .description + .contains("cleanup was automatic"), + "first recovery step should mention automatic cleanup" + ); + } + + #[test] + fn test_diagnose_corrupted_state_fallback_step_includes_gateway_name() { + // The fallback recovery step should interpolate the gateway name so + // users can copy-paste the command. 
+ let d = diagnose_failure("my-gateway", "is forbidden", None) + .expect("should match corrupted state pattern"); + + assert!( + d.recovery_steps.len() >= 2, + "should have at least 2 recovery steps" + ); + let fallback = &d.recovery_steps[1]; + let cmd = fallback + .command + .as_deref() + .expect("fallback step should have a command"); + assert!( + cmd.contains("my-gateway"), + "fallback command should contain gateway name, got: {cmd}" + ); + assert!( + cmd.contains("openshell gateway destroy"), + "fallback command should include gateway destroy, got: {cmd}" + ); + } + #[test] fn test_diagnose_no_default_route() { let diagnosis = diagnose_failure(