From 8e9b035c6688eb55d0136967df49689afd76d14f Mon Sep 17 00:00:00 2001 From: Peter Rekdal Khan-Sunde Date: Thu, 21 May 2026 11:57:49 +0200 Subject: [PATCH 1/3] Fix restart convergence proof --- crates/surge-cli/src/commands/install/mod.rs | 8 +- .../src/commands/install/remote/runtime.rs | 18 ++- crates/surge-core/src/install/mod.rs | 19 ++- .../src/install/runtime_manifest.rs | 26 +++- crates/surge-ffi/src/lib.rs | 131 +++++++++++++++++- crates/surge-supervisor/src/main.rs | 108 +++++++++++++-- 6 files changed, 286 insertions(+), 24 deletions(-) diff --git a/crates/surge-cli/src/commands/install/mod.rs b/crates/surge-cli/src/commands/install/mod.rs index 8c98981..0e55f87 100644 --- a/crates/surge-cli/src/commands/install/mod.rs +++ b/crates/surge-cli/src/commands/install/mod.rs @@ -1147,16 +1147,20 @@ mod tests { assert!(probe.contains("active_exe=\"$install_root/app/$main_exe\"")); assert!(probe.contains("contains_target_first_run()")); - assert!(probe.contains("watched_pid_is_running()")); + assert!(probe.contains("contains_target_version_arg()")); + assert!(probe.contains("extract_watched_pid()")); assert!(probe.contains("*\" --surge-first-run $version \"*|*\" $version --surge-first-run \"*")); + assert!(probe.contains("target_app_pids")); + assert!(probe.contains("target_supervisor_seen")); assert!(probe.contains("surge-supervisor")); assert!(probe.contains("--id $supervisor_id")); assert!(probe.contains("app process for $active_exe was not found")); - assert!(probe.contains("app process for $active_exe is running without --surge-first-run proof for $version")); + assert!(probe.contains("app process for $active_exe is running without target proof for $version")); assert!(probe.contains("stale app process for $active_exe is still running without target proof for $version")); assert!(probe.contains("supervisor process '$supervisor_id' is still waiting for the previous child")); assert!(probe.contains("supervisor process '$supervisor_id' is running with stale first-run proof")); assert!(probe.contains("supervisor process '$supervisor_id' was not found")); + assert!(probe.contains("supervisor process '$supervisor_id' is not watching target app process for $version")); } #[cfg(unix)] diff --git a/crates/surge-cli/src/commands/install/remote/runtime.rs b/crates/surge-cli/src/commands/install/remote/runtime.rs index 914e1e0..97fca3a 100644 --- a/crates/surge-cli/src/commands/install/remote/runtime.rs +++ b/crates/surge-cli/src/commands/install/remote/runtime.rs @@ -245,32 +245,38 @@ pub(crate) fn build_remote_process_verification_probe( ) -> String { format!( r#"install_root={}; main_exe={}; supervisor_id={}; version={}; -active_exe="$install_root/app/$main_exe"; app_seen=0; target_app_seen=0; stale_app_seen=0; supervisor_seen=0; stale_supervisor_seen=0; waiting_supervisor_seen=0; +active_exe="$install_root/app/$main_exe"; app_seen=0; target_app_seen=0; stale_app_seen=0; supervisor_seen=0; stale_supervisor_seen=0; waiting_supervisor_seen=0; target_supervisor_seen=0; target_app_pids=""; watched_pids=""; contains_target_first_run() {{ cmd_tokens=" $1 "; case "$cmd_tokens" in *" --surge-first-run $version "*|*" $version --surge-first-run "*) return 0 ;; esac; return 1; }} -watched_pid_is_running() {{ case "$1" in *" watch "*" --pid "*) rest="${{1#* --pid }}"; watched_pid="${{rest%% *}}"; case "$watched_pid" in ""|*[!0-9]*) return 1 ;; esac; kill -0 "$watched_pid" 2>/dev/null ;; esac; return 1; }} +contains_target_version_arg() {{ cmd_tokens=" $1 "; case "$cmd_tokens" in *" $version "*) return 0 ;; esac; return 1; }} +contains_target_proof() {{ contains_target_first_run "$1" || contains_target_version_arg "$1"; }} +extract_watched_pid() {{ case "$1" in *" watch "*" --pid "*) rest="${{1#* --pid }}"; watched_pid="${{rest%% *}}"; case "$watched_pid" in ""|*[!0-9]*) return 1 ;; esac; printf '%s\n' "$watched_pid"; return 0 ;; esac; return 1; }} for cmdline in /proc/[0-9]*/cmdline; do [ -r "$cmdline" ] || continue; pid="${{cmdline%/cmdline}}"; pid="${{pid##*/}}"; case "$pid" in "$$"|"$PPID") continue ;; esac; cmd="$(tr '\0' ' ' < "$cmdline" 2>/dev/null || true)"; [ -n "$cmd" ] || continue; - case "$cmd" in *"surge-supervisor"*) ;; *"$active_exe"*) app_seen=1; if contains_target_first_run "$cmd"; then target_app_seen=1; else stale_app_seen=1; fi ;; esac; + case "$cmd" in *"surge-supervisor"*) ;; *"$active_exe"*) app_seen=1; if contains_target_proof "$cmd"; then target_app_seen=1; target_app_pids="${{target_app_pids}}${{pid}} "; else stale_app_seen=1; fi ;; esac; if [ -n "$supervisor_id" ]; then case "$cmd" in *"surge-supervisor"*"--id $supervisor_id"*) supervisor_seen=1; - if watched_pid_is_running "$cmd"; then waiting_supervisor_seen=1; fi; - case " $cmd " in *" --surge-first-run "*) if ! contains_target_first_run "$cmd"; then stale_supervisor_seen=1; fi ;; esac + if watched_pid="$(extract_watched_pid "$cmd")"; then watched_pids="${{watched_pids}}${{watched_pid}} "; fi; + case " $cmd " in *" --surge-first-run "*) if contains_target_first_run "$cmd"; then target_supervisor_seen=1; else stale_supervisor_seen=1; fi ;; esac ;; esac; fi; done; +for watched_pid in $watched_pids; do + case " $target_app_pids " in *" $watched_pid "*) target_supervisor_seen=1 ;; *) if kill -0 "$watched_pid" 2>/dev/null; then waiting_supervisor_seen=1; fi ;; esac; +done; if [ "$target_app_seen" -ne 1 ]; then - if [ "$app_seen" -eq 1 ]; then echo "app process for $active_exe is running without --surge-first-run proof for $version"; else echo "app process for $active_exe was not found"; fi; + if [ "$app_seen" -eq 1 ]; then echo "app process for $active_exe is running without target proof for $version"; else echo "app process for $active_exe was not found"; fi; exit 0; fi; if [ "$stale_app_seen" -eq 1 ]; then echo "stale app process for $active_exe is still running without target proof for $version"; exit 0; fi; if [ -n "$supervisor_id" ] && [ "$waiting_supervisor_seen" -eq 1 ]; then echo "supervisor process '$supervisor_id' is still waiting for the previous child"; exit 0; fi; if [ -n "$supervisor_id" ] && [ "$stale_supervisor_seen" -eq 1 ]; then echo "supervisor process '$supervisor_id' is running with stale first-run proof"; exit 0; fi; if [ -n "$supervisor_id" ] && [ "$supervisor_seen" -ne 1 ]; then echo "supervisor process '$supervisor_id' was not found"; exit 0; fi; +if [ -n "$supervisor_id" ] && [ "$target_supervisor_seen" -ne 1 ]; then echo "supervisor process '$supervisor_id' is not watching target app process for $version"; exit 0; fi; echo ready"#, shell_single_quote(&install_root.to_string_lossy()), shell_single_quote(main_exe), diff --git a/crates/surge-core/src/install/mod.rs b/crates/surge-core/src/install/mod.rs index 62614d5..4523691 100644 --- a/crates/surge-core/src/install/mod.rs +++ b/crates/surge-core/src/install/mod.rs @@ -16,7 +16,7 @@ pub use self::launch::{auto_start_after_install, auto_start_after_install_sequen pub use self::persistent_assets::{copy_persistent_assets, validate_relative_persistent_asset_path}; pub use self::runtime_manifest::{ LEGACY_RUNTIME_MANIFEST_RELATIVE_PATH, RUNTIME_MANIFEST_RELATIVE_PATH, RuntimeManifestMetadata, - storage_provider_manifest_name, write_runtime_manifest, + read_runtime_manifest_version, storage_provider_manifest_name, write_runtime_manifest, }; #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -111,7 +111,8 @@ mod tests { use super::{ InstallProfile, LEGACY_RUNTIME_MANIFEST_RELATIVE_PATH, RUNTIME_MANIFEST_RELATIVE_PATH, RuntimeManifestMetadata, - install_package_locally_at_root, storage_provider_manifest_name, write_runtime_manifest, + install_package_locally_at_root, read_runtime_manifest_version, storage_provider_manifest_name, + write_runtime_manifest, }; #[test] @@ -169,6 +170,20 @@ mod tests { assert_eq!(raw, legacy_raw); } + #[test] + fn read_runtime_manifest_version_prefers_current_manifest() { + let tmp = tempfile::tempdir().expect("temp dir should exist"); + let current_path = tmp.path().join(RUNTIME_MANIFEST_RELATIVE_PATH); + let legacy_path = tmp.path().join(LEGACY_RUNTIME_MANIFEST_RELATIVE_PATH); + std::fs::create_dir_all(current_path.parent().unwrap()).unwrap(); + std::fs::write(¤t_path, "id: demo\nversion: 2.0.0\nchannel: test\n").unwrap(); + std::fs::write(&legacy_path, "id: demo\nversion: 1.0.0\nchannel: test\n").unwrap(); + + let version = read_runtime_manifest_version(tmp.path()).unwrap(); + + assert_eq!(version.as_deref(), Some("2.0.0")); + } + #[test] fn install_package_locally_preserves_declared_persistent_assets_and_prunes_snapshots() { let tmp = tempfile::tempdir().expect("temp dir should exist"); diff --git a/crates/surge-core/src/install/runtime_manifest.rs b/crates/surge-core/src/install/runtime_manifest.rs index 346e9f8..fc05a84 100644 --- a/crates/surge-core/src/install/runtime_manifest.rs +++ b/crates/surge-core/src/install/runtime_manifest.rs @@ -1,6 +1,6 @@ use std::path::{Path, PathBuf}; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use crate::context::StorageProvider; use crate::error::{Result, SurgeError}; @@ -58,6 +58,11 @@ struct RuntimeManifestFile<'a> { endpoint: &'a str, } +#[derive(Debug, Clone, Deserialize)] +struct RuntimeManifestVersion { + version: String, +} + #[derive(Debug, Clone, Default)] pub(super) struct RuntimeManifestSnapshot { runtime_manifest: Option>, @@ -176,3 +181,22 @@ pub fn write_runtime_manifest( std::fs::write(&legacy_manifest_path, yaml)?; Ok(runtime_manifest_path) } + +pub fn read_runtime_manifest_version(active_app_dir: &Path) -> Result> { + for relative_path in [RUNTIME_MANIFEST_RELATIVE_PATH, LEGACY_RUNTIME_MANIFEST_RELATIVE_PATH] { + let path = active_app_dir.join(relative_path); + if !path.is_file() { + continue; + } + + let raw = std::fs::read(&path)?; + let manifest: RuntimeManifestVersion = serde_yaml::from_slice(&raw) + .map_err(|e| SurgeError::Config(format!("Failed to parse runtime manifest '{}': {e}", path.display())))?; + let version = manifest.version.trim(); + if !version.is_empty() { + return Ok(Some(version.to_string())); + } + } + + Ok(None) +} diff --git a/crates/surge-ffi/src/lib.rs b/crates/surge-ffi/src/lib.rs index cff2f45..557adc3 100644 --- a/crates/surge-ffi/src/lib.rs +++ b/crates/surge-ffi/src/lib.rs @@ -30,6 +30,7 @@ use std::ptr; use surge_core::lock::mutex::DistributedMutex; use surge_core::supervisor::state::{supervisor_pid_file, supervisor_stop_file, write_restart_args}; +use surge_core::update::status::{UpdateConvergenceState, mark_restart_handoff_converged, read_update_status}; pub use crate::context::{ surge_config_set_lock_server, surge_config_set_resource_budget, surge_config_set_storage, surge_context_create, @@ -272,7 +273,10 @@ pub unsafe extern "C" fn surge_supervisor_start( Some(install_dir), &BTreeMap::new(), ) { - Ok(_) => SURGE_OK, + Ok(_) => { + mark_self_supervised_runtime_converged(install_dir, exe.parent().unwrap_or(install_dir)); + SURGE_OK + } Err(e) => { tracing::error!("supervisor_start failed: {e}"); SURGE_ERROR @@ -281,6 +285,63 @@ pub unsafe extern "C" fn surge_supervisor_start( })) } +fn mark_self_supervised_runtime_converged(install_dir: &Path, active_app_dir: &Path) { + let record = match read_update_status(install_dir) { + Ok(Some(record)) => record, + Ok(None) => return, + Err(e) => { + tracing::warn!( + install_root = %install_dir.display(), + reason = %e, + "Failed reading update status before self-supervisor convergence proof" + ); + return; + } + }; + if !matches!( + record.state, + UpdateConvergenceState::InProgress | UpdateConvergenceState::PendingRestart + ) || record.installed_version.trim() != record.target_version.trim() + { + return; + } + + let runtime_version = match surge_core::install::read_runtime_manifest_version(active_app_dir) { + Ok(Some(version)) => version, + Ok(None) => return, + Err(e) => { + tracing::warn!( + install_root = %install_dir.display(), + app_dir = %active_app_dir.display(), + reason = %e, + "Failed reading runtime manifest before self-supervisor convergence proof" + ); + return; + } + }; + if runtime_version.trim() != record.target_version.trim() { + return; + } + + match mark_restart_handoff_converged(install_dir, &record.target_version) { + Ok(Some(_)) => { + tracing::info!( + version = record.target_version, + "Restart handoff converged after self-supervisor accepted the target runtime" + ); + } + Ok(None) => {} + Err(e) => { + tracing::warn!( + install_root = %install_dir.display(), + version = record.target_version, + reason = %e, + "Failed to record self-supervisor convergence proof" + ); + } + } +} + /// Stop a supervised process watcher. #[unsafe(no_mangle)] pub unsafe extern "C" fn surge_supervisor_stop(install_dir: *const c_char, supervisor_id: *const c_char) -> i32 { @@ -437,3 +498,71 @@ pub unsafe extern "C" fn surge_free_cstring(ptr: *mut c_char) { // documented as `free()`-owned, or is null. unsafe { crate::shared::libc_free(ptr.cast::()) }; } + +#[cfg(test)] +mod tests { + use std::path::{Path, PathBuf}; + + struct TestInstallDir { + path: PathBuf, + } + + impl TestInstallDir { + fn new(name: &str) -> Self { + let unique = format!( + "{name}-{}-{}", + std::process::id(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + ); + let path = std::env::temp_dir().join(unique); + std::fs::create_dir(&path).unwrap(); + Self { path } + } + + fn path(&self) -> &Path { + &self.path + } + } + + impl Drop for TestInstallDir { + fn drop(&mut self) { + let _ = std::fs::remove_dir_all(&self.path); + } + } + + #[test] + fn self_supervisor_converges_pending_status_when_runtime_manifest_matches() { + let tmp = TestInstallDir::new("surge-ffi-self-supervisor-converges"); + let app_dir = tmp.path().join("app"); + let runtime_manifest = app_dir.join(surge_core::install::RUNTIME_MANIFEST_RELATIVE_PATH); + std::fs::create_dir_all(runtime_manifest.parent().unwrap()).unwrap(); + std::fs::write(&runtime_manifest, "id: demo-app\nversion: 2.0.0\nchannel: test\n").unwrap(); + let pending = surge_core::update::status::UpdateStatusRecord::pending_restart_with_failure_phase( + "demo-app", + "2.0.0", + "2.0.0", + "test", + "2026-05-21T10:00:00Z".to_string(), + "2026-05-21T10:00:01Z".to_string(), + "waiting for old child", + surge_core::update::status::RESTART_HANDOFF_WAITING_FOR_OLD_CHILD_PHASE, + ); + surge_core::update::status::write_update_status(tmp.path(), &pending).unwrap(); + + super::mark_self_supervised_runtime_converged(tmp.path(), &app_dir); + + let status = surge_core::update::status::read_update_status(tmp.path()) + .unwrap() + .expect("status should remain present"); + assert_eq!( + status.state, + surge_core::update::status::UpdateConvergenceState::Converged + ); + assert!(status.supervisor_restart_confirmed); + assert_eq!(status.failure_phase, None); + assert_eq!(status.reason, None); + } +} diff --git a/crates/surge-supervisor/src/main.rs b/crates/surge-supervisor/src/main.rs index d7dbb65..26f4c1e 100644 --- a/crates/surge-supervisor/src/main.rs +++ b/crates/surge-supervisor/src/main.rs @@ -184,11 +184,12 @@ fn run_supervisor( let shutdown = install_signal_handlers(); - // Separate one-shot lifecycle args (--surge-*) from regular args. - // On the first child start, all args are passed. After that, lifecycle - // args are drained so crash-restarts don't re-fire lifecycle callbacks. - let mut lifecycle_args: Vec = args.iter().filter(|a| a.starts_with("--surge-")).cloned().collect(); - let regular_args: Vec = args.iter().filter(|a| !a.starts_with("--surge-")).cloned().collect(); + // On the first child start, all args are passed in their original order. + // After that, one-shot lifecycle args are drained so crash-restarts don't + // re-fire lifecycle callbacks. + let first_child_args = args.to_vec(); + let restart_args = without_lifecycle_args(args); + let mut next_child_args = Some(first_child_args); let mut pending_handoff_version = pending_restart_handoff_version(watched_pid, handoff_version, args); let mut watched_pid = watched_pid; @@ -216,8 +217,7 @@ fn run_supervisor( } } - let mut child_args = regular_args.clone(); - child_args.append(&mut lifecycle_args); + let child_args = next_child_args.take().unwrap_or_else(|| restart_args.clone()); tracing::info!("Starting child process: {}", exe_path.display()); @@ -265,11 +265,10 @@ fn run_supervisor( } fn pending_restart_handoff_version( - watched_pid: Option, + _watched_pid: Option, handoff_version: Option<&str>, args: &[String], ) -> Option { - watched_pid?; handoff_version .map(str::trim) .filter(|version| !version.is_empty()) @@ -295,6 +294,34 @@ fn first_run_version(args: &[String]) -> Option { }) } +fn without_lifecycle_args(args: &[String]) -> Vec { + let mut retained = Vec::with_capacity(args.len()); + let mut index = 0usize; + while index < args.len() { + let arg = &args[index]; + if is_lifecycle_arg(arg) { + index += 1; + if lifecycle_arg_takes_value(arg) && args.get(index).is_some_and(|candidate| !candidate.starts_with("--")) { + index += 1; + } + continue; + } + + retained.push(arg.clone()); + index += 1; + } + + retained +} + +fn is_lifecycle_arg(arg: &str) -> bool { + matches!(arg, "--surge-first-run" | "--surge-installed" | "--surge-updated") || arg.starts_with("--surge-updated=") +} + +fn lifecycle_arg_takes_value(arg: &str) -> bool { + matches!(arg, "--surge-first-run" | "--surge-installed" | "--surge-updated") +} + fn record_restart_handoff_converged(install_dir: &Path, version: &str) { match mark_restart_handoff_converged(install_dir, version) { Ok(Some(_)) => { @@ -640,12 +667,28 @@ mod tests { } #[test] - fn pending_handoff_version_requires_watched_pid() { + fn pending_handoff_version_accepts_first_run_without_watched_pid() { let args = vec!["--surge-first-run".to_string(), "2.0.0".to_string()]; - let version = pending_restart_handoff_version(None, Some("2.0.0"), &args); + let version = pending_restart_handoff_version(None, None, &args); - assert_eq!(version, None); + assert_eq!(version.as_deref(), Some("2.0.0")); + } + + #[test] + fn restart_args_drop_lifecycle_flag_value_pairs() { + let args = vec![ + "--app-mode".to_string(), + "service".to_string(), + "--surge-first-run".to_string(), + "2.0.0".to_string(), + "--surge-updated=2.0.0".to_string(), + "--tail".to_string(), + ]; + + let retained = without_lifecycle_args(&args); + + assert_eq!(retained, vec!["--app-mode", "service", "--tail"]); } #[test] @@ -679,6 +722,47 @@ mod tests { assert_eq!(args, vec!["--app-mode"]); } + #[cfg(unix)] + #[test] + fn run_supervisor_preserves_first_run_flag_value_order() { + use std::os::unix::fs::PermissionsExt; + + let tmp = TestInstallDir::new("surge-supervisor-first-run-order"); + let install_dir = tmp.path(); + let args_path = install_dir.join("args.txt"); + let exe_path = install_dir.join("target-child"); + std::fs::write( + &exe_path, + format!( + "#!/bin/sh\n\ + printf '%s\\n' \"$@\" > '{}'\n", + args_path.display() + ), + ) + .unwrap(); + let mut permissions = std::fs::metadata(&exe_path).unwrap().permissions(); + permissions.set_mode(0o755); + std::fs::set_permissions(&exe_path, permissions).unwrap(); + + run_supervisor( + "demo-supervisor", + install_dir, + &exe_path, + &[ + "--app-mode".to_string(), + "service".to_string(), + "--surge-first-run".to_string(), + "2.0.0".to_string(), + ], + None, + None, + ) + .unwrap(); + + let args = std::fs::read_to_string(args_path).unwrap(); + assert_eq!(args, "--app-mode\nservice\n--surge-first-run\n2.0.0\n"); + } + #[cfg(unix)] #[test] fn watched_handoff_converges_when_target_child_survives_stability_window() { From dcd91f48db081b8e008dbcbbffb96dc5d507953e Mon Sep 17 00:00:00 2001 From: Peter Rekdal Khan-Sunde Date: Thu, 21 May 2026 13:33:30 +0200 Subject: [PATCH 2/3] Fix current runtime proof verification Accept a running app process as current when it is executing the active app binary and the node-local Surge update status is converged for the selected version. The previous probe required transient --surge-first-run command-line proof, so package-current force installs could restart successfully and still fail verification after the supervisor switched to its normal watch command. Validated with the focused surge-cli process verification tests and a live package-current force install against selsbakk-master. --- crates/surge-cli/src/commands/install/mod.rs | 6 ++++++ .../src/commands/install/remote/runtime.rs | 13 +++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/crates/surge-cli/src/commands/install/mod.rs b/crates/surge-cli/src/commands/install/mod.rs index 0e55f87..24b3bb4 100644 --- a/crates/surge-cli/src/commands/install/mod.rs +++ b/crates/surge-cli/src/commands/install/mod.rs @@ -1146,10 +1146,16 @@ mod tests { ); assert!(probe.contains("active_exe=\"$install_root/app/$main_exe\"")); + assert!(probe.contains("status_file=\"$install_root/.surge-update-status.json\"")); + assert!(probe.contains("status_converged=0")); assert!(probe.contains("contains_target_first_run()")); assert!(probe.contains("contains_target_version_arg()")); + assert!(probe.contains("process_exe_matches_active()")); assert!(probe.contains("extract_watched_pid()")); assert!(probe.contains("*\" --surge-first-run $version \"*|*\" $version --surge-first-run \"*")); + assert!(probe.contains("*'\"state\":\"converged\"'*")); + assert!(probe.contains("*'\"installed_version\":\"'\"$version\"'\"'*")); + assert!(probe.contains("[ \"$status_converged\" -eq 1 ] && process_exe_matches_active \"$pid\"")); assert!(probe.contains("target_app_pids")); assert!(probe.contains("target_supervisor_seen")); assert!(probe.contains("surge-supervisor")); diff --git a/crates/surge-cli/src/commands/install/remote/runtime.rs b/crates/surge-cli/src/commands/install/remote/runtime.rs index 97fca3a..1208827 100644 --- a/crates/surge-cli/src/commands/install/remote/runtime.rs +++ b/crates/surge-cli/src/commands/install/remote/runtime.rs @@ -245,10 +245,19 @@ pub(crate) fn build_remote_process_verification_probe( ) -> String { format!( r#"install_root={}; main_exe={}; supervisor_id={}; version={}; -active_exe="$install_root/app/$main_exe"; app_seen=0; target_app_seen=0; stale_app_seen=0; supervisor_seen=0; stale_supervisor_seen=0; waiting_supervisor_seen=0; target_supervisor_seen=0; target_app_pids=""; watched_pids=""; +active_exe="$install_root/app/$main_exe"; status_file="$install_root/.surge-update-status.json"; app_seen=0; target_app_seen=0; stale_app_seen=0; supervisor_seen=0; stale_supervisor_seen=0; waiting_supervisor_seen=0; target_supervisor_seen=0; target_app_pids=""; watched_pids=""; status_converged=0; +if [ -r "$status_file" ]; then + status_compact="$(tr -d '[:space:]' < "$status_file" 2>/dev/null || true)"; + status_has_state=0; status_has_installed=0; status_has_target=0; + case "$status_compact" in *'"state":"converged"'*) status_has_state=1 ;; esac; + case "$status_compact" in *'"installed_version":"'"$version"'"'*) status_has_installed=1 ;; esac; + case "$status_compact" in *'"target_version":"'"$version"'"'*) status_has_target=1 ;; esac; + if [ "$status_has_state" -eq 1 ] && [ "$status_has_installed" -eq 1 ] && [ "$status_has_target" -eq 1 ]; then status_converged=1; fi; +fi; contains_target_first_run() {{ cmd_tokens=" $1 "; case "$cmd_tokens" in *" --surge-first-run $version "*|*" $version --surge-first-run "*) return 0 ;; esac; return 1; }} contains_target_version_arg() {{ cmd_tokens=" $1 "; case "$cmd_tokens" in *" $version "*) return 0 ;; esac; return 1; }} contains_target_proof() {{ contains_target_first_run "$1" || contains_target_version_arg "$1"; }} +process_exe_matches_active() {{ actual="$(readlink "/proc/$1/exe" 2>/dev/null || true)"; [ "$actual" = "$active_exe" ]; }} extract_watched_pid() {{ case "$1" in *" watch "*" --pid "*) rest="${{1#* --pid }}"; watched_pid="${{rest%% *}}"; case "$watched_pid" in ""|*[!0-9]*) return 1 ;; esac; printf '%s\n' "$watched_pid"; return 0 ;; esac; return 1; }} for cmdline in /proc/[0-9]*/cmdline; do [ -r "$cmdline" ] || continue; @@ -256,7 +265,7 @@ for cmdline in /proc/[0-9]*/cmdline; do case "$pid" in "$$"|"$PPID") continue ;; esac; cmd="$(tr '\0' ' ' < "$cmdline" 2>/dev/null || true)"; [ -n "$cmd" ] || continue; - case "$cmd" in *"surge-supervisor"*) ;; *"$active_exe"*) app_seen=1; if contains_target_proof "$cmd"; then target_app_seen=1; target_app_pids="${{target_app_pids}}${{pid}} "; else stale_app_seen=1; fi ;; esac; + case "$cmd" in *"surge-supervisor"*) ;; *"$active_exe"*) app_seen=1; if contains_target_proof "$cmd" || {{ [ "$status_converged" -eq 1 ] && process_exe_matches_active "$pid"; }}; then target_app_seen=1; target_app_pids="${{target_app_pids}}${{pid}} "; else stale_app_seen=1; fi ;; esac; if [ -n "$supervisor_id" ]; then case "$cmd" in *"surge-supervisor"*"--id $supervisor_id"*) supervisor_seen=1; From 82b5f4128de087166894820153c79b9a0029e0dc Mon Sep 17 00:00:00 2001 From: Peter Rekdal Khan-Sunde Date: Wed, 27 May 2026 19:37:08 +0200 Subject: [PATCH 3/3] Split supervisor restart handoff helpers --- crates/surge-supervisor/src/handoff.rs | 161 ++++++++++++++++++++++++ crates/surge-supervisor/src/main.rs | 163 +------------------------ 2 files changed, 167 insertions(+), 157 deletions(-) create mode 100644 crates/surge-supervisor/src/handoff.rs diff --git a/crates/surge-supervisor/src/handoff.rs b/crates/surge-supervisor/src/handoff.rs new file mode 100644 index 0000000..25d0c5a --- /dev/null +++ b/crates/surge-supervisor/src/handoff.rs @@ -0,0 +1,161 @@ +use std::path::Path; +use std::process::ExitStatus; + +use surge_core::update::status::{ + RESTART_HANDOFF_TARGET_CHILD_EXITED_PHASE, mark_restart_handoff_converged, mark_restart_handoff_pending, +}; + +pub(super) fn pending_restart_handoff_version( + _watched_pid: Option, + handoff_version: Option<&str>, + args: &[String], +) -> Option { + handoff_version + .map(str::trim) + .filter(|version| !version.is_empty()) + .map(ToOwned::to_owned) + .or_else(|| first_run_version(args)) +} + +fn first_run_version(args: &[String]) -> Option { + args.iter().enumerate().find_map(|(index, arg)| { + if arg != "--surge-first-run" { + return None; + } + args.get(index + 1) + .filter(|candidate| !candidate.starts_with("--")) + .or_else(|| { + index + .checked_sub(1) + .and_then(|prev| args.get(prev)) + .filter(|candidate| !candidate.starts_with("--")) + }) + .map(|version| version.trim().to_string()) + .filter(|version| !version.is_empty()) + }) +} + +pub(super) fn without_lifecycle_args(args: &[String]) -> Vec { + let mut retained = Vec::with_capacity(args.len()); + let mut index = 0usize; + while index < args.len() { + let arg = &args[index]; + if is_lifecycle_arg(arg) { + index += 1; + if lifecycle_arg_takes_value(arg) && args.get(index).is_some_and(|candidate| !candidate.starts_with("--")) { + index += 1; + } + continue; + } + + retained.push(arg.clone()); + index += 1; + } + + retained +} + +fn is_lifecycle_arg(arg: &str) -> bool { + matches!(arg, "--surge-first-run" | "--surge-installed" | "--surge-updated") || arg.starts_with("--surge-updated=") +} + +fn lifecycle_arg_takes_value(arg: &str) -> bool { + matches!(arg, "--surge-first-run" | "--surge-installed" | "--surge-updated") +} + +pub(super) fn record_restart_handoff_converged(install_dir: &Path, version: &str) { + match mark_restart_handoff_converged(install_dir, version) { + Ok(Some(_)) => { + tracing::info!(version, "Restart handoff converged after target child startup"); + } + Ok(None) => { + tracing::warn!( + install_root = %install_dir.display(), + version, + reason = "no matching pending restart handoff status record", + "Failed to record restart handoff convergence" + ); + } + Err(e) => { + tracing::warn!( + install_root = %install_dir.display(), + version, + reason = %e, + "Failed to record restart handoff convergence" + ); + } + } +} + +pub(super) fn record_restart_handoff_child_exited(install_dir: &Path, version: &str, status: ExitStatus) { + let reason = format!("target child exited with {status} before restart handoff completed"); + match mark_restart_handoff_pending(install_dir, version, &reason, RESTART_HANDOFF_TARGET_CHILD_EXITED_PHASE) { + Ok(Some(_)) => { + tracing::warn!(version, %status, "Restart handoff target child exited before startup proof completed"); + } + Ok(None) => { + tracing::warn!( + install_root = %install_dir.display(), + version, + reason = "no matching pending restart handoff status record", + "Failed to record restart handoff child exit" + ); + } + Err(e) => { + tracing::warn!( + install_root = %install_dir.display(), + version, + reason = %e, + "Failed to record restart handoff child exit" + ); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn pending_handoff_version_uses_explicit_watch_target() { + let args = vec!["--app-mode".to_string()]; + + let version = pending_restart_handoff_version(Some(42), Some(" 2.0.0 "), &args); + + assert_eq!(version.as_deref(), Some("2.0.0")); + } + + #[test] + fn pending_handoff_version_falls_back_to_first_run_arg() { + let args = vec!["--surge-first-run".to_string(), "2.0.0".to_string()]; + + let version = pending_restart_handoff_version(Some(42), None, &args); + + assert_eq!(version.as_deref(), Some("2.0.0")); + } + + #[test] + fn pending_handoff_version_accepts_first_run_without_watched_pid() { + let args = vec!["--surge-first-run".to_string(), "2.0.0".to_string()]; + + let version = pending_restart_handoff_version(None, None, &args); + + assert_eq!(version.as_deref(), Some("2.0.0")); + } + + #[test] + fn restart_args_drop_lifecycle_flag_value_pairs() { + let args = vec![ + "--app-mode".to_string(), + "service".to_string(), + "--surge-first-run".to_string(), + "2.0.0".to_string(), + "--surge-updated=2.0.0".to_string(), + "--tail".to_string(), + ]; + + let retained = without_lifecycle_args(&args); + + assert_eq!(retained, vec!["--app-mode", "service", "--tail"]); + } +} diff --git a/crates/surge-supervisor/src/main.rs b/crates/surge-supervisor/src/main.rs index 26f4c1e..a8a3030 100644 --- a/crates/surge-supervisor/src/main.rs +++ b/crates/surge-supervisor/src/main.rs @@ -6,13 +6,12 @@ use std::process::{Child, Command, ExitCode, ExitStatus}; use clap::{Parser, Subcommand}; use surge_core::supervisor::state::{supervisor_pid_file, supervisor_stop_file}; -use surge_core::update::status::{ - RESTART_HANDOFF_TARGET_CHILD_EXITED_PHASE, mark_restart_handoff_converged, mark_restart_handoff_pending, -}; #[cfg(windows)] use sysinfo::{Pid, ProcessesToUpdate, System}; use thiserror::Error; +mod handoff; + const RESTART_HANDOFF_STABILITY_WINDOW: std::time::Duration = std::time::Duration::from_secs(4); #[derive(Parser)] @@ -188,9 +187,9 @@ fn run_supervisor( // After that, one-shot lifecycle args are drained so crash-restarts don't // re-fire lifecycle callbacks. let first_child_args = args.to_vec(); - let restart_args = without_lifecycle_args(args); + let restart_args = handoff::without_lifecycle_args(args); let mut next_child_args = Some(first_child_args); - let mut pending_handoff_version = pending_restart_handoff_version(watched_pid, handoff_version, args); + let mut pending_handoff_version = handoff::pending_restart_handoff_version(watched_pid, handoff_version, args); let mut watched_pid = watched_pid; loop { @@ -264,113 +263,6 @@ fn run_supervisor( Ok(()) } -fn pending_restart_handoff_version( - _watched_pid: Option, - handoff_version: Option<&str>, - args: &[String], -) -> Option { - handoff_version - .map(str::trim) - .filter(|version| !version.is_empty()) - .map(ToOwned::to_owned) - .or_else(|| first_run_version(args)) -} - -fn first_run_version(args: &[String]) -> Option { - args.iter().enumerate().find_map(|(index, arg)| { - if arg != "--surge-first-run" { - return None; - } - args.get(index + 1) - .filter(|candidate| !candidate.starts_with("--")) - .or_else(|| { - index - .checked_sub(1) - .and_then(|prev| args.get(prev)) - .filter(|candidate| !candidate.starts_with("--")) - }) - .map(|version| version.trim().to_string()) - .filter(|version| !version.is_empty()) - }) -} - -fn without_lifecycle_args(args: &[String]) -> Vec { - let mut retained = Vec::with_capacity(args.len()); - let mut index = 0usize; - while index < args.len() { - let arg = &args[index]; - if is_lifecycle_arg(arg) { - index += 1; - if lifecycle_arg_takes_value(arg) && args.get(index).is_some_and(|candidate| !candidate.starts_with("--")) { - index += 1; - } - continue; - } - - retained.push(arg.clone()); - index += 1; - } - - retained -} - -fn is_lifecycle_arg(arg: &str) -> bool { - matches!(arg, "--surge-first-run" | "--surge-installed" | "--surge-updated") || arg.starts_with("--surge-updated=") -} - -fn lifecycle_arg_takes_value(arg: &str) -> bool { - matches!(arg, "--surge-first-run" | "--surge-installed" | "--surge-updated") -} - -fn record_restart_handoff_converged(install_dir: &Path, version: &str) { - match mark_restart_handoff_converged(install_dir, version) { - Ok(Some(_)) => { - tracing::info!(version, "Restart handoff converged after target child startup"); - } - Ok(None) => { - tracing::warn!( - install_root = %install_dir.display(), - version, - reason = "no matching pending restart handoff status record", - "Failed to record restart handoff convergence" - ); - } - Err(e) => { - tracing::warn!( - install_root = %install_dir.display(), - version, - reason = %e, - "Failed to record restart handoff convergence" - ); - } - } -} - -fn record_restart_handoff_child_exited(install_dir: &Path, version: &str, status: ExitStatus) { - let reason = format!("target child exited with {status} before restart handoff completed"); - match mark_restart_handoff_pending(install_dir, version, &reason, RESTART_HANDOFF_TARGET_CHILD_EXITED_PHASE) { - Ok(Some(_)) => { - tracing::warn!(version, %status, "Restart handoff target child exited before startup proof completed"); - } - Ok(None) => { - tracing::warn!( - install_root = %install_dir.display(), - version, - reason = "no matching pending restart handoff status record", - "Failed to record restart handoff child exit" - ); - } - Err(e) => { - tracing::warn!( - install_root = %install_dir.display(), - version, - reason = %e, - "Failed to record restart handoff child exit" - ); - } - } -} - fn wait_for_supervised_child( child: &mut Child, shutdown: &std::sync::Arc, @@ -384,12 +276,12 @@ fn wait_for_supervised_child( match wait_for_child_startup_or_stop(child, shutdown, stop_file, RESTART_HANDOFF_STABILITY_WINDOW)? { StartupOutcome::Running => { - record_restart_handoff_converged(install_dir, &version); + handoff::record_restart_handoff_converged(install_dir, &version); *pending_handoff_version = None; wait_for_child_exit_status(child, shutdown, stop_file) } StartupOutcome::Exited(status) => { - record_restart_handoff_child_exited(install_dir, &version, status); + handoff::record_restart_handoff_child_exited(install_dir, &version, status); Ok(Some(status)) } StartupOutcome::StopRequested => { @@ -648,49 +540,6 @@ mod tests { } } - #[test] - fn pending_handoff_version_uses_explicit_watch_target() { - let args = vec!["--app-mode".to_string()]; - - let version = pending_restart_handoff_version(Some(42), Some(" 2.0.0 "), &args); - - assert_eq!(version.as_deref(), Some("2.0.0")); - } - - #[test] - fn pending_handoff_version_falls_back_to_first_run_arg() { - let args = vec!["--surge-first-run".to_string(), "2.0.0".to_string()]; - - let version = pending_restart_handoff_version(Some(42), None, &args); - - assert_eq!(version.as_deref(), Some("2.0.0")); - } - - #[test] - fn pending_handoff_version_accepts_first_run_without_watched_pid() { - let args = vec!["--surge-first-run".to_string(), "2.0.0".to_string()]; - - let version = pending_restart_handoff_version(None, None, &args); - - assert_eq!(version.as_deref(), Some("2.0.0")); - } - - #[test] - fn restart_args_drop_lifecycle_flag_value_pairs() { - let args = vec![ - "--app-mode".to_string(), - "service".to_string(), - "--surge-first-run".to_string(), - "2.0.0".to_string(), - "--surge-updated=2.0.0".to_string(), - "--tail".to_string(), - ]; - - let retained = without_lifecycle_args(&args); - - assert_eq!(retained, vec!["--app-mode", "service", "--tail"]); - } - #[test] fn watch_command_accepts_handoff_version_before_trailing_args() { let cli = Cli::try_parse_from([