From 90c8ba8582a33d622d0c630a5d09899619f3d1ec Mon Sep 17 00:00:00 2001 From: Mathieu David Date: Sat, 20 Jun 2026 18:01:53 +0200 Subject: [PATCH 1/3] fix: keep the action server's borrowed clock alive An action server's expired-goals timer borrows the node's clock by raw pointer (passed to rcl_action_server_init), and rcl_action_notify_goal_done -- called whenever a goal terminates or is dropped -- dereferences it. But ActionServerHandle kept only the node handle alive, not the clock (which is owned by the node's NodeState). A goal holds an Arc, so a goal that outlives the node -- e.g. a goal-handler future still being torn down at executor/context shutdown -- would, on drop, call rcl_action_notify_goal_done against a freed clock: an intermittent use-after-free at teardown (manifesting as a "transition_to_aborted RCL_RET_ERROR" log or a SIGSEGV). Keep the clock alive by holding it in ActionServerHandle so it lives as long as the rcl_action_server and any goal handle that references it. Adds a regression test asserting the server takes a reference to the clock it borrows; without the fix it fails deterministically. The full single-process test suite, which segfaulted ~25-33% of runs before, was clean across 32 runs under gdb after. --- rclrs/src/action.rs | 39 +++++++++++++++++++++++++++++++ rclrs/src/action/action_server.rs | 20 ++++++++++++---- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/rclrs/src/action.rs b/rclrs/src/action.rs index b123e044..1daecd91 100644 --- a/rclrs/src/action.rs +++ b/rclrs/src/action.rs @@ -264,6 +264,45 @@ mod tests { use std::time::Duration; use tokio::sync::mpsc::unbounded_channel; + /// Regression: the action server's expired-goals timer borrows the node's + /// clock by raw pointer (`rcl_action_server_init`), and + /// `rcl_action_notify_goal_done` — called whenever a goal terminates or is + /// dropped — dereferences it. The server must keep that clock alive for as + /// long as it, and any goal handle holding an `Arc`, + /// lives; otherwise a goal that outlives the node (e.g. a goal-handler future + /// still being torn down) dereferences a freed clock in its `Drop` — an + /// intermittent use-after-free at teardown. + /// + /// We verify the invariant directly: creating the server must take a strong + /// reference to the node clock it borrows. Without the fix this is byte-for- + /// byte the precondition for the crash, and the assertion fails. + #[test] + fn action_server_holds_a_reference_to_its_borrowed_clock() { + let executor = Context::default().create_basic_executor(); + let node = executor + .create_node(&format!("test_action_clock_uaf_{}", line!())) + .unwrap(); + let action_name = format!("test_action_clock_uaf_{}_action", line!()); + + let clock = node.get_clock(); + let before = std::sync::Arc::strong_count(clock.get_rcl_clock()); + + let _action_server = node + .create_action_server(&action_name, |handle| { + fibonacci_action(handle, TestActionSettings::default()) + }) + .unwrap(); + + let after = std::sync::Arc::strong_count(clock.get_rcl_clock()); + assert!( + after > before, + "action server does not hold a reference to the node clock it borrows \ + by raw pointer (strong count before={before}, after={after}); a goal \ + outliving the node would hit a use-after-free in \ + rcl_action_notify_goal_done", + ); + } + #[test] fn test_action_server_availability() { let mut executor = Context::default().create_basic_executor(); diff --git a/rclrs/src/action/action_server.rs b/rclrs/src/action/action_server.rs index 468eff10..a1e3315b 100644 --- a/rclrs/src/action/action_server.rs +++ b/rclrs/src/action/action_server.rs @@ -1,9 +1,9 @@ use super::empty_goal_status_array; use crate::{ action::GoalUuid, error::ToResult, rcl_bindings::*, ActionGoalReceiver, CancelResponseCode, - DropGuard, GoalStatusCode, Node, NodeHandle, QoSProfile, RclPrimitive, RclPrimitiveHandle, - RclPrimitiveKind, RclrsError, ReadyKind, TakeFailedAsNone, Waitable, WaitableLifecycle, - ENTITY_LIFECYCLE_MUTEX, + Clock, DropGuard, GoalStatusCode, Node, NodeHandle, QoSProfile, RclPrimitive, + RclPrimitiveHandle, RclPrimitiveKind, RclrsError, ReadyKind, TakeFailedAsNone, Waitable, + WaitableLifecycle, ENTITY_LIFECYCLE_MUTEX, }; use futures::future::BoxFuture; use ros_env::action_msgs::srv::CancelGoal_Response; @@ -286,9 +286,12 @@ impl ActionServerState { let action_server_options = (&options).into(); + // The rcl_action_server borrows this clock by raw pointer for its + // expired-goals timer (see `rcl_action_server_init` below), so the handle + // must keep it alive — kept past this block for that reason. + let clock = node.get_clock(); { let mut rcl_node = node.handle().rcl_node.lock().unwrap(); - let clock = node.get_clock(); let rcl_clock = clock.get_rcl_clock(); let mut rcl_clock = rcl_clock.lock().unwrap(); let _lifecycle_lock = ENTITY_LIFECYCLE_MUTEX.lock().unwrap(); @@ -316,6 +319,7 @@ impl ActionServerState { let handle = Arc::new(ActionServerHandle { rcl_action_server: Mutex::new(rcl_action_server), node_handle: Arc::clone(&node.handle()), + _clock: clock, goals: Default::default(), }); @@ -677,6 +681,14 @@ pub(crate) struct ActionServerHandle { /// Ensure the node remains active while the action server is running. #[allow(unused)] node_handle: Arc, + /// The clock the `rcl_action_server` borrows by raw pointer for its + /// expired-goals timer. We must keep it alive for the lifetime of the server, + /// including for any goal handle that outlives the node: a goal holds an + /// `Arc` and, on drop, calls `rcl_action_notify_goal_done` + /// (which touches this clock). Without holding it here, a goal dropped after + /// the node has been torn down would dereference a freed clock. + #[allow(dead_code)] + _clock: Clock, /// Ensure the `impl_*` of the action server goals remain valid until they /// have expired or until the rcl_action_server_t gets fini-ed. goals: Mutex>>>, From 48c928f38dcd322caeca1a9bcb8af9c16e253573 Mon Sep 17 00:00:00 2001 From: Mathieu David Date: Sat, 20 Jun 2026 18:07:54 +0200 Subject: [PATCH 2/3] fix: finalize the rcl action server on drop ActionServerHandle had no Drop, so rcl_action_server_fini was never called: the rcl action server -- and its underlying services, publishers, and expired-goals timer -- leaked every time an action server was dropped. The action *client* already finalizes itself this way (ActionClientHandle::drop), so the server was an oversight. Add a Drop that finalizes the server, mirroring the action client and the service/client handles. It runs before the struct's fields are dropped, so the clock the handle now holds is still alive when rcl_action_server_fini touches the expired-goals timer. --- rclrs/src/action/action_server.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/rclrs/src/action/action_server.rs b/rclrs/src/action/action_server.rs index a1e3315b..142be40a 100644 --- a/rclrs/src/action/action_server.rs +++ b/rclrs/src/action/action_server.rs @@ -698,6 +698,23 @@ pub(crate) struct ActionServerHandle { // they are running in. Therefore, this type can be safely sent to another thread. unsafe impl Send for rcl_action_server_t {} +impl Drop for ActionServerHandle { + fn drop(&mut self) { + // Finalize the rcl action server (mirrors `ActionClientHandle`'s Drop). + // This runs before the struct's fields are dropped, so `_clock` is still + // alive — `rcl_action_server_fini` touches the expired-goals timer, which + // borrows that clock. + let rcl_action_server = self.rcl_action_server.get_mut().unwrap(); + let mut rcl_node = self.node_handle.rcl_node.lock().unwrap(); + let _lifecycle_lock = ENTITY_LIFECYCLE_MUTEX.lock().unwrap(); + // SAFETY: The entity lifecycle mutex is locked to protect against the risk of + // global variables in the rmw implementation being unsafely modified during cleanup. + unsafe { + rcl_action_server_fini(rcl_action_server, &mut *rcl_node); + } + } +} + impl ActionServerHandle { pub(super) fn lock(&self) -> MutexGuard<'_, rcl_action_server_t> { self.rcl_action_server.lock().unwrap() From 8ffffecf0af9249dc89da3af77264ef8d1e0daf3 Mon Sep 17 00:00:00 2001 From: Mathieu David Date: Mon, 22 Jun 2026 20:36:55 +0200 Subject: [PATCH 3/3] PR feedback --- rclrs/src/action/action_server.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rclrs/src/action/action_server.rs b/rclrs/src/action/action_server.rs index 142be40a..9c374755 100644 --- a/rclrs/src/action/action_server.rs +++ b/rclrs/src/action/action_server.rs @@ -319,7 +319,7 @@ impl ActionServerState { let handle = Arc::new(ActionServerHandle { rcl_action_server: Mutex::new(rcl_action_server), node_handle: Arc::clone(&node.handle()), - _clock: clock, + clock: clock, goals: Default::default(), }); @@ -687,8 +687,8 @@ pub(crate) struct ActionServerHandle { /// `Arc` and, on drop, calls `rcl_action_notify_goal_done` /// (which touches this clock). Without holding it here, a goal dropped after /// the node has been torn down would dereference a freed clock. - #[allow(dead_code)] - _clock: Clock, + #[allow(unused)] + clock: Clock, /// Ensure the `impl_*` of the action server goals remain valid until they /// have expired or until the rcl_action_server_t gets fini-ed. goals: Mutex>>>,