diff --git a/rclrs/src/action.rs b/rclrs/src/action.rs index b123e044..1daecd91 100644 --- a/rclrs/src/action.rs +++ b/rclrs/src/action.rs @@ -264,6 +264,45 @@ mod tests { use std::time::Duration; use tokio::sync::mpsc::unbounded_channel; + /// Regression: the action server's expired-goals timer borrows the node's + /// clock by raw pointer (`rcl_action_server_init`), and + /// `rcl_action_notify_goal_done` — called whenever a goal terminates or is + /// dropped — dereferences it. The server must keep that clock alive for as + /// long as it, and any goal handle holding an `Arc`, + /// lives; otherwise a goal that outlives the node (e.g. a goal-handler future + /// still being torn down) dereferences a freed clock in its `Drop` — an + /// intermittent use-after-free at teardown. + /// + /// We verify the invariant directly: creating the server must take a strong + /// reference to the node clock it borrows. Without the fix this is byte-for- + /// byte the precondition for the crash, and the assertion fails. + #[test] + fn action_server_holds_a_reference_to_its_borrowed_clock() { + let executor = Context::default().create_basic_executor(); + let node = executor + .create_node(&format!("test_action_clock_uaf_{}", line!())) + .unwrap(); + let action_name = format!("test_action_clock_uaf_{}_action", line!()); + + let clock = node.get_clock(); + let before = std::sync::Arc::strong_count(clock.get_rcl_clock()); + + let _action_server = node + .create_action_server(&action_name, |handle| { + fibonacci_action(handle, TestActionSettings::default()) + }) + .unwrap(); + + let after = std::sync::Arc::strong_count(clock.get_rcl_clock()); + assert!( + after > before, + "action server does not hold a reference to the node clock it borrows \ + by raw pointer (strong count before={before}, after={after}); a goal \ + outliving the node would hit a use-after-free in \ + rcl_action_notify_goal_done", + ); + } + #[test] fn test_action_server_availability() { let mut executor = Context::default().create_basic_executor(); diff --git a/rclrs/src/action/action_server.rs b/rclrs/src/action/action_server.rs index 468eff10..9c374755 100644 --- a/rclrs/src/action/action_server.rs +++ b/rclrs/src/action/action_server.rs @@ -1,9 +1,9 @@ use super::empty_goal_status_array; use crate::{ action::GoalUuid, error::ToResult, rcl_bindings::*, ActionGoalReceiver, CancelResponseCode, - DropGuard, GoalStatusCode, Node, NodeHandle, QoSProfile, RclPrimitive, RclPrimitiveHandle, - RclPrimitiveKind, RclrsError, ReadyKind, TakeFailedAsNone, Waitable, WaitableLifecycle, - ENTITY_LIFECYCLE_MUTEX, + Clock, DropGuard, GoalStatusCode, Node, NodeHandle, QoSProfile, RclPrimitive, + RclPrimitiveHandle, RclPrimitiveKind, RclrsError, ReadyKind, TakeFailedAsNone, Waitable, + WaitableLifecycle, ENTITY_LIFECYCLE_MUTEX, }; use futures::future::BoxFuture; use ros_env::action_msgs::srv::CancelGoal_Response; @@ -286,9 +286,12 @@ impl ActionServerState { let action_server_options = (&options).into(); + // The rcl_action_server borrows this clock by raw pointer for its + // expired-goals timer (see `rcl_action_server_init` below), so the handle + // must keep it alive — kept past this block for that reason. + let clock = node.get_clock(); { let mut rcl_node = node.handle().rcl_node.lock().unwrap(); - let clock = node.get_clock(); let rcl_clock = clock.get_rcl_clock(); let mut rcl_clock = rcl_clock.lock().unwrap(); let _lifecycle_lock = ENTITY_LIFECYCLE_MUTEX.lock().unwrap(); @@ -316,6 +319,7 @@ impl ActionServerState { let handle = Arc::new(ActionServerHandle { rcl_action_server: Mutex::new(rcl_action_server), node_handle: Arc::clone(&node.handle()), + clock: clock, goals: Default::default(), }); @@ -677,6 +681,14 @@ pub(crate) struct ActionServerHandle { /// Ensure the node remains active while the action server is running. #[allow(unused)] node_handle: Arc, + /// The clock the `rcl_action_server` borrows by raw pointer for its + /// expired-goals timer. We must keep it alive for the lifetime of the server, + /// including for any goal handle that outlives the node: a goal holds an + /// `Arc` and, on drop, calls `rcl_action_notify_goal_done` + /// (which touches this clock). Without holding it here, a goal dropped after + /// the node has been torn down would dereference a freed clock. + #[allow(unused)] + clock: Clock, /// Ensure the `impl_*` of the action server goals remain valid until they /// have expired or until the rcl_action_server_t gets fini-ed. goals: Mutex>>>, @@ -686,6 +698,23 @@ pub(crate) struct ActionServerHandle { // they are running in. Therefore, this type can be safely sent to another thread. unsafe impl Send for rcl_action_server_t {} +impl Drop for ActionServerHandle { + fn drop(&mut self) { + // Finalize the rcl action server (mirrors `ActionClientHandle`'s Drop). + // This runs before the struct's fields are dropped, so `_clock` is still + // alive — `rcl_action_server_fini` touches the expired-goals timer, which + // borrows that clock. + let rcl_action_server = self.rcl_action_server.get_mut().unwrap(); + let mut rcl_node = self.node_handle.rcl_node.lock().unwrap(); + let _lifecycle_lock = ENTITY_LIFECYCLE_MUTEX.lock().unwrap(); + // SAFETY: The entity lifecycle mutex is locked to protect against the risk of + // global variables in the rmw implementation being unsafely modified during cleanup. + unsafe { + rcl_action_server_fini(rcl_action_server, &mut *rcl_node); + } + } +} + impl ActionServerHandle { pub(super) fn lock(&self) -> MutexGuard<'_, rcl_action_server_t> { self.rcl_action_server.lock().unwrap()