From 5e9347767e08679183367da875dd4fb6c358e365 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Fri, 17 Apr 2026 21:30:58 +0000 Subject: [PATCH 1/6] fix UAF condition of single-instance TA --- litebox_runner_lvbs/src/lib.rs | 44 ++++++++++++++++++++++--------- litebox_shim_optee/src/session.rs | 6 ++++- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index ee598e4be..0eb56074c 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -547,10 +547,17 @@ fn open_session_single_instance( ) -> Result<(), OpteeSmcReturnCode> { // Use try_lock to avoid spinning - return EThreadLimit if TA is in use // The Linux driver will handle this by waiting and retrying - let instance = instance_arc + let mut instance = instance_arc .try_lock() .ok_or(OpteeSmcReturnCode::EThreadLimit)?; + // `task_page_table_id == None` means the instance was destroyed by a concurrent + // close/panic between our cache lookup and acquiring the instance lock. + // Return `EThreadLimit` so the Linux driver retries. + let task_pt_id = instance + .task_page_table_id + .ok_or(OpteeSmcReturnCode::EThreadLimit)?; + // Allocate session ID BEFORE calling load_ta_context so TA gets correct ID. // Use SessionIdGuard to ensure the ID is recycled on any error path // (before it is registered with the session manager). @@ -562,11 +569,10 @@ fn open_session_single_instance( debug_serial_println!( "Reusing single-instance TA: uuid={:?}, task_pt_id={}, session_id={}", ta_uuid, - instance.task_page_table_id, + task_pt_id, runner_session_id ); - let task_pt_id = instance.task_page_table_id; let ta_flags = instance.loaded_program.ta_flags; // Switch to the existing TA's page table @@ -642,6 +648,9 @@ fn open_session_single_instance( )?; session_manager().remove_single_instance(&ta_uuid); + // Invalidate the id in the struct *before* teardown so any + // future lock holder observes `None` and bails. 
+ instance.task_page_table_id = None; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. @@ -780,9 +789,6 @@ fn open_session_new_instance( "ldelf/TA_CreateEntryPoint failed: return_code={:?}", ldelf_return_code ); - // Safety: We are about to tear down this TA instance; - // no references to user-space memory will be held afterwards. - unsafe { teardown_ta_page_table(&shim, task_pt_id) }; // Write error response back to normal world write_msg_args_to_normal_world( @@ -794,6 +800,10 @@ fn open_session_new_instance( Some(ta_req_info), )?; + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + return Ok(()); } @@ -879,7 +889,7 @@ fn open_session_new_instance( let instance = Arc::new(SpinMutex::new(TaInstance { shim, loaded_program, - task_page_table_id: task_pt_id, + task_page_table_id: Some(task_pt_id), })); // Cache single-instance TAs for future sessions @@ -936,11 +946,14 @@ fn handle_invoke_command( .ok_or(OpteeSmcReturnCode::EBadCmd)?; // Use try_lock to avoid spinning - return EThreadLimit if TA is in use // The Linux driver will handle this by waiting and retrying - let Some(instance) = session_entry.instance.try_lock() else { + let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - - let task_pt_id = instance.task_page_table_id; + // `task_page_table_id == None` means the instance was destroyed. + // Return `EBadCmd` because the client must start over with OpenSession. + let task_pt_id = instance + .task_page_table_id + .ok_or(OpteeSmcReturnCode::EBadCmd)?; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? 
}; @@ -1021,6 +1034,7 @@ fn handle_invoke_command( if ta_flags.is_single_instance() { session_manager().remove_single_instance(&ta_uuid); } + instance.task_page_table_id = None; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. @@ -1079,11 +1093,14 @@ fn handle_close_session( .ok_or(OpteeSmcReturnCode::EBadCmd)?; // Use try_lock to avoid spinning - return EThreadLimit if TA is in use // The Linux driver will handle this by waiting and retrying - let Some(instance) = session_entry.instance.try_lock() else { + let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - - let task_pt_id = instance.task_page_table_id; + // `task_page_table_id == None` means the instance was destroyed. + // Return `EBadCmd` because the client must start over with OpenSession. + let task_pt_id = instance + .task_page_table_id + .ok_or(OpteeSmcReturnCode::EBadCmd)?; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? }; @@ -1151,6 +1168,7 @@ fn handle_close_session( if entry.ta_flags.is_single_instance() { session_manager().remove_single_instance(&entry.ta_uuid); } + instance.task_page_table_id = None; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 0ab69f544..974a7c58e 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -116,7 +116,11 @@ pub struct TaInstance { /// after initialization because it contains internal state that may not survive moves. pub loaded_program: alloc::boxed::Box, /// The task page table ID associated with this TA instance. - pub task_page_table_id: usize, + /// + /// `None` after the page table has been torn down. 
Any lock holder that + /// observes `None` must treat the instance as destroyed and bail out + /// without touching the page table. + pub task_page_table_id: Option, } // SAFETY: TaInstance is protected by SpinMutex and try_lock (`SessionEntry`) From d93cc2e933619ae4f29ca46f9d1e4409b0267c5e Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Tue, 21 Apr 2026 03:29:27 +0000 Subject: [PATCH 2/6] add fall-through path for a concurrently closed single-instance TA --- litebox_runner_lvbs/src/lib.rs | 137 ++++++++++++++++++++---------- litebox_shim_optee/src/session.rs | 25 ++++++ 2 files changed, 119 insertions(+), 43 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 0eb56074c..91c325166 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -474,6 +474,8 @@ fn handle_open_session( msg_args: &mut OpteeMsgArgs, msg_args_phys_addr: u64, ) -> Result<(), OpteeSmcReturnCode> { + const MAX_OPEN_SESSION_RETRIES: u32 = 4; + let ta_req_info = decode_ta_request(msg_args).map_err(|_| OpteeSmcReturnCode::EBadCmd)?; if ta_req_info.entry_func != UteeEntryFunc::OpenSession { return Err(OpteeSmcReturnCode::EBadCmd); @@ -493,47 +495,80 @@ fn handle_open_session( if is_single_instance { // Fast path: Reuse a cached single-instance TA if one exists. if let Some(existing) = session_manager().get_single_instance(&ta_uuid) { - return open_session_single_instance( + match open_session_single_instance( msg_args, msg_args_phys_addr, - existing, + existing.clone(), params, ta_uuid, &ta_req_info, - ); + )? { + OpenSessionOutcome::Done => return Ok(()), + // Cached instance was torn down in a race with a concurrent + // close/panic. Drop the stale entry from the cache only if + // still the same Arc and fall through. + OpenSessionOutcome::InstanceDestroyed => { + session_manager().remove_single_instance_if_same(&ta_uuid, &existing); + } + } } } - // Create a new TA instance. 
For single-instance TAs, this also re-checks the cache - // under the lock and prevents concurrent instance creation of the same UUID. - // For multi-instance TAs, only the global capacity limit is enforced. - match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { - open_session_new_instance( - msg_args, - msg_args_phys_addr, - params, - ta_uuid, - client_identity, - &ta_req_info, - ) - })? { - CreationReservation::ExistingSingleInstance(existing) => open_session_single_instance( - msg_args, - msg_args_phys_addr, - existing, - params, - ta_uuid, - &ta_req_info, - ), - CreationReservation::SlotReserved => Ok(()), + // Create a new TA instance, or reuse a freshly-cached one. For + // single-instance TAs, `with_creation_slot` re-checks the cache under + // its lock and serializes instance creation per UUID. + // + // If we observe a destroyed instance via `ExistingSingleInstance`, we + // clear the stale cache entry and loop. In normal operation the + // destroyer removes the cache entry before setting + // `task_page_table_id = None`, so this loop terminates in at + // most one extra iteration. The bounded retry cap guards + // against pathological cases where other cores repeatedly install and + // destroy fresh instances for the same UUID. After the cap we fall + // back to `EThreadLimit` so the driver retries the whole call. + for _ in 0..MAX_OPEN_SESSION_RETRIES { + match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { + open_session_new_instance( + msg_args, + msg_args_phys_addr, + params, + ta_uuid, + client_identity, + &ta_req_info, + ) + })? { + CreationReservation::ExistingSingleInstance(existing) => { + match open_session_single_instance( + msg_args, + msg_args_phys_addr, + existing.clone(), + params, + ta_uuid, + &ta_req_info, + )? 
{ + OpenSessionOutcome::Done => return Ok(()), + OpenSessionOutcome::InstanceDestroyed => { + session_manager().remove_single_instance_if_same(&ta_uuid, &existing); + } + } + } + CreationReservation::SlotReserved => return Ok(()), + } } + + Err(OpteeSmcReturnCode::EThreadLimit) +} + +/// Outcome of [`open_session_single_instance`]. +enum OpenSessionOutcome { + /// Session was successfully opened (or a TA-level error was written back). + Done, + /// The cached `TaInstance` was torn down concurrently. + InstanceDestroyed, } /// Open a new session on an existing single-instance TA. /// -/// Returns `Err(OpteeSmcReturnCode::EThreadLimit)` if the TA instance is currently in use. -/// The Linux driver will wait and retry automatically. -/// /// If the TA's OpenSession entry point returns an error, the session is not registered. /// For cleanup semantics, see OP-TEE OS `tee_ta_open_session()` in `tee_ta_manager.c`. #[allow(clippy::type_complexity)] @@ -544,19 +579,19 @@ fn open_session_single_instance( params: &[litebox_common_optee::UteeParamOwned], ta_uuid: litebox_common_optee::TeeUuid, ta_req_info: &litebox_shim_optee::msg_handler::TaRequestInfo, -) -> Result<(), OpteeSmcReturnCode> { +) -> Result { // Use try_lock to avoid spinning - return EThreadLimit if TA is in use // The Linux driver will handle this by waiting and retrying let mut instance = instance_arc .try_lock() .ok_or(OpteeSmcReturnCode::EThreadLimit)?; - // `task_page_table_id == None` means the instance was destroyed by a concurrent - // close/panic between our cache lookup and acquiring the instance lock. - // Return `EThreadLimit` so the Linux driver retries. - let task_pt_id = instance - .task_page_table_id - .ok_or(OpteeSmcReturnCode::EThreadLimit)?; + // `task_page_table_id == None` means the instance was destroyed by a + // concurrent close/panic between our cache lookup and acquiring the + // instance lock. 
+ let Some(task_pt_id) = instance.task_page_table_id else { + return Ok(OpenSessionOutcome::InstanceDestroyed); + }; // Allocate session ID BEFORE calling load_ta_context so TA gets correct ID. // Use SessionIdGuard to ensure the ID is recycled on any error path @@ -662,7 +697,7 @@ fn open_session_single_instance( // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just // cleaning it up. Currently we always clean up on panic. - return Ok(()); + return Ok(OpenSessionOutcome::Done); } } @@ -678,7 +713,7 @@ fn open_session_single_instance( Some(ta_req_info), )?; - return Ok(()); + return Ok(OpenSessionOutcome::Done); } drop(instance); @@ -702,7 +737,7 @@ fn open_session_single_instance( runner_session_id ); - Ok(()) + Ok(OpenSessionOutcome::Done) } /// Create a new TA instance for a session. @@ -1096,11 +1131,27 @@ fn handle_close_session( let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - // `task_page_table_id == None` means the instance was destroyed. - // Return `EBadCmd` because the client must start over with OpenSession. - let task_pt_id = instance - .task_page_table_id - .ok_or(OpteeSmcReturnCode::EBadCmd)?; + // `task_page_table_id == None` means the TA instance was already torn + // down (e.g., a prior InvokeCommand panicked with TARGET_DEAD and cleaned + // it up). From the client's perspective the session no longer exists, + // so CloseSession is trivially successful. + let Some(task_pt_id) = instance.task_page_table_id else { + drop(instance); + session_manager().unregister_session(session_id); + write_msg_args_to_normal_world( + msg_args, + msg_args_phys_addr, + TeeResult::Success, + None, + None, + None, + )?; + debug_serial_println!( + "CloseSession complete: session_id={}, TA instance already destroyed", + session_id + ); + return Ok(()); + }; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? 
}; diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 974a7c58e..04bec7616 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -246,6 +246,19 @@ impl SingleInstanceCache { self.inner.lock().remove(uuid) } + /// Remove a cached single-instance TA by UUID, but only if the currently + /// cached entry is the same `Arc` as `expected`. + pub fn remove_if_same(&self, uuid: &TeeUuid, expected: &Arc>) -> bool { + let mut guard = self.inner.lock(); + match guard.get(uuid) { + Some(current) if Arc::ptr_eq(current, expected) => { + guard.remove(uuid); + true + } + _ => false, + } + } + /// Get the number of cached single-instance TAs. pub fn len(&self) -> usize { self.inner.lock().len() @@ -440,6 +453,18 @@ impl SessionManager { self.single_instance_cache.remove(uuid) } + /// Remove a single-instance TA from the cache only if the currently + /// cached `Arc` is the same as `expected`. + /// + /// See [`SingleInstanceCache::remove_if_same`]. + pub fn remove_single_instance_if_same( + &self, + uuid: &TeeUuid, + expected: &Arc>, + ) -> bool { + self.single_instance_cache.remove_if_same(uuid, expected) + } + /// Get the total count of unique TA instances (for limit checking). /// /// This counts: From 2398161577c753f9ed50c22f8c284d1ef13d306f Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Tue, 21 Apr 2026 03:58:12 +0000 Subject: [PATCH 3/6] tear down target_dead --- litebox_runner_lvbs/src/lib.rs | 79 ++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 91c325166..2e52a6ee4 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -984,11 +984,14 @@ fn handle_invoke_command( let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - // `task_page_table_id == None` means the instance was destroyed. 
- // Return `EBadCmd` because the client must start over with OpenSession. - let task_pt_id = instance - .task_page_table_id - .ok_or(OpteeSmcReturnCode::EBadCmd)?; + // `task_page_table_id == None` means the TA instance was already torn + // down (e.g., a prior InvokeCommand panicked with TARGET_DEAD). The + // session is orphaned. + let Some(task_pt_id) = instance.task_page_table_id else { + drop(instance); + session_manager().unregister_session(session_id); + return Err(OpteeSmcReturnCode::EBadCmd); + }; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? }; @@ -1032,8 +1035,17 @@ fn handle_invoke_command( let return_code: u32 = ctx.rax.truncate(); let return_code = TeeResult::try_from(return_code).unwrap_or(TeeResult::GenericError); - // Per OP-TEE OS: if TA panics (TARGET_DEAD), clean up the session/instance - // Reference: tee_ta_invoke_command() in tee_ta_manager.c + // Per OP-TEE OS: if TA panics (TARGET_DEAD), the TA context is + // unrecoverable; all sessions on the same single-instance TA are + // implicitly dead (Ref: tee_ta_invoke_command() in tee_ta_manager.c). + // Tear down the instance so that any subsequent op on remaining + // sessions observes `task_page_table_id == None` and fails cleanly. + // Remaining sessions stay in the session map until their clients + // call CloseSession; at that point `handle_close_session`'s None + // path will unregister them and report Success. Note that we don't + // unregister/recycle those session IDs immediately to avoid potential + // use-after-free issues (clients don't know whether those session IDs + // are invalid). 
if return_code == TeeResult::TargetDead { debug_serial_println!( "InvokeCommand: TA panicked (TARGET_DEAD), session_id={}", @@ -1042,18 +1054,10 @@ fn handle_invoke_command( let ta_uuid = session_entry.ta_uuid; let ta_flags = session_entry.ta_flags; - let instance_arc = session_entry.instance.clone(); - // Remove the session from the map + // Remove this session from the map session_manager().unregister_session(session_id); - // Check if this was the last session using the TA instance by counting - // remaining sessions that reference this instance. - let remaining_sessions = session_manager() - .sessions() - .count_sessions_for_instance(&instance_arc); - let is_last_session = remaining_sessions == 0; - // Write response BEFORE switching page tables (accesses user memory) write_msg_args_to_normal_world( msg_args, @@ -1064,31 +1068,32 @@ fn handle_invoke_command( Some(&ta_req_info), )?; - if is_last_session { - // Clear single-instance cache if applicable - if ta_flags.is_single_instance() { - session_manager().remove_single_instance(&ta_uuid); - } - instance.task_page_table_id = None; + // Clear single-instance cache so new OpenSessions for this UUID + // create a fresh instance instead of hitting the zombie one. + if ta_flags.is_single_instance() { + session_manager().remove_single_instance(&ta_uuid); + } - // Safety: We are about to tear down this TA instance; - // no references to user-space memory will be held afterwards. - // The lock is held, so no other core can enter the TA. - unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; + // Invalidate the id in the struct *before* teardown so any other + // lock holder (on another session sharing this Arc) observes `None` + // and bails. + instance.task_page_table_id = None; - drop(instance); + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + // The lock is held, so no other core can enter the TA. 
+ unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; - debug_serial_println!( - "InvokeCommand: cleaned up dead TA instance, task_pt_id={}", - task_pt_id - ); + drop(instance); - // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not - // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just - // cleaning it up. Currently we always clean up on panic. - } else { - drop(instance); - } + debug_serial_println!( + "InvokeCommand: cleaned up dead TA instance, task_pt_id={}", + task_pt_id + ); + + // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not + // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just + // cleaning it up. Currently we always clean up on panic. return Ok(()); } From a2714b05f014efe0486321aa1490d0f4dd1ad946 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Tue, 21 Apr 2026 04:11:55 +0000 Subject: [PATCH 4/6] let the normal world know target_dead --- litebox_runner_lvbs/src/lib.rs | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 2e52a6ee4..60ebc497e 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -518,14 +518,10 @@ fn handle_open_session( // single-instance TAs, `with_creation_slot` re-checks the cache under // its lock and serializes instance creation per UUID. // - // If we observe a destroyed instance via `ExistingSingleInstance`, we - // clear the stale cache entry and loop. In normal operation the - // destroyer removes the cache entry before setting - // `task_page_table_id = None`, so this loop terminates in at - // most one extra iteration. The bounded retry cap guards - // against pathological cases where other cores repeatedly install and - // destroy fresh instances for the same UUID. After the cap we fall - // back to `EThreadLimit` so the driver retries the whole call. 
+ // The bounded retry cap (2 iterations suffice in steady state; 4 gives + // slack for back-to-back churn where other cores repeatedly install and + // destroy fresh instances for the same UUID) ultimately falls back to + // `EThreadLimit` so the driver retries the whole call. for _ in 0..MAX_OPEN_SESSION_RETRIES { match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { open_session_new_instance( @@ -548,6 +544,7 @@ fn handle_open_session( )? { OpenSessionOutcome::Done => return Ok(()), OpenSessionOutcome::InstanceDestroyed => { + // defense-in-depth against future reordering of destroyer steps session_manager().remove_single_instance_if_same(&ta_uuid, &existing); } } @@ -569,6 +566,9 @@ enum OpenSessionOutcome { /// Open a new session on an existing single-instance TA. /// +/// Returns `Err(OpteeSmcReturnCode::EThreadLimit)` if the TA instance is currently in use. +/// The Linux driver will wait and retry automatically. +/// /// If the TA's OpenSession entry point returns an error, the session is not registered. /// For cleanup semantics, see OP-TEE OS `tee_ta_open_session()` in `tee_ta_manager.c`. #[allow(clippy::type_complexity)] @@ -986,11 +986,23 @@ fn handle_invoke_command( }; // `task_page_table_id == None` means the TA instance was already torn // down (e.g., a prior InvokeCommand panicked with TARGET_DEAD). The - // session is orphaned. + // session is orphaned. Report TARGET_DEAD to the client. 
let Some(task_pt_id) = instance.task_page_table_id else { drop(instance); session_manager().unregister_session(session_id); - return Err(OpteeSmcReturnCode::EBadCmd); + write_msg_args_to_normal_world( + msg_args, + msg_args_phys_addr, + TeeResult::TargetDead, + None, + None, + None, + )?; + debug_serial_println!( + "InvokeCommand: session_id={} on already-destroyed TA instance", + session_id + ); + return Ok(()); }; // Switch to the TA instance's page table @@ -1233,7 +1245,6 @@ fn handle_close_session( // Drop the instance to release shim/loaded_program resources drop(instance); - drop(entry); debug_serial_println!( "CloseSession complete: deleted task_pt_id={} (last session)", From 003e7b0a9be7897ed31ed38d589009e86c135da6 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Tue, 21 Apr 2026 20:12:14 +0000 Subject: [PATCH 5/6] drop racy fast path which has less benefit --- litebox_runner_lvbs/src/lib.rs | 136 ++++++++++++------------------ litebox_shim_optee/src/session.rs | 24 +++--- 2 files changed, 64 insertions(+), 96 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 60ebc497e..23efd6ebe 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -474,8 +474,6 @@ fn handle_open_session( msg_args: &mut OpteeMsgArgs, msg_args_phys_addr: u64, ) -> Result<(), OpteeSmcReturnCode> { - const MAX_OPEN_SESSION_RETRIES: u32 = 4; - let ta_req_info = decode_ta_request(msg_args).map_err(|_| OpteeSmcReturnCode::EBadCmd)?; if ta_req_info.entry_func != UteeEntryFunc::OpenSession { return Err(OpteeSmcReturnCode::EBadCmd); @@ -492,9 +490,23 @@ fn handle_open_session( .get_known_flags(&ta_uuid) .is_none_or(|f| f.is_single_instance()); - if is_single_instance { - // Fast path: Reuse a cached single-instance TA if one exists. - if let Some(existing) = session_manager().get_single_instance(&ta_uuid) { + // Resolve or create the TA instance. 
+ // For single-instance TAs, `with_creation_slot` re-checks the cache + // under its lock and serializes instance creation per UUID. + // If a cache hit returns a zombie (an instance torn down by a + // concurrent close/panic), we mirror OP-TEE's `tee_ta_open_session` + // behavior: evict the dead entry and report `TargetDead` to the caller. + match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { + open_session_new_instance( + msg_args, + msg_args_phys_addr, + params, + ta_uuid, + client_identity, + &ta_req_info, + ) + })? { + CreationReservation::ExistingSingleInstance(existing) => { match open_session_single_instance( msg_args, msg_args_phys_addr, @@ -503,57 +515,25 @@ fn handle_open_session( ta_uuid, &ta_req_info, )? { - OpenSessionOutcome::Done => return Ok(()), - // Cached instance was torn down in a race with a concurrent - // close/panic. Drop the stale entry from the cache only if - // still the same Arc and fall through. + OpenSessionOutcome::Done => Ok(()), OpenSessionOutcome::InstanceDestroyed => { + // Evict the zombie. Analog of OP-TEE's `maybe_release_ta_ctx` + // removing the dead ctx from `tee_ctxes`. session_manager().remove_single_instance_if_same(&ta_uuid, &existing); + write_msg_args_to_normal_world( + msg_args, + msg_args_phys_addr, + TeeResult::TargetDead, + None, + None, + Some(&ta_req_info), + )?; + Ok(()) } } } + CreationReservation::SlotReserved => Ok(()), } - - // Create a new TA instance, or reuse a freshly-cached one. For - // single-instance TAs, `with_creation_slot` re-checks the cache under - // its lock and serializes instance creation per UUID. - // - // The bounded retry cap (2 iterations suffice in steady state; 4 gives - // slack for back-to-back churn where other cores repeatedly install and - // destroy fresh instances for the same UUID) ultimately falls back to - // `EThreadLimit` so the driver retries the whole call. 
- for _ in 0..MAX_OPEN_SESSION_RETRIES { - match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { - open_session_new_instance( - msg_args, - msg_args_phys_addr, - params, - ta_uuid, - client_identity, - &ta_req_info, - ) - })? { - CreationReservation::ExistingSingleInstance(existing) => { - match open_session_single_instance( - msg_args, - msg_args_phys_addr, - existing.clone(), - params, - ta_uuid, - &ta_req_info, - )? { - OpenSessionOutcome::Done => return Ok(()), - OpenSessionOutcome::InstanceDestroyed => { - // defense-in-depth against future reordering of destroyer steps - session_manager().remove_single_instance_if_same(&ta_uuid, &existing); - } - } - } - CreationReservation::SlotReserved => return Ok(()), - } - } - - Err(OpteeSmcReturnCode::EThreadLimit) } /// Outcome of [`open_session_single_instance`]. @@ -586,12 +566,11 @@ fn open_session_single_instance( .try_lock() .ok_or(OpteeSmcReturnCode::EThreadLimit)?; - // `task_page_table_id == None` means the instance was destroyed by a - // concurrent close/panic between our cache lookup and acquiring the - // instance lock. - let Some(task_pt_id) = instance.task_page_table_id else { + // `dead == true` means the instance was destroyed by a concurrent close/panic + if instance.dead { return Ok(OpenSessionOutcome::InstanceDestroyed); - }; + } + let task_pt_id = instance.task_page_table_id; // Allocate session ID BEFORE calling load_ta_context so TA gets correct ID. // Use SessionIdGuard to ensure the ID is recycled on any error path @@ -683,9 +662,9 @@ fn open_session_single_instance( )?; session_manager().remove_single_instance(&ta_uuid); - // Invalidate the id in the struct *before* teardown so any - // future lock holder observes `None` and bails. - instance.task_page_table_id = None; + // Mark the instance dead *before* teardown so any future + // lock holder bails without touching the invalid page table. 
+ instance.dead = true; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. @@ -924,7 +903,8 @@ fn open_session_new_instance( let instance = Arc::new(SpinMutex::new(TaInstance { shim, loaded_program, - task_page_table_id: Some(task_pt_id), + task_page_table_id: task_pt_id, + dead: false, })); // Cache single-instance TAs for future sessions @@ -984,10 +964,9 @@ fn handle_invoke_command( let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - // `task_page_table_id == None` means the TA instance was already torn - // down (e.g., a prior InvokeCommand panicked with TARGET_DEAD). The - // session is orphaned. Report TARGET_DEAD to the client. - let Some(task_pt_id) = instance.task_page_table_id else { + // `dead == true` means the TA instance was already torn down. + // The session is orphaned. Report TARGET_DEAD to the client. + if instance.dead { drop(instance); session_manager().unregister_session(session_id); write_msg_args_to_normal_world( @@ -1003,7 +982,8 @@ fn handle_invoke_command( session_id ); return Ok(()); - }; + } + let task_pt_id = instance.task_page_table_id; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? }; @@ -1050,14 +1030,6 @@ fn handle_invoke_command( // Per OP-TEE OS: if TA panics (TARGET_DEAD), the TA context is // unrecoverable; all sessions on the same single-instance TA are // implicitly dead (Ref: tee_ta_invoke_command() in tee_ta_manager.c). - // Tear down the instance so that any subsequent op on remaining - // sessions observes `task_page_table_id == None` and fails cleanly. - // Remaining sessions stay in the session map until their clients - // call CloseSession; at that point `handle_close_session`'s None - // path will unregister them and report Success. 
Note that we don't - // unregister/recycle those session IDs immediately to avoid potential - // use-after-free issues (clients don't know whether those session IDs - // are invalid). if return_code == TeeResult::TargetDead { debug_serial_println!( "InvokeCommand: TA panicked (TARGET_DEAD), session_id={}", @@ -1086,10 +1058,8 @@ fn handle_invoke_command( session_manager().remove_single_instance(&ta_uuid); } - // Invalidate the id in the struct *before* teardown so any other - // lock holder (on another session sharing this Arc) observes `None` - // and bails. - instance.task_page_table_id = None; + // Mark the instance dead *before* teardown. + instance.dead = true; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. @@ -1148,11 +1118,10 @@ fn handle_close_session( let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - // `task_page_table_id == None` means the TA instance was already torn - // down (e.g., a prior InvokeCommand panicked with TARGET_DEAD and cleaned - // it up). From the client's perspective the session no longer exists, - // so CloseSession is trivially successful. - let Some(task_pt_id) = instance.task_page_table_id else { + // `dead == true` means the TA instance was already torn down. + // From the client's perspective the session no longer exists, so + // CloseSession is trivially successful. + if instance.dead { drop(instance); session_manager().unregister_session(session_id); write_msg_args_to_normal_world( @@ -1168,7 +1137,8 @@ fn handle_close_session( session_id ); return Ok(()); - }; + } + let task_pt_id = instance.task_page_table_id; // Switch to the TA instance's page table unsafe { switch_to_task_page_table(task_pt_id)? 
}; @@ -1236,7 +1206,7 @@ fn handle_close_session( if entry.ta_flags.is_single_instance() { session_manager().remove_single_instance(&entry.ta_uuid); } - instance.task_page_table_id = None; + instance.dead = true; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 04bec7616..74e8a1e0c 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -115,12 +115,14 @@ pub struct TaInstance { /// Boxed to keep it at a fixed heap address - the Task inside must not be moved /// after initialization because it contains internal state that may not survive moves. pub loaded_program: alloc::boxed::Box, - /// The task page table ID associated with this TA instance. - /// - /// `None` after the page table has been torn down. Any lock holder that - /// observes `None` must treat the instance as destroyed and bail out - /// without touching the page table. - pub task_page_table_id: Option, + /// The task page table ID associated with this TA instance. Valid only + /// while `dead == false`. + pub task_page_table_id: usize, + /// Set when the TA panics or its last session closes. Any lock holder + /// should first check whether `dead == true` and if it is, bail without + /// touching `task_page_table_id`. `shim` and `loaded_program` remain + /// valid until the last `Arc` is dropped. + pub dead: bool, } // SAFETY: TaInstance is protected by SpinMutex and try_lock (`SessionEntry`) @@ -398,11 +400,6 @@ impl SessionManager { &self.single_instance_cache } - /// Get a cached single-instance TA by UUID. - pub fn get_single_instance(&self, uuid: &TeeUuid) -> Option>> { - self.single_instance_cache.get(uuid) - } - /// Cache a single-instance TA. 
pub fn cache_single_instance(&self, uuid: TeeUuid, instance: Arc<SpinMutex<TaInstance>>) { self.single_instance_cache.insert(uuid, instance); @@ -515,8 +512,9 @@ impl SessionManager { let mut state = self.creation_state.lock(); if is_single_instance { - // Re-check single-instance cache under the creation lock to close - // the TOCTOU window between the caller's get_single_instance() and here. + // Check the single-instance cache under the creation lock. A + // hit means another core finished creating the instance for + // this UUID; reuse it instead of starting a new load. if let Some(existing) = self.single_instance_cache.get(uuid) { return Ok(CreationReservation::ExistingSingleInstance(existing)); } From e755d5ea42364559e03f57742cec2a9927d9abfc Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Tue, 21 Apr 2026 20:47:47 +0000 Subject: [PATCH 6/6] fix writing message back to normal world --- litebox_runner_lvbs/src/lib.rs | 61 +++++++++++++------------------ litebox_shim_optee/src/session.rs | 6 +-- 2 files changed, 26 insertions(+), 41 deletions(-) diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 23efd6ebe..a23ce4aa5 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -520,14 +520,9 @@ fn handle_open_session( // Evict the zombie. Analog of OP-TEE's `maybe_release_ta_ctx` // removing the dead ctx from `tee_ctxes`. 
session_manager().remove_single_instance_if_same(&ta_uuid, &existing); - write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - TeeResult::TargetDead, - None, - None, - Some(&ta_req_info), - )?; + msg_args.ret = TeeResult::TargetDead; + msg_args.ret_origin = TeeOrigin::Tee; + write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?; Ok(()) } } @@ -652,14 +647,14 @@ fn open_session_single_instance( ); // Write error response BEFORE switching page tables (accesses user memory) - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, return_code, None, // No session ID on failure Some(&ta_params), Some(ta_req_info), - )?; + ); session_manager().remove_single_instance(&ta_uuid); // Mark the instance dead *before* teardown so any future @@ -676,6 +671,7 @@ fn open_session_single_instance( // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just // cleaning it up. Currently we always clean up on panic. + write_result?; return Ok(OpenSessionOutcome::Done); } } @@ -805,19 +801,20 @@ fn open_session_new_instance( ); // Write error response back to normal world - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, ldelf_return_code, None, // No session ID on failure None, Some(ta_req_info), - )?; + ); // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. 
unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + write_result?; return Ok(()); } @@ -883,19 +880,20 @@ fn open_session_new_instance( ); // Write error response back to normal world - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, return_code, None, // No session ID on failure Some(&ta_params), Some(ta_req_info), - )?; + ); // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + write_result?; return Ok(()); } @@ -969,14 +967,9 @@ fn handle_invoke_command( if instance.dead { drop(instance); session_manager().unregister_session(session_id); - write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - TeeResult::TargetDead, - None, - None, - None, - )?; + msg_args.ret = TeeResult::TargetDead; + msg_args.ret_origin = TeeOrigin::Tee; + write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?; debug_serial_println!( "InvokeCommand: session_id={} on already-destroyed TA instance", session_id @@ -1043,14 +1036,14 @@ fn handle_invoke_command( session_manager().unregister_session(session_id); // Write response BEFORE switching page tables (accesses user memory) - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, return_code, None, Some(&ta_params), Some(&ta_req_info), - )?; + ); // Clear single-instance cache so new OpenSessions for this UUID // create a fresh instance instead of hitting the zombie one. @@ -1077,6 +1070,7 @@ fn handle_invoke_command( // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just // cleaning it up. Currently we always clean up on panic. 
+ write_result?; return Ok(()); } @@ -1124,14 +1118,9 @@ fn handle_close_session( if instance.dead { drop(instance); session_manager().unregister_session(session_id); - write_msg_args_to_normal_world( - msg_args, - msg_args_phys_addr, - TeeResult::Success, - None, - None, - None, - )?; + msg_args.ret = TeeResult::Success; + msg_args.ret_origin = TeeOrigin::TrustedApp; + write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?; debug_serial_println!( "CloseSession complete: session_id={}, TA instance already destroyed", session_id @@ -1167,14 +1156,14 @@ fn handle_close_session( } // CloseSession always succeeds (TA_CloseSessionEntryPoint returns void) - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, TeeResult::Success, None, None, None, - )?; + ); // Clone the instance Arc before dropping the lock for later cleanup check let instance_arc = session_entry.instance.clone(); @@ -1199,7 +1188,7 @@ fn handle_close_session( "CloseSession complete: session_id={}, TA kept alive (INSTANCE_KEEP_ALIVE flag)", session_id ); - return Ok(()); + return write_result; } // Clear single-instance cache if this was a single-instance TA @@ -1229,7 +1218,7 @@ fn handle_close_session( ); } - Ok(()) + write_result } /// Update msg_args with return values and write back to normal world memory. diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 74e8a1e0c..d1efd4985 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -248,9 +248,7 @@ impl SingleInstanceCache { self.inner.lock().remove(uuid) } - /// Remove a cached single-instance TA by UUID, but only if the currently - /// cached entry is the same `Arc` as `expected`. 
- pub fn remove_if_same(&self, uuid: &TeeUuid, expected: &Arc<SpinMutex<TaInstance>>) -> bool { + fn remove_if_same(&self, uuid: &TeeUuid, expected: &Arc<SpinMutex<TaInstance>>) -> bool { let mut guard = self.inner.lock(); match guard.get(uuid) { Some(current) if Arc::ptr_eq(current, expected) => { @@ -452,8 +450,6 @@ impl SessionManager { /// Remove a single-instance TA from the cache only if the currently /// cached `Arc` is the same as `expected`. - /// - /// See [`SingleInstanceCache::remove_if_same`]. pub fn remove_single_instance_if_same( &self, uuid: &TeeUuid,