diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index ee598e4be..a23ce4aa5 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -490,23 +490,12 @@ fn handle_open_session( .get_known_flags(&ta_uuid) .is_none_or(|f| f.is_single_instance()); - if is_single_instance { - // Fast path: Reuse a cached single-instance TA if one exists. - if let Some(existing) = session_manager().get_single_instance(&ta_uuid) { - return open_session_single_instance( - msg_args, - msg_args_phys_addr, - existing, - params, - ta_uuid, - &ta_req_info, - ); - } - } - - // Create a new TA instance. For single-instance TAs, this also re-checks the cache - // under the lock and prevents concurrent instance creation of the same UUID. - // For multi-instance TAs, only the global capacity limit is enforced. + // Resolve or create the TA instance. + // For single-instance TAs, `with_creation_slot` re-checks the cache + // under its lock and serializes instance creation per UUID. + // If a cache hit returns a zombie (an instance torn down by a + // concurrent close/panic), we mirror OP-TEE's `tee_ta_open_session` + // behavior: evict the dead entry and report `TargetDead` to the caller. match session_manager().with_creation_slot(&ta_uuid, is_single_instance, || { open_session_new_instance( msg_args, @@ -517,18 +506,39 @@ fn handle_open_session( &ta_req_info, ) })? { - CreationReservation::ExistingSingleInstance(existing) => open_session_single_instance( - msg_args, - msg_args_phys_addr, - existing, - params, - ta_uuid, - &ta_req_info, - ), + CreationReservation::ExistingSingleInstance(existing) => { + match open_session_single_instance( + msg_args, + msg_args_phys_addr, + existing.clone(), + params, + ta_uuid, + &ta_req_info, + )? { + OpenSessionOutcome::Done => Ok(()), + OpenSessionOutcome::InstanceDestroyed => { + // Evict the zombie. Analog of OP-TEE's `maybe_release_ta_ctx` + // removing the dead ctx from `tee_ctxes`. 
+                    session_manager().remove_single_instance_if_same(&ta_uuid, &existing);
+                    msg_args.ret = TeeResult::TargetDead;
+                    msg_args.ret_origin = TeeOrigin::Tee;
+                    write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?;
+                    Ok(())
+                }
+            }
+        }
         CreationReservation::SlotReserved => Ok(()),
     }
 }
 
+/// Outcome of [`open_session_single_instance`].
+enum OpenSessionOutcome {
+    /// Session was successfully opened (or a TA-level error was written back).
+    Done,
+    /// The cached `TaInstance` was torn down concurrently.
+    InstanceDestroyed,
+}
+
 /// Open a new session on an existing single-instance TA.
 ///
 /// Returns `Err(OpteeSmcReturnCode::EThreadLimit)` if the TA instance is currently in use.
@@ -544,13 +554,19 @@ fn open_session_single_instance(
     params: &[litebox_common_optee::UteeParamOwned],
     ta_uuid: litebox_common_optee::TeeUuid,
     ta_req_info: &litebox_shim_optee::msg_handler::TaRequestInfo,
-) -> Result<(), OpteeSmcReturnCode> {
+) -> Result<OpenSessionOutcome, OpteeSmcReturnCode> {
     // Use try_lock to avoid spinning - return EThreadLimit if TA is in use
     // The Linux driver will handle this by waiting and retrying
-    let instance = instance_arc
+    let mut instance = instance_arc
         .try_lock()
         .ok_or(OpteeSmcReturnCode::EThreadLimit)?;
 
+    // `dead == true` means the instance was destroyed by a concurrent close/panic
+    if instance.dead {
+        return Ok(OpenSessionOutcome::InstanceDestroyed);
+    }
+
+    let task_pt_id = instance.task_page_table_id;
 
     // Allocate session ID BEFORE calling load_ta_context so TA gets correct ID.
     // Use SessionIdGuard to ensure the ID is recycled on any error path
     // (before it is registered with the session manager).
@@ -562,11 +578,10 @@ fn open_session_single_instance( debug_serial_println!( "Reusing single-instance TA: uuid={:?}, task_pt_id={}, session_id={}", ta_uuid, - instance.task_page_table_id, + task_pt_id, runner_session_id ); - let task_pt_id = instance.task_page_table_id; let ta_flags = instance.loaded_program.ta_flags; // Switch to the existing TA's page table @@ -632,16 +647,19 @@ fn open_session_single_instance( ); // Write error response BEFORE switching page tables (accesses user memory) - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, return_code, None, // No session ID on failure Some(&ta_params), Some(ta_req_info), - )?; + ); session_manager().remove_single_instance(&ta_uuid); + // Mark the instance dead *before* teardown so any future + // lock holder bails without touching the invalid page table. + instance.dead = true; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. @@ -653,7 +671,8 @@ fn open_session_single_instance( // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just // cleaning it up. Currently we always clean up on panic. - return Ok(()); + write_result?; + return Ok(OpenSessionOutcome::Done); } } @@ -669,7 +688,7 @@ fn open_session_single_instance( Some(ta_req_info), )?; - return Ok(()); + return Ok(OpenSessionOutcome::Done); } drop(instance); @@ -693,7 +712,7 @@ fn open_session_single_instance( runner_session_id ); - Ok(()) + Ok(OpenSessionOutcome::Done) } /// Create a new TA instance for a session. @@ -780,20 +799,22 @@ fn open_session_new_instance( "ldelf/TA_CreateEntryPoint failed: return_code={:?}", ldelf_return_code ); - // Safety: We are about to tear down this TA instance; - // no references to user-space memory will be held afterwards. 
- unsafe { teardown_ta_page_table(&shim, task_pt_id) }; // Write error response back to normal world - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, ldelf_return_code, None, // No session ID on failure None, Some(ta_req_info), - )?; + ); + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + + write_result?; return Ok(()); } @@ -859,19 +880,20 @@ fn open_session_new_instance( ); // Write error response back to normal world - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, return_code, None, // No session ID on failure Some(&ta_params), Some(ta_req_info), - )?; + ); // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. unsafe { teardown_ta_page_table(&shim, task_pt_id) }; + write_result?; return Ok(()); } @@ -880,6 +902,7 @@ fn open_session_new_instance( shim, loaded_program, task_page_table_id: task_pt_id, + dead: false, })); // Cache single-instance TAs for future sessions @@ -936,10 +959,23 @@ fn handle_invoke_command( .ok_or(OpteeSmcReturnCode::EBadCmd)?; // Use try_lock to avoid spinning - return EThreadLimit if TA is in use // The Linux driver will handle this by waiting and retrying - let Some(instance) = session_entry.instance.try_lock() else { + let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - + // `dead == true` means the TA instance was already torn down. + // The session is orphaned. Report TARGET_DEAD to the client. 
+ if instance.dead { + drop(instance); + session_manager().unregister_session(session_id); + msg_args.ret = TeeResult::TargetDead; + msg_args.ret_origin = TeeOrigin::Tee; + write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?; + debug_serial_println!( + "InvokeCommand: session_id={} on already-destroyed TA instance", + session_id + ); + return Ok(()); + } let task_pt_id = instance.task_page_table_id; // Switch to the TA instance's page table @@ -984,8 +1020,9 @@ fn handle_invoke_command( let return_code: u32 = ctx.rax.truncate(); let return_code = TeeResult::try_from(return_code).unwrap_or(TeeResult::GenericError); - // Per OP-TEE OS: if TA panics (TARGET_DEAD), clean up the session/instance - // Reference: tee_ta_invoke_command() in tee_ta_manager.c + // Per OP-TEE OS: if TA panics (TARGET_DEAD), the TA context is + // unrecoverable; all sessions on the same single-instance TA are + // implicitly dead (Ref: tee_ta_invoke_command() in tee_ta_manager.c). if return_code == TeeResult::TargetDead { debug_serial_println!( "InvokeCommand: TA panicked (TARGET_DEAD), session_id={}", @@ -994,53 +1031,46 @@ fn handle_invoke_command( let ta_uuid = session_entry.ta_uuid; let ta_flags = session_entry.ta_flags; - let instance_arc = session_entry.instance.clone(); - // Remove the session from the map + // Remove this session from the map session_manager().unregister_session(session_id); - // Check if this was the last session using the TA instance by counting - // remaining sessions that reference this instance. 
- let remaining_sessions = session_manager() - .sessions() - .count_sessions_for_instance(&instance_arc); - let is_last_session = remaining_sessions == 0; - // Write response BEFORE switching page tables (accesses user memory) - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, return_code, None, Some(&ta_params), Some(&ta_req_info), - )?; + ); - if is_last_session { - // Clear single-instance cache if applicable - if ta_flags.is_single_instance() { - session_manager().remove_single_instance(&ta_uuid); - } + // Clear single-instance cache so new OpenSessions for this UUID + // create a fresh instance instead of hitting the zombie one. + if ta_flags.is_single_instance() { + session_manager().remove_single_instance(&ta_uuid); + } - // Safety: We are about to tear down this TA instance; - // no references to user-space memory will be held afterwards. - // The lock is held, so no other core can enter the TA. - unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; + // Mark the instance dead *before* teardown. + instance.dead = true; - drop(instance); + // Safety: We are about to tear down this TA instance; + // no references to user-space memory will be held afterwards. + // The lock is held, so no other core can enter the TA. + unsafe { teardown_ta_page_table(&instance.shim, task_pt_id) }; - debug_serial_println!( - "InvokeCommand: cleaned up dead TA instance, task_pt_id={}", - task_pt_id - ); + drop(instance); - // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not - // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just - // cleaning it up. Currently we always clean up on panic. 
- } else { - drop(instance); - } + debug_serial_println!( + "InvokeCommand: cleaned up dead TA instance, task_pt_id={}", + task_pt_id + ); + // TODO: Per OP-TEE OS semantics, if the TA has INSTANCE_KEEP_ALIVE but not + // INSTANCE_KEEP_CRASHED, we should respawn the TA here instead of just + // cleaning it up. Currently we always clean up on panic. + + write_result?; return Ok(()); } @@ -1079,10 +1109,24 @@ fn handle_close_session( .ok_or(OpteeSmcReturnCode::EBadCmd)?; // Use try_lock to avoid spinning - return EThreadLimit if TA is in use // The Linux driver will handle this by waiting and retrying - let Some(instance) = session_entry.instance.try_lock() else { + let Some(mut instance) = session_entry.instance.try_lock() else { return Err(OpteeSmcReturnCode::EThreadLimit); }; - + // `dead == true` means the TA instance was already torn down. + // From the client's perspective the session no longer exists, so + // CloseSession is trivially successful. + if instance.dead { + drop(instance); + session_manager().unregister_session(session_id); + msg_args.ret = TeeResult::Success; + msg_args.ret_origin = TeeOrigin::TrustedApp; + write_non_ta_msg_args_to_normal_world(msg_args, msg_args_phys_addr)?; + debug_serial_println!( + "CloseSession complete: session_id={}, TA instance already destroyed", + session_id + ); + return Ok(()); + } let task_pt_id = instance.task_page_table_id; // Switch to the TA instance's page table @@ -1112,14 +1156,14 @@ fn handle_close_session( } // CloseSession always succeeds (TA_CloseSessionEntryPoint returns void) - write_msg_args_to_normal_world( + let write_result = write_msg_args_to_normal_world( msg_args, msg_args_phys_addr, TeeResult::Success, None, None, None, - )?; + ); // Clone the instance Arc before dropping the lock for later cleanup check let instance_arc = session_entry.instance.clone(); @@ -1144,13 +1188,14 @@ fn handle_close_session( "CloseSession complete: session_id={}, TA kept alive (INSTANCE_KEEP_ALIVE flag)", session_id ); 
- return Ok(()); + return write_result; } // Clear single-instance cache if this was a single-instance TA if entry.ta_flags.is_single_instance() { session_manager().remove_single_instance(&entry.ta_uuid); } + instance.dead = true; // Safety: We are about to tear down this TA instance; // no references to user-space memory will be held afterwards. @@ -1159,7 +1204,6 @@ fn handle_close_session( // Drop the instance to release shim/loaded_program resources drop(instance); - drop(entry); debug_serial_println!( "CloseSession complete: deleted task_pt_id={} (last session)", @@ -1174,7 +1218,7 @@ fn handle_close_session( ); } - Ok(()) + write_result } /// Update msg_args with return values and write back to normal world memory. diff --git a/litebox_shim_optee/src/session.rs b/litebox_shim_optee/src/session.rs index 0ab69f544..d1efd4985 100644 --- a/litebox_shim_optee/src/session.rs +++ b/litebox_shim_optee/src/session.rs @@ -115,8 +115,14 @@ pub struct TaInstance { /// Boxed to keep it at a fixed heap address - the Task inside must not be moved /// after initialization because it contains internal state that may not survive moves. pub loaded_program: alloc::boxed::Box, - /// The task page table ID associated with this TA instance. + /// The task page table ID associated with this TA instance. Valid only + /// while `dead == false`. pub task_page_table_id: usize, + /// Set when the TA panics or its last session closes. Any lock holder + /// should first check whether `dead == true` and if it is, bail without + /// touching `task_page_table_id`. `shim` and `loaded_program` remain + /// valid until the last `Arc` is dropped. 
+    pub dead: bool,
 }
 
 // SAFETY: TaInstance is protected by SpinMutex and try_lock (`SessionEntry`)
@@ -242,6 +248,17 @@ impl SingleInstanceCache {
         self.inner.lock().remove(uuid)
     }
 
+    fn remove_if_same(&self, uuid: &TeeUuid, expected: &Arc<SpinMutex<TaInstance>>) -> bool {
+        let mut guard = self.inner.lock();
+        match guard.get(uuid) {
+            Some(current) if Arc::ptr_eq(current, expected) => {
+                guard.remove(uuid);
+                true
+            }
+            _ => false,
+        }
+    }
+
     /// Get the number of cached single-instance TAs.
     pub fn len(&self) -> usize {
         self.inner.lock().len()
@@ -381,11 +398,6 @@ impl SessionManager {
         &self.single_instance_cache
     }
 
-    /// Get a cached single-instance TA by UUID.
-    pub fn get_single_instance(&self, uuid: &TeeUuid) -> Option<Arc<SpinMutex<TaInstance>>> {
-        self.single_instance_cache.get(uuid)
-    }
-
     /// Cache a single-instance TA.
     pub fn cache_single_instance(&self, uuid: TeeUuid, instance: Arc<SpinMutex<TaInstance>>) {
         self.single_instance_cache.insert(uuid, instance);
     }
@@ -436,6 +448,16 @@ impl SessionManager {
         self.single_instance_cache.remove(uuid)
     }
 
+    /// Remove a single-instance TA from the cache only if the currently
+    /// cached `Arc` is the same as `expected`.
+    pub fn remove_single_instance_if_same(
+        &self,
+        uuid: &TeeUuid,
+        expected: &Arc<SpinMutex<TaInstance>>,
+    ) -> bool {
+        self.single_instance_cache.remove_if_same(uuid, expected)
+    }
+
     /// Get the total count of unique TA instances (for limit checking).
     ///
     /// This counts:
@@ -486,8 +508,9 @@ impl SessionManager {
         let mut state = self.creation_state.lock();
 
         if is_single_instance {
-            // Re-check single-instance cache under the creation lock to close
-            // the TOCTOU window between the caller's get_single_instance() and here.
+            // Check the single-instance cache under the creation lock. A
+            // hit means another core finished creating the instance for
+            // this UUID; reuse it instead of starting a new load.
             if let Some(existing) = self.single_instance_cache.get(uuid) {
                 return Ok(CreationReservation::ExistingSingleInstance(existing));
            }