From 21653dbc6dba9eb5f5b3437cfe2d9502a62d70c2 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Tue, 31 Mar 2026 18:12:27 -0700 Subject: [PATCH 01/12] update --- src/tensorrt_provider_factory.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/tensorrt_provider_factory.cc b/src/tensorrt_provider_factory.cc index f0e985a..336b271 100644 --- a/src/tensorrt_provider_factory.cc +++ b/src/tensorrt_provider_factory.cc @@ -88,15 +88,22 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp // The memory info is required to create allocator and gpu data transfer. int num_cuda_devices = 0; cudaGetDeviceCount(&num_cuda_devices); + + if (num_cuda_devices == 0) { + return factory->ort_api.CreateStatus(ORT_FAIL, "No CUDA devices found."); + } + RETURN_IF_ERROR(factory->CreateMemoryInfoForDevices(num_cuda_devices)); int32_t device_id = 0; + constexpr uint32_t kNvidiaVendorId = 0x10DE; for (size_t i = 0; i < num_devices && num_ep_devices < max_ep_devices; ++i) { // C API const OrtHardwareDevice& device = *devices[i]; - if (factory->ort_api.HardwareDevice_Type(&device) == OrtHardwareDeviceType::OrtHardwareDeviceType_GPU) { + if (factory->ort_api.HardwareDevice_Type(&device) == OrtHardwareDeviceType::OrtHardwareDeviceType_GPU && + factory->ort_api.HardwareDevice_VendorId(&device) == kNvidiaVendorId) { // These can be returned as nullptr if you have nothing to add. OrtKeyValuePairs* ep_metadata = nullptr; OrtKeyValuePairs* ep_options = nullptr; From 1de07bd80509590819138914dc658f9ec05d09e7 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Wed, 1 Apr 2026 09:17:16 -0700 Subject: [PATCH 02/12] check cuda api return value --- src/tensorrt_provider_factory.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/tensorrt_provider_factory.cc b/src/tensorrt_provider_factory.cc index 336b271..28f12f3 100644 --- a/src/tensorrt_provider_factory.cc +++ b/src/tensorrt_provider_factory.cc @@ -87,10 +87,13 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp // Create two memory infos per device. // The memory info is required to create allocator and gpu data transfer. int num_cuda_devices = 0; - cudaGetDeviceCount(&num_cuda_devices); + cudaError_t cuda_err = cudaGetDeviceCount(&num_cuda_devices); + if (cuda_err != cudaSuccess) { + return factory->ort_api.CreateStatus(ORT_EP_FAIL, cudaGetErrorString(cuda_err)); + } if (num_cuda_devices == 0) { - return factory->ort_api.CreateStatus(ORT_FAIL, "No CUDA devices found."); + return factory->ort_api.CreateStatus(ORT_EP_FAIL, "No CUDA devices found."); } RETURN_IF_ERROR(factory->CreateMemoryInfoForDevices(num_cuda_devices)); From ab9803756d73fbc4c6aeb628c64ae9b79ef18c18 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Wed, 1 Apr 2026 12:21:09 -0700 Subject: [PATCH 03/12] address reviewer's comments --- src/tensorrt_execution_provider_data_transfer.cc | 7 ++----- src/tensorrt_provider_factory.cc | 8 +++++--- src/utils/ep_utils.h | 2 ++ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/tensorrt_execution_provider_data_transfer.cc b/src/tensorrt_execution_provider_data_transfer.cc index ca74a33..64dae35 100644 --- a/src/tensorrt_execution_provider_data_transfer.cc +++ b/src/tensorrt_execution_provider_data_transfer.cc @@ -23,12 +23,9 @@ bool ORT_API_CALL TRTEpDataTransfer::CanCopyImpl(const OrtDataTransferImpl* this auto src_vendor_id = impl.ep_api.MemoryDevice_GetVendorId(src_memory_device); auto dst_vendor_id = impl.ep_api.MemoryDevice_GetVendorId(dst_memory_device); - // 0x10DE is the PCI vendor ID for NVIDIA - constexpr uint32_t nvidia_vendor_id = 0x10DE; - // Reject if GPU device is not NVIDIA - if ((src_type == OrtMemoryInfoDeviceType_GPU && src_vendor_id != nvidia_vendor_id) || - (dst_type == OrtMemoryInfoDeviceType_GPU && dst_vendor_id != nvidia_vendor_id)) { + if ((src_type == OrtMemoryInfoDeviceType_GPU && src_vendor_id != kNvidiaVendorId) || + (dst_type == OrtMemoryInfoDeviceType_GPU && dst_vendor_id != kNvidiaVendorId)) { return false; } diff --git a/src/tensorrt_provider_factory.cc b/src/tensorrt_provider_factory.cc index 28f12f3..33772cf 100644 --- a/src/tensorrt_provider_factory.cc +++ b/src/tensorrt_provider_factory.cc @@ -55,7 +55,7 @@ OrtStatus* TensorrtExecutionProviderFactory::CreateMemoryInfoForDevices(int num_ for (int device_id = 0; device_id < num_devices; ++device_id) { OrtMemoryInfo* mem_info = nullptr; RETURN_IF_ERROR(ort_api.CreateMemoryInfo_V2("Cuda", OrtMemoryInfoDeviceType_GPU, - /*vendor OrtDevice::VendorIds::NVIDIA*/ 0x10DE, + /* vendor_id */ kNvidiaVendorId, /* device_id */ device_id, OrtDeviceMemoryType_DEFAULT, /*alignment*/ 0, OrtAllocatorType::OrtDeviceAllocator, &mem_info)); @@ -64,7 +64,7 @@ OrtStatus* TensorrtExecutionProviderFactory::CreateMemoryInfoForDevices(int num_ // HOST_ACCESSIBLE memory should use the non-CPU device type mem_info = nullptr; RETURN_IF_ERROR(ort_api.CreateMemoryInfo_V2("CudaPinned", OrtMemoryInfoDeviceType_GPU, - /*vendor OrtDevice::VendorIds::NVIDIA*/ 0x10DE, + /* vendor_id */ kNvidiaVendorId, /* device_id */ device_id, OrtDeviceMemoryType_HOST_ACCESSIBLE, /*alignment*/ 0, OrtAllocatorType::OrtDeviceAllocator, &mem_info)); @@ -99,7 +99,6 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp RETURN_IF_ERROR(factory->CreateMemoryInfoForDevices(num_cuda_devices)); int32_t device_id = 0; - constexpr uint32_t kNvidiaVendorId = 0x10DE; for (size_t i = 0; i < num_devices && num_ep_devices < max_ep_devices; ++i) { // C API @@ -130,6 +129,9 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp return status; } + RETURN_IF_NOT(device_id < num_cuda_devices, + "The device_id for supported device exceeds the number of CUDA devices."); + const OrtMemoryInfo* cuda_gpu_mem_info = factory->cuda_gpu_memory_infos[device_id].get(); const OrtMemoryInfo* cuda_pinned_mem_info = factory->cuda_pinned_memory_infos[device_id].get(); diff --git a/src/utils/ep_utils.h b/src/utils/ep_utils.h index f940195..4ba8d05 100644 --- a/src/utils/ep_utils.h +++ b/src/utils/ep_utils.h @@ -23,6 +23,8 @@ struct ApiPtrs { namespace trt_ep { +constexpr uint32_t kNvidiaVendorId = 0x10DE; + #define ENFORCE(condition, ...) \ do { \ if (!(condition)) { \ From 0fa5f544152a100d5afd3ef403a05f9412283ff5 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Wed, 1 Apr 2026 17:17:56 -0700 Subject: [PATCH 04/12] address reviewer's comments --- src/tensorrt_provider_factory.cc | 151 +++++++++++++++---------------- src/tensorrt_provider_factory.h | 2 +- 2 files changed, 76 insertions(+), 77 deletions(-) diff --git a/src/tensorrt_provider_factory.cc b/src/tensorrt_provider_factory.cc index 33772cf..4f70971 100644 --- a/src/tensorrt_provider_factory.cc +++ b/src/tensorrt_provider_factory.cc @@ -48,27 +48,29 @@ const char* ORT_API_CALL TensorrtExecutionProviderFactory::GetVersionImpl(const return factory->ep_version_.c_str(); } -OrtStatus* TensorrtExecutionProviderFactory::CreateMemoryInfoForDevices(int num_devices) { - cuda_gpu_memory_infos.reserve(num_devices); - cuda_pinned_memory_infos.reserve(num_devices); - - for (int device_id = 0; device_id < num_devices; ++device_id) { +OrtStatus* TensorrtExecutionProviderFactory::EnsureMemoryInfoForDevice(uint32_t device_id) { + if (cuda_gpu_memory_infos.find(device_id) == cuda_gpu_memory_infos.end()) { OrtMemoryInfo* mem_info = nullptr; RETURN_IF_ERROR(ort_api.CreateMemoryInfo_V2("Cuda", OrtMemoryInfoDeviceType_GPU, /* vendor_id */ kNvidiaVendorId, - /* device_id */ device_id, OrtDeviceMemoryType_DEFAULT, - /*alignment*/ 0, OrtAllocatorType::OrtDeviceAllocator, &mem_info)); - - cuda_gpu_memory_infos[device_id] = MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo); + /* device_id */ static_cast(device_id), + OrtDeviceMemoryType_DEFAULT, + /* alignment */ 0, + OrtAllocatorType::OrtDeviceAllocator, + &mem_info)); + cuda_gpu_memory_infos.emplace(device_id, MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo)); + } - // HOST_ACCESSIBLE memory should use the non-CPU device type - mem_info = nullptr; + if (cuda_pinned_memory_infos.find(device_id) == cuda_pinned_memory_infos.end()) { + OrtMemoryInfo* mem_info = nullptr; RETURN_IF_ERROR(ort_api.CreateMemoryInfo_V2("CudaPinned", OrtMemoryInfoDeviceType_GPU, /* vendor_id */ kNvidiaVendorId, - /* device_id */ device_id, OrtDeviceMemoryType_HOST_ACCESSIBLE, - /*alignment*/ 0, OrtAllocatorType::OrtDeviceAllocator, &mem_info)); - - cuda_pinned_memory_infos[device_id] = MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo); + /* device_id */ static_cast(device_id), + OrtDeviceMemoryType_HOST_ACCESSIBLE, + /* alignment */ 0, + OrtAllocatorType::OrtDeviceAllocator, + &mem_info)); + cuda_pinned_memory_infos.emplace(device_id, MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo)); } return nullptr; @@ -84,89 +86,75 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp size_t& num_ep_devices = *p_num_ep_devices; auto* factory = static_cast(this_ptr); - // Create two memory infos per device. - // The memory info is required to create allocator and gpu data transfer. + // Prevent unbounded growth if ORT calls this function multiple times. + factory->cuda_gpu_mem_devices.clear(); + factory->cuda_pinned_mem_devices.clear(); + factory->cuda_gpu_mem_devices.reserve(num_devices); + factory->cuda_pinned_mem_devices.reserve(num_devices); + int num_cuda_devices = 0; cudaError_t cuda_err = cudaGetDeviceCount(&num_cuda_devices); - if (cuda_err != cudaSuccess) { - return factory->ort_api.CreateStatus(ORT_EP_FAIL, cudaGetErrorString(cuda_err)); - } - if (num_cuda_devices == 0) { - return factory->ort_api.CreateStatus(ORT_EP_FAIL, "No CUDA devices found."); + if (cuda_err == cudaErrorNoDevice || num_cuda_devices == 0) { + return nullptr; } - RETURN_IF_ERROR(factory->CreateMemoryInfoForDevices(num_cuda_devices)); - - int32_t device_id = 0; + if (cuda_err != cudaSuccess) { + return factory->ort_api.CreateStatus(ORT_EP_FAIL, cudaGetErrorString(cuda_err)); + } for (size_t i = 0; i < num_devices && num_ep_devices < max_ep_devices; ++i) { - // C API const OrtHardwareDevice& device = *devices[i]; - if (factory->ort_api.HardwareDevice_Type(&device) == OrtHardwareDeviceType::OrtHardwareDeviceType_GPU && - factory->ort_api.HardwareDevice_VendorId(&device) == kNvidiaVendorId) { - // These can be returned as nullptr if you have nothing to add. - OrtKeyValuePairs* ep_metadata = nullptr; - OrtKeyValuePairs* ep_options = nullptr; - factory->ort_api.CreateKeyValuePairs(&ep_metadata); - factory->ort_api.CreateKeyValuePairs(&ep_options); - - // The ep options can be provided here as default values. - // Users can also call SessionOptionsAppendExecutionProvider_V2 C API with provided ep options to override. - factory->ort_api.AddKeyValuePair(ep_metadata, "gpu_type", "data center"); // random example using made up values - factory->ort_api.AddKeyValuePair(ep_options, "trt_builder_optimization_level", "3"); + if (factory->ort_api.HardwareDevice_Type(&device) != OrtHardwareDeviceType::OrtHardwareDeviceType_GPU || + factory->ort_api.HardwareDevice_VendorId(&device) != kNvidiaVendorId) { + continue; + } - // OrtEpDevice copies ep_metadata and ep_options. - OrtEpDevice* ep_device = nullptr; - auto* status = factory->ort_api.GetEpApi()->CreateEpDevice(factory, &device, ep_metadata, ep_options, - &ep_device); + const uint32_t cuda_device_id = + static_cast(factory->ort_api.HardwareDevice_DeviceId(&device)); - factory->ort_api.ReleaseKeyValuePairs(ep_metadata); - factory->ort_api.ReleaseKeyValuePairs(ep_options); + // Create and cache the OrtMemoryInfo for this device if we haven't already, + // so they can be used for allocator and data transfer creation. + RETURN_IF_ERROR(factory->EnsureMemoryInfoForDevice(cuda_device_id)); - if (status != nullptr) { - return status; - } + const OrtMemoryInfo* cuda_gpu_mem_info = factory->cuda_gpu_memory_infos.at(cuda_device_id).get(); + const OrtMemoryInfo* cuda_pinned_mem_info = factory->cuda_pinned_memory_infos.at(cuda_device_id).get(); - RETURN_IF_NOT(device_id < num_cuda_devices, - "The device_id for supported device exceeds the number of CUDA devices."); + // These can be returned as nullptr if EP has nothing to add. + OrtKeyValuePairs* ep_metadata = nullptr; + OrtKeyValuePairs* ep_options = nullptr; + factory->ort_api.CreateKeyValuePairs(&ep_metadata); + factory->ort_api.CreateKeyValuePairs(&ep_options); - const OrtMemoryInfo* cuda_gpu_mem_info = factory->cuda_gpu_memory_infos[device_id].get(); - const OrtMemoryInfo* cuda_pinned_mem_info = factory->cuda_pinned_memory_infos[device_id].get(); + // The ep options can be provided here as default values. + // Users can also call SessionOptionsAppendExecutionProvider_V2 C API with provided ep options to override. + factory->ort_api.AddKeyValuePair(ep_metadata, "gpu_type", "data center"); + factory->ort_api.AddKeyValuePair(ep_options, "trt_builder_optimization_level", "3"); - // Register the allocator info required by TRT EP. - RETURN_IF_ERROR(factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, cuda_gpu_mem_info)); - RETURN_IF_ERROR(factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, cuda_pinned_mem_info)); + // OrtEpDevice copies ep_metadata and ep_options. + OrtEpDevice* ep_device = nullptr; + auto* status = factory->ort_api.GetEpApi()->CreateEpDevice(factory, &device, ep_metadata, ep_options, &ep_device); - // Get memory device from memory info for gpu data transfer - factory->cuda_gpu_mem_devices.push_back(factory->ep_api.MemoryInfo_GetMemoryDevice(cuda_gpu_mem_info)); - factory->cuda_pinned_mem_devices.push_back(factory->ep_api.MemoryInfo_GetMemoryDevice(cuda_pinned_mem_info)); + factory->ort_api.ReleaseKeyValuePairs(ep_metadata); + factory->ort_api.ReleaseKeyValuePairs(ep_options); - ep_devices[num_ep_devices++] = ep_device; - ++device_id; + if (status != nullptr) { + return status; } - // C++ API equivalent. Throws on error. - //{ - // Ort::ConstHardwareDevice device(devices[i]); - // if (device.Type() == OrtHardwareDeviceType::OrtHardwareDeviceType_GPU) { - // Ort::KeyValuePairs ep_metadata; - // Ort::KeyValuePairs ep_options; - // ep_metadata.Add("version", "0.1"); - // ep_options.Add("trt_builder_optimization_level", "3"); - // Ort::EpDevice ep_device{*this_ptr, device, ep_metadata.GetConst(), ep_options.GetConst()}; - // ep_devices[num_ep_devices++] = ep_device.release(); - // } - //} + RETURN_IF_ERROR(factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, cuda_gpu_mem_info)); + RETURN_IF_ERROR(factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, cuda_pinned_mem_info)); + + factory->cuda_gpu_mem_devices.push_back(factory->ep_api.MemoryInfo_GetMemoryDevice(cuda_gpu_mem_info)); + factory->cuda_pinned_mem_devices.push_back(factory->ep_api.MemoryInfo_GetMemoryDevice(cuda_pinned_mem_info)); + + ep_devices[num_ep_devices++] = ep_device; } - // Create gpu data transfer auto data_transfer_impl = std::make_unique(static_cast(*factory), - factory->cuda_gpu_mem_devices, // device memory - factory->cuda_pinned_mem_devices // shared memory - ); - + factory->cuda_gpu_mem_devices, + factory->cuda_pinned_mem_devices); factory->data_transfer_impl = std::move(data_transfer_impl); return nullptr; @@ -322,6 +310,17 @@ EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const const OrtEpApi* ort_ep_api = ort_api->GetEpApi(); const OrtModelEditorApi* model_editor_api = ort_api->GetModelEditorApi(); + // Fail fast if CUDA device is not available. + int num_cuda_devices = 0; + const cudaError_t cuda_err = cudaGetDeviceCount(&num_cuda_devices); + if (cuda_err != cudaSuccess) { + return ort_api->CreateStatus(ORT_EP_FAIL, cudaGetErrorString(cuda_err)); + } + + if (num_cuda_devices == 0) { + return ort_api->CreateStatus(ORT_EP_FAIL, "No CUDA devices found on the system."); + } + // Manual init for the C++ API Ort::InitApi(ort_api); diff --git a/src/tensorrt_provider_factory.h b/src/tensorrt_provider_factory.h index d016a9f..42473f3 100644 --- a/src/tensorrt_provider_factory.h +++ b/src/tensorrt_provider_factory.h @@ -17,7 +17,7 @@ struct TensorrtExecutionProviderFactory : public OrtEpFactory, public ApiPtrs { public: TensorrtExecutionProviderFactory(const char* ep_name, const OrtLogger& default_logger, ApiPtrs apis); - OrtStatus* CreateMemoryInfoForDevices(int num_devices); + OrtStatus* EnsureMemoryInfoForDevice(uint32_t device_id); // Called by child OrtEp instances to retrieve the cached kernel registry for that EP. OrtStatus* GetKernelRegistryForEp(TensorrtExecutionProvider& ep, /*out*/ const OrtKernelRegistry** kernel_registry); From d8a8d84400f05db191a7a68f930c1e860fc35a5b Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Thu, 2 Apr 2026 12:14:55 -0700 Subject: [PATCH 05/12] address reviewer's comments --- src/tensorrt_provider_factory.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/tensorrt_provider_factory.cc b/src/tensorrt_provider_factory.cc index 4f70971..46f06a8 100644 --- a/src/tensorrt_provider_factory.cc +++ b/src/tensorrt_provider_factory.cc @@ -114,7 +114,7 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp const uint32_t cuda_device_id = static_cast(factory->ort_api.HardwareDevice_DeviceId(&device)); - // Create and cache the OrtMemoryInfo for this device if we haven't already, + // Create and cache the OrtMemoryInfo for this device if we haven't already, // so they can be used for allocator and data transfer creation. RETURN_IF_ERROR(factory->EnsureMemoryInfoForDevice(cuda_device_id)); @@ -314,11 +314,15 @@ EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const int num_cuda_devices = 0; const cudaError_t cuda_err = cudaGetDeviceCount(&num_cuda_devices); if (cuda_err != cudaSuccess) { - return ort_api->CreateStatus(ORT_EP_FAIL, cudaGetErrorString(cuda_err)); + return ort_api->CreateStatus(ORT_EP_FAIL, cudaGetErrorString(cuda_err)); } if (num_cuda_devices == 0) { - return ort_api->CreateStatus(ORT_EP_FAIL, "No CUDA devices found on the system."); + Ort::ThrowOnError(ort_api->Logger_LogMessage(default_logger, + OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, + "No CUDA devices found on the system." + "TensorRT execution provider will still be created but will not be able to run any models.", + ORT_FILE, __LINE__, __FUNCTION__)); } // Manual init for the C++ API From bb325a27972d3c71094a769b581fc5b5a425c126 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Thu, 2 Apr 2026 13:32:47 -0700 Subject: [PATCH 06/12] update --- src/tensorrt_execution_provider.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensorrt_execution_provider.def b/src/tensorrt_execution_provider.def index ae83cb7..d2589b2 100644 --- a/src/tensorrt_execution_provider.def +++ b/src/tensorrt_execution_provider.def @@ -1,4 +1,4 @@ -LIBRARY "TensorRTEp.dll" +LIBRARY "ORTTensorRTEp.dll" EXPORTS CreateEpFactories @1 ReleaseEpFactory @2 From 86b17b04223953132b7637e287cde33efef6857c Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Fri, 10 Apr 2026 13:23:32 -0700 Subject: [PATCH 07/12] refactor --- src/tensorrt_execution_provider.cc | 22 +- ...ensorrt_execution_provider_data_transfer.h | 8 +- src/tensorrt_provider_factory.cc | 215 ++++++++++++------ src/tensorrt_provider_factory.h | 72 +++++- 4 files changed, 218 insertions(+), 99 deletions(-) diff --git a/src/tensorrt_execution_provider.cc b/src/tensorrt_execution_provider.cc index 80cc43d..3f3fed7 100644 --- a/src/tensorrt_execution_provider.cc +++ b/src/tensorrt_execution_provider.cc @@ -3050,11 +3050,12 @@ OrtStatus* TRTEpNodeComputeInfo::ComputeImpl(OrtNodeComputeInfo* this_ptr, void* std::unordered_map& dds_output_allocator_maps = ep.GetDDSOutputAllocators(); auto& dds_output_allocator_map = dds_output_allocator_maps[fused_node_name]; - // Get default OrtMemoryInfo from factory - const OrtMemoryInfo* mem_info = nullptr; - if (ep.factory_.cuda_gpu_memory_infos.find(device_id) != - ep.factory_.cuda_gpu_memory_infos.end()) { - mem_info = ep.factory_.cuda_gpu_memory_infos[device_id].get(); + // Get default OrtMemoryInfo from factory's device cache + const OrtMemoryInfo* mem_info = ep.factory_.GetMemoryInfoByOrdinal(device_id, /* is pinned */false); + if (mem_info == nullptr) { + std::string err_msg = "TensorRT EP failed to get OrtMemoryInfo for device_id " + + std::to_string(device_id) + " from provider factory."; + return ep.ort_api.CreateStatus(ORT_EP_FAIL, err_msg.c_str()); } // Get allocator from OrtKernelContext @@ -3770,11 +3771,12 @@ OrtStatus* TRTEpEpContextNodeComputeInfo::ComputeImpl(OrtNodeComputeInfo* this_p std::unordered_map> shape_tensor_values; // This map holds "shape tensor -> shape values" for the shape tensor input across this inference run std::unordered_map> shape_tensor_values_int64; // same as above but for int64 shape tensor input - // Get default OrtMemoryInfo from factory - const OrtMemoryInfo* mem_info = nullptr; - if (ep.factory_.cuda_gpu_memory_infos.find(device_id) != - ep.factory_.cuda_gpu_memory_infos.end()) { - mem_info = ep.factory_.cuda_gpu_memory_infos[device_id].get(); + // Get default OrtMemoryInfo from factory's device cache + const OrtMemoryInfo* mem_info = ep.factory_.GetMemoryInfoByOrdinal(device_id, /* is pinned */false); + if (mem_info == nullptr) { + std::string err_msg = "TensorRT EP failed to get OrtMemoryInfo for device_id " + + std::to_string(device_id) + " from provider factory."; + return ep.ort_api.CreateStatus(ORT_EP_FAIL, err_msg.c_str()); } // Get allocator from OrtKernelContext diff --git a/src/tensorrt_execution_provider_data_transfer.h b/src/tensorrt_execution_provider_data_transfer.h index 816a5eb..c6bf8a6 100644 --- a/src/tensorrt_execution_provider_data_transfer.h +++ b/src/tensorrt_execution_provider_data_transfer.h @@ -9,9 +9,7 @@ namespace trt_ep { struct TRTEpDataTransfer : OrtDataTransferImpl, ApiPtrs { - TRTEpDataTransfer(ApiPtrs api_ptrs, std::vector& device_mem_infos, - std::vector& shared_mem_infos) - : ApiPtrs(api_ptrs), cuda_gpu_mem_devices_{device_mem_infos}, cuda_pinned_mem_devices_{shared_mem_infos} { + TRTEpDataTransfer(ApiPtrs api_ptrs) : OrtDataTransferImpl{}, ApiPtrs(api_ptrs) { CanCopy = CanCopyImpl; CopyTensors = CopyTensorsImpl; Release = ReleaseImpl; @@ -26,9 +24,5 @@ struct TRTEpDataTransfer : OrtDataTransferImpl, ApiPtrs { OrtValue** dst_tensors_ptr, OrtSyncStream** streams_ptr, size_t num_tensors) noexcept; static void ORT_API_CALL ReleaseImpl(OrtDataTransferImpl* this_ptr) noexcept; - - private: - std::vector& cuda_gpu_mem_devices_; - std::vector& cuda_pinned_mem_devices_; }; } // namespace trt_ep \ No newline at end of file diff --git a/src/tensorrt_provider_factory.cc b/src/tensorrt_provider_factory.cc index 46f06a8..b4b74a4 100644 --- a/src/tensorrt_provider_factory.cc +++ b/src/tensorrt_provider_factory.cc @@ -14,23 +14,29 @@ namespace trt_ep { TensorrtExecutionProviderFactory::TensorrtExecutionProviderFactory(const char* ep_name, const OrtLogger& default_logger, ApiPtrs apis) - : OrtEpFactory {}, ApiPtrs(apis), default_logger_{default_logger}, ep_name_{ep_name} { + : OrtEpFactory {}, + ApiPtrs(apis), + default_logger_{default_logger}, + ep_name_{ep_name}, + ort_api_{apis.ort_api}, + ep_api_{apis.ep_api} { ort_version_supported = ORT_API_VERSION; // set to the ORT version we were compiled with. GetName = GetNameImpl; GetVendor = GetVendorImpl; GetVersion = GetVersionImpl; - GetSupportedDevices = GetSupportedDevicesImpl; - CreateEp = CreateEpImpl; ReleaseEp = ReleaseEpImpl; - CreateAllocator = CreateAllocatorImpl; ReleaseAllocator = ReleaseAllocatorImpl; - CreateDataTransfer = CreateDataTransferImpl; + IsStreamAware = IsStreamAwareImpl; +} - IsStreamAware = IsStreamAwareImpl; +TensorrtExecutionProviderFactory::~TensorrtExecutionProviderFactory() { + if (kernel_registry_ != nullptr) { + ep_api_.ReleaseKernelRegistry(kernel_registry_); + } } const char* ORT_API_CALL TensorrtExecutionProviderFactory::GetNameImpl(const OrtEpFactory* this_ptr) noexcept { @@ -48,32 +54,26 @@ const char* ORT_API_CALL TensorrtExecutionProviderFactory::GetVersionImpl(const return factory->ep_version_.c_str(); } -OrtStatus* TensorrtExecutionProviderFactory::EnsureMemoryInfoForDevice(uint32_t device_id) { - if (cuda_gpu_memory_infos.find(device_id) == cuda_gpu_memory_infos.end()) { - OrtMemoryInfo* mem_info = nullptr; - RETURN_IF_ERROR(ort_api.CreateMemoryInfo_V2("Cuda", OrtMemoryInfoDeviceType_GPU, - /* vendor_id */ kNvidiaVendorId, - /* device_id */ static_cast(device_id), - OrtDeviceMemoryType_DEFAULT, - /* alignment */ 0, - OrtAllocatorType::OrtDeviceAllocator, - &mem_info)); - cuda_gpu_memory_infos.emplace(device_id, MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo)); - } - - if (cuda_pinned_memory_infos.find(device_id) == cuda_pinned_memory_infos.end()) { - OrtMemoryInfo* mem_info = nullptr; - RETURN_IF_ERROR(ort_api.CreateMemoryInfo_V2("CudaPinned", OrtMemoryInfoDeviceType_GPU, - /* vendor_id */ kNvidiaVendorId, - /* device_id */ static_cast(device_id), - OrtDeviceMemoryType_HOST_ACCESSIBLE, - /* alignment */ 0, - OrtAllocatorType::OrtDeviceAllocator, - &mem_info)); - cuda_pinned_memory_infos.emplace(device_id, MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo)); - } +const OrtMemoryInfo* TensorrtExecutionProviderFactory::GetMemoryInfoByOrdinal(int cuda_ordinal, bool is_pinned) { + // Get default OrtMemoryInfo from factory's device cache + const OrtMemoryInfo* mem_info = nullptr; + auto* cache_entry = FindDeviceCacheEntryByOrdinal(cuda_ordinal); + if (cache_entry != nullptr) { + mem_info = is_pinned ? cache_entry->pinned_memory_info : + cache_entry->device_memory_info; // Ort::MemoryInfo implicitly converts to OrtMemoryInfo* + } + return mem_info; +} - return nullptr; +TensorrtExecutionProviderFactory::HardwareDeviceKey TensorrtExecutionProviderFactory::MakeDeviceKey(const OrtApi& ort_api, + const OrtHardwareDevice& device, + int cuda_ordinal) { + return { + ort_api.HardwareDevice_Type(&device), + ort_api.HardwareDevice_VendorId(&device), + ort_api.HardwareDevice_DeviceId(&device), + cuda_ordinal, + }; } OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImpl( @@ -86,23 +86,29 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp size_t& num_ep_devices = *p_num_ep_devices; auto* factory = static_cast(this_ptr); - // Prevent unbounded growth if ORT calls this function multiple times. - factory->cuda_gpu_mem_devices.clear(); - factory->cuda_pinned_mem_devices.clear(); - factory->cuda_gpu_mem_devices.reserve(num_devices); - factory->cuda_pinned_mem_devices.reserve(num_devices); - - int num_cuda_devices = 0; - cudaError_t cuda_err = cudaGetDeviceCount(&num_cuda_devices); - - if (cuda_err == cudaErrorNoDevice || num_cuda_devices == 0) { - return nullptr; + // Clear stale ordinal mappings from any prior enumeration. + { + std::lock_guard lock(factory->device_cache_mutex_); + factory->ordinal_to_device_key_.clear(); } + auto release_ep_devices = [&](OrtStatus* status) -> OrtStatus* { + for (size_t j = 0; j < num_ep_devices; ++j) { + factory->ep_api.ReleaseEpDevice(ep_devices[j]); + ep_devices[j] = nullptr; + } + num_ep_devices = 0; + return status; + }; + + // Query CUDA device count once upfront so we can validate assigned ordinals. + int cuda_device_count = 0; + cudaError_t cuda_err = cudaGetDeviceCount(&cuda_device_count); if (cuda_err != cudaSuccess) { - return factory->ort_api.CreateStatus(ORT_EP_FAIL, cudaGetErrorString(cuda_err)); + cuda_device_count = 0; // no CUDA devices available } + int cuda_device_index = 0; for (size_t i = 0; i < num_devices && num_ep_devices < max_ep_devices; ++i) { const OrtHardwareDevice& device = *devices[i]; @@ -111,26 +117,64 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp continue; } - const uint32_t cuda_device_id = - static_cast(factory->ort_api.HardwareDevice_DeviceId(&device)); + // CUDA uses contiguous ordinals for CUDA-visible NVIDIA devices. Build that + // mapping from the filtered hardware-device list instead of relying on the + // ORT hardware device id, which is not guaranteed to be a CUDA ordinal. + int current_device_id = cuda_device_index++; - // Create and cache the OrtMemoryInfo for this device if we haven't already, - // so they can be used for allocator and data transfer creation. - RETURN_IF_ERROR(factory->EnsureMemoryInfoForDevice(cuda_device_id)); + // Validate the assigned ordinal is within the range of CUDA-visible devices. + // If hardware enumeration reports GPUs not visible to CUDA (e.g. due to + // CUDA_VISIBLE_DEVICES), skip them to avoid failures in allocator/stream creation. + if (current_device_id >= cuda_device_count) { + continue; + } - const OrtMemoryInfo* cuda_gpu_mem_info = factory->cuda_gpu_memory_infos.at(cuda_device_id).get(); - const OrtMemoryInfo* cuda_pinned_mem_info = factory->cuda_pinned_memory_infos.at(cuda_device_id).get(); + const auto device_key = MakeDeviceKey(factory->ort_api, device, current_device_id); + DeviceCacheEntry* cache_entry = nullptr; + { + std::lock_guard lock(factory->device_cache_mutex_); + auto [it, inserted] = factory->device_cache_.try_emplace(device_key); + if (inserted) { + it->second.cuda_device_id = current_device_id; + it->second.device_memory_info = Ort::MemoryInfo{"Cuda", + OrtMemoryInfoDeviceType_GPU, + kNvidiaVendorId, + static_cast(current_device_id), + OrtDeviceMemoryType_DEFAULT, + /*alignment is default*/ 0, + OrtAllocatorType::OrtDeviceAllocator}; + it->second.pinned_memory_info = Ort::MemoryInfo{"CudaPinned", + OrtMemoryInfoDeviceType_GPU, + kNvidiaVendorId, + static_cast(current_device_id), + OrtDeviceMemoryType_HOST_ACCESSIBLE, + /*alignment is default*/ 0, + OrtAllocatorType::OrtDeviceAllocator}; + } + + cache_entry = &it->second; + current_device_id = cache_entry->cuda_device_id; + // Build ordinal -> key mapping for CreateAllocatorImpl lookups. + factory->ordinal_to_device_key_[current_device_id] = device_key; + } // These can be returned as nullptr if EP has nothing to add. OrtKeyValuePairs* ep_metadata = nullptr; OrtKeyValuePairs* ep_options = nullptr; factory->ort_api.CreateKeyValuePairs(&ep_metadata); factory->ort_api.CreateKeyValuePairs(&ep_options); - - // The ep options can be provided here as default values. - // Users can also call SessionOptionsAppendExecutionProvider_V2 C API with provided ep options to override. - factory->ort_api.AddKeyValuePair(ep_metadata, "gpu_type", "data center"); - factory->ort_api.AddKeyValuePair(ep_options, "trt_builder_optimization_level", "3"); + factory->ort_api.AddKeyValuePair(ep_metadata, "cuda_device_id", std::to_string(current_device_id).c_str()); + factory->ort_api.AddKeyValuePair(ep_options, "device_id", std::to_string(current_device_id).c_str()); + + // Get CUDA device properties for metadata + { + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, current_device_id) == cudaSuccess) { + factory->ort_api.AddKeyValuePair(ep_metadata, "cuda_device_name", prop.name); + factory->ort_api.AddKeyValuePair(ep_metadata, "cuda_compute_capability", + (std::to_string(prop.major) + "." + std::to_string(prop.minor)).c_str()); + } + } // OrtEpDevice copies ep_metadata and ep_options. OrtEpDevice* ep_device = nullptr; @@ -140,22 +184,32 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp factory->ort_api.ReleaseKeyValuePairs(ep_options); if (status != nullptr) { - return status; + return release_ep_devices(status); } - RETURN_IF_ERROR(factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, cuda_gpu_mem_info)); - RETURN_IF_ERROR(factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, cuda_pinned_mem_info)); + auto release_current_ep_device = [factory](OrtEpDevice* device) { + factory->ep_api.ReleaseEpDevice(device); + }; - factory->cuda_gpu_mem_devices.push_back(factory->ep_api.MemoryInfo_GetMemoryDevice(cuda_gpu_mem_info)); - factory->cuda_pinned_mem_devices.push_back(factory->ep_api.MemoryInfo_GetMemoryDevice(cuda_pinned_mem_info)); + // ep_device_guard owns the current device. On error, release_ep_devices cleans up + // previously committed devices [0, num_ep_devices), while the guard cleans up this one. + std::unique_ptr ep_device_guard(ep_device, release_current_ep_device); - ep_devices[num_ep_devices++] = ep_device; - } + // Register allocator info for GPU device memory + status = factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, cache_entry->device_memory_info); + if (status != nullptr) { + return release_ep_devices(status); + } + + // Register allocator info for pinned host memory associated with the + // same CUDA ordinal as the device allocator above. + status = factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, cache_entry->pinned_memory_info); + if (status != nullptr) { + return release_ep_devices(status); + } - auto data_transfer_impl = std::make_unique(static_cast(*factory), - factory->cuda_gpu_mem_devices, - factory->cuda_pinned_mem_devices); - factory->data_transfer_impl = std::move(data_transfer_impl); + ep_devices[num_ep_devices++] = ep_device_guard.release(); + } return nullptr; } @@ -261,7 +315,9 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateDataTransferImpl OrtEpFactory* this_ptr, OrtDataTransferImpl** data_transfer) noexcept { auto& factory = *static_cast(this_ptr); - *data_transfer = factory.data_transfer_impl.get(); + + auto data_transfer_impl = std::make_unique(static_cast(factory)); + *data_transfer = data_transfer_impl.release(); return nullptr; } @@ -279,22 +335,39 @@ OrtStatus* TensorrtExecutionProviderFactory::GetKernelRegistryForEp(TensorrtExec } if (kernel_registry_ == nullptr) { - // Optional state that is provided to kernels on creation (can be null). - // We pass the OrtDataTransferImpl created by this factory to allow kernels to copy data between devices. - void* op_kernel_state = static_cast(data_transfer_impl.get()); const char* ep_name = ep.GetName(static_cast(&ep)); // This statement creates the kernel registry and caches it in the OrtEpFactory instance. // We assume that all EPs created by this factory can use the same kernel registry. This may not be the // case in a more complex OrtEpFactory that can create EP instances that are each configured for different // hardware devices. In such a scenario, a different kernel registry may be created for each EP configuration. - RETURN_IF_ERROR(CreateKernelRegistry(ep_name, op_kernel_state, &kernel_registry_)); + RETURN_IF_ERROR(CreateKernelRegistry(ep_name, nullptr, &kernel_registry_)); } *out_kernel_registry = kernel_registry_; return nullptr; } +TensorrtExecutionProviderFactory::DeviceCacheEntry* TensorrtExecutionProviderFactory::FindDeviceCacheEntryByOrdinalLocked(int cuda_ordinal) { + auto key_it = ordinal_to_device_key_.find(cuda_ordinal); + if (key_it == ordinal_to_device_key_.end()) { + return nullptr; + } + auto cache_it = device_cache_.find(key_it->second); + if (cache_it == device_cache_.end()) { + return nullptr; + } + return &cache_it->second; +} + +// IMPORTANT: Entries are never erased from device_cache_ after insertion. +// This guarantees pointer stability for DeviceCacheEntry* returned by +// FindDeviceCacheEntryByOrdinal() after the lock is released. +TensorrtExecutionProviderFactory::DeviceCacheEntry* TensorrtExecutionProviderFactory::FindDeviceCacheEntryByOrdinal(int cuda_ordinal) { + std::lock_guard lock(device_cache_mutex_); + return FindDeviceCacheEntryByOrdinalLocked(cuda_ordinal); +} + } // namespace trt_ep #define EXPORT_SYMBOL diff --git a/src/tensorrt_provider_factory.h b/src/tensorrt_provider_factory.h index 42473f3..0372712 100644 --- a/src/tensorrt_provider_factory.h +++ b/src/tensorrt_provider_factory.h @@ -4,6 +4,13 @@ #include "tensorrt_execution_provider_data_transfer.h" #include "cuda_allocator.h" +#include +#include +#include +#include +#include +#include + using MemoryInfoUniquePtr = std::unique_ptr>; namespace trt_ep { @@ -16,26 +23,17 @@ struct TensorrtExecutionProvider; struct TensorrtExecutionProviderFactory : public OrtEpFactory, public ApiPtrs { public: TensorrtExecutionProviderFactory(const char* ep_name, const OrtLogger& default_logger, ApiPtrs apis); - - OrtStatus* EnsureMemoryInfoForDevice(uint32_t device_id); + ~TensorrtExecutionProviderFactory(); // Called by child OrtEp instances to retrieve the cached kernel registry for that EP. OrtStatus* GetKernelRegistryForEp(TensorrtExecutionProvider& ep, /*out*/ const OrtKernelRegistry** kernel_registry); - // CUDA gpu memory and CUDA pinned memory are required for allocator and data transfer, these are the OrtMemoryInfo - // instance required for that. - // Current TRT EP implementation uses one default OrtMemoryInfo and one host accessible OrtMemoryInfo per ep device. - std::unordered_map cuda_gpu_memory_infos; // device id -> memory info - std::unordered_map cuda_pinned_memory_infos; + const OrtMemoryInfo* GetMemoryInfoByOrdinal(int cuda_ordinal, bool is_pinned); // Keeps allocators per ep device in factory so they can be shared across sessions. std::unordered_map> cuda_gpu_allocators; // device id -> allocator std::unordered_map> cuda_pinned_allocators; - std::vector cuda_gpu_mem_devices; - std::vector cuda_pinned_mem_devices; - std::unique_ptr data_transfer_impl; // data transfer implementation for this factory - private: static const char* ORT_API_CALL GetNameImpl(const OrtEpFactory* this_ptr) noexcept; @@ -69,6 +67,9 @@ struct TensorrtExecutionProviderFactory : public OrtEpFactory, public ApiPtrs { const std::string ep_name_; // EP name const std::string vendor_{"Nvidia"}; // EP vendor name const std::string ep_version_{"0.1.0"}; // EP version + + const OrtApi& ort_api_; + const OrtEpApi& ep_api_; const OrtLogger& default_logger_; // Cached kernel registry used by all OrtEp instances created by this factory. Refer to OrtEp::GetKernelRegistry. @@ -76,5 +77,54 @@ struct TensorrtExecutionProviderFactory : public OrtEpFactory, public ApiPtrs { // Note: If this factory instead created EP instances that each supported different hardware configurations, then // the factory could cache a different kernel registry per EP configuration. OrtKernelRegistry* kernel_registry_ = nullptr; + + struct HardwareDeviceKey { + OrtHardwareDeviceType type{ OrtHardwareDeviceType::OrtHardwareDeviceType_CPU }; + uint32_t vendor_id{ 0 }; + uint32_t device_id{ 0 }; // PCI device ID — identifies the hardware model, NOT a unique device + int cuda_ordinal{ -1 }; // CUDA ordinal — unique per physical GPU on this host + + bool operator==(const HardwareDeviceKey& other) const noexcept { + return type == other.type && + vendor_id == other.vendor_id && + device_id == other.device_id && + cuda_ordinal == other.cuda_ordinal; + } + }; + + struct HardwareDeviceKeyHasher { + size_t operator()(const HardwareDeviceKey& key) const noexcept { + size_t hash = static_cast(key.type); + hash = (hash * 1315423911u) ^ static_cast(key.vendor_id); + hash = (hash * 1315423911u) ^ static_cast(key.device_id); + hash = (hash * 1315423911u) ^ static_cast(key.cuda_ordinal); + return hash; + } + }; + + static HardwareDeviceKey MakeDeviceKey(const OrtApi& ort_api, + const OrtHardwareDevice& device, + int cuda_ordinal); + + struct DeviceCacheEntry { + int cuda_device_id{ -1 }; + Ort::MemoryInfo device_memory_info{ nullptr }; + Ort::MemoryInfo pinned_memory_info{ nullptr }; + }; + + // Per-physical-device cache. The key includes the CUDA ordinal to distinguish + // identical GPUs (same PCI vendor/device ID) on multi-GPU hosts. + std::mutex device_cache_mutex_; + std::unordered_map device_cache_; + + // Ordinal-to-HardwareDeviceKey mapping built during GetSupportedDevicesImpl. + std::unordered_map ordinal_to_device_key_; + + /// Find the DeviceCacheEntry for a given CUDA ordinal. + /// Returns nullptr if the ordinal has not been registered. + DeviceCacheEntry* FindDeviceCacheEntryByOrdinal(int cuda_ordinal); + + /// Same as FindDeviceCacheEntryByOrdinal but assumes device_cache_mutex_ is already held. + DeviceCacheEntry* FindDeviceCacheEntryByOrdinalLocked(int cuda_ordinal); }; } // namespace trt_ep \ No newline at end of file From 41ea68c120e3157ab24a3ace826c8f53ebb17dc5 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Fri, 10 Apr 2026 16:12:32 -0700 Subject: [PATCH 08/12] add TRT builder place holder for test --- src/tensorrt_provider_factory.cc | 50 ++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/tensorrt_provider_factory.cc b/src/tensorrt_provider_factory.cc index b4b74a4..85a1c92 100644 --- a/src/tensorrt_provider_factory.cc +++ b/src/tensorrt_provider_factory.cc @@ -11,6 +11,47 @@ #include #include +// --------------------------------------------------------------------------- +// TensorRT builder placeholder for test scenarios. +// +// TensorRT loads/unloads heavy internal libraries every time all IBuilder +// instances are destroyed. During unit testing (e.g., onnxruntime_provider_test) +// EPs are rapidly created and torn down, causing repeated overhead. +// +// ORT's test_main.cc has the same optimization behind `#ifdef USE_TENSORRT`, +// but that define is never set for plugin EPs. Instead we guard creation with +// an environment variable that the test harness can set: +// +// set ORT_TRT_EP_ENABLE_BUILDER_PLACEHOLDER=1 +// +// The placeholder is created once in CreateEpFactories() and destroyed in +// ReleaseEpFactory(), matching the factory's lifetime. +// --------------------------------------------------------------------------- +namespace { + +class PlaceholderTrtLogger : public nvinfer1::ILogger { + public: + void log(Severity /*severity*/, const char* /*msg*/) noexcept override {} +}; + +PlaceholderTrtLogger g_placeholder_trt_logger; +std::unique_ptr g_trt_builder_placeholder; + +void MaybeCreateBuilderPlaceholder() { + if (g_trt_builder_placeholder) return; // already created + + const char* env = std::getenv("ORT_TRT_EP_ENABLE_BUILDER_PLACEHOLDER"); + if (env != nullptr && std::string(env) == "1") { + g_trt_builder_placeholder.reset(nvinfer1::createInferBuilder(g_placeholder_trt_logger)); + } +} + +void DestroyBuilderPlaceholder() { + g_trt_builder_placeholder.reset(); +} + +} // namespace + namespace trt_ep { TensorrtExecutionProviderFactory::TensorrtExecutionProviderFactory(const char* ep_name, const OrtLogger& default_logger, ApiPtrs apis) @@ -401,6 +442,11 @@ EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const // Manual init for the C++ API Ort::InitApi(ort_api); + // Create TRT builder placeholder if running under a test harness. + // This prevents TensorRT from repeatedly loading/unloading internal + // libraries as EP instances are created and destroyed across tests. + MaybeCreateBuilderPlaceholder(); + // Factory could use registration_name or define its own EP name. std::unique_ptr factory = std::make_unique(registration_name, *default_logger, ApiPtrs{*ort_api, *ort_ep_api, *model_editor_api}); @@ -417,6 +463,10 @@ EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const EXPORT_SYMBOL OrtStatus* ReleaseEpFactory(OrtEpFactory* factory) { delete static_cast(factory); + + // Release the placeholder builder when the last factory is torn down. + DestroyBuilderPlaceholder(); + return nullptr; } From 3699c7ff45cfdbed9a6ccccdc0fab01eccfc95e9 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Tue, 14 Apr 2026 11:25:56 -0700 Subject: [PATCH 09/12] address reviewer's comments --- ...nsorrt_execution_provider_data_transfer.cc | 7 +-- src/tensorrt_provider_factory.cc | 59 +++++++++++++------ 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/tensorrt_execution_provider_data_transfer.cc b/src/tensorrt_execution_provider_data_transfer.cc index 64dae35..61af7ee 100644 --- a/src/tensorrt_execution_provider_data_transfer.cc +++ b/src/tensorrt_execution_provider_data_transfer.cc @@ -107,11 +107,6 @@ OrtStatus* ORT_API_CALL TRTEpDataTransfer::CopyTensorsImpl(OrtDataTransferImpl* /*static*/ void ORT_API_CALL TRTEpDataTransfer::ReleaseImpl(OrtDataTransferImpl* this_ptr) noexcept { - // In our setup the factory owns a shared ExampleDataTransfer instance so it will do the cleanup, and we ignore - // the call to Release from the plugin_ep::DataTransfer dtor (see /onnxruntime/core/framework/plugin_data_transfer.h) - // - // If you create a new instance on each call to OrtEpFactory::CreateDataTransfer you call `delete` here - // delete static_cast(this_ptr); - ; + delete static_cast(this_ptr); } } // namespace trt_ep diff --git a/src/tensorrt_provider_factory.cc b/src/tensorrt_provider_factory.cc index 85a1c92..0838ecb 100644 --- a/src/tensorrt_provider_factory.cc +++ b/src/tensorrt_provider_factory.cc @@ -149,7 +149,14 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp cuda_device_count = 0; // no CUDA devices available } - int cuda_device_index = 0; + if (cuda_device_count == 0) { + RETURN_IF_ERROR(factory->ort_api.Logger_LogMessage(&factory->default_logger_, + OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, + "No CUDA devices found on the system. No OrtEpDevice will be created and returned.", + ORT_FILE, __LINE__, __FUNCTION__)); + } + + int cuda_device_index_fallback = 0; // fallback counter when metadata lacks PCI bus ID for (size_t i = 0; i < num_devices && num_ep_devices < max_ep_devices; ++i) { const OrtHardwareDevice& device = *devices[i]; @@ -158,10 +165,27 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp continue; } - // CUDA uses contiguous ordinals for CUDA-visible NVIDIA devices. Build that - // mapping from the filtered hardware-device list instead of relying on the - // ORT hardware device id, which is not guaranteed to be a CUDA ordinal. - int current_device_id = cuda_device_index++; + // Try to resolve the CUDA ordinal from pci_bus_id metadata if available. + // This is more reliable than counter-based ordinal assignment because it is + // not affected by enumeration order, CUDA_VISIBLE_DEVICES remapping, or + // mixed-vendor GPU configurations. + int current_device_id = -1; + const OrtKeyValuePairs* metadata = factory->ort_api_.HardwareDevice_Metadata(&device); + if (metadata != nullptr) { + const char* pci_bus_id = factory->ort_api_.GetKeyValue(metadata, "pci_bus_id"); + if (pci_bus_id != nullptr && pci_bus_id[0] != '\0') { + int resolved_ordinal = -1; + cudaError_t err = cudaDeviceGetByPCIBusId(&resolved_ordinal, pci_bus_id); + if (err == cudaSuccess && resolved_ordinal >= 0 && resolved_ordinal < cuda_device_count) { + current_device_id = resolved_ordinal; + } + } + } + + // Fallback: if pci_bus_id was not available, use counter-based ordinal assignment. + if (current_device_id < 0) { + current_device_id = cuda_device_index_fallback++; + } // Validate the assigned ordinal is within the range of CUDA-visible devices. // If hardware enumeration reports GPUs not visible to CUDA (e.g. due to @@ -424,24 +448,23 @@ EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const const OrtEpApi* ort_ep_api = ort_api->GetEpApi(); const OrtModelEditorApi* model_editor_api = ort_api->GetModelEditorApi(); - // Fail fast if CUDA device is not available. - int num_cuda_devices = 0; - const cudaError_t cuda_err = cudaGetDeviceCount(&num_cuda_devices); + // Manual init for the C++ API + Ort::InitApi(ort_api); + + int cuda_device_count = 0; + const cudaError_t cuda_err = cudaGetDeviceCount(&cuda_device_count); if (cuda_err != cudaSuccess) { - return ort_api->CreateStatus(ORT_EP_FAIL, cudaGetErrorString(cuda_err)); + cuda_device_count = 0; // no CUDA devices available } - if (num_cuda_devices == 0) { - Ort::ThrowOnError(ort_api->Logger_LogMessage(default_logger, - OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, - "No CUDA devices found on the system." - "TensorRT execution provider will still be created but will not be able to run any models.", - ORT_FILE, __LINE__, __FUNCTION__)); + if (cuda_device_count == 0) { + RETURN_IF_ERROR(ort_api->Logger_LogMessage(default_logger, + OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, + "No CUDA devices found on the system." + "TensorRT execution provider will still be created but will not be able to run any models.", + ORT_FILE, __LINE__, __FUNCTION__)); } - // Manual init for the C++ API - Ort::InitApi(ort_api); - // Create TRT builder placeholder if running under a test harness. // This prevents TensorRT from repeatedly loading/unloading internal // libraries as EP instances are created and destroyed across tests. From 0e3671843535ac7e8c311c8c4a4f02d7ae412427 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Tue, 14 Apr 2026 13:31:47 -0700 Subject: [PATCH 10/12] address reviewer's comments --- src/tensorrt_provider_factory.cc | 33 ++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/tensorrt_provider_factory.cc b/src/tensorrt_provider_factory.cc index 0838ecb..3fccea9 100644 --- a/src/tensorrt_provider_factory.cc +++ b/src/tensorrt_provider_factory.cc @@ -146,7 +146,11 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp int cuda_device_count = 0; cudaError_t cuda_err = cudaGetDeviceCount(&cuda_device_count); if (cuda_err != cudaSuccess) { - cuda_device_count = 0; // no CUDA devices available + // CUDA API failure (e.g., driver not loaded, version mismatch) is a hard error. + // This is distinct from the case where CUDA works but reports zero devices. + std::string err_msg = std::string("cudaGetDeviceCount failed: ") + cudaGetErrorString(cuda_err) + + " (" + std::to_string(static_cast(cuda_err)) + ")"; + return factory->ort_api.CreateStatus(ORT_RUNTIME_EXCEPTION, err_msg.c_str()); } if (cuda_device_count == 0) { @@ -454,15 +458,28 @@ EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const int cuda_device_count = 0; const cudaError_t cuda_err = cudaGetDeviceCount(&cuda_device_count); if (cuda_err != cudaSuccess) { - cuda_device_count = 0; // no CUDA devices available + // CUDA API failure (e.g., driver not loaded, version mismatch) is a hard error. + // This is distinct from the case where CUDA works but reports zero devices. + std::string err_msg = std::string("cudaGetDeviceCount failed: ") + cudaGetErrorString(cuda_err) + + " (" + std::to_string(static_cast(cuda_err)) + ")"; + return ort_api->CreateStatus(ORT_RUNTIME_EXCEPTION, err_msg.c_str()); } - if (cuda_device_count == 0) { - RETURN_IF_ERROR(ort_api->Logger_LogMessage(default_logger, - OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, - "No CUDA devices found on the system." - "TensorRT execution provider will still be created but will not be able to run any models.", - ORT_FILE, __LINE__, __FUNCTION__)); + try { + if (cuda_device_count == 0) { + auto* log_status = ort_api->Logger_LogMessage(default_logger, ORT_LOGGING_LEVEL_INFO, + "No CUDA devices found on the system." + "TensorRT execution provider will still be " + "created but will not be able to run any models.", + ORT_FILE, __LINE__, __FUNCTION__); + if (log_status) ort_api->ReleaseStatus(log_status); + } + } + catch (const std::exception& ex) { + auto* log_status = ort_api->Logger_LogMessage(default_logger, ORT_LOGGING_LEVEL_ERROR, + ex.what(), ORT_FILE, __LINE__, __FUNCTION__); + if (log_status) ort_api->ReleaseStatus(log_status); + return ort_api->CreateStatus(ORT_EP_FAIL, ex.what()); } // Create TRT builder placeholder if running under a test harness. From 100f4402044d772e83778ecd79cdfb8d2e55a480 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Wed, 15 Apr 2026 12:06:58 -0700 Subject: [PATCH 11/12] address reviewer's comment --- src/tensorrt_provider_factory.cc | 73 +++++++++++++++++--------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/src/tensorrt_provider_factory.cc b/src/tensorrt_provider_factory.cc index 3fccea9..db57a94 100644 --- a/src/tensorrt_provider_factory.cc +++ b/src/tensorrt_provider_factory.cc @@ -455,17 +455,17 @@ EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const // Manual init for the C++ API Ort::InitApi(ort_api); - int cuda_device_count = 0; - const cudaError_t cuda_err = cudaGetDeviceCount(&cuda_device_count); - if (cuda_err != cudaSuccess) { - // CUDA API failure (e.g., driver not loaded, version mismatch) is a hard error. - // This is distinct from the case where CUDA works but reports zero devices. - std::string err_msg = std::string("cudaGetDeviceCount failed: ") + cudaGetErrorString(cuda_err) + - " (" + std::to_string(static_cast(cuda_err)) + ")"; - return ort_api->CreateStatus(ORT_RUNTIME_EXCEPTION, err_msg.c_str()); - } - try { + int cuda_device_count = 0; + const cudaError_t cuda_err = cudaGetDeviceCount(&cuda_device_count); + if (cuda_err != cudaSuccess) { + // CUDA API failure (e.g., driver not loaded, version mismatch) is a hard error. + // This is distinct from the case where CUDA works but reports zero devices. + std::string err_msg = std::string("cudaGetDeviceCount failed: ") + cudaGetErrorString(cuda_err) + + " (" + std::to_string(static_cast(cuda_err)) + ")"; + return ort_api->CreateStatus(ORT_RUNTIME_EXCEPTION, err_msg.c_str()); + } + if (cuda_device_count == 0) { auto* log_status = ort_api->Logger_LogMessage(default_logger, ORT_LOGGING_LEVEL_INFO, "No CUDA devices found on the system." @@ -474,38 +474,43 @@ EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const ORT_FILE, __LINE__, __FUNCTION__); if (log_status) ort_api->ReleaseStatus(log_status); } - } - catch (const std::exception& ex) { - auto* log_status = ort_api->Logger_LogMessage(default_logger, ORT_LOGGING_LEVEL_ERROR, - ex.what(), ORT_FILE, __LINE__, __FUNCTION__); - if (log_status) ort_api->ReleaseStatus(log_status); - return ort_api->CreateStatus(ORT_EP_FAIL, ex.what()); - } - // Create TRT builder placeholder if running under a test harness. - // This prevents TensorRT from repeatedly loading/unloading internal - // libraries as EP instances are created and destroyed across tests. - MaybeCreateBuilderPlaceholder(); + // Create TRT builder placeholder if running under a test harness. + // This prevents TensorRT from repeatedly loading/unloading internal + // libraries as EP instances are created and destroyed across tests. + MaybeCreateBuilderPlaceholder(); - // Factory could use registration_name or define its own EP name. - std::unique_ptr factory = std::make_unique(registration_name, *default_logger, ApiPtrs{*ort_api, *ort_ep_api, *model_editor_api}); + // Factory could use registration_name or define its own EP name. + std::unique_ptr factory = std::make_unique( + registration_name, *default_logger, ApiPtrs{*ort_api, *ort_ep_api, *model_editor_api}); - if (max_factories < 1) { - return ort_api->CreateStatus(ORT_INVALID_ARGUMENT, - "Not enough space to return EP factory. Need at least one."); - } + if (max_factories < 1) { + return ort_api->CreateStatus(ORT_INVALID_ARGUMENT, + "Not enough space to return EP factory. Need at least one."); + } - factories[0] = factory.release(); - *num_factories = 1; + factories[0] = factory.release(); + *num_factories = 1; - return nullptr; + return nullptr; + } catch (const std::exception& ex) { + return ort_api->CreateStatus(ORT_EP_FAIL, ex.what()); + } catch (...) { + return ort_api->CreateStatus(ORT_EP_FAIL, "Unknown exception in CreateEpFactories"); + } } EXPORT_SYMBOL OrtStatus* ReleaseEpFactory(OrtEpFactory* factory) { - delete static_cast(factory); - - // Release the placeholder builder when the last factory is torn down. - DestroyBuilderPlaceholder(); + try { + delete static_cast(factory); + + // Release the placeholder builder when the last factory is torn down. + DestroyBuilderPlaceholder(); + } catch (const std::exception& ex) { + // Best-effort: ReleaseEpFactory shouldn't normally throw, but guard the C boundary. + (void)ex; + } catch (...) { + } return nullptr; } From e8e0de683f48862c16368940bc0dd8b57201a891 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Wed, 15 Apr 2026 13:31:31 -0700 Subject: [PATCH 12/12] add try/catch as C API boundary guard --- src/tensorrt_provider_factory.cc | 80 ++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 9 deletions(-) diff --git a/src/tensorrt_provider_factory.cc b/src/tensorrt_provider_factory.cc index db57a94..41319c1 100644 --- a/src/tensorrt_provider_factory.cc +++ b/src/tensorrt_provider_factory.cc @@ -11,6 +11,29 @@ #include #include +// --------------------------------------------------------------------------- +// C API boundary guard. +// +// Every C API entry point (ORT_API_CALL / extern "C") that returns OrtStatus* +// must catch all C++ exceptions before they cross the boundary — propagating +// exceptions through a C ABI is undefined behaviour. +// +// Usage: +// OrtStatus* ORT_API_CALL SomeImpl(...) noexcept { +// API_IMPL_BEGIN +// ... body ... +// return nullptr; +// API_IMPL_END(factory->ort_api) +// } +// --------------------------------------------------------------------------- +#define API_IMPL_BEGIN try { +#define API_IMPL_END(ort_api_ref) \ + } catch (const std::exception& ex) { \ + return (ort_api_ref).CreateStatus(ORT_EP_FAIL, ex.what()); \ + } catch (...) { \ + return (ort_api_ref).CreateStatus(ORT_EP_FAIL, "Unknown exception in TRT EP"); \ + } + // --------------------------------------------------------------------------- // TensorRT builder placeholder for test scenarios. // @@ -124,8 +147,9 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp OrtEpDevice** ep_devices, size_t max_ep_devices, size_t* p_num_ep_devices) noexcept { - size_t& num_ep_devices = *p_num_ep_devices; auto* factory = static_cast(this_ptr); + API_IMPL_BEGIN + size_t& num_ep_devices = *p_num_ep_devices; // Clear stale ordinal mappings from any prior enumeration. { @@ -281,6 +305,7 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp } return nullptr; + API_IMPL_END(factory->ort_api) } OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateEpImpl( @@ -292,6 +317,7 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateEpImpl( _In_ const OrtLogger* logger, _Out_ OrtEp** ep) noexcept { auto* factory = static_cast(this_ptr); *ep = nullptr; + API_IMPL_BEGIN if (num_devices != 1) { // we only registered for GPU and only expected to be selected for one GPU @@ -314,11 +340,29 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateEpImpl( *ep = trt_ep.release(); return nullptr; + API_IMPL_END(factory->ort_api) } -void ORT_API_CALL TensorrtExecutionProviderFactory::ReleaseEpImpl(OrtEpFactory* /*this_ptr*/, OrtEp* ep) noexcept { - TensorrtExecutionProvider* trt_ep = static_cast(ep); - delete trt_ep; +void ORT_API_CALL TensorrtExecutionProviderFactory::ReleaseEpImpl(OrtEpFactory* this_ptr, OrtEp* ep) noexcept { + try { + TensorrtExecutionProvider* trt_ep = static_cast(ep); + delete trt_ep; + } catch (const std::exception& ex) { + // void return — cannot report via OrtStatus*. Log so teardown failures are diagnosable. + auto* factory = static_cast(this_ptr); + auto* log_status = factory->ort_api.Logger_LogMessage(&factory->default_logger_, + OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR, + (std::string("Exception in ReleaseEpImpl: ") + ex.what()).c_str(), + ORT_FILE, __LINE__, __FUNCTION__); + if (log_status) factory->ort_api.ReleaseStatus(log_status); + } catch (...) { + auto* factory = static_cast(this_ptr); + auto* log_status = factory->ort_api.Logger_LogMessage(&factory->default_logger_, + OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR, + "Unknown exception in ReleaseEpImpl", + ORT_FILE, __LINE__, __FUNCTION__); + if (log_status) factory->ort_api.ReleaseStatus(log_status); + } } OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateAllocatorImpl(OrtEpFactory* this_ptr, @@ -326,6 +370,7 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateAllocatorImpl(Or const OrtKeyValuePairs* /*allocator_options*/, OrtAllocator** allocator) noexcept { auto& factory = *static_cast(this_ptr); + API_IMPL_BEGIN // NOTE: The factory implementation is free to return a shared OrtAllocator* instance instead of creating a new // allocator on each call. To do this have an allocator instance as an OrtEpFactory class member and make @@ -372,6 +417,7 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateAllocatorImpl(Or } return nullptr; + API_IMPL_END(factory.ort_api) } void ORT_API_CALL TensorrtExecutionProviderFactory::ReleaseAllocatorImpl(OrtEpFactory* /*this*/, @@ -384,11 +430,13 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateDataTransferImpl OrtEpFactory* this_ptr, OrtDataTransferImpl** data_transfer) noexcept { auto& factory = *static_cast(this_ptr); + API_IMPL_BEGIN auto data_transfer_impl = std::make_unique(static_cast(factory)); *data_transfer = data_transfer_impl.release(); return nullptr; + API_IMPL_END(factory.ort_api) } bool ORT_API_CALL TensorrtExecutionProviderFactory::IsStreamAwareImpl(const OrtEpFactory* /*this_ptr*/) noexcept { @@ -501,18 +549,32 @@ EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const } EXPORT_SYMBOL OrtStatus* ReleaseEpFactory(OrtEpFactory* factory) { + const OrtApi* ort_api = nullptr; + try { - delete static_cast(factory); + // Grab the OrtApi reference before destroying the factory, so we can + // use it to create an error status if the catch block is reached. + auto* trt_factory = static_cast(factory); + ort_api = &trt_factory->ort_api; + + delete trt_factory; // Release the placeholder builder when the last factory is torn down. DestroyBuilderPlaceholder(); + + return nullptr; } catch (const std::exception& ex) { - // Best-effort: ReleaseEpFactory shouldn't normally throw, but guard the C boundary. - (void)ex; + if (ort_api != nullptr) { + return ort_api->CreateStatus(ORT_EP_FAIL, ex.what()); + } + // ort_api not yet captured — nothing we can do except not crash. + return nullptr; } catch (...) { + if (ort_api != nullptr) { + return ort_api->CreateStatus(ORT_EP_FAIL, "Unknown exception in ReleaseEpFactory"); + } + return nullptr; } - - return nullptr; } } // extern "C"