From 1b5bc577d368506d16c1588561ebec591c470eb9 Mon Sep 17 00:00:00 2001 From: Felix Date: Fri, 18 Jul 2025 17:06:31 -0700 Subject: [PATCH 1/8] Mulithread safe buffer implemeted --- .../include/yolox_cpp/singeframe_buffer.hpp | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singeframe_buffer.hpp diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singeframe_buffer.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singeframe_buffer.hpp new file mode 100644 index 0000000..299b06d --- /dev/null +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singeframe_buffer.hpp @@ -0,0 +1,30 @@ +#pragma once +#include "yolox_ros_cpp/yolox_ros_cpp.hpp" +#include +#include + +class SingleFrameBuffer { + +private: + cv::Mat frame; + std::mutex mtx; + std::condition_variable not_empty; + +public: + void push(const cv::Mat& frame) { + std::unique_lock lock(mtx); + this->frame = frame; + not_empty.notify_one(); + } + cv::Mat pop() { + cv::Mat output_frame; + { + std::unique_lock lock(mtx); + not_empty.wait(lock, [this] {return !frame.empty()}); + + output_frame = std::move(this->frame); + this->frame.release(); + } + return output_frame; + } +}; \ No newline at end of file From f9a8e156a47853cf1b5b6e5f3eef790c72ec9cf6 Mon Sep 17 00:00:00 2001 From: Felix Date: Fri, 18 Jul 2025 17:34:25 -0700 Subject: [PATCH 2/8] Naming issue lol --- .../include/yolox_cpp/singleframe_buffer.hpp | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp new file mode 100644 index 0000000..299b06d --- /dev/null +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp @@ -0,0 +1,30 @@ +#pragma once +#include "yolox_ros_cpp/yolox_ros_cpp.hpp" +#include +#include + +class SingleFrameBuffer { + +private: + cv::Mat frame; + std::mutex mtx; + std::condition_variable not_empty; + +public: + void push(const cv::Mat& frame) { + std::unique_lock lock(mtx); + this->frame = frame; + not_empty.notify_one(); + } + cv::Mat pop() { + cv::Mat output_frame; + { + std::unique_lock lock(mtx); + not_empty.wait(lock, [this] {return !frame.empty()}); + + output_frame = std::move(this->frame); + this->frame.release(); + } + return output_frame; + } +}; \ No newline at end of file From c186b250ee044f8c0dfc1b54c542161be502d467 Mon Sep 17 00:00:00 2001 From: Felix Date: Thu, 31 Jul 2025 13:25:38 -0700 Subject: [PATCH 3/8] preprocessing moved to GPU. --- yolox_ros_cpp/yolox_cpp/CMakeLists.txt | 5 ++ .../include/yolox_cpp/blobFromImage.cu | 27 ++++++++++ .../yolox_cpp/include/yolox_cpp/core.hpp | 53 +++++++++++++++++++ .../include/yolox_cpp/singeframe_buffer.hpp | 30 ----------- .../include/yolox_cpp/singleframe_buffer.hpp | 2 +- yolox_ros_cpp/yolox_cpp/package.xml | 1 + .../yolox_cpp/src/yolox_tensorrt.cpp | 1 - 7 files changed, 87 insertions(+), 32 deletions(-) create mode 100644 yolox_ros_cpp/yolox_cpp/include/yolox_cpp/blobFromImage.cu delete mode 100644 yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singeframe_buffer.hpp diff --git a/yolox_ros_cpp/yolox_cpp/CMakeLists.txt b/yolox_ros_cpp/yolox_cpp/CMakeLists.txt index db849f4..dfff2c4 100644 --- a/yolox_ros_cpp/yolox_cpp/CMakeLists.txt +++ b/yolox_ros_cpp/yolox_cpp/CMakeLists.txt @@ -85,6 +85,11 @@ configure_file( ) ament_auto_add_library(yolox_cpp SHARED ${TARGET_SRC}) + +find_package(VPI REQUIRED) +target_include_directories(yolox_cpp PRIVATE ${VPI_INCLUDE_DIRS}) +target_link_libraries(yolox_cpp PRIVATE ${VPI_LIBRARIES}) + ament_target_dependencies(yolox_cpp ${TARGET_DPENDENCIES}) ament_export_dependencies(${TARGET_DPENDENCIES}) target_link_libraries(yolox_cpp ${TARGET_LIBS}) diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/blobFromImage.cu b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/blobFromImage.cu new file mode 100644 index 0000000..6771e16 --- /dev/null +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/blobFromImage.cu @@ -0,0 +1,27 @@ +_global__ void blobFromImage(uchar3* image_data, float* output, int width, int height) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height) return; + + int uchar3_idx = y * width + x; + + int b_idx = 0 * height * width + y * width + x; + int g_idx = 1 * height * width + y * width + x; + int r_idx = 2 * height * width + y * width + x; + + output[b_idx] = image_data[uchar3_idx].x; + output[g_idx] = image_data[uchar3_idx].y; + output[r_idx] = image_data[uchar3_idx].z; + + return; +} + +void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height) { + dim3 block(16, 16); + dim3 grid((width + block.x - 1) / block.x, + (height + block.y - 1) / block.y); + + blobFromImage<<>>(d_input, d_output, width, height); + cudaDeviceSynchronize(); +} \ No newline at end of file diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp index 63ebf62..4e3447e 100644 --- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp @@ -3,6 +3,18 @@ #include +#include +#include +#include +#include +#include + +#include +#include + +void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height, cudaStream_t stream); + +#endif // _YOLOX_CPP_CORE_HPP namespace yolox_cpp { /** @@ -73,6 +85,32 @@ namespace yolox_cpp return out; } + // Assumes that a stream has already been initialized + VPIImage static_resize_gpu(const cv::Mat &img, VPIStream stream) { + VPIImage vpi_image = nullptr; + vpiImageCreateOpenCVMatWrapper(img, 0, &vpi_image); + + VPIImageFormat type; + vpiImageGetFormat(vpi_image, &type); + + const float r = std::min( + static_cast(input_w_) / static_cast(img.cols), + static_cast(input_h_) / static_cast(img.rows)); + const int unpad_w = r * img.cols; + const int unpad_h = r * img.rows; + + // Create rescaled image, make blank with right dimensions first + // and then map the original image onto it + VPIImage rescaled = nullptr; + vpiImageCreate(unpad_h, unpad_w, type, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &rescaled); + vpiSubmitRescale(stream, VPI_BACKEND_CUDA, vpi_image, rescaled, VPI_INTERP_LINEAR, VPI_BORDER_ZERO, nullptr); + + // Destroy unneeded images + vpiImageDestroy(vpi_image); + + return vpi_output; + } + // for NCHW void blobFromImage(const cv::Mat &img, float *blob_data) { @@ -112,6 +150,21 @@ namespace yolox_cpp memcpy(blob_data, img_f32.data, img.rows * img.cols * channels * sizeof(float)); } + void blobFromVPIImage(VPIImage input, float* blob_output, cudaStream_t stream) { + VIPImageData data; + vpiImageLock(input, VPI_LOCK_READ, &data); + + auto& plane = data.buffer.pitch.planes[0]; + uchar3* ptr = reinterpret_cast(plane.data); + int width = plane.width; + int height = plane.height; + + // Cuda Kernel + launchBlobFromImage<<>>(ptr, blob_output, width, height); + + vpiImageUnlock(input); + } + void generate_grids_and_stride(const int target_w, const int target_h, const std::vector &strides, std::vector &grid_strides) { grid_strides.clear(); diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singeframe_buffer.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singeframe_buffer.hpp deleted file mode 100644 index 299b06d..0000000 --- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singeframe_buffer.hpp +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once -#include "yolox_ros_cpp/yolox_ros_cpp.hpp" -#include -#include - -class SingleFrameBuffer { - -private: - cv::Mat frame; - std::mutex mtx; - std::condition_variable not_empty; - -public: - void push(const cv::Mat& frame) { - std::unique_lock lock(mtx); - this->frame = frame; - not_empty.notify_one(); - } - cv::Mat pop() { - cv::Mat output_frame; - { - std::unique_lock lock(mtx); - not_empty.wait(lock, [this] {return !frame.empty()}); - - output_frame = std::move(this->frame); - this->frame.release(); - } - return output_frame; - } -}; \ No newline at end of file diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp index 299b06d..04c0c63 100644 --- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp @@ -20,7 +20,7 @@ class SingleFrameBuffer { cv::Mat output_frame; { std::unique_lock lock(mtx); - not_empty.wait(lock, [this] {return !frame.empty()}); + not_empty.wait(lock, [this] { return !this->frame.empty(); }); output_frame = std::move(this->frame); this->frame.release(); diff --git a/yolox_ros_cpp/yolox_cpp/package.xml b/yolox_ros_cpp/yolox_cpp/package.xml index f18153e..da52a11 100644 --- a/yolox_ros_cpp/yolox_cpp/package.xml +++ b/yolox_ros_cpp/yolox_cpp/package.xml @@ -11,6 +11,7 @@ ament_cmake_auto OpenCV + vpi ament_lint_auto ament_lint_common diff --git a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp index 93992bf..67e455b 100644 --- a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp +++ b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp @@ -105,7 +105,6 @@ namespace yolox_cpp // inference this->doInference(input_blob_.data(), output_blob_.data()); - // postprocess const float scale = std::min( static_cast(this->input_w_) / static_cast(frame.cols), From 6546b45179f39b3ee63b70d8186d9acf41a02de9 Mon Sep 17 00:00:00 2001 From: Felix Date: Fri, 1 Aug 2025 11:45:55 -0700 Subject: [PATCH 4/8] inference logic updated --- .../yolox_cpp/include/yolox_cpp/core.hpp | 2 +- .../yolox_cpp/src/yolox_tensorrt.cpp | 24 ++++++++++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp index 4e3447e..8205f47 100644 --- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp @@ -108,7 +108,7 @@ namespace yolox_cpp // Destroy unneeded images vpiImageDestroy(vpi_image); - return vpi_output; + return rescaled; } // for NCHW diff --git a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp index 67e455b..63411de 100644 --- a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp +++ b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp @@ -96,14 +96,28 @@ namespace yolox_cpp { // preprocess auto now = std::chrono::system_clock::now(); - auto pr_img = static_resize(frame); + auto pr_img = static_resize_gpu(frame); auto end = std::chrono::system_clock::now(); auto elapsed_inf = std::chrono::duration_cast(end - now); printf("resize time: %5ld us\n", elapsed_inf.count()); - blobFromImage(pr_img, input_blob_.data()); + + VPIImageData data; + vpiImageLock(pr_img, VPI_LOCK_READ, &data); + + int width = data.buffer.pitch.planes[0].width; + int height = data.buffer.pitch.planes[1].height; + + float* output; + + int num_elements = 3 * width * height; + cudaMalloc(&output, sizeof(float) * num_elements); + + cudaStream_t stream; + + blobFromVPIImage(pr_img, output, input_blob_.data(), stream); // inference - this->doInference(input_blob_.data(), output_blob_.data()); + this->doInference(input_blob_.data(), output_blob_.data(), stream); // postprocess const float scale = std::min( @@ -119,10 +133,8 @@ namespace yolox_cpp return objects; } - void YoloXTensorRT::doInference(const float *input, float *output) + void YoloXTensorRT::doInference(const float *input, float *output, cudaStream_t stream) { - // Create stream - cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host From ed8bc33cee86309d5fd5bd995a925c724ecb29a8 Mon Sep 17 00:00:00 2001 From: Felix Date: Fri, 1 Aug 2025 15:46:03 -0700 Subject: [PATCH 5/8] Grid and Strides Work --- yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp | 6 ++++++ .../include/yolox_cpp/{blobFromImage.cu => cuda_utils.cu} | 7 ++++++- yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp | 3 +++ yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp | 2 ++ 4 files changed, 17 insertions(+), 1 deletion(-) rename yolox_ros_cpp/yolox_cpp/include/yolox_cpp/{blobFromImage.cu => cuda_utils.cu} (84%) diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp index 8205f47..4c8ab1f 100644 --- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp @@ -34,6 +34,8 @@ namespace yolox_cpp int grid0; int grid1; int stride; + + __host__ __device__ GridAndStride(const int grid0_, const int grid1_, const int stride_) : grid0(grid0_), grid1(grid1_), stride(stride_) { @@ -71,6 +73,10 @@ namespace yolox_cpp const std::vector strides_p6_ = {8, 16, 32, 64}; std::vector grid_strides_; + const int strides_gpu_[3] = {8, 16, 32}; + const int strides_p6_gpu_[4] = {8, 16, 32, 64}; + GridAndStride* grid_strides_gpu_; + cv::Mat static_resize(const cv::Mat &img) { const float r = std::min( diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/blobFromImage.cu b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu similarity index 84% rename from yolox_ros_cpp/yolox_cpp/include/yolox_cpp/blobFromImage.cu rename to yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu index 6771e16..fd3baf6 100644 --- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/blobFromImage.cu +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu @@ -1,4 +1,5 @@ -_global__ void blobFromImage(uchar3* image_data, float* output, int width, int height) { + +__global__ void blobFromImage(uchar3* image_data, float* output, int width, int height) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -24,4 +25,8 @@ void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height blobFromImage<<>>(d_input, d_output, width, height); cudaDeviceSynchronize(); +} + +__global__ void generate_grids_and_strides() { + } \ No newline at end of file diff --git a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp index 63411de..f76db6c 100644 --- a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp +++ b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp @@ -115,6 +115,9 @@ namespace yolox_cpp cudaStream_t stream; blobFromVPIImage(pr_img, output, input_blob_.data(), stream); + + // No more use for the VPI Image + vpiImageDestroy(pr_img); // inference this->doInference(input_blob_.data(), output_blob_.data(), stream); diff --git a/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp b/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp index ebcb5d1..46d8c6a 100644 --- a/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp +++ b/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp @@ -92,6 +92,8 @@ namespace yolox_ros_cpp RCLCPP_INFO(this->get_logger(), "model loaded"); rclcpp::QoS qos_profile(1); // Queue depth of 1 + qos_profile.best_effort(); // Default is "reliable" which tries to retransmit missed frames, best_effort() avoids this + // Good for realtime this->sub_image_ = image_transport::create_subscription( this, this->params_.src_image_topic_name, std::bind(&YoloXNode::colorImageCallback, this, std::placeholders::_1), From de5708e4de18e3f8e89747b755619c8b2ef112b5 Mon Sep 17 00:00:00 2001 From: Felix Date: Mon, 4 Aug 2025 10:04:12 -0700 Subject: [PATCH 6/8] Working Build --- yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp | 13 ++++++------- .../yolox_cpp/include/yolox_cpp/cuda_utils.cu | 6 +++--- .../yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp | 2 +- yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp | 14 ++++++++++---- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp index 4c8ab1f..ee2957d 100644 --- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp @@ -14,7 +14,6 @@ void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height, cudaStream_t stream); -#endif // _YOLOX_CPP_CORE_HPP namespace yolox_cpp { /** @@ -94,7 +93,7 @@ namespace yolox_cpp // Assumes that a stream has already been initialized VPIImage static_resize_gpu(const cv::Mat &img, VPIStream stream) { VPIImage vpi_image = nullptr; - vpiImageCreateOpenCVMatWrapper(img, 0, &vpi_image); + vpiImageCreateWrapperOpenCVMat(img, 0, &vpi_image); VPIImageFormat type; vpiImageGetFormat(vpi_image, &type); @@ -109,7 +108,7 @@ namespace yolox_cpp // and then map the original image onto it VPIImage rescaled = nullptr; vpiImageCreate(unpad_h, unpad_w, type, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &rescaled); - vpiSubmitRescale(stream, VPI_BACKEND_CUDA, vpi_image, rescaled, VPI_INTERP_LINEAR, VPI_BORDER_ZERO, nullptr); + vpiSubmitRescale(stream, VPI_BACKEND_CUDA, vpi_image, rescaled, VPI_INTERP_LINEAR, VPI_BORDER_ZERO, 0); // Destroy unneeded images vpiImageDestroy(vpi_image); @@ -157,8 +156,8 @@ namespace yolox_cpp } void blobFromVPIImage(VPIImage input, float* blob_output, cudaStream_t stream) { - VIPImageData data; - vpiImageLock(input, VPI_LOCK_READ, &data); + VPIImageData data; + vpiImageLockData(input, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &data); auto& plane = data.buffer.pitch.planes[0]; uchar3* ptr = reinterpret_cast(plane.data); @@ -166,7 +165,7 @@ namespace yolox_cpp int height = plane.height; // Cuda Kernel - launchBlobFromImage<<>>(ptr, blob_output, width, height); + launchBlobFromImage(ptr, blob_output, width, height, stream); vpiImageUnlock(input); } @@ -326,4 +325,4 @@ namespace yolox_cpp } }; } -#endif +#endif // _YOLOX_CPP_CORE_HPP \ No newline at end of file diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu index fd3baf6..918d245 100644 --- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu @@ -18,13 +18,13 @@ __global__ void blobFromImage(uchar3* image_data, float* output, int width, int return; } -void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height) { +void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height, cudaStream_t stream) { dim3 block(16, 16); dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y); - blobFromImage<<>>(d_input, d_output, width, height); - cudaDeviceSynchronize(); + blobFromImage<<>>(d_input, d_output, width, height); + cudaDeviceSynchronize(stream); } __global__ void generate_grids_and_strides() { diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp index 15ee452..622fd2f 100644 --- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp @@ -40,7 +40,7 @@ namespace yolox_cpp{ std::vector inference(const cv::Mat& frame) override; private: - void doInference(const float* input, float* output); + void doInference(const float* input, float* output, cudaStream_t stream); int DEVICE_ = 0; Logger gLogger_; diff --git a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp index f76db6c..6488d16 100644 --- a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp +++ b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp @@ -94,15 +94,19 @@ namespace yolox_cpp std::vector YoloXTensorRT::inference(const cv::Mat &frame) { + // Create VPI Stream + VPIStream vpi_stream; + vpiStreamCreate(0, &vpi_stream); + // preprocess auto now = std::chrono::system_clock::now(); - auto pr_img = static_resize_gpu(frame); + auto pr_img = static_resize_gpu(frame, vpi_stream); auto end = std::chrono::system_clock::now(); auto elapsed_inf = std::chrono::duration_cast(end - now); printf("resize time: %5ld us\n", elapsed_inf.count()); VPIImageData data; - vpiImageLock(pr_img, VPI_LOCK_READ, &data); + vpiImageLockData(pr_img, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &data); int width = data.buffer.pitch.planes[0].width; int height = data.buffer.pitch.planes[1].height; @@ -113,14 +117,16 @@ namespace yolox_cpp cudaMalloc(&output, sizeof(float) * num_elements); cudaStream_t stream; + cudaStreamCreate(&stream); - blobFromVPIImage(pr_img, output, input_blob_.data(), stream); + blobFromVPIImage(pr_img, output, stream); // No more use for the VPI Image vpiImageDestroy(pr_img); + vpiStreamDestroy(vpi_stream); // inference - this->doInference(input_blob_.data(), output_blob_.data(), stream); + this->doInference(output, output_blob_.data(), stream); // postprocess const float scale = std::min( From 5caacf8b7833e2dad0536f32d0497fb2509f0bea Mon Sep 17 00:00:00 2001 From: Felix Date: Wed, 17 Sep 2025 15:44:52 -0700 Subject: [PATCH 7/8] Save --- yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp index ee2957d..b2c921e 100644 --- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp @@ -91,7 +91,7 @@ namespace yolox_cpp } // Assumes that a stream has already been initialized - VPIImage static_resize_gpu(const cv::Mat &img, VPIStream stream) { + VPIImage static_resize_vpi(const cv::Mat &img, VPIStream stream) { VPIImage vpi_image = nullptr; vpiImageCreateWrapperOpenCVMat(img, 0, &vpi_image); @@ -160,7 +160,8 @@ namespace yolox_cpp vpiImageLockData(input, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &data); auto& plane = data.buffer.pitch.planes[0]; - uchar3* ptr = reinterpret_cast(plane.data); + uchar3* ptr = reint +erpret_cast(plane.data); int width = plane.width; int height = plane.height; @@ -176,7 +177,8 @@ namespace yolox_cpp for (auto stride : strides) { const int num_grid_w = target_w / stride; - const int num_grid_h = target_h / stride; + const int num_g +rid_h = target_h / stride; for (int g1 = 0; g1 < num_grid_h; ++g1) { for (int g0 = 0; g0 < num_grid_w; ++g0) @@ -189,7 +191,8 @@ namespace yolox_cpp void generate_yolox_proposals(const std::vector &grid_strides, const float *feat_ptr, const float prob_threshold, std::vector &objects) { - const int num_anchors = grid_strides.size(); + const int num_ancho +rs = grid_strides.size(); objects.clear(); for (int anchor_idx = 0; anchor_idx < num_anchors; ++anchor_idx) From 007894444c10e4a74a4d6ab6aa332515f10c9f8e Mon Sep 17 00:00:00 2001 From: LuminousLlama Date: Thu, 25 Sep 2025 15:25:43 -0700 Subject: [PATCH 8/8] Stable BUILDING version --- yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp | 10 ++++------ yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp index b2c921e..42d6ecb 100644 --- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp +++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp @@ -136,6 +136,7 @@ namespace yolox_cpp } // for NHWC + void blobFromImage_nhwc(const cv::Mat &img, float *blob_data) { const size_t channels = 3; @@ -160,8 +161,7 @@ namespace yolox_cpp vpiImageLockData(input, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &data); auto& plane = data.buffer.pitch.planes[0]; - uchar3* ptr = reint -erpret_cast(plane.data); + uchar3* ptr = reinterpret_cast(plane.data); int width = plane.width; int height = plane.height; @@ -177,8 +177,7 @@ erpret_cast(plane.data); for (auto stride : strides) { const int num_grid_w = target_w / stride; - const int num_g -rid_h = target_h / stride; + const int num_grid_h = target_h / stride; for (int g1 = 0; g1 < num_grid_h; ++g1) { for (int g0 = 0; g0 < num_grid_w; ++g0) @@ -191,8 +190,7 @@ rid_h = target_h / stride; void generate_yolox_proposals(const std::vector &grid_strides, const float *feat_ptr, const float prob_threshold, std::vector &objects) { - const int num_ancho -rs = grid_strides.size(); + const int num_anchors = grid_strides.size(); objects.clear(); for (int anchor_idx = 0; anchor_idx < num_anchors; ++anchor_idx) diff --git a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp index 6488d16..2ddab8d 100644 --- a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp +++ b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp @@ -100,7 +100,7 @@ namespace yolox_cpp // preprocess auto now = std::chrono::system_clock::now(); - auto pr_img = static_resize_gpu(frame, vpi_stream); + auto pr_img = static_resize_vpi(frame, vpi_stream); auto end = std::chrono::system_clock::now(); auto elapsed_inf = std::chrono::duration_cast(end - now); printf("resize time: %5ld us\n", elapsed_inf.count());