Triton-Robotics · LuminousLlama · Jul 19, 2025 · Jul 19, 2025 · Jul 31, 2025 · Aug 1, 2025
diff --git a/yolox_ros_cpp/yolox_cpp/CMakeLists.txt b/yolox_ros_cpp/yolox_cpp/CMakeLists.txt
@@ -85,6 +85,11 @@ configure_file(
 )
 
 ament_auto_add_library(yolox_cpp SHARED ${TARGET_SRC})
+
+find_package(VPI REQUIRED)
+target_include_directories(yolox_cpp PRIVATE ${VPI_INCLUDE_DIRS})
+target_link_libraries(yolox_cpp PRIVATE ${VPI_LIBRARIES})
+
 ament_target_dependencies(yolox_cpp ${TARGET_DPENDENCIES})
 ament_export_dependencies(${TARGET_DPENDENCIES})
 target_link_libraries(yolox_cpp ${TARGET_LIBS})

diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp
@@ -3,6 +3,17 @@
 
 #include <opencv2/core/types.hpp>
 
+#include <vpi/VPI.h>                         
+#include <vpi/Image.h>                       
+#include <vpi/Stream.h>                     
+#include <vpi/algo/Rescale.h>               
+#include <vpi/OpenCVInterop.hpp>
+
+#include <cuda_runtime_api.h>
+#include <cuda_runtime.h>
+
+void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height, cudaStream_t stream);
+
 namespace yolox_cpp
 {
 /**
@@ -22,6 +33,8 @@ namespace yolox_cpp
         int grid0;
         int grid1;
         int stride;
+
+        __host__ __device__
         GridAndStride(const int grid0_, const int grid1_, const int stride_)
             : grid0(grid0_), grid1(grid1_), stride(stride_)
         {
@@ -59,6 +72,10 @@ namespace yolox_cpp
         const std::vector<int> strides_p6_ = {8, 16, 32, 64};
         std::vector<GridAndStride> grid_strides_;
 
+        const int strides_gpu_[3] = {8, 16, 32};
+        const int strides_p6_gpu_[4] = {8, 16, 32, 64};
+        GridAndStride* grid_strides_gpu_;
+
         cv::Mat static_resize(const cv::Mat &img)
         {
             const float r = std::min(
@@ -73,6 +90,32 @@ namespace yolox_cpp
             return out;
         }
 
+        // Assumes that a stream has already been initialized
+        VPIImage static_resize_vpi(const cv::Mat &img, VPIStream stream) {
+            VPIImage vpi_image = nullptr;
+            vpiImageCreateWrapperOpenCVMat(img, 0, &vpi_image);
+
+            VPIImageFormat type;
+            vpiImageGetFormat(vpi_image, &type);
+
+            const float r = std::min(
+                static_cast<float>(input_w_) / static_cast<float>(img.cols),
+                static_cast<float>(input_h_) / static_cast<float>(img.rows));
+            const int unpad_w = r * img.cols;
+            const int unpad_h = r * img.rows;
+
+            // Create rescaled image, make blank with right dimensions first
+            // and then map the original image onto it 
+            VPIImage rescaled = nullptr;
+            vpiImageCreate(unpad_h, unpad_w, type, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &rescaled);
+            vpiSubmitRescale(stream, VPI_BACKEND_CUDA, vpi_image, rescaled, VPI_INTERP_LINEAR, VPI_BORDER_ZERO, 0);
+
+            // Destroy unneeded images
+            vpiImageDestroy(vpi_image);
+
+            return rescaled;
+        }
+
         // for NCHW
         void blobFromImage(const cv::Mat &img, float *blob_data)
         {
@@ -93,6 +136,7 @@ namespace yolox_cpp
         }
 
         // for NHWC
+
         void blobFromImage_nhwc(const cv::Mat &img, float *blob_data)
         {
             const size_t channels = 3;
@@ -112,6 +156,21 @@ namespace yolox_cpp
             memcpy(blob_data, img_f32.data, img.rows * img.cols * channels * sizeof(float));
         }
 
+        void blobFromVPIImage(VPIImage input, float* blob_output, cudaStream_t stream) {
+            VPIImageData data;
+            vpiImageLockData(input, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &data);
+
+            auto& plane = data.buffer.pitch.planes[0];
+            uchar3* ptr = reinterpret_cast<uchar3*>(plane.data);
+            int width = plane.width;
+            int height = plane.height;
+
+            // Cuda Kernel
+            launchBlobFromImage(ptr, blob_output, width, height, stream);
+
+            vpiImageUnlock(input);
+        }
+
         void generate_grids_and_stride(const int target_w, const int target_h, const std::vector<int> &strides, std::vector<GridAndStride> &grid_strides)
         {
             grid_strides.clear();
@@ -267,4 +326,4 @@ namespace yolox_cpp
         }
     };
 }
-#endif
+#endif // _YOLOX_CPP_CORE_HPP
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu
@@ -0,0 +1,32 @@
+
+__global__ void blobFromImage(uchar3* image_data, float* output, int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x >= width || y >= height) return;
+
+    int uchar3_idx = y * width + x;
+
+    int b_idx = 0 * height * width + y * width + x;
+    int g_idx = 1 * height * width + y * width + x;
+    int r_idx = 2 * height * width + y * width + x;
+
+    output[b_idx] = image_data[uchar3_idx].x;
+    output[g_idx] = image_data[uchar3_idx].y;
+    output[r_idx] = image_data[uchar3_idx].z;
+
+    return;
+}
+
+void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height, cudaStream_t stream) {
+    dim3 block(16, 16);
+    dim3 grid((width + block.x - 1) / block.x, 
+              (height + block.y - 1) / block.y);
+
+    blobFromImage<<<grid, block, 0, stream>>>(d_input, d_output, width, height);
+    cudaDeviceSynchronize(stream);
+}
+
+__global__ void generate_grids_and_strides() {
+
+}
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp
@@ -0,0 +1,30 @@
+#pragma once 
+#include "yolox_ros_cpp/yolox_ros_cpp.hpp"
+#include <condition_variable>
+#include <mutex>
+
+class SingleFrameBuffer {
+
+private:
+    cv::Mat frame;
+    std::mutex mtx;
+    std::condition_variable not_empty;
+
+public:
+    void push(const cv::Mat& frame) {
+        std::unique_lock<std::mutex> lock(mtx);
+        this->frame = frame;
+        not_empty.notify_one();
+    }
+    cv::Mat pop() {
+        cv::Mat output_frame;
+        {
+            std::unique_lock<std::mutex> lock(mtx);
+            not_empty.wait(lock, [this] { return !this->frame.empty(); });
+
+            output_frame = std::move(this->frame);
+            this->frame.release();
+        }
+        return output_frame;
+    }
+};
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp
@@ -40,7 +40,7 @@ namespace yolox_cpp{
             std::vector<Object> inference(const cv::Mat& frame) override;
 
         private:
-            void doInference(const float* input, float* output);
+            void doInference(const float* input, float* output, cudaStream_t stream);
 
             int DEVICE_ = 0;
             Logger gLogger_;

diff --git a/yolox_ros_cpp/yolox_cpp/package.xml b/yolox_ros_cpp/yolox_cpp/package.xml
@@ -11,6 +11,7 @@
   <buildtool_depend>ament_cmake_auto</buildtool_depend>
 
   <depend>OpenCV</depend>
+  <depend>vpi</depend>
 
   <test_depend>ament_lint_auto</test_depend>
   <test_depend>ament_lint_common</test_depend>

diff --git a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp
@@ -94,18 +94,40 @@ namespace yolox_cpp
 
     std::vector<Object> YoloXTensorRT::inference(const cv::Mat &frame)
     {
+        // Create VPI Stream
+        VPIStream vpi_stream;
+        vpiStreamCreate(0, &vpi_stream);
+
         // preprocess
         auto now = std::chrono::system_clock::now();
-        auto pr_img = static_resize(frame);
+        auto pr_img = static_resize_vpi(frame, vpi_stream);
         auto end = std::chrono::system_clock::now();
         auto elapsed_inf = std::chrono::duration_cast<std::chrono::microseconds>(end - now);
         printf("resize time: %5ld us\n", elapsed_inf.count());
-        blobFromImage(pr_img, input_blob_.data());
+
+        VPIImageData data;
+        vpiImageLockData(pr_img, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &data);
+
+        int width = data.buffer.pitch.planes[0].width;
+        int height = data.buffer.pitch.planes[1].height;
+
+        float* output;
 
-        // inference
-        this->doInference(input_blob_.data(), output_blob_.data());
+        int num_elements = 3 * width * height;
+        cudaMalloc(&output, sizeof(float) * num_elements);
 
+        cudaStream_t stream;
+        cudaStreamCreate(&stream);
+
+        blobFromVPIImage(pr_img, output, stream);
+
+        // No more use for the VPI Image
+        vpiImageDestroy(pr_img);
+        vpiStreamDestroy(vpi_stream);
 
+        // inference
+        this->doInference(output, output_blob_.data(), stream);
+
         // postprocess
         const float scale = std::min(
             static_cast<float>(this->input_w_) / static_cast<float>(frame.cols),
@@ -120,10 +142,8 @@ namespace yolox_cpp
         return objects;
     }
 
-    void YoloXTensorRT::doInference(const float *input, float *output)
+    void YoloXTensorRT::doInference(const float *input, float *output, cudaStream_t stream)
     {
-        // Create stream
-        cudaStream_t stream;
         CHECK(cudaStreamCreate(&stream));
 
         // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host

diff --git a/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp b/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp
@@ -92,6 +92,8 @@ namespace yolox_ros_cpp
         RCLCPP_INFO(this->get_logger(), "model loaded");
 
         rclcpp::QoS qos_profile(1);  // Queue depth of 1
+        qos_profile.best_effort(); // Default is "reliable" which tries to retransmit missed frames, best_effort() avoids this
+                                   // Good for realtime
 this->sub_image_ = image_transport::create_subscription(
             this, this->params_.src_image_topic_name,
             std::bind(&YoloXNode::colorImageCallback, this, std::placeholders::_1),