diff --git a/yolox_ros_cpp/yolox_cpp/CMakeLists.txt b/yolox_ros_cpp/yolox_cpp/CMakeLists.txt
index db849f4..dfff2c4 100644
--- a/yolox_ros_cpp/yolox_cpp/CMakeLists.txt
+++ b/yolox_ros_cpp/yolox_cpp/CMakeLists.txt
@@ -85,6 +85,11 @@ configure_file(
 )
 ament_auto_add_library(yolox_cpp SHARED ${TARGET_SRC})
+
+# NVIDIA VPI ships an imported CMake target named "vpi"; it does not define
+# VPI_INCLUDE_DIRS / VPI_LIBRARIES.  Use the plain target_link_libraries
+# signature: the existing target_link_libraries(yolox_cpp ${TARGET_LIBS})
+# below is plain, and CMake forbids mixing plain and keyword (PRIVATE/...)
+# signatures on the same target.
+find_package(vpi REQUIRED)
+target_link_libraries(yolox_cpp vpi)
+
 ament_target_dependencies(yolox_cpp ${TARGET_DPENDENCIES})
 ament_export_dependencies(${TARGET_DPENDENCIES})
 target_link_libraries(yolox_cpp ${TARGET_LIBS})
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp
index 63ebf62..42d6ecb 100644
--- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp
+++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp
@@ -3,6 +3,17 @@
 
 #include <opencv2/opencv.hpp>
 
+#include <vpi/Image.h>
+#include <vpi/Stream.h>
+#include <vpi/Status.h>
+#include <vpi/OpenCVInterop.hpp>
+#include <vpi/algo/Rescale.h>
+
+#include <cuda_runtime.h>
+#include <vector_types.h>  // uchar3
+
+// Implemented in cuda_utils.cu: HWC uchar3 (BGR) -> packed CHW float blob,
+// launched asynchronously on `stream` (the launcher syncs before returning).
+void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height, cudaStream_t stream);
+
 namespace yolox_cpp
 {
     /**
@@ -22,6 +33,8 @@ namespace yolox_cpp
         int grid0;
         int grid1;
         int stride;
+
+        // NOTE(review): a user-declared ctor makes GridAndStride non-aggregate;
+        // brace-init {g0, g1, s} elsewhere still compiles through this ctor.
+        __host__ __device__
         GridAndStride(const int grid0_, const int grid1_, const int stride_)
             : grid0(grid0_), grid1(grid1_), stride(stride_)
         {
@@ -59,6 +72,10 @@ namespace yolox_cpp
         const std::vector<int> strides_p6_ = {8, 16, 32, 64};
         std::vector<GridAndStride> grid_strides_;
 
+        const int strides_gpu_[3] = {8, 16, 32};
+        const int strides_p6_gpu_[4] = {8, 16, 32, 64};
+        // Device-side copy of the grid/stride table; nullptr until allocated.
+        GridAndStride* grid_strides_gpu_ = nullptr;
+
         cv::Mat static_resize(const cv::Mat &img)
         {
             const float r = std::min(
@@ -73,6 +90,38 @@ namespace yolox_cpp
             return out;
         }
 
+        // GPU aspect-ratio-preserving rescale via VPI.
+        // Assumes `stream` is a valid, already-created VPI stream.
+        // The caller owns the returned image and must vpiImageDestroy it.
+        // NOTE(review): unlike static_resize(), this does NOT letterbox-pad
+        // the result up to input_w_ x input_h_; the network expects the
+        // padded size — TODO confirm/add padding before relying on this path.
+        VPIImage static_resize_vpi(const cv::Mat &img, VPIStream stream)
+        {
+            VPIImage vpi_image = nullptr;
+            vpiImageCreateWrapperOpenCVMat(img, 0, &vpi_image);
+
+            VPIImageFormat type;
+            vpiImageGetFormat(vpi_image, &type);
+
+            const float r = std::min(
+                static_cast<float>(input_w_) / static_cast<float>(img.cols),
+                static_cast<float>(input_h_) / static_cast<float>(img.rows));
+            const int unpad_w = r * img.cols;
+            const int unpad_h = r * img.rows;
+
+            // vpiImageCreate takes (width, height, ...) — the original patch
+            // passed them swapped.
+            VPIImage rescaled = nullptr;
+            vpiImageCreate(unpad_w, unpad_h, type, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &rescaled);
+            vpiSubmitRescale(stream, VPI_BACKEND_CUDA, vpi_image, rescaled, VPI_INTERP_LINEAR, VPI_BORDER_ZERO, 0);
+
+            // The rescale runs asynchronously: finish it before destroying
+            // the input wrapper and before the caller locks `rescaled`.
+            vpiStreamSync(stream);
+
+            vpiImageDestroy(vpi_image);
+
+            return rescaled;
+        }
+
         // for NCHW
         void blobFromImage(const cv::Mat &img, float *blob_data)
         {
@@ -112,6 +156,29 @@ namespace yolox_cpp
             memcpy(blob_data, img_f32.data, img.rows * img.cols * channels * sizeof(float));
         }
 
+        // Copy a CUDA pitch-linear BGR8 VPI image into a packed NCHW float
+        // blob on the device.  launchBlobFromImage syncs `stream` before
+        // returning, so the lock can be released immediately afterwards.
+        void blobFromVPIImage(VPIImage input, float* blob_output, cudaStream_t stream)
+        {
+            VPIImageData data;
+            vpiImageLockData(input, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &data);
+
+            auto& plane = data.buffer.pitch.planes[0];
+            uchar3* ptr = reinterpret_cast<uchar3*>(plane.data);
+            const int width = plane.width;
+            const int height = plane.height;
+            // NOTE(review): the kernel indexes rows as y*width, i.e. it
+            // assumes plane.pitchBytes == width * sizeof(uchar3).  CUDA
+            // pitch-linear rows are usually padded — pass plane.pitchBytes
+            // down once launchBlobFromImage learns about pitch.  TODO confirm.
+
+            // CUDA kernel (synchronizes `stream` internally).
+            launchBlobFromImage(ptr, blob_output, width, height, stream);
+
+            vpiImageUnlock(input);
+        }
+
         void generate_grids_and_stride(const int target_w, const int target_h, const std::vector<int> &strides, std::vector<GridAndStride> &grid_strides)
         {
             grid_strides.clear();
@@ -267,4 +334,4 @@ namespace yolox_cpp
         }
     };
 }
-#endif
+#endif  // _YOLOX_CPP_CORE_HPP
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu
new file mode 100644
index 0000000..918d245
--- /dev/null
+++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu
@@ -0,0 +1,46 @@
+// NOTE(review): a .cu with kernel definitions must be compiled by nvcc and
+// listed in the library sources; under include/ it will not be built by the
+// current CMakeLists — verify TARGET_SRC picks it up (or move it to src/).
+
+// Convert an HWC uchar3 (BGR) image into a packed CHW float blob.
+// Launch: 2D grid covering width x height, one thread per pixel; no shared
+// memory.  No normalisation is applied (matches the CPU blobFromImage).
+__global__ void blobFromImageKernel(uchar3* image_data, float* output,
+                                    int width, int height)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x >= width || y >= height) return;
+
+    const int src_idx = y * width + x;
+    const int plane = height * width;
+    const uchar3 px = image_data[src_idx];
+
+    output[0 * plane + src_idx] = px.x;  // B
+    output[1 * plane + src_idx] = px.y;  // G
+    output[2 * plane + src_idx] = px.z;  // R
+}
+
+// Host wrapper.  d_input / d_output must be device pointers; d_output holds
+// 3 * width * height floats.  Blocks until the conversion has finished.
+void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height, cudaStream_t stream)
+{
+    const dim3 block(16, 16);
+    const dim3 grid((width + block.x - 1) / block.x,
+                    (height + block.y - 1) / block.y);
+
+    blobFromImageKernel<<<grid, block, 0, stream>>>(d_input, d_output, width, height);
+    // The original called cudaDeviceSynchronize(stream), which does not
+    // compile (it takes no arguments); a stream-scoped sync is also cheaper
+    // than a device-wide one.
+    cudaStreamSynchronize(stream);
+}
+
+// TODO(review): unimplemented placeholder — either implement the GPU
+// grid/stride generation or drop it before merging.
+__global__ void generate_grids_and_strides()
+{
+}
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp
new file mode 100644
index 0000000..04c0c63
--- /dev/null
+++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp
@@ -0,0 +1,41 @@
+#pragma once
+// The original included yolox_ros_cpp/yolox_ros_cpp.hpp, inverting the
+// package dependency (yolox_cpp must not depend on yolox_ros_cpp).  Only
+// OpenCV and the C++ threading headers are actually needed here.
+#include <opencv2/core.hpp>
+#include <condition_variable>
+#include <mutex>
+#include <utility>
+
+// Single-slot "latest frame wins" buffer: push() overwrites any unconsumed
+// frame; pop() blocks until a non-empty frame is available and takes it.
+class SingleFrameBuffer
+{
+public:
+    void push(const cv::Mat& frame)
+    {
+        {
+            std::lock_guard<std::mutex> lock(mtx_);
+            frame_ = frame;  // cv::Mat is refcounted: cheap header copy
+        }
+        // Notify after releasing the lock so the woken consumer does not
+        // immediately block on mtx_.
+        not_empty_.notify_one();
+    }
+
+    cv::Mat pop()
+    {
+        std::unique_lock<std::mutex> lock(mtx_);
+        not_empty_.wait(lock, [this] { return !frame_.empty(); });
+
+        cv::Mat out = std::move(frame_);
+        frame_.release();  // keep the wait predicate false for the next pop()
+        return out;
+    }
+
+private:
+    cv::Mat frame_;
+    std::mutex mtx_;
+    std::condition_variable not_empty_;
+};
diff --git a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp
index 15ee452..622fd2f 100644
--- a/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp
+++ b/yolox_ros_cpp/yolox_cpp/include/yolox_cpp/yolox_tensorrt.hpp
@@ -40,7 +40,8 @@ namespace yolox_cpp{
             std::vector<Object> inference(const cv::Mat& frame) override;
 
         private:
-            void doInference(const float* input, float* output);
+            // `input` is now a DEVICE pointer; `stream` is owned by the caller.
+            void doInference(const float* input, float* output, cudaStream_t stream);
 
             int DEVICE_ = 0;
             Logger gLogger_;
diff --git a/yolox_ros_cpp/yolox_cpp/package.xml b/yolox_ros_cpp/yolox_cpp/package.xml
index f18153e..da52a11 100644
--- a/yolox_ros_cpp/yolox_cpp/package.xml
+++ b/yolox_ros_cpp/yolox_cpp/package.xml
@@ -11,6 +11,7 @@
   <buildtool_depend>ament_cmake_auto</buildtool_depend>
 
   <depend>OpenCV</depend>
+  <depend>vpi</depend>
 
   <test_depend>ament_lint_auto</test_depend>
   <test_depend>ament_lint_common</test_depend>
diff --git a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp
index 93992bf..2ddab8d 100644
--- a/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp
+++ b/yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp
@@ -94,18 +94,46 @@ namespace yolox_cpp
 
     std::vector<Object> YoloXTensorRT::inference(const cv::Mat &frame)
     {
+        // NOTE(review): creating/destroying the VPI stream, CUDA stream and
+        // blob buffer on every frame is expensive — consider making them
+        // members initialised once.  TODO
+        VPIStream vpi_stream;
+        vpiStreamCreate(0, &vpi_stream);
+
         // preprocess
         auto now = std::chrono::system_clock::now();
-        auto pr_img = static_resize(frame);
+        auto pr_img = static_resize_vpi(frame, vpi_stream);
         auto end = std::chrono::system_clock::now();
         auto elapsed_inf = std::chrono::duration_cast<std::chrono::microseconds>(end - now);
         printf("resize time: %5ld us\n", elapsed_inf.count());
-        blobFromImage(pr_img, input_blob_.data());
+
+        cudaStream_t stream;
+        cudaStreamCreate(&stream);
+
+        // Query the rescaled dimensions.  Both come from plane 0 — the
+        // original read height from planes[1], which does not exist for a
+        // packed BGR8 image.  Unlock before blobFromVPIImage, which takes
+        // its own lock (the original locked twice and never unlocked).
+        VPIImageData data;
+        vpiImageLockData(pr_img, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &data);
+        const int width = data.buffer.pitch.planes[0].width;
+        const int height = data.buffer.pitch.planes[0].height;
+        vpiImageUnlock(pr_img);
+
+        float* d_blob = nullptr;
+        const int num_elements = 3 * width * height;
+        cudaMalloc(&d_blob, sizeof(float) * num_elements);
+
+        blobFromVPIImage(pr_img, d_blob, stream);
+
+        // No more use for the VPI image or stream.
+        vpiImageDestroy(pr_img);
+        vpiStreamDestroy(vpi_stream);
 
         // inference
-        this->doInference(input_blob_.data(), output_blob_.data());
+        this->doInference(d_blob, output_blob_.data(), stream);
+
+        // The original leaked both of these on every frame.
+        cudaFree(d_blob);
+        cudaStreamDestroy(stream);
 
         // postprocess
         const float scale = std::min(
@@ -120,10 +148,10 @@ namespace yolox_cpp
         return objects;
     }
 
-    void YoloXTensorRT::doInference(const float *input, float *output)
+    void YoloXTensorRT::doInference(const float *input, float *output, cudaStream_t stream)
     {
-        // Create stream
-        cudaStream_t stream;
-        CHECK(cudaStreamCreate(&stream));
+        // `stream` is created and destroyed by the caller; the original patch
+        // left cudaStreamCreate here, clobbering (and leaking) the caller's
+        // stream on every call.
+        // NOTE(review): if this function still cudaMemcpyAsync's `input`
+        // host-to-device, update it — `input` is now a device pointer.
 
         // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
diff --git a/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp b/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp
index ebcb5d1..46d8c6a 100644
--- a/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp
+++ b/yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp
@@ -92,6 +92,9 @@ namespace yolox_ros_cpp
         RCLCPP_INFO(this->get_logger(), "model loaded");
 
         rclcpp::QoS qos_profile(1);  // Queue depth of 1
+        // best_effort(): drop missed frames instead of retransmitting them
+        // (the default is "reliable") — preferable for realtime video.
+        qos_profile.best_effort();
         this->sub_image_ = image_transport::create_subscription(
             this, this->params_.src_image_topic_name,
             std::bind(&YoloXNode::colorImageCallback, this, std::placeholders::_1),