Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions yolox_ros_cpp/yolox_cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ configure_file(
)

ament_auto_add_library(yolox_cpp SHARED ${TARGET_SRC})

find_package(VPI REQUIRED)
target_include_directories(yolox_cpp PRIVATE ${VPI_INCLUDE_DIRS})
target_link_libraries(yolox_cpp PRIVATE ${VPI_LIBRARIES})

ament_target_dependencies(yolox_cpp ${TARGET_DPENDENCIES})
ament_export_dependencies(${TARGET_DPENDENCIES})
target_link_libraries(yolox_cpp ${TARGET_LIBS})
Expand Down
61 changes: 60 additions & 1 deletion yolox_ros_cpp/yolox_cpp/include/yolox_cpp/core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@

#include <opencv2/core/types.hpp>

#include <vpi/VPI.h>
#include <vpi/Image.h>
#include <vpi/Stream.h>
#include <vpi/algo/Rescale.h>
#include <vpi/OpenCVInterop.hpp>

#include <cuda_runtime_api.h>
#include <cuda_runtime.h>

void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height, cudaStream_t stream);

namespace yolox_cpp
{
/**
Expand All @@ -22,6 +33,8 @@ namespace yolox_cpp
int grid0;
int grid1;
int stride;

__host__ __device__
GridAndStride(const int grid0_, const int grid1_, const int stride_)
: grid0(grid0_), grid1(grid1_), stride(stride_)
{
Expand Down Expand Up @@ -59,6 +72,10 @@ namespace yolox_cpp
const std::vector<int> strides_p6_ = {8, 16, 32, 64};
std::vector<GridAndStride> grid_strides_;

const int strides_gpu_[3] = {8, 16, 32};
const int strides_p6_gpu_[4] = {8, 16, 32, 64};
GridAndStride* grid_strides_gpu_;

cv::Mat static_resize(const cv::Mat &img)
{
const float r = std::min(
Expand All @@ -73,6 +90,32 @@ namespace yolox_cpp
return out;
}

// Assumes that a stream has already been initialized
VPIImage static_resize_vpi(const cv::Mat &img, VPIStream stream) {
VPIImage vpi_image = nullptr;
vpiImageCreateWrapperOpenCVMat(img, 0, &vpi_image);

VPIImageFormat type;
vpiImageGetFormat(vpi_image, &type);

const float r = std::min(
static_cast<float>(input_w_) / static_cast<float>(img.cols),
static_cast<float>(input_h_) / static_cast<float>(img.rows));
const int unpad_w = r * img.cols;
const int unpad_h = r * img.rows;

// Create rescaled image, make blank with right dimensions first
// and then map the original image onto it
VPIImage rescaled = nullptr;
vpiImageCreate(unpad_h, unpad_w, type, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &rescaled);
vpiSubmitRescale(stream, VPI_BACKEND_CUDA, vpi_image, rescaled, VPI_INTERP_LINEAR, VPI_BORDER_ZERO, 0);

// Destroy unneeded images
vpiImageDestroy(vpi_image);

return rescaled;
}

// for NCHW
void blobFromImage(const cv::Mat &img, float *blob_data)
{
Expand All @@ -93,6 +136,7 @@ namespace yolox_cpp
}

// for NHWC

void blobFromImage_nhwc(const cv::Mat &img, float *blob_data)
{
const size_t channels = 3;
Expand All @@ -112,6 +156,21 @@ namespace yolox_cpp
memcpy(blob_data, img_f32.data, img.rows * img.cols * channels * sizeof(float));
}

void blobFromVPIImage(VPIImage input, float* blob_output, cudaStream_t stream) {
VPIImageData data;
vpiImageLockData(input, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &data);

auto& plane = data.buffer.pitch.planes[0];
uchar3* ptr = reinterpret_cast<uchar3*>(plane.data);
int width = plane.width;
int height = plane.height;

// Cuda Kernel
launchBlobFromImage(ptr, blob_output, width, height, stream);

vpiImageUnlock(input);
}

void generate_grids_and_stride(const int target_w, const int target_h, const std::vector<int> &strides, std::vector<GridAndStride> &grid_strides)
{
grid_strides.clear();
Expand Down Expand Up @@ -267,4 +326,4 @@ namespace yolox_cpp
}
};
}
#endif
#endif // _YOLOX_CPP_CORE_HPP
32 changes: 32 additions & 0 deletions yolox_ros_cpp/yolox_cpp/include/yolox_cpp/cuda_utils.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

__global__ void blobFromImage(uchar3* image_data, float* output, int width, int height) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;

if (x >= width || y >= height) return;

int uchar3_idx = y * width + x;

int b_idx = 0 * height * width + y * width + x;
int g_idx = 1 * height * width + y * width + x;
int r_idx = 2 * height * width + y * width + x;

output[b_idx] = image_data[uchar3_idx].x;
output[g_idx] = image_data[uchar3_idx].y;
output[r_idx] = image_data[uchar3_idx].z;

return;
}

void launchBlobFromImage(uchar3* d_input, float* d_output, int width, int height, cudaStream_t stream) {
dim3 block(16, 16);
dim3 grid((width + block.x - 1) / block.x,
(height + block.y - 1) / block.y);

blobFromImage<<<grid, block, 0, stream>>>(d_input, d_output, width, height);
cudaDeviceSynchronize(stream);
}

__global__ void generate_grids_and_strides() {

}
30 changes: 30 additions & 0 deletions yolox_ros_cpp/yolox_cpp/include/yolox_cpp/singleframe_buffer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#pragma once
#include "yolox_ros_cpp/yolox_ros_cpp.hpp"
#include <condition_variable>
#include <mutex>

class SingleFrameBuffer {

private:
cv::Mat frame;
std::mutex mtx;
std::condition_variable not_empty;

public:
void push(const cv::Mat& frame) {
std::unique_lock<std::mutex> lock(mtx);
this->frame = frame;
not_empty.notify_one();
}
cv::Mat pop() {
cv::Mat output_frame;
{
std::unique_lock<std::mutex> lock(mtx);
not_empty.wait(lock, [this] { return !this->frame.empty(); });

output_frame = std::move(this->frame);
this->frame.release();
}
return output_frame;
}
};
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ namespace yolox_cpp{
std::vector<Object> inference(const cv::Mat& frame) override;

private:
void doInference(const float* input, float* output);
void doInference(const float* input, float* output, cudaStream_t stream);

int DEVICE_ = 0;
Logger gLogger_;
Expand Down
1 change: 1 addition & 0 deletions yolox_ros_cpp/yolox_cpp/package.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
<buildtool_depend>ament_cmake_auto</buildtool_depend>

<depend>OpenCV</depend>
<depend>vpi</depend>

<test_depend>ament_lint_auto</test_depend>
<test_depend>ament_lint_common</test_depend>
Expand Down
34 changes: 27 additions & 7 deletions yolox_ros_cpp/yolox_cpp/src/yolox_tensorrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,18 +94,40 @@ namespace yolox_cpp

std::vector<Object> YoloXTensorRT::inference(const cv::Mat &frame)
{
// Create VPI Stream
VPIStream vpi_stream;
vpiStreamCreate(0, &vpi_stream);

// preprocess
auto now = std::chrono::system_clock::now();
auto pr_img = static_resize(frame);
auto pr_img = static_resize_vpi(frame, vpi_stream);
auto end = std::chrono::system_clock::now();
auto elapsed_inf = std::chrono::duration_cast<std::chrono::microseconds>(end - now);
printf("resize time: %5ld us\n", elapsed_inf.count());
blobFromImage(pr_img, input_blob_.data());

VPIImageData data;
vpiImageLockData(pr_img, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &data);

int width = data.buffer.pitch.planes[0].width;
int height = data.buffer.pitch.planes[1].height;

float* output;

// inference
this->doInference(input_blob_.data(), output_blob_.data());
int num_elements = 3 * width * height;
cudaMalloc(&output, sizeof(float) * num_elements);

cudaStream_t stream;
cudaStreamCreate(&stream);

blobFromVPIImage(pr_img, output, stream);

// No more use for the VPI Image
vpiImageDestroy(pr_img);
vpiStreamDestroy(vpi_stream);

// inference
this->doInference(output, output_blob_.data(), stream);

// postprocess
const float scale = std::min(
static_cast<float>(this->input_w_) / static_cast<float>(frame.cols),
Expand All @@ -120,10 +142,8 @@ namespace yolox_cpp
return objects;
}

void YoloXTensorRT::doInference(const float *input, float *output)
void YoloXTensorRT::doInference(const float *input, float *output, cudaStream_t stream)
{
// Create stream
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
Expand Down
2 changes: 2 additions & 0 deletions yolox_ros_cpp/yolox_ros_cpp/src/yolox_ros_cpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ namespace yolox_ros_cpp
RCLCPP_INFO(this->get_logger(), "model loaded");

rclcpp::QoS qos_profile(1); // Queue depth of 1
qos_profile.best_effort(); // Default is "reliable" which tries to retransmit missed frames, best_effort() avoids this
// Good for realtime
this->sub_image_ = image_transport::create_subscription(
this, this->params_.src_image_topic_name,
std::bind(&YoloXNode::colorImageCallback, this, std::placeholders::_1),
Expand Down