fst commit
This commit is contained in:
310
algorithm/common_det/cuda/common_det_cuda_impl.cpp
Normal file
310
algorithm/common_det/cuda/common_det_cuda_impl.cpp
Normal file
@@ -0,0 +1,310 @@
|
||||
#include "common_det_cuda_impl.h"
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
|
||||
#define SV_MODEL_DIR "/SpireCV/models/"
|
||||
#define SV_ROOT_DIR "/SpireCV/"
|
||||
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
#include "yolov7/cuda_utils.h"
|
||||
#include "yolov7/logging.h"
|
||||
#include "yolov7/utils.h"
|
||||
#include "yolov7/preprocess.h"
|
||||
#include "yolov7/postprocess.h"
|
||||
#include "yolov7/model.h"
|
||||
#define TRTCHECK(status) \
|
||||
do \
|
||||
{ \
|
||||
auto ret = (status); \
|
||||
if (ret != 0) \
|
||||
{ \
|
||||
std::cerr << "Cuda failure: " << ret << std::endl; \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define DEVICE 0 // GPU id
|
||||
#define BATCH_SIZE 1
|
||||
#define MAX_IMAGE_INPUT_SIZE_THRESH 3000 * 3000 // ensure it exceed the maximum size in the input images !
|
||||
#endif
|
||||
|
||||
|
||||
namespace sv {
|
||||
|
||||
using namespace cv;
|
||||
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
using namespace nvinfer1;
|
||||
static Logger g_nvlogger;
|
||||
const static int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
|
||||
const static int kOutputSize1 = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
|
||||
const static int kOutputSize2 = 32 * (640 / 4) * (640 / 4);
|
||||
#endif
|
||||
|
||||
|
||||
CommonObjectDetectorCUDAImpl::CommonObjectDetectorCUDAImpl()
|
||||
{
|
||||
#ifdef WITH_CUDA
|
||||
this->_gpu_buffers[0] = nullptr;
|
||||
this->_gpu_buffers[1] = nullptr;
|
||||
this->_gpu_buffers[2] = nullptr;
|
||||
this->_cpu_output_buffer = nullptr;
|
||||
this->_cpu_output_buffer1 = nullptr;
|
||||
this->_cpu_output_buffer2 = nullptr;
|
||||
this->_context = nullptr;
|
||||
this->_engine = nullptr;
|
||||
this->_runtime = nullptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
CommonObjectDetectorCUDAImpl::~CommonObjectDetectorCUDAImpl()
|
||||
{
|
||||
#ifdef WITH_CUDA
|
||||
// Release stream and buffers
|
||||
cudaStreamDestroy(_stream);
|
||||
if (_gpu_buffers[0])
|
||||
CUDA_CHECK(cudaFree(_gpu_buffers[0]));
|
||||
if (_gpu_buffers[1])
|
||||
CUDA_CHECK(cudaFree(_gpu_buffers[1]));
|
||||
if (_gpu_buffers[2])
|
||||
CUDA_CHECK(cudaFree(_gpu_buffers[2]));
|
||||
if (_cpu_output_buffer)
|
||||
delete[] _cpu_output_buffer;
|
||||
if (_cpu_output_buffer1)
|
||||
delete[] _cpu_output_buffer1;
|
||||
if (_cpu_output_buffer2)
|
||||
delete[] _cpu_output_buffer2;
|
||||
cuda_preprocess_destroy();
|
||||
// Destroy the engine
|
||||
if (_context)
|
||||
_context->destroy();
|
||||
if (_engine)
|
||||
_engine->destroy();
|
||||
if (_runtime)
|
||||
_runtime->destroy();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
void infer(IExecutionContext& context, cudaStream_t& stream, void** gpu_buffers, float* output, int batchsize) {
|
||||
context.enqueue(batchsize, gpu_buffers, stream, nullptr);
|
||||
// context.enqueueV2(gpu_buffers, stream, nullptr);
|
||||
CUDA_CHECK(cudaMemcpyAsync(output, gpu_buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream));
|
||||
cudaStreamSynchronize(stream);
|
||||
}
|
||||
void infer_seg(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* output1, float* output2, int batchSize) {
|
||||
context.enqueue(batchSize, buffers, stream, nullptr);
|
||||
// context.enqueueV2(buffers, stream, nullptr);
|
||||
CUDA_CHECK(cudaMemcpyAsync(output1, buffers[1], batchSize * kOutputSize1 * sizeof(float), cudaMemcpyDeviceToHost, stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync(output2, buffers[2], batchSize * kOutputSize2 * sizeof(float), cudaMemcpyDeviceToHost, stream));
|
||||
cudaStreamSynchronize(stream);
|
||||
}
|
||||
void CommonObjectDetectorCUDAImpl::_prepare_buffers(int input_h, int input_w) {
|
||||
assert(this->_engine->getNbBindings() == 2);
|
||||
// In order to bind the buffers, we need to know the names of the input and output tensors.
|
||||
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
|
||||
const int inputIndex = this->_engine->getBindingIndex(kInputTensorName);
|
||||
const int outputIndex = this->_engine->getBindingIndex(kOutputTensorName);
|
||||
assert(inputIndex == 0);
|
||||
assert(outputIndex == 1);
|
||||
// Create GPU buffers on device
|
||||
CUDA_CHECK(cudaMalloc((void**)&(this->_gpu_buffers[0]), kBatchSize * 3 * input_h * input_w * sizeof(float)));
|
||||
CUDA_CHECK(cudaMalloc((void**)&(this->_gpu_buffers[1]), kBatchSize * kOutputSize * sizeof(float)));
|
||||
|
||||
this->_cpu_output_buffer = new float[kBatchSize * kOutputSize];
|
||||
}
|
||||
void CommonObjectDetectorCUDAImpl::_prepare_buffers_seg(int input_h, int input_w) {
|
||||
assert(this->_engine->getNbBindings() == 3);
|
||||
// In order to bind the buffers, we need to know the names of the input and output tensors.
|
||||
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
|
||||
const int inputIndex = this->_engine->getBindingIndex(kInputTensorName);
|
||||
const int outputIndex1 = this->_engine->getBindingIndex(kOutputTensorName);
|
||||
const int outputIndex2 = this->_engine->getBindingIndex("proto");
|
||||
assert(inputIndex == 0);
|
||||
assert(outputIndex1 == 1);
|
||||
assert(outputIndex2 == 2);
|
||||
|
||||
// Create GPU buffers on device
|
||||
CUDA_CHECK(cudaMalloc((void**)&(this->_gpu_buffers[0]), kBatchSize * 3 * input_h * input_w * sizeof(float)));
|
||||
CUDA_CHECK(cudaMalloc((void**)&(this->_gpu_buffers[1]), kBatchSize * kOutputSize1 * sizeof(float)));
|
||||
CUDA_CHECK(cudaMalloc((void**)&(this->_gpu_buffers[2]), kBatchSize * kOutputSize2 * sizeof(float)));
|
||||
|
||||
// Alloc CPU buffers
|
||||
this->_cpu_output_buffer1 = new float[kBatchSize * kOutputSize1];
|
||||
this->_cpu_output_buffer2 = new float[kBatchSize * kOutputSize2];
|
||||
}
|
||||
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) {
|
||||
std::ifstream file(engine_name, std::ios::binary);
|
||||
if (!file.good()) {
|
||||
std::cerr << "read " << engine_name << " error!" << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
size_t size = 0;
|
||||
file.seekg(0, file.end);
|
||||
size = file.tellg();
|
||||
file.seekg(0, file.beg);
|
||||
char* serialized_engine = new char[size];
|
||||
assert(serialized_engine);
|
||||
file.read(serialized_engine, size);
|
||||
file.close();
|
||||
|
||||
*runtime = createInferRuntime(g_nvlogger);
|
||||
assert(*runtime);
|
||||
*engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
|
||||
assert(*engine);
|
||||
*context = (*engine)->createExecutionContext();
|
||||
assert(*context);
|
||||
delete[] serialized_engine;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
void CommonObjectDetectorCUDAImpl::cudaDetect(
|
||||
CommonObjectDetectorBase* base_,
|
||||
cv::Mat img_,
|
||||
std::vector<float>& boxes_x_,
|
||||
std::vector<float>& boxes_y_,
|
||||
std::vector<float>& boxes_w_,
|
||||
std::vector<float>& boxes_h_,
|
||||
std::vector<int>& boxes_label_,
|
||||
std::vector<float>& boxes_score_,
|
||||
std::vector<cv::Mat>& boxes_seg_
|
||||
)
|
||||
{
|
||||
#ifdef WITH_CUDA
|
||||
int input_h = base_->getInputH();
|
||||
int input_w = base_->getInputW();
|
||||
bool with_segmentation = base_->withSegmentation();
|
||||
double thrs_conf = base_->getThrsConf();
|
||||
double thrs_nms = base_->getThrsNms();
|
||||
|
||||
std::vector<cv::Mat> img_batch;
|
||||
img_batch.push_back(img_);
|
||||
// Preprocess
|
||||
cuda_batch_preprocess(img_batch, this->_gpu_buffers[0], input_w, input_h, this->_stream);
|
||||
|
||||
// Run inference
|
||||
if (with_segmentation)
|
||||
{
|
||||
infer_seg(*this->_context, this->_stream, (void**)this->_gpu_buffers, this->_cpu_output_buffer1, this->_cpu_output_buffer2, kBatchSize);
|
||||
}
|
||||
else
|
||||
{
|
||||
infer(*this->_context, this->_stream, (void**)this->_gpu_buffers, this->_cpu_output_buffer, kBatchSize);
|
||||
}
|
||||
|
||||
// NMS
|
||||
std::vector<std::vector<Detection>> res_batch;
|
||||
if (with_segmentation)
|
||||
{
|
||||
batch_nms(res_batch, this->_cpu_output_buffer1, img_batch.size(), kOutputSize1, thrs_conf, thrs_nms);
|
||||
}
|
||||
else
|
||||
{
|
||||
batch_nms(res_batch, this->_cpu_output_buffer, img_batch.size(), kOutputSize, thrs_conf, thrs_nms);
|
||||
}
|
||||
|
||||
std::vector<Detection> res = res_batch[0];
|
||||
std::vector<cv::Mat> masks;
|
||||
if (with_segmentation)
|
||||
{
|
||||
masks = process_mask(&(this->_cpu_output_buffer2[0]), kOutputSize2, res, input_h, input_w);
|
||||
}
|
||||
|
||||
|
||||
|
||||
for (size_t j = 0; j < res.size(); j++) {
|
||||
cv::Rect r = get_rect(img_, res[j].bbox, input_h, input_w);
|
||||
if (r.x < 0) r.x = 0;
|
||||
if (r.y < 0) r.y = 0;
|
||||
if (r.x + r.width >= img_.cols) r.width = img_.cols - r.x - 1;
|
||||
if (r.y + r.height >= img_.rows) r.height = img_.rows - r.y - 1;
|
||||
if (r.width > 5 && r.height > 5)
|
||||
{
|
||||
// cv::rectangle(img_show, r, cv::Scalar(0, 0, 255), 2);
|
||||
// cv::putText(img_show, vehiclenames[(int)res[j].class_id], cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 2.2, cv::Scalar(0, 0, 255), 2);
|
||||
boxes_x_.push_back(r.x);
|
||||
boxes_y_.push_back(r.y);
|
||||
boxes_w_.push_back(r.width);
|
||||
boxes_h_.push_back(r.height);
|
||||
|
||||
boxes_label_.push_back((int)res[j].class_id);
|
||||
boxes_score_.push_back(res[j].conf);
|
||||
|
||||
if (with_segmentation)
|
||||
{
|
||||
cv::Mat mask_j = masks[j].clone();
|
||||
boxes_seg_.push_back(mask_j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
bool CommonObjectDetectorCUDAImpl::cudaSetup(CommonObjectDetectorBase* base_)
|
||||
{
|
||||
#ifdef WITH_CUDA
|
||||
std::string dataset = base_->getDataset();
|
||||
int input_h = base_->getInputH();
|
||||
int input_w = base_->getInputW();
|
||||
bool with_segmentation = base_->withSegmentation();
|
||||
double thrs_conf = base_->getThrsConf();
|
||||
double thrs_nms = base_->getThrsNms();
|
||||
|
||||
std::string engine_fn = get_home() + SV_MODEL_DIR + dataset + ".engine";
|
||||
if (input_w == 1280)
|
||||
{
|
||||
engine_fn = get_home() + SV_MODEL_DIR + dataset + "_HD.engine";
|
||||
}
|
||||
if (with_segmentation)
|
||||
{
|
||||
base_->setInputH(640);
|
||||
base_->setInputW(640);
|
||||
engine_fn = get_home() + SV_MODEL_DIR + dataset + "_SEG.engine";
|
||||
}
|
||||
std::cout << "Load: " << engine_fn << std::endl;
|
||||
if (!is_file_exist(engine_fn))
|
||||
{
|
||||
throw std::runtime_error("SpireCV (104) Error loading the CommonObject TensorRT model (File Not Exist)");
|
||||
}
|
||||
|
||||
deserialize_engine(engine_fn, &this->_runtime, &this->_engine, &this->_context);
|
||||
CUDA_CHECK(cudaStreamCreate(&this->_stream));
|
||||
|
||||
// Init CUDA preprocessing
|
||||
cuda_preprocess_init(kMaxInputImageSize);
|
||||
|
||||
if (with_segmentation)
|
||||
{
|
||||
// Prepare cpu and gpu buffers
|
||||
this->_prepare_buffers_seg(input_h, input_w);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Prepare cpu and gpu buffers
|
||||
this->_prepare_buffers(input_h, input_w);
|
||||
}
|
||||
return true;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
58
algorithm/common_det/cuda/common_det_cuda_impl.h
Normal file
58
algorithm/common_det/cuda/common_det_cuda_impl.h
Normal file
@@ -0,0 +1,58 @@
|
||||
#ifndef __SV_COMMON_DET_CUDA__
|
||||
#define __SV_COMMON_DET_CUDA__
|
||||
|
||||
#include "sv_core.h"
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include <opencv2/aruco.hpp>
|
||||
#include <opencv2/tracking.hpp>
|
||||
#include <string>
|
||||
#include <chrono>
|
||||
|
||||
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
#include <NvInfer.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
namespace sv {
|
||||
|
||||
|
||||
class CommonObjectDetectorCUDAImpl
|
||||
{
|
||||
public:
|
||||
CommonObjectDetectorCUDAImpl();
|
||||
~CommonObjectDetectorCUDAImpl();
|
||||
|
||||
bool cudaSetup(CommonObjectDetectorBase* base_);
|
||||
void cudaDetect(
|
||||
CommonObjectDetectorBase* base_,
|
||||
cv::Mat img_,
|
||||
std::vector<float>& boxes_x_,
|
||||
std::vector<float>& boxes_y_,
|
||||
std::vector<float>& boxes_w_,
|
||||
std::vector<float>& boxes_h_,
|
||||
std::vector<int>& boxes_label_,
|
||||
std::vector<float>& boxes_score_,
|
||||
std::vector<cv::Mat>& boxes_seg_
|
||||
);
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
void _prepare_buffers_seg(int input_h, int input_w);
|
||||
void _prepare_buffers(int input_h, int input_w);
|
||||
nvinfer1::IExecutionContext* _context;
|
||||
nvinfer1::IRuntime* _runtime;
|
||||
nvinfer1::ICudaEngine* _engine;
|
||||
cudaStream_t _stream;
|
||||
float* _gpu_buffers[3];
|
||||
float* _cpu_output_buffer;
|
||||
float* _cpu_output_buffer1;
|
||||
float* _cpu_output_buffer2;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
#endif
|
||||
97
algorithm/common_det/cuda/yolov7/calibrator.cpp
Normal file
97
algorithm/common_det/cuda/yolov7/calibrator.cpp
Normal file
@@ -0,0 +1,97 @@
|
||||
#include "calibrator.h"
|
||||
#include "cuda_utils.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <fstream>
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include <opencv2/dnn/dnn.hpp>
|
||||
|
||||
static cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
|
||||
int w, h, x, y;
|
||||
float r_w = input_w / (img.cols * 1.0);
|
||||
float r_h = input_h / (img.rows * 1.0);
|
||||
if (r_h > r_w) {
|
||||
w = input_w;
|
||||
h = r_w * img.rows;
|
||||
x = 0;
|
||||
y = (input_h - h) / 2;
|
||||
} else {
|
||||
w = r_h * img.cols;
|
||||
h = input_h;
|
||||
x = (input_w - w) / 2;
|
||||
y = 0;
|
||||
}
|
||||
cv::Mat re(h, w, CV_8UC3);
|
||||
cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
|
||||
cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
|
||||
re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
|
||||
return out;
|
||||
}
|
||||
|
||||
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache)
|
||||
: batchsize_(batchsize),
|
||||
input_w_(input_w),
|
||||
input_h_(input_h),
|
||||
img_idx_(0),
|
||||
img_dir_(img_dir),
|
||||
calib_table_name_(calib_table_name),
|
||||
input_blob_name_(input_blob_name),
|
||||
read_cache_(read_cache) {
|
||||
input_count_ = 3 * input_w * input_h * batchsize;
|
||||
CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
|
||||
read_files_in_dir(img_dir, img_files_);
|
||||
}
|
||||
|
||||
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
|
||||
CUDA_CHECK(cudaFree(device_input_));
|
||||
}
|
||||
|
||||
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
|
||||
return batchsize_;
|
||||
}
|
||||
|
||||
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
|
||||
if (img_idx_ + batchsize_ > (int)img_files_.size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<cv::Mat> input_imgs_;
|
||||
for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
|
||||
std::cout << img_files_[i] << " " << i << std::endl;
|
||||
cv::Mat temp = cv::imread(img_dir_ + img_files_[i]);
|
||||
if (temp.empty()) {
|
||||
std::cerr << "Fatal error: image cannot open!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_);
|
||||
input_imgs_.push_back(pr_img);
|
||||
}
|
||||
img_idx_ += batchsize_;
|
||||
cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false);
|
||||
|
||||
CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
|
||||
assert(!strcmp(names[0], input_blob_name_));
|
||||
bindings[0] = device_input_;
|
||||
return true;
|
||||
}
|
||||
|
||||
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
|
||||
std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
|
||||
calib_cache_.clear();
|
||||
std::ifstream input(calib_table_name_, std::ios::binary);
|
||||
input >> std::noskipws;
|
||||
if (read_cache_ && input.good()) {
|
||||
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(calib_cache_));
|
||||
}
|
||||
length = calib_cache_.size();
|
||||
return length ? calib_cache_.data() : nullptr;
|
||||
}
|
||||
|
||||
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT {
|
||||
std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
|
||||
std::ofstream output(calib_table_name_, std::ios::binary);
|
||||
output.write(reinterpret_cast<const char*>(cache), length);
|
||||
}
|
||||
|
||||
36
algorithm/common_det/cuda/yolov7/calibrator.h
Normal file
36
algorithm/common_det/cuda/yolov7/calibrator.h
Normal file
@@ -0,0 +1,36 @@
|
||||
#pragma once
|
||||
|
||||
#include "macros.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
//! \class Int8EntropyCalibrator2
|
||||
//!
|
||||
//! \brief Implements Entropy calibrator 2.
|
||||
//! CalibrationAlgoType is kENTROPY_CALIBRATION_2.
|
||||
//!
|
||||
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
|
||||
public:
|
||||
Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
|
||||
|
||||
virtual ~Int8EntropyCalibrator2();
|
||||
int getBatchSize() const TRT_NOEXCEPT override;
|
||||
bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
|
||||
const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
|
||||
void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
|
||||
|
||||
private:
|
||||
int batchsize_;
|
||||
int input_w_;
|
||||
int input_h_;
|
||||
int img_idx_;
|
||||
std::string img_dir_;
|
||||
std::vector<std::string> img_files_;
|
||||
size_t input_count_;
|
||||
std::string calib_table_name_;
|
||||
const char* input_blob_name_;
|
||||
bool read_cache_;
|
||||
void* device_input_;
|
||||
std::vector<char> calib_cache_;
|
||||
};
|
||||
|
||||
55
algorithm/common_det/cuda/yolov7/config.h
Normal file
55
algorithm/common_det/cuda/yolov7/config.h
Normal file
@@ -0,0 +1,55 @@
|
||||
#pragma once
|
||||
|
||||
/* --------------------------------------------------------
|
||||
* These configs are related to tensorrt model, if these are changed,
|
||||
* please re-compile and re-serialize the tensorrt model.
|
||||
* --------------------------------------------------------*/
|
||||
|
||||
// For INT8, you need prepare the calibration dataset, please refer to
|
||||
// https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5#int8-quantization
|
||||
#define USE_FP16 // set USE_INT8 or USE_FP16 or USE_FP32
|
||||
|
||||
// These are used to define input/output tensor names,
|
||||
// you can set them to whatever you want.
|
||||
const static char* kInputTensorName = "data";
|
||||
const static char* kOutputTensorName = "prob";
|
||||
|
||||
// Detection model and Segmentation model' number of classes
|
||||
// constexpr static int kNumClass = 80;
|
||||
|
||||
// Classfication model's number of classes
|
||||
constexpr static int kClsNumClass = 1000;
|
||||
|
||||
constexpr static int kBatchSize = 1;
|
||||
|
||||
// Yolo's input width and height must by divisible by 32
|
||||
// constexpr static int kInputH = 640;
|
||||
// constexpr static int kInputW = 640;
|
||||
|
||||
// Classfication model's input shape
|
||||
constexpr static int kClsInputH = 224;
|
||||
constexpr static int kClsInputW = 224;
|
||||
|
||||
// Maximum number of output bounding boxes from yololayer plugin.
|
||||
// That is maximum number of output bounding boxes before NMS.
|
||||
constexpr static int kMaxNumOutputBbox = 1000;
|
||||
|
||||
constexpr static int kNumAnchor = 3;
|
||||
|
||||
// The bboxes whose confidence is lower than kIgnoreThresh will be ignored in yololayer plugin.
|
||||
constexpr static float kIgnoreThresh = 0.1f;
|
||||
|
||||
/* --------------------------------------------------------
|
||||
* These configs are NOT related to tensorrt model, if these are changed,
|
||||
* please re-compile, but no need to re-serialize the tensorrt model.
|
||||
* --------------------------------------------------------*/
|
||||
|
||||
// NMS overlapping thresh and final detection confidence thresh
|
||||
const static float kNmsThresh = 0.45f;
|
||||
const static float kConfThresh = 0.5f;
|
||||
|
||||
const static int kGpuId = 0;
|
||||
|
||||
// If your image size is larger than 4096 * 3112, please increase this value
|
||||
const static int kMaxInputImageSize = 4096 * 3112;
|
||||
|
||||
18
algorithm/common_det/cuda/yolov7/cuda_utils.h
Normal file
18
algorithm/common_det/cuda/yolov7/cuda_utils.h
Normal file
@@ -0,0 +1,18 @@
|
||||
#ifndef TRTX_CUDA_UTILS_H_
|
||||
#define TRTX_CUDA_UTILS_H_
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#ifndef CUDA_CHECK
|
||||
#define CUDA_CHECK(callstr)\
|
||||
{\
|
||||
cudaError_t error_code = callstr;\
|
||||
if (error_code != cudaSuccess) {\
|
||||
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
|
||||
assert(0);\
|
||||
}\
|
||||
}
|
||||
#endif // CUDA_CHECK
|
||||
|
||||
#endif // TRTX_CUDA_UTILS_H_
|
||||
|
||||
504
algorithm/common_det/cuda/yolov7/logging.h
Normal file
504
algorithm/common_det/cuda/yolov7/logging.h
Normal file
@@ -0,0 +1,504 @@
|
||||
/*
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef TENSORRT_LOGGING_H
|
||||
#define TENSORRT_LOGGING_H
|
||||
|
||||
#include "NvInferRuntimeCommon.h"
|
||||
#include <cassert>
|
||||
#include <ctime>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include "macros.h"
|
||||
|
||||
using Severity = nvinfer1::ILogger::Severity;
|
||||
|
||||
class LogStreamConsumerBuffer : public std::stringbuf
|
||||
{
|
||||
public:
|
||||
LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
|
||||
: mOutput(stream)
|
||||
, mPrefix(prefix)
|
||||
, mShouldLog(shouldLog)
|
||||
{
|
||||
}
|
||||
|
||||
LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
|
||||
: mOutput(other.mOutput)
|
||||
{
|
||||
}
|
||||
|
||||
~LogStreamConsumerBuffer()
|
||||
{
|
||||
// std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
|
||||
// std::streambuf::pptr() gives a pointer to the current position of the output sequence
|
||||
// if the pointer to the beginning is not equal to the pointer to the current position,
|
||||
// call putOutput() to log the output to the stream
|
||||
if (pbase() != pptr())
|
||||
{
|
||||
putOutput();
|
||||
}
|
||||
}
|
||||
|
||||
// synchronizes the stream buffer and returns 0 on success
|
||||
// synchronizing the stream buffer consists of inserting the buffer contents into the stream,
|
||||
// resetting the buffer and flushing the stream
|
||||
virtual int sync()
|
||||
{
|
||||
putOutput();
|
||||
return 0;
|
||||
}
|
||||
|
||||
void putOutput()
|
||||
{
|
||||
if (mShouldLog)
|
||||
{
|
||||
// prepend timestamp
|
||||
std::time_t timestamp = std::time(nullptr);
|
||||
tm* tm_local = std::localtime(×tamp);
|
||||
std::cout << "[";
|
||||
std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
|
||||
std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
|
||||
std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
|
||||
std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
|
||||
std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
|
||||
std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
|
||||
// std::stringbuf::str() gets the string contents of the buffer
|
||||
// insert the buffer contents pre-appended by the appropriate prefix into the stream
|
||||
mOutput << mPrefix << str();
|
||||
// set the buffer to empty
|
||||
str("");
|
||||
// flush the stream
|
||||
mOutput.flush();
|
||||
}
|
||||
}
|
||||
|
||||
void setShouldLog(bool shouldLog)
|
||||
{
|
||||
mShouldLog = shouldLog;
|
||||
}
|
||||
|
||||
private:
|
||||
std::ostream& mOutput;
|
||||
std::string mPrefix;
|
||||
bool mShouldLog;
|
||||
};
|
||||
|
||||
//!
|
||||
//! \class LogStreamConsumerBase
|
||||
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
|
||||
//!
|
||||
class LogStreamConsumerBase
|
||||
{
|
||||
public:
|
||||
LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
|
||||
: mBuffer(stream, prefix, shouldLog)
|
||||
{
|
||||
}
|
||||
|
||||
protected:
|
||||
LogStreamConsumerBuffer mBuffer;
|
||||
};
|
||||
|
||||
//!
|
||||
//! \class LogStreamConsumer
|
||||
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
|
||||
//! Order of base classes is LogStreamConsumerBase and then std::ostream.
|
||||
//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
|
||||
//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
|
||||
//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
|
||||
//! Please do not change the order of the parent classes.
|
||||
//!
|
||||
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
|
||||
{
|
||||
public:
|
||||
//! \brief Creates a LogStreamConsumer which logs messages with level severity.
|
||||
//! Reportable severity determines if the messages are severe enough to be logged.
|
||||
LogStreamConsumer(Severity reportableSeverity, Severity severity)
|
||||
: LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
|
||||
, std::ostream(&mBuffer) // links the stream buffer with the stream
|
||||
, mShouldLog(severity <= reportableSeverity)
|
||||
, mSeverity(severity)
|
||||
{
|
||||
}
|
||||
|
||||
LogStreamConsumer(LogStreamConsumer&& other)
|
||||
: LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
|
||||
, std::ostream(&mBuffer) // links the stream buffer with the stream
|
||||
, mShouldLog(other.mShouldLog)
|
||||
, mSeverity(other.mSeverity)
|
||||
{
|
||||
}
|
||||
|
||||
void setReportableSeverity(Severity reportableSeverity)
|
||||
{
|
||||
mShouldLog = mSeverity <= reportableSeverity;
|
||||
mBuffer.setShouldLog(mShouldLog);
|
||||
}
|
||||
|
||||
private:
|
||||
static std::ostream& severityOstream(Severity severity)
|
||||
{
|
||||
return severity >= Severity::kINFO ? std::cout : std::cerr;
|
||||
}
|
||||
|
||||
static std::string severityPrefix(Severity severity)
|
||||
{
|
||||
switch (severity)
|
||||
{
|
||||
case Severity::kINTERNAL_ERROR: return "[F] ";
|
||||
case Severity::kERROR: return "[E] ";
|
||||
case Severity::kWARNING: return "[W] ";
|
||||
case Severity::kINFO: return "[I] ";
|
||||
case Severity::kVERBOSE: return "[V] ";
|
||||
default: assert(0); return "";
|
||||
}
|
||||
}
|
||||
|
||||
bool mShouldLog;
|
||||
Severity mSeverity;
|
||||
};
|
||||
|
||||
//! \class Logger
|
||||
//!
|
||||
//! \brief Class which manages logging of TensorRT tools and samples
|
||||
//!
|
||||
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
|
||||
//! and supports logging two types of messages:
|
||||
//!
|
||||
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
|
||||
//! - Test pass/fail messages
|
||||
//!
|
||||
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
|
||||
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
|
||||
//!
|
||||
//! In the future, this class could be extended to support dumping test results to a file in some standard format
|
||||
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
|
||||
//!
|
||||
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
|
||||
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
|
||||
//! library and messages coming from the sample.
|
||||
//!
|
||||
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
|
||||
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
|
||||
//! object.
|
||||
|
||||
class Logger : public nvinfer1::ILogger
|
||||
{
|
||||
public:
|
||||
Logger(Severity severity = Severity::kWARNING)
|
||||
: mReportableSeverity(severity)
|
||||
{
|
||||
}
|
||||
|
||||
//!
|
||||
//! \enum TestResult
|
||||
//! \brief Represents the state of a given test
|
||||
//!
|
||||
enum class TestResult
|
||||
{
|
||||
kRUNNING, //!< The test is running
|
||||
kPASSED, //!< The test passed
|
||||
kFAILED, //!< The test failed
|
||||
kWAIVED //!< The test was waived
|
||||
};
|
||||
|
||||
//!
|
||||
//! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
|
||||
//! \return The nvinfer1::ILogger associated with this Logger
|
||||
//!
|
||||
//! TODO Once all samples are updated to use this method to register the logger with TensorRT,
|
||||
//! we can eliminate the inheritance of Logger from ILogger
|
||||
//!
|
||||
nvinfer1::ILogger& getTRTLogger()
|
||||
{
|
||||
return *this;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief Implementation of the nvinfer1::ILogger::log() virtual method
|
||||
//!
|
||||
//! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
|
||||
//! inheritance from nvinfer1::ILogger
|
||||
//!
|
||||
void log(Severity severity, const char* msg) TRT_NOEXCEPT override
|
||||
{
|
||||
LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief Method for controlling the verbosity of logging output
|
||||
//!
|
||||
//! \param severity The logger will only emit messages that have severity of this level or higher.
|
||||
//!
|
||||
void setReportableSeverity(Severity severity)
|
||||
{
|
||||
mReportableSeverity = severity;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief Opaque handle that holds logging information for a particular test
|
||||
//!
|
||||
//! This object is an opaque handle to information used by the Logger to print test results.
|
||||
//! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
|
||||
//! with Logger::reportTest{Start,End}().
|
||||
//!
|
||||
class TestAtom
|
||||
{
|
||||
public:
|
||||
TestAtom(TestAtom&&) = default;
|
||||
|
||||
private:
|
||||
friend class Logger;
|
||||
|
||||
TestAtom(bool started, const std::string& name, const std::string& cmdline)
|
||||
: mStarted(started)
|
||||
, mName(name)
|
||||
, mCmdline(cmdline)
|
||||
{
|
||||
}
|
||||
|
||||
bool mStarted;
|
||||
std::string mName;
|
||||
std::string mCmdline;
|
||||
};
|
||||
|
||||
//!
|
||||
//! \brief Define a test for logging
|
||||
//!
|
||||
//! \param[in] name The name of the test. This should be a string starting with
|
||||
//! "TensorRT" and containing dot-separated strings containing
|
||||
//! the characters [A-Za-z0-9_].
|
||||
//! For example, "TensorRT.sample_googlenet"
|
||||
//! \param[in] cmdline The command line used to reproduce the test
|
||||
//
|
||||
//! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
|
||||
//!
|
||||
static TestAtom defineTest(const std::string& name, const std::string& cmdline)
|
||||
{
|
||||
return TestAtom(false, name, cmdline);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
|
||||
//! as input
|
||||
//!
|
||||
//! \param[in] name The name of the test
|
||||
//! \param[in] argc The number of command-line arguments
|
||||
//! \param[in] argv The array of command-line arguments (given as C strings)
|
||||
//!
|
||||
//! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
|
||||
static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
|
||||
{
|
||||
auto cmdline = genCmdlineString(argc, argv);
|
||||
return defineTest(name, cmdline);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief Report that a test has started.
|
||||
//!
|
||||
//! \pre reportTestStart() has not been called yet for the given testAtom
|
||||
//!
|
||||
//! \param[in] testAtom The handle to the test that has started
|
||||
//!
|
||||
static void reportTestStart(TestAtom& testAtom)
|
||||
{
|
||||
reportTestResult(testAtom, TestResult::kRUNNING);
|
||||
assert(!testAtom.mStarted);
|
||||
testAtom.mStarted = true;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief Report that a test has ended.
|
||||
//!
|
||||
//! \pre reportTestStart() has been called for the given testAtom
|
||||
//!
|
||||
//! \param[in] testAtom The handle to the test that has ended
|
||||
//! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
|
||||
//! TestResult::kFAILED, TestResult::kWAIVED
|
||||
//!
|
||||
static void reportTestEnd(const TestAtom& testAtom, TestResult result)
|
||||
{
|
||||
assert(result != TestResult::kRUNNING);
|
||||
assert(testAtom.mStarted);
|
||||
reportTestResult(testAtom, result);
|
||||
}
|
||||
|
||||
static int reportPass(const TestAtom& testAtom)
|
||||
{
|
||||
reportTestEnd(testAtom, TestResult::kPASSED);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
static int reportFail(const TestAtom& testAtom)
|
||||
{
|
||||
reportTestEnd(testAtom, TestResult::kFAILED);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
static int reportWaive(const TestAtom& testAtom)
|
||||
{
|
||||
reportTestEnd(testAtom, TestResult::kWAIVED);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
static int reportTest(const TestAtom& testAtom, bool pass)
|
||||
{
|
||||
return pass ? reportPass(testAtom) : reportFail(testAtom);
|
||||
}
|
||||
|
||||
Severity getReportableSeverity() const
|
||||
{
|
||||
return mReportableSeverity;
|
||||
}
|
||||
|
||||
private:
|
||||
//!
|
||||
//! \brief returns an appropriate string for prefixing a log message with the given severity
|
||||
//!
|
||||
static const char* severityPrefix(Severity severity)
|
||||
{
|
||||
switch (severity)
|
||||
{
|
||||
case Severity::kINTERNAL_ERROR: return "[F] ";
|
||||
case Severity::kERROR: return "[E] ";
|
||||
case Severity::kWARNING: return "[W] ";
|
||||
case Severity::kINFO: return "[I] ";
|
||||
case Severity::kVERBOSE: return "[V] ";
|
||||
default: assert(0); return "";
|
||||
}
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief returns an appropriate string for prefixing a test result message with the given result
|
||||
//!
|
||||
static const char* testResultString(TestResult result)
|
||||
{
|
||||
switch (result)
|
||||
{
|
||||
case TestResult::kRUNNING: return "RUNNING";
|
||||
case TestResult::kPASSED: return "PASSED";
|
||||
case TestResult::kFAILED: return "FAILED";
|
||||
case TestResult::kWAIVED: return "WAIVED";
|
||||
default: assert(0); return "";
|
||||
}
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
|
||||
//!
|
||||
static std::ostream& severityOstream(Severity severity)
|
||||
{
|
||||
return severity >= Severity::kINFO ? std::cout : std::cerr;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief method that implements logging test results
|
||||
//!
|
||||
static void reportTestResult(const TestAtom& testAtom, TestResult result)
|
||||
{
|
||||
severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
|
||||
<< testAtom.mCmdline << std::endl;
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief generate a command line string from the given (argc, argv) values
|
||||
//!
|
||||
static std::string genCmdlineString(int argc, char const* const* argv)
|
||||
{
|
||||
std::stringstream ss;
|
||||
for (int i = 0; i < argc; i++)
|
||||
{
|
||||
if (i > 0)
|
||||
ss << " ";
|
||||
ss << argv[i];
|
||||
}
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
Severity mReportableSeverity;
|
||||
};
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
//!
|
||||
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
|
||||
//!
|
||||
//! Example usage:
|
||||
//!
|
||||
//! LOG_VERBOSE(logger) << "hello world" << std::endl;
|
||||
//!
|
||||
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
|
||||
{
|
||||
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
|
||||
//!
|
||||
//! Example usage:
|
||||
//!
|
||||
//! LOG_INFO(logger) << "hello world" << std::endl;
|
||||
//!
|
||||
inline LogStreamConsumer LOG_INFO(const Logger& logger)
|
||||
{
|
||||
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
|
||||
//!
|
||||
//! Example usage:
|
||||
//!
|
||||
//! LOG_WARN(logger) << "hello world" << std::endl;
|
||||
//!
|
||||
inline LogStreamConsumer LOG_WARN(const Logger& logger)
|
||||
{
|
||||
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
|
||||
//!
|
||||
//! Example usage:
|
||||
//!
|
||||
//! LOG_ERROR(logger) << "hello world" << std::endl;
|
||||
//!
|
||||
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
|
||||
{
|
||||
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
|
||||
}
|
||||
|
||||
//!
|
||||
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
|
||||
// ("fatal" severity)
|
||||
//!
|
||||
//! Example usage:
|
||||
//!
|
||||
//! LOG_FATAL(logger) << "hello world" << std::endl;
|
||||
//!
|
||||
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
|
||||
{
|
||||
return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
#endif // TENSORRT_LOGGING_H
|
||||
29
algorithm/common_det/cuda/yolov7/macros.h
Normal file
29
algorithm/common_det/cuda/yolov7/macros.h
Normal file
@@ -0,0 +1,29 @@
|
||||
#ifndef __MACROS_H
|
||||
#define __MACROS_H
|
||||
|
||||
#include <NvInfer.h>
|
||||
|
||||
#ifdef API_EXPORTS
|
||||
#if defined(_MSC_VER)
|
||||
#define API __declspec(dllexport)
|
||||
#else
|
||||
#define API __attribute__((visibility("default")))
|
||||
#endif
|
||||
#else
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define API __declspec(dllimport)
|
||||
#else
|
||||
#define API
|
||||
#endif
|
||||
#endif // API_EXPORTS
|
||||
|
||||
#if NV_TENSORRT_MAJOR >= 8
|
||||
#define TRT_NOEXCEPT noexcept
|
||||
#define TRT_CONST_ENQUEUE const
|
||||
#else
|
||||
#define TRT_NOEXCEPT
|
||||
#define TRT_CONST_ENQUEUE
|
||||
#endif
|
||||
|
||||
#endif // __MACROS_H
|
||||
628
algorithm/common_det/cuda/yolov7/model.cpp
Normal file
628
algorithm/common_det/cuda/yolov7/model.cpp
Normal file
@@ -0,0 +1,628 @@
|
||||
#include "model.h"
|
||||
#include "calibrator.h"
|
||||
#include "config.h"
|
||||
#include "yololayer.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
|
||||
using namespace nvinfer1;
|
||||
|
||||
// TensorRT weight files have a simple space delimited format:
|
||||
// [type] [size] <data x size in hex>
|
||||
static std::map<std::string, Weights> loadWeights(const std::string file) {
|
||||
std::cout << "Loading weights: " << file << std::endl;
|
||||
std::map<std::string, Weights> weightMap;
|
||||
|
||||
// Open weights file
|
||||
std::ifstream input(file);
|
||||
assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");
|
||||
|
||||
// Read number of weight blobs
|
||||
int32_t count;
|
||||
input >> count;
|
||||
assert(count > 0 && "Invalid weight map file.");
|
||||
|
||||
while (count--) {
|
||||
Weights wt{ DataType::kFLOAT, nullptr, 0 };
|
||||
uint32_t size;
|
||||
|
||||
// Read name and type of blob
|
||||
std::string name;
|
||||
input >> name >> std::dec >> size;
|
||||
wt.type = DataType::kFLOAT;
|
||||
|
||||
// Load blob
|
||||
uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
|
||||
for (uint32_t x = 0, y = size; x < y; ++x) {
|
||||
input >> std::hex >> val[x];
|
||||
}
|
||||
wt.values = val;
|
||||
|
||||
wt.count = size;
|
||||
weightMap[name] = wt;
|
||||
}
|
||||
|
||||
return weightMap;
|
||||
}
|
||||
|
||||
static int get_width(int x, float gw, int divisor = 8) {
|
||||
return int(ceil((x * gw) / divisor)) * divisor;
|
||||
}
|
||||
|
||||
static int get_depth(int x, float gd) {
|
||||
if (x == 1) return 1;
|
||||
int r = round(x * gd);
|
||||
if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) {
|
||||
--r;
|
||||
}
|
||||
return std::max<int>(r, 1);
|
||||
}
|
||||
|
||||
static IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
|
||||
float* gamma = (float*)weightMap[lname + ".weight"].values;
|
||||
float* beta = (float*)weightMap[lname + ".bias"].values;
|
||||
float* mean = (float*)weightMap[lname + ".running_mean"].values;
|
||||
float* var = (float*)weightMap[lname + ".running_var"].values;
|
||||
int len = weightMap[lname + ".running_var"].count;
|
||||
|
||||
float* scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
|
||||
for (int i = 0; i < len; i++) {
|
||||
scval[i] = gamma[i] / sqrt(var[i] + eps);
|
||||
}
|
||||
Weights scale{ DataType::kFLOAT, scval, len };
|
||||
|
||||
float* shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
|
||||
for (int i = 0; i < len; i++) {
|
||||
shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
|
||||
}
|
||||
Weights shift{ DataType::kFLOAT, shval, len };
|
||||
|
||||
float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
|
||||
for (int i = 0; i < len; i++) {
|
||||
pval[i] = 1.0;
|
||||
}
|
||||
Weights power{ DataType::kFLOAT, pval, len };
|
||||
|
||||
weightMap[lname + ".scale"] = scale;
|
||||
weightMap[lname + ".shift"] = shift;
|
||||
weightMap[lname + ".power"] = power;
|
||||
IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
|
||||
assert(scale_1);
|
||||
return scale_1;
|
||||
}
|
||||
|
||||
static ILayer* convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) {
|
||||
Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
|
||||
int p = ksize / 3;
|
||||
IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], emptywts);
|
||||
assert(conv1);
|
||||
conv1->setStrideNd(DimsHW{ s, s });
|
||||
conv1->setPaddingNd(DimsHW{ p, p });
|
||||
conv1->setNbGroups(g);
|
||||
conv1->setName((lname + ".conv").c_str());
|
||||
IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3);
|
||||
|
||||
// silu = x * sigmoid
|
||||
auto sig = network->addActivation(*bn1->getOutput(0), ActivationType::kSIGMOID);
|
||||
assert(sig);
|
||||
auto ew = network->addElementWise(*bn1->getOutput(0), *sig->getOutput(0), ElementWiseOperation::kPROD);
|
||||
assert(ew);
|
||||
return ew;
|
||||
}
|
||||
|
||||
static ILayer* focus(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname, int input_h, int input_w) {
|
||||
ISliceLayer* s1 = network->addSlice(input, Dims3{ 0, 0, 0 }, Dims3{ inch, input_h / 2, input_w / 2 }, Dims3{ 1, 2, 2 });
|
||||
ISliceLayer* s2 = network->addSlice(input, Dims3{ 0, 1, 0 }, Dims3{ inch, input_h / 2, input_w / 2 }, Dims3{ 1, 2, 2 });
|
||||
ISliceLayer* s3 = network->addSlice(input, Dims3{ 0, 0, 1 }, Dims3{ inch, input_h / 2, input_w / 2 }, Dims3{ 1, 2, 2 });
|
||||
ISliceLayer* s4 = network->addSlice(input, Dims3{ 0, 1, 1 }, Dims3{ inch, input_h / 2, input_w / 2 }, Dims3{ 1, 2, 2 });
|
||||
ITensor* inputTensors[] = { s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0) };
|
||||
auto cat = network->addConcatenation(inputTensors, 4);
|
||||
auto conv = convBlock(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv");
|
||||
return conv;
|
||||
}
|
||||
|
||||
static ILayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) {
|
||||
auto cv1 = convBlock(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, lname + ".cv1");
|
||||
auto cv2 = convBlock(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2");
|
||||
if (shortcut && c1 == c2) {
|
||||
auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM);
|
||||
return ew;
|
||||
}
|
||||
return cv2;
|
||||
}
|
||||
|
||||
static ILayer* bottleneckCSP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
|
||||
Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
|
||||
int c_ = (int)((float)c2 * e);
|
||||
auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
|
||||
auto cv2 = network->addConvolutionNd(input, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], emptywts);
|
||||
ITensor* y1 = cv1->getOutput(0);
|
||||
for (int i = 0; i < n; i++) {
|
||||
auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i));
|
||||
y1 = b->getOutput(0);
|
||||
}
|
||||
auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], emptywts);
|
||||
|
||||
ITensor* inputTensors[] = { cv3->getOutput(0), cv2->getOutput(0) };
|
||||
auto cat = network->addConcatenation(inputTensors, 2);
|
||||
|
||||
IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4);
|
||||
auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
|
||||
lr->setAlpha(0.1);
|
||||
|
||||
auto cv4 = convBlock(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4");
|
||||
return cv4;
|
||||
}
|
||||
|
||||
static ILayer* C3(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
|
||||
int c_ = (int)((float)c2 * e);
|
||||
auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
|
||||
auto cv2 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv2");
|
||||
ITensor *y1 = cv1->getOutput(0);
|
||||
for (int i = 0; i < n; i++) {
|
||||
auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i));
|
||||
y1 = b->getOutput(0);
|
||||
}
|
||||
|
||||
ITensor* inputTensors[] = { y1, cv2->getOutput(0) };
|
||||
auto cat = network->addConcatenation(inputTensors, 2);
|
||||
|
||||
auto cv3 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
|
||||
return cv3;
|
||||
}
|
||||
|
||||
static ILayer* SPP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) {
|
||||
int c_ = c1 / 2;
|
||||
auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
|
||||
|
||||
auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k1, k1 });
|
||||
pool1->setPaddingNd(DimsHW{ k1 / 2, k1 / 2 });
|
||||
pool1->setStrideNd(DimsHW{ 1, 1 });
|
||||
auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k2, k2 });
|
||||
pool2->setPaddingNd(DimsHW{ k2 / 2, k2 / 2 });
|
||||
pool2->setStrideNd(DimsHW{ 1, 1 });
|
||||
auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k3, k3 });
|
||||
pool3->setPaddingNd(DimsHW{ k3 / 2, k3 / 2 });
|
||||
pool3->setStrideNd(DimsHW{ 1, 1 });
|
||||
|
||||
ITensor* inputTensors[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) };
|
||||
auto cat = network->addConcatenation(inputTensors, 4);
|
||||
|
||||
auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
|
||||
return cv2;
|
||||
}
|
||||
|
||||
static ILayer* SPPF(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k, std::string lname) {
|
||||
int c_ = c1 / 2;
|
||||
auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
|
||||
|
||||
auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k, k });
|
||||
pool1->setPaddingNd(DimsHW{ k / 2, k / 2 });
|
||||
pool1->setStrideNd(DimsHW{ 1, 1 });
|
||||
auto pool2 = network->addPoolingNd(*pool1->getOutput(0), PoolingType::kMAX, DimsHW{ k, k });
|
||||
pool2->setPaddingNd(DimsHW{ k / 2, k / 2 });
|
||||
pool2->setStrideNd(DimsHW{ 1, 1 });
|
||||
auto pool3 = network->addPoolingNd(*pool2->getOutput(0), PoolingType::kMAX, DimsHW{ k, k });
|
||||
pool3->setPaddingNd(DimsHW{ k / 2, k / 2 });
|
||||
pool3->setStrideNd(DimsHW{ 1, 1 });
|
||||
ITensor* inputTensors[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) };
|
||||
auto cat = network->addConcatenation(inputTensors, 4);
|
||||
auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
|
||||
return cv2;
|
||||
}
|
||||
|
||||
static ILayer* Proto(INetworkDefinition* network, std::map<std::string, Weights>& weightMap, ITensor& input, int c_, int c2, std::string lname) {
|
||||
auto cv1 = convBlock(network, weightMap, input, c_, 3, 1, 1, lname + ".cv1");
|
||||
|
||||
auto upsample = network->addResize(*cv1->getOutput(0));
|
||||
assert(upsample);
|
||||
upsample->setResizeMode(ResizeMode::kNEAREST);
|
||||
const float scales[] = {1, 2, 2};
|
||||
upsample->setScales(scales, 3);
|
||||
|
||||
auto cv2 = convBlock(network, weightMap, *upsample->getOutput(0), c_, 3, 1, 1, lname + ".cv2");
|
||||
auto cv3 = convBlock(network, weightMap, *cv2->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
|
||||
assert(cv3);
|
||||
return cv3;
|
||||
}
|
||||
|
||||
static std::vector<std::vector<float>> getAnchors(std::map<std::string, Weights>& weightMap, std::string lname) {
|
||||
std::vector<std::vector<float>> anchors;
|
||||
Weights wts = weightMap[lname + ".anchor_grid"];
|
||||
int anchor_len = kNumAnchor * 2;
|
||||
for (int i = 0; i < wts.count / anchor_len; i++) {
|
||||
auto *p = (const float*)wts.values + i * anchor_len;
|
||||
std::vector<float> anchor(p, p + anchor_len);
|
||||
anchors.push_back(anchor);
|
||||
}
|
||||
return anchors;
|
||||
}
|
||||
|
||||
static IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, std::string lname, std::vector<IConvolutionLayer*> dets, int input_h, int input_w, int n_classes, bool is_segmentation = false) {
|
||||
auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
|
||||
auto anchors = getAnchors(weightMap, lname);
|
||||
PluginField plugin_fields[2];
|
||||
int netinfo[5] = {n_classes, input_w, input_h, kMaxNumOutputBbox, (int)is_segmentation};
|
||||
plugin_fields[0].data = netinfo;
|
||||
plugin_fields[0].length = 5;
|
||||
plugin_fields[0].name = "netinfo";
|
||||
plugin_fields[0].type = PluginFieldType::kFLOAT32;
|
||||
|
||||
//load strides from Detect layer
|
||||
assert(weightMap.find(lname + ".strides") != weightMap.end() && "Not found `strides`, please check gen_wts.py!!!");
|
||||
Weights strides = weightMap[lname + ".strides"];
|
||||
auto *p = (const float*)(strides.values);
|
||||
std::vector<int> scales(p, p + strides.count);
|
||||
|
||||
std::vector<YoloKernel> kernels;
|
||||
for (size_t i = 0; i < anchors.size(); i++) {
|
||||
YoloKernel kernel;
|
||||
kernel.width = input_w / scales[i];
|
||||
kernel.height = input_h / scales[i];
|
||||
memcpy(kernel.anchors, &anchors[i][0], anchors[i].size() * sizeof(float));
|
||||
kernels.push_back(kernel);
|
||||
}
|
||||
plugin_fields[1].data = &kernels[0];
|
||||
plugin_fields[1].length = kernels.size();
|
||||
plugin_fields[1].name = "kernels";
|
||||
plugin_fields[1].type = PluginFieldType::kFLOAT32;
|
||||
PluginFieldCollection plugin_data;
|
||||
plugin_data.nbFields = 2;
|
||||
plugin_data.fields = plugin_fields;
|
||||
IPluginV2 *plugin_obj = creator->createPlugin("yololayer", &plugin_data);
|
||||
std::vector<ITensor*> input_tensors;
|
||||
for (auto det: dets) {
|
||||
input_tensors.push_back(det->getOutput(0));
|
||||
}
|
||||
auto yolo = network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj);
|
||||
return yolo;
|
||||
}
|
||||
|
||||
ICudaEngine* build_det_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes) {
|
||||
INetworkDefinition* network = builder->createNetworkV2(0U);
|
||||
|
||||
// Create input tensor of shape {3, input_h, input_w}
|
||||
ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, input_h, input_w });
|
||||
assert(data);
|
||||
std::map<std::string, Weights> weightMap = loadWeights(wts_name);
|
||||
|
||||
// Backbone
|
||||
auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0");
|
||||
assert(conv0);
|
||||
auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
|
||||
auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
|
||||
auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
|
||||
auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
|
||||
auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
|
||||
auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
|
||||
auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
|
||||
auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
|
||||
auto spp9 = SPPF(network, weightMap, *bottleneck_csp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.9");
|
||||
|
||||
// Head
|
||||
auto conv10 = convBlock(network, weightMap, *spp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");
|
||||
|
||||
auto upsample11 = network->addResize(*conv10->getOutput(0));
|
||||
assert(upsample11);
|
||||
upsample11->setResizeMode(ResizeMode::kNEAREST);
|
||||
upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions());
|
||||
|
||||
ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) };
|
||||
auto cat12 = network->addConcatenation(inputTensors12, 2);
|
||||
auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13");
|
||||
auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");
|
||||
|
||||
auto upsample15 = network->addResize(*conv14->getOutput(0));
|
||||
assert(upsample15);
|
||||
upsample15->setResizeMode(ResizeMode::kNEAREST);
|
||||
upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions());
|
||||
|
||||
ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) };
|
||||
auto cat16 = network->addConcatenation(inputTensors16, 2);
|
||||
|
||||
auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17");
|
||||
|
||||
// Detect
|
||||
IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
|
||||
auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
|
||||
ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
|
||||
auto cat19 = network->addConcatenation(inputTensors19, 2);
|
||||
auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20");
|
||||
IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
|
||||
auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
|
||||
ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
|
||||
auto cat22 = network->addConcatenation(inputTensors22, 2);
|
||||
auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
|
||||
IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
|
||||
|
||||
auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<IConvolutionLayer*>{det0, det1, det2}, input_h, input_w, n_classes);
|
||||
yolo->getOutput(0)->setName(kOutputTensorName);
|
||||
network->markOutput(*yolo->getOutput(0));
|
||||
|
||||
// Engine config
|
||||
builder->setMaxBatchSize(maxBatchSize);
|
||||
config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB
|
||||
#if defined(USE_FP16)
|
||||
config->setFlag(BuilderFlag::kFP16);
|
||||
#elif defined(USE_INT8)
|
||||
std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
|
||||
assert(builder->platformHasFastInt8());
|
||||
config->setFlag(BuilderFlag::kINT8);
|
||||
Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, input_w, input_h, "./coco_calib/", "int8calib.table", kInputTensorName);
|
||||
config->setInt8Calibrator(calibrator);
|
||||
#endif
|
||||
|
||||
std::cout << "Building engine, please wait for a while..." << std::endl;
|
||||
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
|
||||
std::cout << "Build engine successfully!" << std::endl;
|
||||
|
||||
// Don't need the network any more
|
||||
network->destroy();
|
||||
|
||||
// Release host memory
|
||||
for (auto& mem : weightMap) {
|
||||
free((void*)(mem.second.values));
|
||||
}
|
||||
|
||||
return engine;
|
||||
}
|
||||
|
||||
ICudaEngine* build_det_p6_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes) {
|
||||
INetworkDefinition* network = builder->createNetworkV2(0U);
|
||||
|
||||
// Create input tensor of shape {3, input_h, input_w}
|
||||
ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, input_h, input_w });
|
||||
assert(data);
|
||||
|
||||
std::map<std::string, Weights> weightMap = loadWeights(wts_name);
|
||||
|
||||
// Backbone
|
||||
auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0");
|
||||
auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
|
||||
auto c3_2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
|
||||
auto conv3 = convBlock(network, weightMap, *c3_2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
|
||||
auto c3_4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
|
||||
auto conv5 = convBlock(network, weightMap, *c3_4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
|
||||
auto c3_6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
|
||||
auto conv7 = convBlock(network, weightMap, *c3_6->getOutput(0), get_width(768, gw), 3, 2, 1, "model.7");
|
||||
auto c3_8 = C3(network, weightMap, *conv7->getOutput(0), get_width(768, gw), get_width(768, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
|
||||
auto conv9 = convBlock(network, weightMap, *c3_8->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.9");
|
||||
auto c3_10 = C3(network, weightMap, *conv9->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.10");
|
||||
auto sppf11 = SPPF(network, weightMap, *c3_10->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.11");
|
||||
|
||||
// Head
|
||||
auto conv12 = convBlock(network, weightMap, *sppf11->getOutput(0), get_width(768, gw), 1, 1, 1, "model.12");
|
||||
auto upsample13 = network->addResize(*conv12->getOutput(0));
|
||||
assert(upsample13);
|
||||
upsample13->setResizeMode(ResizeMode::kNEAREST);
|
||||
upsample13->setOutputDimensions(c3_8->getOutput(0)->getDimensions());
|
||||
ITensor* inputTensors14[] = { upsample13->getOutput(0), c3_8->getOutput(0) };
|
||||
auto cat14 = network->addConcatenation(inputTensors14, 2);
|
||||
auto c3_15 = C3(network, weightMap, *cat14->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.15");
|
||||
|
||||
auto conv16 = convBlock(network, weightMap, *c3_15->getOutput(0), get_width(512, gw), 1, 1, 1, "model.16");
|
||||
auto upsample17 = network->addResize(*conv16->getOutput(0));
|
||||
assert(upsample17);
|
||||
upsample17->setResizeMode(ResizeMode::kNEAREST);
|
||||
upsample17->setOutputDimensions(c3_6->getOutput(0)->getDimensions());
|
||||
ITensor* inputTensors18[] = { upsample17->getOutput(0), c3_6->getOutput(0) };
|
||||
auto cat18 = network->addConcatenation(inputTensors18, 2);
|
||||
auto c3_19 = C3(network, weightMap, *cat18->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.19");
|
||||
|
||||
auto conv20 = convBlock(network, weightMap, *c3_19->getOutput(0), get_width(256, gw), 1, 1, 1, "model.20");
|
||||
auto upsample21 = network->addResize(*conv20->getOutput(0));
|
||||
assert(upsample21);
|
||||
upsample21->setResizeMode(ResizeMode::kNEAREST);
|
||||
upsample21->setOutputDimensions(c3_4->getOutput(0)->getDimensions());
|
||||
ITensor* inputTensors21[] = { upsample21->getOutput(0), c3_4->getOutput(0) };
|
||||
auto cat22 = network->addConcatenation(inputTensors21, 2);
|
||||
auto c3_23 = C3(network, weightMap, *cat22->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
|
||||
|
||||
auto conv24 = convBlock(network, weightMap, *c3_23->getOutput(0), get_width(256, gw), 3, 2, 1, "model.24");
|
||||
ITensor* inputTensors25[] = { conv24->getOutput(0), conv20->getOutput(0) };
|
||||
auto cat25 = network->addConcatenation(inputTensors25, 2);
|
||||
auto c3_26 = C3(network, weightMap, *cat25->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.26");
|
||||
|
||||
auto conv27 = convBlock(network, weightMap, *c3_26->getOutput(0), get_width(512, gw), 3, 2, 1, "model.27");
|
||||
ITensor* inputTensors28[] = { conv27->getOutput(0), conv16->getOutput(0) };
|
||||
auto cat28 = network->addConcatenation(inputTensors28, 2);
|
||||
auto c3_29 = C3(network, weightMap, *cat28->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.29");
|
||||
|
||||
auto conv30 = convBlock(network, weightMap, *c3_29->getOutput(0), get_width(768, gw), 3, 2, 1, "model.30");
|
||||
ITensor* inputTensors31[] = { conv30->getOutput(0), conv12->getOutput(0) };
|
||||
auto cat31 = network->addConcatenation(inputTensors31, 2);
|
||||
auto c3_32 = C3(network, weightMap, *cat31->getOutput(0), get_width(2048, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.32");
|
||||
|
||||
// Detect
|
||||
IConvolutionLayer* det0 = network->addConvolutionNd(*c3_23->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.0.weight"], weightMap["model.33.m.0.bias"]);
|
||||
IConvolutionLayer* det1 = network->addConvolutionNd(*c3_26->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.1.weight"], weightMap["model.33.m.1.bias"]);
|
||||
IConvolutionLayer* det2 = network->addConvolutionNd(*c3_29->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.2.weight"], weightMap["model.33.m.2.bias"]);
|
||||
IConvolutionLayer* det3 = network->addConvolutionNd(*c3_32->getOutput(0), 3 * (n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.3.weight"], weightMap["model.33.m.3.bias"]);
|
||||
|
||||
auto yolo = addYoLoLayer(network, weightMap, "model.33", std::vector<IConvolutionLayer*>{det0, det1, det2, det3}, input_h, input_w, n_classes);
|
||||
yolo->getOutput(0)->setName(kOutputTensorName);
|
||||
network->markOutput(*yolo->getOutput(0));
|
||||
|
||||
// Engine config
|
||||
builder->setMaxBatchSize(maxBatchSize);
|
||||
config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB
|
||||
#if defined(USE_FP16)
|
||||
config->setFlag(BuilderFlag::kFP16);
|
||||
#elif defined(USE_INT8)
|
||||
std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
|
||||
assert(builder->platformHasFastInt8());
|
||||
config->setFlag(BuilderFlag::kINT8);
|
||||
Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, input_w, input_h, "./coco_calib/", "int8calib.table", kInputTensorName);
|
||||
config->setInt8Calibrator(calibrator);
|
||||
#endif
|
||||
|
||||
std::cout << "Building engine, please wait for a while..." << std::endl;
|
||||
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
|
||||
std::cout << "Build engine successfully!" << std::endl;
|
||||
|
||||
// Don't need the network any more
|
||||
network->destroy();
|
||||
|
||||
// Release host memory
|
||||
for (auto& mem : weightMap) {
|
||||
free((void*)(mem.second.values));
|
||||
}
|
||||
|
||||
return engine;
|
||||
}
|
||||
|
||||
ICudaEngine* build_cls_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
|
||||
INetworkDefinition* network = builder->createNetworkV2(0U);
|
||||
|
||||
// Create input tensor
|
||||
ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kClsInputH, kClsInputW });
|
||||
assert(data);
|
||||
std::map<std::string, Weights> weightMap = loadWeights(wts_name);
|
||||
|
||||
// Backbone
|
||||
auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0");
|
||||
assert(conv0);
|
||||
auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
|
||||
auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
|
||||
auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
|
||||
auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
|
||||
auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
|
||||
auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
|
||||
auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
|
||||
auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
|
||||
|
||||
// Head
|
||||
auto conv_class = convBlock(network, weightMap, *bottleneck_csp8->getOutput(0), 1280, 1, 1, 1, "model.9.conv");
|
||||
IPoolingLayer* pool2 = network->addPoolingNd(*conv_class->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7});
|
||||
assert(pool2);
|
||||
IFullyConnectedLayer* yolo = network->addFullyConnected(*pool2->getOutput(0), kClsNumClass, weightMap["model.9.linear.weight"], weightMap["model.9.linear.bias"]);
|
||||
assert(yolo);
|
||||
|
||||
yolo->getOutput(0)->setName(kOutputTensorName);
|
||||
network->markOutput(*yolo->getOutput(0));
|
||||
|
||||
// Engine config
|
||||
builder->setMaxBatchSize(maxBatchSize);
|
||||
config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB
|
||||
|
||||
#if defined(USE_FP16)
|
||||
config->setFlag(BuilderFlag::kFP16);
|
||||
#elif defined(USE_INT8)
|
||||
std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
|
||||
assert(builder->platformHasFastInt8());
|
||||
config->setFlag(BuilderFlag::kINT8);
|
||||
Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kClsInputW, kClsInputW, "./coco_calib/", "int8calib.table", kInputTensorName);
|
||||
config->setInt8Calibrator(calibrator);
|
||||
#endif
|
||||
|
||||
std::cout << "Building engine, please wait for a while..." << std::endl;
|
||||
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
|
||||
std::cout << "Build engine successfully!" << std::endl;
|
||||
|
||||
// Don't need the network any more
|
||||
network->destroy();
|
||||
|
||||
// Release host memory
|
||||
for (auto& mem : weightMap) {
|
||||
free((void*)(mem.second.values));
|
||||
}
|
||||
|
||||
return engine;
|
||||
}
|
||||
|
||||
ICudaEngine* build_seg_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes) {
|
||||
INetworkDefinition* network = builder->createNetworkV2(0U);
|
||||
ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, input_h, input_w });
|
||||
assert(data);
|
||||
std::map<std::string, Weights> weightMap = loadWeights(wts_name);
|
||||
|
||||
// Backbone
|
||||
auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0");
|
||||
assert(conv0);
|
||||
auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
|
||||
auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
|
||||
auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
|
||||
auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4");
|
||||
auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
|
||||
auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
|
||||
auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
|
||||
auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
|
||||
auto spp9 = SPPF(network, weightMap, *bottleneck_csp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.9");
|
||||
|
||||
// Head
|
||||
auto conv10 = convBlock(network, weightMap, *spp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");
|
||||
|
||||
auto upsample11 = network->addResize(*conv10->getOutput(0));
|
||||
assert(upsample11);
|
||||
upsample11->setResizeMode(ResizeMode::kNEAREST);
|
||||
upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions());
|
||||
|
||||
ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) };
|
||||
auto cat12 = network->addConcatenation(inputTensors12, 2);
|
||||
auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13");
|
||||
auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");
|
||||
|
||||
auto upsample15 = network->addResize(*conv14->getOutput(0));
|
||||
assert(upsample15);
|
||||
upsample15->setResizeMode(ResizeMode::kNEAREST);
|
||||
upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions());
|
||||
|
||||
ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) };
|
||||
auto cat16 = network->addConcatenation(inputTensors16, 2);
|
||||
|
||||
auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17");
|
||||
|
||||
// Segmentation
|
||||
IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (32 + n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
|
||||
auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
|
||||
ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
|
||||
auto cat19 = network->addConcatenation(inputTensors19, 2);
|
||||
auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20");
|
||||
IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (32 + n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
|
||||
auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
|
||||
ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
|
||||
auto cat22 = network->addConcatenation(inputTensors22, 2);
|
||||
auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
|
||||
IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (32 + n_classes + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
|
||||
|
||||
auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<IConvolutionLayer*>{det0, det1, det2}, input_h, input_w, n_classes, true);
|
||||
yolo->getOutput(0)->setName(kOutputTensorName);
|
||||
network->markOutput(*yolo->getOutput(0));
|
||||
|
||||
auto proto = Proto(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 32, "model.24.proto");
|
||||
proto->getOutput(0)->setName("proto");
|
||||
network->markOutput(*proto->getOutput(0));
|
||||
|
||||
// Engine config
|
||||
builder->setMaxBatchSize(maxBatchSize);
|
||||
config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB
|
||||
#if defined(USE_FP16)
|
||||
config->setFlag(BuilderFlag::kFP16);
|
||||
#elif defined(USE_INT8)
|
||||
std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
|
||||
assert(builder->platformHasFastInt8());
|
||||
config->setFlag(BuilderFlag::kINT8);
|
||||
Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, input_w, input_h, "./coco_calib/", "int8calib.table", kInputTensorName);
|
||||
config->setInt8Calibrator(calibrator);
|
||||
#endif
|
||||
|
||||
std::cout << "Building engine, please wait for a while..." << std::endl;
|
||||
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
|
||||
std::cout << "Build engine successfully!" << std::endl;
|
||||
|
||||
// Don't need the network any more
|
||||
network->destroy();
|
||||
|
||||
// Release host memory
|
||||
for (auto& mem : weightMap) {
|
||||
free((void*)(mem.second.values));
|
||||
}
|
||||
|
||||
return engine;
|
||||
}
|
||||
|
||||
16
algorithm/common_det/cuda/yolov7/model.h
Normal file
16
algorithm/common_det/cuda/yolov7/model.h
Normal file
@@ -0,0 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include <NvInfer.h>
|
||||
#include <string>
|
||||
|
||||
nvinfer1::ICudaEngine* build_det_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
|
||||
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
|
||||
float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes);
|
||||
|
||||
nvinfer1::ICudaEngine* build_det_p6_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
|
||||
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
|
||||
float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes);
|
||||
|
||||
nvinfer1::ICudaEngine* build_cls_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name);
|
||||
|
||||
nvinfer1::ICudaEngine* build_seg_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name, int input_h, int input_w, int n_classes);
|
||||
198
algorithm/common_det/cuda/yolov7/postprocess.cpp
Normal file
198
algorithm/common_det/cuda/yolov7/postprocess.cpp
Normal file
@@ -0,0 +1,198 @@
|
||||
#include "postprocess.h"
|
||||
#include "utils.h"
|
||||
|
||||
|
||||
cv::Rect get_rect(cv::Mat& img, float bbox[4], int input_h, int input_w)
|
||||
{
|
||||
float l, r, t, b;
|
||||
float r_w = input_w / (img.cols * 1.0);
|
||||
float r_h = input_h / (img.rows * 1.0);
|
||||
if (r_h > r_w) {
|
||||
l = bbox[0] - bbox[2] / 2.f;
|
||||
r = bbox[0] + bbox[2] / 2.f;
|
||||
t = bbox[1] - bbox[3] / 2.f - (input_h - r_w * img.rows) / 2;
|
||||
b = bbox[1] + bbox[3] / 2.f - (input_h - r_w * img.rows) / 2;
|
||||
l = l / r_w;
|
||||
r = r / r_w;
|
||||
t = t / r_w;
|
||||
b = b / r_w;
|
||||
} else {
|
||||
l = bbox[0] - bbox[2] / 2.f - (input_w - r_h * img.cols) / 2;
|
||||
r = bbox[0] + bbox[2] / 2.f - (input_w - r_h * img.cols) / 2;
|
||||
t = bbox[1] - bbox[3] / 2.f;
|
||||
b = bbox[1] + bbox[3] / 2.f;
|
||||
l = l / r_h;
|
||||
r = r / r_h;
|
||||
t = t / r_h;
|
||||
b = b / r_h;
|
||||
}
|
||||
return cv::Rect(round(l), round(t), round(r - l), round(b - t));
|
||||
}
|
||||
|
||||
static float iou(float lbox[4], float rbox[4]) {
|
||||
float interBox[] = {
|
||||
(std::max)(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left
|
||||
(std::min)(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right
|
||||
(std::max)(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top
|
||||
(std::min)(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom
|
||||
};
|
||||
|
||||
if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
|
||||
return 0.0f;
|
||||
|
||||
float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]);
|
||||
return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
|
||||
}
|
||||
|
||||
static bool cmp(const Detection& a, const Detection& b) {
|
||||
return a.conf > b.conf;
|
||||
}
|
||||
|
||||
void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh) {
|
||||
int det_size = sizeof(Detection) / sizeof(float);
|
||||
std::map<float, std::vector<Detection>> m;
|
||||
for (int i = 0; i < output[0] && i < kMaxNumOutputBbox; i++) {
|
||||
if (output[1 + det_size * i + 4] <= conf_thresh) continue;
|
||||
Detection det;
|
||||
memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
|
||||
if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Detection>());
|
||||
m[det.class_id].push_back(det);
|
||||
}
|
||||
for (auto it = m.begin(); it != m.end(); it++) {
|
||||
auto& dets = it->second;
|
||||
std::sort(dets.begin(), dets.end(), cmp);
|
||||
for (size_t m = 0; m < dets.size(); ++m) {
|
||||
auto& item = dets[m];
|
||||
res.push_back(item);
|
||||
for (size_t n = m + 1; n < dets.size(); ++n) {
|
||||
if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
|
||||
dets.erase(dets.begin() + n);
|
||||
--n;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void batch_nms(std::vector<std::vector<Detection>>& res_batch, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh) {
|
||||
res_batch.resize(batch_size);
|
||||
for (int i = 0; i < batch_size; i++) {
|
||||
nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh);
|
||||
}
|
||||
}
|
||||
|
||||
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch, int input_h, int input_w) {
|
||||
for (size_t i = 0; i < img_batch.size(); i++) {
|
||||
auto& res = res_batch[i];
|
||||
cv::Mat img = img_batch[i];
|
||||
for (size_t j = 0; j < res.size(); j++) {
|
||||
cv::Rect r = get_rect(img, res[j].bbox, input_h, input_w);
|
||||
cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
|
||||
cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static cv::Rect get_downscale_rect(float bbox[4], float scale) {
|
||||
float left = bbox[0] - bbox[2] / 2;
|
||||
float top = bbox[1] - bbox[3] / 2;
|
||||
float right = bbox[0] + bbox[2] / 2;
|
||||
float bottom = bbox[1] + bbox[3] / 2;
|
||||
left /= scale;
|
||||
top /= scale;
|
||||
right /= scale;
|
||||
bottom /= scale;
|
||||
return cv::Rect(round(left), round(top), round(right - left), round(bottom - top));
|
||||
}
|
||||
|
||||
std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets, int input_h, int input_w) {
|
||||
std::vector<cv::Mat> masks;
|
||||
for (size_t i = 0; i < dets.size(); i++) {
|
||||
cv::Mat mask_mat = cv::Mat::zeros(input_h / 4, input_w / 4, CV_32FC1);
|
||||
auto r = get_downscale_rect(dets[i].bbox, 4);
|
||||
if (r.x < 0) r.x = 0;
|
||||
if (r.y < 0) r.y = 0;
|
||||
if (r.x + r.width > mask_mat.cols) r.width = mask_mat.cols - r.x;
|
||||
if (r.y + r.height > mask_mat.rows) r.height = mask_mat.rows - r.y;
|
||||
// printf(" %d, %d, %d, %d, (%d, %d)\n", r.x, r.y, r.width, r.height, mask_mat.cols, mask_mat.rows);
|
||||
|
||||
for (int x = r.x; x < r.x + r.width; x++) {
|
||||
for (int y = r.y; y < r.y + r.height; y++) {
|
||||
float e = 0.0f;
|
||||
for (int j = 0; j < 32; j++) {
|
||||
e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x];
|
||||
}
|
||||
e = 1.0f / (1.0f + expf(-e));
|
||||
mask_mat.at<float>(y, x) = e;
|
||||
}
|
||||
}
|
||||
|
||||
cv::resize(mask_mat, mask_mat, cv::Size(input_w, input_h));
|
||||
masks.push_back(mask_mat);
|
||||
}
|
||||
return masks;
|
||||
}
|
||||
|
||||
cv::Mat scale_mask(cv::Mat mask, cv::Mat img, int input_h, int input_w) {
|
||||
int x, y, w, h;
|
||||
float r_w = input_w / (img.cols * 1.0);
|
||||
float r_h = input_h / (img.rows * 1.0);
|
||||
if (r_h > r_w) {
|
||||
w = input_w;
|
||||
h = r_w * img.rows;
|
||||
x = 0;
|
||||
y = (input_h - h) / 2;
|
||||
} else {
|
||||
w = r_h * img.cols;
|
||||
h = input_h;
|
||||
x = (input_w - w) / 2;
|
||||
y = 0;
|
||||
}
|
||||
cv::Rect r(x, y, w, h);
|
||||
cv::Mat res;
|
||||
cv::resize(mask(r), res, img.size());
|
||||
return res;
|
||||
}
|
||||
|
||||
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks, std::unordered_map<int, std::string>& labels_map, int input_h, int input_w) {
|
||||
static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A,
|
||||
0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF,
|
||||
0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF,
|
||||
0xFF95C8, 0xFF37C7};
|
||||
for (size_t i = 0; i < dets.size(); i++) {
|
||||
cv::Mat img_mask = scale_mask(masks[i], img, input_h, input_w);
|
||||
auto color = colors[(int)dets[i].class_id % colors.size()];
|
||||
auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);
|
||||
|
||||
cv::Rect r = get_rect(img, dets[i].bbox, input_h, input_w);
|
||||
for (int x = r.x; x < r.x + r.width; x++) {
|
||||
for (int y = r.y; y < r.y + r.height; y++) {
|
||||
float val = img_mask.at<float>(y, x);
|
||||
if (val <= 0.5) continue;
|
||||
img.at<cv::Vec3b>(y, x)[0] = img.at<cv::Vec3b>(y, x)[0] / 2 + bgr[0] / 2;
|
||||
img.at<cv::Vec3b>(y, x)[1] = img.at<cv::Vec3b>(y, x)[1] / 2 + bgr[1] / 2;
|
||||
img.at<cv::Vec3b>(y, x)[2] = img.at<cv::Vec3b>(y, x)[2] / 2 + bgr[2] / 2;
|
||||
}
|
||||
}
|
||||
|
||||
cv::rectangle(img, r, bgr, 2);
|
||||
|
||||
// Get the size of the text
|
||||
cv::Size textSize = cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL);
|
||||
// Set the top left corner of the rectangle
|
||||
cv::Point topLeft(r.x, r.y - textSize.height);
|
||||
|
||||
// Set the bottom right corner of the rectangle
|
||||
cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);
|
||||
|
||||
// Set the thickness of the rectangle lines
|
||||
int lineThickness = 2;
|
||||
|
||||
// Draw the rectangle on the image
|
||||
cv::rectangle(img, topLeft, bottomRight, bgr, -1);
|
||||
|
||||
cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
16
algorithm/common_det/cuda/yolov7/postprocess.h
Normal file
16
algorithm/common_det/cuda/yolov7/postprocess.h
Normal file
@@ -0,0 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include "types.h"
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
cv::Rect get_rect(cv::Mat& img, float bbox[4], int input_h, int input_w);
|
||||
|
||||
void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5);
|
||||
|
||||
void batch_nms(std::vector<std::vector<Detection>>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);
|
||||
|
||||
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch, int input_h, int input_w);
|
||||
|
||||
std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets, int input_h, int input_w);
|
||||
|
||||
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks, std::unordered_map<int, std::string>& labels_map, int input_h, int input_w);
|
||||
153
algorithm/common_det/cuda/yolov7/preprocess.cu
Normal file
153
algorithm/common_det/cuda/yolov7/preprocess.cu
Normal file
@@ -0,0 +1,153 @@
|
||||
#include "preprocess.h"
|
||||
#include "cuda_utils.h"
|
||||
|
||||
static uint8_t* img_buffer_host = nullptr;
|
||||
static uint8_t* img_buffer_device = nullptr;
|
||||
|
||||
struct AffineMatrix {
|
||||
float value[6];
|
||||
};
|
||||
|
||||
__global__ void warpaffine_kernel(
|
||||
uint8_t* src, int src_line_size, int src_width,
|
||||
int src_height, float* dst, int dst_width,
|
||||
int dst_height, uint8_t const_value_st,
|
||||
AffineMatrix d2s, int edge) {
|
||||
int position = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (position >= edge) return;
|
||||
|
||||
float m_x1 = d2s.value[0];
|
||||
float m_y1 = d2s.value[1];
|
||||
float m_z1 = d2s.value[2];
|
||||
float m_x2 = d2s.value[3];
|
||||
float m_y2 = d2s.value[4];
|
||||
float m_z2 = d2s.value[5];
|
||||
|
||||
int dx = position % dst_width;
|
||||
int dy = position / dst_width;
|
||||
float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
|
||||
float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
|
||||
float c0, c1, c2;
|
||||
|
||||
if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
|
||||
// out of range
|
||||
c0 = const_value_st;
|
||||
c1 = const_value_st;
|
||||
c2 = const_value_st;
|
||||
} else {
|
||||
int y_low = floorf(src_y);
|
||||
int x_low = floorf(src_x);
|
||||
int y_high = y_low + 1;
|
||||
int x_high = x_low + 1;
|
||||
|
||||
uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
|
||||
float ly = src_y - y_low;
|
||||
float lx = src_x - x_low;
|
||||
float hy = 1 - ly;
|
||||
float hx = 1 - lx;
|
||||
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
|
||||
uint8_t* v1 = const_value;
|
||||
uint8_t* v2 = const_value;
|
||||
uint8_t* v3 = const_value;
|
||||
uint8_t* v4 = const_value;
|
||||
|
||||
if (y_low >= 0) {
|
||||
if (x_low >= 0)
|
||||
v1 = src + y_low * src_line_size + x_low * 3;
|
||||
|
||||
if (x_high < src_width)
|
||||
v2 = src + y_low * src_line_size + x_high * 3;
|
||||
}
|
||||
|
||||
if (y_high < src_height) {
|
||||
if (x_low >= 0)
|
||||
v3 = src + y_high * src_line_size + x_low * 3;
|
||||
|
||||
if (x_high < src_width)
|
||||
v4 = src + y_high * src_line_size + x_high * 3;
|
||||
}
|
||||
|
||||
c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
|
||||
c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
|
||||
c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
|
||||
}
|
||||
|
||||
// bgr to rgb
|
||||
float t = c2;
|
||||
c2 = c0;
|
||||
c0 = t;
|
||||
|
||||
// normalization
|
||||
c0 = c0 / 255.0f;
|
||||
c1 = c1 / 255.0f;
|
||||
c2 = c2 / 255.0f;
|
||||
|
||||
// rgbrgbrgb to rrrgggbbb
|
||||
int area = dst_width * dst_height;
|
||||
float* pdst_c0 = dst + dy * dst_width + dx;
|
||||
float* pdst_c1 = pdst_c0 + area;
|
||||
float* pdst_c2 = pdst_c1 + area;
|
||||
*pdst_c0 = c0;
|
||||
*pdst_c1 = c1;
|
||||
*pdst_c2 = c2;
|
||||
}
|
||||
|
||||
void cuda_preprocess(
|
||||
uint8_t* src, int src_width, int src_height,
|
||||
float* dst, int dst_width, int dst_height,
|
||||
cudaStream_t stream) {
|
||||
|
||||
int img_size = src_width * src_height * 3;
|
||||
// copy data to pinned memory
|
||||
memcpy(img_buffer_host, src, img_size);
|
||||
// copy data to device memory
|
||||
CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));
|
||||
|
||||
AffineMatrix s2d, d2s;
|
||||
float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);
|
||||
|
||||
s2d.value[0] = scale;
|
||||
s2d.value[1] = 0;
|
||||
s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
|
||||
s2d.value[3] = 0;
|
||||
s2d.value[4] = scale;
|
||||
s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
|
||||
|
||||
cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
|
||||
cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
|
||||
cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
|
||||
|
||||
memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
|
||||
|
||||
int jobs = dst_height * dst_width;
|
||||
int threads = 256;
|
||||
int blocks = ceil(jobs / (float)threads);
|
||||
|
||||
warpaffine_kernel<<<blocks, threads, 0, stream>>>(
|
||||
img_buffer_device, src_width * 3, src_width,
|
||||
src_height, dst, dst_width,
|
||||
dst_height, 128, d2s, jobs);
|
||||
}
|
||||
|
||||
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
|
||||
float* dst, int dst_width, int dst_height,
|
||||
cudaStream_t stream) {
|
||||
int dst_size = dst_width * dst_height * 3;
|
||||
for (size_t i = 0; i < img_batch.size(); i++) {
|
||||
cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width, dst_height, stream);
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_preprocess_init(int max_image_size) {
|
||||
// prepare input data in pinned memory
|
||||
CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3));
|
||||
// prepare input data in device memory
|
||||
CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3));
|
||||
}
|
||||
|
||||
void cuda_preprocess_destroy() {
|
||||
CUDA_CHECK(cudaFree(img_buffer_device));
|
||||
CUDA_CHECK(cudaFreeHost(img_buffer_host));
|
||||
}
|
||||
|
||||
15
algorithm/common_det/cuda/yolov7/preprocess.h
Normal file
15
algorithm/common_det/cuda/yolov7/preprocess.h
Normal file
@@ -0,0 +1,15 @@
|
||||
#pragma once
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstdint>
|
||||
#include <opencv2/opencv.hpp>
|
||||
|
||||
void cuda_preprocess_init(int max_image_size);
|
||||
void cuda_preprocess_destroy();
|
||||
void cuda_preprocess(uint8_t* src, int src_width, int src_height,
|
||||
float* dst, int dst_width, int dst_height,
|
||||
cudaStream_t stream);
|
||||
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
|
||||
float* dst, int dst_width, int dst_height,
|
||||
cudaStream_t stream);
|
||||
|
||||
17
algorithm/common_det/cuda/yolov7/types.h
Normal file
17
algorithm/common_det/cuda/yolov7/types.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include "config.h"
|
||||
|
||||
struct YoloKernel {
|
||||
int width;
|
||||
int height;
|
||||
float anchors[kNumAnchor * 2];
|
||||
};
|
||||
|
||||
struct alignas(float) Detection {
|
||||
float bbox[4]; // center_x center_y w h
|
||||
float conf; // bbox_conf * cls_conf
|
||||
float class_id;
|
||||
float mask[32];
|
||||
};
|
||||
|
||||
70
algorithm/common_det/cuda/yolov7/utils.h
Normal file
70
algorithm/common_det/cuda/yolov7/utils.h
Normal file
@@ -0,0 +1,70 @@
|
||||
#pragma once
|
||||
|
||||
#include <dirent.h>
|
||||
#include <fstream>
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
|
||||
DIR *p_dir = opendir(p_dir_name);
|
||||
if (p_dir == nullptr) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct dirent* p_file = nullptr;
|
||||
while ((p_file = readdir(p_dir)) != nullptr) {
|
||||
if (strcmp(p_file->d_name, ".") != 0 &&
|
||||
strcmp(p_file->d_name, "..") != 0) {
|
||||
//std::string cur_file_name(p_dir_name);
|
||||
//cur_file_name += "/";
|
||||
//cur_file_name += p_file->d_name;
|
||||
std::string cur_file_name(p_file->d_name);
|
||||
file_names.push_back(cur_file_name);
|
||||
}
|
||||
}
|
||||
|
||||
closedir(p_dir);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Function to trim leading and trailing whitespace from a string
|
||||
static inline std::string trim_leading_whitespace(const std::string& str) {
|
||||
size_t first = str.find_first_not_of(' ');
|
||||
if (std::string::npos == first) {
|
||||
return str;
|
||||
}
|
||||
size_t last = str.find_last_not_of(' ');
|
||||
return str.substr(first, (last - first + 1));
|
||||
}
|
||||
|
||||
// Src: https://stackoverflow.com/questions/16605967
|
||||
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
|
||||
std::ostringstream out;
|
||||
out.precision(n);
|
||||
out << std::fixed << a_value;
|
||||
return out.str();
|
||||
}
|
||||
|
||||
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
|
||||
|
||||
std::ifstream file(labels_filename);
|
||||
// Read each line of the file
|
||||
std::string line;
|
||||
int index = 0;
|
||||
while (std::getline(file, line)) {
|
||||
// Strip the line of any leading or trailing whitespace
|
||||
line = trim_leading_whitespace(line);
|
||||
|
||||
// Add the stripped line to the labels_map, using the loop index as the key
|
||||
labels_map[index] = line;
|
||||
index++;
|
||||
}
|
||||
// Close the file
|
||||
file.close();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
280
algorithm/common_det/cuda/yolov7/yololayer.cu
Normal file
280
algorithm/common_det/cuda/yolov7/yololayer.cu
Normal file
@@ -0,0 +1,280 @@
|
||||
#include "yololayer.h"
|
||||
#include "cuda_utils.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
namespace Tn {
|
||||
template<typename T>
|
||||
void write(char*& buffer, const T& val) {
|
||||
*reinterpret_cast<T*>(buffer) = val;
|
||||
buffer += sizeof(T);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void read(const char*& buffer, T& val) {
|
||||
val = *reinterpret_cast<const T*>(buffer);
|
||||
buffer += sizeof(T);
|
||||
}
|
||||
}
|
||||
|
||||
namespace nvinfer1 {
|
||||
YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const std::vector<YoloKernel>& vYoloKernel) {
|
||||
mClassCount = classCount;
|
||||
mYoloV5NetWidth = netWidth;
|
||||
mYoloV5NetHeight = netHeight;
|
||||
mMaxOutObject = maxOut;
|
||||
is_segmentation_ = is_segmentation;
|
||||
mYoloKernel = vYoloKernel;
|
||||
mKernelCount = vYoloKernel.size();
|
||||
|
||||
CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
|
||||
size_t AnchorLen = sizeof(float)* kNumAnchor * 2;
|
||||
for (int ii = 0; ii < mKernelCount; ii++) {
|
||||
CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
|
||||
const auto& yolo = mYoloKernel[ii];
|
||||
CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
|
||||
}
|
||||
}
|
||||
|
||||
YoloLayerPlugin::~YoloLayerPlugin() {
|
||||
for (int ii = 0; ii < mKernelCount; ii++) {
|
||||
CUDA_CHECK(cudaFree(mAnchor[ii]));
|
||||
}
|
||||
CUDA_CHECK(cudaFreeHost(mAnchor));
|
||||
}
|
||||
|
||||
// create the plugin at runtime from a byte stream
|
||||
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
|
||||
using namespace Tn;
|
||||
const char *d = reinterpret_cast<const char *>(data), *a = d;
|
||||
read(d, mClassCount);
|
||||
read(d, mThreadCount);
|
||||
read(d, mKernelCount);
|
||||
read(d, mYoloV5NetWidth);
|
||||
read(d, mYoloV5NetHeight);
|
||||
read(d, mMaxOutObject);
|
||||
read(d, is_segmentation_);
|
||||
mYoloKernel.resize(mKernelCount);
|
||||
auto kernelSize = mKernelCount * sizeof(YoloKernel);
|
||||
memcpy(mYoloKernel.data(), d, kernelSize);
|
||||
d += kernelSize;
|
||||
CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
|
||||
size_t AnchorLen = sizeof(float)* kNumAnchor * 2;
|
||||
for (int ii = 0; ii < mKernelCount; ii++) {
|
||||
CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
|
||||
const auto& yolo = mYoloKernel[ii];
|
||||
CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
|
||||
}
|
||||
assert(d == a + length);
|
||||
}
|
||||
|
||||
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
|
||||
using namespace Tn;
|
||||
char* d = static_cast<char*>(buffer), *a = d;
|
||||
write(d, mClassCount);
|
||||
write(d, mThreadCount);
|
||||
write(d, mKernelCount);
|
||||
write(d, mYoloV5NetWidth);
|
||||
write(d, mYoloV5NetHeight);
|
||||
write(d, mMaxOutObject);
|
||||
write(d, is_segmentation_);
|
||||
auto kernelSize = mKernelCount * sizeof(YoloKernel);
|
||||
memcpy(d, mYoloKernel.data(), kernelSize);
|
||||
d += kernelSize;
|
||||
|
||||
assert(d == a + getSerializationSize());
|
||||
}
|
||||
|
||||
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
|
||||
size_t s = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount);
|
||||
s += sizeof(YoloKernel) * mYoloKernel.size();
|
||||
s += sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight);
|
||||
s += sizeof(mMaxOutObject) + sizeof(is_segmentation_);
|
||||
return s;
|
||||
}
|
||||
|
||||
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
|
||||
return 0;
|
||||
}
|
||||
|
||||
Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT {
|
||||
//output the result to channel
|
||||
int totalsize = mMaxOutObject * sizeof(Detection) / sizeof(float);
|
||||
return Dims3(totalsize + 1, 1, 1);
|
||||
}
|
||||
|
||||
// Set plugin namespace
|
||||
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
|
||||
mPluginNamespace = pluginNamespace;
|
||||
}
|
||||
|
||||
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
|
||||
return mPluginNamespace;
|
||||
}
|
||||
|
||||
// Return the DataType of the plugin output at the requested index
|
||||
DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT {
|
||||
return DataType::kFLOAT;
|
||||
}
|
||||
|
||||
// Return true if output tensor is broadcast across a batch.
|
||||
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Return true if plugin can use input that is broadcast across batch without replication.
|
||||
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
|
||||
return false;
|
||||
}
|
||||
|
||||
void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT {}
|
||||
|
||||
// Attach the plugin object to an execution context and grant the plugin the access to some context resource.
|
||||
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}
|
||||
|
||||
// Detach the plugin object from its execution context.
|
||||
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}
|
||||
|
||||
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
|
||||
return "YoloLayer_TRT";
|
||||
}
|
||||
|
||||
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
|
||||
return "1";
|
||||
}
|
||||
|
||||
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
|
||||
delete this;
|
||||
}
|
||||
|
||||
// Clone the plugin
|
||||
IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
|
||||
YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, is_segmentation_, mYoloKernel);
|
||||
p->setPluginNamespace(mPluginNamespace);
|
||||
return p;
|
||||
}
|
||||
|
||||
__device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); };
|
||||
|
||||
__global__ void CalDetection(const float *input, float *output, int noElements,
|
||||
const int netwidth, const int netheight, int maxoutobject, int yoloWidth,
|
||||
int yoloHeight, const float anchors[kNumAnchor * 2], int classes, int outputElem, bool is_segmentation) {
|
||||
|
||||
int idx = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if (idx >= noElements) return;
|
||||
|
||||
int total_grid = yoloWidth * yoloHeight;
|
||||
int bnIdx = idx / total_grid;
|
||||
idx = idx - total_grid * bnIdx;
|
||||
int info_len_i = 5 + classes;
|
||||
if (is_segmentation) info_len_i += 32;
|
||||
const float* curInput = input + bnIdx * (info_len_i * total_grid * kNumAnchor);
|
||||
|
||||
for (int k = 0; k < kNumAnchor; ++k) {
|
||||
float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
|
||||
if (box_prob < kIgnoreThresh) continue;
|
||||
int class_id = 0;
|
||||
float max_cls_prob = 0.0;
|
||||
for (int i = 5; i < 5 + classes; ++i) {
|
||||
float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
|
||||
if (p > max_cls_prob) {
|
||||
max_cls_prob = p;
|
||||
class_id = i - 5;
|
||||
}
|
||||
}
|
||||
float *res_count = output + bnIdx * outputElem;
|
||||
int count = (int)atomicAdd(res_count, 1);
|
||||
if (count >= maxoutobject) return;
|
||||
char *data = (char*)res_count + sizeof(float) + count * sizeof(Detection);
|
||||
Detection *det = (Detection*)(data);
|
||||
|
||||
int row = idx / yoloWidth;
|
||||
int col = idx % yoloWidth;
|
||||
|
||||
det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth;
|
||||
det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight;
|
||||
|
||||
det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]);
|
||||
det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k];
|
||||
det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]);
|
||||
det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1];
|
||||
det->conf = box_prob * max_cls_prob;
|
||||
det->class_id = class_id;
|
||||
|
||||
for (int i = 0; is_segmentation && i < 32; i++) {
|
||||
det->mask[i] = curInput[idx + k * info_len_i * total_grid + (i + 5 + classes) * total_grid];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize) {
|
||||
int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
|
||||
for (int idx = 0; idx < batchSize; ++idx) {
|
||||
CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
|
||||
}
|
||||
int numElem = 0;
|
||||
for (unsigned int i = 0; i < mYoloKernel.size(); ++i) {
|
||||
const auto& yolo = mYoloKernel[i];
|
||||
numElem = yolo.width * yolo.height * batchSize;
|
||||
if (numElem < mThreadCount) mThreadCount = numElem;
|
||||
|
||||
CalDetection << < (numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream >> >
|
||||
(inputs[i], output, numElem, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, yolo.width, yolo.height, (float*)mAnchor[i], mClassCount, outputElem, is_segmentation_);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
|
||||
forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize);
|
||||
return 0;
|
||||
}
|
||||
|
||||
PluginFieldCollection YoloPluginCreator::mFC{};
|
||||
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
|
||||
|
||||
YoloPluginCreator::YoloPluginCreator() {
|
||||
mPluginAttributes.clear();
|
||||
mFC.nbFields = mPluginAttributes.size();
|
||||
mFC.fields = mPluginAttributes.data();
|
||||
}
|
||||
|
||||
const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
|
||||
return "YoloLayer_TRT";
|
||||
}
|
||||
|
||||
const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
|
||||
return "1";
|
||||
}
|
||||
|
||||
const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
|
||||
return &mFC;
|
||||
}
|
||||
|
||||
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
|
||||
assert(fc->nbFields == 2);
|
||||
assert(strcmp(fc->fields[0].name, "netinfo") == 0);
|
||||
assert(strcmp(fc->fields[1].name, "kernels") == 0);
|
||||
int *p_netinfo = (int*)(fc->fields[0].data);
|
||||
int class_count = p_netinfo[0];
|
||||
int input_w = p_netinfo[1];
|
||||
int input_h = p_netinfo[2];
|
||||
int max_output_object_count = p_netinfo[3];
|
||||
bool is_segmentation = (bool)p_netinfo[4];
|
||||
std::vector<YoloKernel> kernels(fc->fields[1].length);
|
||||
memcpy(&kernels[0], fc->fields[1].data, kernels.size() * sizeof(YoloKernel));
|
||||
YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation, kernels);
|
||||
obj->setPluginNamespace(mNamespace.c_str());
|
||||
return obj;
|
||||
}
|
||||
|
||||
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT {
|
||||
// This object will be deleted when the network is destroyed, which will
|
||||
// call YoloLayerPlugin::destroy()
|
||||
YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
|
||||
obj->setPluginNamespace(mNamespace.c_str());
|
||||
return obj;
|
||||
}
|
||||
}
|
||||
|
||||
106
algorithm/common_det/cuda/yolov7/yololayer.h
Normal file
106
algorithm/common_det/cuda/yolov7/yololayer.h
Normal file
@@ -0,0 +1,106 @@
|
||||
#pragma once
|
||||
|
||||
#include "types.h"
|
||||
#include "macros.h"
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
namespace nvinfer1 {
|
||||
class API YoloLayerPlugin : public IPluginV2IOExt {
|
||||
public:
|
||||
YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const std::vector<YoloKernel>& vYoloKernel);
|
||||
YoloLayerPlugin(const void* data, size_t length);
|
||||
~YoloLayerPlugin();
|
||||
|
||||
int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
|
||||
|
||||
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;
|
||||
|
||||
int initialize() TRT_NOEXCEPT override;
|
||||
|
||||
virtual void terminate() TRT_NOEXCEPT override {};
|
||||
|
||||
virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }
|
||||
|
||||
virtual int enqueue(int batchSize, const void* const* inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
|
||||
|
||||
virtual size_t getSerializationSize() const TRT_NOEXCEPT override;
|
||||
|
||||
virtual void serialize(void* buffer) const TRT_NOEXCEPT override;
|
||||
|
||||
bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
|
||||
return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
|
||||
}
|
||||
|
||||
const char* getPluginType() const TRT_NOEXCEPT override;
|
||||
|
||||
const char* getPluginVersion() const TRT_NOEXCEPT override;
|
||||
|
||||
void destroy() TRT_NOEXCEPT override;
|
||||
|
||||
IPluginV2IOExt* clone() const TRT_NOEXCEPT override;
|
||||
|
||||
void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;
|
||||
|
||||
const char* getPluginNamespace() const TRT_NOEXCEPT override;
|
||||
|
||||
DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override;
|
||||
|
||||
bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;
|
||||
|
||||
bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;
|
||||
|
||||
void attachToContext(
|
||||
cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;
|
||||
|
||||
void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override;
|
||||
|
||||
void detachFromContext() TRT_NOEXCEPT override;
|
||||
|
||||
private:
|
||||
void forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize = 1);
|
||||
int mThreadCount = 256;
|
||||
const char* mPluginNamespace;
|
||||
int mKernelCount;
|
||||
int mClassCount;
|
||||
int mYoloV5NetWidth;
|
||||
int mYoloV5NetHeight;
|
||||
int mMaxOutObject;
|
||||
bool is_segmentation_;
|
||||
std::vector<YoloKernel> mYoloKernel;
|
||||
void** mAnchor;
|
||||
};
|
||||
|
||||
class API YoloPluginCreator : public IPluginCreator {
|
||||
public:
|
||||
YoloPluginCreator();
|
||||
|
||||
~YoloPluginCreator() override = default;
|
||||
|
||||
const char* getPluginName() const TRT_NOEXCEPT override;
|
||||
|
||||
const char* getPluginVersion() const TRT_NOEXCEPT override;
|
||||
|
||||
const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;
|
||||
|
||||
IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override;
|
||||
|
||||
IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;
|
||||
|
||||
void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override {
|
||||
mNamespace = libNamespace;
|
||||
}
|
||||
|
||||
const char* getPluginNamespace() const TRT_NOEXCEPT override {
|
||||
return mNamespace.c_str();
|
||||
}
|
||||
|
||||
private:
|
||||
std::string mNamespace;
|
||||
static PluginFieldCollection mFC;
|
||||
static std::vector<PluginField> mPluginAttributes;
|
||||
};
|
||||
REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
|
||||
};
|
||||
|
||||
62
algorithm/common_det/sv_common_det.cpp
Normal file
62
algorithm/common_det/sv_common_det.cpp
Normal file
@@ -0,0 +1,62 @@
|
||||
#include "sv_common_det.h"
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
#include <NvInfer.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "common_det_cuda_impl.h"
|
||||
#endif
|
||||
|
||||
|
||||
namespace sv {
|
||||
|
||||
|
||||
CommonObjectDetector::CommonObjectDetector()
|
||||
{
|
||||
this->_cuda_impl = new CommonObjectDetectorCUDAImpl;
|
||||
}
|
||||
CommonObjectDetector::~CommonObjectDetector()
|
||||
{
|
||||
}
|
||||
|
||||
bool CommonObjectDetector::setupImpl()
|
||||
{
|
||||
#ifdef WITH_CUDA
|
||||
return this->_cuda_impl->cudaSetup(this);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
void CommonObjectDetector::detectImpl(
|
||||
cv::Mat img_,
|
||||
std::vector<float>& boxes_x_,
|
||||
std::vector<float>& boxes_y_,
|
||||
std::vector<float>& boxes_w_,
|
||||
std::vector<float>& boxes_h_,
|
||||
std::vector<int>& boxes_label_,
|
||||
std::vector<float>& boxes_score_,
|
||||
std::vector<cv::Mat>& boxes_seg_
|
||||
)
|
||||
{
|
||||
#ifdef WITH_CUDA
|
||||
this->_cuda_impl->cudaDetect(
|
||||
this,
|
||||
img_,
|
||||
boxes_x_,
|
||||
boxes_y_,
|
||||
boxes_w_,
|
||||
boxes_h_,
|
||||
boxes_label_,
|
||||
boxes_score_,
|
||||
boxes_seg_
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
3863
algorithm/ellipse_det/ellipse_detector.cpp
Normal file
3863
algorithm/ellipse_det/ellipse_detector.cpp
Normal file
File diff suppressed because it is too large
Load Diff
1379
algorithm/ellipse_det/ellipse_detector.h
Normal file
1379
algorithm/ellipse_det/ellipse_detector.h
Normal file
File diff suppressed because it is too large
Load Diff
160
algorithm/landing_det/cuda/landing_det_cuda_impl.cpp
Normal file
160
algorithm/landing_det/cuda/landing_det_cuda_impl.cpp
Normal file
@@ -0,0 +1,160 @@
|
||||
#include "landing_det_cuda_impl.h"
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
|
||||
#define SV_MODEL_DIR "/SpireCV/models/"
|
||||
#define SV_ROOT_DIR "/SpireCV/"
|
||||
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
#include "yolov7/logging.h"
|
||||
#define TRTCHECK(status) \
|
||||
do \
|
||||
{ \
|
||||
auto ret = (status); \
|
||||
if (ret != 0) \
|
||||
{ \
|
||||
std::cerr << "Cuda failure: " << ret << std::endl; \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define DEVICE 0 // GPU id
|
||||
#define BATCH_SIZE 1
|
||||
#define MAX_IMAGE_INPUT_SIZE_THRESH 3000 * 3000 // ensure it exceed the maximum size in the input images !
|
||||
#endif
|
||||
|
||||
|
||||
namespace sv {
|
||||
|
||||
using namespace cv;
|
||||
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
using namespace nvinfer1;
|
||||
static Logger g_nvlogger;
|
||||
#endif
|
||||
|
||||
|
||||
LandingMarkerDetectorCUDAImpl::LandingMarkerDetectorCUDAImpl()
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
LandingMarkerDetectorCUDAImpl::~LandingMarkerDetectorCUDAImpl()
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
bool LandingMarkerDetectorCUDAImpl::cudaSetup()
|
||||
{
|
||||
#ifdef WITH_CUDA
|
||||
std::string trt_model_fn = get_home() + SV_MODEL_DIR + "LandingMarker.engine";
|
||||
if (!is_file_exist(trt_model_fn))
|
||||
{
|
||||
throw std::runtime_error("SpireCV (104) Error loading the LandingMarker TensorRT model (File Not Exist)");
|
||||
}
|
||||
char *trt_model_stream{nullptr};
|
||||
size_t trt_model_size{0};
|
||||
try
|
||||
{
|
||||
std::ifstream file(trt_model_fn, std::ios::binary);
|
||||
file.seekg(0, file.end);
|
||||
trt_model_size = file.tellg();
|
||||
file.seekg(0, file.beg);
|
||||
trt_model_stream = new char[trt_model_size];
|
||||
assert(trt_model_stream);
|
||||
file.read(trt_model_stream, trt_model_size);
|
||||
file.close();
|
||||
}
|
||||
catch (const std::runtime_error &e)
|
||||
{
|
||||
throw std::runtime_error("SpireCV (104) Error loading the TensorRT model!");
|
||||
}
|
||||
|
||||
// TensorRT
|
||||
IRuntime *runtime = nvinfer1::createInferRuntime(g_nvlogger);
|
||||
assert(runtime != nullptr);
|
||||
ICudaEngine *p_cu_engine = runtime->deserializeCudaEngine(trt_model_stream, trt_model_size);
|
||||
assert(p_cu_engine != nullptr);
|
||||
this->_trt_context = p_cu_engine->createExecutionContext();
|
||||
assert(this->_trt_context != nullptr);
|
||||
|
||||
delete[] trt_model_stream;
|
||||
const ICudaEngine &cu_engine = this->_trt_context->getEngine();
|
||||
assert(cu_engine.getNbBindings() == 2);
|
||||
|
||||
this->_input_index = cu_engine.getBindingIndex("input");
|
||||
this->_output_index = cu_engine.getBindingIndex("output");
|
||||
TRTCHECK(cudaMalloc(&_p_buffers[this->_input_index], 1 * 3 * 32 * 32 * sizeof(float)));
|
||||
TRTCHECK(cudaMalloc(&_p_buffers[this->_output_index], 1 * 11 * sizeof(float)));
|
||||
TRTCHECK(cudaStreamCreate(&_cu_stream));
|
||||
|
||||
auto input_dims = nvinfer1::Dims4{1, 3, 32, 32};
|
||||
this->_trt_context->setBindingDimensions(this->_input_index, input_dims);
|
||||
|
||||
this->_p_data = new float[1 * 3 * 32 * 32];
|
||||
this->_p_prob = new float[1 * 11];
|
||||
// Input
|
||||
TRTCHECK(cudaMemcpyAsync(_p_buffers[this->_input_index], this->_p_data, 1 * 3 * 32 * 32 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream));
|
||||
// this->_trt_context->enqueue(1, _p_buffers, this->_cu_stream, nullptr);
|
||||
this->_trt_context->enqueueV2(_p_buffers, this->_cu_stream, nullptr);
|
||||
// Output
|
||||
TRTCHECK(cudaMemcpyAsync(this->_p_prob, _p_buffers[this->_output_index], 1 * 11 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream));
|
||||
cudaStreamSynchronize(this->_cu_stream);
|
||||
return true;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
void LandingMarkerDetectorCUDAImpl::cudaRoiCNN(
|
||||
std::vector<cv::Mat>& input_rois_,
|
||||
std::vector<int>& output_labels_
|
||||
)
|
||||
{
|
||||
#ifdef WITH_CUDA
|
||||
output_labels_.clear();
|
||||
for (int i=0; i<input_rois_.size(); i++)
|
||||
{
|
||||
cv::Mat e_roi = input_rois_[i];
|
||||
for (int row = 0; row < 32; ++row)
|
||||
{
|
||||
uchar *uc_pixel = e_roi.data + row * e_roi.step; // compute row id
|
||||
for (int col = 0; col < 32; ++col)
|
||||
{
|
||||
// mean=[136.20, 141.50, 145.41], std=[44.77, 44.20, 44.30]
|
||||
this->_p_data[col + row * 32] = ((float)uc_pixel[0] - 136.20f) / 44.77f;
|
||||
this->_p_data[col + row * 32 + 32 * 32] = ((float)uc_pixel[1] - 141.50f) / 44.20f;
|
||||
this->_p_data[col + row * 32 + 32 * 32 * 2] = ((float)uc_pixel[2] - 145.41f) / 44.30f;
|
||||
uc_pixel += 3;
|
||||
}
|
||||
}
|
||||
|
||||
// Input
|
||||
TRTCHECK(cudaMemcpyAsync(_p_buffers[this->_input_index], this->_p_data, 1 * 3 * 32 * 32 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream));
|
||||
// this->_trt_context->enqueue(1, _p_buffers, this->_cu_stream, nullptr);
|
||||
this->_trt_context->enqueueV2(_p_buffers, this->_cu_stream, nullptr);
|
||||
// Output
|
||||
TRTCHECK(cudaMemcpyAsync(this->_p_prob, _p_buffers[this->_output_index], 1 * 11 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream));
|
||||
cudaStreamSynchronize(this->_cu_stream);
|
||||
|
||||
// Find max index
|
||||
double max = 0;
|
||||
int label = 0;
|
||||
for (int i = 0; i < 11; ++i)
|
||||
{
|
||||
if (max < this->_p_prob[i])
|
||||
{
|
||||
max = this->_p_prob[i];
|
||||
label = i;
|
||||
}
|
||||
}
|
||||
output_labels_.push_back(label);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
48
algorithm/landing_det/cuda/landing_det_cuda_impl.h
Normal file
48
algorithm/landing_det/cuda/landing_det_cuda_impl.h
Normal file
@@ -0,0 +1,48 @@
|
||||
#ifndef __SV_LANDING_DET_CUDA__
|
||||
#define __SV_LANDING_DET_CUDA__
|
||||
|
||||
#include "sv_core.h"
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include <opencv2/aruco.hpp>
|
||||
#include <opencv2/tracking.hpp>
|
||||
#include <string>
|
||||
#include <chrono>
|
||||
|
||||
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
#include <NvInfer.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
namespace sv {
|
||||
|
||||
|
||||
class LandingMarkerDetectorCUDAImpl
|
||||
{
|
||||
public:
|
||||
LandingMarkerDetectorCUDAImpl();
|
||||
~LandingMarkerDetectorCUDAImpl();
|
||||
|
||||
bool cudaSetup();
|
||||
void cudaRoiCNN(
|
||||
std::vector<cv::Mat>& input_rois_,
|
||||
std::vector<int>& output_labels_
|
||||
);
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
float *_p_data;
|
||||
float *_p_prob;
|
||||
nvinfer1::IExecutionContext *_trt_context;
|
||||
int _input_index;
|
||||
int _output_index;
|
||||
void *_p_buffers[2];
|
||||
cudaStream_t _cu_stream;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
#endif
|
||||
48
algorithm/landing_det/sv_landing_det.cpp
Normal file
48
algorithm/landing_det/sv_landing_det.cpp
Normal file
@@ -0,0 +1,48 @@
|
||||
#include "sv_landing_det.h"
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
#ifdef WITH_CUDA
|
||||
#include <NvInfer.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include "landing_det_cuda_impl.h"
|
||||
#endif
|
||||
|
||||
|
||||
namespace sv {
|
||||
|
||||
|
||||
LandingMarkerDetector::LandingMarkerDetector()
|
||||
{
|
||||
this->_cuda_impl = new LandingMarkerDetectorCUDAImpl;
|
||||
}
|
||||
LandingMarkerDetector::~LandingMarkerDetector()
|
||||
{
|
||||
}
|
||||
|
||||
bool LandingMarkerDetector::setupImpl()
|
||||
{
|
||||
#ifdef WITH_CUDA
|
||||
return this->_cuda_impl->cudaSetup();
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
void LandingMarkerDetector::roiCNN(
|
||||
std::vector<cv::Mat>& input_rois_,
|
||||
std::vector<int>& output_labels_
|
||||
)
|
||||
{
|
||||
#ifdef WITH_CUDA
|
||||
this->_cuda_impl->cudaRoiCNN(
|
||||
input_rois_,
|
||||
output_labels_
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
1142
algorithm/sv_algorithm_base.cpp
Normal file
1142
algorithm/sv_algorithm_base.cpp
Normal file
File diff suppressed because it is too large
Load Diff
135
algorithm/tracking/ocv470/tracking_ocv470_impl.cpp
Normal file
135
algorithm/tracking/ocv470/tracking_ocv470_impl.cpp
Normal file
@@ -0,0 +1,135 @@
|
||||
#include "tracking_ocv470_impl.h"
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
|
||||
#define SV_MODEL_DIR "/SpireCV/models/"
|
||||
#define SV_ROOT_DIR "/SpireCV/"
|
||||
|
||||
|
||||
namespace sv {
|
||||
|
||||
using namespace cv;
|
||||
|
||||
|
||||
SingleObjectTrackerOCV470Impl::SingleObjectTrackerOCV470Impl()
|
||||
{
|
||||
}
|
||||
|
||||
SingleObjectTrackerOCV470Impl::~SingleObjectTrackerOCV470Impl()
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
bool SingleObjectTrackerOCV470Impl::ocv470Setup(SingleObjectTrackerBase* base_)
|
||||
{
|
||||
this->_algorithm = base_->getAlgorithm();
|
||||
this->_backend = base_->getBackend();
|
||||
this->_target = base_->getTarget();
|
||||
|
||||
#ifdef WITH_OCV470
|
||||
std::string net = get_home() + SV_MODEL_DIR + "dasiamrpn_model.onnx";
|
||||
std::string kernel_cls1 = get_home() + SV_MODEL_DIR + "dasiamrpn_kernel_cls1.onnx";
|
||||
std::string kernel_r1 = get_home() + SV_MODEL_DIR + "dasiamrpn_kernel_r1.onnx";
|
||||
|
||||
std::string backbone = get_home() + SV_MODEL_DIR + "nanotrack_backbone_sim.onnx";
|
||||
std::string neckhead = get_home() + SV_MODEL_DIR + "nanotrack_head_sim.onnx";
|
||||
|
||||
try
|
||||
{
|
||||
TrackerNano::Params nano_params;
|
||||
nano_params.backbone = samples::findFile(backbone);
|
||||
nano_params.neckhead = samples::findFile(neckhead);
|
||||
nano_params.backend = this->_backend;
|
||||
nano_params.target = this->_target;
|
||||
|
||||
_nano = TrackerNano::create(nano_params);
|
||||
}
|
||||
catch (const cv::Exception& ee)
|
||||
{
|
||||
std::cerr << "Exception: " << ee.what() << std::endl;
|
||||
std::cout << "Can't load the network by using the following files:" << std::endl;
|
||||
std::cout << "nanoBackbone : " << backbone << std::endl;
|
||||
std::cout << "nanoNeckhead : " << neckhead << std::endl;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
TrackerDaSiamRPN::Params params;
|
||||
params.model = samples::findFile(net);
|
||||
params.kernel_cls1 = samples::findFile(kernel_cls1);
|
||||
params.kernel_r1 = samples::findFile(kernel_r1);
|
||||
params.backend = this->_backend;
|
||||
params.target = this->_target;
|
||||
_siam_rpn = TrackerDaSiamRPN::create(params);
|
||||
}
|
||||
catch (const cv::Exception& ee)
|
||||
{
|
||||
std::cerr << "Exception: " << ee.what() << std::endl;
|
||||
std::cout << "Can't load the network by using the following files:" << std::endl;
|
||||
std::cout << "siamRPN : " << net << std::endl;
|
||||
std::cout << "siamKernelCL1 : " << kernel_cls1 << std::endl;
|
||||
std::cout << "siamKernelR1 : " << kernel_r1 << std::endl;
|
||||
}
|
||||
return true;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
void SingleObjectTrackerOCV470Impl::ocv470Init(cv::Mat img_, const cv::Rect& bounding_box_)
|
||||
{
|
||||
#ifdef WITH_OCV470
|
||||
if (this->_algorithm == "kcf")
|
||||
{
|
||||
TrackerKCF::Params params;
|
||||
_kcf = TrackerKCF::create(params);
|
||||
_kcf->init(img_, bounding_box_);
|
||||
}
|
||||
else if (this->_algorithm == "csrt")
|
||||
{
|
||||
TrackerCSRT::Params params;
|
||||
_csrt = TrackerCSRT::create(params);
|
||||
_csrt->init(img_, bounding_box_);
|
||||
}
|
||||
else if (this->_algorithm == "siamrpn")
|
||||
{
|
||||
_siam_rpn->init(img_, bounding_box_);
|
||||
}
|
||||
else if (this->_algorithm == "nano")
|
||||
{
|
||||
_nano->init(img_, bounding_box_);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
bool SingleObjectTrackerOCV470Impl::ocv470Track(cv::Mat img_, cv::Rect& output_bbox_)
|
||||
{
|
||||
#ifdef WITH_OCV470
|
||||
bool ok = false;
|
||||
if (this->_algorithm == "kcf")
|
||||
{
|
||||
ok = _kcf->update(img_, output_bbox_);
|
||||
}
|
||||
else if (this->_algorithm == "csrt")
|
||||
{
|
||||
ok = _csrt->update(img_, output_bbox_);
|
||||
}
|
||||
else if (this->_algorithm == "siamrpn")
|
||||
{
|
||||
ok = _siam_rpn->update(img_, output_bbox_);
|
||||
}
|
||||
else if (this->_algorithm == "nano")
|
||||
{
|
||||
ok = _nano->update(img_, output_bbox_);
|
||||
}
|
||||
return ok;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
40
algorithm/tracking/ocv470/tracking_ocv470_impl.h
Normal file
40
algorithm/tracking/ocv470/tracking_ocv470_impl.h
Normal file
@@ -0,0 +1,40 @@
|
||||
#ifndef __SV_TRACKING_OCV470__
|
||||
#define __SV_TRACKING_OCV470__
|
||||
|
||||
#include "sv_core.h"
|
||||
#include <opencv2/opencv.hpp>
|
||||
#include <opencv2/aruco.hpp>
|
||||
#include <opencv2/tracking.hpp>
|
||||
#include <string>
|
||||
#include <chrono>
|
||||
|
||||
|
||||
|
||||
namespace sv {
|
||||
|
||||
|
||||
class SingleObjectTrackerOCV470Impl
|
||||
{
|
||||
public:
|
||||
SingleObjectTrackerOCV470Impl();
|
||||
~SingleObjectTrackerOCV470Impl();
|
||||
|
||||
bool ocv470Setup(SingleObjectTrackerBase* base_);
|
||||
void ocv470Init(cv::Mat img_, const cv::Rect& bounding_box_);
|
||||
bool ocv470Track(cv::Mat img_, cv::Rect& output_bbox_);
|
||||
|
||||
std::string _algorithm;
|
||||
int _backend;
|
||||
int _target;
|
||||
|
||||
#ifdef WITH_OCV470
|
||||
cv::Ptr<cv::TrackerDaSiamRPN> _siam_rpn;
|
||||
cv::Ptr<cv::TrackerKCF> _kcf;
|
||||
cv::Ptr<cv::TrackerCSRT> _csrt;
|
||||
cv::Ptr<cv::TrackerNano> _nano;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
#endif
|
||||
41
algorithm/tracking/sv_tracking.cpp
Normal file
41
algorithm/tracking/sv_tracking.cpp
Normal file
@@ -0,0 +1,41 @@
|
||||
#include "sv_tracking.h"
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
#include "tracking_ocv470_impl.h"
|
||||
|
||||
|
||||
namespace sv {
|
||||
|
||||
|
||||
SingleObjectTracker::SingleObjectTracker()
|
||||
{
|
||||
this->_ocv470_impl = new SingleObjectTrackerOCV470Impl;
|
||||
}
|
||||
SingleObjectTracker::~SingleObjectTracker()
|
||||
{
|
||||
}
|
||||
|
||||
bool SingleObjectTracker::setupImpl()
|
||||
{
|
||||
#ifdef WITH_OCV470
|
||||
return this->_ocv470_impl->ocv470Setup(this);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
void SingleObjectTracker::initImpl(cv::Mat img_, const cv::Rect& bounding_box_)
|
||||
{
|
||||
#ifdef WITH_OCV470
|
||||
this->_ocv470_impl->ocv470Init(img_, bounding_box_);
|
||||
#endif
|
||||
}
|
||||
bool SingleObjectTracker::trackImpl(cv::Mat img_, cv::Rect& output_bbox_)
|
||||
{
|
||||
#ifdef WITH_OCV470
|
||||
return this->_ocv470_impl->ocv470Track(img_, output_bbox_);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user