SpireCV/sot_trt_cuda_impl.cpp

691 lines
27 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "sot_trt_cuda_impl.h"
#include <cmath>
#include <fstream>
#include <opencv2/opencv.hpp>
#define SV_MODEL_DIR "/SpireCV/models/"
#define SV_ROOT_DIR "/SpireCV/"
#ifdef WITH_CUDA
#include "yolov7/logging.h"
#define TRTCHECK(status) \
do \
{ \
auto ret = (status); \
if (ret != 0) \
{ \
std::cerr << "Cuda failure: " << ret << std::endl; \
abort(); \
} \
} while (0)
#define DEVICE 0 // GPU id
#define BATCH_SIZE 1
#define MAX_IMAGE_INPUT_SIZE_THRESH 3000 * 3000 // ensure it exceed the maximum size in the input images !
#endif
namespace sv
{
using namespace std;
using namespace nvinfer1;
static Logger g_nvlogger;
inline float fast_exp(float x)
{
union
{
uint32_t i;
float f;
} v{};
v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
return v.f;
}
inline float sigmoid(float x)
{
return 1.0f / (1.0f + fast_exp(-x));
}
static void Softmax(const Mat &src, Mat &dst)
{
Mat maxVal;
cv::max(src.row(1), src.row(0), maxVal);
src.row(1) -= maxVal;
src.row(0) -= maxVal;
exp(src, dst);
Mat sumVal = dst.row(0) + dst.row(1);
dst.row(0) = dst.row(0) / sumVal;
dst.row(1) = dst.row(1) / sumVal;
}
static float sz_whFun(cv::Point2f wh)
{
float pad = (wh.x + wh.y) * 0.5f;
float sz2 = (wh.x + pad) * (wh.y + pad);
return std::sqrt(sz2);
}
static std::vector<float> sz_change_fun(std::vector<float> w, std::vector<float> h, float sz)
{
int rows = int(std::sqrt(w.size()));
int cols = int(std::sqrt(w.size()));
std::vector<float> pad(rows * cols, 0);
std::vector<float> sz2;
for (int i = 0; i < cols; i++)
{
for (int j = 0; j < rows; j++)
{
pad[i * cols + j] = (w[i * cols + j] + h[i * cols + j]) * 0.5f;
}
}
for (int i = 0; i < cols; i++)
{
for (int j = 0; j < rows; j++)
{
float t = std::sqrt((w[i * rows + j] + pad[i * rows + j]) * (h[i * rows + j] + pad[i * rows + j])) / sz;
sz2.push_back(std::max(t, (float)1.0 / t));
}
}
return sz2;
}
static std::vector<float> ratio_change_fun(std::vector<float> w, std::vector<float> h, cv::Point2f target_sz)
{
int rows = int(std::sqrt(w.size()));
int cols = int(std::sqrt(w.size()));
float ratio = target_sz.x / target_sz.y;
std::vector<float> sz2;
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
float t = ratio / (w[i * cols + j] / h[i * cols + j]);
sz2.push_back(std::max(t, (float)1.0 / t));
}
}
return sz2;
}
SotTrtDetectorCUDAImpl::SotTrtDetectorCUDAImpl()
{
}
SotTrtDetectorCUDAImpl::~SotTrtDetectorCUDAImpl()
{
}
bool SotTrtDetectorCUDAImpl::cudaSetup()
{
std::string trt_model_fn1 = get_home() + SV_MODEL_DIR + "nanotrack_backbone_sim.engine";
std::string trt_model_fn2 = get_home() + SV_MODEL_DIR + "nanotrack_backbone_temp.engine";
std::string trt_model_fn3 = get_home() + SV_MODEL_DIR + "nanotrack_head_sim.engine";
if (!is_file_exist(trt_model_fn1) && !is_file_exist(trt_model_fn2) && !is_file_exist(trt_model_fn3))
{
throw std::runtime_error("SpireCV (104) Error loading the Nanotrack TensorRT model (File Not Exist)");
}
char *trt_model_stream1{nullptr}, *trt_model_stream2{nullptr}, *trt_model_stream3{nullptr};
size_t trt_model_size1{0}, trt_model_size2{0}, trt_model_size3{0};
;
try
{
std::ifstream file1(trt_model_fn1, std::ios::binary);
file1.seekg(0, file1.end);
trt_model_size1 = file1.tellg();
file1.seekg(0, file1.beg);
trt_model_stream1 = new char[trt_model_size1];
assert(trt_model_stream1);
file1.read(trt_model_stream1, trt_model_size1);
file1.close();
}
catch (const std::runtime_error &e)
{
throw std::runtime_error("SpireCV (104) Error loading the TensorRT model!");
}
try
{
std::ifstream file2(trt_model_fn2, std::ios::binary);
file2.seekg(0, file2.end);
trt_model_size2 = file2.tellg();
file2.seekg(0, file2.beg);
trt_model_stream2 = new char[trt_model_size2];
assert(trt_model_stream2);
file2.read(trt_model_stream2, trt_model_size2);
file2.close();
}
catch (const std::runtime_error &e)
{
throw std::runtime_error("SpireCV (104) Error loading the TensorRT model!");
}
try
{
std::ifstream file3(trt_model_fn3, std::ios::binary);
file3.seekg(0, file3.end);
trt_model_size3 = file3.tellg();
file3.seekg(0, file3.beg);
trt_model_stream3 = new char[trt_model_size3];
assert(trt_model_stream3);
file3.read(trt_model_stream3, trt_model_size3);
file3.close();
}
catch (const std::runtime_error &e)
{
throw std::runtime_error("SpireCV (104) Error loading the TensorRT model!");
}
// TensorRT1
IRuntime *runtime_1 = nvinfer1::createInferRuntime(g_nvlogger);
assert(runtime_1 != nullptr);
ICudaEngine *p_cu_engine1 = runtime_1->deserializeCudaEngine(trt_model_stream1, trt_model_size1);
assert(p_cu_engine1 != nullptr);
this->_trt_context[0] = p_cu_engine1->createExecutionContext();
assert(this->_trt_context[0] != nullptr);
delete[] trt_model_stream1;
const ICudaEngine &cu_engine1 = this->_trt_context[0]->getEngine();
assert(cu_engine1.getNbBindings() == 2);
this->_input_index_1 = cu_engine1.getBindingIndex("input");
this->_output_index_1 = cu_engine1.getBindingIndex("output");
TRTCHECK(cudaMalloc(&_p_buffers_1[this->_input_index_1], 1 * 3 * 255 * 255 * sizeof(float)));
TRTCHECK(cudaMalloc(&_p_buffers_1[this->_output_index_1], 1 * 48 * 16 * 16 * sizeof(float)));
TRTCHECK(cudaStreamCreate(&_cu_stream1));
auto input_dims_1 = nvinfer1::Dims4{1, 3, 255, 255};
this->_trt_context[0]->setBindingDimensions(this->_input_index_1, input_dims_1);
this->_p_data_1 = new float[1 * 3 * 255 * 255];
this->_p_prob_1 = new float[1 * 48 * 16 * 16];
// Input
TRTCHECK(cudaMemcpyAsync(_p_buffers_1[this->_input_index_1], this->_p_data_1, 1 * 3 * 255 * 255 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream1));
// this->_trt_context->enqueue(1, _p_buffers, this->_cu_stream, nullptr);
this->_trt_context[0]->enqueueV2(_p_buffers_1, this->_cu_stream1, nullptr);
// Output
TRTCHECK(cudaMemcpyAsync(this->_p_prob_1, _p_buffers_1[this->_output_index_1], 1 * 48 * 16 * 16 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream1));
cudaStreamSynchronize(this->_cu_stream1);
// TensorRT2
IRuntime *runtime_2 = nvinfer1::createInferRuntime(g_nvlogger);
assert(runtime_2 != nullptr);
ICudaEngine *p_cu_engine2 = runtime_2->deserializeCudaEngine(trt_model_stream2, trt_model_size2);
assert(p_cu_engine2 != nullptr);
this->_trt_context[1] = p_cu_engine2->createExecutionContext();
assert(this->_trt_context[1] != nullptr);
delete[] trt_model_stream2;
const ICudaEngine &cu_engine2 = this->_trt_context[1]->getEngine();
assert(cu_engine2.getNbBindings() == 2);
this->_input_index_2 = cu_engine2.getBindingIndex("input");
this->_output_index_2 = cu_engine2.getBindingIndex("output");
TRTCHECK(cudaMalloc(&_p_buffers_2[this->_input_index_2], 1 * 3 * 127 * 127 * sizeof(float)));
TRTCHECK(cudaMalloc(&_p_buffers_2[this->_output_index_2], 1 * 48 * 8 * 8 * sizeof(float)));
TRTCHECK(cudaStreamCreate(&_cu_stream2));
auto input_dims_2 = nvinfer1::Dims4{1, 3, 127, 127};
this->_trt_context[1]->setBindingDimensions(this->_input_index_2, input_dims_2);
this->_p_data_2 = new float[1 * 3 * 127 * 127];
this->_p_prob_2 = new float[1 * 48 * 8 * 8];
// Input
TRTCHECK(cudaMemcpyAsync(_p_buffers_2[this->_input_index_2], this->_p_data_2, 1 * 3 * 127 * 127 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream2));
// this->_trt_context->enqueue(1, _p_buffers, this->_cu_stream, nullptr);
this->_trt_context[1]->enqueueV2(_p_buffers_2, this->_cu_stream2, nullptr);
// Output
TRTCHECK(cudaMemcpyAsync(this->_p_prob_2, _p_buffers_2[this->_output_index_2], 1 * 48 * 8 * 8 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream2));
cudaStreamSynchronize(this->_cu_stream2);
// TensorRT3
IRuntime *runtime_3 = nvinfer1::createInferRuntime(g_nvlogger);
assert(runtime_3 != nullptr);
ICudaEngine *p_cu_engine3 = runtime_3->deserializeCudaEngine(trt_model_stream3, trt_model_size3);
assert(p_cu_engine3 != nullptr);
this->_trt_context[2] = p_cu_engine3->createExecutionContext();
assert(this->_trt_context[2] != nullptr);
delete[] trt_model_stream3;
const ICudaEngine &cu_engine3 = this->_trt_context[2]->getEngine();
assert(cu_engine3.getNbBindings() == 4);
this->_input_index_3_1 = cu_engine3.getBindingIndex("input1");
this->_input_index_3_2 = cu_engine3.getBindingIndex("input2");
this->_output_index_3_1 = cu_engine3.getBindingIndex("output1");
this->_output_index_3_2 = cu_engine3.getBindingIndex("output2");
TRTCHECK(cudaMalloc(&_p_buffers_3[this->_input_index_3_1], 1 * 48 * 8 * 8 * sizeof(float)));
TRTCHECK(cudaMalloc(&_p_buffers_3[this->_input_index_3_2], 1 * 48 * 16 * 16 * sizeof(float)));
TRTCHECK(cudaMalloc(&_p_buffers_3[this->_output_index_3_1], 1 * 2 * 16 * 16 * sizeof(float)));
TRTCHECK(cudaMalloc(&_p_buffers_3[this->_output_index_3_2], 1 * 4 * 16 * 16 * sizeof(float)));
TRTCHECK(cudaStreamCreate(&_cu_stream3));
auto input_dims_3_1 = nvinfer1::Dims4{1, 48, 8, 8};
auto input_dims_3_2 = nvinfer1::Dims4{1, 48, 16, 16};
this->_trt_context[2]->setBindingDimensions(this->_input_index_3_1, input_dims_3_1);
this->_trt_context[2]->setBindingDimensions(this->_input_index_3_2, input_dims_3_2);
this->_p_data_3_1 = new float[1 * 48 * 8 * 8];
this->_p_data_3_2 = new float[1 * 48 * 16 * 16];
this->_p_prob_3_1 = new float[1 * 2 * 16 * 16];
this->_p_prob_3_2 = new float[1 * 4 * 16 * 16];
// Input
TRTCHECK(cudaMemcpyAsync(_p_buffers_3[this->_input_index_3_1], this->_p_data_3_1, 1 * 48 * 8 * 8 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream3));
TRTCHECK(cudaMemcpyAsync(_p_buffers_3[this->_input_index_3_2], this->_p_data_3_2, 1 * 48 * 16 * 16 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream3));
// this->_trt_context->enqueue(1, _p_buffers, this->_cu_stream, nullptr);
this->_trt_context[2]->enqueueV2(_p_buffers_3, this->_cu_stream3, nullptr);
// Output
TRTCHECK(cudaMemcpyAsync(this->_p_prob_3_1, _p_buffers_3[this->_output_index_3_1], 1 * 2 * 16 * 16 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream3));
TRTCHECK(cudaMemcpyAsync(this->_p_prob_3_2, _p_buffers_3[this->_output_index_3_2], 1 * 4 * 16 * 16 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream3));
cudaStreamSynchronize(this->_cu_stream3);
return false;
}
void SotTrtDetectorCUDAImpl::cudaInitImpl(cv::Mat img, const cv::Rect &bbox)
{
create_window();
create_grids();
cv::Point target_pos; // cx, cy
cv::Point2f target_sz = {0.f, 0.f}; // w,h
target_pos.x = bbox.x + bbox.width / 2;
target_pos.y = bbox.y + bbox.height / 2;
target_sz.x = bbox.width;
target_sz.y = bbox.height;
cout << "bbox" << bbox << endl;
cout << "target_pos" << target_pos << endl;
cout << "target_sz" << target_sz << endl;
float wc_z = target_sz.x + cfg.context_amount * (target_sz.x + target_sz.y);
float hc_z = target_sz.y + cfg.context_amount * (target_sz.x + target_sz.y);
float s_z = round(sqrt(wc_z * hc_z));
cv::Scalar avg_chans = cv::mean(img);
cv::Mat z_crop;
z_crop = get_subwindow_tracking(img, target_pos, cfg.exemplar_size, int(s_z), avg_chans); // cv::Mat BGR order
// cout<<"z_crop"<<z_crop<<endl;
for (int row = 0; row < 127; ++row)
{
uchar *uc_pixel = z_crop.data + row * z_crop.step; // compute row id
for (int col = 0; col < 127; ++col)
{
// mean=[136.20, 141.50, 145.41], std=[44.77, 44.20, 44.30]
this->_p_data_2[col + row * 127] = ((float)uc_pixel[0] - 136.20f) / 44.77f;
this->_p_data_2[col + row * 127 + 127 * 127] = ((float)uc_pixel[1] - 141.50f) / 44.20f;
this->_p_data_2[col + row * 127 + 127 * 127 * 2] = ((float)uc_pixel[2] - 145.41f) / 44.30f;
uc_pixel += 3;
}
}
// 数据输入以及模板初始化
// TensorRT2
// Input
TRTCHECK(cudaMemcpyAsync(_p_buffers_2[this->_input_index_2], this->_p_data_2, 1 * 3 * 127 * 127 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream2));
this->_trt_context[1]->enqueueV2(_p_buffers_2, this->_cu_stream2, nullptr);
// Output
TRTCHECK(cudaMemcpyAsync(this->_p_prob_2, _p_buffers_2[this->_output_index_2], 1 * 48 * 8 * 8 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream2));
cudaStreamSynchronize(this->_cu_stream2);
this->state.channel_ave = avg_chans;
this->state.im_h = img.rows;
this->state.im_w = img.cols;
this->state.target_pos = target_pos;
this->state.target_sz = target_sz;
}
void SotTrtDetectorCUDAImpl::update(const cv::Mat &x_crops, cv::Point &target_pos, cv::Point2f &target_sz, float scale_z, float &cls_score_max)
{
// 图像255的输入
// 转tensor
// torch::Tensor tensor_image_X = torch::from_blob(x_crops.data, {1, x_crops.rows, x_crops.cols, 3}, torch::kByte);
// tensor_image_X = tensor_image_X.permute({0, 3, 1, 2});
// tensor_image_X = tensor_image_X.toType(torch::kFloat);
cv::imshow("img", x_crops);
cv::waitKey(1000);
for (int row = 0; row < 255; ++row)
{
uchar *uc_pixel = x_crops.data + row * x_crops.step; // compute row id
for (int col = 0; col < 255; ++col)
{
// mean=[136.20, 141.50, 145.41], std=[44.77, 44.20, 44.30]
this->_p_data_1[col + row * 255] = ((float)uc_pixel[0] - 136.20f) / 44.77f;
this->_p_data_1[col + row * 255 + 255 * 255] = ((float)uc_pixel[1] - 141.50f) / 44.20f;
this->_p_data_1[col + row * 255 + 255 * 255 * 2] = ((float)uc_pixel[2] - 145.41f) / 44.30f;
uc_pixel += 3;
}
}
// TensorRT1
// Input
TRTCHECK(cudaMemcpyAsync(_p_buffers_1[this->_input_index_1], this->_p_data_1, 1 * 3 * 255 * 255 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream1));
this->_trt_context[0]->enqueueV2(_p_buffers_1, this->_cu_stream1, nullptr);
// Output
TRTCHECK(cudaMemcpyAsync(this->_p_prob_1, _p_buffers_1[this->_output_index_1], 1 * 48 * 16 * 16 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream1));
cudaStreamSynchronize(this->_cu_stream1);
// TensorRT3
// Input
TRTCHECK(cudaMemcpyAsync(_p_buffers_3[this->_input_index_3_1], this->_p_prob_2, 1 * 48 * 8 * 8 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream3));
TRTCHECK(cudaMemcpyAsync(_p_buffers_3[this->_input_index_3_2], this->_p_prob_1, 1 * 48 * 16 * 16 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream3));
this->_trt_context[2]->enqueueV2(_p_buffers_3, this->_cu_stream3, nullptr);
// Output
TRTCHECK(cudaMemcpyAsync(this->_p_prob_3_1, _p_buffers_3[this->_output_index_3_1], 1 * 2 * 16 * 16 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream3));
TRTCHECK(cudaMemcpyAsync(this->_p_prob_3_2, _p_buffers_3[this->_output_index_3_2], 1 * 4 * 16 * 16 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream3));
cudaStreamSynchronize(this->_cu_stream3);
cv::Mat pred_bbox, pred_score;
pred_bbox = cv::Mat(20, 16, CV_32FC(4));
pred_score = cv::Mat(16, 16, CV_32FC(2));
memcpy(pred_bbox.data, this->_p_prob_3_2, sizeof(float) * 4 * 16 * 16);
memcpy(pred_score.data, this->_p_prob_3_1, sizeof(float) * 2 * 16 * 16);
// delete this->_p_prob_3_2,this->_p_prob_3_1;
cv::Mat outputMat = pred_score.reshape(1, 1);
// pred_score = pred_score.row(1);
// pred_score = pred_score.reshape(0, {2, 16, 16});
// cv::Mat scoreSoftmax; // 2x16x16
// Softmax(pred_score, scoreSoftmax);
// std::cout<<scoreSoftmax<<std::endl;
// cv::Mat score = scoreSoftmax.row(1);
// score = score.reshape(0, {scoreSize, scoreSize});
// std::cout <<"pred_score Mat channels: "<<pred_bbox.channels()<< " pred_bbox Mat Size: " << pred_bbox.size() << std::endl;
// std::cout <<"pred_score Mat channels: "<<pred_score.channels()<< " pred_score Mat Size: " << pred_score.size() << std::endl;
// squeeze操作
// torch::Tensor cls_score_result = torch::squeeze(cls_score, 0);
// torch::Tensor bbox_pred_result = torch::squeeze(bbox_pred, 0);
// std::cout << "输出的shape:"<< cls_score_result.sizes() << std::endl;
// std::cout << "输出的shape:"<< bbox_pred_result.sizes() << std::endl;
// std::cout<<cls_score_result<<endl;
std::vector<float> cls_score_sigmoid;
// float* cls_score_data = (float*)cls_score.data;
// float* cls_score_data = cls_score.channel(1);
// cv::Mat cls_score_mat(cv::Size{16, 16}, CV_32F, cls_score_result.index({1,"..."}).data_ptr());
// float* cls_score_data = (float*) cls_score_mat.data;
/* @20220805问题cls_score_result数据范围是[-1,1],而mat的CV_32F为[0,1] 会有精度截断, 改了还是不对*/
float *cls_score_data = (float *)outputMat.ptr<float>(0);
// float *cls_score_data = new float[16 * 16 * 2];
// 复制矩阵数据到float*数组
// std::memcpy(cls_score_data, pred_score.data, 16 * 16 * 2 * sizeof(float));
/* debug */
// for(int i=0;i<256; i++){
// cout<<this->_p_prob_3_1[i]<<endl;
// }
// std::cout << "cls_score_data:"<< cls_score_mat << std::endl;
// torch::Tensor tensor_tmp = cls_score_result.index({1,"..."});
// std::cout << "tensor_tmp:"<< cls_score_result.index({1,"..."}) << std::endl;
cls_score_sigmoid.clear();
int cols = pred_score.cols;
int rows = pred_score.rows;
for (int i = 0; i < cols * rows; i++) //
{
cls_score_sigmoid.push_back(sigmoid(cls_score_data[i]));
// cout <<"cls_score_data "<< i << " : " <<sigmoid(cls_score_data[i])<< endl;
}
std::vector<float> pred_x1(cols * rows, 0), pred_y1(cols * rows, 0), pred_x2(cols * rows, 0), pred_y2(cols * rows, 0);
float *bbox_pred_data1 = (float *)pred_bbox.ptr<float>(0);
float *bbox_pred_data2 = (float *)pred_bbox.ptr<float>(1);
float *bbox_pred_data3 = (float *)pred_bbox.ptr<float>(2);
float *bbox_pred_data4 = (float *)pred_bbox.ptr<float>(3);
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
pred_x1[i * cols + j] = this->grid_to_search_x[i * cols + j] - bbox_pred_data1[i * cols + j];
pred_y1[i * cols + j] = this->grid_to_search_y[i * cols + j] - bbox_pred_data2[i * cols + j];
pred_x2[i * cols + j] = this->grid_to_search_x[i * cols + j] + bbox_pred_data3[i * cols + j];
pred_y2[i * cols + j] = this->grid_to_search_y[i * cols + j] + bbox_pred_data4[i * cols + j];
}
}
// size penalty
std::vector<float> w(cols * rows, 0), h(cols * rows, 0);
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
w[i * cols + j] = pred_x2[i * cols + j] - pred_x1[i * cols + j];
h[i * rows + j] = pred_y2[i * rows + j] - pred_y1[i * cols + j];
}
}
float sz_wh = sz_whFun(target_sz);
std::vector<float> s_c = sz_change_fun(w, h, sz_wh);
std::vector<float> r_c = ratio_change_fun(w, h, target_sz);
std::vector<float> penalty(rows * cols, 0);
for (int i = 0; i < rows * cols; i++)
{
penalty[i] = std::exp(-1 * (s_c[i] * r_c[i] - 1) * cfg.penalty_k);
}
// window penalty
std::vector<float> pscore(rows * cols, 0);
int r_max = 0, c_max = 0;
float maxScore = 0;
for (int i = 0; i < rows * cols; i++)
{
pscore[i] = (penalty[i] * cls_score_sigmoid[i]) * (1 - cfg.window_influence) + this->window[i] * cfg.window_influence;
if (pscore[i] > maxScore)
{
// get max
maxScore = pscore[i];
r_max = std::floor(i / rows);
c_max = ((float)i / rows - r_max) * rows;
}
}
// to real size
float pred_x1_real = pred_x1[r_max * cols + c_max]; // pred_x1[r_max, c_max]
float pred_y1_real = pred_y1[r_max * cols + c_max];
float pred_x2_real = pred_x2[r_max * cols + c_max];
float pred_y2_real = pred_y2[r_max * cols + c_max];
float pred_xs = (pred_x1_real + pred_x2_real) / 2;
float pred_ys = (pred_y1_real + pred_y2_real) / 2;
float pred_w = pred_x2_real - pred_x1_real;
float pred_h = pred_y2_real - pred_y1_real;
float diff_xs = pred_xs - cfg.instance_size / 2;
float diff_ys = pred_ys - cfg.instance_size / 2;
diff_xs /= scale_z;
diff_ys /= scale_z;
pred_w /= scale_z;
pred_h /= scale_z;
target_sz.x = target_sz.x / scale_z;
target_sz.y = target_sz.y / scale_z;
// size learning rate
float lr = penalty[r_max * cols + c_max] * cls_score_sigmoid[r_max * cols + c_max] * cfg.lr;
// size rate
auto res_xs = float(target_pos.x + diff_xs);
auto res_ys = float(target_pos.y + diff_ys);
float res_w = pred_w * lr + (1 - lr) * target_sz.x;
float res_h = pred_h * lr + (1 - lr) * target_sz.y;
target_pos.x = int(res_xs);
target_pos.y = int(res_ys);
target_sz.x = target_sz.x * (1 - lr) + lr * res_w;
target_sz.y = target_sz.y * (1 - lr) + lr * res_h;
cls_score_max = cls_score_sigmoid[r_max * cols + c_max];
// std::cout<<"cls_score_max: "<<cls_score_max<<std::endl;
}
void SotTrtDetectorCUDAImpl::cudaTrackImpl(cv::Mat im, cv::Rect &output_bbox_)
{
cv::Point target_pos = this->state.target_pos;
cv::Point2f target_sz = this->state.target_sz;
float hc_z = target_sz.y + cfg.context_amount * (target_sz.x + target_sz.y);
float wc_z = target_sz.x + cfg.context_amount * (target_sz.x + target_sz.y);
float s_z = sqrt(wc_z * hc_z);
float scale_z = cfg.exemplar_size / s_z;
float d_search = (cfg.instance_size - cfg.exemplar_size) / 2;
float pad = d_search / scale_z;
float s_x = s_z + 2 * pad;
/* add @20220808 to with python */
// float s_x = s_z * (cfg.instance_size / cfg.exemplar_size);
cv::Mat x_crop;
x_crop = get_subwindow_tracking(im, target_pos, cfg.instance_size, int(s_x), state.channel_ave);
// update
target_sz.x = target_sz.x * scale_z;
target_sz.y = target_sz.y * scale_z;
float cls_score_max;
this->update(x_crop, target_pos, target_sz, scale_z, cls_score_max);
target_pos.x = std::max(0, min(state.im_w, target_pos.x));
target_pos.y = std::max(0, min(state.im_h, target_pos.y));
target_sz.x = float(std::max(10, min(state.im_w, int(target_sz.x))));
target_sz.y = float(std::max(10, min(state.im_h, int(target_sz.y))));
state.target_pos = target_pos;
state.target_sz = target_sz;
// target_pos.x = bbox.x + bbox.width / 2;
// target_pos.y = bbox.y + bbox.height / 2;
// target_sz.x = bbox.width;
// target_sz.y = bbox.height;
// output_bbox_.width = state.target_sz.x;
// output_bbox_.height = state.target_sz.y;
// output_bbox_.x = state.target_pos.x - state.target_sz.x / 2;
// output_bbox_.y = state.target_pos.y - state.target_sz.y / 2;
output_bbox_ = {int(target_pos.x - target_sz.x / 2.f), int(target_pos.y - target_sz.y / 2.f), int(target_sz.x), int(target_sz.y)};
}
// 生成每一个格点的坐标
void SotTrtDetectorCUDAImpl::create_window()
{
int score_size = cfg.score_size;
std::vector<float> hanning(score_size, 0);
this->window.resize(score_size * score_size, 0);
for (int i = 0; i < score_size; i++)
{
float w = 0.5f - 0.5f * std::cos(2 * 3.1415926535898f * i / (score_size - 1));
hanning[i] = w;
}
for (int i = 0; i < score_size; i++)
{
for (int j = 0; j < score_size; j++)
{
this->window[i * score_size + j] = hanning[i] * hanning[j];
}
}
}
// 生成每一个格点的坐标
void SotTrtDetectorCUDAImpl::create_grids()
{
/*
each element of feature map on input search image
:return: H*W*2 (position for each element)
*/
int sz = cfg.score_size; // 16x16
this->grid_to_search_x.resize(sz * sz, 0);
this->grid_to_search_y.resize(sz * sz, 0);
for (int i = 0; i < sz; i++)
{
for (int j = 0; j < sz; j++)
{
this->grid_to_search_x[i * sz + j] = j * cfg.total_stride;
this->grid_to_search_y[i * sz + j] = i * cfg.total_stride;
}
}
for (int i = 0; i < sz; i++)
{
for (int j = 0; j < sz; j++)
{
this->grid_to_search_x[i * sz + j] += cfg.instance_size/2;
this->grid_to_search_y[i * sz + j] += cfg.instance_size/2;
}
}
}
cv::Mat SotTrtDetectorCUDAImpl::get_subwindow_tracking(cv::Mat im, cv::Point2f pos, int model_sz, int original_sz, cv::Scalar channel_ave)
{
float c = (float)(original_sz + 1) / 2;
int context_xmin = std::round(pos.x - c);
int context_xmax = context_xmin + original_sz - 1;
int context_ymin = std::round(pos.y - c);
int context_ymax = context_ymin + original_sz - 1;
int left_pad = int(std::max(0, -context_xmin));
int top_pad = int(std::max(0, -context_ymin));
int right_pad = int(std::max(0, context_xmax - im.cols + 1));
int bottom_pad = int(std::max(0, context_ymax - im.rows + 1));
context_xmin += left_pad;
context_xmax += left_pad;
context_ymin += top_pad;
context_ymax += top_pad;
cv::Mat im_path_original;
if (top_pad > 0 || left_pad > 0 || right_pad > 0 || bottom_pad > 0)
{
cv::Mat te_im = cv::Mat::zeros(im.rows + top_pad + bottom_pad, im.cols + left_pad + right_pad, CV_8UC3);
cv::copyMakeBorder(im, te_im, top_pad, bottom_pad, left_pad, right_pad, cv::BORDER_CONSTANT, channel_ave);
im_path_original = te_im(cv::Rect(context_xmin, context_ymin, context_xmax - context_xmin + 1, context_ymax - context_ymin + 1));
}
else
im_path_original = im(cv::Rect(context_xmin, context_ymin, context_xmax - context_xmin + 1, context_ymax - context_ymin + 1));
cv::Mat im_path;
cv::resize(im_path_original, im_path, cv::Size(model_sz, model_sz));
return im_path;
}
}