// SpireCV/algorithm/veri/cuda/veri_det_cuda_impl.cpp
#include "veri_det_cuda_impl.h"

#include <algorithm>  // std::sort
#include <cmath>
#include <fstream>
#include <stdexcept>  // std::runtime_error

#include "sv_util.h"

#define SV_MODEL_DIR "/SpireCV/models/"
#define SV_ROOT_DIR "/SpireCV/"
#ifdef WITH_CUDA
#include "yolov7/logging.h"
#define TRTCHECK(status) \
do \
{ \
auto ret = (status); \
if (ret != 0) \
{ \
std::cerr << "Cuda failure: " << ret << std::endl; \
abort(); \
} \
} while (0)
#define DEVICE 0 // GPU id
#define BATCH_SIZE 1
#define MAX_IMAGE_INPUT_SIZE_THRESH 3000 * 3000 // ensure it exceed the maximum size in the input images !
#endif
#include <iostream>
#include <cmath>
// NOTE(review): not referenced anywhere in this file — presumably read by
// another translation unit via extern. TODO confirm before removing.
int BAT = 1;
// Cosine similarity between two float vectors of length `size`.
// Returns a value in [-1, 1]. Returns 0.0f when either vector has zero
// magnitude — the similarity is undefined there, and the previous code
// divided by zero and produced NaN.
float cosineSimilarity(float *vec1, float *vec2, int size)
{
    // Dot product of the two vectors.
    float dotProduct = 0.0f;
    for (int i = 0; i < size; ++i)
    {
        dotProduct += vec1[i] * vec2[i];
    }
    // Magnitudes (L2 norms) of each vector.
    float magnitudeVec1 = 0.0f;
    float magnitudeVec2 = 0.0f;
    for (int i = 0; i < size; ++i)
    {
        magnitudeVec1 += vec1[i] * vec1[i];
        magnitudeVec2 += vec2[i] * vec2[i];
    }
    magnitudeVec1 = std::sqrt(magnitudeVec1);
    magnitudeVec2 = std::sqrt(magnitudeVec2);
    // Guard against division by zero for degenerate (all-zero) inputs.
    float denom = magnitudeVec1 * magnitudeVec2;
    if (denom == 0.0f)
    {
        return 0.0f;
    }
    return dotProduct / denom;
}
namespace sv
{
using namespace cv;
#ifdef WITH_CUDA
using namespace nvinfer1;
static Logger g_nvlogger;
#endif
// Constructor. Null-initializes the pointers that cudaSetup() later fills in,
// so that cleanup code (or an accidental use before cudaSetup()) can safely
// test them instead of reading indeterminate values.
VeriDetectorCUDAImpl::VeriDetectorCUDAImpl()
{
#ifdef WITH_CUDA
    this->_trt_context = nullptr;
    this->_cu_stream = nullptr;
    this->_p_data = nullptr;
    this->_p_prob1 = nullptr;
    this->_p_prob2 = nullptr;
    // Three bindings: input, classification output, embedding output
    // (cudaSetup() asserts getNbBindings() == 3).
    this->_p_buffers[0] = nullptr;
    this->_p_buffers[1] = nullptr;
    this->_p_buffers[2] = nullptr;
#endif
}
// Destructor.
// NOTE(review): cudaSetup() allocates host buffers (_p_data, _p_prob1,
// _p_prob2 via new[]), device buffers (cudaMalloc into _p_buffers), a CUDA
// stream and a TensorRT execution context — none are released here, so they
// leak when the detector is destroyed. Freeing them requires the members to
// be null-initialized first (they are set only inside cudaSetup()); left
// unchanged here to avoid undefined behavior on the never-setup path.
VeriDetectorCUDAImpl::~VeriDetectorCUDAImpl()
{
}
// Loads the VERI TensorRT engine, allocates host/device buffers for a fixed
// batch of 2 (template image + candidate image), and runs one warm-up
// inference. Returns true on success; returns false when built without CUDA.
// Throws std::runtime_error on any load/deserialization failure.
bool VeriDetectorCUDAImpl::cudaSetup()
{
#ifdef WITH_CUDA
    // Prefer the newest "-online.engine" conversion if present, otherwise
    // fall back to the default packaged model.
    std::string trt_model_fn = get_home() + SV_MODEL_DIR + "veri.engine";
    std::vector<std::string> files;
    _list_dir(get_home() + SV_MODEL_DIR, files, "-online.engine", "Nv-VERI-mobilenet_v3");
    if (files.size() > 0)
    {
        std::sort(files.rbegin(), files.rend(), _comp_str_lesser); // newest first
        trt_model_fn = get_home() + SV_MODEL_DIR + files[0];
    }
    std::cout << "Load: " << trt_model_fn << std::endl;
    if (!is_file_exist(trt_model_fn))
    {
        throw std::runtime_error("SpireCV (104) Error loading the VeriDetector TensorRT model (File Not Exist)");
    }

    // Read the serialized engine into a host buffer. ifstream does not throw
    // by default, so the stream state is checked explicitly — the previous
    // try/catch around these calls could never fire.
    char *trt_model_stream{nullptr};
    size_t trt_model_size{0};
    std::ifstream file(trt_model_fn, std::ios::binary);
    if (!file.good())
    {
        throw std::runtime_error("SpireCV (104) Error loading the TensorRT model!");
    }
    file.seekg(0, file.end);
    std::streamoff fsize = file.tellg();
    if (fsize <= 0)
    {
        throw std::runtime_error("SpireCV (104) Error loading the TensorRT model!");
    }
    trt_model_size = static_cast<size_t>(fsize);
    file.seekg(0, file.beg);
    trt_model_stream = new char[trt_model_size];
    file.read(trt_model_stream, trt_model_size);
    if (!file)
    {
        delete[] trt_model_stream;
        throw std::runtime_error("SpireCV (104) Error loading the TensorRT model!");
    }
    file.close();

    // Deserialize with TensorRT. Null results are checked explicitly so
    // failures are still caught in NDEBUG builds (assert compiles away).
    IRuntime *runtime = nvinfer1::createInferRuntime(g_nvlogger);
    if (runtime == nullptr)
    {
        delete[] trt_model_stream;
        throw std::runtime_error("SpireCV (104) Error creating the TensorRT runtime!");
    }
    ICudaEngine *p_cu_engine = runtime->deserializeCudaEngine(trt_model_stream, trt_model_size);
    delete[] trt_model_stream; // engine keeps its own copy of the plan data
    if (p_cu_engine == nullptr)
    {
        throw std::runtime_error("SpireCV (104) Error deserializing the TensorRT engine!");
    }
    this->_trt_context = p_cu_engine->createExecutionContext();
    if (this->_trt_context == nullptr)
    {
        throw std::runtime_error("SpireCV (104) Error creating the TensorRT execution context!");
    }

    // Bindings: "input" (2x3x224x224), "output" (2x576 classification head)
    // and the 2x1280 embedding tensor used for cosine-similarity matching.
    const ICudaEngine &cu_engine = this->_trt_context->getEngine();
    assert(cu_engine.getNbBindings() == 3);
    this->_input_index = cu_engine.getBindingIndex("input");
    this->_output_index1 = cu_engine.getBindingIndex("output");
    this->_output_index2 = cu_engine.getBindingIndex("/head/layers.0/act/Mul_output_0");
    TRTCHECK(cudaMalloc(&_p_buffers[this->_input_index], 2 * 3 * 224 * 224 * sizeof(float)));
    TRTCHECK(cudaMalloc(&_p_buffers[this->_output_index1], 2 * 576 * sizeof(float)));
    TRTCHECK(cudaMalloc(&_p_buffers[this->_output_index2], 2 * 1280 * sizeof(float)));
    TRTCHECK(cudaStreamCreate(&_cu_stream));
    auto input_dims = nvinfer1::Dims4{2, 3, 224, 224};
    this->_trt_context->setBindingDimensions(this->_input_index, input_dims);
    // Value-initialize (the trailing "()") so the warm-up pass below does not
    // feed uninitialized host memory to the network.
    this->_p_data = new float[2 * 3 * 224 * 224]();
    this->_p_prob1 = new float[2 * 576]();
    this->_p_prob2 = new float[2 * 1280]();

    // Warm-up inference: H2D copy, enqueue, D2H copies, sync.
    TRTCHECK(cudaMemcpyAsync(_p_buffers[this->_input_index], this->_p_data, 2 * 3 * 224 * 224 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream));
    this->_trt_context->enqueueV2(_p_buffers, this->_cu_stream, nullptr);
    TRTCHECK(cudaMemcpyAsync(this->_p_prob1, _p_buffers[this->_output_index1], 2 * 576 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream));
    TRTCHECK(cudaMemcpyAsync(this->_p_prob2, _p_buffers[this->_output_index2], 2 * 1280 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream));
    cudaStreamSynchronize(this->_cu_stream);
    return true;
#endif
    return false;
}
void VeriDetectorCUDAImpl::cudaRoiCNN(
std::vector<cv::Mat> &input_rois_,
std::vector<float> &output_labels_)
{
#ifdef WITH_CUDA
for (int i = 0; i < 2; i++)
{
for (int row = 0; row < 224; ++row)
{
uchar *uc_pixel = input_rois_[i].data + row * input_rois_[i].step; // compute row id
for (int col = 0; col < 224; ++col)
{
// mean=[136.20, 141.50, 145.41], std=[44.77, 44.20, 44.30]
this->_p_data[col + row * 224 + 224 * 224 * 3 * i] = ((float)uc_pixel[0] - 136.20f) / 44.77f;
this->_p_data[col + row * 224 + 224 * 224 + 224 * 224 * 3 * i] = ((float)uc_pixel[1] - 141.50f) / 44.20f;
this->_p_data[col + row * 224 + 224 * 224 * 2 + 224 * 224 * 3 * i] = ((float)uc_pixel[2] - 145.41f) / 44.30f;
uc_pixel += 3;
}
}
}
// Input
TRTCHECK(cudaMemcpyAsync(_p_buffers[this->_input_index], this->_p_data, 2 * 3 * 224 * 224 * sizeof(float), cudaMemcpyHostToDevice, this->_cu_stream));
// this->_trt_context->enqueue(1, _p_buffers, this->_cu_stream, nullptr);
this->_trt_context->enqueueV2(_p_buffers, this->_cu_stream, nullptr);
// Output
TRTCHECK(cudaMemcpyAsync(this->_p_prob1, _p_buffers[this->_output_index1], 2 * 576 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream));
TRTCHECK(cudaMemcpyAsync(this->_p_prob2, _p_buffers[this->_output_index2], 2 * 1280 * sizeof(float), cudaMemcpyDeviceToHost, this->_cu_stream));
cudaStreamSynchronize(this->_cu_stream);
// Find max index
double max = 0;
int label = 0;
for (int i = 0; i < 576; ++i)
{
if (max < this->_p_prob1[i])
{
max = this->_p_prob1[i];
label = i;
}
}
float similarity = cosineSimilarity(this->_p_prob2, this->_p_prob2 + 1280, 1280);
output_labels_.push_back(label);
output_labels_.push_back(similarity);
}
#endif
}